@wentorai/research-plugins 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (252) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +204 -0
  3. package/curated/analysis/README.md +64 -0
  4. package/curated/domains/README.md +104 -0
  5. package/curated/literature/README.md +53 -0
  6. package/curated/research/README.md +62 -0
  7. package/curated/tools/README.md +87 -0
  8. package/curated/writing/README.md +61 -0
  9. package/index.ts +39 -0
  10. package/mcp-configs/academic-db/ChatSpatial.json +17 -0
  11. package/mcp-configs/academic-db/academia-mcp.json +17 -0
  12. package/mcp-configs/academic-db/academic-paper-explorer.json +17 -0
  13. package/mcp-configs/academic-db/academic-search-mcp-server.json +17 -0
  14. package/mcp-configs/academic-db/agentinterviews-mcp.json +17 -0
  15. package/mcp-configs/academic-db/all-in-mcp.json +17 -0
  16. package/mcp-configs/academic-db/apple-health-mcp.json +17 -0
  17. package/mcp-configs/academic-db/arxiv-latex-mcp.json +17 -0
  18. package/mcp-configs/academic-db/arxiv-mcp-server.json +17 -0
  19. package/mcp-configs/academic-db/bgpt-mcp.json +17 -0
  20. package/mcp-configs/academic-db/biomcp.json +17 -0
  21. package/mcp-configs/academic-db/biothings-mcp.json +17 -0
  22. package/mcp-configs/academic-db/catalysishub-mcp-server.json +17 -0
  23. package/mcp-configs/academic-db/clinicaltrialsgov-mcp-server.json +17 -0
  24. package/mcp-configs/academic-db/deep-research-mcp.json +17 -0
  25. package/mcp-configs/academic-db/dicom-mcp.json +17 -0
  26. package/mcp-configs/academic-db/enrichr-mcp-server.json +17 -0
  27. package/mcp-configs/academic-db/fec-mcp-server.json +17 -0
  28. package/mcp-configs/academic-db/fhir-mcp-server-themomentum.json +17 -0
  29. package/mcp-configs/academic-db/fhir-mcp.json +19 -0
  30. package/mcp-configs/academic-db/gget-mcp.json +17 -0
  31. package/mcp-configs/academic-db/google-researcher-mcp.json +17 -0
  32. package/mcp-configs/academic-db/idea-reality-mcp.json +17 -0
  33. package/mcp-configs/academic-db/legiscan-mcp.json +19 -0
  34. package/mcp-configs/academic-db/lex.json +17 -0
  35. package/mcp-configs/ai-platform/Adaptive-Graph-of-Thoughts-MCP-server.json +17 -0
  36. package/mcp-configs/ai-platform/ai-counsel.json +17 -0
  37. package/mcp-configs/ai-platform/atlas-mcp-server.json +17 -0
  38. package/mcp-configs/ai-platform/counsel-mcp.json +17 -0
  39. package/mcp-configs/ai-platform/cross-llm-mcp.json +17 -0
  40. package/mcp-configs/ai-platform/gptr-mcp.json +17 -0
  41. package/mcp-configs/browser/decipher-research-agent.json +17 -0
  42. package/mcp-configs/browser/deep-research.json +17 -0
  43. package/mcp-configs/browser/everything-claude-code.json +17 -0
  44. package/mcp-configs/browser/gpt-researcher.json +17 -0
  45. package/mcp-configs/browser/heurist-agent-framework.json +17 -0
  46. package/mcp-configs/data-platform/4everland-hosting-mcp.json +17 -0
  47. package/mcp-configs/data-platform/context-keeper.json +17 -0
  48. package/mcp-configs/data-platform/context7.json +19 -0
  49. package/mcp-configs/data-platform/contextstream-mcp.json +17 -0
  50. package/mcp-configs/data-platform/email-mcp.json +17 -0
  51. package/mcp-configs/note-knowledge/ApeRAG.json +17 -0
  52. package/mcp-configs/note-knowledge/In-Memoria.json +17 -0
  53. package/mcp-configs/note-knowledge/agent-memory.json +17 -0
  54. package/mcp-configs/note-knowledge/aimemo.json +17 -0
  55. package/mcp-configs/note-knowledge/biel-mcp.json +19 -0
  56. package/mcp-configs/note-knowledge/cognee.json +17 -0
  57. package/mcp-configs/note-knowledge/context-awesome.json +17 -0
  58. package/mcp-configs/note-knowledge/context-mcp.json +17 -0
  59. package/mcp-configs/note-knowledge/conversation-handoff-mcp.json +17 -0
  60. package/mcp-configs/note-knowledge/cortex.json +17 -0
  61. package/mcp-configs/note-knowledge/devrag.json +17 -0
  62. package/mcp-configs/note-knowledge/easy-obsidian-mcp.json +17 -0
  63. package/mcp-configs/note-knowledge/engram.json +17 -0
  64. package/mcp-configs/note-knowledge/gnosis-mcp.json +17 -0
  65. package/mcp-configs/note-knowledge/graphlit-mcp-server.json +19 -0
  66. package/mcp-configs/reference-mgr/arxiv-cli.json +17 -0
  67. package/mcp-configs/reference-mgr/arxiv-search-mcp.json +17 -0
  68. package/mcp-configs/reference-mgr/chiken.json +17 -0
  69. package/mcp-configs/reference-mgr/claude-scholar.json +17 -0
  70. package/mcp-configs/reference-mgr/devonthink-mcp.json +17 -0
  71. package/mcp-configs/registry.json +447 -0
  72. package/openclaw.plugin.json +21 -0
  73. package/package.json +61 -0
  74. package/skills/analysis/dataviz/color-accessibility-guide/SKILL.md +230 -0
  75. package/skills/analysis/dataviz/geospatial-viz-guide/SKILL.md +218 -0
  76. package/skills/analysis/dataviz/interactive-viz-guide/SKILL.md +287 -0
  77. package/skills/analysis/dataviz/network-visualization-guide/SKILL.md +195 -0
  78. package/skills/analysis/dataviz/publication-figures-guide/SKILL.md +238 -0
  79. package/skills/analysis/dataviz/python-dataviz-guide/SKILL.md +195 -0
  80. package/skills/analysis/econometrics/causal-inference-guide/SKILL.md +197 -0
  81. package/skills/analysis/econometrics/iv-regression-guide/SKILL.md +198 -0
  82. package/skills/analysis/econometrics/panel-data-guide/SKILL.md +274 -0
  83. package/skills/analysis/econometrics/robustness-checks/SKILL.md +250 -0
  84. package/skills/analysis/econometrics/stata-regression/SKILL.md +117 -0
  85. package/skills/analysis/econometrics/time-series-guide/SKILL.md +235 -0
  86. package/skills/analysis/statistics/bayesian-statistics-guide/SKILL.md +221 -0
  87. package/skills/analysis/statistics/hypothesis-testing-guide/SKILL.md +210 -0
  88. package/skills/analysis/statistics/meta-analysis-guide/SKILL.md +206 -0
  89. package/skills/analysis/statistics/nonparametric-tests-guide/SKILL.md +221 -0
  90. package/skills/analysis/statistics/power-analysis-guide/SKILL.md +240 -0
  91. package/skills/analysis/statistics/sem-guide/SKILL.md +231 -0
  92. package/skills/analysis/statistics/survival-analysis-guide/SKILL.md +195 -0
  93. package/skills/analysis/wrangling/missing-data-handling/SKILL.md +224 -0
  94. package/skills/analysis/wrangling/pandas-data-wrangling/SKILL.md +242 -0
  95. package/skills/analysis/wrangling/questionnaire-design-guide/SKILL.md +234 -0
  96. package/skills/analysis/wrangling/text-mining-guide/SKILL.md +225 -0
  97. package/skills/domains/ai-ml/computer-vision-guide/SKILL.md +213 -0
  98. package/skills/domains/ai-ml/deep-learning-papers-guide/SKILL.md +200 -0
  99. package/skills/domains/ai-ml/llm-evaluation-guide/SKILL.md +194 -0
  100. package/skills/domains/ai-ml/prompt-engineering-research/SKILL.md +233 -0
  101. package/skills/domains/ai-ml/reinforcement-learning-guide/SKILL.md +254 -0
  102. package/skills/domains/ai-ml/transformer-architecture-guide/SKILL.md +233 -0
  103. package/skills/domains/biomedical/clinical-research-guide/SKILL.md +232 -0
  104. package/skills/domains/biomedical/clinicaltrials-api/SKILL.md +177 -0
  105. package/skills/domains/biomedical/epidemiology-guide/SKILL.md +200 -0
  106. package/skills/domains/biomedical/genomics-analysis-guide/SKILL.md +270 -0
  107. package/skills/domains/business/market-analysis-guide/SKILL.md +112 -0
  108. package/skills/domains/business/strategic-management-guide/SKILL.md +154 -0
  109. package/skills/domains/chemistry/computational-chemistry-guide/SKILL.md +266 -0
  110. package/skills/domains/chemistry/retrosynthesis-guide/SKILL.md +215 -0
  111. package/skills/domains/cs/algorithms-complexity-guide/SKILL.md +194 -0
  112. package/skills/domains/cs/dblp-api/SKILL.md +129 -0
  113. package/skills/domains/cs/software-engineering-research/SKILL.md +218 -0
  114. package/skills/domains/ecology/biodiversity-data-guide/SKILL.md +296 -0
  115. package/skills/domains/ecology/conservation-biology-guide/SKILL.md +198 -0
  116. package/skills/domains/ecology/gbif-api/SKILL.md +158 -0
  117. package/skills/domains/ecology/inaturalist-api/SKILL.md +173 -0
  118. package/skills/domains/economics/behavioral-economics-guide/SKILL.md +239 -0
  119. package/skills/domains/economics/development-economics-guide/SKILL.md +181 -0
  120. package/skills/domains/economics/fred-api/SKILL.md +189 -0
  121. package/skills/domains/education/curriculum-design-guide/SKILL.md +144 -0
  122. package/skills/domains/education/learning-science-guide/SKILL.md +150 -0
  123. package/skills/domains/finance/financial-data-analysis/SKILL.md +152 -0
  124. package/skills/domains/finance/quantitative-finance-guide/SKILL.md +151 -0
  125. package/skills/domains/geoscience/climate-science-guide/SKILL.md +158 -0
  126. package/skills/domains/geoscience/gis-remote-sensing-guide/SKILL.md +129 -0
  127. package/skills/domains/humanities/digital-humanities-guide/SKILL.md +181 -0
  128. package/skills/domains/humanities/philosophy-research-guide/SKILL.md +148 -0
  129. package/skills/domains/law/courtlistener-api/SKILL.md +213 -0
  130. package/skills/domains/law/legal-research-guide/SKILL.md +250 -0
  131. package/skills/domains/math/linear-algebra-applications/SKILL.md +227 -0
  132. package/skills/domains/math/numerical-methods-guide/SKILL.md +236 -0
  133. package/skills/domains/math/oeis-api/SKILL.md +158 -0
  134. package/skills/domains/pharma/clinical-pharmacology-guide/SKILL.md +165 -0
  135. package/skills/domains/pharma/drug-development-guide/SKILL.md +177 -0
  136. package/skills/domains/physics/computational-physics-guide/SKILL.md +300 -0
  137. package/skills/domains/physics/nasa-ads-api/SKILL.md +150 -0
  138. package/skills/domains/physics/quantum-computing-guide/SKILL.md +234 -0
  139. package/skills/domains/social-science/social-research-methods/SKILL.md +194 -0
  140. package/skills/domains/social-science/survey-research-guide/SKILL.md +182 -0
  141. package/skills/literature/discovery/citation-alert-guide/SKILL.md +154 -0
  142. package/skills/literature/discovery/conference-proceedings-guide/SKILL.md +142 -0
  143. package/skills/literature/discovery/literature-mapping-guide/SKILL.md +175 -0
  144. package/skills/literature/discovery/paper-tracking-guide/SKILL.md +211 -0
  145. package/skills/literature/discovery/rss-paper-feeds/SKILL.md +214 -0
  146. package/skills/literature/discovery/semantic-scholar-recs-guide/SKILL.md +164 -0
  147. package/skills/literature/fulltext/doaj-api/SKILL.md +120 -0
  148. package/skills/literature/fulltext/interlibrary-loan-guide/SKILL.md +163 -0
  149. package/skills/literature/fulltext/open-access-guide/SKILL.md +183 -0
  150. package/skills/literature/fulltext/pmc-oai-api/SKILL.md +184 -0
  151. package/skills/literature/fulltext/preprint-servers-guide/SKILL.md +128 -0
  152. package/skills/literature/fulltext/repository-harvesting-guide/SKILL.md +207 -0
  153. package/skills/literature/fulltext/unpaywall-api/SKILL.md +113 -0
  154. package/skills/literature/metadata/altmetrics-guide/SKILL.md +132 -0
  155. package/skills/literature/metadata/citation-network-guide/SKILL.md +236 -0
  156. package/skills/literature/metadata/crossref-api/SKILL.md +133 -0
  157. package/skills/literature/metadata/datacite-api/SKILL.md +126 -0
  158. package/skills/literature/metadata/doi-resolution-guide/SKILL.md +168 -0
  159. package/skills/literature/metadata/h-index-guide/SKILL.md +183 -0
  160. package/skills/literature/metadata/journal-metrics-guide/SKILL.md +188 -0
  161. package/skills/literature/metadata/opencitations-api/SKILL.md +128 -0
  162. package/skills/literature/metadata/orcid-api/SKILL.md +136 -0
  163. package/skills/literature/metadata/orcid-integration-guide/SKILL.md +178 -0
  164. package/skills/literature/search/arxiv-api/SKILL.md +95 -0
  165. package/skills/literature/search/biorxiv-api/SKILL.md +123 -0
  166. package/skills/literature/search/boolean-search-guide/SKILL.md +199 -0
  167. package/skills/literature/search/citation-chaining-guide/SKILL.md +148 -0
  168. package/skills/literature/search/database-comparison-guide/SKILL.md +100 -0
  169. package/skills/literature/search/europe-pmc-api/SKILL.md +120 -0
  170. package/skills/literature/search/google-scholar-guide/SKILL.md +182 -0
  171. package/skills/literature/search/mesh-terms-guide/SKILL.md +164 -0
  172. package/skills/literature/search/openalex-api/SKILL.md +134 -0
  173. package/skills/literature/search/pubmed-api/SKILL.md +130 -0
  174. package/skills/literature/search/scientify-literature-survey/SKILL.md +203 -0
  175. package/skills/literature/search/semantic-scholar-api/SKILL.md +134 -0
  176. package/skills/literature/search/systematic-search-strategy/SKILL.md +214 -0
  177. package/skills/research/automation/ai-scientist-guide/SKILL.md +228 -0
  178. package/skills/research/automation/data-collection-automation/SKILL.md +248 -0
  179. package/skills/research/automation/research-workflow-automation/SKILL.md +266 -0
  180. package/skills/research/deep-research/meta-synthesis-guide/SKILL.md +174 -0
  181. package/skills/research/deep-research/research-cog/SKILL.md +153 -0
  182. package/skills/research/deep-research/scoping-review-guide/SKILL.md +217 -0
  183. package/skills/research/deep-research/systematic-review-guide/SKILL.md +250 -0
  184. package/skills/research/funding/figshare-api/SKILL.md +163 -0
  185. package/skills/research/funding/grant-writing-guide/SKILL.md +233 -0
  186. package/skills/research/funding/nsf-grant-guide/SKILL.md +206 -0
  187. package/skills/research/funding/open-science-guide/SKILL.md +255 -0
  188. package/skills/research/funding/zenodo-api/SKILL.md +174 -0
  189. package/skills/research/methodology/action-research-guide/SKILL.md +201 -0
  190. package/skills/research/methodology/experimental-design-guide/SKILL.md +236 -0
  191. package/skills/research/methodology/grad-school-guide/SKILL.md +182 -0
  192. package/skills/research/methodology/grounded-theory-guide/SKILL.md +171 -0
  193. package/skills/research/methodology/mixed-methods-guide/SKILL.md +208 -0
  194. package/skills/research/methodology/qualitative-research-guide/SKILL.md +234 -0
  195. package/skills/research/methodology/scientify-idea-generation/SKILL.md +222 -0
  196. package/skills/research/paper-review/paper-reading-assistant/SKILL.md +266 -0
  197. package/skills/research/paper-review/peer-review-guide/SKILL.md +227 -0
  198. package/skills/research/paper-review/rebuttal-writing-guide/SKILL.md +185 -0
  199. package/skills/research/paper-review/scientify-write-review-paper/SKILL.md +209 -0
  200. package/skills/tools/code-exec/jupyter-notebook-guide/SKILL.md +178 -0
  201. package/skills/tools/code-exec/python-reproducibility-guide/SKILL.md +341 -0
  202. package/skills/tools/code-exec/r-reproducibility-guide/SKILL.md +236 -0
  203. package/skills/tools/code-exec/sandbox-execution-guide/SKILL.md +221 -0
  204. package/skills/tools/diagram/mermaid-diagram-guide/SKILL.md +269 -0
  205. package/skills/tools/diagram/plantuml-guide/SKILL.md +397 -0
  206. package/skills/tools/diagram/scientific-illustration-guide/SKILL.md +225 -0
  207. package/skills/tools/document/anystyle-api/SKILL.md +199 -0
  208. package/skills/tools/document/grobid-pdf-parsing/SKILL.md +294 -0
  209. package/skills/tools/document/markdown-academic-guide/SKILL.md +217 -0
  210. package/skills/tools/document/pdf-extraction-guide/SKILL.md +321 -0
  211. package/skills/tools/knowledge-graph/knowledge-graph-construction/SKILL.md +306 -0
  212. package/skills/tools/knowledge-graph/ontology-design-guide/SKILL.md +214 -0
  213. package/skills/tools/knowledge-graph/rag-methodology-guide/SKILL.md +325 -0
  214. package/skills/tools/ocr-translate/formula-recognition-guide/SKILL.md +367 -0
  215. package/skills/tools/ocr-translate/handwriting-recognition-guide/SKILL.md +211 -0
  216. package/skills/tools/ocr-translate/latex-ocr-guide/SKILL.md +204 -0
  217. package/skills/tools/ocr-translate/multilingual-research-guide/SKILL.md +234 -0
  218. package/skills/tools/scraping/academic-web-scraping/SKILL.md +326 -0
  219. package/skills/tools/scraping/api-data-collection-guide/SKILL.md +301 -0
  220. package/skills/tools/scraping/web-scraping-ethics-guide/SKILL.md +250 -0
  221. package/skills/writing/citation/bibtex-management-guide/SKILL.md +246 -0
  222. package/skills/writing/citation/citation-style-guide/SKILL.md +248 -0
  223. package/skills/writing/citation/reference-manager-comparison/SKILL.md +208 -0
  224. package/skills/writing/citation/zotero-api/SKILL.md +188 -0
  225. package/skills/writing/composition/abstract-writing-guide/SKILL.md +188 -0
  226. package/skills/writing/composition/discussion-writing-guide/SKILL.md +194 -0
  227. package/skills/writing/composition/introduction-writing-guide/SKILL.md +194 -0
  228. package/skills/writing/composition/literature-review-writing/SKILL.md +196 -0
  229. package/skills/writing/composition/methods-section-guide/SKILL.md +185 -0
  230. package/skills/writing/composition/response-to-reviewers/SKILL.md +215 -0
  231. package/skills/writing/composition/scientific-writing-guide/SKILL.md +152 -0
  232. package/skills/writing/latex/bibliography-management-guide/SKILL.md +206 -0
  233. package/skills/writing/latex/latex-drawing-guide/SKILL.md +234 -0
  234. package/skills/writing/latex/latex-ecosystem-guide/SKILL.md +240 -0
  235. package/skills/writing/latex/math-typesetting-guide/SKILL.md +231 -0
  236. package/skills/writing/latex/overleaf-collaboration-guide/SKILL.md +211 -0
  237. package/skills/writing/latex/tikz-diagrams-guide/SKILL.md +211 -0
  238. package/skills/writing/polish/academic-translation-guide/SKILL.md +175 -0
  239. package/skills/writing/polish/academic-writing-refiner/SKILL.md +143 -0
  240. package/skills/writing/polish/ai-writing-humanizer/SKILL.md +178 -0
  241. package/skills/writing/polish/grammar-checker-guide/SKILL.md +184 -0
  242. package/skills/writing/polish/plagiarism-detection-guide/SKILL.md +167 -0
  243. package/skills/writing/templates/beamer-presentation-guide/SKILL.md +263 -0
  244. package/skills/writing/templates/conference-paper-template/SKILL.md +219 -0
  245. package/skills/writing/templates/thesis-template-guide/SKILL.md +200 -0
  246. package/skills/writing/templates/thesis-writing-guide/SKILL.md +220 -0
  247. package/src/tools/arxiv.ts +131 -0
  248. package/src/tools/crossref.ts +112 -0
  249. package/src/tools/openalex.ts +174 -0
  250. package/src/tools/pubmed.ts +166 -0
  251. package/src/tools/semantic-scholar.ts +108 -0
  252. package/src/tools/unpaywall.ts +58 -0
@@ -0,0 +1,250 @@
1
+ ---
2
+ name: web-scraping-ethics-guide
3
+ description: "Scrape web data ethically and legally for research purposes"
4
+ metadata:
5
+ openclaw:
6
+ emoji: "globe_with_meridians"
7
+ category: "tools"
8
+ subcategory: "scraping"
9
+ keywords: ["web scraping", "ethical scraping", "robots.txt", "rate limiting", "research data collection", "crawling"]
10
+ source: "wentor-research-plugins"
11
+ ---
12
+
13
+ # Ethical Web Scraping for Research
14
+
15
+ A skill for collecting web data ethically and legally for research purposes. Covers robots.txt compliance, rate limiting, legal frameworks, data privacy considerations, and practical scraping techniques that respect website operators and comply with institutional review requirements.
16
+
17
+ ## Ethical Framework
18
+
19
+ ### Principles of Ethical Scraping
20
+
21
+ ```
22
+ 1. Respect robots.txt and Terms of Service
23
+ - Check robots.txt before scraping any site
24
+ - Review the site's ToS for explicit prohibitions
25
+ - When in doubt, contact the site operator
26
+
27
+ 2. Minimize server impact
28
+ - Use rate limiting (at most 1 request per second; honor any Crawl-delay in robots.txt)
29
+ - Scrape during off-peak hours when possible
30
+ - Cache responses to avoid redundant requests
31
+ - Use conditional requests (If-Modified-Since headers)
32
+
33
+ 3. Collect only what you need
34
+ - Define your data requirements before scraping
35
+ - Do not scrape personal data without ethical justification
36
+ - Anonymize or pseudonymize personal information
37
+
38
+ 4. Attribution and transparency
39
+ - Set a descriptive User-Agent header with contact info
40
+ - Be prepared to identify yourself if contacted
41
+ - Credit data sources in publications
42
+
43
+ 5. Institutional compliance
44
+ - Check if your IRB/ethics board requires approval for web data
45
+ - Follow your institution's acceptable use policy
46
+ - Consider data protection regulations (GDPR, CCPA)
47
+ ```
48
+
49
+ ## Checking robots.txt
50
+
51
+ ### Parsing Robots.txt
52
+
53
+ ```python
54
+ import urllib.request
55
+ import urllib.robotparser
56
+
57
+
58
def check_robots_txt(base_url: str, target_path: str,
                     user_agent: str = "*") -> dict:
    """
    Check if a URL is allowed by robots.txt.

    The base URL and the target path are joined with exactly one slash, so
    ``'https://example.com/'`` + ``'/data'`` and ``'https://example.com'`` +
    ``'data'`` both check ``'https://example.com/data'``. A doubled slash
    would otherwise make Disallow rules fail to match.

    Args:
        base_url: The website's base URL (e.g., 'https://example.com')
        target_path: The path you want to scrape (e.g., '/data/papers')
        user_agent: Your bot's user agent string

    Returns:
        A dict. When robots.txt could not be fetched it contains
        ``robots_txt_found`` (False), ``error`` and ``recommendation``.
        Otherwise it contains ``robots_txt_found`` (True), ``url_checked``,
        ``allowed``, ``crawl_delay`` and ``recommendation``.
    """
    # Normalize both halves so the join never doubles or drops a slash.
    site_root = base_url.rstrip("/")
    path = target_path if target_path.startswith("/") else f"/{target_path}"
    robots_url = f"{site_root}/robots.txt"

    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(robots_url)

    try:
        rp.read()
    except Exception as e:
        # Deliberately broad: any fetch or decode failure is reported to the
        # caller as "robots.txt unavailable" instead of aborting the check.
        return {
            "robots_txt_found": False,
            "error": str(e),
            "recommendation": "Proceed with caution; use conservative rate limiting"
        }

    full_url = f"{site_root}{path}"
    allowed = rp.can_fetch(user_agent, full_url)
    crawl_delay = rp.crawl_delay(user_agent)

    return {
        "robots_txt_found": True,
        "url_checked": full_url,
        "allowed": allowed,
        # A missing (or zero) Crawl-delay falls back to the guide's default.
        "crawl_delay": crawl_delay or "Not specified (use 1-2 seconds)",
        "recommendation": (
            "Proceed with specified crawl delay"
            if allowed
            else "Do NOT scrape this path -- it is disallowed"
        )
    }
97
+ ```
98
+
99
+ ## Rate-Limited Scraping
100
+
101
+ ### Respectful Request Pattern
102
+
103
+ ```python
104
+ import time
105
+ import urllib.request
106
+
107
+
108
def scrape_with_rate_limit(urls: list[str],
                           delay: float = 1.0,
                           user_agent: "str | None" = None) -> list[dict]:
    """
    Scrape a list of URLs with rate limiting and proper headers.

    Each URL is fetched once, in order. A failure on one URL is recorded in
    its result entry and does not stop the remaining requests.

    Args:
        urls: List of URLs to fetch
        delay: Seconds to wait between requests
        user_agent: Custom user agent string; a descriptive research-bot
            default with a contact address is used when omitted

    Returns:
        One dict per URL, in input order. Successful fetches contain
        ``url``, ``status``, ``content_length`` and ``success`` (True);
        failed ones contain ``url``, ``error`` and ``success`` (False).
    """
    if user_agent is None:
        user_agent = (
            "ResearchBot/1.0 (Academic research; "
            "contact: researcher@university.edu)"
        )

    headers = {
        "User-Agent": user_agent,
        "Accept": "text/html",
    }
    results = []
    last_index = len(urls) - 1

    for i, url in enumerate(urls):
        try:
            req = urllib.request.Request(url, headers=headers)

            # The context manager closes the connection even if reading fails.
            with urllib.request.urlopen(req, timeout=30) as response:
                status = response.status
                charset = response.headers.get_content_charset() or "utf-8"
                raw = response.read()

            try:
                content = raw.decode(charset, errors="replace")
            except LookupError:
                # Server declared a charset Python does not know.
                content = raw.decode("utf-8", errors="replace")

            results.append({
                "url": url,
                "status": status,
                "content_length": len(content),
                "success": True
            })

        except Exception as e:
            # Best effort per URL: record the failure and keep going.
            results.append({
                "url": url,
                "error": str(e),
                "success": False
            })

        # Rate limiting (no pointless wait after the final request)
        if i < last_index:
            time.sleep(delay)

    return results
156
+ ```
157
+
158
+ ## Legal Considerations
159
+
160
+ ### Key Legal Frameworks
161
+
162
+ ```
163
+ United States:
164
+ - CFAA (Computer Fraud and Abuse Act): Unauthorized access is illegal
165
+ - hiQ v. LinkedIn (9th Cir. 2022): Scraping publicly accessible data likely does not violate the CFAA, but breach-of-contract (ToS) claims can still succeed
166
+ - Key question: Is the data publicly accessible without authentication?
167
+
168
+ European Union:
169
+ - GDPR: Personal data requires legal basis for processing
170
+ - Database Directive: Protects substantial investment in databases
171
+ - Text and Data Mining exception (DSM Directive, Art. 3-4):
172
+ Research organizations can mine lawfully accessible content
173
+
174
+ General guidance:
175
+ - Public data is more defensible than data behind login walls
176
+ - Scraping that circumvents technical measures is riskier
177
+ - Academic fair use / research exceptions vary by jurisdiction
178
+ - When in doubt, consult your institution's legal counsel
179
+ ```
180
+
181
+ ## Research-Specific Considerations
182
+
183
+ ### IRB and Ethics Approval
184
+
185
+ ```python
186
def assess_irb_requirements(data_type: str,
                            contains_pii: bool) -> dict:
    """
    Assess whether web scraping requires IRB review.

    Both outcomes return the same set of keys, so callers can read any of
    them without first checking which branch was taken.

    Args:
        data_type: Type of data being collected; echoed back in the result
            so the assessment records what it was made for
        contains_pii: Whether data includes personally identifiable information

    Returns:
        A dict with ``data_type``, ``irb_required``, ``rationale``,
        ``steps`` (list of actions) and ``recommendation``.
    """
    if contains_pii:
        return {
            "data_type": data_type,
            "irb_required": "Likely yes",
            "rationale": (
                "Data that identifies or can re-identify individuals "
                "generally requires ethics review, even if publicly posted."
            ),
            "steps": [
                "Submit IRB protocol describing data collection",
                "Justify why PII is necessary for the research",
                "Describe de-identification procedures",
                "Explain data storage and security measures",
                "Plan for data destruction after the study"
            ],
            "recommendation": "Obtain IRB approval before collecting any data"
        }

    return {
        "data_type": data_type,
        "irb_required": "Possibly exempt, but check with your IRB",
        "rationale": (
            "Non-human-subjects data (e.g., product prices, publication "
            "metadata) typically does not require IRB review, but policies "
            "vary by institution."
        ),
        "steps": [
            "Confirm the data contains no personal information",
            "Submit an exemption request to your IRB"
        ],
        "recommendation": "Submit an exemption request to be safe"
    }
220
+ ```
221
+
222
+ ## Best Practices Summary
223
+
224
+ ### Scraping Checklist for Researchers
225
+
226
+ ```
227
+ Before scraping:
228
+ [ ] Check robots.txt
229
+ [ ] Review Terms of Service
230
+ [ ] Consider whether an API exists (prefer API over scraping)
231
+ [ ] Assess IRB requirements
232
+ [ ] Define minimal data needed
233
+
234
+ During scraping:
235
+ [ ] Set descriptive User-Agent with contact email
236
+ [ ] Implement rate limiting (min 1 second between requests)
237
+ [ ] Handle errors gracefully (do not retry aggressively)
238
+ [ ] Log all requests for reproducibility
239
+ [ ] Cache responses to avoid re-fetching
240
+
241
+ After scraping:
242
+ [ ] Anonymize personal data if present
243
+ [ ] Store data securely
244
+ [ ] Document the scraping methodology for your paper
245
+ [ ] Credit the data source
246
+ [ ] Consider whether the scraped data can be shared
247
+ (check copyright and ToS)
248
+ ```
249
+
250
+ Whenever a public API is available (e.g., Twitter/X API, Reddit API, CrossRef API), use the API instead of scraping HTML. APIs provide structured data, enforce rate limits by design, and demonstrate good faith in your research methodology.
@@ -0,0 +1,246 @@
1
+ ---
2
+ name: bibtex-management-guide
3
+ description: "Clean, format, deduplicate, and manage BibTeX bibliography files for LaTeX"
4
+ metadata:
5
+ openclaw:
6
+ emoji: "card_file_box"
7
+ category: "writing"
8
+ subcategory: "citation"
9
+ keywords: ["BibTeX formatting", "BibTeX conversion", "bibliography cleanup", "reference deduplication", "citation management"]
10
+ source: "wentor"
11
+ ---
12
+
13
+ # BibTeX Management Guide
14
+
15
+ A skill for maintaining clean, consistent, and complete BibTeX bibliography files. Covers formatting standards, deduplication, common errors, and automated cleanup workflows essential for LaTeX-based academic writing.
16
+
17
+ ## BibTeX Entry Standards
18
+
19
+ ### Required Fields by Entry Type
20
+
21
+ ```bibtex
22
+ % Article in a journal
23
+ @article{smith2024deep,
24
+ author = {Smith, John A. and Doe, Jane B.},
25
+ title = {Deep Learning for Climate Prediction: A Comparative Study},
26
+ journal = {Nature Machine Intelligence},
27
+ year = {2024},
28
+ volume = {6},
29
+ number = {3},
30
+ pages = {234--248},
31
+ doi = {10.1038/s42256-024-00001-1}
32
+ }
33
+
34
+ % Conference proceedings
35
+ @inproceedings{lee2024attention,
36
+ author = {Lee, Wei and Chen, Li},
37
+ title = {Attention Mechanisms for Scientific Document Understanding},
38
+ booktitle = {Proceedings of the 62nd Annual Meeting of the ACL},
39
+ year = {2024},
40
+ pages = {1123--1135},
41
+ publisher = {Association for Computational Linguistics},
42
+ doi = {10.18653/v1/2024.acl-main.89}
43
+ }
44
+
45
+ % Book
46
+ @book{bishop2006pattern,
47
+ author = {Bishop, Christopher M.},
48
+ title = {Pattern Recognition and Machine Learning},
49
+ publisher = {Springer},
50
+ year = {2006},
51
+ isbn = {978-0387310732}
52
+ }
53
+ ```
54
+
55
+ ## Automated BibTeX Cleanup
56
+
57
+ ### Deduplication
58
+
59
+ ```python
60
+ import re
61
+ from collections import defaultdict
62
+
63
def parse_bibtex_entries(bib_content: str) -> list[dict]:
    """
    Parse a BibTeX file into structured entries.

    Entries are located by brace matching rather than by a single regex, so
    nested braces (``title = {The {DNA} Story}``), bare values
    (``year = 2024``), quoted values, ``#`` concatenation and a closing
    brace that shares a line with the last field are all handled.
    ``@comment``, ``@preamble`` and ``@string`` blocks are skipped because
    they are not bibliographic entries.

    Args:
        bib_content: Full text of a .bib file

    Returns:
        One dict per entry, in file order, with keys ``type`` (lower-cased
        entry type), ``key`` (citation key), ``raw`` (the entry's source
        text from ``@`` to its closing brace) and ``fields`` (lower-cased
        field name -> value with the outer delimiters removed).
    """
    entries = []
    header = re.compile(r'@(\w+)\s*\{\s*([^,{}="\s]+)\s*,')
    pos = 0

    while True:
        match = header.search(bib_content, pos)
        if match is None:
            break

        open_idx = bib_content.index('{', match.start())
        close_idx = _find_matching_brace(bib_content, open_idx)
        if close_idx == -1:
            # Unbalanced entry: skip its header and keep scanning.
            pos = match.end()
            continue

        entry_type = match.group(1).lower()
        if entry_type not in ('comment', 'preamble', 'string'):
            entries.append({
                'type': entry_type,
                'key': match.group(2).strip(),
                'raw': bib_content[match.start():close_idx + 1],
                'fields': _parse_bibtex_fields(
                    bib_content[match.end():close_idx]
                ),
            })

        # Resume after this entry so '@' inside field values is never
        # mistaken for a new entry.
        pos = close_idx + 1

    return entries


def _find_matching_brace(text: str, open_idx: int) -> int:
    """Return the index of the brace closing text[open_idx], or -1."""
    depth = 0
    for idx in range(open_idx, len(text)):
        char = text[idx]
        if char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                return idx
    return -1


def _read_bibtex_value(text: str, pos: int) -> tuple:
    """Read one braced, quoted or bare value at pos; return (value, next_pos)."""
    if pos >= len(text):
        return '', pos

    char = text[pos]
    if char == '{':
        end = _find_matching_brace(text, pos)
        if end == -1:
            return text[pos + 1:], len(text)
        return text[pos + 1:end], end + 1

    if char == '"':
        depth = 0
        for idx in range(pos + 1, len(text)):
            inner = text[idx]
            if inner == '{':
                depth += 1
            elif inner == '}':
                depth -= 1
            elif inner == '"' and depth == 0:
                # A quote only terminates the value outside nested braces.
                return text[pos + 1:idx], idx + 1
        return text[pos + 1:], len(text)

    bare = re.compile(r'[^\s,#{}"]+').match(text, pos)
    if bare:
        return bare.group(0), bare.end()
    return '', pos


def _parse_bibtex_fields(body: str) -> dict:
    """Parse the ``name = value`` pairs of one entry body into a dict."""
    fields = {}
    field_start = re.compile(r'([A-Za-z_][\w-]*)\s*=\s*')
    concat = re.compile(r'\s*#\s*')
    pos = 0

    while True:
        match = field_start.search(body, pos)
        if match is None:
            break

        parts = []
        pos = match.end()
        while True:
            value, pos = _read_bibtex_value(body, pos)
            parts.append(value)
            joiner = concat.match(body, pos)
            if joiner is None:
                break
            # '#' concatenates adjacent value parts.
            pos = joiner.end()

        fields[match.group(1).lower()] = ''.join(parts).strip()

    return fields
87
+
88
+
89
def deduplicate_bibtex(entries: list[dict]) -> dict:
    """
    Find and remove duplicate BibTeX entries.

    Deduplication strategy (first match wins, first occurrence is kept):
    1. Exact DOI match, ignoring case and any ``https://doi.org/``,
       ``http://dx.doi.org/`` or ``doi:`` prefix
    2. Normalized title match (lower-cased, punctuation and braces
       removed, runs of whitespace collapsed)

    Args:
        entries: Parsed entries, each a dict with a ``key`` and a
            ``fields`` dict (as produced by ``parse_bibtex_entries``)

    Returns:
        A dict with ``unique_entries`` (count), ``duplicates_found``
        (count), ``duplicates`` (list of ``{'entry', 'duplicate_of',
        'reason'}`` records, where ``duplicate_of`` is always the key of an
        entry that was kept) and ``entries`` (the kept entries, in order).
    """
    doi_prefix = re.compile(r'^(?:https?://(?:dx\.)?doi\.org/|doi:\s*)')
    seen_dois = {}
    seen_titles = {}
    duplicates = []
    unique = []

    for entry in entries:
        fields = entry['fields']
        doi = doi_prefix.sub('', fields.get('doi', '').strip().lower())
        title = fields.get('title', '').lower()
        # Collapse whitespace so line-wrapped titles compare equal.
        title_normalized = ' '.join(re.sub(r'[^a-z0-9\s]', '', title).split())

        # Check DOI match
        if doi and doi in seen_dois:
            duplicates.append({
                'entry': entry['key'],
                'duplicate_of': seen_dois[doi],
                'reason': 'same DOI'
            })
            continue

        # Check title match
        if title_normalized and title_normalized in seen_titles:
            kept_key = seen_titles[title_normalized]
            duplicates.append({
                'entry': entry['key'],
                'duplicate_of': kept_key,
                'reason': 'same title'
            })
            if doi:
                # Point this DOI at the surviving entry, not the dropped one.
                seen_dois[doi] = kept_key
            continue

        if doi:
            seen_dois[doi] = entry['key']
        if title_normalized:
            seen_titles[title_normalized] = entry['key']
        unique.append(entry)

    return {
        'unique_entries': len(unique),
        'duplicates_found': len(duplicates),
        'duplicates': duplicates,
        'entries': unique
    }
142
+ ```
143
+
144
+ ### Field Formatting
145
+
146
+ ```python
147
def clean_bibtex_entry(entry: dict) -> dict:
    """
    Clean and standardize a BibTeX entry.

    The input entry and its ``fields`` dict are left untouched; a cleaned
    copy is returned. Cleaning is idempotent: running it on its own output
    changes nothing.

    Args:
        entry: Parsed entry dict containing a ``fields`` dict of
            field name -> string value

    Returns:
        A shallow copy of ``entry`` whose ``fields`` dict has normalized
        author separators, ``--`` page ranges, brace-protected acronyms in
        the title, a bare DOI (no URL or ``doi:`` prefix) and no
        empty fields.
    """
    import re  # local import: this snippet has no module-level imports

    cleaned = entry.copy()
    # Copy the nested dict too, otherwise the caller's entry is mutated.
    fields = dict(entry['fields'])

    # Standardize author separators: "Last, First and Last, First"
    if 'author' in fields:
        authors = fields['author']
        authors = authors.replace(' AND ', ' and ')
        authors = authors.replace(' & ', ' and ')
        fields['author'] = authors

    # Ensure page ranges use exactly "--", whatever run of hyphens or
    # Unicode dashes (with optional spaces) the source used.
    if 'pages' in fields:
        fields['pages'] = re.sub(
            r'\s*[-\u2013\u2014]+\s*', '--', fields['pages']
        )

    # Protect acronyms (all-caps words) with braces so BibTeX styles do
    # not lower-case them.
    if 'title' in fields:
        token = re.compile(r'^([(\["\'`]*)(.*?)([)\]"\',:;!?]*)$')
        words = fields['title'].split()
        for i, word in enumerate(words):
            if '{' in word or '}' in word:
                # Already protected; wrapping again would nest braces.
                continue
            lead, core, trail = token.match(word).groups()
            if len(core) > 1 and core.isupper():
                # Keep surrounding punctuation outside the braces.
                words[i] = lead + '{' + core + '}' + trail
        fields['title'] = ' '.join(words)

    # Strip resolver URL / "doi:" prefixes so only the bare DOI remains
    if 'doi' in fields:
        fields['doi'] = re.sub(
            r'^\s*(?:https?://(?:dx\.)?doi\.org/|doi:\s*)',
            '',
            fields['doi'],
            flags=re.IGNORECASE,
        ).strip()

    # Remove empty fields
    cleaned['fields'] = {k: v for k, v in fields.items() if v.strip()}

    return cleaned
188
+ ```
189
+
190
+ ## DOI-Based Entry Generation
191
+
192
+ ### Fetch Complete BibTeX from DOI
193
+
194
+ ```python
195
+ import requests
196
+
197
def doi_to_bibtex(doi: str, timeout: float = 30.0) -> str:
    """
    Retrieve a complete BibTeX entry for a DOI via doi.org content negotiation.

    The request goes to the doi.org resolver with an
    ``Accept: application/x-bibtex`` header and follows redirects to
    whichever service answers for that DOI.

    Args:
        doi: The DOI, either bare ('10.1038/...') or with a
            'https://doi.org/', 'http://dx.doi.org/' or 'doi:' prefix
        timeout: Seconds to wait for the server before giving up

    Returns:
        The BibTeX text on HTTP 200; otherwise a one-line BibTeX comment
        starting with '% Error:'.

    Raises:
        requests.RequestException: On network failures or timeout.
    """
    import re
    from urllib.parse import quote

    bare_doi = re.sub(
        r'^(?:https?://(?:dx\.)?doi\.org/|doi:\s*)',
        '',
        doi.strip(),
        flags=re.IGNORECASE,
    )
    if not bare_doi:
        # An empty DOI would fetch the resolver's home page, not BibTeX.
        return f"% Error: Could not retrieve BibTeX for DOI {doi}"

    # DOIs may contain characters such as '<', '>', '#' that must be
    # percent-encoded in a URL; '/' separates prefix and suffix and stays.
    url = f"https://doi.org/{quote(bare_doi, safe='/')}"
    headers = {'Accept': 'application/x-bibtex'}
    # Without a timeout, requests can wait on a stalled server forever.
    response = requests.get(
        url, headers=headers, allow_redirects=True, timeout=timeout
    )

    if response.status_code == 200:
        return response.text
    return f"% Error: Could not retrieve BibTeX for DOI {doi}"


# Example
bibtex = doi_to_bibtex('10.1038/s41586-021-03819-2')
print(bibtex)
213
+ ```
214
+
215
+ ## Citation Key Conventions
216
+
217
+ Consistent citation keys improve readability:
218
+
219
+ ```
220
+ Convention: authorYEARfirstword
221
+ Examples:
222
+ smith2024deep
223
+ lee2024attention
224
+ bishop2006pattern
225
+
226
+ For multiple papers by same author in same year:
227
+ smith2024a, smith2024b
228
+
229
+ For papers with many authors:
230
+ smithetal2024deep (use "etal" for 3+ authors)
231
+ ```
232
+
233
+ ## Validation Checklist
234
+
235
+ Before submitting a manuscript, validate your BibTeX file:
236
+
237
+ 1. Every `\cite{}` in the manuscript has a matching entry in the .bib file
238
+ 2. No orphaned entries (entries in .bib not cited in manuscript)
239
+ 3. All entries have at minimum: author, title, year
240
+ 4. All journal articles have: volume, pages (or article number), DOI
241
+ 5. Page ranges use en-dash (`--`), not single hyphen
242
+ 6. No encoding errors in author names (check accented characters)
243
+ 7. Proper nouns and acronyms in titles are protected with braces
244
+ 8. No duplicate entries exist
245
+
246
+ Use `biber --validate-datamodel` or `checkcites` for automated validation.