@wentorai/research-plugins 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (252) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +204 -0
  3. package/curated/analysis/README.md +64 -0
  4. package/curated/domains/README.md +104 -0
  5. package/curated/literature/README.md +53 -0
  6. package/curated/research/README.md +62 -0
  7. package/curated/tools/README.md +87 -0
  8. package/curated/writing/README.md +61 -0
  9. package/index.ts +39 -0
  10. package/mcp-configs/academic-db/ChatSpatial.json +17 -0
  11. package/mcp-configs/academic-db/academia-mcp.json +17 -0
  12. package/mcp-configs/academic-db/academic-paper-explorer.json +17 -0
  13. package/mcp-configs/academic-db/academic-search-mcp-server.json +17 -0
  14. package/mcp-configs/academic-db/agentinterviews-mcp.json +17 -0
  15. package/mcp-configs/academic-db/all-in-mcp.json +17 -0
  16. package/mcp-configs/academic-db/apple-health-mcp.json +17 -0
  17. package/mcp-configs/academic-db/arxiv-latex-mcp.json +17 -0
  18. package/mcp-configs/academic-db/arxiv-mcp-server.json +17 -0
  19. package/mcp-configs/academic-db/bgpt-mcp.json +17 -0
  20. package/mcp-configs/academic-db/biomcp.json +17 -0
  21. package/mcp-configs/academic-db/biothings-mcp.json +17 -0
  22. package/mcp-configs/academic-db/catalysishub-mcp-server.json +17 -0
  23. package/mcp-configs/academic-db/clinicaltrialsgov-mcp-server.json +17 -0
  24. package/mcp-configs/academic-db/deep-research-mcp.json +17 -0
  25. package/mcp-configs/academic-db/dicom-mcp.json +17 -0
  26. package/mcp-configs/academic-db/enrichr-mcp-server.json +17 -0
  27. package/mcp-configs/academic-db/fec-mcp-server.json +17 -0
  28. package/mcp-configs/academic-db/fhir-mcp-server-themomentum.json +17 -0
  29. package/mcp-configs/academic-db/fhir-mcp.json +19 -0
  30. package/mcp-configs/academic-db/gget-mcp.json +17 -0
  31. package/mcp-configs/academic-db/google-researcher-mcp.json +17 -0
  32. package/mcp-configs/academic-db/idea-reality-mcp.json +17 -0
  33. package/mcp-configs/academic-db/legiscan-mcp.json +19 -0
  34. package/mcp-configs/academic-db/lex.json +17 -0
  35. package/mcp-configs/ai-platform/Adaptive-Graph-of-Thoughts-MCP-server.json +17 -0
  36. package/mcp-configs/ai-platform/ai-counsel.json +17 -0
  37. package/mcp-configs/ai-platform/atlas-mcp-server.json +17 -0
  38. package/mcp-configs/ai-platform/counsel-mcp.json +17 -0
  39. package/mcp-configs/ai-platform/cross-llm-mcp.json +17 -0
  40. package/mcp-configs/ai-platform/gptr-mcp.json +17 -0
  41. package/mcp-configs/browser/decipher-research-agent.json +17 -0
  42. package/mcp-configs/browser/deep-research.json +17 -0
  43. package/mcp-configs/browser/everything-claude-code.json +17 -0
  44. package/mcp-configs/browser/gpt-researcher.json +17 -0
  45. package/mcp-configs/browser/heurist-agent-framework.json +17 -0
  46. package/mcp-configs/data-platform/4everland-hosting-mcp.json +17 -0
  47. package/mcp-configs/data-platform/context-keeper.json +17 -0
  48. package/mcp-configs/data-platform/context7.json +19 -0
  49. package/mcp-configs/data-platform/contextstream-mcp.json +17 -0
  50. package/mcp-configs/data-platform/email-mcp.json +17 -0
  51. package/mcp-configs/note-knowledge/ApeRAG.json +17 -0
  52. package/mcp-configs/note-knowledge/In-Memoria.json +17 -0
  53. package/mcp-configs/note-knowledge/agent-memory.json +17 -0
  54. package/mcp-configs/note-knowledge/aimemo.json +17 -0
  55. package/mcp-configs/note-knowledge/biel-mcp.json +19 -0
  56. package/mcp-configs/note-knowledge/cognee.json +17 -0
  57. package/mcp-configs/note-knowledge/context-awesome.json +17 -0
  58. package/mcp-configs/note-knowledge/context-mcp.json +17 -0
  59. package/mcp-configs/note-knowledge/conversation-handoff-mcp.json +17 -0
  60. package/mcp-configs/note-knowledge/cortex.json +17 -0
  61. package/mcp-configs/note-knowledge/devrag.json +17 -0
  62. package/mcp-configs/note-knowledge/easy-obsidian-mcp.json +17 -0
  63. package/mcp-configs/note-knowledge/engram.json +17 -0
  64. package/mcp-configs/note-knowledge/gnosis-mcp.json +17 -0
  65. package/mcp-configs/note-knowledge/graphlit-mcp-server.json +19 -0
  66. package/mcp-configs/reference-mgr/arxiv-cli.json +17 -0
  67. package/mcp-configs/reference-mgr/arxiv-search-mcp.json +17 -0
  68. package/mcp-configs/reference-mgr/chiken.json +17 -0
  69. package/mcp-configs/reference-mgr/claude-scholar.json +17 -0
  70. package/mcp-configs/reference-mgr/devonthink-mcp.json +17 -0
  71. package/mcp-configs/registry.json +447 -0
  72. package/openclaw.plugin.json +21 -0
  73. package/package.json +61 -0
  74. package/skills/analysis/dataviz/color-accessibility-guide/SKILL.md +230 -0
  75. package/skills/analysis/dataviz/geospatial-viz-guide/SKILL.md +218 -0
  76. package/skills/analysis/dataviz/interactive-viz-guide/SKILL.md +287 -0
  77. package/skills/analysis/dataviz/network-visualization-guide/SKILL.md +195 -0
  78. package/skills/analysis/dataviz/publication-figures-guide/SKILL.md +238 -0
  79. package/skills/analysis/dataviz/python-dataviz-guide/SKILL.md +195 -0
  80. package/skills/analysis/econometrics/causal-inference-guide/SKILL.md +197 -0
  81. package/skills/analysis/econometrics/iv-regression-guide/SKILL.md +198 -0
  82. package/skills/analysis/econometrics/panel-data-guide/SKILL.md +274 -0
  83. package/skills/analysis/econometrics/robustness-checks/SKILL.md +250 -0
  84. package/skills/analysis/econometrics/stata-regression/SKILL.md +117 -0
  85. package/skills/analysis/econometrics/time-series-guide/SKILL.md +235 -0
  86. package/skills/analysis/statistics/bayesian-statistics-guide/SKILL.md +221 -0
  87. package/skills/analysis/statistics/hypothesis-testing-guide/SKILL.md +210 -0
  88. package/skills/analysis/statistics/meta-analysis-guide/SKILL.md +206 -0
  89. package/skills/analysis/statistics/nonparametric-tests-guide/SKILL.md +221 -0
  90. package/skills/analysis/statistics/power-analysis-guide/SKILL.md +240 -0
  91. package/skills/analysis/statistics/sem-guide/SKILL.md +231 -0
  92. package/skills/analysis/statistics/survival-analysis-guide/SKILL.md +195 -0
  93. package/skills/analysis/wrangling/missing-data-handling/SKILL.md +224 -0
  94. package/skills/analysis/wrangling/pandas-data-wrangling/SKILL.md +242 -0
  95. package/skills/analysis/wrangling/questionnaire-design-guide/SKILL.md +234 -0
  96. package/skills/analysis/wrangling/text-mining-guide/SKILL.md +225 -0
  97. package/skills/domains/ai-ml/computer-vision-guide/SKILL.md +213 -0
  98. package/skills/domains/ai-ml/deep-learning-papers-guide/SKILL.md +200 -0
  99. package/skills/domains/ai-ml/llm-evaluation-guide/SKILL.md +194 -0
  100. package/skills/domains/ai-ml/prompt-engineering-research/SKILL.md +233 -0
  101. package/skills/domains/ai-ml/reinforcement-learning-guide/SKILL.md +254 -0
  102. package/skills/domains/ai-ml/transformer-architecture-guide/SKILL.md +233 -0
  103. package/skills/domains/biomedical/clinical-research-guide/SKILL.md +232 -0
  104. package/skills/domains/biomedical/clinicaltrials-api/SKILL.md +177 -0
  105. package/skills/domains/biomedical/epidemiology-guide/SKILL.md +200 -0
  106. package/skills/domains/biomedical/genomics-analysis-guide/SKILL.md +270 -0
  107. package/skills/domains/business/market-analysis-guide/SKILL.md +112 -0
  108. package/skills/domains/business/strategic-management-guide/SKILL.md +154 -0
  109. package/skills/domains/chemistry/computational-chemistry-guide/SKILL.md +266 -0
  110. package/skills/domains/chemistry/retrosynthesis-guide/SKILL.md +215 -0
  111. package/skills/domains/cs/algorithms-complexity-guide/SKILL.md +194 -0
  112. package/skills/domains/cs/dblp-api/SKILL.md +129 -0
  113. package/skills/domains/cs/software-engineering-research/SKILL.md +218 -0
  114. package/skills/domains/ecology/biodiversity-data-guide/SKILL.md +296 -0
  115. package/skills/domains/ecology/conservation-biology-guide/SKILL.md +198 -0
  116. package/skills/domains/ecology/gbif-api/SKILL.md +158 -0
  117. package/skills/domains/ecology/inaturalist-api/SKILL.md +173 -0
  118. package/skills/domains/economics/behavioral-economics-guide/SKILL.md +239 -0
  119. package/skills/domains/economics/development-economics-guide/SKILL.md +181 -0
  120. package/skills/domains/economics/fred-api/SKILL.md +189 -0
  121. package/skills/domains/education/curriculum-design-guide/SKILL.md +144 -0
  122. package/skills/domains/education/learning-science-guide/SKILL.md +150 -0
  123. package/skills/domains/finance/financial-data-analysis/SKILL.md +152 -0
  124. package/skills/domains/finance/quantitative-finance-guide/SKILL.md +151 -0
  125. package/skills/domains/geoscience/climate-science-guide/SKILL.md +158 -0
  126. package/skills/domains/geoscience/gis-remote-sensing-guide/SKILL.md +129 -0
  127. package/skills/domains/humanities/digital-humanities-guide/SKILL.md +181 -0
  128. package/skills/domains/humanities/philosophy-research-guide/SKILL.md +148 -0
  129. package/skills/domains/law/courtlistener-api/SKILL.md +213 -0
  130. package/skills/domains/law/legal-research-guide/SKILL.md +250 -0
  131. package/skills/domains/math/linear-algebra-applications/SKILL.md +227 -0
  132. package/skills/domains/math/numerical-methods-guide/SKILL.md +236 -0
  133. package/skills/domains/math/oeis-api/SKILL.md +158 -0
  134. package/skills/domains/pharma/clinical-pharmacology-guide/SKILL.md +165 -0
  135. package/skills/domains/pharma/drug-development-guide/SKILL.md +177 -0
  136. package/skills/domains/physics/computational-physics-guide/SKILL.md +300 -0
  137. package/skills/domains/physics/nasa-ads-api/SKILL.md +150 -0
  138. package/skills/domains/physics/quantum-computing-guide/SKILL.md +234 -0
  139. package/skills/domains/social-science/social-research-methods/SKILL.md +194 -0
  140. package/skills/domains/social-science/survey-research-guide/SKILL.md +182 -0
  141. package/skills/literature/discovery/citation-alert-guide/SKILL.md +154 -0
  142. package/skills/literature/discovery/conference-proceedings-guide/SKILL.md +142 -0
  143. package/skills/literature/discovery/literature-mapping-guide/SKILL.md +175 -0
  144. package/skills/literature/discovery/paper-tracking-guide/SKILL.md +211 -0
  145. package/skills/literature/discovery/rss-paper-feeds/SKILL.md +214 -0
  146. package/skills/literature/discovery/semantic-scholar-recs-guide/SKILL.md +164 -0
  147. package/skills/literature/fulltext/doaj-api/SKILL.md +120 -0
  148. package/skills/literature/fulltext/interlibrary-loan-guide/SKILL.md +163 -0
  149. package/skills/literature/fulltext/open-access-guide/SKILL.md +183 -0
  150. package/skills/literature/fulltext/pmc-oai-api/SKILL.md +184 -0
  151. package/skills/literature/fulltext/preprint-servers-guide/SKILL.md +128 -0
  152. package/skills/literature/fulltext/repository-harvesting-guide/SKILL.md +207 -0
  153. package/skills/literature/fulltext/unpaywall-api/SKILL.md +113 -0
  154. package/skills/literature/metadata/altmetrics-guide/SKILL.md +132 -0
  155. package/skills/literature/metadata/citation-network-guide/SKILL.md +236 -0
  156. package/skills/literature/metadata/crossref-api/SKILL.md +133 -0
  157. package/skills/literature/metadata/datacite-api/SKILL.md +126 -0
  158. package/skills/literature/metadata/doi-resolution-guide/SKILL.md +168 -0
  159. package/skills/literature/metadata/h-index-guide/SKILL.md +183 -0
  160. package/skills/literature/metadata/journal-metrics-guide/SKILL.md +188 -0
  161. package/skills/literature/metadata/opencitations-api/SKILL.md +128 -0
  162. package/skills/literature/metadata/orcid-api/SKILL.md +136 -0
  163. package/skills/literature/metadata/orcid-integration-guide/SKILL.md +178 -0
  164. package/skills/literature/search/arxiv-api/SKILL.md +95 -0
  165. package/skills/literature/search/biorxiv-api/SKILL.md +123 -0
  166. package/skills/literature/search/boolean-search-guide/SKILL.md +199 -0
  167. package/skills/literature/search/citation-chaining-guide/SKILL.md +148 -0
  168. package/skills/literature/search/database-comparison-guide/SKILL.md +100 -0
  169. package/skills/literature/search/europe-pmc-api/SKILL.md +120 -0
  170. package/skills/literature/search/google-scholar-guide/SKILL.md +182 -0
  171. package/skills/literature/search/mesh-terms-guide/SKILL.md +164 -0
  172. package/skills/literature/search/openalex-api/SKILL.md +134 -0
  173. package/skills/literature/search/pubmed-api/SKILL.md +130 -0
  174. package/skills/literature/search/scientify-literature-survey/SKILL.md +203 -0
  175. package/skills/literature/search/semantic-scholar-api/SKILL.md +134 -0
  176. package/skills/literature/search/systematic-search-strategy/SKILL.md +214 -0
  177. package/skills/research/automation/ai-scientist-guide/SKILL.md +228 -0
  178. package/skills/research/automation/data-collection-automation/SKILL.md +248 -0
  179. package/skills/research/automation/research-workflow-automation/SKILL.md +266 -0
  180. package/skills/research/deep-research/meta-synthesis-guide/SKILL.md +174 -0
  181. package/skills/research/deep-research/research-cog/SKILL.md +153 -0
  182. package/skills/research/deep-research/scoping-review-guide/SKILL.md +217 -0
  183. package/skills/research/deep-research/systematic-review-guide/SKILL.md +250 -0
  184. package/skills/research/funding/figshare-api/SKILL.md +163 -0
  185. package/skills/research/funding/grant-writing-guide/SKILL.md +233 -0
  186. package/skills/research/funding/nsf-grant-guide/SKILL.md +206 -0
  187. package/skills/research/funding/open-science-guide/SKILL.md +255 -0
  188. package/skills/research/funding/zenodo-api/SKILL.md +174 -0
  189. package/skills/research/methodology/action-research-guide/SKILL.md +201 -0
  190. package/skills/research/methodology/experimental-design-guide/SKILL.md +236 -0
  191. package/skills/research/methodology/grad-school-guide/SKILL.md +182 -0
  192. package/skills/research/methodology/grounded-theory-guide/SKILL.md +171 -0
  193. package/skills/research/methodology/mixed-methods-guide/SKILL.md +208 -0
  194. package/skills/research/methodology/qualitative-research-guide/SKILL.md +234 -0
  195. package/skills/research/methodology/scientify-idea-generation/SKILL.md +222 -0
  196. package/skills/research/paper-review/paper-reading-assistant/SKILL.md +266 -0
  197. package/skills/research/paper-review/peer-review-guide/SKILL.md +227 -0
  198. package/skills/research/paper-review/rebuttal-writing-guide/SKILL.md +185 -0
  199. package/skills/research/paper-review/scientify-write-review-paper/SKILL.md +209 -0
  200. package/skills/tools/code-exec/jupyter-notebook-guide/SKILL.md +178 -0
  201. package/skills/tools/code-exec/python-reproducibility-guide/SKILL.md +341 -0
  202. package/skills/tools/code-exec/r-reproducibility-guide/SKILL.md +236 -0
  203. package/skills/tools/code-exec/sandbox-execution-guide/SKILL.md +221 -0
  204. package/skills/tools/diagram/mermaid-diagram-guide/SKILL.md +269 -0
  205. package/skills/tools/diagram/plantuml-guide/SKILL.md +397 -0
  206. package/skills/tools/diagram/scientific-illustration-guide/SKILL.md +225 -0
  207. package/skills/tools/document/anystyle-api/SKILL.md +199 -0
  208. package/skills/tools/document/grobid-pdf-parsing/SKILL.md +294 -0
  209. package/skills/tools/document/markdown-academic-guide/SKILL.md +217 -0
  210. package/skills/tools/document/pdf-extraction-guide/SKILL.md +321 -0
  211. package/skills/tools/knowledge-graph/knowledge-graph-construction/SKILL.md +306 -0
  212. package/skills/tools/knowledge-graph/ontology-design-guide/SKILL.md +214 -0
  213. package/skills/tools/knowledge-graph/rag-methodology-guide/SKILL.md +325 -0
  214. package/skills/tools/ocr-translate/formula-recognition-guide/SKILL.md +367 -0
  215. package/skills/tools/ocr-translate/handwriting-recognition-guide/SKILL.md +211 -0
  216. package/skills/tools/ocr-translate/latex-ocr-guide/SKILL.md +204 -0
  217. package/skills/tools/ocr-translate/multilingual-research-guide/SKILL.md +234 -0
  218. package/skills/tools/scraping/academic-web-scraping/SKILL.md +326 -0
  219. package/skills/tools/scraping/api-data-collection-guide/SKILL.md +301 -0
  220. package/skills/tools/scraping/web-scraping-ethics-guide/SKILL.md +250 -0
  221. package/skills/writing/citation/bibtex-management-guide/SKILL.md +246 -0
  222. package/skills/writing/citation/citation-style-guide/SKILL.md +248 -0
  223. package/skills/writing/citation/reference-manager-comparison/SKILL.md +208 -0
  224. package/skills/writing/citation/zotero-api/SKILL.md +188 -0
  225. package/skills/writing/composition/abstract-writing-guide/SKILL.md +188 -0
  226. package/skills/writing/composition/discussion-writing-guide/SKILL.md +194 -0
  227. package/skills/writing/composition/introduction-writing-guide/SKILL.md +194 -0
  228. package/skills/writing/composition/literature-review-writing/SKILL.md +196 -0
  229. package/skills/writing/composition/methods-section-guide/SKILL.md +185 -0
  230. package/skills/writing/composition/response-to-reviewers/SKILL.md +215 -0
  231. package/skills/writing/composition/scientific-writing-guide/SKILL.md +152 -0
  232. package/skills/writing/latex/bibliography-management-guide/SKILL.md +206 -0
  233. package/skills/writing/latex/latex-drawing-guide/SKILL.md +234 -0
  234. package/skills/writing/latex/latex-ecosystem-guide/SKILL.md +240 -0
  235. package/skills/writing/latex/math-typesetting-guide/SKILL.md +231 -0
  236. package/skills/writing/latex/overleaf-collaboration-guide/SKILL.md +211 -0
  237. package/skills/writing/latex/tikz-diagrams-guide/SKILL.md +211 -0
  238. package/skills/writing/polish/academic-translation-guide/SKILL.md +175 -0
  239. package/skills/writing/polish/academic-writing-refiner/SKILL.md +143 -0
  240. package/skills/writing/polish/ai-writing-humanizer/SKILL.md +178 -0
  241. package/skills/writing/polish/grammar-checker-guide/SKILL.md +184 -0
  242. package/skills/writing/polish/plagiarism-detection-guide/SKILL.md +167 -0
  243. package/skills/writing/templates/beamer-presentation-guide/SKILL.md +263 -0
  244. package/skills/writing/templates/conference-paper-template/SKILL.md +219 -0
  245. package/skills/writing/templates/thesis-template-guide/SKILL.md +200 -0
  246. package/skills/writing/templates/thesis-writing-guide/SKILL.md +220 -0
  247. package/src/tools/arxiv.ts +131 -0
  248. package/src/tools/crossref.ts +112 -0
  249. package/src/tools/openalex.ts +174 -0
  250. package/src/tools/pubmed.ts +166 -0
  251. package/src/tools/semantic-scholar.ts +108 -0
  252. package/src/tools/unpaywall.ts +58 -0
@@ -0,0 +1,326 @@
1
+ ---
2
+ name: academic-web-scraping
3
+ description: "Ethical web scraping and API-based data collection for research"
4
+ metadata:
5
+ openclaw:
6
+ emoji: "🌐"
7
+ category: "tools"
8
+ subcategory: "scraping"
9
+ keywords: ["web scraping", "API data collection", "web search strategies", "data extraction"]
10
+ source: "N/A"
11
+ ---
12
+
13
+ # Academic Web Scraping Guide
14
+
15
+ ## Overview
16
+
17
+ Research often requires collecting data from the web -- whether it is bibliographic metadata from academic databases, experimental datasets from public repositories, social media posts for computational social science, or economic indicators from government portals. Web scraping and API-based data collection are essential skills for modern researchers across disciplines.
18
+
19
+ This guide covers both approaches: structured API access for platforms that provide one, and web scraping for when no API exists. It emphasizes ethical data collection practices, including respecting robots.txt, rate limiting, terms of service compliance, and IRB considerations for human-subject data. The goal is to collect research data reliably and responsibly.
20
+
21
+ Whether you are building a dataset for a machine learning paper, collecting metadata for a systematic review, or gathering public data for policy research, these patterns help you do it correctly and efficiently.
22
+
23
+ ## API-Based Data Collection
24
+
25
+ APIs are always preferable to scraping when available. They provide structured data, are officially supported, and have clear usage terms.
26
+
27
+ ### Academic APIs
28
+
29
+ | API | Data | Rate Limit | Auth |
30
+ |-----|------|-----------|------|
31
+ | Semantic Scholar | Papers, authors, citations | 100 req/sec (with key) | API key (free) |
32
+ | OpenAlex | Papers, authors, venues, concepts | 100K req/day | Email in header |
33
+ | Crossref | DOI metadata | 50 req/sec (polite pool) | Email in header |
34
+ | PubMed (Entrez) | Biomedical literature | 10 req/sec (with key) | API key (free) |
35
+ | arXiv | Preprints | 1 req/3sec | None |
36
+ | CORE | Open access papers | 10 req/sec | API key (free) |
37
+
38
+ ### Example: Collecting Papers from OpenAlex
39
+
40
+ ```python
41
+ import requests
42
+ import time
43
+
44
+ class OpenAlexClient:
45
+ BASE_URL = "https://api.openalex.org"
46
+
47
+ def __init__(self, email):
48
+ self.session = requests.Session()
49
+ self.session.headers.update({
50
+ 'User-Agent': f'ResearchBot/1.0 (mailto:{email})'
51
+ })
52
+
53
+ def search_works(self, query, filters=None, per_page=25, max_results=100):
54
+ """Search for works with optional filters."""
55
+ results = []
56
+ page = 1
57
+
58
+ while len(results) < max_results:
59
+ params = {
60
+ 'search': query,
61
+ 'per-page': min(per_page, max_results - len(results)),
62
+ 'page': page,
63
+ }
64
+ if filters:
65
+ params['filter'] = ','.join(f'{k}:{v}' for k, v in filters.items())
66
+
67
+ resp = self.session.get(f'{self.BASE_URL}/works', params=params)
68
+ resp.raise_for_status()
69
+ data = resp.json()
70
+
71
+ works = data.get('results', [])
72
+ if not works:
73
+ break
74
+
75
+ results.extend(works)
76
+ page += 1
77
+ time.sleep(0.1) # Polite rate limiting
78
+
79
+ return results[:max_results]
80
+
81
+ def get_work(self, openalex_id):
82
+ """Get a single work by OpenAlex ID."""
83
+ resp = self.session.get(f'{self.BASE_URL}/works/{openalex_id}')
84
+ resp.raise_for_status()
85
+ return resp.json()
86
+
87
+ # Usage
88
+ client = OpenAlexClient(email="researcher@university.edu")
89
+ papers = client.search_works(
90
+ "transformer attention mechanism",
91
+ filters={
92
+ 'publication_year': '2023-2024',
93
+ 'type': 'article',
94
+ 'open_access.is_oa': 'true'
95
+ },
96
+ max_results=200
97
+ )
98
+
99
+ for paper in papers[:5]:
100
+ print(f"- {paper['title']} ({paper['publication_year']})")
101
+ print(f" DOI: {paper['doi']}")
102
+ print(f" Citations: {paper['cited_by_count']}")
103
+ ```
104
+
105
+ ### Example: PubMed Entrez API
106
+
107
+ ```python
108
+ import os
+ from Bio import Entrez
109
+
110
+ Entrez.email = "researcher@university.edu"
111
+ Entrez.api_key = os.environ.get("NCBI_API_KEY") # optional
112
+
113
+ def search_pubmed(query, max_results=100):
114
+ """Search PubMed and retrieve article details."""
115
+ # Search
116
+ handle = Entrez.esearch(db="pubmed", term=query,
117
+ retmax=max_results, sort="relevance")
118
+ search_results = Entrez.read(handle)
119
+ id_list = search_results["IdList"]
120
+
121
+ if not id_list:
122
+ return []
123
+
124
+ # Fetch details
125
+ handle = Entrez.efetch(db="pubmed", id=id_list,
126
+ rettype="xml", retmode="xml")
127
+ records = Entrez.read(handle)
128
+
129
+ articles = []
130
+ for article in records['PubmedArticle']:
131
+ medline = article['MedlineCitation']
132
+ art_info = medline['Article']
133
+ articles.append({
134
+ 'pmid': str(medline['PMID']),
135
+ 'title': art_info.get('ArticleTitle', ''),
136
+ 'abstract': art_info.get('Abstract', {}).get(
137
+ 'AbstractText', [''])[0] if 'Abstract' in art_info else '',
138
+ 'journal': art_info['Journal']['Title'],
139
+ 'year': art_info['Journal']['JournalIssue'].get(
140
+ 'PubDate', {}).get('Year', ''),
141
+ })
142
+
143
+ return articles
144
+ ```
145
+
146
+ ## Web Scraping Fundamentals
147
+
148
+ When no API exists, scraping becomes necessary. Always check for an API first.
149
+
150
+ ### Tools Comparison
151
+
152
+ | Tool | Type | JavaScript Support | Speed | Learning Curve |
153
+ |------|------|-------------------|-------|---------------|
154
+ | requests + BeautifulSoup | HTTP + parsing | No | Fast | Low |
155
+ | Scrapy | Framework | No (without middleware) | Very fast | Medium |
156
+ | Selenium | Browser automation | Yes | Slow | Medium |
157
+ | Playwright | Browser automation | Yes | Medium | Medium |
158
+ | httpx | Async HTTP | No | Very fast | Low |
159
+
160
+ ### Basic Scraping with BeautifulSoup
161
+
162
+ ```python
163
+ import requests
164
+ from bs4 import BeautifulSoup
165
+ import time
166
+
167
+ def scrape_conference_proceedings(url, delay=2.0):
168
+ """Scrape paper titles and links from a conference page."""
169
+ headers = {
170
+ 'User-Agent': 'ResearchBot/1.0 (Academic research; contact@university.edu)'
171
+ }
172
+
173
+ response = requests.get(url, headers=headers, timeout=30)
174
+ response.raise_for_status()
175
+
176
+ soup = BeautifulSoup(response.text, 'html.parser')
177
+
178
+ papers = []
179
+ for item in soup.select('.paper-item, .proceeding-entry'):
180
+ title_el = item.select_one('.title, h3, h4')
181
+ link_el = item.select_one('a[href]')
182
+ authors_el = item.select_one('.authors, .author-list')
183
+
184
+ if title_el:
185
+ papers.append({
186
+ 'title': title_el.get_text(strip=True),
187
+ 'url': link_el['href'] if link_el else None,
188
+ 'authors': authors_el.get_text(strip=True) if authors_el else '',
189
+ })
190
+
191
+ time.sleep(delay) # Respect the server
192
+ return papers
193
+ ```
194
+
195
+ ### Handling JavaScript-Rendered Pages
196
+
197
+ ```python
198
+ from playwright.sync_api import sync_playwright
199
+
200
+ def scrape_dynamic_page(url):
201
+ """Scrape a JavaScript-rendered page using Playwright."""
202
+ with sync_playwright() as p:
203
+ browser = p.chromium.launch(headless=True)
204
+ page = browser.new_page()
205
+ page.goto(url, wait_until='networkidle')
206
+
207
+ # Wait for content to load
208
+ page.wait_for_selector('.results-container', timeout=10000)
209
+
210
+ # Extract data
211
+ items = page.query_selector_all('.result-item')
212
+ results = []
213
+ for item in items:
214
+ title = item.query_selector('.title')
215
+ results.append({
216
+ 'title': title.inner_text() if title else '',
217
+ })
218
+
219
+ browser.close()
220
+ return results
221
+ ```
222
+
223
+ ## Ethical Guidelines
224
+
225
+ ### The Researcher's Scraping Checklist
226
+
227
+ 1. **Check for an API first.** Most academic platforms have one.
228
+ 2. **Read robots.txt.** `https://example.com/robots.txt` specifies what is allowed.
229
+ 3. **Review Terms of Service.** Some sites explicitly prohibit scraping.
230
+ 4. **Rate limit aggressively.** 1 request per 2-5 seconds minimum. Never parallelize without permission.
231
+ 5. **Identify yourself.** Include your email and institution in the User-Agent header.
232
+ 6. **Minimize data collection.** Only collect what your research question requires.
233
+ 7. **Consider IRB requirements.** If collecting data about identifiable humans, consult your IRB.
234
+ 8. **Store data securely.** Follow your institution's data management policies.
235
+ 9. **Cite your data sources.** Acknowledge where the data came from in your publications.
236
+ 10. **Check copyright.** Scraping publicly visible data does not mean you can redistribute it.
237
+
238
+ ### robots.txt Parsing
239
+
240
+ ```python
241
+ from urllib.robotparser import RobotFileParser
242
+
243
+ def can_scrape(url, user_agent='*'):
244
+ """Check if scraping a URL is allowed by robots.txt."""
245
+ from urllib.parse import urlparse
246
+ parsed = urlparse(url)
247
+ robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
248
+
249
+ rp = RobotFileParser()
250
+ rp.set_url(robots_url)
251
+ rp.read()
252
+
253
+ allowed = rp.can_fetch(user_agent, url)
254
+ crawl_delay = rp.crawl_delay(user_agent)
255
+
256
+ return {
257
+ 'allowed': allowed,
258
+ 'crawl_delay': crawl_delay or 1.0,
259
+ }
260
+ ```
261
+
262
+ ## Data Storage and Export
263
+
264
+ ### Saving Results Reliably
265
+
266
+ ```python
267
+ import json
268
+ import csv
269
+ from pathlib import Path
270
+ from datetime import datetime
271
+
272
+ class DataCollector:
273
+ def __init__(self, output_dir='collected_data'):
274
+ self.output_dir = Path(output_dir)
275
+ self.output_dir.mkdir(parents=True, exist_ok=True)
276
+ self.timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
277
+
278
+ def save_json(self, data, filename):
279
+ path = self.output_dir / f'{filename}_{self.timestamp}.json'
280
+ with open(path, 'w', encoding='utf-8') as f:
281
+ json.dump(data, f, indent=2, ensure_ascii=False)
282
+ print(f"Saved {len(data)} records to {path}")
283
+
284
+ def save_csv(self, data, filename, fieldnames=None):
285
+ if not data:
286
+ return
287
+ if fieldnames is None:
288
+ fieldnames = list(data[0].keys())
289
+
290
+ path = self.output_dir / f'{filename}_{self.timestamp}.csv'
291
+ with open(path, 'w', newline='', encoding='utf-8') as f:
292
+ writer = csv.DictWriter(f, fieldnames=fieldnames,
293
+ extrasaction='ignore')
294
+ writer.writeheader()
295
+ writer.writerows(data)
296
+ print(f"Saved {len(data)} records to {path}")
297
+
298
+ def save_checkpoint(self, data, filename):
299
+ """Save intermediate results for resumable collection."""
300
+ path = self.output_dir / f'{filename}_checkpoint.json'
301
+ with open(path, 'w', encoding='utf-8') as f:
302
+ json.dump({
303
+ 'timestamp': self.timestamp,
304
+ 'n_records': len(data),
305
+ 'data': data,
306
+ }, f, indent=2, ensure_ascii=False)
307
+ ```
308
+
309
+ ## Best Practices
310
+
311
+ - **Always prefer APIs over scraping.** APIs are more reliable, structured, and legally clear.
312
+ - **Implement exponential backoff.** If a request fails, wait 1s, then 2s, then 4s before retrying.
313
+ - **Save checkpoints.** For large collections, save progress incrementally so you can resume after interruptions.
314
+ - **Log everything.** Record which URLs were accessed, when, and what was returned for reproducibility.
315
+ - **Test on a small sample first.** Verify your parsing logic on 10 records before running on 10,000.
316
+ - **Respect rate limits.** Getting blocked hurts everyone -- other researchers included.
317
+ - **Document your collection methodology.** Your paper's Methods section should describe how data was collected, when, and what filters were applied.
318
+
319
+ ## References
320
+
321
+ - [OpenAlex API Documentation](https://docs.openalex.org/) -- Open bibliographic data API
322
+ - [Semantic Scholar API](https://api.semanticscholar.org/) -- Paper and author data
323
+ - [BeautifulSoup Documentation](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) -- HTML parsing
324
+ - [Scrapy Documentation](https://docs.scrapy.org/) -- Web scraping framework
325
+ - [Playwright Documentation](https://playwright.dev/python/) -- Browser automation
326
+ - [Web Scraping Ethics](https://towardsdatascience.com/ethics-in-web-scraping-b96b18136f01) -- Ethical considerations
@@ -0,0 +1,301 @@
1
+ ---
2
+ name: api-data-collection-guide
3
+ description: "API-based data collection and web scraping for research"
4
+ metadata:
5
+ openclaw:
6
+ emoji: "spider"
7
+ category: "tools"
8
+ subcategory: "scraping"
9
+ keywords: ["API data collection", "web search strategies", "data extraction", "web scraping"]
10
+ source: "wentor-research-plugins"
11
+ ---
12
+
13
+ # API Data Collection Guide
14
+
15
+ Collect research data from web APIs and structured sources using Python, with proper rate limiting, error handling, pagination, and ethical considerations.
16
+
17
+ ## API vs. Web Scraping
18
+
19
+ | Approach | When to Use | Reliability | Legal Risk |
20
+ |----------|------------|-------------|------------|
21
+ | Official API | API exists and provides needed data | High | Low (within TOS) |
22
+ | Unofficial API | Browser dev tools reveal JSON endpoints | Medium | Medium |
23
+ | Web scraping | No API available, data is publicly accessible | Low (pages change) | Medium-High |
24
+ | Bulk data download | Provider offers data dumps | High | Low |
25
+
26
+ **Always prefer official APIs over scraping**. Check for APIs first in the data provider's developer documentation, or in API directories such as RapidAPI (note: the ProgrammableWeb directory was retired in 2022).
27
+
28
+ ## RESTful API Fundamentals
29
+
30
+ ### HTTP Methods
31
+
32
+ | Method | Purpose | Example |
33
+ |--------|---------|---------|
34
+ | GET | Retrieve data | `GET /api/papers?q=machine+learning` |
35
+ | POST | Create or submit data | `POST /api/annotations` |
36
+ | PUT | Update existing data | `PUT /api/papers/123` |
37
+ | DELETE | Remove data | `DELETE /api/papers/123` |
38
+
39
+ ### Common Response Codes
40
+
41
+ | Code | Meaning | Action |
42
+ |------|---------|--------|
43
+ | 200 | Success | Process response |
44
+ | 201 | Created | Resource created successfully |
45
+ | 400 | Bad request | Fix query parameters |
46
+ | 401 | Unauthorized | Check API key |
47
+ | 403 | Forbidden | Access denied; check permissions |
48
+ | 404 | Not found | Resource does not exist |
49
+ | 429 | Rate limited | Wait and retry with backoff |
50
+ | 500 | Server error | Retry later |
51
+
52
+ ## Python API Client Template
53
+
54
+ ```python
55
+ import requests
56
+ import time
57
+ import json
58
+ import logging
59
+ from pathlib import Path
60
+ from datetime import datetime
61
+
62
+ logging.basicConfig(level=logging.INFO)
63
+ logger = logging.getLogger(__name__)
64
+
65
+ class APIClient:
66
+ """Reusable API client with rate limiting, retries, and caching."""
67
+
68
+ def __init__(self, base_url, api_key=None, rate_limit=1.0, max_retries=3):
69
+ self.base_url = base_url.rstrip("/")
70
+ self.session = requests.Session()
71
+ if api_key:
72
+ self.session.headers["Authorization"] = f"Bearer {api_key}"
73
+ self.session.headers["User-Agent"] = "ResearchCollector/1.0 (academic research)"
74
+ self.rate_limit = rate_limit # seconds between requests
75
+ self.max_retries = max_retries
76
+ self.last_request_time = 0
77
+ self.cache_dir = Path("./cache")
78
+ self.cache_dir.mkdir(exist_ok=True)
79
+
80
+ def _rate_limit_wait(self):
81
+ """Enforce minimum time between requests."""
82
+ elapsed = time.time() - self.last_request_time
83
+ if elapsed < self.rate_limit:
84
+ time.sleep(self.rate_limit - elapsed)
85
+ self.last_request_time = time.time()
86
+
87
+ def _get_cache_key(self, endpoint, params):
88
+ """Generate a cache key from the request."""
89
+ import hashlib
90
+ key_string = f"{endpoint}_{json.dumps(params, sort_keys=True)}"
91
+ return hashlib.md5(key_string.encode()).hexdigest()
92
+
93
+ def get(self, endpoint, params=None, use_cache=True):
94
+ """Make a GET request with rate limiting, retries, and caching."""
95
+ cache_key = self._get_cache_key(endpoint, params or {})
96
+ cache_file = self.cache_dir / f"{cache_key}.json"
97
+
98
+ # Check cache
99
+ if use_cache and cache_file.exists():
100
+ logger.debug(f"Cache hit: {endpoint}")
101
+ return json.loads(cache_file.read_text())
102
+
103
+ url = f"{self.base_url}/{endpoint.lstrip('/')}"
104
+
105
+ for attempt in range(self.max_retries):
106
+ self._rate_limit_wait()
107
+ try:
108
+ response = self.session.get(url, params=params, timeout=30)
109
+
110
+ if response.status_code == 200:
111
+ data = response.json()
112
+ # Save to cache
113
+ cache_file.write_text(json.dumps(data))
114
+ return data
115
+
116
+ elif response.status_code == 429:
117
+ retry_after = int(response.headers.get("Retry-After", 60))
118
+ logger.warning(f"Rate limited. Waiting {retry_after}s...")
119
+ time.sleep(retry_after)
120
+
121
+ elif response.status_code >= 500:
122
+ logger.warning(f"Server error {response.status_code}. Retry {attempt+1}/{self.max_retries}")
123
+ time.sleep(2 ** attempt) # Exponential backoff
124
+
125
+ else:
126
+ logger.error(f"Request failed: {response.status_code} {response.text[:200]}")
127
+ return None
128
+
129
+ except requests.exceptions.RequestException as e:
130
+ logger.error(f"Request exception: {e}")
131
+ time.sleep(2 ** attempt)
132
+
133
+ logger.error(f"Max retries exceeded for {endpoint}")
134
+ return None
135
+
136
+ def paginate(self, endpoint, params=None, page_key="page",
137
+ results_key="results", max_pages=100):
138
+ """Automatically paginate through all results."""
139
+ params = params or {}
140
+ all_results = []
141
+ page = 1
142
+
143
+ while page <= max_pages:
144
+ params[page_key] = page
145
+ data = self.get(endpoint, params)
146
+
147
+ if not data or not data.get(results_key):
148
+ break
149
+
150
+ results = data[results_key]
151
+ all_results.extend(results)
152
+ logger.info(f"Page {page}: {len(results)} results (total: {len(all_results)})")
153
+
154
+ # Check if more pages exist
155
+ if len(results) < params.get("per_page", params.get("limit", 20)):
156
+ break
157
+
158
+ page += 1
159
+
160
+ return all_results
161
+ ```
162
+
163
+ ## Academic API Examples
164
+
165
+ ### OpenAlex (Open Scholarly Metadata)
166
+
167
+ ```python
168
+ # OpenAlex: free, comprehensive, no authentication required
169
+ client = APIClient("https://api.openalex.org", rate_limit=0.1)
170
+
171
+ # Search for works
172
+ results = client.get("works", params={
173
+ "filter": "title.search:transformer attention mechanism",
174
+ "sort": "cited_by_count:desc",
175
+ "per_page": 25
176
+ })
177
+
178
+ for work in results.get("results", []):
179
+ print(f"[{work.get('publication_year')}] {work.get('title')}")
180
+ print(f" Citations: {work.get('cited_by_count')}")
181
+ print(f" DOI: {work.get('doi')}")
182
+ ```
183
+
184
+ ### CrossRef (DOI Metadata)
185
+
186
+ ```python
187
+ client = APIClient("https://api.crossref.org", rate_limit=0.05)
188
+ client.session.headers["User-Agent"] = "ResearchClaw/1.0 (mailto:researcher@university.edu)"
189
+
190
+ # Search for works
191
+ results = client.get("works", params={
192
+ "query": "machine learning drug discovery",
193
+ "rows": 20,
194
+ "sort": "relevance",
195
+ "order": "desc"
196
+ })
197
+
198
+ for item in results.get("message", {}).get("items", []):
199
+ title = item.get("title", ["N/A"])[0]
200
+ doi = item.get("DOI", "N/A")
201
+ cited = item.get("is-referenced-by-count", 0)
202
+ print(f" {title} | DOI: {doi} | Cited: {cited}")
203
+ ```
204
+
205
+ ### GitHub API (Code and Repositories)
206
+
207
+ ```python
208
+ # GitHub API for finding research code repositories
+ import os  # for reading GITHUB_TOKEN from the environment
209
+ client = APIClient("https://api.github.com", api_key=os.environ["GITHUB_TOKEN"], rate_limit=0.75)
210
+
211
+ # Search repositories
212
+ results = client.get("search/repositories", params={
213
+ "q": "topic:machine-learning language:python stars:>100",
214
+ "sort": "stars",
215
+ "order": "desc",
216
+ "per_page": 30
217
+ })
218
+
219
+ for repo in results.get("items", []):
220
+ print(f"{repo['full_name']} ({repo['stargazers_count']} stars)")
221
+ print(f" {repo.get('description', 'No description')[:80]}")
222
+ ```
223
+
224
+ ## Web Scraping (When APIs Are Unavailable)
225
+
226
+ ```python
227
+ import requests
228
+ from bs4 import BeautifulSoup
229
+ import time
230
+
231
+ def scrape_conference_proceedings(url, delay=2.0):
232
+ """Scrape paper titles and authors from a conference page."""
233
+ headers = {
234
+ "User-Agent": "Mozilla/5.0 (Research Bot; academic research only)"
235
+ }
236
+
237
+ response = requests.get(url, headers=headers, timeout=30)
238
+ response.raise_for_status()
239
+
240
+ soup = BeautifulSoup(response.text, "html.parser")
241
+
242
+ papers = []
243
+ for article in soup.find_all("div", class_="paper-entry"):
244
+ title = article.find("h3")
245
+ authors = article.find("span", class_="authors")
246
+ abstract = article.find("p", class_="abstract")
247
+
248
+ papers.append({
249
+ "title": title.text.strip() if title else "N/A",
250
+ "authors": authors.text.strip() if authors else "N/A",
251
+ "abstract": abstract.text.strip() if abstract else "N/A"
252
+ })
253
+
254
+ time.sleep(delay) # Be polite
255
+ return papers
256
+ ```
257
+
258
+ ## Data Storage and Management
259
+
260
+ ```python
261
+ import pandas as pd
262
+ import sqlite3
263
+
264
+ def save_to_sqlite(data, db_path="research_data.db", table_name="papers"):
265
+ """Save collected data to SQLite database."""
266
+ df = pd.DataFrame(data)
267
+ conn = sqlite3.connect(db_path)
268
+ df.to_sql(table_name, conn, if_exists="append", index=False)
269
+ conn.close()
270
+ logger.info(f"Saved {len(df)} records to {db_path}:{table_name}")
271
+
272
+ def save_incremental_json(data, output_file="collected_data.jsonl"):
273
+ """Append data as JSON Lines (one JSON object per line)."""
274
+ with open(output_file, "a") as f:
275
+ for record in data:
276
+ f.write(json.dumps(record) + "\n")
277
+ ```
278
+
279
+ ## Ethical and Legal Considerations
280
+
281
+ | Principle | Description |
282
+ |-----------|-------------|
283
+ | **Respect robots.txt** | Check `robots.txt` before scraping any site |
284
+ | **Rate limiting** | Never exceed 1 request/second unless the API permits more |
285
+ | **Identify yourself** | Use a descriptive User-Agent with contact email |
286
+ | **Terms of Service** | Read and follow the API/website TOS |
287
+ | **Data minimization** | Only collect data you actually need |
288
+ | **Privacy** | Do not scrape personal data without consent |
289
+ | **Acknowledge sources** | Cite data sources in publications |
290
+ | **IRB review** | Consult your IRB if collecting human-related data |
291
+
292
+ ## Troubleshooting Common Issues
293
+
294
+ | Problem | Cause | Solution |
295
+ |---------|-------|----------|
296
+ | 403 Forbidden | Missing or incorrect authentication | Check API key, update User-Agent |
297
+ | Timeout errors | Slow server or large response | Increase timeout, reduce page size |
298
+ | Inconsistent data | API schema changed | Version-lock API endpoints, validate schema |
299
+ | Missing fields | Optional fields are null | Use `.get()` with defaults, handle None |
300
+ | Encoding errors | Non-UTF8 characters | Set `response.encoding = "utf-8"`, use `errors="replace"` |
301
+ | IP blocking | Too many requests | Use exponential backoff, rotate IPs (with caution) |