@wentorai/research-plugins 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (252) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +204 -0
  3. package/curated/analysis/README.md +64 -0
  4. package/curated/domains/README.md +104 -0
  5. package/curated/literature/README.md +53 -0
  6. package/curated/research/README.md +62 -0
  7. package/curated/tools/README.md +87 -0
  8. package/curated/writing/README.md +61 -0
  9. package/index.ts +39 -0
  10. package/mcp-configs/academic-db/ChatSpatial.json +17 -0
  11. package/mcp-configs/academic-db/academia-mcp.json +17 -0
  12. package/mcp-configs/academic-db/academic-paper-explorer.json +17 -0
  13. package/mcp-configs/academic-db/academic-search-mcp-server.json +17 -0
  14. package/mcp-configs/academic-db/agentinterviews-mcp.json +17 -0
  15. package/mcp-configs/academic-db/all-in-mcp.json +17 -0
  16. package/mcp-configs/academic-db/apple-health-mcp.json +17 -0
  17. package/mcp-configs/academic-db/arxiv-latex-mcp.json +17 -0
  18. package/mcp-configs/academic-db/arxiv-mcp-server.json +17 -0
  19. package/mcp-configs/academic-db/bgpt-mcp.json +17 -0
  20. package/mcp-configs/academic-db/biomcp.json +17 -0
  21. package/mcp-configs/academic-db/biothings-mcp.json +17 -0
  22. package/mcp-configs/academic-db/catalysishub-mcp-server.json +17 -0
  23. package/mcp-configs/academic-db/clinicaltrialsgov-mcp-server.json +17 -0
  24. package/mcp-configs/academic-db/deep-research-mcp.json +17 -0
  25. package/mcp-configs/academic-db/dicom-mcp.json +17 -0
  26. package/mcp-configs/academic-db/enrichr-mcp-server.json +17 -0
  27. package/mcp-configs/academic-db/fec-mcp-server.json +17 -0
  28. package/mcp-configs/academic-db/fhir-mcp-server-themomentum.json +17 -0
  29. package/mcp-configs/academic-db/fhir-mcp.json +19 -0
  30. package/mcp-configs/academic-db/gget-mcp.json +17 -0
  31. package/mcp-configs/academic-db/google-researcher-mcp.json +17 -0
  32. package/mcp-configs/academic-db/idea-reality-mcp.json +17 -0
  33. package/mcp-configs/academic-db/legiscan-mcp.json +19 -0
  34. package/mcp-configs/academic-db/lex.json +17 -0
  35. package/mcp-configs/ai-platform/Adaptive-Graph-of-Thoughts-MCP-server.json +17 -0
  36. package/mcp-configs/ai-platform/ai-counsel.json +17 -0
  37. package/mcp-configs/ai-platform/atlas-mcp-server.json +17 -0
  38. package/mcp-configs/ai-platform/counsel-mcp.json +17 -0
  39. package/mcp-configs/ai-platform/cross-llm-mcp.json +17 -0
  40. package/mcp-configs/ai-platform/gptr-mcp.json +17 -0
  41. package/mcp-configs/browser/decipher-research-agent.json +17 -0
  42. package/mcp-configs/browser/deep-research.json +17 -0
  43. package/mcp-configs/browser/everything-claude-code.json +17 -0
  44. package/mcp-configs/browser/gpt-researcher.json +17 -0
  45. package/mcp-configs/browser/heurist-agent-framework.json +17 -0
  46. package/mcp-configs/data-platform/4everland-hosting-mcp.json +17 -0
  47. package/mcp-configs/data-platform/context-keeper.json +17 -0
  48. package/mcp-configs/data-platform/context7.json +19 -0
  49. package/mcp-configs/data-platform/contextstream-mcp.json +17 -0
  50. package/mcp-configs/data-platform/email-mcp.json +17 -0
  51. package/mcp-configs/note-knowledge/ApeRAG.json +17 -0
  52. package/mcp-configs/note-knowledge/In-Memoria.json +17 -0
  53. package/mcp-configs/note-knowledge/agent-memory.json +17 -0
  54. package/mcp-configs/note-knowledge/aimemo.json +17 -0
  55. package/mcp-configs/note-knowledge/biel-mcp.json +19 -0
  56. package/mcp-configs/note-knowledge/cognee.json +17 -0
  57. package/mcp-configs/note-knowledge/context-awesome.json +17 -0
  58. package/mcp-configs/note-knowledge/context-mcp.json +17 -0
  59. package/mcp-configs/note-knowledge/conversation-handoff-mcp.json +17 -0
  60. package/mcp-configs/note-knowledge/cortex.json +17 -0
  61. package/mcp-configs/note-knowledge/devrag.json +17 -0
  62. package/mcp-configs/note-knowledge/easy-obsidian-mcp.json +17 -0
  63. package/mcp-configs/note-knowledge/engram.json +17 -0
  64. package/mcp-configs/note-knowledge/gnosis-mcp.json +17 -0
  65. package/mcp-configs/note-knowledge/graphlit-mcp-server.json +19 -0
  66. package/mcp-configs/reference-mgr/arxiv-cli.json +17 -0
  67. package/mcp-configs/reference-mgr/arxiv-search-mcp.json +17 -0
  68. package/mcp-configs/reference-mgr/chiken.json +17 -0
  69. package/mcp-configs/reference-mgr/claude-scholar.json +17 -0
  70. package/mcp-configs/reference-mgr/devonthink-mcp.json +17 -0
  71. package/mcp-configs/registry.json +447 -0
  72. package/openclaw.plugin.json +21 -0
  73. package/package.json +61 -0
  74. package/skills/analysis/dataviz/color-accessibility-guide/SKILL.md +230 -0
  75. package/skills/analysis/dataviz/geospatial-viz-guide/SKILL.md +218 -0
  76. package/skills/analysis/dataviz/interactive-viz-guide/SKILL.md +287 -0
  77. package/skills/analysis/dataviz/network-visualization-guide/SKILL.md +195 -0
  78. package/skills/analysis/dataviz/publication-figures-guide/SKILL.md +238 -0
  79. package/skills/analysis/dataviz/python-dataviz-guide/SKILL.md +195 -0
  80. package/skills/analysis/econometrics/causal-inference-guide/SKILL.md +197 -0
  81. package/skills/analysis/econometrics/iv-regression-guide/SKILL.md +198 -0
  82. package/skills/analysis/econometrics/panel-data-guide/SKILL.md +274 -0
  83. package/skills/analysis/econometrics/robustness-checks/SKILL.md +250 -0
  84. package/skills/analysis/econometrics/stata-regression/SKILL.md +117 -0
  85. package/skills/analysis/econometrics/time-series-guide/SKILL.md +235 -0
  86. package/skills/analysis/statistics/bayesian-statistics-guide/SKILL.md +221 -0
  87. package/skills/analysis/statistics/hypothesis-testing-guide/SKILL.md +210 -0
  88. package/skills/analysis/statistics/meta-analysis-guide/SKILL.md +206 -0
  89. package/skills/analysis/statistics/nonparametric-tests-guide/SKILL.md +221 -0
  90. package/skills/analysis/statistics/power-analysis-guide/SKILL.md +240 -0
  91. package/skills/analysis/statistics/sem-guide/SKILL.md +231 -0
  92. package/skills/analysis/statistics/survival-analysis-guide/SKILL.md +195 -0
  93. package/skills/analysis/wrangling/missing-data-handling/SKILL.md +224 -0
  94. package/skills/analysis/wrangling/pandas-data-wrangling/SKILL.md +242 -0
  95. package/skills/analysis/wrangling/questionnaire-design-guide/SKILL.md +234 -0
  96. package/skills/analysis/wrangling/text-mining-guide/SKILL.md +225 -0
  97. package/skills/domains/ai-ml/computer-vision-guide/SKILL.md +213 -0
  98. package/skills/domains/ai-ml/deep-learning-papers-guide/SKILL.md +200 -0
  99. package/skills/domains/ai-ml/llm-evaluation-guide/SKILL.md +194 -0
  100. package/skills/domains/ai-ml/prompt-engineering-research/SKILL.md +233 -0
  101. package/skills/domains/ai-ml/reinforcement-learning-guide/SKILL.md +254 -0
  102. package/skills/domains/ai-ml/transformer-architecture-guide/SKILL.md +233 -0
  103. package/skills/domains/biomedical/clinical-research-guide/SKILL.md +232 -0
  104. package/skills/domains/biomedical/clinicaltrials-api/SKILL.md +177 -0
  105. package/skills/domains/biomedical/epidemiology-guide/SKILL.md +200 -0
  106. package/skills/domains/biomedical/genomics-analysis-guide/SKILL.md +270 -0
  107. package/skills/domains/business/market-analysis-guide/SKILL.md +112 -0
  108. package/skills/domains/business/strategic-management-guide/SKILL.md +154 -0
  109. package/skills/domains/chemistry/computational-chemistry-guide/SKILL.md +266 -0
  110. package/skills/domains/chemistry/retrosynthesis-guide/SKILL.md +215 -0
  111. package/skills/domains/cs/algorithms-complexity-guide/SKILL.md +194 -0
  112. package/skills/domains/cs/dblp-api/SKILL.md +129 -0
  113. package/skills/domains/cs/software-engineering-research/SKILL.md +218 -0
  114. package/skills/domains/ecology/biodiversity-data-guide/SKILL.md +296 -0
  115. package/skills/domains/ecology/conservation-biology-guide/SKILL.md +198 -0
  116. package/skills/domains/ecology/gbif-api/SKILL.md +158 -0
  117. package/skills/domains/ecology/inaturalist-api/SKILL.md +173 -0
  118. package/skills/domains/economics/behavioral-economics-guide/SKILL.md +239 -0
  119. package/skills/domains/economics/development-economics-guide/SKILL.md +181 -0
  120. package/skills/domains/economics/fred-api/SKILL.md +189 -0
  121. package/skills/domains/education/curriculum-design-guide/SKILL.md +144 -0
  122. package/skills/domains/education/learning-science-guide/SKILL.md +150 -0
  123. package/skills/domains/finance/financial-data-analysis/SKILL.md +152 -0
  124. package/skills/domains/finance/quantitative-finance-guide/SKILL.md +151 -0
  125. package/skills/domains/geoscience/climate-science-guide/SKILL.md +158 -0
  126. package/skills/domains/geoscience/gis-remote-sensing-guide/SKILL.md +129 -0
  127. package/skills/domains/humanities/digital-humanities-guide/SKILL.md +181 -0
  128. package/skills/domains/humanities/philosophy-research-guide/SKILL.md +148 -0
  129. package/skills/domains/law/courtlistener-api/SKILL.md +213 -0
  130. package/skills/domains/law/legal-research-guide/SKILL.md +250 -0
  131. package/skills/domains/math/linear-algebra-applications/SKILL.md +227 -0
  132. package/skills/domains/math/numerical-methods-guide/SKILL.md +236 -0
  133. package/skills/domains/math/oeis-api/SKILL.md +158 -0
  134. package/skills/domains/pharma/clinical-pharmacology-guide/SKILL.md +165 -0
  135. package/skills/domains/pharma/drug-development-guide/SKILL.md +177 -0
  136. package/skills/domains/physics/computational-physics-guide/SKILL.md +300 -0
  137. package/skills/domains/physics/nasa-ads-api/SKILL.md +150 -0
  138. package/skills/domains/physics/quantum-computing-guide/SKILL.md +234 -0
  139. package/skills/domains/social-science/social-research-methods/SKILL.md +194 -0
  140. package/skills/domains/social-science/survey-research-guide/SKILL.md +182 -0
  141. package/skills/literature/discovery/citation-alert-guide/SKILL.md +154 -0
  142. package/skills/literature/discovery/conference-proceedings-guide/SKILL.md +142 -0
  143. package/skills/literature/discovery/literature-mapping-guide/SKILL.md +175 -0
  144. package/skills/literature/discovery/paper-tracking-guide/SKILL.md +211 -0
  145. package/skills/literature/discovery/rss-paper-feeds/SKILL.md +214 -0
  146. package/skills/literature/discovery/semantic-scholar-recs-guide/SKILL.md +164 -0
  147. package/skills/literature/fulltext/doaj-api/SKILL.md +120 -0
  148. package/skills/literature/fulltext/interlibrary-loan-guide/SKILL.md +163 -0
  149. package/skills/literature/fulltext/open-access-guide/SKILL.md +183 -0
  150. package/skills/literature/fulltext/pmc-oai-api/SKILL.md +184 -0
  151. package/skills/literature/fulltext/preprint-servers-guide/SKILL.md +128 -0
  152. package/skills/literature/fulltext/repository-harvesting-guide/SKILL.md +207 -0
  153. package/skills/literature/fulltext/unpaywall-api/SKILL.md +113 -0
  154. package/skills/literature/metadata/altmetrics-guide/SKILL.md +132 -0
  155. package/skills/literature/metadata/citation-network-guide/SKILL.md +236 -0
  156. package/skills/literature/metadata/crossref-api/SKILL.md +133 -0
  157. package/skills/literature/metadata/datacite-api/SKILL.md +126 -0
  158. package/skills/literature/metadata/doi-resolution-guide/SKILL.md +168 -0
  159. package/skills/literature/metadata/h-index-guide/SKILL.md +183 -0
  160. package/skills/literature/metadata/journal-metrics-guide/SKILL.md +188 -0
  161. package/skills/literature/metadata/opencitations-api/SKILL.md +128 -0
  162. package/skills/literature/metadata/orcid-api/SKILL.md +136 -0
  163. package/skills/literature/metadata/orcid-integration-guide/SKILL.md +178 -0
  164. package/skills/literature/search/arxiv-api/SKILL.md +95 -0
  165. package/skills/literature/search/biorxiv-api/SKILL.md +123 -0
  166. package/skills/literature/search/boolean-search-guide/SKILL.md +199 -0
  167. package/skills/literature/search/citation-chaining-guide/SKILL.md +148 -0
  168. package/skills/literature/search/database-comparison-guide/SKILL.md +100 -0
  169. package/skills/literature/search/europe-pmc-api/SKILL.md +120 -0
  170. package/skills/literature/search/google-scholar-guide/SKILL.md +182 -0
  171. package/skills/literature/search/mesh-terms-guide/SKILL.md +164 -0
  172. package/skills/literature/search/openalex-api/SKILL.md +134 -0
  173. package/skills/literature/search/pubmed-api/SKILL.md +130 -0
  174. package/skills/literature/search/scientify-literature-survey/SKILL.md +203 -0
  175. package/skills/literature/search/semantic-scholar-api/SKILL.md +134 -0
  176. package/skills/literature/search/systematic-search-strategy/SKILL.md +214 -0
  177. package/skills/research/automation/ai-scientist-guide/SKILL.md +228 -0
  178. package/skills/research/automation/data-collection-automation/SKILL.md +248 -0
  179. package/skills/research/automation/research-workflow-automation/SKILL.md +266 -0
  180. package/skills/research/deep-research/meta-synthesis-guide/SKILL.md +174 -0
  181. package/skills/research/deep-research/research-cog/SKILL.md +153 -0
  182. package/skills/research/deep-research/scoping-review-guide/SKILL.md +217 -0
  183. package/skills/research/deep-research/systematic-review-guide/SKILL.md +250 -0
  184. package/skills/research/funding/figshare-api/SKILL.md +163 -0
  185. package/skills/research/funding/grant-writing-guide/SKILL.md +233 -0
  186. package/skills/research/funding/nsf-grant-guide/SKILL.md +206 -0
  187. package/skills/research/funding/open-science-guide/SKILL.md +255 -0
  188. package/skills/research/funding/zenodo-api/SKILL.md +174 -0
  189. package/skills/research/methodology/action-research-guide/SKILL.md +201 -0
  190. package/skills/research/methodology/experimental-design-guide/SKILL.md +236 -0
  191. package/skills/research/methodology/grad-school-guide/SKILL.md +182 -0
  192. package/skills/research/methodology/grounded-theory-guide/SKILL.md +171 -0
  193. package/skills/research/methodology/mixed-methods-guide/SKILL.md +208 -0
  194. package/skills/research/methodology/qualitative-research-guide/SKILL.md +234 -0
  195. package/skills/research/methodology/scientify-idea-generation/SKILL.md +222 -0
  196. package/skills/research/paper-review/paper-reading-assistant/SKILL.md +266 -0
  197. package/skills/research/paper-review/peer-review-guide/SKILL.md +227 -0
  198. package/skills/research/paper-review/rebuttal-writing-guide/SKILL.md +185 -0
  199. package/skills/research/paper-review/scientify-write-review-paper/SKILL.md +209 -0
  200. package/skills/tools/code-exec/jupyter-notebook-guide/SKILL.md +178 -0
  201. package/skills/tools/code-exec/python-reproducibility-guide/SKILL.md +341 -0
  202. package/skills/tools/code-exec/r-reproducibility-guide/SKILL.md +236 -0
  203. package/skills/tools/code-exec/sandbox-execution-guide/SKILL.md +221 -0
  204. package/skills/tools/diagram/mermaid-diagram-guide/SKILL.md +269 -0
  205. package/skills/tools/diagram/plantuml-guide/SKILL.md +397 -0
  206. package/skills/tools/diagram/scientific-illustration-guide/SKILL.md +225 -0
  207. package/skills/tools/document/anystyle-api/SKILL.md +199 -0
  208. package/skills/tools/document/grobid-pdf-parsing/SKILL.md +294 -0
  209. package/skills/tools/document/markdown-academic-guide/SKILL.md +217 -0
  210. package/skills/tools/document/pdf-extraction-guide/SKILL.md +321 -0
  211. package/skills/tools/knowledge-graph/knowledge-graph-construction/SKILL.md +306 -0
  212. package/skills/tools/knowledge-graph/ontology-design-guide/SKILL.md +214 -0
  213. package/skills/tools/knowledge-graph/rag-methodology-guide/SKILL.md +325 -0
  214. package/skills/tools/ocr-translate/formula-recognition-guide/SKILL.md +367 -0
  215. package/skills/tools/ocr-translate/handwriting-recognition-guide/SKILL.md +211 -0
  216. package/skills/tools/ocr-translate/latex-ocr-guide/SKILL.md +204 -0
  217. package/skills/tools/ocr-translate/multilingual-research-guide/SKILL.md +234 -0
  218. package/skills/tools/scraping/academic-web-scraping/SKILL.md +326 -0
  219. package/skills/tools/scraping/api-data-collection-guide/SKILL.md +301 -0
  220. package/skills/tools/scraping/web-scraping-ethics-guide/SKILL.md +250 -0
  221. package/skills/writing/citation/bibtex-management-guide/SKILL.md +246 -0
  222. package/skills/writing/citation/citation-style-guide/SKILL.md +248 -0
  223. package/skills/writing/citation/reference-manager-comparison/SKILL.md +208 -0
  224. package/skills/writing/citation/zotero-api/SKILL.md +188 -0
  225. package/skills/writing/composition/abstract-writing-guide/SKILL.md +188 -0
  226. package/skills/writing/composition/discussion-writing-guide/SKILL.md +194 -0
  227. package/skills/writing/composition/introduction-writing-guide/SKILL.md +194 -0
  228. package/skills/writing/composition/literature-review-writing/SKILL.md +196 -0
  229. package/skills/writing/composition/methods-section-guide/SKILL.md +185 -0
  230. package/skills/writing/composition/response-to-reviewers/SKILL.md +215 -0
  231. package/skills/writing/composition/scientific-writing-guide/SKILL.md +152 -0
  232. package/skills/writing/latex/bibliography-management-guide/SKILL.md +206 -0
  233. package/skills/writing/latex/latex-drawing-guide/SKILL.md +234 -0
  234. package/skills/writing/latex/latex-ecosystem-guide/SKILL.md +240 -0
  235. package/skills/writing/latex/math-typesetting-guide/SKILL.md +231 -0
  236. package/skills/writing/latex/overleaf-collaboration-guide/SKILL.md +211 -0
  237. package/skills/writing/latex/tikz-diagrams-guide/SKILL.md +211 -0
  238. package/skills/writing/polish/academic-translation-guide/SKILL.md +175 -0
  239. package/skills/writing/polish/academic-writing-refiner/SKILL.md +143 -0
  240. package/skills/writing/polish/ai-writing-humanizer/SKILL.md +178 -0
  241. package/skills/writing/polish/grammar-checker-guide/SKILL.md +184 -0
  242. package/skills/writing/polish/plagiarism-detection-guide/SKILL.md +167 -0
  243. package/skills/writing/templates/beamer-presentation-guide/SKILL.md +263 -0
  244. package/skills/writing/templates/conference-paper-template/SKILL.md +219 -0
  245. package/skills/writing/templates/thesis-template-guide/SKILL.md +200 -0
  246. package/skills/writing/templates/thesis-writing-guide/SKILL.md +220 -0
  247. package/src/tools/arxiv.ts +131 -0
  248. package/src/tools/crossref.ts +112 -0
  249. package/src/tools/openalex.ts +174 -0
  250. package/src/tools/pubmed.ts +166 -0
  251. package/src/tools/semantic-scholar.ts +108 -0
  252. package/src/tools/unpaywall.ts +58 -0
@@ -0,0 +1,199 @@
1
+ ---
2
+ name: anystyle-api
3
+ description: "Citation reference parser using machine learning"
4
+ metadata:
5
+ openclaw:
6
+ emoji: "🔍"
7
+ category: "tools"
8
+ subcategory: "document"
9
+ keywords: ["PDF parsing", "document chunking", "format conversion", "PDF extraction"]
10
+ source: "https://anystyle.io/"
11
+ ---
12
+
13
+ # AnyStyle API Guide
14
+
15
+ ## Overview
16
+
17
+ AnyStyle is a fast and smart citation reference parser that uses machine learning (specifically conditional random fields, CRFs) to extract structured bibliographic data from unformatted citation strings. It can parse raw reference text into structured fields such as author, title, journal, volume, pages, year, and DOI, handling the enormous variety of citation formats found in academic literature.
18
+
19
+ The AnyStyle service provides both a web interface and an API endpoint for programmatic citation parsing. Unlike rule-based parsers that rely on specific citation style templates, AnyStyle uses a trained machine learning model that generalizes across citation formats, making it effective for parsing references from diverse disciplines and publication traditions where citation styles vary widely.
20
+
21
+ Researchers, librarians, digital humanists, and research software developers use AnyStyle to extract structured references from PDF documents, legacy bibliographies, dissertation reference lists, and scanned documents. It is particularly valuable for building citation networks, enriching bibliographic databases, migrating references between management tools, and processing large volumes of unstructured citation data that would be impractical to parse manually.
22
+
23
+ ## Authentication
24
+
25
+ No authentication required. The AnyStyle web service is freely accessible without any API key, token, or registration. The service can be used via the web interface at https://anystyle.io/ or through its API endpoint. For heavy usage or private deployments, AnyStyle is also available as an open-source Ruby gem that can be installed locally.
26
+
27
+ ## Core Endpoints
28
+
29
+ ### parse: Parse Citation References
30
+
31
+ Submit raw citation text and receive structured bibliographic data extracted by the machine learning parser. The endpoint accepts one or more citation strings and returns parsed fields for each reference.
32
+
33
+ - **URL**: `POST https://anystyle.io/parse`
34
+ - **Parameters**:
35
+
36
+ | Parameter | Type | Required | Description |
37
+ |-----------|--------|----------|--------------------------------------------------------------|
38
+ | body | string | Yes | Raw citation text (one reference per line in the POST body) |
39
+ | format | string | No | Output format: `json` (default), `xml`, `bib` (BibTeX) |
40
+
41
+ - **Example**:
42
+
43
+ ```bash
44
+ # Parse a single citation
45
+ curl -X POST "https://anystyle.io/parse" \
46
+ -H "Content-Type: text/plain" \
47
+ -d "Vaswani, A., Shazeer, N., Parmar, N., et al. (2017). Attention is all you need. Advances in Neural Information Processing Systems, 30, 5998-6008."
48
+
49
+ # Parse multiple citations (one per line)
50
+ curl -X POST "https://anystyle.io/parse" \
51
+ -H "Content-Type: text/plain" \
52
+ -d "Vaswani, A. et al. (2017). Attention is all you need. NeurIPS 30, 5998-6008.
53
+ LeCun, Y., Bengio, Y., & Hinton, G. (2015). Deep learning. Nature, 521(7553), 436-444.
54
+ Krizhevsky, A., Sutskever, I., & Hinton, G. E. (2012). ImageNet classification with deep convolutional neural networks. NeurIPS 25."
55
+ ```
56
+
57
+ - **Response**: Returns an array of parsed citation objects, each containing extracted fields:
58
+
59
+ ```json
60
+ [
61
+ {
62
+ "author": [{"family": "Vaswani", "given": "A."}, {"family": "Shazeer", "given": "N."}],
63
+ "title": ["Attention is all you need"],
64
+ "date": ["2017"],
65
+ "container-title": ["Advances in Neural Information Processing Systems"],
66
+ "volume": ["30"],
67
+ "pages": ["5998-6008"],
68
+ "type": "article-journal"
69
+ }
70
+ ]
71
+ ```
72
+
73
+ Key response fields include `author` (array of name objects), `title`, `date`, `container-title` (journal/conference name), `volume`, `issue`, `pages`, `doi`, `url`, `publisher`, `location`, and `type` (inferred reference type).
74
+
75
+ ## Rate Limits
76
+
77
+ No formal rate limits are documented for the AnyStyle web service. However, because the service is provided as a free community resource, use it responsibly and avoid sending large bursts of requests. For high-volume parsing tasks (thousands of citations or more), install the AnyStyle Ruby gem locally instead:
78
+
79
+ ```bash
80
+ gem install anystyle
81
+ ```
82
+
83
+ The local installation provides the same parsing capabilities without any network dependencies or rate concerns, and supports batch processing of large reference lists and PDF files directly.
84
+
85
+ ## Common Patterns
86
+
87
+ ### Parse a Reference List from a Paper
88
+
89
+ Extract structured data from a raw reference list copied from a PDF:
90
+
91
+ ```python
92
+ import requests
93
+
94
+ references = """Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., ... & Polosukhin, I. (2017). Attention is all you need. Advances in neural information processing systems, 30.
95
+ Devlin, J., Chang, M. W., Lee, K., & Toutanova, K. (2019). BERT: Pre-training of deep bidirectional transformers for language understanding. NAACL-HLT, 4171-4186.
96
+ Brown, T. B., Mann, B., Ryder, N., Subbiah, M., et al. (2020). Language models are few-shot learners. NeurIPS 33, 1877-1901."""
97
+
98
+ resp = requests.post(
99
+ "https://anystyle.io/parse",
100
+ headers={"Content-Type": "text/plain"},
101
+ data=references
102
+ )
103
+
104
+ for ref in resp.json():
105
+ authors = ", ".join(
106
+ f"{a.get('family', '')} {a.get('given', '')}" for a in ref.get("author", [])
107
+ )
108
+ title = ref.get("title", [""])[0]
109
+ year = ref.get("date", [""])[0]
110
+ journal = ref.get("container-title", [""])[0]
111
+ print(f"{authors} ({year}). {title}. {journal}")
112
+ ```
113
+
114
+ ### Batch Process Citations from Multiple Documents
115
+
116
+ Process reference lists from multiple papers for citation network analysis:
117
+
118
+ ```python
119
+ import requests
120
+
121
+ def parse_references(raw_text):
122
+ """Parse raw citation text into structured records."""
123
+ resp = requests.post(
124
+ "https://anystyle.io/parse",
125
+ headers={"Content-Type": "text/plain"},
126
+ data=raw_text
127
+ )
128
+ if resp.status_code == 200:
129
+ return resp.json()
130
+ return []
131
+
132
+ # Process references from multiple source documents
133
+ documents = {
134
+ "paper_A": "Smith, J. (2020). Title A. Journal X, 1, 1-10.\nDoe, J. (2019). Title B. Journal Y, 2, 20-30.",
135
+ "paper_B": "Jones, K. (2021). Title C. Conference Z, 100-110.\nSmith, J. (2020). Title A. Journal X, 1, 1-10."
136
+ }
137
+
138
+ citation_graph = {}
139
+ for doc_id, refs in documents.items():
140
+ parsed = parse_references(refs)
141
+ citation_graph[doc_id] = parsed
142
+ print(f"{doc_id}: parsed {len(parsed)} references")
143
+ ```
144
+
145
+ ### Convert Citations to BibTeX Format
146
+
147
+ Transform unstructured references into BibTeX entries for use with LaTeX:
148
+
149
+ ```python
150
+ import requests
151
+
152
+ citation = "Goodfellow, I., Bengio, Y., & Courville, A. (2016). Deep Learning. MIT Press."
153
+
154
+ resp = requests.post(
155
+ "https://anystyle.io/parse",
156
+ headers={"Content-Type": "text/plain"},
157
+ data=citation
158
+ )
159
+
160
+ parsed = resp.json()[0]
161
+ # Build a BibTeX entry from parsed fields
162
+ authors = " and ".join(
163
+ f"{a.get('family', '')}, {a.get('given', '')}" for a in parsed.get("author", [])
164
+ )
165
+ bib_key = parsed.get("author", [{}])[0].get("family", "unknown").lower() + parsed.get("date", ["0000"])[0]
166
+
167
+ print(f"@book{{{bib_key},")
168
+ print(f" author = {{{authors}}},")
169
+ print(f" title = {{{parsed.get('title', [''])[0]}}},")
170
+ print(f" year = {{{parsed.get('date', [''])[0]}}},")
171
+ print(f" publisher = {{{parsed.get('publisher', [''])[0] if parsed.get('publisher') else 'Unknown'}}}")
172
+ print("}")
173
+ ```
174
+
175
+ ### Local Installation for High-Volume Processing
176
+
177
+ For large-scale processing, install AnyStyle locally as a Ruby gem:
178
+
179
+ ```bash
180
+ # Install the gem
181
+ gem install anystyle
182
+
183
+ # Parse references from command line
184
+ anystyle parse "Smith, J. (2020). My Paper. Journal, 1, 1-10."
185
+
186
+ # Parse references from a text file
187
+ anystyle --format json parse references.txt > parsed.json
188
+
189
+ # Parse references directly from a PDF
190
+ anystyle --format json find document.pdf > extracted_refs.json
191
+ ```
192
+
193
+ ## References
194
+
195
+ - AnyStyle web service: https://anystyle.io/
196
+ - AnyStyle GitHub repository: https://github.com/inukshuk/anystyle
197
+ - AnyStyle Ruby gem: https://rubygems.org/gems/anystyle
198
+ - AnyStyle CLI documentation: https://github.com/inukshuk/anystyle-cli
199
+ - CSL (Citation Style Language): https://citationstyles.org/
@@ -0,0 +1,294 @@
1
+ ---
2
+ name: grobid-pdf-parsing
3
+ description: "Extract structured text, metadata, and references from academic PDFs"
4
+ metadata:
5
+ openclaw:
6
+ emoji: "📄"
7
+ category: "tools"
8
+ subcategory: "document"
9
+ keywords: ["PDF parsing", "PDF extraction", "document chunking", "format conversion"]
10
+ source: "https://github.com/kermitt2/grobid"
11
+ ---
12
+
13
+ # GROBID PDF Parsing Guide
14
+
15
+ ## Overview
16
+
17
+ Academic PDFs are the primary format for distributing research, yet extracting structured data from them remains challenging. PDFs encode visual layout, not semantic structure -- headings, paragraphs, equations, tables, and citations are all just positioned text and graphics. GROBID (GeneRation Of BIbliographic Data) is the leading open-source tool for parsing academic PDFs into structured XML/TEI format, extracting metadata, body text, references, and figures with high accuracy.
18
+
19
+ GROBID is used by major academic platforms including Semantic Scholar, CORE, and ResearchGate for large-scale document processing. It combines machine learning models (CRF and deep learning) with heuristic rules to handle the diverse formatting of academic papers across publishers and disciplines.
20
+
21
+ This guide covers installing and running GROBID, using its REST API for batch processing, extracting specific elements (metadata, references, body sections), and integrating GROBID output into downstream workflows such as knowledge bases, systematic reviews, and literature analysis pipelines.
22
+
23
+ ## Installation
24
+
25
+ ### Docker (Recommended)
26
+
27
+ ```bash
28
+ # Pull a pinned GROBID release (0.8.1 here; check Docker Hub for newer tags)
29
+ docker pull grobid/grobid:0.8.1
30
+
31
+ # Run GROBID server
32
+ docker run --rm --init \
33
+ --ulimit core=0 \
34
+ -p 8070:8070 \
35
+ grobid/grobid:0.8.1
36
+
37
+ # GROBID is now running at http://localhost:8070
38
+ # Web console: http://localhost:8070/console
39
+ ```
40
+
41
+ ### From Source
42
+
43
+ ```bash
44
+ git clone https://github.com/kermitt2/grobid.git
45
+ cd grobid
46
+ ./gradlew clean install
47
+ ./gradlew run
48
+ ```
49
+
50
+ ## REST API Usage
51
+
52
+ ### Process Full Document
53
+
54
+ ```bash
55
+ # Process a single PDF and get TEI XML
56
+ curl -v --form input=@paper.pdf \
57
+ http://localhost:8070/api/processFulltextDocument \
58
+ -o paper.tei.xml
59
+
60
+ # With options
61
+ curl -v --form input=@paper.pdf \
62
+ --form consolidateHeader=1 \
63
+ --form consolidateCitations=1 \
64
+ --form includeRawCitations=1 \
65
+ http://localhost:8070/api/processFulltextDocument \
66
+ -o paper.tei.xml
67
+ ```
68
+
69
+ ### API Endpoints
70
+
71
+ | Endpoint | Purpose | Input | Output |
72
+ |----------|---------|-------|--------|
73
+ | `/api/processFulltextDocument` | Full paper parsing | PDF | TEI XML |
74
+ | `/api/processHeaderDocument` | Metadata only | PDF | TEI XML (header) |
75
+ | `/api/processReferences` | Reference parsing | PDF | TEI XML (refs) |
76
+ | `/api/processCitation` | Parse citation string | Text | TEI XML |
77
+ | `/api/processDate` | Parse date string | Text | Structured date |
78
+
79
+ ### Python Client
80
+
81
+ ```python
82
+ import requests
83
+ from pathlib import Path
84
+
85
class GrobidClient:
    """Minimal HTTP client for a GROBID server's REST API."""

    def __init__(self, base_url='http://localhost:8070'):
        """Remember the server root URL.

        Args:
            base_url: Root URL of the GROBID server. A trailing slash is
                dropped so endpoint URLs never contain ``//api``.
        """
        self.base_url = base_url.rstrip('/')

    def process_fulltext(self, pdf_path, consolidate_header=True,
                         consolidate_citations=True):
        """Process a PDF and return TEI XML.

        Args:
            pdf_path: Path of the PDF file to upload.
            consolidate_header: Sent as ``consolidateHeader`` ('1' or '0').
            consolidate_citations: Sent as ``consolidateCitations``
                ('1' or '0').

        Returns:
            The response body as text.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            OSError: If the PDF cannot be opened.
        """
        url = f'{self.base_url}/api/processFulltextDocument'
        data = {
            'consolidateHeader': '1' if consolidate_header else '0',
            'consolidateCitations': '1' if consolidate_citations else '0',
        }
        # Keep the upload inside the with-block so the handle is closed even
        # when the request raises.
        with open(pdf_path, 'rb') as pdf_file:
            response = requests.post(url, files={'input': pdf_file}, data=data)
        response.raise_for_status()
        return response.text

    def process_header(self, pdf_path):
        """Extract only header metadata from a PDF.

        Args:
            pdf_path: Path of the PDF file to upload.

        Returns:
            The response body as text.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            OSError: If the PDF cannot be opened.
        """
        url = f'{self.base_url}/api/processHeaderDocument'
        with open(pdf_path, 'rb') as pdf_file:
            response = requests.post(url, files={'input': pdf_file})
        response.raise_for_status()
        return response.text

    def is_alive(self):
        """Return True if the server answers ``/api/isalive`` with HTTP 200.

        Any request failure (refused connection, timeout, ...) is reported
        as False rather than raised.
        """
        try:
            # Bounded wait: a health check should not hang on a stalled host.
            resp = requests.get(f'{self.base_url}/api/isalive', timeout=5)
        except requests.RequestException:
            return False
        return resp.status_code == 200
117
+
118
# Usage: convert one PDF and store the TEI next to it.
client = GrobidClient()
if client.is_alive():
    tei_xml = client.process_fulltext('paper.pdf')
    # Write as UTF-8 explicitly; the platform default encoding may not be
    # able to represent every character in the extracted text.
    with open('paper.tei.xml', 'w', encoding='utf-8') as f:
        f.write(tei_xml)
124
+ ```
125
+
126
+ ## Parsing TEI XML Output
127
+
128
+ ### Extracting Metadata
129
+
130
+ ```python
131
+ from lxml import etree
132
+
133
def parse_tei_metadata(tei_xml):
    """Extract title, authors, abstract and DOI from a TEI XML document.

    Args:
        tei_xml: The TEI document as a string.

    Returns:
        A dict with ``title`` (str), ``authors`` (list of "forenames surname"
        strings), ``abstract`` (str) and ``doi`` (str). Fields that are not
        present come back as empty strings / an empty list.
    """
    ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
    root = etree.fromstring(tei_xml.encode('utf-8'))

    # Title: gather text from nested markup too and collapse whitespace;
    # `.text` alone is None for an empty element and stops at the first child.
    title_el = root.find('.//tei:titleStmt/tei:title', ns)
    title = ' '.join(''.join(title_el.itertext()).split()) if title_el is not None else ''

    # Authors: keep every forename (first, middle, ...) in document order.
    authors = []
    for author in root.findall('.//tei:sourceDesc//tei:author', ns):
        surname = author.findtext('.//tei:surname', '', ns).strip()
        if not surname:
            continue
        forenames = [
            el.text.strip()
            for el in author.findall('.//tei:forename', ns)
            if el.text and el.text.strip()
        ]
        authors.append(' '.join(forenames + [surname]))

    # Abstract
    abstract_el = root.find('.//tei:profileDesc/tei:abstract', ns)
    abstract = ''.join(abstract_el.itertext()).strip() if abstract_el is not None else ''

    # DOI: search the header only. A document-wide search would return the
    # DOI of a cited reference whenever the paper's own DOI is missing.
    doi_el = root.find('.//tei:teiHeader//tei:idno[@type="DOI"]', ns)
    doi = (doi_el.text or '').strip() if doi_el is not None else ''

    return {
        'title': title,
        'authors': authors,
        'abstract': abstract,
        'doi': doi,
    }
164
+ ```
165
+
166
+ ### Extracting Body Sections
167
+
168
+ ```python
169
def parse_tei_sections(tei_xml):
    """Extract the top-level body sections from a TEI XML document.

    Args:
        tei_xml: The TEI document as a string.

    Returns:
        A list with one dict per ``<div>`` directly under ``<body>``:
        ``heading`` (text of the div's ``<head>``, '' if absent), ``n``
        (section number, '' if absent) and ``paragraphs`` (non-empty text of
        each direct ``<p>`` child, inline markup flattened).
    """
    ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
    root = etree.fromstring(tei_xml.encode('utf-8'))

    sections = []
    for div in root.findall('.//tei:body/tei:div', ns):
        head_el = div.find('tei:head', ns)
        heading = ''.join(head_el.itertext()).strip() if head_el is not None else ''

        # GROBID puts the section number on <head n="...">; fall back to the
        # <div> attribute for documents that carry it there instead.
        number = head_el.get('n', '') if head_el is not None else ''
        if not number:
            number = div.get('n', '')

        texts = (''.join(p.itertext()).strip() for p in div.findall('tei:p', ns))
        sections.append({
            'heading': heading,
            'n': number,
            'paragraphs': [text for text in texts if text],
        })

    return sections
189
+ ```
190
+
191
+ ### Extracting References
192
+
193
+ ```python
194
def parse_tei_references(tei_xml):
    """Extract the bibliography entries from a TEI XML document.

    Args:
        tei_xml: The TEI document as a string.

    Returns:
        A list with one dict per ``listBibl/biblStruct`` element: ``id``
        (the ``xml:id`` attribute), ``title``, ``authors`` (list of
        "forenames surname" strings), ``year`` (first four characters of the
        publication date's ``when`` attribute) and ``doi``. Missing values
        are empty strings / an empty list.
    """
    ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
    root = etree.fromstring(tei_xml.encode('utf-8'))

    refs = []
    for bib in root.findall('.//tei:listBibl/tei:biblStruct', ns):
        # Title: prefer the article-level title, else the first title found.
        title_el = bib.find('.//tei:title[@level="a"]', ns)
        if title_el is None:
            title_el = bib.find('.//tei:title', ns)
        # itertext() keeps text inside nested markup and never yields None,
        # unlike `.text` on an empty or marked-up element.
        title = ' '.join(''.join(title_el.itertext()).split()) if title_el is not None else ''

        # Authors: keep every forename (first, middle, ...) before the surname.
        authors = []
        for author in bib.findall('.//tei:author', ns):
            parts = [
                el.text.strip()
                for el in author.findall('.//tei:forename', ns)
                if el.text and el.text.strip()
            ]
            surname = author.findtext('.//tei:surname', '', ns).strip()
            if surname:
                parts.append(surname)
            if parts:
                authors.append(' '.join(parts))

        # Year: `when` may be a full ISO date (YYYY-MM-DD); keep the year only.
        date_el = bib.find('.//tei:date[@type="published"]', ns)
        year = date_el.get('when', '')[:4] if date_el is not None else ''

        # DOI
        doi_el = bib.find('.//tei:idno[@type="DOI"]', ns)
        doi = (doi_el.text or '').strip() if doi_el is not None else ''

        refs.append({
            'id': bib.get('{http://www.w3.org/XML/1998/namespace}id', ''),
            'title': title,
            'authors': authors,
            'year': year,
            'doi': doi,
        })

    return refs
227
+ ```
228
+
229
+ ## Batch Processing
230
+
231
+ ### Processing a Directory of PDFs
232
+
233
+ ```python
234
+ from pathlib import Path
235
+ import json
236
+ from concurrent.futures import ThreadPoolExecutor
237
+
238
def batch_process(pdf_dir, output_dir, max_workers=4):
    """Process all PDFs in a directory using GROBID.

    For every ``*.pdf`` directly inside ``pdf_dir`` this writes
    ``<stem>.tei.xml`` (the raw TEI) and ``<stem>.json`` (metadata plus
    references) into ``output_dir``, then prints a per-file status line.

    Args:
        pdf_dir: Directory containing the PDFs.
        output_dir: Directory for the results; created if missing.
        max_workers: Number of PDFs processed concurrently.

    Returns:
        A list of ``(file_name, status)`` tuples in processing order, where
        status is ``'success'`` or ``'error: <message>'``.
    """
    client = GrobidClient()
    pdf_dir = Path(pdf_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Sort so runs are reproducible; glob order depends on the filesystem.
    pdf_files = sorted(pdf_dir.glob('*.pdf'))
    print(f"Processing {len(pdf_files)} PDFs...")

    def process_one(pdf_path):
        # One bad PDF must not abort the batch, so every failure is turned
        # into a status string for that file.
        try:
            tei = client.process_fulltext(str(pdf_path))
            meta = parse_tei_metadata(tei)
            refs = parse_tei_references(tei)

            # Save TEI XML. The encoding is explicit because the platform
            # default may not be able to represent every character.
            tei_path = output_dir / f'{pdf_path.stem}.tei.xml'
            tei_path.write_text(tei, encoding='utf-8')

            # Save structured JSON
            json_path = output_dir / f'{pdf_path.stem}.json'
            json_path.write_text(json.dumps({
                'metadata': meta,
                'references': refs,
                'n_references': len(refs),
            }, indent=2), encoding='utf-8')

            return pdf_path.name, 'success'
        except Exception as e:
            return pdf_path.name, f'error: {str(e)}'

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(process_one, pdf_files))

    for name, status in results:
        print(f" {name}: {status}")

    return results


if __name__ == "__main__":
    batch_process('papers/', 'parsed_output/')
277
+ ```
278
+
279
+ ## Best Practices
280
+
281
+ - **Use consolidation flags.** `consolidateHeader=1` and `consolidateCitations=1` cross-reference against Crossref for better metadata.
282
+ - **Handle errors gracefully.** Some PDFs are scanned images, corrupted, or have unusual layouts. Always wrap processing in try/except.
283
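+ - **Limit concurrent requests.** GROBID is CPU-intensive, so sending more parallel requests than the server has worker threads only adds queuing. 4-8 concurrent requests are usually a good starting point; tune `max_workers` to the CPU cores available to the GROBID server. If the server answers HTTP 503, all of its threads are busy -- wait briefly and retry.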
284
+ - **Validate output.** Spot-check a sample of parsed documents against the original PDFs.
285
+ - **Use GROBID for structured extraction, not OCR.** For scanned documents, run OCR first (Tesseract) then GROBID.
286
+ - **Keep GROBID updated.** Each release improves parsing accuracy, especially for newer publisher formats.
287
+
288
+ ## References
289
+
290
+ - [GROBID Documentation](https://grobid.readthedocs.io/) -- Official documentation
291
+ - [GROBID GitHub](https://github.com/kermitt2/grobid) -- Source code
292
+ - [TEI Guidelines](https://tei-c.org/release/doc/tei-p5-doc/en/html/) -- TEI XML standard
293
+ - [grobid-client-python](https://github.com/kermitt2/grobid_client_python) -- Official Python client
294
+ - [Science Parse](https://github.com/allenai/science-parse) -- Allen AI alternative parser