asta-papers 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. asta_papers-0.0.1/.gitignore +135 -0
  2. asta_papers-0.0.1/LICENSE +21 -0
  3. asta_papers-0.0.1/PKG-INFO +183 -0
  4. asta_papers-0.0.1/README.md +127 -0
  5. asta_papers-0.0.1/docs/CHANGELOG.md +195 -0
  6. asta_papers-0.0.1/docs/DESIGN.md +664 -0
  7. asta_papers-0.0.1/docs/agents.md +155 -0
  8. asta_papers-0.0.1/docs/byo.md +128 -0
  9. asta_papers-0.0.1/docs/concepts.md +142 -0
  10. asta_papers-0.0.1/docs/converters.md +184 -0
  11. asta_papers-0.0.1/docs/index.md +22 -0
  12. asta_papers-0.0.1/docs/licensing.md +242 -0
  13. asta_papers-0.0.1/docs/quickstart.md +127 -0
  14. asta_papers-0.0.1/docs/rate-limiting.md +109 -0
  15. asta_papers-0.0.1/docs/troubleshooting.md +174 -0
  16. asta_papers-0.0.1/pyproject.toml +95 -0
  17. asta_papers-0.0.1/src/asta_papers/__init__.py +63 -0
  18. asta_papers-0.0.1/src/asta_papers/byo.py +82 -0
  19. asta_papers-0.0.1/src/asta_papers/cache.py +194 -0
  20. asta_papers-0.0.1/src/asta_papers/client.py +561 -0
  21. asta_papers-0.0.1/src/asta_papers/config.py +174 -0
  22. asta_papers-0.0.1/src/asta_papers/converters/__init__.py +36 -0
  23. asta_papers-0.0.1/src/asta_papers/converters/_jats_core.py +435 -0
  24. asta_papers-0.0.1/src/asta_papers/converters/jats.py +24 -0
  25. asta_papers-0.0.1/src/asta_papers/converters/mistral.py +114 -0
  26. asta_papers-0.0.1/src/asta_papers/converters/olmocr.py +148 -0
  27. asta_papers-0.0.1/src/asta_papers/enums.py +89 -0
  28. asta_papers-0.0.1/src/asta_papers/errors.py +26 -0
  29. asta_papers-0.0.1/src/asta_papers/identifiers.py +195 -0
  30. asta_papers-0.0.1/src/asta_papers/license.py +83 -0
  31. asta_papers-0.0.1/src/asta_papers/rate_limit.py +86 -0
  32. asta_papers-0.0.1/src/asta_papers/result.py +152 -0
  33. asta_papers-0.0.1/src/asta_papers/strategies/__init__.py +107 -0
  34. asta_papers-0.0.1/src/asta_papers/strategies/_common.py +187 -0
  35. asta_papers-0.0.1/src/asta_papers/strategies/arxiv.py +40 -0
  36. asta_papers-0.0.1/src/asta_papers/strategies/biorxiv.py +82 -0
  37. asta_papers-0.0.1/src/asta_papers/strategies/europepmc_pdf.py +84 -0
  38. asta_papers-0.0.1/src/asta_papers/strategies/pmc_efetch.py +81 -0
  39. asta_papers-0.0.1/src/asta_papers/strategies/pmid_elink.py +56 -0
  40. asta_papers-0.0.1/src/asta_papers/strategies/published_version.py +96 -0
  41. asta_papers-0.0.1/src/asta_papers/strategies/repo_scrape.py +130 -0
  42. asta_papers-0.0.1/src/asta_papers/strategies/unpaywall.py +128 -0
  43. asta_papers-0.0.1/tests/conftest.py +41 -0
  44. asta_papers-0.0.1/tests/integration/recovery_baseline.json +58 -0
  45. asta_papers-0.0.1/tests/integration/test_byo_real.py +86 -0
  46. asta_papers-0.0.1/tests/integration/test_pmc_efetch_real.py +65 -0
  47. asta_papers-0.0.1/tests/integration/test_rate_limits_real.py +146 -0
  48. asta_papers-0.0.1/tests/integration/test_recovery_benchmark.py +107 -0
  49. asta_papers-0.0.1/tests/integration/test_strategies_real.py +92 -0
  50. asta_papers-0.0.1/tests/unit/test_cache.py +110 -0
  51. asta_papers-0.0.1/tests/unit/test_config.py +80 -0
  52. asta_papers-0.0.1/tests/unit/test_identifiers.py +156 -0
  53. asta_papers-0.0.1/tests/unit/test_jats_to_md.py +270 -0
  54. asta_papers-0.0.1/tests/unit/test_license_classifier.py +92 -0
  55. asta_papers-0.0.1/tests/unit/test_rate_limiter.py +68 -0
  56. asta_papers-0.0.1/tests/unit/test_result.py +221 -0
  57. asta_papers-0.0.1/tools/check_test_legitimacy.py +133 -0
@@ -0,0 +1,135 @@
1
+ # =========================
2
+ # OS / System files
3
+ # =========================
4
+ .DS_Store
5
+ .DS_Store?
6
+ ._*
7
+ .Spotlight-V100
8
+ .Trashes
9
+ Thumbs.db
10
+ ehthumbs.db
11
+ Desktop.ini
12
+
13
+ # =========================
14
+ # Editor / IDE files
15
+ # =========================
16
+ .vscode/
17
+ .idea/
18
+ *.swp
19
+ *.swo
20
+ *.swn
21
+ *~
22
+ *.bak
23
+ *.tmp
24
+ *.orig
25
+
26
+ # =========================
27
+ # Logs
28
+ # =========================
29
+ logs/
30
+ *.log
31
+ npm-debug.log*
32
+ yarn-debug.log*
33
+ yarn-error.log*
34
+ pnpm-debug.log*
35
+
36
+ # =========================
37
+ # Environment variables / secrets
38
+ # =========================
39
+ .env
40
+ .env.*
41
+ !.env.example
42
+ *.pem
43
+ *.key
44
+ *.crt
45
+
46
+ # =========================
47
+ # Build / dist artifacts
48
+ # =========================
49
+ dist/
50
+ build/
51
+ out/
52
+ target/
53
+ obj/
54
+
55
+ # =========================
56
+ # Dependency directories
57
+ # =========================
58
+ node_modules/
59
+ vendor/
60
+ .venv/
61
+ venv/
62
+ env/
63
+ __pycache__/
64
+
65
+ # Yarn
66
+ **/.yarn/*
67
+ !.yarn/patches
68
+ !.yarn/plugins
69
+ !.yarn/releases
70
+ !.yarn/sdks
71
+ !.yarn/versions
72
+ .pnp.*
73
+
74
+ # =========================
75
+ # Test / coverage output
76
+ # =========================
77
+ coverage/
78
+ .coverage
79
+ .nyc_output/
80
+ test-results/
81
+ .pytest_cache/
82
+
83
+ # =========================
84
+ # Temporary / cache files
85
+ # =========================
86
+ .cache/
87
+ .tmp/
88
+ .temp/
89
+ *.cache
90
+
91
+ # =========================
92
+ # Archive files
93
+ # =========================
94
+ *.zip
95
+ *.tar
96
+ *.tar.gz
97
+ *.rar
98
+ *.7z
99
+
100
+ # =========================
101
+ # Runtime files
102
+ # =========================
103
+ *.pid
104
+ *.seed
105
+ *.pid.lock
106
+
107
+ # =========================
108
+ # Misc
109
+ # =========================
110
+ *.lock
111
+ !.gitignore
112
+
113
+ # =========================
114
+ # Terraform
115
+ # =========================
116
+ *.tfstate
117
+ *.tfstate.*
118
+ .terraform/
119
+ .terraform.lock.hcl
120
+ infra/terraform.tfvars
121
+
122
+ # =========================
123
+ # Generated deploy files
124
+ # =========================
125
+ _asta_modal_*.py
126
+
127
+ # =========================
128
+ # Python packaging
129
+ # =========================
130
+ *.egg-info/
131
+
132
+ # =========================
133
+ # Claude Code worktrees
134
+ # =========================
135
+ .claude/worktrees/
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Allen Institute for AI
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,183 @@
1
+ Metadata-Version: 2.4
2
+ Name: asta-papers
3
+ Version: 0.0.1
4
+ Summary: Legal-only paper full-text retrieval and conversion. DOI/PMID/PMCID/arXiv/Corpus ID + BYO PDF/JATS → markdown with license classification.
5
+ Project-URL: Homepage, https://github.com/allenai/asta-sdk
6
+ Project-URL: Repository, https://github.com/allenai/asta-sdk
7
+ Project-URL: Issues, https://github.com/allenai/asta-sdk/issues
8
+ Project-URL: Documentation, https://github.com/allenai/asta-sdk/tree/main/src/python/asta/papers/docs
9
+ Project-URL: Changelog, https://github.com/allenai/asta-sdk/blob/main/src/python/asta/papers/docs/CHANGELOG.md
10
+ Author: Allen Institute for AI
11
+ License: MIT
12
+ License-File: LICENSE
13
+ Keywords: arxiv,europepmc,jats,license-classification,ocr,open-access,pdf,pubmed,scientific-papers,text-mining,unpaywall
14
+ Classifier: Development Status :: 4 - Beta
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: Intended Audience :: Science/Research
17
+ Classifier: License :: OSI Approved :: MIT License
18
+ Classifier: Operating System :: OS Independent
19
+ Classifier: Programming Language :: Python :: 3
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: Programming Language :: Python :: 3.13
24
+ Classifier: Topic :: Scientific/Engineering
25
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
26
+ Classifier: Topic :: Text Processing :: Markup :: XML
27
+ Classifier: Typing :: Typed
28
+ Requires-Python: >=3.10
29
+ Requires-Dist: requests>=2.31
30
+ Provides-Extra: all
31
+ Requires-Dist: boto3>=1.34; extra == 'all'
32
+ Requires-Dist: mistralai<2,>=1.5; extra == 'all'
33
+ Requires-Dist: olmocr>=0.4; extra == 'all'
34
+ Requires-Dist: openai>=1.40; extra == 'all'
35
+ Requires-Dist: pillow>=10.0; extra == 'all'
36
+ Requires-Dist: pypdf>=4.0; extra == 'all'
37
+ Provides-Extra: dev
38
+ Requires-Dist: freezegun>=1.5; extra == 'dev'
39
+ Requires-Dist: mypy>=1.10; extra == 'dev'
40
+ Requires-Dist: pytest-benchmark>=4.0; extra == 'dev'
41
+ Requires-Dist: pytest-cov>=5.0; extra == 'dev'
42
+ Requires-Dist: pytest-xdist>=3.6; extra == 'dev'
43
+ Requires-Dist: pytest>=8.0; extra == 'dev'
44
+ Requires-Dist: responses>=0.25; extra == 'dev'
45
+ Requires-Dist: ruff>=0.6; extra == 'dev'
46
+ Provides-Extra: mistral
47
+ Requires-Dist: mistralai<2,>=1.5; extra == 'mistral'
48
+ Provides-Extra: olmocr
49
+ Requires-Dist: olmocr>=0.4; extra == 'olmocr'
50
+ Requires-Dist: openai>=1.40; extra == 'olmocr'
51
+ Requires-Dist: pillow>=10.0; extra == 'olmocr'
52
+ Requires-Dist: pypdf>=4.0; extra == 'olmocr'
53
+ Provides-Extra: s3
54
+ Requires-Dist: boto3>=1.34; extra == 's3'
55
+ Description-Content-Type: text/markdown
56
+
57
+ # asta-papers
58
+
59
+ Legal-only paper full-text retrieval and conversion. Identifier (DOI / PMID /
60
+ PMCID / arXiv / Semantic Scholar Corpus ID) — or BYO PDF / JATS — to markdown
61
+ with explicit license classification.
62
+
63
+ Lifts paper recovery on biomedical literature from ~22% (Mistral OCR alone)
64
+ to ~85% using only publisher-blessed legal channels (NCBI E-utilities,
65
+ Unpaywall, EuropePMC, bioRxiv, institutional repositories).
66
+
67
+ ## Install
68
+
69
+ ```bash
70
+ pip install asta-papers # core (JATS conversion only)
71
+ pip install 'asta-papers[mistral]' # + Mistral OCR for PDFs
72
+ pip install 'asta-papers[olmocr]' # + local olmOCR for PDFs (offline)
73
+ pip install 'asta-papers[s3]' # + s3:// BYO support
74
+ pip install 'asta-papers[all]' # everything
75
+ ```
76
+
77
+ ## Quickstart
78
+
79
+ ```python
80
+ import os
81
+ from asta_papers import Client
82
+ from asta_papers.converters.mistral import MistralConverter
83
+
84
+ c = Client(
85
+ email="me@allenai.org",
86
+ ncbi_api_key=os.environ.get("NCBI_API_KEY"), # optional, lifts NCBI 3→10 rps
87
+ converters=[MistralConverter()], # for PDF→markdown
88
+ )
89
+
90
+ # By identifier
91
+ r = c.fetch(doi="10.1186/s12943-024-02093-w")
92
+ print(r.success, r.license_class, r.markdown[:200])
93
+
94
+ # Storage-tier policy
95
+ if r.may_redistribute: # CC BY / CC0 / CC BY-SA
96
+     save_artifact(r.bytes)
97
+ elif r.may_use_for_tdm: # TDM-permissive licenses; not BRONZE/UNKNOWN/CLOSED
98
+     extract_inline(r.markdown)
99
+
100
+ # BYO PDF — bytes, local path, or URI
101
+ r = c.fetch(pdf=b"%PDF-...")
102
+ r = c.fetch(pdf="paper.pdf")
103
+ r = c.fetch(pdf="s3://my-bucket/paper.pdf") # requires [s3]
104
+
105
+ # Batch with bounded concurrency + per-host rate limits
106
+ results = c.fetch_many([
107
+ {"doi": "10.1038/foo"},
108
+ {"pmcid": "PMC123"},
109
+ {"pdf": "paper.pdf", "doi": "10.99/local"},
110
+ ])
111
+ ```
112
+
113
+ ## How it works
114
+
115
+ A strategy ladder runs against legal aggregator APIs in order, returning the
116
+ first successful result:
117
+
118
+ 1. **PMC E-utilities efetch** — JATS XML for PMC OA Subset articles
119
+ 2. **NCBI elink** — PMID → PMC self-link when S2 didn't surface it
120
+ 3. **Published-version handoff** — when input is a preprint DOI, route to
121
+ the published version (bioRxiv API or Crossref `relation`) so callers
122
+ get the most-recent public version of the paper
123
+ 4. **arXiv** — direct PDF for arXiv papers
124
+ 5. **bioRxiv / medRxiv API** — JATS XML or PDF for preprints
125
+ 6. **Unpaywall** — best legal OA URL; routes PMC URLs through efetch
126
+ 7. **Institutional repo scrape** — `hdl.handle.net`, `pure.eur.nl`, etc.
127
+ (now respects `robots.txt`)
128
+ 8. **EuropePMC PDF render** — text-mining-licensed PDFs for free-to-read
129
+ papers NCBI's OA Subset doesn't include
130
+
131
+ Per-host token-bucket rate limiting honors every publisher's published quota
132
+ exactly. arXiv at 0.33 rps (their explicit rule). NCBI at 3 rps (10 with key)
133
+ shared across `eutils.*`, `pmc.*`, `www.*` hostnames. Independent services
134
+ run fully in parallel.
135
+
136
+ ## License classification
137
+
138
+ Every successful fetch carries a `LicenseClass`:
139
+
140
+ ```
141
+ cc-by cc-by-sa cc-by-nd cc-by-nc cc-by-nc-sa cc-by-nc-nd cc0
142
+ text-mining-only bronze arxiv-default closed unknown
143
+ ```
144
+
145
+ Plus helper booleans on `FetchResult`: `may_redistribute`,
146
+ `may_redistribute_nc`, `may_make_derivatives`, `may_train_models`,
147
+ `may_use_for_tdm`, plus a `source_type` field (`publisher` /
148
+ `repository` / `other`) for callers who want publisher-vs-repository
149
+ policy without parsing strings. Storage-tier policy is a one-line check.
150
+
151
+ Successful results also carry an attribution blockquote at the top of
152
+ the markdown by default (source URL + DOI/PMCID + license + retrieval
153
+ strategy), so the markdown is self-attributing when it travels to end
154
+ users. Disable with `Client(include_attribution=False)`.
155
+
156
+ ## What's NOT here
157
+
158
+ - No Sci-Hub, no archive scraping, no UA spoofing past WAFs.
159
+ - No title-only paper search (use PaperFinder / S2 first to get an identifier).
160
+ - No multi-tenant `Credentials` per-call object (one Client per credential set).
161
+ - No async API (sync only in v0.1).
162
+
163
+ ## Configuration
164
+
165
+ See `Client.__init__` and `docs/concepts.md` for the full list. Required:
166
+ `email` (Crossref polite-pool identifier; kwarg or `ASTA_PAPERS_EMAIL` env).
167
+ Recommended: `NCBI_API_KEY` env (free, 5-minute registration, 3.3× throughput).
168
+
169
+ ## Tests
170
+
171
+ ```bash
172
+ pytest tests/integration -v # 29 real-API tests
173
+ python tools/check_test_legitimacy.py --strict # asserts mock-ratio < 30%
174
+ ```
175
+
176
+ The full integration suite hits live upstream APIs — no mocks. Tests run in
177
+ ~90 seconds. A per-paper snapshot recovery benchmark (53 biomedical DOIs
178
+ that fail Mistral-OCR-only retrieval; 46/53 = 87% recovered) gates
179
+ regressions.
180
+
181
+ ## Design
182
+
183
+ Full design at [`docs/DESIGN.md`](docs/DESIGN.md).
@@ -0,0 +1,127 @@
1
+ # asta-papers
2
+
3
+ Legal-only paper full-text retrieval and conversion. Identifier (DOI / PMID /
4
+ PMCID / arXiv / Semantic Scholar Corpus ID) — or BYO PDF / JATS — to markdown
5
+ with explicit license classification.
6
+
7
+ Lifts paper recovery on biomedical literature from ~22% (Mistral OCR alone)
8
+ to ~85% using only publisher-blessed legal channels (NCBI E-utilities,
9
+ Unpaywall, EuropePMC, bioRxiv, institutional repositories).
10
+
11
+ ## Install
12
+
13
+ ```bash
14
+ pip install asta-papers # core (JATS conversion only)
15
+ pip install 'asta-papers[mistral]' # + Mistral OCR for PDFs
16
+ pip install 'asta-papers[olmocr]' # + local olmOCR for PDFs (offline)
17
+ pip install 'asta-papers[s3]' # + s3:// BYO support
18
+ pip install 'asta-papers[all]' # everything
19
+ ```
20
+
21
+ ## Quickstart
22
+
23
+ ```python
24
+ import os
25
+ from asta_papers import Client
26
+ from asta_papers.converters.mistral import MistralConverter
27
+
28
+ c = Client(
29
+ email="me@allenai.org",
30
+ ncbi_api_key=os.environ.get("NCBI_API_KEY"), # optional, lifts NCBI 3→10 rps
31
+ converters=[MistralConverter()], # for PDF→markdown
32
+ )
33
+
34
+ # By identifier
35
+ r = c.fetch(doi="10.1186/s12943-024-02093-w")
36
+ print(r.success, r.license_class, r.markdown[:200])
37
+
38
+ # Storage-tier policy
39
+ if r.may_redistribute: # CC BY / CC0 / CC BY-SA
40
+     save_artifact(r.bytes)
41
+ elif r.may_use_for_tdm: # TDM-permissive licenses; not BRONZE/UNKNOWN/CLOSED
42
+     extract_inline(r.markdown)
43
+
44
+ # BYO PDF — bytes, local path, or URI
45
+ r = c.fetch(pdf=b"%PDF-...")
46
+ r = c.fetch(pdf="paper.pdf")
47
+ r = c.fetch(pdf="s3://my-bucket/paper.pdf") # requires [s3]
48
+
49
+ # Batch with bounded concurrency + per-host rate limits
50
+ results = c.fetch_many([
51
+ {"doi": "10.1038/foo"},
52
+ {"pmcid": "PMC123"},
53
+ {"pdf": "paper.pdf", "doi": "10.99/local"},
54
+ ])
55
+ ```
56
+
57
+ ## How it works
58
+
59
+ A strategy ladder runs against legal aggregator APIs in order, returning the
60
+ first successful result:
61
+
62
+ 1. **PMC E-utilities efetch** — JATS XML for PMC OA Subset articles
63
+ 2. **NCBI elink** — PMID → PMC self-link when S2 didn't surface it
64
+ 3. **Published-version handoff** — when input is a preprint DOI, route to
65
+ the published version (bioRxiv API or Crossref `relation`) so callers
66
+ get the most-recent public version of the paper
67
+ 4. **arXiv** — direct PDF for arXiv papers
68
+ 5. **bioRxiv / medRxiv API** — JATS XML or PDF for preprints
69
+ 6. **Unpaywall** — best legal OA URL; routes PMC URLs through efetch
70
+ 7. **Institutional repo scrape** — `hdl.handle.net`, `pure.eur.nl`, etc.
71
+ (now respects `robots.txt`)
72
+ 8. **EuropePMC PDF render** — text-mining-licensed PDFs for free-to-read
73
+ papers NCBI's OA Subset doesn't include
74
+
75
+ Per-host token-bucket rate limiting honors every publisher's published quota
76
+ exactly. arXiv at 0.33 rps (their explicit rule). NCBI at 3 rps (10 with key)
77
+ shared across `eutils.*`, `pmc.*`, `www.*` hostnames. Independent services
78
+ run fully in parallel.
79
+
80
+ ## License classification
81
+
82
+ Every successful fetch carries a `LicenseClass`:
83
+
84
+ ```
85
+ cc-by cc-by-sa cc-by-nd cc-by-nc cc-by-nc-sa cc-by-nc-nd cc0
86
+ text-mining-only bronze arxiv-default closed unknown
87
+ ```
88
+
89
+ Plus helper booleans on `FetchResult`: `may_redistribute`,
90
+ `may_redistribute_nc`, `may_make_derivatives`, `may_train_models`,
91
+ `may_use_for_tdm`, plus a `source_type` field (`publisher` /
92
+ `repository` / `other`) for callers who want publisher-vs-repository
93
+ policy without parsing strings. Storage-tier policy is a one-line check.
94
+
95
+ Successful results also carry an attribution blockquote at the top of
96
+ the markdown by default (source URL + DOI/PMCID + license + retrieval
97
+ strategy), so the markdown is self-attributing when it travels to end
98
+ users. Disable with `Client(include_attribution=False)`.
99
+
100
+ ## What's NOT here
101
+
102
+ - No Sci-Hub, no archive scraping, no UA spoofing past WAFs.
103
+ - No title-only paper search (use PaperFinder / S2 first to get an identifier).
104
+ - No multi-tenant `Credentials` per-call object (one Client per credential set).
105
+ - No async API (sync only in v0.1).
106
+
107
+ ## Configuration
108
+
109
+ See `Client.__init__` and `docs/concepts.md` for the full list. Required:
110
+ `email` (Crossref polite-pool identifier; kwarg or `ASTA_PAPERS_EMAIL` env).
111
+ Recommended: `NCBI_API_KEY` env (free, 5-minute registration, 3.3× throughput).
112
+
113
+ ## Tests
114
+
115
+ ```bash
116
+ pytest tests/integration -v # 29 real-API tests
117
+ python tools/check_test_legitimacy.py --strict # asserts mock-ratio < 30%
118
+ ```
119
+
120
+ The full integration suite hits live upstream APIs — no mocks. Tests run in
121
+ ~90 seconds. A per-paper snapshot recovery benchmark (53 biomedical DOIs
122
+ that fail Mistral-OCR-only retrieval; 46/53 = 87% recovered) gates
123
+ regressions.
124
+
125
+ ## Design
126
+
127
+ Full design at [`docs/DESIGN.md`](docs/DESIGN.md).
@@ -0,0 +1,195 @@
1
+ # Changelog
2
+
3
+ All notable changes to `asta-papers`. Format: [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
4
+ Versioning: [SemVer](https://semver.org/).
5
+
6
+ ## [Unreleased]
7
+
8
+ ### Added — license-aware fields and policies
9
+
10
+ - **`LicenseClass.ARXIV_DEFAULT`**: arXiv's non-exclusive distribution
11
+ grant. Distinct from `UNKNOWN` — callers know they have at least the
12
+ arXiv default rights (academic / non-commercial redistribution). The
13
+ `arxiv` strategy now returns `ARXIV_DEFAULT` instead of `UNKNOWN`.
14
+ - **`SourceType` enum** (`PUBLISHER`, `REPOSITORY`, `OTHER`) and a
15
+ matching `FetchResult.source_type` field. Promoted from Unpaywall's
16
+ `host_type` in `unpaywall`; inferred from the strategy in
17
+ `arxiv`/`biorxiv`/`pmc-efetch`/`europepmc-pdf`/`repo-scrape`. Lets
18
+ callers implement publisher-vs-repository policy without parsing
19
+ `notes` strings.
20
+ - **`FetchProvenance.enriched_identifiers`**: dict of identifiers that
21
+ were added by the Client's enrichment step (S2 corpus_id resolve, NCBI
22
+ ID converter). Empty when the caller supplied all identifiers
23
+ themselves. Audit trail for "did this fetch hit PMC because the caller
24
+ asked, or because enrichment supplied a PMCID?"
25
+ - **Attribution blockquote** auto-prepended to converted markdown so the
26
+ output is self-attributing when it travels to end users. Includes
27
+ source URL, DOI/PMCID, license class + URL, and the strategy that
28
+ retrieved the bytes. Controlled by a new `Client(include_attribution=True)`
29
+ flag (default on); pass `False` for pure-inference workflows that
30
+ prefer clean prose.
31
+ - **Bronze fallback** in `unpaywall`: when Unpaywall says `is_oa=True`
32
+ with `host_type=publisher` but no `license` tag, the result is now
33
+ classified as `LicenseClass.BRONZE` (previously fell through to
34
+ `UNKNOWN`). A note `unpaywall:no-license-tag-classified-as-bronze` is
35
+ added so the audit trail records the inference.
36
+ - **Robots.txt check** in `repo_scrape`. Before scraping an
37
+ institutional repo's landing page (and again for the discovered PDF
38
+ URL), the strategy checks the host's `robots.txt` and skips disallowed
39
+ paths. The check uses stdlib `urllib.robotparser`, is cached per host
40
+ via `ctx.api_cache`, and fails open (allows the scrape if
41
+ `robots.txt` is unreachable — the permissive default, since most
42
+ repos don't ship a `robots.txt`).
43
+
44
+ ### Changed — strategy ladder reordered
45
+
46
+ Default ladder now runs `published-version` *before* the preprint-fetching
47
+ strategies (`arxiv`, `biorxiv`). Realises the user-facing contract "give
48
+ me the most recent public version of the paper": when a preprint DOI
49
+ maps to a published version with a usable license, fetch the published
50
+ version instead of the preprint. Returns None for non-preprint DOIs and
51
+ falls through to the preprint strategies. The `result.identifiers`
52
+ field always reflects the actually-fetched DOI, so callers can compare
53
+ against the request to detect handoffs.
54
+
55
+ ### Documented — NIH/PMC and publisher TOS posture
56
+
57
+ Researched and documented the AI-training-vs-inference distinction in
58
+ `docs/licensing.md`. Key takeaways:
59
+
60
+ - PMC's E-Utilities (the `pmc-efetch` strategy) is on the explicit list
61
+ of permitted automated retrieval services in PMC's copyright policy —
62
+ not just tolerated, explicitly named.
63
+ - PMC does NOT carve AI training out of "text mining" in its TOS.
64
+ - The pressure to do so is coming from individual publishers (Elsevier,
65
+ Wiley, etc.) via website footers added since 2024 — not from PMC, NIH,
66
+ EuropePMC, or arXiv. Whether those footers can override CC-BY on
67
+ individual articles is legally contested.
68
+ - Practical implication: when a downstream caller is **training a model**
69
+ (not just running inference), they should filter
70
+ `source_type == SourceType.REPOSITORY` to avoid the publisher-CDN
71
+ paths whose TOS may reserve AI rights. Library defaults remain right
72
+ for the inference use case.
73
+
74
+ ### Changed — `may_use_for_tdm` semantics tightened
75
+
76
+ Previously returned `True` for any successful fetch — over-permissive
77
+ because it equated "library may ingest" with "caller may run TDM
78
+ downstream." Now returns `True` only for license classes that grant TDM
79
+ explicitly (CC*, `TEXT_MINING_ONLY`, `ARXIV_DEFAULT`, etc.). Returns
80
+ `False` for `BRONZE`, `UNKNOWN`, and `CLOSED` because TDM rights for
81
+ those are ambiguous. Callers should pair with `may_redistribute` for
82
+ redistribution gates; `may_use_for_tdm` is now strictly about
83
+ *caller-side* TDM permission.
84
+
85
+ ### Fixed — JATS converter coverage
86
+
87
+ Driven by a 26-paper diverse stress-test corpus spanning 9 publishers
88
+ (PNAS, Cell, Lancet, JAMA, Nature, Science, PLOS, BMC, MDPI). Element-by-element
89
+ audit found 10 categories of silently-dropped content; all closed.
90
+
91
+ - **Walk `<floats-group>`** sibling of `<body>`. Recovers figures and tables
92
+ for MDPI, AACR, JCI, OUP, NEJM (≈ half the PMC corpus by publisher).
93
+ - **Descend into `<p>` block children** (fig / table-wrap / disp-formula /
94
+ list / supplementary-material / boxed-text / def-list / ack / app /
95
+ media / chem-struct-wrap / sub-article / glossary). Recovers
96
+ Nature/BMC/OUP figures wrapped inside body paragraphs.
97
+ - **Walk all `<back>` children**, not just `<ref-list>`. Recovers `<ack>`,
98
+ `<app>`, `<glossary>` (previously dropped in 19 of 26 papers).
99
+ - **Strip whitespace-only `<label>`** before emitting caption blocks. Lancet and
100
+ similar publishers emit `<label>\n</label>`, which previously produced
101
+ malformed `**\n: caption**` blocks that downstream regexes couldn't
102
+ match.
103
+ - **Render `<def-list>`** as `**term**: definition` lines.
104
+ - **Render `<ack>` / `<app>` / `<app-group>` / `<glossary>`** as labelled
105
+ sections, honouring the publisher's `<title>` if present.
106
+ - **Nested `<list>` inside `<list-item>`** now dispatched as a block
107
+ (preserves nested-list structure instead of flattening).
108
+ - **Tables now honour `<thead>` / `<tbody>` / `<tfoot>`**: thead rows
109
+ become the markdown header (multi-row headers preserved as pre-header
110
+ body rows); cells from nested tables can no longer bleed into the
111
+ parent table's row set.
112
+ - **`<inline-formula>` MathML fallback**: when no `<tex-math>` is present
113
+ but the formula contains `<mml:math>`, render plaintext content as
114
+ `$mathml-text$` (or `$[math]$` if empty). Math-heavy papers no longer
115
+ have empty formula slots in the markdown.
116
+ - **`<disp-formula>` always emits `$$...$$`** regardless of source
117
+ (tex-math / MathML / plaintext sub-sup / equation-graphic). Display
118
+ math is now machine-detectable in markdown without inspecting the JATS.
119
+ - **`<supplementary-material>` and `<table-wrap-foot>`** rendered. Surfaces
120
+ supplementary file captions and table abbreviation glossaries needed to
121
+ interpret cells.
122
+
123
+ ### Tested
124
+
125
+ - Unit suite grew from 113 to 121 tests (8 new tests in
126
+ `test_jats_to_md.py` lock the behaviours above).
127
+ - Full unit suite passes; no regressions.
128
+ - Stress-audit drop counts (papers where JATS XML had > 0 of an element
129
+ but markdown had 0): figures 2 → 0; tables 1 → 0; disp-formula 2 → 0;
130
+ inline-formula 1 → 0; ack 19 → 3 (the remaining 3 are publisher-custom
131
+ titles like "Sources of Funding:" — converter is correct, audit-script
132
+ is too narrow); def-list 3 → 0; app 3 → 0; glossary 4 → 0 (same
133
+ custom-title caveat as ack).
134
+
135
+ ### Known gaps (deferred)
136
+
137
+ A separate legal review surfaced **3 P0 findings** that the *caller*
138
+ should be aware of when implementing license-aware vending policy. See
139
+ [`docs/licensing.md`](licensing.md#known-caveats) for the current list and
140
+ recommended mitigations.
141
+
142
+ ## [0.0.1] — 2026-05-06
143
+
144
+ Initial release.
145
+
146
+ ### Added
147
+
148
+ - `Client` (sync) with `fetch(...)` and `fetch_many(...)` entry points.
149
+ - 8-strategy ladder: PMC efetch, NCBI elink, arXiv, bioRxiv API, Unpaywall,
150
+ bioRxiv published-version handoff (with Crossref `relation` fallback for
151
+ non-bioRxiv DOIs), institutional repo scrape, EuropePMC PDF render.
152
+ - BYO inputs via `pdf=` and `jats=` accepting bytes / `Path` / URI
153
+ (`file://`, `http(s)://`, `s3://`).
154
+ - Pluggable converters: built-in `JatsConverter` (stdlib only),
155
+ `MistralConverter` (extra `[mistral]`), `OlmocrConverter` (extra `[olmocr]`).
156
+ - Per-host token-bucket `RateLimiter` with hostname aliasing — independent
157
+ services run in parallel, same-service workers serialize at the configured
158
+ rps.
159
+ - License classifier producing one of 11 `LicenseClass` values; `FetchResult`
160
+ helpers `may_redistribute`, `may_redistribute_nc`, `may_make_derivatives`,
161
+ `may_train_models`, `may_use_for_tdm`.
162
+ - `FileCache` keyed by canonical identifier set; multi-key aliasing for
163
+ cross-identifier hits.
164
+ - Identifier enrichment: NCBI ID converter (DOI ↔ PMID/PMCID) and S2 (Corpus
165
+ ID → others). Off by default; opt-in via `enrich_identifiers=True` (or
166
+ always-on for corpus-id-only inputs).
167
+ - `tools/check_test_legitimacy.py` — fails CI if mock ratio exceeds 30% or any
168
+ strategy has no real-API test.
169
+
170
+ ### Tested
171
+
172
+ - 113 unit tests (license classifier, identifier normalization, rate limiter
173
+ alias chain, FileCache round-trip, JATS converter, FetchResult helpers,
174
+ config redaction).
175
+ - 29 real-API integration tests, zero mocks (verified by the legitimacy
176
+ sweep).
177
+ - Per-paper snapshot recovery benchmark (53 biomedical DOIs that fail
178
+ Mistral-OCR-only retrieval): 46/53 (86.8%) recovered. Diff-based to
179
+ catch regressions on specific DOIs.
180
+ - Rate-limit timing tests with real wall-clock assertions: arXiv 0.33 rps
181
+ strict, NCBI 3 rps shared across `eutils`/`pmc`/`www` hostnames, service
182
+ independence under 4-thread concurrency, 100-acquire concurrent safety.
183
+
184
+ ### Known caveats
185
+
186
+ - `mistralai` pinned to `>=1.5,<2`; v2 reorganized imports and breaks the
187
+ converter.
188
+ - `olmocr` requires LM Studio (or compatible) loaded with **8k+ context** for
189
+ reliable output on 2-column biomedical PDFs. Default 4k context yields
190
+ ~67% per-page failure rate.
191
+ - Per-Client rate limiter — multi-instance deployments don't share quota.
192
+ Distributed limiter deferred to v0.2.
193
+ - No async API. Sync only.
194
+ - No multi-tenant `Credentials` per-call object. One Client per credential set.
195
+ - `gs://` BYO is not implemented (placeholder enum value reserved).