asta-papers 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- asta_papers-0.0.1/.gitignore +135 -0
- asta_papers-0.0.1/LICENSE +21 -0
- asta_papers-0.0.1/PKG-INFO +183 -0
- asta_papers-0.0.1/README.md +127 -0
- asta_papers-0.0.1/docs/CHANGELOG.md +195 -0
- asta_papers-0.0.1/docs/DESIGN.md +664 -0
- asta_papers-0.0.1/docs/agents.md +155 -0
- asta_papers-0.0.1/docs/byo.md +128 -0
- asta_papers-0.0.1/docs/concepts.md +142 -0
- asta_papers-0.0.1/docs/converters.md +184 -0
- asta_papers-0.0.1/docs/index.md +22 -0
- asta_papers-0.0.1/docs/licensing.md +242 -0
- asta_papers-0.0.1/docs/quickstart.md +127 -0
- asta_papers-0.0.1/docs/rate-limiting.md +109 -0
- asta_papers-0.0.1/docs/troubleshooting.md +174 -0
- asta_papers-0.0.1/pyproject.toml +95 -0
- asta_papers-0.0.1/src/asta_papers/__init__.py +63 -0
- asta_papers-0.0.1/src/asta_papers/byo.py +82 -0
- asta_papers-0.0.1/src/asta_papers/cache.py +194 -0
- asta_papers-0.0.1/src/asta_papers/client.py +561 -0
- asta_papers-0.0.1/src/asta_papers/config.py +174 -0
- asta_papers-0.0.1/src/asta_papers/converters/__init__.py +36 -0
- asta_papers-0.0.1/src/asta_papers/converters/_jats_core.py +435 -0
- asta_papers-0.0.1/src/asta_papers/converters/jats.py +24 -0
- asta_papers-0.0.1/src/asta_papers/converters/mistral.py +114 -0
- asta_papers-0.0.1/src/asta_papers/converters/olmocr.py +148 -0
- asta_papers-0.0.1/src/asta_papers/enums.py +89 -0
- asta_papers-0.0.1/src/asta_papers/errors.py +26 -0
- asta_papers-0.0.1/src/asta_papers/identifiers.py +195 -0
- asta_papers-0.0.1/src/asta_papers/license.py +83 -0
- asta_papers-0.0.1/src/asta_papers/rate_limit.py +86 -0
- asta_papers-0.0.1/src/asta_papers/result.py +152 -0
- asta_papers-0.0.1/src/asta_papers/strategies/__init__.py +107 -0
- asta_papers-0.0.1/src/asta_papers/strategies/_common.py +187 -0
- asta_papers-0.0.1/src/asta_papers/strategies/arxiv.py +40 -0
- asta_papers-0.0.1/src/asta_papers/strategies/biorxiv.py +82 -0
- asta_papers-0.0.1/src/asta_papers/strategies/europepmc_pdf.py +84 -0
- asta_papers-0.0.1/src/asta_papers/strategies/pmc_efetch.py +81 -0
- asta_papers-0.0.1/src/asta_papers/strategies/pmid_elink.py +56 -0
- asta_papers-0.0.1/src/asta_papers/strategies/published_version.py +96 -0
- asta_papers-0.0.1/src/asta_papers/strategies/repo_scrape.py +130 -0
- asta_papers-0.0.1/src/asta_papers/strategies/unpaywall.py +128 -0
- asta_papers-0.0.1/tests/conftest.py +41 -0
- asta_papers-0.0.1/tests/integration/recovery_baseline.json +58 -0
- asta_papers-0.0.1/tests/integration/test_byo_real.py +86 -0
- asta_papers-0.0.1/tests/integration/test_pmc_efetch_real.py +65 -0
- asta_papers-0.0.1/tests/integration/test_rate_limits_real.py +146 -0
- asta_papers-0.0.1/tests/integration/test_recovery_benchmark.py +107 -0
- asta_papers-0.0.1/tests/integration/test_strategies_real.py +92 -0
- asta_papers-0.0.1/tests/unit/test_cache.py +110 -0
- asta_papers-0.0.1/tests/unit/test_config.py +80 -0
- asta_papers-0.0.1/tests/unit/test_identifiers.py +156 -0
- asta_papers-0.0.1/tests/unit/test_jats_to_md.py +270 -0
- asta_papers-0.0.1/tests/unit/test_license_classifier.py +92 -0
- asta_papers-0.0.1/tests/unit/test_rate_limiter.py +68 -0
- asta_papers-0.0.1/tests/unit/test_result.py +221 -0
- asta_papers-0.0.1/tools/check_test_legitimacy.py +133 -0
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# =========================
|
|
2
|
+
# OS / System files
|
|
3
|
+
# =========================
|
|
4
|
+
.DS_Store
|
|
5
|
+
.DS_Store?
|
|
6
|
+
._*
|
|
7
|
+
.Spotlight-V100
|
|
8
|
+
.Trashes
|
|
9
|
+
Thumbs.db
|
|
10
|
+
ehthumbs.db
|
|
11
|
+
Desktop.ini
|
|
12
|
+
|
|
13
|
+
# =========================
|
|
14
|
+
# Editor / IDE files
|
|
15
|
+
# =========================
|
|
16
|
+
.vscode/
|
|
17
|
+
.idea/
|
|
18
|
+
*.swp
|
|
19
|
+
*.swo
|
|
20
|
+
*.swn
|
|
21
|
+
*~
|
|
22
|
+
*.bak
|
|
23
|
+
*.tmp
|
|
24
|
+
*.orig
|
|
25
|
+
|
|
26
|
+
# =========================
|
|
27
|
+
# Logs
|
|
28
|
+
# =========================
|
|
29
|
+
logs/
|
|
30
|
+
*.log
|
|
31
|
+
npm-debug.log*
|
|
32
|
+
yarn-debug.log*
|
|
33
|
+
yarn-error.log*
|
|
34
|
+
pnpm-debug.log*
|
|
35
|
+
|
|
36
|
+
# =========================
|
|
37
|
+
# Environment variables / secrets
|
|
38
|
+
# =========================
|
|
39
|
+
.env
|
|
40
|
+
.env.*
|
|
41
|
+
!.env.example
|
|
42
|
+
*.pem
|
|
43
|
+
*.key
|
|
44
|
+
*.crt
|
|
45
|
+
|
|
46
|
+
# =========================
|
|
47
|
+
# Build / dist artifacts
|
|
48
|
+
# =========================
|
|
49
|
+
dist/
|
|
50
|
+
build/
|
|
51
|
+
out/
|
|
52
|
+
target/
|
|
53
|
+
obj/
|
|
54
|
+
|
|
55
|
+
# =========================
|
|
56
|
+
# Dependency directories
|
|
57
|
+
# =========================
|
|
58
|
+
node_modules/
|
|
59
|
+
vendor/
|
|
60
|
+
.venv/
|
|
61
|
+
venv/
|
|
62
|
+
env/
|
|
63
|
+
__pycache__/
|
|
64
|
+
|
|
65
|
+
# Yarn
|
|
66
|
+
**/.yarn/*
|
|
67
|
+
!.yarn/patches
|
|
68
|
+
!.yarn/plugins
|
|
69
|
+
!.yarn/releases
|
|
70
|
+
!.yarn/sdks
|
|
71
|
+
!.yarn/versions
|
|
72
|
+
.pnp.*
|
|
73
|
+
|
|
74
|
+
# =========================
|
|
75
|
+
# Test / coverage output
|
|
76
|
+
# =========================
|
|
77
|
+
coverage/
|
|
78
|
+
.coverage
|
|
79
|
+
.nyc_output/
|
|
80
|
+
test-results/
|
|
81
|
+
.pytest_cache/
|
|
82
|
+
|
|
83
|
+
# =========================
|
|
84
|
+
# Temporary / cache files
|
|
85
|
+
# =========================
|
|
86
|
+
.cache/
|
|
87
|
+
.tmp/
|
|
88
|
+
.temp/
|
|
89
|
+
*.cache
|
|
90
|
+
|
|
91
|
+
# =========================
|
|
92
|
+
# Archive files
|
|
93
|
+
# =========================
|
|
94
|
+
*.zip
|
|
95
|
+
*.tar
|
|
96
|
+
*.tar.gz
|
|
97
|
+
*.rar
|
|
98
|
+
*.7z
|
|
99
|
+
|
|
100
|
+
# =========================
|
|
101
|
+
# Runtime files
|
|
102
|
+
# =========================
|
|
103
|
+
*.pid
|
|
104
|
+
*.seed
|
|
105
|
+
*.pid.lock
|
|
106
|
+
|
|
107
|
+
# =========================
|
|
108
|
+
# Misc
|
|
109
|
+
# =========================
|
|
110
|
+
*.lock
|
|
111
|
+
!.gitignore
|
|
112
|
+
|
|
113
|
+
# =========================
|
|
114
|
+
# Terraform
|
|
115
|
+
# =========================
|
|
116
|
+
*.tfstate
|
|
117
|
+
*.tfstate.*
|
|
118
|
+
.terraform/
|
|
119
|
+
.terraform.lock.hcl
|
|
120
|
+
infra/terraform.tfvars
|
|
121
|
+
|
|
122
|
+
# =========================
|
|
123
|
+
# Generated deploy files
|
|
124
|
+
# =========================
|
|
125
|
+
_asta_modal_*.py
|
|
126
|
+
|
|
127
|
+
# =========================
|
|
128
|
+
# Python packaging
|
|
129
|
+
# =========================
|
|
130
|
+
*.egg-info/
|
|
131
|
+
|
|
132
|
+
# =========================
|
|
133
|
+
# Claude Code worktrees
|
|
134
|
+
# =========================
|
|
135
|
+
.claude/worktrees/
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Allen Institute for AI
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: asta-papers
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Legal-only paper full-text retrieval and conversion. DOI/PMID/PMCID/arXiv/Corpus ID + BYO PDF/JATS → markdown with license classification.
|
|
5
|
+
Project-URL: Homepage, https://github.com/allenai/asta-sdk
|
|
6
|
+
Project-URL: Repository, https://github.com/allenai/asta-sdk
|
|
7
|
+
Project-URL: Issues, https://github.com/allenai/asta-sdk/issues
|
|
8
|
+
Project-URL: Documentation, https://github.com/allenai/asta-sdk/tree/main/src/python/asta/papers/docs
|
|
9
|
+
Project-URL: Changelog, https://github.com/allenai/asta-sdk/blob/main/src/python/asta/papers/docs/CHANGELOG.md
|
|
10
|
+
Author: Allen Institute for AI
|
|
11
|
+
License: MIT
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Keywords: arxiv,europepmc,jats,license-classification,ocr,open-access,pdf,pubmed,scientific-papers,text-mining,unpaywall
|
|
14
|
+
Classifier: Development Status :: 4 - Beta
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: Intended Audience :: Science/Research
|
|
17
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
18
|
+
Classifier: Operating System :: OS Independent
|
|
19
|
+
Classifier: Programming Language :: Python :: 3
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
24
|
+
Classifier: Topic :: Scientific/Engineering
|
|
25
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
26
|
+
Classifier: Topic :: Text Processing :: Markup :: XML
|
|
27
|
+
Classifier: Typing :: Typed
|
|
28
|
+
Requires-Python: >=3.10
|
|
29
|
+
Requires-Dist: requests>=2.31
|
|
30
|
+
Provides-Extra: all
|
|
31
|
+
Requires-Dist: boto3>=1.34; extra == 'all'
|
|
32
|
+
Requires-Dist: mistralai<2,>=1.5; extra == 'all'
|
|
33
|
+
Requires-Dist: olmocr>=0.4; extra == 'all'
|
|
34
|
+
Requires-Dist: openai>=1.40; extra == 'all'
|
|
35
|
+
Requires-Dist: pillow>=10.0; extra == 'all'
|
|
36
|
+
Requires-Dist: pypdf>=4.0; extra == 'all'
|
|
37
|
+
Provides-Extra: dev
|
|
38
|
+
Requires-Dist: freezegun>=1.5; extra == 'dev'
|
|
39
|
+
Requires-Dist: mypy>=1.10; extra == 'dev'
|
|
40
|
+
Requires-Dist: pytest-benchmark>=4.0; extra == 'dev'
|
|
41
|
+
Requires-Dist: pytest-cov>=5.0; extra == 'dev'
|
|
42
|
+
Requires-Dist: pytest-xdist>=3.6; extra == 'dev'
|
|
43
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
44
|
+
Requires-Dist: responses>=0.25; extra == 'dev'
|
|
45
|
+
Requires-Dist: ruff>=0.6; extra == 'dev'
|
|
46
|
+
Provides-Extra: mistral
|
|
47
|
+
Requires-Dist: mistralai<2,>=1.5; extra == 'mistral'
|
|
48
|
+
Provides-Extra: olmocr
|
|
49
|
+
Requires-Dist: olmocr>=0.4; extra == 'olmocr'
|
|
50
|
+
Requires-Dist: openai>=1.40; extra == 'olmocr'
|
|
51
|
+
Requires-Dist: pillow>=10.0; extra == 'olmocr'
|
|
52
|
+
Requires-Dist: pypdf>=4.0; extra == 'olmocr'
|
|
53
|
+
Provides-Extra: s3
|
|
54
|
+
Requires-Dist: boto3>=1.34; extra == 's3'
|
|
55
|
+
Description-Content-Type: text/markdown
|
|
56
|
+
|
|
57
|
+
# asta-papers
|
|
58
|
+
|
|
59
|
+
Legal-only paper full-text retrieval and conversion. Identifier (DOI / PMID /
|
|
60
|
+
PMCID / arXiv / Semantic Scholar Corpus ID) — or BYO PDF / JATS — to markdown
|
|
61
|
+
with explicit license classification.
|
|
62
|
+
|
|
63
|
+
Lifts paper recovery on biomedical literature from ~22% (Mistral OCR alone)
|
|
64
|
+
to ~85% using only publisher-blessed legal channels (NCBI E-utilities,
|
|
65
|
+
Unpaywall, EuropePMC, bioRxiv, institutional repositories).
|
|
66
|
+
|
|
67
|
+
## Install
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
pip install asta-papers # core (JATS conversion only)
|
|
71
|
+
pip install 'asta-papers[mistral]' # + Mistral OCR for PDFs
|
|
72
|
+
pip install 'asta-papers[olmocr]' # + local olmOCR for PDFs (offline)
|
|
73
|
+
pip install 'asta-papers[s3]' # + s3:// BYO support
|
|
74
|
+
pip install 'asta-papers[all]' # everything
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Quickstart
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
import os
|
|
81
|
+
from asta_papers import Client
|
|
82
|
+
from asta_papers.converters.mistral import MistralConverter
|
|
83
|
+
|
|
84
|
+
c = Client(
|
|
85
|
+
email="me@allenai.org",
|
|
86
|
+
ncbi_api_key=os.environ.get("NCBI_API_KEY"), # optional, lifts NCBI 3→10 rps
|
|
87
|
+
converters=[MistralConverter()], # for PDF→markdown
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
# By identifier
|
|
91
|
+
r = c.fetch(doi="10.1186/s12943-024-02093-w")
|
|
92
|
+
print(r.success, r.license_class, r.markdown[:200])
|
|
93
|
+
|
|
94
|
+
# Storage-tier policy
|
|
95
|
+
if r.may_redistribute: # CC BY / CC0 / CC BY-SA
|
|
96
|
+
save_artifact(r.bytes)
|
|
97
|
+
elif r.may_use_for_tdm: # TDM-permissive licenses; not BRONZE/UNKNOWN/CLOSED
|
|
98
|
+
extract_inline(r.markdown)
|
|
99
|
+
|
|
100
|
+
# BYO PDF — bytes, local path, or URI
|
|
101
|
+
r = c.fetch(pdf=b"%PDF-...")
|
|
102
|
+
r = c.fetch(pdf="paper.pdf")
|
|
103
|
+
r = c.fetch(pdf="s3://my-bucket/paper.pdf") # requires [s3]
|
|
104
|
+
|
|
105
|
+
# Batch with bounded concurrency + per-host rate limits
|
|
106
|
+
results = c.fetch_many([
|
|
107
|
+
{"doi": "10.1038/foo"},
|
|
108
|
+
{"pmcid": "PMC123"},
|
|
109
|
+
{"pdf": "paper.pdf", "doi": "10.99/local"},
|
|
110
|
+
])
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## How it works
|
|
114
|
+
|
|
115
|
+
A strategy ladder runs against legal aggregator APIs in order, returning the
|
|
116
|
+
first successful result:
|
|
117
|
+
|
|
118
|
+
1. **PMC E-utilities efetch** — JATS XML for PMC OA Subset articles
|
|
119
|
+
2. **NCBI elink** — PMID → PMC self-link when S2 didn't surface it
|
|
120
|
+
3. **Published-version handoff** — when input is a preprint DOI, route to
|
|
121
|
+
the published version (bioRxiv API or Crossref `relation`) so callers
|
|
122
|
+
get the most-recent public version of the paper
|
|
123
|
+
4. **arXiv** — direct PDF for arXiv papers
|
|
124
|
+
5. **bioRxiv / medRxiv API** — JATS XML or PDF for preprints
|
|
125
|
+
6. **Unpaywall** — best legal OA URL; routes PMC URLs through efetch
|
|
126
|
+
7. **Institutional repo scrape** — `hdl.handle.net`, `pure.eur.nl`, etc.
|
|
127
|
+
(now respects `robots.txt`)
|
|
128
|
+
8. **EuropePMC PDF render** — text-mining-licensed PDFs for free-to-read
|
|
129
|
+
papers NCBI's OA Subset doesn't include
|
|
130
|
+
|
|
131
|
+
Per-host token-bucket rate limiting honors every publisher's published quota
|
|
132
|
+
exactly. arXiv at 0.33 rps (their explicit rule). NCBI at 3 rps (10 with key)
|
|
133
|
+
shared across `eutils.*`, `pmc.*`, `www.*` hostnames. Independent services
|
|
134
|
+
run fully in parallel.
|
|
135
|
+
|
|
136
|
+
## License classification
|
|
137
|
+
|
|
138
|
+
Every successful fetch carries a `LicenseClass`:
|
|
139
|
+
|
|
140
|
+
```
|
|
141
|
+
cc-by cc-by-sa cc-by-nd cc-by-nc cc-by-nc-sa cc-by-nc-nd cc0
|
|
142
|
+
text-mining-only bronze arxiv-default closed unknown
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
Plus helper booleans on `FetchResult`: `may_redistribute`,
|
|
146
|
+
`may_redistribute_nc`, `may_make_derivatives`, `may_train_models`,
|
|
147
|
+
`may_use_for_tdm`, plus a `source_type` field (`publisher` /
|
|
148
|
+
`repository` / `other`) for callers who want publisher-vs-repository
|
|
149
|
+
policy without parsing strings. Storage-tier policy is a one-line check.
|
|
150
|
+
|
|
151
|
+
Successful results also carry an attribution blockquote at the top of
|
|
152
|
+
the markdown by default (source URL + DOI/PMCID + license + retrieval
|
|
153
|
+
strategy), so the markdown is self-attributing when it travels to end
|
|
154
|
+
users. Disable with `Client(include_attribution=False)`.
|
|
155
|
+
|
|
156
|
+
## What's NOT here
|
|
157
|
+
|
|
158
|
+
- No Sci-Hub, no archive scraping, no UA spoofing past WAFs.
|
|
159
|
+
- No title-only paper search (use PaperFinder / S2 first to get an identifier).
|
|
160
|
+
- No multi-tenant `Credentials` per-call object (one Client per credential set).
|
|
161
|
+
- No async API (sync only in this release).
|
|
162
|
+
|
|
163
|
+
## Configuration
|
|
164
|
+
|
|
165
|
+
See `Client.__init__` and `docs/concepts.md` for the full list. Required:
|
|
166
|
+
`email` (Crossref polite-pool identifier; kwarg or `ASTA_PAPERS_EMAIL` env).
|
|
167
|
+
Recommended: `NCBI_API_KEY` env (free, 5-minute registration, 3.3× throughput).
|
|
168
|
+
|
|
169
|
+
## Tests
|
|
170
|
+
|
|
171
|
+
```bash
|
|
172
|
+
pytest tests/integration -v # 29 real-API tests
|
|
173
|
+
python tools/check_test_legitimacy.py --strict # asserts mock-ratio < 30%
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
The full integration suite hits live upstream APIs — no mocks. Tests run in
|
|
177
|
+
~90 seconds. A per-paper snapshot recovery benchmark (53 biomedical DOIs
|
|
178
|
+
that fail Mistral-OCR-only retrieval; 46/53 = 87% recovered) gates
|
|
179
|
+
regressions.
|
|
180
|
+
|
|
181
|
+
## Design
|
|
182
|
+
|
|
183
|
+
Full design at [`docs/DESIGN.md`](docs/DESIGN.md).
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
# asta-papers
|
|
2
|
+
|
|
3
|
+
Legal-only paper full-text retrieval and conversion. Identifier (DOI / PMID /
|
|
4
|
+
PMCID / arXiv / Semantic Scholar Corpus ID) — or BYO PDF / JATS — to markdown
|
|
5
|
+
with explicit license classification.
|
|
6
|
+
|
|
7
|
+
Lifts paper recovery on biomedical literature from ~22% (Mistral OCR alone)
|
|
8
|
+
to ~85% using only publisher-blessed legal channels (NCBI E-utilities,
|
|
9
|
+
Unpaywall, EuropePMC, bioRxiv, institutional repositories).
|
|
10
|
+
|
|
11
|
+
## Install
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
pip install asta-papers # core (JATS conversion only)
|
|
15
|
+
pip install 'asta-papers[mistral]' # + Mistral OCR for PDFs
|
|
16
|
+
pip install 'asta-papers[olmocr]' # + local olmOCR for PDFs (offline)
|
|
17
|
+
pip install 'asta-papers[s3]' # + s3:// BYO support
|
|
18
|
+
pip install 'asta-papers[all]' # everything
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Quickstart
|
|
22
|
+
|
|
23
|
+
```python
|
|
24
|
+
import os
|
|
25
|
+
from asta_papers import Client
|
|
26
|
+
from asta_papers.converters.mistral import MistralConverter
|
|
27
|
+
|
|
28
|
+
c = Client(
|
|
29
|
+
email="me@allenai.org",
|
|
30
|
+
ncbi_api_key=os.environ.get("NCBI_API_KEY"), # optional, lifts NCBI 3→10 rps
|
|
31
|
+
converters=[MistralConverter()], # for PDF→markdown
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
# By identifier
|
|
35
|
+
r = c.fetch(doi="10.1186/s12943-024-02093-w")
|
|
36
|
+
print(r.success, r.license_class, r.markdown[:200])
|
|
37
|
+
|
|
38
|
+
# Storage-tier policy
|
|
39
|
+
if r.may_redistribute: # CC BY / CC0 / CC BY-SA
|
|
40
|
+
save_artifact(r.bytes)
|
|
41
|
+
elif r.may_use_for_tdm: # TDM-permissive licenses; not BRONZE/UNKNOWN/CLOSED
|
|
42
|
+
extract_inline(r.markdown)
|
|
43
|
+
|
|
44
|
+
# BYO PDF — bytes, local path, or URI
|
|
45
|
+
r = c.fetch(pdf=b"%PDF-...")
|
|
46
|
+
r = c.fetch(pdf="paper.pdf")
|
|
47
|
+
r = c.fetch(pdf="s3://my-bucket/paper.pdf") # requires [s3]
|
|
48
|
+
|
|
49
|
+
# Batch with bounded concurrency + per-host rate limits
|
|
50
|
+
results = c.fetch_many([
|
|
51
|
+
{"doi": "10.1038/foo"},
|
|
52
|
+
{"pmcid": "PMC123"},
|
|
53
|
+
{"pdf": "paper.pdf", "doi": "10.99/local"},
|
|
54
|
+
])
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## How it works
|
|
58
|
+
|
|
59
|
+
A strategy ladder runs against legal aggregator APIs in order, returning the
|
|
60
|
+
first successful result:
|
|
61
|
+
|
|
62
|
+
1. **PMC E-utilities efetch** — JATS XML for PMC OA Subset articles
|
|
63
|
+
2. **NCBI elink** — PMID → PMC self-link when S2 didn't surface it
|
|
64
|
+
3. **Published-version handoff** — when input is a preprint DOI, route to
|
|
65
|
+
the published version (bioRxiv API or Crossref `relation`) so callers
|
|
66
|
+
get the most-recent public version of the paper
|
|
67
|
+
4. **arXiv** — direct PDF for arXiv papers
|
|
68
|
+
5. **bioRxiv / medRxiv API** — JATS XML or PDF for preprints
|
|
69
|
+
6. **Unpaywall** — best legal OA URL; routes PMC URLs through efetch
|
|
70
|
+
7. **Institutional repo scrape** — `hdl.handle.net`, `pure.eur.nl`, etc.
|
|
71
|
+
(now respects `robots.txt`)
|
|
72
|
+
8. **EuropePMC PDF render** — text-mining-licensed PDFs for free-to-read
|
|
73
|
+
papers NCBI's OA Subset doesn't include
|
|
74
|
+
|
|
75
|
+
Per-host token-bucket rate limiting honors every publisher's published quota
|
|
76
|
+
exactly. arXiv at 0.33 rps (their explicit rule). NCBI at 3 rps (10 with key)
|
|
77
|
+
shared across `eutils.*`, `pmc.*`, `www.*` hostnames. Independent services
|
|
78
|
+
run fully in parallel.
|
|
79
|
+
|
|
80
|
+
## License classification
|
|
81
|
+
|
|
82
|
+
Every successful fetch carries a `LicenseClass`:
|
|
83
|
+
|
|
84
|
+
```
|
|
85
|
+
cc-by cc-by-sa cc-by-nd cc-by-nc cc-by-nc-sa cc-by-nc-nd cc0
|
|
86
|
+
text-mining-only bronze arxiv-default closed unknown
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Plus helper booleans on `FetchResult`: `may_redistribute`,
|
|
90
|
+
`may_redistribute_nc`, `may_make_derivatives`, `may_train_models`,
|
|
91
|
+
`may_use_for_tdm`, plus a `source_type` field (`publisher` /
|
|
92
|
+
`repository` / `other`) for callers who want publisher-vs-repository
|
|
93
|
+
policy without parsing strings. Storage-tier policy is a one-line check.
|
|
94
|
+
|
|
95
|
+
Successful results also carry an attribution blockquote at the top of
|
|
96
|
+
the markdown by default (source URL + DOI/PMCID + license + retrieval
|
|
97
|
+
strategy), so the markdown is self-attributing when it travels to end
|
|
98
|
+
users. Disable with `Client(include_attribution=False)`.
|
|
99
|
+
|
|
100
|
+
## What's NOT here
|
|
101
|
+
|
|
102
|
+
- No Sci-Hub, no archive scraping, no UA spoofing past WAFs.
|
|
103
|
+
- No title-only paper search (use PaperFinder / S2 first to get an identifier).
|
|
104
|
+
- No multi-tenant `Credentials` per-call object (one Client per credential set).
|
|
105
|
+
- No async API (sync only in this release).
|
|
106
|
+
|
|
107
|
+
## Configuration
|
|
108
|
+
|
|
109
|
+
See `Client.__init__` and `docs/concepts.md` for the full list. Required:
|
|
110
|
+
`email` (Crossref polite-pool identifier; kwarg or `ASTA_PAPERS_EMAIL` env).
|
|
111
|
+
Recommended: `NCBI_API_KEY` env (free, 5-minute registration, 3.3× throughput).
|
|
112
|
+
|
|
113
|
+
## Tests
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
pytest tests/integration -v # 29 real-API tests
|
|
117
|
+
python tools/check_test_legitimacy.py --strict # asserts mock-ratio < 30%
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
The full integration suite hits live upstream APIs — no mocks. Tests run in
|
|
121
|
+
~90 seconds. A per-paper snapshot recovery benchmark (53 biomedical DOIs
|
|
122
|
+
that fail Mistral-OCR-only retrieval; 46/53 = 87% recovered) gates
|
|
123
|
+
regressions.
|
|
124
|
+
|
|
125
|
+
## Design
|
|
126
|
+
|
|
127
|
+
Full design at [`docs/DESIGN.md`](docs/DESIGN.md).
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to `asta-papers`. Format: [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
|
|
4
|
+
Versioning: [Semver](https://semver.org/).
|
|
5
|
+
|
|
6
|
+
## [Unreleased]
|
|
7
|
+
|
|
8
|
+
### Added — license-aware fields and policies
|
|
9
|
+
|
|
10
|
+
- **`LicenseClass.ARXIV_DEFAULT`**: arXiv's non-exclusive distribution
|
|
11
|
+
grant. Distinct from `UNKNOWN` — callers know they have at least the
|
|
12
|
+
arXiv default rights (academic / non-commercial redistribution). The
|
|
13
|
+
`arxiv` strategy now returns `ARXIV_DEFAULT` instead of `UNKNOWN`.
|
|
14
|
+
- **`SourceType` enum** (`PUBLISHER`, `REPOSITORY`, `OTHER`) and a
|
|
15
|
+
matching `FetchResult.source_type` field. Promoted from Unpaywall's
|
|
16
|
+
`host_type` in `unpaywall`; inferred from the strategy in
|
|
17
|
+
`arxiv`/`biorxiv`/`pmc-efetch`/`europepmc-pdf`/`repo-scrape`. Lets
|
|
18
|
+
callers implement publisher-vs-repository policy without parsing
|
|
19
|
+
`notes` strings.
|
|
20
|
+
- **`FetchProvenance.enriched_identifiers`**: dict of identifiers that
|
|
21
|
+
were added by the Client's enrichment step (S2 corpus_id resolve, NCBI
|
|
22
|
+
ID converter). Empty when the caller supplied all identifiers
|
|
23
|
+
themselves. Audit trail for "did this fetch hit PMC because the caller
|
|
24
|
+
asked, or because enrichment supplied a PMCID?"
|
|
25
|
+
- **Attribution blockquote** auto-prepended to converted markdown so the
|
|
26
|
+
output is self-attributing when it travels to end users. Includes
|
|
27
|
+
source URL, DOI/PMCID, license class + URL, and the strategy that
|
|
28
|
+
retrieved the bytes. Controlled by a new `Client(include_attribution=True)`
|
|
29
|
+
flag (default on); pass `False` for pure-inference workflows that
|
|
30
|
+
prefer clean prose.
|
|
31
|
+
- **Bronze fallback** in `unpaywall`: when Unpaywall says `is_oa=True`
|
|
32
|
+
with `host_type=publisher` but no `license` tag, the result is now
|
|
33
|
+
classified as `LicenseClass.BRONZE` (previously fell through to
|
|
34
|
+
`UNKNOWN`). A note `unpaywall:no-license-tag-classified-as-bronze` is
|
|
35
|
+
added so the audit trail records the inference.
|
|
36
|
+
- **Robots.txt check** in `repo_scrape`. Before scraping an
|
|
37
|
+
institutional repo's landing page (and again for the discovered PDF
|
|
38
|
+
URL), the strategy checks the host's `robots.txt` and skips disallowed
|
|
39
|
+
paths. The check uses stdlib `urllib.robotparser`, is cached per host
|
|
40
|
+
via `ctx.api_cache`, and fails permissive (allows the scrape if
|
|
41
|
+
robots.txt is unreachable, which is the conservative-permissive
|
|
42
|
+
default — most repos don't ship a `robots.txt`).
|
|
43
|
+
|
|
44
|
+
### Changed — strategy ladder reordered
|
|
45
|
+
|
|
46
|
+
Default ladder now runs `published-version` *before* the preprint-fetching
|
|
47
|
+
strategies (`arxiv`, `biorxiv`). Realises the user-facing contract "give
|
|
48
|
+
me the most recent public version of the paper": when a preprint DOI
|
|
49
|
+
maps to a published version with a usable license, fetch the published
|
|
50
|
+
version instead of the preprint. The strategy returns `None` for non-preprint DOIs and the ladder
|
|
51
|
+
falls through to the preprint strategies. The `result.identifiers`
|
|
52
|
+
field always reflects the actually-fetched DOI, so callers can compare
|
|
53
|
+
against the request to detect handoffs.
|
|
54
|
+
|
|
55
|
+
### Documented — NIH/PMC and publisher TOS posture
|
|
56
|
+
|
|
57
|
+
Researched and documented the AI-training-vs-inference distinction in
|
|
58
|
+
`docs/licensing.md`. Key takeaways:
|
|
59
|
+
|
|
60
|
+
- PMC's E-Utilities (the `pmc-efetch` strategy) is on the explicit list
|
|
61
|
+
of permitted automated retrieval services in PMC's copyright policy —
|
|
62
|
+
not just tolerated, explicitly named.
|
|
63
|
+
- PMC does NOT carve AI training out of "text mining" in its TOS.
|
|
64
|
+
- The pressure to do so is coming from individual publishers (Elsevier,
|
|
65
|
+
Wiley, etc.) via website footers added since 2024 — not from PMC, NIH,
|
|
66
|
+
EuropePMC, or arXiv. Whether those footers can override CC-BY on
|
|
67
|
+
individual articles is legally contested.
|
|
68
|
+
- Practical implication: when a downstream caller is **training a model**
|
|
69
|
+
(not just running inference), they should keep only results with
|
|
70
|
+
`source_type == SourceType.REPOSITORY` to avoid the publisher-CDN
|
|
71
|
+
paths whose TOS may reserve AI rights. Library defaults remain right
|
|
72
|
+
for the inference use case.
|
|
73
|
+
|
|
74
|
+
### Changed — `may_use_for_tdm` semantics tightened
|
|
75
|
+
|
|
76
|
+
Previously returned `True` for any successful fetch — over-permissive
|
|
77
|
+
because it equated "library may ingest" with "caller may run TDM
|
|
78
|
+
downstream." Now returns `True` only for license classes that grant TDM
|
|
79
|
+
explicitly (CC*, `TEXT_MINING_ONLY`, `ARXIV_DEFAULT`, etc.). Returns
|
|
80
|
+
`False` for `BRONZE`, `UNKNOWN`, and `CLOSED` because TDM rights for
|
|
81
|
+
those are ambiguous. Callers should pair with `may_redistribute` for
|
|
82
|
+
redistribution gates; `may_use_for_tdm` is now strictly about
|
|
83
|
+
*caller-side* TDM permission.
|
|
84
|
+
|
|
85
|
+
### Fixed — JATS converter coverage
|
|
86
|
+
|
|
87
|
+
Driven by a 26-paper diverse stress-test corpus spanning 9 publishers
|
|
88
|
+
(PNAS, Cell, Lancet, JAMA, Nature, Science, PLOS, BMC, MDPI). Element-by-element
|
|
89
|
+
audit found 10 categories of silently-dropped content; all closed.
|
|
90
|
+
|
|
91
|
+
- **Walk `<floats-group>`** sibling of `<body>`. Recovers figures and tables
|
|
92
|
+
for MDPI, AACR, JCI, OUP, NEJM (≈ half the PMC corpus by publisher).
|
|
93
|
+
- **Descend into `<p>` block children** (fig / table-wrap / disp-formula /
|
|
94
|
+
list / supplementary-material / boxed-text / def-list / ack / app /
|
|
95
|
+
media / chem-struct-wrap / sub-article / glossary). Recovers
|
|
96
|
+
Nature/BMC/OUP figures wrapped inside body paragraphs.
|
|
97
|
+
- **Walk all `<back>` children**, not just `<ref-list>`. Recovers `<ack>`,
|
|
98
|
+
`<app>`, `<glossary>` (previously dropped in 19 of 26 papers).
|
|
99
|
+
- **Strip whitespace `<label>`** before emitting caption blocks. Lancet and
|
|
100
|
+
similar publishers emit `<label>\n</label>`, which previously produced
|
|
101
|
+
malformed `**\n: caption**` blocks that downstream regexes couldn't
|
|
102
|
+
match.
|
|
103
|
+
- **Render `<def-list>`** as `**term**: definition` lines.
|
|
104
|
+
- **Render `<ack>` / `<app>` / `<app-group>` / `<glossary>`** as labelled
|
|
105
|
+
sections, honouring the publisher's `<title>` if present.
|
|
106
|
+
- **Nested `<list>` inside `<list-item>`** now dispatched as a block
|
|
107
|
+
(preserves nested-list structure instead of flattening).
|
|
108
|
+
- **Tables now honour `<thead>` / `<tbody>` / `<tfoot>`**: thead rows
|
|
109
|
+
become the markdown header (multi-row headers preserved as pre-header
|
|
110
|
+
body rows); cells from nested tables can no longer bleed into the
|
|
111
|
+
parent table's row set.
|
|
112
|
+
- **`<inline-formula>` MathML fallback**: when no `<tex-math>` is present
|
|
113
|
+
but the formula contains `<mml:math>`, render plaintext content as
|
|
114
|
+
`$mathml-text$` (or `$[math]$` if empty). Math-heavy papers no longer
|
|
115
|
+
have empty formula slots in the markdown.
|
|
116
|
+
- **`<disp-formula>` always emits `$$...$$`** regardless of source
|
|
117
|
+
(tex-math / MathML / plaintext sub-sup / equation-graphic). Display
|
|
118
|
+
math is now machine-detectable in markdown without inspecting the JATS.
|
|
119
|
+
- **`<supplementary-material>` and `<table-wrap-foot>`** rendered. Surfaces
|
|
120
|
+
supplementary file captions and table abbreviation glossaries needed to
|
|
121
|
+
interpret cells.
|
|
122
|
+
|
|
123
|
+
### Tested
|
|
124
|
+
|
|
125
|
+
- Unit suite grew from 113 to 121 tests (8 new tests in
|
|
126
|
+
`test_jats_to_md.py` lock the behaviours above).
|
|
127
|
+
- Full unit suite passes; no regressions.
|
|
128
|
+
- Stress-audit drop counts (papers where JATS XML had > 0 of an element
|
|
129
|
+
but markdown had 0): figures 2 → 0; tables 1 → 0; disp-formula 2 → 0;
|
|
130
|
+
inline-formula 1 → 0; ack 19 → 3 (the remaining 3 are publisher-custom
|
|
131
|
+
titles like "Sources of Funding:" — converter is correct, audit-script
|
|
132
|
+
is too narrow); def-list 3 → 0; app 3 → 0; glossary 4 → 0 (same
|
|
133
|
+
custom-title caveat as ack).
|
|
134
|
+
|
|
135
|
+
### Known gaps (deferred)
|
|
136
|
+
|
|
137
|
+
A separate legal review surfaced **3 P0 findings** that the *caller*
|
|
138
|
+
should be aware of when implementing license-aware vending policy. See
|
|
139
|
+
[`docs/licensing.md`](licensing.md#known-caveats) for the current list and
|
|
140
|
+
recommended mitigations.
|
|
141
|
+
|
|
142
|
+
## [0.0.1] — 2026-05-06
|
|
143
|
+
|
|
144
|
+
Initial release.
|
|
145
|
+
|
|
146
|
+
### Added
|
|
147
|
+
|
|
148
|
+
- `Client` (sync) with `fetch(...)` and `fetch_many(...)` entry points.
|
|
149
|
+
- 8-strategy ladder: PMC efetch, NCBI elink, arXiv, bioRxiv API, Unpaywall,
|
|
150
|
+
bioRxiv published-version handoff (with Crossref `relation` fallback for
|
|
151
|
+
non-bioRxiv DOIs), institutional repo scrape, EuropePMC PDF render.
|
|
152
|
+
- BYO inputs via `pdf=` and `jats=` accepting bytes / `Path` / URI
|
|
153
|
+
(`file://`, `http(s)://`, `s3://`).
|
|
154
|
+
- Pluggable converters: built-in `JatsConverter` (stdlib only),
|
|
155
|
+
`MistralConverter` (extra `[mistral]`), `OlmocrConverter` (extra `[olmocr]`).
|
|
156
|
+
- Per-host token-bucket `RateLimiter` with hostname aliasing — independent
|
|
157
|
+
services run in parallel, same-service workers serialize at the configured
|
|
158
|
+
rps.
|
|
159
|
+
- License classifier producing one of 11 `LicenseClass` values; `FetchResult`
|
|
160
|
+
helpers `may_redistribute`, `may_redistribute_nc`, `may_make_derivatives`,
|
|
161
|
+
`may_train_models`, `may_use_for_tdm`.
|
|
162
|
+
- `FileCache` keyed by canonical identifier set; multi-key aliasing for
|
|
163
|
+
cross-identifier hits.
|
|
164
|
+
- Identifier enrichment: NCBI ID converter (DOI ↔ PMID/PMCID) and S2 (Corpus
|
|
165
|
+
ID → others). Off by default; opt-in via `enrich_identifiers=True` (or
|
|
166
|
+
always-on for corpus-id-only inputs).
|
|
167
|
+
- `tools/check_test_legitimacy.py` — fails CI if mock ratio exceeds 30% or any
|
|
168
|
+
strategy has no real-API test.
|
|
169
|
+
|
|
170
|
+
### Tested
|
|
171
|
+
|
|
172
|
+
- 113 unit tests (license classifier, identifier normalization, rate limiter
|
|
173
|
+
alias chain, FileCache round-trip, JATS converter, FetchResult helpers,
|
|
174
|
+
config redaction).
|
|
175
|
+
- 29 real-API integration tests, zero mocks (verified by the legitimacy
|
|
176
|
+
sweep).
|
|
177
|
+
- Per-paper snapshot recovery benchmark (53 biomedical DOIs that fail
|
|
178
|
+
Mistral-OCR-only retrieval): 46/53 (86.8%) recovered. Diff-based to
|
|
179
|
+
catch regressions on specific DOIs.
|
|
180
|
+
- Rate-limit timing tests with real wall-clock assertions: arXiv 0.33 rps
|
|
181
|
+
strict, NCBI 3 rps shared across `eutils`/`pmc`/`www` hostnames, service
|
|
182
|
+
independence under 4-thread concurrency, 100-acquire concurrent safety.
|
|
183
|
+
|
|
184
|
+
### Known caveats
|
|
185
|
+
|
|
186
|
+
- `mistralai` pinned to `>=1.5,<2`; v2 reorganized imports and breaks the
|
|
187
|
+
converter.
|
|
188
|
+
- `olmocr` requires LM Studio (or compatible) loaded with **8k+ context** for
|
|
189
|
+
reliable output on 2-column biomedical PDFs. The default 4k context yields a
|
|
190
|
+
~67% per-page failure rate.
|
|
191
|
+
- Per-Client rate limiter — multi-instance deployments don't share quota.
|
|
192
|
+
Distributed limiter deferred to v0.2.
|
|
193
|
+
- No async API. Sync only.
|
|
194
|
+
- No multi-tenant `Credentials` per-call object. One Client per credential set.
|
|
195
|
+
- `gs://` BYO is not implemented (placeholder enum value reserved).
|