pubmed-markdown 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pubmed_markdown-0.1.0/.claude/settings.local.json +43 -0
- pubmed_markdown-0.1.0/.env.example +6 -0
- pubmed_markdown-0.1.0/.gitattributes +2 -0
- pubmed_markdown-0.1.0/.gitignore +18 -0
- pubmed_markdown-0.1.0/.vscode/launch.json +49 -0
- pubmed_markdown-0.1.0/Dockerfile +13 -0
- pubmed_markdown-0.1.0/LICENSE +21 -0
- pubmed_markdown-0.1.0/PKG-INFO +179 -0
- pubmed_markdown-0.1.0/README.md +150 -0
- pubmed_markdown-0.1.0/api.py +376 -0
- pubmed_markdown-0.1.0/cpic_pmids.txt +190 -0
- pubmed_markdown-0.1.0/docs/api_endpoint_plan.md +215 -0
- pubmed_markdown-0.1.0/docs/packaging_prd.md +279 -0
- pubmed_markdown-0.1.0/docs/pubmed_html_to_markdown_conversion_process.md +240 -0
- pubmed_markdown-0.1.0/notebooks/runner.ipynb +26542 -0
- pubmed_markdown-0.1.0/pixi.lock +1763 -0
- pubmed_markdown-0.1.0/pixi.toml +32 -0
- pubmed_markdown-0.1.0/pubmed_downloader/__init__.py +17 -0
- pubmed_markdown-0.1.0/pubmed_downloader/abstract_from_pmid.py +109 -0
- pubmed_markdown-0.1.0/pubmed_downloader/copy_markdown.py +32 -0
- pubmed_markdown-0.1.0/pubmed_downloader/html_from_pmcid.py +77 -0
- pubmed_markdown-0.1.0/pubmed_downloader/manage_records.py +158 -0
- pubmed_markdown-0.1.0/pubmed_downloader/markdown_from_html.py +724 -0
- pubmed_markdown-0.1.0/pubmed_downloader/pharmgkb_annotations.py +137 -0
- pubmed_markdown-0.1.0/pubmed_downloader/pmcid_from_pmid.py +235 -0
- pubmed_markdown-0.1.0/pubmed_downloader/pubmed_downloader.py +522 -0
- pubmed_markdown-0.1.0/pubmed_downloader/utils_bioc.py +298 -0
- pubmed_markdown-0.1.0/pyproject.toml +45 -0
- pubmed_markdown-0.1.0/requirements.txt +10 -0
- pubmed_markdown-0.1.0/test_auth.py +16 -0
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
{
|
|
2
|
+
"permissions": {
|
|
3
|
+
"allow": [
|
|
4
|
+
"Bash(grep:*)",
|
|
5
|
+
"Bash(rg:*)",
|
|
6
|
+
"Bash(/opt/homebrew/lib/node_modules/@anthropic-ai/claude-code/vendor/ripgrep/arm64-darwin/rg -n 'class=\"\".*article.*\"\"' \"/Users/shloknatarajan/stanford/research/daneshjou/pmid-article-resolver/data/raw_html/PMC10038974.html\")",
|
|
7
|
+
"Bash(/opt/homebrew/lib/node_modules/@anthropic-ai/claude-code/vendor/ripgrep/arm64-darwin/rg -n 'section.*abstract|section.*introduction|section.*methods|section.*results|section.*discussion|section.*conclusion' \"/Users/shloknatarajan/stanford/research/daneshjou/pmid-article-resolver/data/raw_html/PMC10038974.html\")",
|
|
8
|
+
"Bash(/opt/homebrew/lib/node_modules/@anthropic-ai/claude-code/vendor/ripgrep/arm64-darwin/rg -n '<h[1-6]' \"/Users/shloknatarajan/stanford/research/daneshjou/pmid-article-resolver/data/raw_html/PMC10038974.html\")",
|
|
9
|
+
"Bash(/opt/homebrew/lib/node_modules/@anthropic-ai/claude-code/vendor/ripgrep/arm64-darwin/rg -n '<figure|<table' \"/Users/shloknatarajan/stanford/research/daneshjou/pmid-article-resolver/data/raw_html/PMC10038974.html\")",
|
|
10
|
+
"Bash(/opt/homebrew/lib/node_modules/@anthropic-ai/claude-code/vendor/ripgrep/arm64-darwin/rg -n 'href=\"\".*\\.pdf\"\"|\\.jpg\"\"|\\.png\"\"|\\.svg\"\"' \"/Users/shloknatarajan/stanford/research/daneshjou/pmid-article-resolver/data/raw_html/PMC10038974.html\")",
|
|
11
|
+
"Bash(/opt/homebrew/lib/node_modules/@anthropic-ai/claude-code/vendor/ripgrep/arm64-darwin/rg -n 'cdn\\.ncbi\\.nlm\\.nih\\.gov.*\\.jpg|\\.png' \"/Users/shloknatarajan/stanford/research/daneshjou/pmid-article-resolver/data/raw_html/PMC10038974.html\")",
|
|
12
|
+
"Bash(/opt/homebrew/lib/node_modules/@anthropic-ai/claude-code/vendor/ripgrep/arm64-darwin/rg -n 'References|ref-list|class=\"\"ref\"\"' \"/Users/shloknatarajan/stanford/research/daneshjou/pmid-article-resolver/data/raw_html/PMC10038974.html\")",
|
|
13
|
+
"Bash(/opt/homebrew/lib/node_modules/@anthropic-ai/claude-code/vendor/ripgrep/arm64-darwin/rg -n 'Discussion|discussion' \"/Users/shloknatarajan/stanford/research/daneshjou/pmid-article-resolver/data/raw_html/PMC10038974.html\")",
|
|
14
|
+
"Bash(/opt/homebrew/lib/node_modules/@anthropic-ai/claude-code/vendor/ripgrep/arm64-darwin/rg -n '<h[2-6].*class=\"pmc_sec_title\"' \"/Users/shloknatarajan/stanford/research/daneshjou/pmid-article-resolver/data/raw_html/PMC10038974.html\")",
|
|
15
|
+
"Bash(python:*)",
|
|
16
|
+
"Bash(pixi run:*)",
|
|
17
|
+
"Bash(find:*)",
|
|
18
|
+
"Bash(rm:*)",
|
|
19
|
+
"Bash(ls:*)",
|
|
20
|
+
"Bash(du -sh:*)",
|
|
21
|
+
"Bash(pixi install:*)",
|
|
22
|
+
"Bash(curl:*)",
|
|
23
|
+
"Bash(pkill:*)",
|
|
24
|
+
"Bash(node:*)",
|
|
25
|
+
"Bash(git config:*)",
|
|
26
|
+
"Bash(python3:*)",
|
|
27
|
+
"Bash(test -f ~/.gsd/defaults.json)",
|
|
28
|
+
"Read(//Users/shloknatarajan/.gsd/**)",
|
|
29
|
+
"Skill(gsd:progress)",
|
|
30
|
+
"WebSearch",
|
|
31
|
+
"Bash(pip install:*)",
|
|
32
|
+
"Bash(python -c \"from pubmed_downloader.pubmed_downloader import main; print\\('Entry point OK'\\)\" && unzip -l dist/pubmed_downloader-0.1.0-py3-none-any.whl | head -20)",
|
|
33
|
+
"Read(//private/tmp/**)",
|
|
34
|
+
"Bash(source pubmed_test_env/bin/activate)",
|
|
35
|
+
"Bash(source /tmp/pubmed_test_env/bin/activate && pubmed-download --help 2>&1)",
|
|
36
|
+
"Bash(source /tmp/pubmed_test_env/bin/activate && pip install /Users/shloknatarajan/stanford/research/daneshjou/PubMedDownloader 2>&1 | tail -3)",
|
|
37
|
+
"Bash(source /tmp/pubmed_test_env/bin/activate && cd /tmp && rm -rf test_output && pubmed-download --file_path=test_pmids.txt --save_dir=test_output 2>&1)",
|
|
38
|
+
"Bash(twine check:*)",
|
|
39
|
+
"Bash(pip index:*)"
|
|
40
|
+
]
|
|
41
|
+
},
|
|
42
|
+
"enableAllProjectMcpServers": false
|
|
43
|
+
}
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
{
|
|
2
|
+
// Use IntelliSense to learn about possible attributes.
|
|
3
|
+
// Hover to view descriptions of existing attributes.
|
|
4
|
+
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
|
5
|
+
"version": "0.2.0",
|
|
6
|
+
"configurations": [
|
|
7
|
+
{
|
|
8
|
+
"name": "PMCID to Text",
|
|
9
|
+
"type": "debugpy",
|
|
10
|
+
"request": "launch",
|
|
11
|
+
"cwd": "${workspaceFolder}",
|
|
12
|
+
"module": "src.pmcid_to_text.fetch_article",
|
|
13
|
+
"args": ["--pmcid", "PMC1884285", "--save_path", "data/articles"],
|
|
14
|
+
"console": "integratedTerminal"
|
|
15
|
+
},
|
|
16
|
+
{
|
|
17
|
+
"name": "Check OA Status",
|
|
18
|
+
"type": "debugpy",
|
|
19
|
+
"request": "launch",
|
|
20
|
+
"cwd": "${workspaceFolder}",
|
|
21
|
+
"module": "src.pmcid_to_text.check_oa_status",
|
|
22
|
+
"console": "integratedTerminal"
|
|
23
|
+
},
|
|
24
|
+
{
|
|
25
|
+
"name": "Aaron HTML",
|
|
26
|
+
"type": "debugpy",
|
|
27
|
+
"request": "launch",
|
|
28
|
+
"cwd": "${workspaceFolder}",
|
|
29
|
+
"module": "src.pmcid_to_text.html_getter",
|
|
30
|
+
"console": "integratedTerminal"
|
|
31
|
+
},
|
|
32
|
+
{
|
|
33
|
+
"name": "Single Article",
|
|
34
|
+
"type": "debugpy",
|
|
35
|
+
"request": "launch",
|
|
36
|
+
"cwd": "${workspaceFolder}",
|
|
37
|
+
"module": "src.markdown_from_pmid",
|
|
38
|
+
"args": ["--pmid", "12895196", "--save_dir", "data"],
|
|
39
|
+
"console": "integratedTerminal"
|
|
40
|
+
},
|
|
41
|
+
{
|
|
42
|
+
"name": "Pubmed Converter",
|
|
43
|
+
"type": "debugpy",
|
|
44
|
+
"request": "launch",
|
|
45
|
+
"cwd": "${workspaceFolder}",
|
|
46
|
+
"module": "src.pubmed_downloader",
|
|
47
|
+
}
|
|
48
|
+
]
|
|
49
|
+
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
FROM python:3.12-slim
|
|
2
|
+
|
|
3
|
+
WORKDIR /app
|
|
4
|
+
|
|
5
|
+
COPY requirements.txt .
|
|
6
|
+
RUN pip install --no-cache-dir -r requirements.txt
|
|
7
|
+
|
|
8
|
+
COPY pubmed_downloader/ pubmed_downloader/
|
|
9
|
+
COPY api.py .
|
|
10
|
+
|
|
11
|
+
EXPOSE 8000
|
|
12
|
+
|
|
13
|
+
CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "8000"]
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Shlok Natarajan
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pubmed-markdown
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Convert PubMed articles (PMIDs or PMCIDs) to clean, structured markdown with full text, abstracts, and supplementary materials
|
|
5
|
+
Project-URL: Homepage, https://github.com/shloknatarajan/PubMedDownloader
|
|
6
|
+
Project-URL: Repository, https://github.com/shloknatarajan/PubMedDownloader
|
|
7
|
+
Project-URL: Issues, https://github.com/shloknatarajan/PubMedDownloader/issues
|
|
8
|
+
Author-email: Shlok Natarajan <shlok.natarajan@gmail.com>
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: bioinformatics,markdown,pharmacogenomics,pmc,pubmed
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
20
|
+
Requires-Python: >=3.11
|
|
21
|
+
Requires-Dist: beautifulsoup4>=4.13.0
|
|
22
|
+
Requires-Dist: biopython>=1.85
|
|
23
|
+
Requires-Dist: loguru>=0.7.0
|
|
24
|
+
Requires-Dist: pandas>=2.0.0
|
|
25
|
+
Requires-Dist: python-dotenv>=1.0.0
|
|
26
|
+
Requires-Dist: requests>=2.32.0
|
|
27
|
+
Requires-Dist: tqdm>=4.67.0
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
|
|
30
|
+
# PubMed Downloader
|
|
31
|
+
|
|
32
|
+
Convert PubMed articles to clean, structured markdown. Handles the full pipeline: PMID resolution, full-text extraction via PubMed Central, HTML-to-markdown conversion, and supplementary material retrieval.
|
|
33
|
+
|
|
34
|
+
Articles without open-access full text automatically fall back to abstract-only download.
|
|
35
|
+
|
|
36
|
+
## Installation
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
pip install git+https://github.com/shloknatarajan/PubMedDownloader.git
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Setup
|
|
43
|
+
|
|
44
|
+
Set your email for NCBI API identification (optional but recommended):
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
export NCBI_EMAIL=your-email@institution.edu
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
Or create a `.env` file in your working directory:
|
|
51
|
+
|
|
52
|
+
```env
|
|
53
|
+
NCBI_EMAIL=your-email@institution.edu
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Usage
|
|
57
|
+
|
|
58
|
+
### Python API
|
|
59
|
+
|
|
60
|
+
**Single article (returns markdown string, no files created):**
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
from pubmed_downloader import PubMedDownloader
|
|
64
|
+
|
|
65
|
+
downloader = PubMedDownloader()
|
|
66
|
+
|
|
67
|
+
# From PMID (resolves to PMCID automatically, falls back to abstract if not open access)
|
|
68
|
+
markdown = downloader.single_pmid_to_markdown("12895196")
|
|
69
|
+
|
|
70
|
+
# From PMCID directly
|
|
71
|
+
markdown = downloader.single_pmcid_to_markdown("PMC1884285")
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
**Batch processing (saves HTML and markdown files to disk):**
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
from pubmed_downloader import PubMedDownloader
|
|
78
|
+
|
|
79
|
+
downloader = PubMedDownloader()
|
|
80
|
+
pmids = ["12895196", "17872605", "25051018"]
|
|
81
|
+
downloader.pmids_to_markdown(pmids, save_dir="data")
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
This creates:
|
|
85
|
+
```text
|
|
86
|
+
data/
|
|
87
|
+
├── html/ # Raw HTML from PMC
|
|
88
|
+
├── markdown/ # Converted markdown files
|
|
89
|
+
├── cache/ # PMID-to-PMCID mapping cache
|
|
90
|
+
└── pmcids.txt # Resolved PMCIDs
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
**Add supplementary materials to existing markdown files:**
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
downloader.add_supplements_to_existing(save_dir="data")
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
**Individual utility functions:**
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
from pubmed_downloader import (
|
|
103
|
+
get_pmcid_from_pmid,
|
|
104
|
+
get_html_from_pmcid,
|
|
105
|
+
get_abstract_markdown_from_pmid,
|
|
106
|
+
fetch_bioc_supplement,
|
|
107
|
+
)
|
|
108
|
+
|
|
109
|
+
# Resolve PMIDs to PMCIDs
|
|
110
|
+
mapping = get_pmcid_from_pmid(["12895196", "17872605"])
|
|
111
|
+
|
|
112
|
+
# Fetch raw HTML from PMC
|
|
113
|
+
html = get_html_from_pmcid("PMC1884285")
|
|
114
|
+
|
|
115
|
+
# Get abstract for non-open-access articles
|
|
116
|
+
abstract_md = get_abstract_markdown_from_pmid("12345678")
|
|
117
|
+
|
|
118
|
+
# Get supplementary material text
|
|
119
|
+
supplement = fetch_bioc_supplement("PMC6435416")
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### Command Line
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
# Convert PMIDs from a file (one PMID per line)
|
|
126
|
+
pubmed-download --file_path=pmids.txt --save_dir=data
|
|
127
|
+
|
|
128
|
+
# Add supplementary materials to existing markdown
|
|
129
|
+
pubmed-download --add_supplements --save_dir=data
|
|
130
|
+
|
|
131
|
+
# Clear all caches
|
|
132
|
+
pubmed-download --clear_caches
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
### API Reference
|
|
136
|
+
|
|
137
|
+
| Method | Creates Files | Returns | Use Case |
|
|
138
|
+
|--------|--------------|---------|----------|
|
|
139
|
+
| `single_pmid_to_markdown()` | No | Markdown string | Single article, programmatic use |
|
|
140
|
+
| `single_pmcid_to_markdown()` | No | Markdown string | Direct PMCID conversion |
|
|
141
|
+
| `pmids_to_markdown()` | Yes | None | Batch processing, building datasets |
|
|
142
|
+
| `local_html_to_markdown()` | Yes | None | Re-convert existing HTML files |
|
|
143
|
+
| `add_supplements_to_existing()` | Yes | None | Append supplements to existing markdown |
|
|
144
|
+
|
|
145
|
+
## PharmGKB Integration
|
|
146
|
+
|
|
147
|
+
Extract PMIDs from PharmGKB variant annotations for pharmacogenomics research:
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
from pubmed_downloader.pharmgkb_annotations import get_pmid_list
|
|
151
|
+
from pubmed_downloader import PubMedDownloader
|
|
152
|
+
|
|
153
|
+
# Download PharmGKB annotations and extract PMIDs
|
|
154
|
+
pmids = get_pmid_list(save_dir="data")
|
|
155
|
+
|
|
156
|
+
# Convert to markdown
|
|
157
|
+
downloader = PubMedDownloader()
|
|
158
|
+
downloader.pmids_to_markdown([str(p) for p in pmids], save_dir="data")
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
## How It Works
|
|
162
|
+
|
|
163
|
+
1. **PMID to PMCID** -- Uses NCBI's ID Converter API with batching, caching (30-day expiry), and rate limiting
|
|
164
|
+
2. **HTML extraction** -- Fetches full article HTML from PubMed Central
|
|
165
|
+
3. **Markdown conversion** -- Converts HTML to structured markdown preserving tables, figures, citations, and references
|
|
166
|
+
4. **Supplementary materials** -- Fetches pre-processed supplement text via NCBI's BioC API
|
|
167
|
+
5. **Abstract fallback** -- Articles not in PMC Open Access get their abstract and metadata via NCBI's EFetch utility
|
|
168
|
+
|
|
169
|
+
## Configuration
|
|
170
|
+
|
|
171
|
+
| Environment Variable | Default | Description |
|
|
172
|
+
|---------------------|---------|-------------|
|
|
173
|
+
| `NCBI_EMAIL` | None | Email for NCBI API identification |
|
|
174
|
+
| `PMID_CACHE_DIR` | `data/cache` | Cache directory path |
|
|
175
|
+
| `PMID_CACHE_FILE` | `pmid_to_pmcid.json` | Cache filename |
|
|
176
|
+
|
|
177
|
+
## License
|
|
178
|
+
|
|
179
|
+
MIT
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
# PubMed Downloader
|
|
2
|
+
|
|
3
|
+
Convert PubMed articles to clean, structured markdown. Handles the full pipeline: PMID resolution, full-text extraction via PubMed Central, HTML-to-markdown conversion, and supplementary material retrieval.
|
|
4
|
+
|
|
5
|
+
Articles without open-access full text automatically fall back to abstract-only download.
|
|
6
|
+
|
|
7
|
+
## Installation
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install git+https://github.com/shloknatarajan/PubMedDownloader.git
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## Setup
|
|
14
|
+
|
|
15
|
+
Set your email for NCBI API identification (optional but recommended):
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
export NCBI_EMAIL=your-email@institution.edu
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
Or create a `.env` file in your working directory:
|
|
22
|
+
|
|
23
|
+
```env
|
|
24
|
+
NCBI_EMAIL=your-email@institution.edu
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Usage
|
|
28
|
+
|
|
29
|
+
### Python API
|
|
30
|
+
|
|
31
|
+
**Single article (returns markdown string, no files created):**
|
|
32
|
+
|
|
33
|
+
```python
|
|
34
|
+
from pubmed_downloader import PubMedDownloader
|
|
35
|
+
|
|
36
|
+
downloader = PubMedDownloader()
|
|
37
|
+
|
|
38
|
+
# From PMID (resolves to PMCID automatically, falls back to abstract if not open access)
|
|
39
|
+
markdown = downloader.single_pmid_to_markdown("12895196")
|
|
40
|
+
|
|
41
|
+
# From PMCID directly
|
|
42
|
+
markdown = downloader.single_pmcid_to_markdown("PMC1884285")
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
**Batch processing (saves HTML and markdown files to disk):**
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
from pubmed_downloader import PubMedDownloader
|
|
49
|
+
|
|
50
|
+
downloader = PubMedDownloader()
|
|
51
|
+
pmids = ["12895196", "17872605", "25051018"]
|
|
52
|
+
downloader.pmids_to_markdown(pmids, save_dir="data")
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
This creates:
|
|
56
|
+
```text
|
|
57
|
+
data/
|
|
58
|
+
├── html/ # Raw HTML from PMC
|
|
59
|
+
├── markdown/ # Converted markdown files
|
|
60
|
+
├── cache/ # PMID-to-PMCID mapping cache
|
|
61
|
+
└── pmcids.txt # Resolved PMCIDs
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
**Add supplementary materials to existing markdown files:**
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
downloader.add_supplements_to_existing(save_dir="data")
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
**Individual utility functions:**
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
from pubmed_downloader import (
|
|
74
|
+
get_pmcid_from_pmid,
|
|
75
|
+
get_html_from_pmcid,
|
|
76
|
+
get_abstract_markdown_from_pmid,
|
|
77
|
+
fetch_bioc_supplement,
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
# Resolve PMIDs to PMCIDs
|
|
81
|
+
mapping = get_pmcid_from_pmid(["12895196", "17872605"])
|
|
82
|
+
|
|
83
|
+
# Fetch raw HTML from PMC
|
|
84
|
+
html = get_html_from_pmcid("PMC1884285")
|
|
85
|
+
|
|
86
|
+
# Get abstract for non-open-access articles
|
|
87
|
+
abstract_md = get_abstract_markdown_from_pmid("12345678")
|
|
88
|
+
|
|
89
|
+
# Get supplementary material text
|
|
90
|
+
supplement = fetch_bioc_supplement("PMC6435416")
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### Command Line
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
# Convert PMIDs from a file (one PMID per line)
|
|
97
|
+
pubmed-download --file_path=pmids.txt --save_dir=data
|
|
98
|
+
|
|
99
|
+
# Add supplementary materials to existing markdown
|
|
100
|
+
pubmed-download --add_supplements --save_dir=data
|
|
101
|
+
|
|
102
|
+
# Clear all caches
|
|
103
|
+
pubmed-download --clear_caches
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### API Reference
|
|
107
|
+
|
|
108
|
+
| Method | Creates Files | Returns | Use Case |
|
|
109
|
+
|--------|--------------|---------|----------|
|
|
110
|
+
| `single_pmid_to_markdown()` | No | Markdown string | Single article, programmatic use |
|
|
111
|
+
| `single_pmcid_to_markdown()` | No | Markdown string | Direct PMCID conversion |
|
|
112
|
+
| `pmids_to_markdown()` | Yes | None | Batch processing, building datasets |
|
|
113
|
+
| `local_html_to_markdown()` | Yes | None | Re-convert existing HTML files |
|
|
114
|
+
| `add_supplements_to_existing()` | Yes | None | Append supplements to existing markdown |
|
|
115
|
+
|
|
116
|
+
## PharmGKB Integration
|
|
117
|
+
|
|
118
|
+
Extract PMIDs from PharmGKB variant annotations for pharmacogenomics research:
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
from pubmed_downloader.pharmgkb_annotations import get_pmid_list
|
|
122
|
+
from pubmed_downloader import PubMedDownloader
|
|
123
|
+
|
|
124
|
+
# Download PharmGKB annotations and extract PMIDs
|
|
125
|
+
pmids = get_pmid_list(save_dir="data")
|
|
126
|
+
|
|
127
|
+
# Convert to markdown
|
|
128
|
+
downloader = PubMedDownloader()
|
|
129
|
+
downloader.pmids_to_markdown([str(p) for p in pmids], save_dir="data")
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## How It Works
|
|
133
|
+
|
|
134
|
+
1. **PMID to PMCID** -- Uses NCBI's ID Converter API with batching, caching (30-day expiry), and rate limiting
|
|
135
|
+
2. **HTML extraction** -- Fetches full article HTML from PubMed Central
|
|
136
|
+
3. **Markdown conversion** -- Converts HTML to structured markdown preserving tables, figures, citations, and references
|
|
137
|
+
4. **Supplementary materials** -- Fetches pre-processed supplement text via NCBI's BioC API
|
|
138
|
+
5. **Abstract fallback** -- Articles not in PMC Open Access get their abstract and metadata via NCBI's EFetch utility
|
|
139
|
+
|
|
140
|
+
## Configuration
|
|
141
|
+
|
|
142
|
+
| Environment Variable | Default | Description |
|
|
143
|
+
|---------------------|---------|-------------|
|
|
144
|
+
| `NCBI_EMAIL` | None | Email for NCBI API identification |
|
|
145
|
+
| `PMID_CACHE_DIR` | `data/cache` | Cache directory path |
|
|
146
|
+
| `PMID_CACHE_FILE` | `pmid_to_pmcid.json` | Cache filename |
|
|
147
|
+
|
|
148
|
+
## License
|
|
149
|
+
|
|
150
|
+
MIT
|