pubmed-markdown 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. pubmed_markdown-0.1.0/.claude/settings.local.json +43 -0
  2. pubmed_markdown-0.1.0/.env.example +6 -0
  3. pubmed_markdown-0.1.0/.gitattributes +2 -0
  4. pubmed_markdown-0.1.0/.gitignore +18 -0
  5. pubmed_markdown-0.1.0/.vscode/launch.json +49 -0
  6. pubmed_markdown-0.1.0/Dockerfile +13 -0
  7. pubmed_markdown-0.1.0/LICENSE +21 -0
  8. pubmed_markdown-0.1.0/PKG-INFO +179 -0
  9. pubmed_markdown-0.1.0/README.md +150 -0
  10. pubmed_markdown-0.1.0/api.py +376 -0
  11. pubmed_markdown-0.1.0/cpic_pmids.txt +190 -0
  12. pubmed_markdown-0.1.0/docs/api_endpoint_plan.md +215 -0
  13. pubmed_markdown-0.1.0/docs/packaging_prd.md +279 -0
  14. pubmed_markdown-0.1.0/docs/pubmed_html_to_markdown_conversion_process.md +240 -0
  15. pubmed_markdown-0.1.0/notebooks/runner.ipynb +26542 -0
  16. pubmed_markdown-0.1.0/pixi.lock +1763 -0
  17. pubmed_markdown-0.1.0/pixi.toml +32 -0
  18. pubmed_markdown-0.1.0/pubmed_downloader/__init__.py +17 -0
  19. pubmed_markdown-0.1.0/pubmed_downloader/abstract_from_pmid.py +109 -0
  20. pubmed_markdown-0.1.0/pubmed_downloader/copy_markdown.py +32 -0
  21. pubmed_markdown-0.1.0/pubmed_downloader/html_from_pmcid.py +77 -0
  22. pubmed_markdown-0.1.0/pubmed_downloader/manage_records.py +158 -0
  23. pubmed_markdown-0.1.0/pubmed_downloader/markdown_from_html.py +724 -0
  24. pubmed_markdown-0.1.0/pubmed_downloader/pharmgkb_annotations.py +137 -0
  25. pubmed_markdown-0.1.0/pubmed_downloader/pmcid_from_pmid.py +235 -0
  26. pubmed_markdown-0.1.0/pubmed_downloader/pubmed_downloader.py +522 -0
  27. pubmed_markdown-0.1.0/pubmed_downloader/utils_bioc.py +298 -0
  28. pubmed_markdown-0.1.0/pyproject.toml +45 -0
  29. pubmed_markdown-0.1.0/requirements.txt +10 -0
  30. pubmed_markdown-0.1.0/test_auth.py +16 -0
@@ -0,0 +1,43 @@
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(grep:*)",
5
+ "Bash(rg:*)",
6
+ "Bash(/opt/homebrew/lib/node_modules/@anthropic-ai/claude-code/vendor/ripgrep/arm64-darwin/rg -n 'class=\"\".*article.*\"\"' \"/Users/shloknatarajan/stanford/research/daneshjou/pmid-article-resolver/data/raw_html/PMC10038974.html\")",
7
+ "Bash(/opt/homebrew/lib/node_modules/@anthropic-ai/claude-code/vendor/ripgrep/arm64-darwin/rg -n 'section.*abstract|section.*introduction|section.*methods|section.*results|section.*discussion|section.*conclusion' \"/Users/shloknatarajan/stanford/research/daneshjou/pmid-article-resolver/data/raw_html/PMC10038974.html\")",
8
+ "Bash(/opt/homebrew/lib/node_modules/@anthropic-ai/claude-code/vendor/ripgrep/arm64-darwin/rg -n '<h[1-6]' \"/Users/shloknatarajan/stanford/research/daneshjou/pmid-article-resolver/data/raw_html/PMC10038974.html\")",
9
+ "Bash(/opt/homebrew/lib/node_modules/@anthropic-ai/claude-code/vendor/ripgrep/arm64-darwin/rg -n '<figure|<table' \"/Users/shloknatarajan/stanford/research/daneshjou/pmid-article-resolver/data/raw_html/PMC10038974.html\")",
10
+ "Bash(/opt/homebrew/lib/node_modules/@anthropic-ai/claude-code/vendor/ripgrep/arm64-darwin/rg -n 'href=\"\".*\\.pdf\"\"|\\.jpg\"\"|\\.png\"\"|\\.svg\"\"' \"/Users/shloknatarajan/stanford/research/daneshjou/pmid-article-resolver/data/raw_html/PMC10038974.html\")",
11
+ "Bash(/opt/homebrew/lib/node_modules/@anthropic-ai/claude-code/vendor/ripgrep/arm64-darwin/rg -n 'cdn\\.ncbi\\.nlm\\.nih\\.gov.*\\.jpg|\\.png' \"/Users/shloknatarajan/stanford/research/daneshjou/pmid-article-resolver/data/raw_html/PMC10038974.html\")",
12
+ "Bash(/opt/homebrew/lib/node_modules/@anthropic-ai/claude-code/vendor/ripgrep/arm64-darwin/rg -n 'References|ref-list|class=\"\"ref\"\"' \"/Users/shloknatarajan/stanford/research/daneshjou/pmid-article-resolver/data/raw_html/PMC10038974.html\")",
13
+ "Bash(/opt/homebrew/lib/node_modules/@anthropic-ai/claude-code/vendor/ripgrep/arm64-darwin/rg -n 'Discussion|discussion' \"/Users/shloknatarajan/stanford/research/daneshjou/pmid-article-resolver/data/raw_html/PMC10038974.html\")",
14
+ "Bash(/opt/homebrew/lib/node_modules/@anthropic-ai/claude-code/vendor/ripgrep/arm64-darwin/rg -n '<h[2-6].*class=\"pmc_sec_title\"' \"/Users/shloknatarajan/stanford/research/daneshjou/pmid-article-resolver/data/raw_html/PMC10038974.html\")",
15
+ "Bash(python:*)",
16
+ "Bash(pixi run:*)",
17
+ "Bash(find:*)",
18
+ "Bash(rm:*)",
19
+ "Bash(ls:*)",
20
+ "Bash(du -sh:*)",
21
+ "Bash(pixi install:*)",
22
+ "Bash(curl:*)",
23
+ "Bash(pkill:*)",
24
+ "Bash(node:*)",
25
+ "Bash(git config:*)",
26
+ "Bash(python3:*)",
27
+ "Bash(test -f ~/.gsd/defaults.json)",
28
+ "Read(//Users/shloknatarajan/.gsd/**)",
29
+ "Skill(gsd:progress)",
30
+ "WebSearch",
31
+ "Bash(pip install:*)",
32
+ "Bash(python -c \"from pubmed_downloader.pubmed_downloader import main; print\\('Entry point OK'\\)\" && unzip -l dist/pubmed_downloader-0.1.0-py3-none-any.whl | head -20)",
33
+ "Read(//private/tmp/**)",
34
+ "Bash(source pubmed_test_env/bin/activate)",
35
+ "Bash(source /tmp/pubmed_test_env/bin/activate && pubmed-download --help 2>&1)",
36
+ "Bash(source /tmp/pubmed_test_env/bin/activate && pip install /Users/shloknatarajan/stanford/research/daneshjou/PubMedDownloader 2>&1 | tail -3)",
37
+ "Bash(source /tmp/pubmed_test_env/bin/activate && cd /tmp && rm -rf test_output && pubmed-download --file_path=test_pmids.txt --save_dir=test_output 2>&1)",
38
+ "Bash(twine check:*)",
39
+ "Bash(pip index:*)"
40
+ ]
41
+ },
42
+ "enableAllProjectMcpServers": false
43
+ }
@@ -0,0 +1,6 @@
1
+ # Recommended: email address used to identify your requests to the NCBI API
2
+ NCBI_EMAIL=your.email@example.com
3
+
4
+ # Optional: NCBI API key (raises rate limit from 3 to 10 requests/sec)
5
+ # Register at https://www.ncbi.nlm.nih.gov/account/settings/
6
+ # NCBI_API_KEY=your_api_key_here
@@ -0,0 +1,2 @@
1
+ # SCM syntax highlighting & preventing 3-way merges
2
+ pixi.lock merge=binary linguist-language=YAML linguist-generated=true
@@ -0,0 +1,18 @@
1
+
2
+ # pixi environments
3
+ .pixi
4
+ *.egg-info
5
+
6
+ # environment files
7
+ .env
8
+
9
+ # python cache
10
+ **/*__pycache__
11
+ **/*.pyc
12
+ **/.DS_Store
13
+
14
+ # data
15
+ data/*
16
+
17
+ # cursor
18
+ .cursor
@@ -0,0 +1,49 @@
1
+ {
2
+ // Use IntelliSense to learn about possible attributes.
3
+ // Hover to view descriptions of existing attributes.
4
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5
+ "version": "0.2.0",
6
+ "configurations": [
7
+ {
8
+ "name": "PMCID to Text",
9
+ "type": "debugpy",
10
+ "request": "launch",
11
+ "cwd": "${workspaceFolder}",
12
+ "module": "src.pmcid_to_text.fetch_article",
13
+ "args": ["--pmcid", "PMC1884285", "--save_path", "data/articles"],
14
+ "console": "integratedTerminal"
15
+ },
16
+ {
17
+ "name": "Check OA Status",
18
+ "type": "debugpy",
19
+ "request": "launch",
20
+ "cwd": "${workspaceFolder}",
21
+ "module": "src.pmcid_to_text.check_oa_status",
22
+ "console": "integratedTerminal"
23
+ },
24
+ {
25
+ "name": "Aaron HTML",
26
+ "type": "debugpy",
27
+ "request": "launch",
28
+ "cwd": "${workspaceFolder}",
29
+ "module": "src.pmcid_to_text.html_getter",
30
+ "console": "integratedTerminal"
31
+ },
32
+ {
33
+ "name": "Single Article",
34
+ "type": "debugpy",
35
+ "request": "launch",
36
+ "cwd": "${workspaceFolder}",
37
+ "module": "src.markdown_from_pmid",
38
+ "args": ["--pmid", "12895196", "--save_dir", "data"],
39
+ "console": "integratedTerminal"
40
+ },
41
+ {
42
+ "name": "Pubmed Converter",
43
+ "type": "debugpy",
44
+ "request": "launch",
45
+ "cwd": "${workspaceFolder}",
46
+ "module": "src.pubmed_downloader"
47
+ }
48
+ ]
49
+ }
@@ -0,0 +1,13 @@
1
+ FROM python:3.12-slim
2
+
3
+ WORKDIR /app
4
+
5
+ COPY requirements.txt .
6
+ RUN pip install --no-cache-dir -r requirements.txt
7
+
8
+ COPY pubmed_downloader/ pubmed_downloader/
9
+ COPY api.py .
10
+
11
+ EXPOSE 8000
12
+
13
+ CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "8000"]
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Shlok Natarajan
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,179 @@
1
+ Metadata-Version: 2.4
2
+ Name: pubmed-markdown
3
+ Version: 0.1.0
4
+ Summary: Convert PubMed articles (PMIDs or PMCIDs) to clean, structured markdown with full text, abstracts, and supplementary materials
5
+ Project-URL: Homepage, https://github.com/shloknatarajan/PubMedDownloader
6
+ Project-URL: Repository, https://github.com/shloknatarajan/PubMedDownloader
7
+ Project-URL: Issues, https://github.com/shloknatarajan/PubMedDownloader/issues
8
+ Author-email: Shlok Natarajan <shlok.natarajan@gmail.com>
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: bioinformatics,markdown,pharmacogenomics,pmc,pubmed
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
20
+ Requires-Python: >=3.11
21
+ Requires-Dist: beautifulsoup4>=4.13.0
22
+ Requires-Dist: biopython>=1.85
23
+ Requires-Dist: loguru>=0.7.0
24
+ Requires-Dist: pandas>=2.0.0
25
+ Requires-Dist: python-dotenv>=1.0.0
26
+ Requires-Dist: requests>=2.32.0
27
+ Requires-Dist: tqdm>=4.67.0
28
+ Description-Content-Type: text/markdown
29
+
30
+ # PubMed Downloader
31
+
32
+ Convert PubMed articles to clean, structured markdown. Handles the full pipeline: PMID resolution, full-text extraction via PubMed Central, HTML-to-markdown conversion, and supplementary material retrieval.
33
+
34
+ For articles without open-access full text, the downloader automatically falls back to an abstract-only download.
35
+
36
+ ## Installation
37
+
38
+ ```bash
39
+ pip install git+https://github.com/shloknatarajan/PubMedDownloader.git
40
+ ```
41
+
42
+ ## Setup
43
+
44
+ Set your email for NCBI API identification (optional but recommended):
45
+
46
+ ```bash
47
+ export NCBI_EMAIL=your-email@institution.edu
48
+ ```
49
+
50
+ Or create a `.env` file in your working directory:
51
+
52
+ ```env
53
+ NCBI_EMAIL=your-email@institution.edu
54
+ ```
55
+
56
+ ## Usage
57
+
58
+ ### Python API
59
+
60
+ **Single article (returns markdown string, no files created):**
61
+
62
+ ```python
63
+ from pubmed_downloader import PubMedDownloader
64
+
65
+ downloader = PubMedDownloader()
66
+
67
+ # From PMID (resolves to PMCID automatically, falls back to abstract if not open access)
68
+ markdown = downloader.single_pmid_to_markdown("12895196")
69
+
70
+ # From PMCID directly
71
+ markdown = downloader.single_pmcid_to_markdown("PMC1884285")
72
+ ```
73
+
74
+ **Batch processing (saves HTML and markdown files to disk):**
75
+
76
+ ```python
77
+ from pubmed_downloader import PubMedDownloader
78
+
79
+ downloader = PubMedDownloader()
80
+ pmids = ["12895196", "17872605", "25051018"]
81
+ downloader.pmids_to_markdown(pmids, save_dir="data")
82
+ ```
83
+
84
+ This creates:
85
+ ```
86
+ data/
87
+ ├── html/ # Raw HTML from PMC
88
+ ├── markdown/ # Converted markdown files
89
+ ├── cache/ # PMID-to-PMCID mapping cache
90
+ └── pmcids.txt # Resolved PMCIDs
91
+ ```
92
+
93
+ **Add supplementary materials to existing markdown files:**
94
+
95
+ ```python
96
+ downloader.add_supplements_to_existing(save_dir="data")
97
+ ```
98
+
99
+ **Individual utility functions:**
100
+
101
+ ```python
102
+ from pubmed_downloader import (
103
+ get_pmcid_from_pmid,
104
+ get_html_from_pmcid,
105
+ get_abstract_markdown_from_pmid,
106
+ fetch_bioc_supplement,
107
+ )
108
+
109
+ # Resolve PMIDs to PMCIDs
110
+ mapping = get_pmcid_from_pmid(["12895196", "17872605"])
111
+
112
+ # Fetch raw HTML from PMC
113
+ html = get_html_from_pmcid("PMC1884285")
114
+
115
+ # Get abstract for non-open-access articles
116
+ abstract_md = get_abstract_markdown_from_pmid("12345678")
117
+
118
+ # Get supplementary material text
119
+ supplement = fetch_bioc_supplement("PMC6435416")
120
+ ```
121
+
122
+ ### Command Line
123
+
124
+ ```bash
125
+ # Convert PMIDs from a file (one PMID per line)
126
+ pubmed-download --file_path=pmids.txt --save_dir=data
127
+
128
+ # Add supplementary materials to existing markdown
129
+ pubmed-download --add_supplements --save_dir=data
130
+
131
+ # Clear all caches
132
+ pubmed-download --clear_caches
133
+ ```
134
+
135
+ ### API Reference
136
+
137
+ | Method | Creates Files | Returns | Use Case |
138
+ |--------|--------------|---------|----------|
139
+ | `single_pmid_to_markdown()` | No | Markdown string | Single article, programmatic use |
140
+ | `single_pmcid_to_markdown()` | No | Markdown string | Direct PMCID conversion |
141
+ | `pmids_to_markdown()` | Yes | None | Batch processing, building datasets |
142
+ | `local_html_to_markdown()` | Yes | None | Re-convert existing HTML files |
143
+ | `add_supplements_to_existing()` | Yes | None | Append supplements to existing markdown |
144
+
145
+ ## PharmGKB Integration
146
+
147
+ Extract PMIDs from PharmGKB variant annotations for pharmacogenomics research:
148
+
149
+ ```python
150
+ from pubmed_downloader.pharmgkb_annotations import get_pmid_list
151
+ from pubmed_downloader import PubMedDownloader
152
+
153
+ # Download PharmGKB annotations and extract PMIDs
154
+ pmids = get_pmid_list(save_dir="data")
155
+
156
+ # Convert to markdown
157
+ downloader = PubMedDownloader()
158
+ downloader.pmids_to_markdown([str(p) for p in pmids], save_dir="data")
159
+ ```
160
+
161
+ ## How It Works
162
+
163
+ 1. **PMID to PMCID** -- Uses NCBI's ID Converter API with batching, caching (30-day expiry), and rate limiting
164
+ 2. **HTML extraction** -- Fetches full article HTML from PubMed Central
165
+ 3. **Markdown conversion** -- Converts HTML to structured markdown preserving tables, figures, citations, and references
166
+ 4. **Supplementary materials** -- Fetches pre-processed supplement text via NCBI's BioC API
167
+ 5. **Abstract fallback** -- Articles not in PMC Open Access get the abstract and metadata via NCBI EFetch
168
+
169
+ ## Configuration
170
+
171
+ | Environment Variable | Default | Description |
172
+ |---------------------|---------|-------------|
173
+ | `NCBI_EMAIL` | None | Email for NCBI API identification |
174
+ | `PMID_CACHE_DIR` | `data/cache` | Cache directory path |
175
+ | `PMID_CACHE_FILE` | `pmid_to_pmcid.json` | Cache filename |
176
+
177
+ ## License
178
+
179
+ MIT
@@ -0,0 +1,150 @@
1
+ # PubMed Downloader
2
+
3
+ Convert PubMed articles to clean, structured markdown. Handles the full pipeline: PMID resolution, full-text extraction via PubMed Central, HTML-to-markdown conversion, and supplementary material retrieval.
4
+
5
+ For articles without open-access full text, the downloader automatically falls back to an abstract-only download.
6
+
7
+ ## Installation
8
+
9
+ ```bash
10
+ pip install git+https://github.com/shloknatarajan/PubMedDownloader.git
11
+ ```
12
+
13
+ ## Setup
14
+
15
+ Set your email for NCBI API identification (optional but recommended):
16
+
17
+ ```bash
18
+ export NCBI_EMAIL=your-email@institution.edu
19
+ ```
20
+
21
+ Or create a `.env` file in your working directory:
22
+
23
+ ```env
24
+ NCBI_EMAIL=your-email@institution.edu
25
+ ```
26
+
27
+ ## Usage
28
+
29
+ ### Python API
30
+
31
+ **Single article (returns markdown string, no files created):**
32
+
33
+ ```python
34
+ from pubmed_downloader import PubMedDownloader
35
+
36
+ downloader = PubMedDownloader()
37
+
38
+ # From PMID (resolves to PMCID automatically, falls back to abstract if not open access)
39
+ markdown = downloader.single_pmid_to_markdown("12895196")
40
+
41
+ # From PMCID directly
42
+ markdown = downloader.single_pmcid_to_markdown("PMC1884285")
43
+ ```
44
+
45
+ **Batch processing (saves HTML and markdown files to disk):**
46
+
47
+ ```python
48
+ from pubmed_downloader import PubMedDownloader
49
+
50
+ downloader = PubMedDownloader()
51
+ pmids = ["12895196", "17872605", "25051018"]
52
+ downloader.pmids_to_markdown(pmids, save_dir="data")
53
+ ```
54
+
55
+ This creates:
56
+ ```
57
+ data/
58
+ ├── html/ # Raw HTML from PMC
59
+ ├── markdown/ # Converted markdown files
60
+ ├── cache/ # PMID-to-PMCID mapping cache
61
+ └── pmcids.txt # Resolved PMCIDs
62
+ ```
63
+
64
+ **Add supplementary materials to existing markdown files:**
65
+
66
+ ```python
67
+ downloader.add_supplements_to_existing(save_dir="data")
68
+ ```
69
+
70
+ **Individual utility functions:**
71
+
72
+ ```python
73
+ from pubmed_downloader import (
74
+ get_pmcid_from_pmid,
75
+ get_html_from_pmcid,
76
+ get_abstract_markdown_from_pmid,
77
+ fetch_bioc_supplement,
78
+ )
79
+
80
+ # Resolve PMIDs to PMCIDs
81
+ mapping = get_pmcid_from_pmid(["12895196", "17872605"])
82
+
83
+ # Fetch raw HTML from PMC
84
+ html = get_html_from_pmcid("PMC1884285")
85
+
86
+ # Get abstract for non-open-access articles
87
+ abstract_md = get_abstract_markdown_from_pmid("12345678")
88
+
89
+ # Get supplementary material text
90
+ supplement = fetch_bioc_supplement("PMC6435416")
91
+ ```
92
+
93
+ ### Command Line
94
+
95
+ ```bash
96
+ # Convert PMIDs from a file (one PMID per line)
97
+ pubmed-download --file_path=pmids.txt --save_dir=data
98
+
99
+ # Add supplementary materials to existing markdown
100
+ pubmed-download --add_supplements --save_dir=data
101
+
102
+ # Clear all caches
103
+ pubmed-download --clear_caches
104
+ ```
105
+
106
+ ### API Reference
107
+
108
+ | Method | Creates Files | Returns | Use Case |
109
+ |--------|--------------|---------|----------|
110
+ | `single_pmid_to_markdown()` | No | Markdown string | Single article, programmatic use |
111
+ | `single_pmcid_to_markdown()` | No | Markdown string | Direct PMCID conversion |
112
+ | `pmids_to_markdown()` | Yes | None | Batch processing, building datasets |
113
+ | `local_html_to_markdown()` | Yes | None | Re-convert existing HTML files |
114
+ | `add_supplements_to_existing()` | Yes | None | Append supplements to existing markdown |
115
+
116
+ ## PharmGKB Integration
117
+
118
+ Extract PMIDs from PharmGKB variant annotations for pharmacogenomics research:
119
+
120
+ ```python
121
+ from pubmed_downloader.pharmgkb_annotations import get_pmid_list
122
+ from pubmed_downloader import PubMedDownloader
123
+
124
+ # Download PharmGKB annotations and extract PMIDs
125
+ pmids = get_pmid_list(save_dir="data")
126
+
127
+ # Convert to markdown
128
+ downloader = PubMedDownloader()
129
+ downloader.pmids_to_markdown([str(p) for p in pmids], save_dir="data")
130
+ ```
131
+
132
+ ## How It Works
133
+
134
+ 1. **PMID to PMCID** -- Uses NCBI's ID Converter API with batching, caching (30-day expiry), and rate limiting
135
+ 2. **HTML extraction** -- Fetches full article HTML from PubMed Central
136
+ 3. **Markdown conversion** -- Converts HTML to structured markdown preserving tables, figures, citations, and references
137
+ 4. **Supplementary materials** -- Fetches pre-processed supplement text via NCBI's BioC API
138
+ 5. **Abstract fallback** -- Articles not in PMC Open Access get the abstract and metadata via NCBI EFetch
139
+
140
+ ## Configuration
141
+
142
+ | Environment Variable | Default | Description |
143
+ |---------------------|---------|-------------|
144
+ | `NCBI_EMAIL` | None | Email for NCBI API identification |
145
+ | `PMID_CACHE_DIR` | `data/cache` | Cache directory path |
146
+ | `PMID_CACHE_FILE` | `pmid_to_pmcid.json` | Cache filename |
147
+
148
+ ## License
149
+
150
+ MIT