pmc-toolkit 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pmc_toolkit-0.1.0/.github/workflows/ci.yml +44 -0
- pmc_toolkit-0.1.0/.github/workflows/release.yml +41 -0
- pmc_toolkit-0.1.0/.gitignore +10 -0
- pmc_toolkit-0.1.0/.python-version +1 -0
- pmc_toolkit-0.1.0/AGENTS.md +19 -0
- pmc_toolkit-0.1.0/LICENSE +21 -0
- pmc_toolkit-0.1.0/PKG-INFO +183 -0
- pmc_toolkit-0.1.0/README.md +151 -0
- pmc_toolkit-0.1.0/RELEASING.md +44 -0
- pmc_toolkit-0.1.0/pyproject.toml +55 -0
- pmc_toolkit-0.1.0/src/pmc_toolkit/__init__.py +0 -0
- pmc_toolkit-0.1.0/src/pmc_toolkit/cache.py +144 -0
- pmc_toolkit-0.1.0/src/pmc_toolkit/cli.py +196 -0
- pmc_toolkit-0.1.0/src/pmc_toolkit/models.py +56 -0
- pmc_toolkit-0.1.0/src/pmc_toolkit/storage_api.py +108 -0
- pmc_toolkit-0.1.0/src/pmc_toolkit/storage_utils.py +169 -0
- pmc_toolkit-0.1.0/src/pmc_toolkit/validators.py +37 -0
- pmc_toolkit-0.1.0/src/pmc_toolkit/xml_parse_api.py +232 -0
- pmc_toolkit-0.1.0/src/pmc_toolkit/xml_parse_utils.py +856 -0
- pmc_toolkit-0.1.0/tests/test_cli.py +251 -0
- pmc_toolkit-0.1.0/tests/test_storage.py +586 -0
- pmc_toolkit-0.1.0/tests/test_validators.py +45 -0
- pmc_toolkit-0.1.0/tests/test_xml_parse_api.py +127 -0
- pmc_toolkit-0.1.0/tests/test_xml_parse_utils.py +284 -0
- pmc_toolkit-0.1.0/uv.lock +627 -0
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
pull_request:
|
|
5
|
+
push:
|
|
6
|
+
branches: [main]
|
|
7
|
+
|
|
8
|
+
concurrency:
|
|
9
|
+
group: ci-${{ github.ref }}
|
|
10
|
+
cancel-in-progress: true
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
lint-typecheck-test:
|
|
14
|
+
runs-on: ubuntu-latest
|
|
15
|
+
strategy:
|
|
16
|
+
fail-fast: false
|
|
17
|
+
matrix:
|
|
18
|
+
python-version: ["3.11", "3.12", "3.13"]
|
|
19
|
+
steps:
|
|
20
|
+
- uses: actions/checkout@v4
|
|
21
|
+
|
|
22
|
+
- name: Install uv
|
|
23
|
+
uses: astral-sh/setup-uv@v6
|
|
24
|
+
with:
|
|
25
|
+
enable-cache: true
|
|
26
|
+
cache-dependency-glob: uv.lock
|
|
27
|
+
|
|
28
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
29
|
+
run: uv python install ${{ matrix.python-version }}
|
|
30
|
+
|
|
31
|
+
- name: Install dependencies
|
|
32
|
+
run: uv sync --all-extras --dev --python ${{ matrix.python-version }}
|
|
33
|
+
|
|
34
|
+
- name: Ruff lint
|
|
35
|
+
run: uv run ruff check .
|
|
36
|
+
|
|
37
|
+
- name: Ruff format check
|
|
38
|
+
run: uv run ruff format --check .
|
|
39
|
+
|
|
40
|
+
- name: Type check (ty)
|
|
41
|
+
run: uv run ty check
|
|
42
|
+
|
|
43
|
+
- name: Pytest
|
|
44
|
+
run: uv run pytest
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
name: Release
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags: ["v*"]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
build-and-publish:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
environment:
|
|
11
|
+
name: pypi
|
|
12
|
+
url: https://pypi.org/p/pmc-toolkit
|
|
13
|
+
permissions:
|
|
14
|
+
id-token: write
|
|
15
|
+
contents: read
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
|
|
19
|
+
- name: Install uv
|
|
20
|
+
uses: astral-sh/setup-uv@v6
|
|
21
|
+
with:
|
|
22
|
+
enable-cache: true
|
|
23
|
+
cache-dependency-glob: uv.lock
|
|
24
|
+
|
|
25
|
+
- name: Set up Python
|
|
26
|
+
run: uv python install 3.13
|
|
27
|
+
|
|
28
|
+
- name: Verify tag matches package version
|
|
29
|
+
run: |
|
|
30
|
+
tag="${GITHUB_REF#refs/tags/v}"
|
|
31
|
+
pkg=$(uv version --short)
|
|
32
|
+
if [ "$tag" != "$pkg" ]; then
|
|
33
|
+
echo "::error::Git tag v$tag does not match pyproject.toml version $pkg"
|
|
34
|
+
exit 1
|
|
35
|
+
fi
|
|
36
|
+
|
|
37
|
+
- name: Build distributions
|
|
38
|
+
run: uv build --no-sources
|
|
39
|
+
|
|
40
|
+
- name: Publish to PyPI (Trusted Publishing)
|
|
41
|
+
run: uv publish
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.14
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# Agent / dev notes
|
|
2
|
+
|
|
3
|
+
- ALWAYS use **uv** for Python environment, dependency, and tool commands; do not use **pip**, **python -m pip**, **virtualenv**, **poetry**, or similar unless explicitly asked.
|
|
4
|
+
|
|
5
|
+
- ALWAYS run repo tools via **`uv run`** from the project root.
|
|
6
|
+
|
|
7
|
+
- AFTER Python edits, run **`uv run ty check`**, **`uv run ruff check`**, and **`uv run ruff format`**; run **`uv sync`** when dependencies change.
|
|
8
|
+
|
|
9
|
+
- AFTER behavior changes, run **`uv run pytest`** to check for regressions and report the result.
|
|
10
|
+
|
|
11
|
+
- PREFER **`uv run pytest`** with a path, node id, or **`-k`** while iterating, not the whole suite each time.
|
|
12
|
+
|
|
13
|
+
- NEVER start development servers, watchers, builds, or long-running local processes unless explicitly asked.
|
|
14
|
+
|
|
15
|
+
- NEVER add or modify tests unless explicitly asked.
|
|
16
|
+
|
|
17
|
+
- NEVER assume **`ruff`**, **`ty check`**, or **`pytest`** failures on main are pre-existing.
|
|
18
|
+
|
|
19
|
+
- AVOID shortened names; prefer descriptive names like `version` over `ver`.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Jaka
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pmc-toolkit
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python toolkit and CLI for exploring, downloading, and parsing PMC article data.
|
|
5
|
+
Project-URL: Homepage, https://github.com/JakaKokosar/pmc-toolkit
|
|
6
|
+
Project-URL: Repository, https://github.com/JakaKokosar/pmc-toolkit
|
|
7
|
+
Project-URL: Issues, https://github.com/JakaKokosar/pmc-toolkit/issues
|
|
8
|
+
Author: Jaka Kokosar
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: bioinformatics,full-text,ncbi,open-access,pmc,pubmed
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Environment :: Console
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: Science/Research
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
23
|
+
Classifier: Topic :: Text Processing :: Markup :: XML
|
|
24
|
+
Classifier: Typing :: Typed
|
|
25
|
+
Requires-Python: >=3.11
|
|
26
|
+
Requires-Dist: boto3>=1.42.93
|
|
27
|
+
Requires-Dist: lxml>=6.1.0
|
|
28
|
+
Requires-Dist: platformdirs>=4.3.0
|
|
29
|
+
Requires-Dist: pydantic>=2.13.3
|
|
30
|
+
Requires-Dist: typer>=0.24.1
|
|
31
|
+
Description-Content-Type: text/markdown
|
|
32
|
+
|
|
33
|
+
# PMC Toolkit
|
|
34
|
+
|
|
35
|
+
Python toolkit and CLI for exploring, downloading, and parsing PMC article data
|
|
36
|
+
from the PMC Open Access dataset on AWS S3 (`s3://pmc-oa-opendata`).
|
|
37
|
+
|
|
38
|
+
## Current Status
|
|
39
|
+
|
|
40
|
+
The project currently supports:
|
|
41
|
+
|
|
42
|
+
- listing available versions for a PMCID
|
|
43
|
+
- validating PMC identifiers before making requests
|
|
44
|
+
- retrieving metadata for a PMC identifier, defaulting to the latest version for a base PMCID
|
|
45
|
+
- listing every object for a resolved article version, using the local cache when available
|
|
46
|
+
- downloading files for an article version into a local cache (optional `--ext`
|
|
47
|
+
filters apply only to `fetch`, not to `files`; `--ext` accepts either a
|
|
48
|
+
comma-separated list or repeated flags)
|
|
49
|
+
- parsing cached full-text XML into a normalized article dictionary with
|
|
50
|
+
title, journal, article, affiliations, author notes, abstract, content,
|
|
51
|
+
acknowledgements, data availability, related articles, custom metadata,
|
|
52
|
+
competing interests, supplementary media, references, figures, and tables
|
|
53
|
+
|
|
54
|
+
## Requirements
|
|
55
|
+
|
|
56
|
+
- Python 3.11+
|
|
57
|
+
- `uv`
|
|
58
|
+
|
|
59
|
+
## Setup
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
uv sync
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Development
|
|
66
|
+
|
|
67
|
+
After code changes, run the checks in [AGENTS.md](AGENTS.md) (typecheck, Ruff, tests).
|
|
68
|
+
|
|
69
|
+
## CLI Usage
|
|
70
|
+
|
|
71
|
+
Show the available commands:
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
uv run pmc --help
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
CLI commands print indented JSON to stdout.
|
|
78
|
+
|
|
79
|
+
List versions for a PMC article:
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
uv run pmc versions PMC11370360
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
Fetch metadata for the latest available version of a PMCID:
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
uv run pmc metadata PMC11370360
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
Fetch metadata for a specific version:
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
uv run pmc metadata PMC11370360.1
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
List every object key for an article version (including media and supplements).
|
|
98
|
+
For unversioned IDs, the CLI resolves the latest version from S3 first; once the
|
|
99
|
+
version is known, the cached object-key manifest is reused when present. There
|
|
100
|
+
is no extension filter on this command.
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
uv run pmc files PMC11370360.1
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
Download files to a local cache. The default root is the **per-OS user cache
|
|
107
|
+
directory** from
|
|
108
|
+
[`platformdirs`](https://github.com/tox-dev/platformdirs) (e.g. `~/.cache/pmc-toolkit` on
|
|
109
|
+
Linux, `~/Library/Caches/pmc-toolkit` on macOS, and under `%LOCALAPPDATA%` on
|
|
110
|
+
Windows), with files under `<root>/<PMCid.N>/`. Override with `--cache-dir` or
|
|
111
|
+
`PMC_TOOLKIT_CACHE`.
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
uv run pmc fetch PMC11370360.1
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
Download only selected file types, re-downloading even if cached:
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
uv run pmc fetch PMC11370360.1 --ext xml,pdf,jpg --force
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
The `--ext` option also accepts repeated flags if you prefer the more explicit
|
|
124
|
+
form:
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
uv run pmc fetch PMC11370360.1 --ext pdf --ext xml --ext jpg --force
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
Override the cache location via a flag or the `PMC_TOOLKIT_CACHE` env var:
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
uv run pmc fetch PMC11370360.1 --cache-dir ./data
|
|
134
|
+
PMC_TOOLKIT_CACHE=./data uv run pmc fetch PMC11370360.1
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
Convert a cached XML file into extracted JSON. Run `fetch --ext xml` first if
|
|
138
|
+
the XML is not already in the cache. The first conversion parses XML once,
|
|
139
|
+
writes `<cache_root>/<PMCid.N>/.pmc-extracted-article.json`, and prints the
|
|
140
|
+
extracted JSON; later conversions for the same article version read that JSON
|
|
141
|
+
cache unless `--force` is passed.
|
|
142
|
+
|
|
143
|
+
```bash
|
|
144
|
+
uv run pmc fetch PMC11370360.1 --ext xml
|
|
145
|
+
uv run pmc convert-xml PMC11370360.1
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
List the extracted JSON top-level keys:
|
|
149
|
+
|
|
150
|
+
```bash
|
|
151
|
+
uv run pmc convert-xml --list-keys PMC11370360.1
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
`article_info.publication_date` currently uses the first publication date found
|
|
155
|
+
in the XML. If downstream consumers need to distinguish date types such as
|
|
156
|
+
`epub`, `ppub`, or `collection`, the output can be extended later.
|
|
157
|
+
|
|
158
|
+
## Project Layout
|
|
159
|
+
|
|
160
|
+
Here **“storage”** means the AWS bucket plus the local cache directory where
|
|
161
|
+
`pmc fetch` writes files—not a database or ORM.
|
|
162
|
+
|
|
163
|
+
- `src/pmc_toolkit/cli.py` - Typer CLI commands
|
|
164
|
+
- `src/pmc_toolkit/storage_api.py` - import this for programmatic use: list versions, metadata, list all keys, fetch to cache
|
|
165
|
+
- `src/pmc_toolkit/storage_utils.py` - boto3/unsigned S3 client, list-objects, downloads; implementation details for `storage_api`
|
|
166
|
+
- `src/pmc_toolkit/xml_parse_api.py` - import this for programmatic parsing of cached XML files
|
|
167
|
+
- `src/pmc_toolkit/xml_parse_utils.py` - small `lxml`-based helpers for cached full-text XML extraction
|
|
168
|
+
- `src/pmc_toolkit/cache.py` - per-article directories under the cache root, JSON metadata, cached S3 key listings, and safe local paths for downloaded objects
|
|
169
|
+
- `src/pmc_toolkit/validators.py` - identifier validation
|
|
170
|
+
- `src/pmc_toolkit/models.py` - response models
|
|
171
|
+
- `tests/` - automated tests
|
|
172
|
+
|
|
173
|
+
### Local cache
|
|
174
|
+
|
|
175
|
+
Each resolved article version has a directory `<cache_root>/<PMCid.N>/` containing:
|
|
176
|
+
|
|
177
|
+
- **`<PMCid.N>.json`** — cached metadata (from S3 `metadata/<PMCid.N>.json`), written after a successful read.
|
|
178
|
+
- **`.pmc-object-keys.json`** — JSON array of S3 object keys under that article’s prefix, written after `list_objects_v2` (or read on cache hit). If this file is missing or not a list of strings, listing or fetch may refetch from S3 or raise `ValueError` for an invalid manifest.
|
|
179
|
+
- **`.pmc-extracted-article.json`** — full extracted JSON produced from the cached XML by `pmc convert-xml`; reused by later conversions for the same article version.
|
|
180
|
+
|
|
181
|
+
**Cache root selection:** `pmc metadata` and `pmc files` (and the matching `storage_api` functions) always use the default OS user cache from [`platformdirs`](https://github.com/tox-dev/platformdirs). Only `pmc fetch` and `fetch_files(..., cache_dir=...)` accept a custom cache root, supplied through the `--cache-dir` flag, the `cache_dir` argument, or the `PMC_TOOLKIT_CACHE` environment variable.
|
|
182
|
+
|
|
183
|
+
**Download paths:** For each S3 key, the toolkit maps `PMCid.N/relative/path` to `<cache_root>/PMCid.N/relative/path`. Keys that do not start with the `PMCid.N/` prefix, use an absolute path segment, or resolve outside that directory (for example `..` path segments) are rejected with `ValueError` so downloads never leave the article folder.
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
# PMC Toolkit
|
|
2
|
+
|
|
3
|
+
Python toolkit and CLI for exploring, downloading, and parsing PMC article data
|
|
4
|
+
from the PMC Open Access dataset on AWS S3 (`s3://pmc-oa-opendata`).
|
|
5
|
+
|
|
6
|
+
## Current Status
|
|
7
|
+
|
|
8
|
+
The project currently supports:
|
|
9
|
+
|
|
10
|
+
- listing available versions for a PMCID
|
|
11
|
+
- validating PMC identifiers before making requests
|
|
12
|
+
- retrieving metadata for a PMC identifier, defaulting to the latest version for a base PMCID
|
|
13
|
+
- listing every object for a resolved article version, using the local cache when available
|
|
14
|
+
- downloading files for an article version into a local cache (optional `--ext`
|
|
15
|
+
filters apply only to `fetch`, not to `files`; `--ext` accepts either a
|
|
16
|
+
comma-separated list or repeated flags)
|
|
17
|
+
- parsing cached full-text XML into a normalized article dictionary with
|
|
18
|
+
title, journal, article, affiliations, author notes, abstract, content,
|
|
19
|
+
acknowledgements, data availability, related articles, custom metadata,
|
|
20
|
+
competing interests, supplementary media, references, figures, and tables
|
|
21
|
+
|
|
22
|
+
## Requirements
|
|
23
|
+
|
|
24
|
+
- Python 3.11+
|
|
25
|
+
- `uv`
|
|
26
|
+
|
|
27
|
+
## Setup
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
uv sync
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Development
|
|
34
|
+
|
|
35
|
+
After code changes, run the checks in [AGENTS.md](AGENTS.md) (typecheck, Ruff, tests).
|
|
36
|
+
|
|
37
|
+
## CLI Usage
|
|
38
|
+
|
|
39
|
+
Show the available commands:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
uv run pmc --help
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
CLI commands print indented JSON to stdout.
|
|
46
|
+
|
|
47
|
+
List versions for a PMC article:
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
uv run pmc versions PMC11370360
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Fetch metadata for the latest available version of a PMCID:
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
uv run pmc metadata PMC11370360
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
Fetch metadata for a specific version:
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
uv run pmc metadata PMC11370360.1
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
List every object key for an article version (including media and supplements).
|
|
66
|
+
For unversioned IDs, the CLI resolves the latest version from S3 first; once the
|
|
67
|
+
version is known, the cached object-key manifest is reused when present. There
|
|
68
|
+
is no extension filter on this command.
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
uv run pmc files PMC11370360.1
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Download files to a local cache. The default root is the **per-OS user cache
|
|
75
|
+
directory** from
|
|
76
|
+
[`platformdirs`](https://github.com/tox-dev/platformdirs) (e.g. `~/.cache/pmc-toolkit` on
|
|
77
|
+
Linux, `~/Library/Caches/pmc-toolkit` on macOS, and under `%LOCALAPPDATA%` on
|
|
78
|
+
Windows), with files under `<root>/<PMCid.N>/`. Override with `--cache-dir` or
|
|
79
|
+
`PMC_TOOLKIT_CACHE`.
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
uv run pmc fetch PMC11370360.1
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
Download only selected file types, re-downloading even if cached:
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
uv run pmc fetch PMC11370360.1 --ext xml,pdf,jpg --force
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
The `--ext` option also accepts repeated flags if you prefer the more explicit
|
|
92
|
+
form:
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
uv run pmc fetch PMC11370360.1 --ext pdf --ext xml --ext jpg --force
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
Override the cache location via a flag or the `PMC_TOOLKIT_CACHE` env var:
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
uv run pmc fetch PMC11370360.1 --cache-dir ./data
|
|
102
|
+
PMC_TOOLKIT_CACHE=./data uv run pmc fetch PMC11370360.1
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
Convert a cached XML file into extracted JSON. Run `fetch --ext xml` first if
|
|
106
|
+
the XML is not already in the cache. The first conversion parses XML once,
|
|
107
|
+
writes `<cache_root>/<PMCid.N>/.pmc-extracted-article.json`, and prints the
|
|
108
|
+
extracted JSON; later conversions for the same article version read that JSON
|
|
109
|
+
cache unless `--force` is passed.
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
uv run pmc fetch PMC11370360.1 --ext xml
|
|
113
|
+
uv run pmc convert-xml PMC11370360.1
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
List the extracted JSON top-level keys:
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
uv run pmc convert-xml --list-keys PMC11370360.1
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
`article_info.publication_date` currently uses the first publication date found
|
|
123
|
+
in the XML. If downstream consumers need to distinguish date types such as
|
|
124
|
+
`epub`, `ppub`, or `collection`, the output can be extended later.
|
|
125
|
+
|
|
126
|
+
## Project Layout
|
|
127
|
+
|
|
128
|
+
Here **“storage”** means the AWS bucket plus the local cache directory where
|
|
129
|
+
`pmc fetch` writes files—not a database or ORM.
|
|
130
|
+
|
|
131
|
+
- `src/pmc_toolkit/cli.py` - Typer CLI commands
|
|
132
|
+
- `src/pmc_toolkit/storage_api.py` - import this for programmatic use: list versions, metadata, list all keys, fetch to cache
|
|
133
|
+
- `src/pmc_toolkit/storage_utils.py` - boto3/unsigned S3 client, list-objects, downloads; implementation details for `storage_api`
|
|
134
|
+
- `src/pmc_toolkit/xml_parse_api.py` - import this for programmatic parsing of cached XML files
|
|
135
|
+
- `src/pmc_toolkit/xml_parse_utils.py` - small `lxml`-based helpers for cached full-text XML extraction
|
|
136
|
+
- `src/pmc_toolkit/cache.py` - per-article directories under the cache root, JSON metadata, cached S3 key listings, and safe local paths for downloaded objects
|
|
137
|
+
- `src/pmc_toolkit/validators.py` - identifier validation
|
|
138
|
+
- `src/pmc_toolkit/models.py` - response models
|
|
139
|
+
- `tests/` - automated tests
|
|
140
|
+
|
|
141
|
+
### Local cache
|
|
142
|
+
|
|
143
|
+
Each resolved article version has a directory `<cache_root>/<PMCid.N>/` containing:
|
|
144
|
+
|
|
145
|
+
- **`<PMCid.N>.json`** — cached metadata (from S3 `metadata/<PMCid.N>.json`), written after a successful read.
|
|
146
|
+
- **`.pmc-object-keys.json`** — JSON array of S3 object keys under that article’s prefix, written after `list_objects_v2` (or read on cache hit). If this file is missing or not a list of strings, listing or fetch may refetch from S3 or raise `ValueError` for an invalid manifest.
|
|
147
|
+
- **`.pmc-extracted-article.json`** — full extracted JSON produced from the cached XML by `pmc convert-xml`; reused by later conversions for the same article version.
|
|
148
|
+
|
|
149
|
+
**Cache root selection:** `pmc metadata` and `pmc files` (and the matching `storage_api` functions) always use the default OS user cache from [`platformdirs`](https://github.com/tox-dev/platformdirs). Only `pmc fetch` and `fetch_files(..., cache_dir=...)` accept a custom cache root, supplied through the `--cache-dir` flag, the `cache_dir` argument, or the `PMC_TOOLKIT_CACHE` environment variable.
|
|
150
|
+
|
|
151
|
+
**Download paths:** For each S3 key, the toolkit maps `PMCid.N/relative/path` to `<cache_root>/PMCid.N/relative/path`. Keys that do not start with the `PMCid.N/` prefix, use an absolute path segment, or resolve outside that directory (for example `..` path segments) are rejected with `ValueError` so downloads never leave the article folder.
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# Releasing
|
|
2
|
+
|
|
3
|
+
Pushing a `v*` git tag triggers [`release.yml`](.github/workflows/release.yml),
|
|
4
|
+
which builds with `uv build` and publishes to
|
|
5
|
+
[PyPI](https://pypi.org/project/pmc-toolkit/) via Trusted Publishing (OIDC,
|
|
6
|
+
no tokens).
|
|
7
|
+
|
|
8
|
+
## Release flow
|
|
9
|
+
|
|
10
|
+
```sh
|
|
11
|
+
# 1. Make sure main is green and up to date.
|
|
12
|
+
git switch main && git pull
|
|
13
|
+
|
|
14
|
+
# 2. Bump version (--bump patch | minor | major; or run `uv version X.Y.Z` to set an exact version).
|
|
15
|
+
uv version --bump patch
|
|
16
|
+
version="$(uv version --short)"
|
|
17
|
+
|
|
18
|
+
# 3. Commit, push, wait for CI to go green.
|
|
19
|
+
git add pyproject.toml uv.lock
|
|
20
|
+
git commit -m "release: v${version}"
|
|
21
|
+
git push
|
|
22
|
+
|
|
23
|
+
# 4. Tag and push — this triggers the publish.
|
|
24
|
+
git tag "v${version}"
|
|
25
|
+
git push origin "v${version}"
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
Watch the **Release** workflow in the Actions tab. Approve the `pypi`
|
|
29
|
+
deployment if the environment requires it. Smoke test:
|
|
30
|
+
|
|
31
|
+
```sh
|
|
32
|
+
uv run --with "pmc-toolkit==${version}" --no-project -- pmc --help
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
Optionally draft a GitHub Release from the tag for user-facing notes.
|
|
36
|
+
|
|
37
|
+
## Troubleshooting
|
|
38
|
+
|
|
39
|
+
- **`invalid-publisher`** — PyPI trusted-publisher fields don't match the
|
|
40
|
+
workflow run (case-sensitive).
|
|
41
|
+
- **Tag/version mismatch** — fix `pyproject.toml` or delete the bad tag:
|
|
42
|
+
`git tag -d vX.Y.Z && git push --delete origin vX.Y.Z`.
|
|
43
|
+
- **Bad release** — PyPI forbids re-uploading the same version. Yank it on
|
|
44
|
+
PyPI, bump again, re-release.
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "pmc-toolkit"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Python toolkit and CLI for exploring, downloading, and parsing PMC article data."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
license-files = ["LICENSE"]
|
|
13
|
+
authors = [{ name = "Jaka Kokosar" }]
|
|
14
|
+
keywords = ["pmc", "pubmed", "bioinformatics", "ncbi", "open-access", "full-text"]
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Development Status :: 4 - Beta",
|
|
17
|
+
"Environment :: Console",
|
|
18
|
+
"Intended Audience :: Science/Research",
|
|
19
|
+
"Intended Audience :: Developers",
|
|
20
|
+
"Operating System :: OS Independent",
|
|
21
|
+
"Programming Language :: Python",
|
|
22
|
+
"Programming Language :: Python :: 3",
|
|
23
|
+
"Programming Language :: Python :: 3.11",
|
|
24
|
+
"Programming Language :: Python :: 3.12",
|
|
25
|
+
"Programming Language :: Python :: 3.13",
|
|
26
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
27
|
+
"Topic :: Text Processing :: Markup :: XML",
|
|
28
|
+
"Typing :: Typed",
|
|
29
|
+
]
|
|
30
|
+
dependencies = [
|
|
31
|
+
"boto3>=1.42.93",
|
|
32
|
+
"lxml>=6.1.0",
|
|
33
|
+
"platformdirs>=4.3.0",
|
|
34
|
+
"pydantic>=2.13.3",
|
|
35
|
+
"typer>=0.24.1",
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
[project.urls]
|
|
39
|
+
Homepage = "https://github.com/JakaKokosar/pmc-toolkit"
|
|
40
|
+
Repository = "https://github.com/JakaKokosar/pmc-toolkit"
|
|
41
|
+
Issues = "https://github.com/JakaKokosar/pmc-toolkit/issues"
|
|
42
|
+
|
|
43
|
+
[dependency-groups]
|
|
44
|
+
dev = [
|
|
45
|
+
"pytest>=9.0.3",
|
|
46
|
+
"ruff>=0.15.11",
|
|
47
|
+
"ty>=0.0.32",
|
|
48
|
+
"types-boto3[s3]>=1.42.94",
|
|
49
|
+
]
|
|
50
|
+
|
|
51
|
+
[project.scripts]
|
|
52
|
+
pmc = "pmc_toolkit.cli:main"
|
|
53
|
+
|
|
54
|
+
[tool.hatch.build.targets.wheel]
|
|
55
|
+
packages = ["src/pmc_toolkit"]
|
|
File without changes
|