pmc-toolkit 0.2.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/PKG-INFO +6 -11
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/README.md +5 -10
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/RELEASING.md +4 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/pyproject.toml +1 -1
- pmc_toolkit-0.3.0/skills/pmc-toolkit/SKILL.md +50 -0
- pmc_toolkit-0.3.0/skills/pmc-toolkit/agents/openai.yaml +4 -0
- pmc_toolkit-0.3.0/skills/pmc-toolkit/references/cli-metadata.md +36 -0
- pmc_toolkit-0.3.0/skills/pmc-toolkit/references/cli-parse-figures.md +3 -0
- pmc_toolkit-0.3.0/skills/pmc-toolkit/references/cli-parse-references.md +3 -0
- pmc_toolkit-0.3.0/skills/pmc-toolkit/references/cli-parse-tables.md +3 -0
- pmc_toolkit-0.3.0/skills/pmc-toolkit/references/cli-parse.md +122 -0
- pmc_toolkit-0.3.0/skills/pmc-toolkit/references/cli-versions.md +33 -0
- pmc_toolkit-0.3.0/skills/pmc-toolkit/scripts/content-outline.jq +14 -0
- pmc_toolkit-0.3.0/skills/pmc-toolkit/scripts/query-id.jq +3 -0
- pmc_toolkit-0.3.0/skills/pmc-toolkit/scripts/reverse-lookup-xref.jq +13 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/src/pmc_toolkit/cli.py +3 -15
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/src/pmc_toolkit/xml_parse_utils.py +0 -20
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/uv.lock +1 -1
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/.github/workflows/ci.yml +0 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/.github/workflows/release.yml +0 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/.gitignore +0 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/.python-version +0 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/AGENTS.md +0 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/LICENSE +0 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/src/pmc_toolkit/__init__.py +0 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/src/pmc_toolkit/cache.py +0 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/src/pmc_toolkit/models.py +0 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/src/pmc_toolkit/storage_api.py +0 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/src/pmc_toolkit/storage_utils.py +0 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/src/pmc_toolkit/validators.py +0 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/src/pmc_toolkit/xml_parse_api.py +0 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/tests/test_cli.py +0 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/tests/test_storage.py +0 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/tests/test_validators.py +0 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/tests/test_xml_parse_api.py +0 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/tests/test_xml_parse_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pmc-toolkit
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Python toolkit and CLI for exploring, downloading, and parsing PMC article data.
|
|
5
5
|
Project-URL: Homepage, https://github.com/JakaKokosar/pmc-toolkit
|
|
6
6
|
Project-URL: Repository, https://github.com/JakaKokosar/pmc-toolkit
|
|
@@ -134,22 +134,17 @@ uv run pmc-toolkit fetch PMC11370360.1 --cache-dir ./data
|
|
|
134
134
|
PMC_TOOLKIT_CACHE=./data uv run pmc-toolkit fetch PMC11370360.1
|
|
135
135
|
```
|
|
136
136
|
|
|
137
|
-
|
|
138
|
-
the XML is not already in the cache. The first
|
|
137
|
+
Parse a cached XML file into extracted JSON. Run `fetch --ext xml` first if
|
|
138
|
+
the XML is not already in the cache. The first parse reads XML once,
|
|
139
139
|
writes `<cache-root>/<PMCid.N>/.pmc-extracted-article.json`, and prints the
|
|
140
|
-
extracted JSON; later
|
|
140
|
+
extracted JSON; later parses for the same article version read that JSON
|
|
141
141
|
cache unless `--force` is passed.
|
|
142
142
|
|
|
143
143
|
```bash
|
|
144
144
|
uv run pmc-toolkit fetch PMC11370360.1 --ext xml
|
|
145
|
-
uv run pmc-toolkit
|
|
145
|
+
uv run pmc-toolkit parse PMC11370360.1
|
|
146
146
|
```
|
|
147
147
|
|
|
148
|
-
List the extracted JSON top-level keys:
|
|
149
|
-
|
|
150
|
-
```bash
|
|
151
|
-
uv run pmc-toolkit convert-xml --list-keys PMC11370360.1
|
|
152
|
-
```
|
|
153
148
|
|
|
154
149
|
`article_info.publication_date` currently uses the first publication date found
|
|
155
150
|
in the XML. If downstream consumers need to distinguish date types such as
|
|
@@ -176,7 +171,7 @@ Each resolved article version has a directory `<cache_root>/<PMCid.N>/` containi
|
|
|
176
171
|
|
|
177
172
|
- **`<PMCid.N>.json`** — cached metadata (from S3 `metadata/<PMCid.N>.json`), written after a successful read.
|
|
178
173
|
- **`.pmc-object-keys.json`** — JSON array of S3 object keys under that article’s prefix, written after `list_objects_v2` (or read on cache hit). If this file is missing or not a list of strings, listing or fetch may refetch from S3 or raise `ValueError` for an invalid manifest.
|
|
179
|
-
- **`.pmc-extracted-article.json`** — full extracted JSON produced from the cached XML by `pmc-toolkit
|
|
174
|
+
- **`.pmc-extracted-article.json`** — full extracted JSON produced from the cached XML by `pmc-toolkit parse`; reused by later parses for the same article version.
|
|
180
175
|
|
|
181
176
|
**Cache root selection:** `pmc-toolkit metadata` and `pmc-toolkit files` (and the matching `storage_api` functions) always use the default OS user cache from [`platformdirs`](https://github.com/tox-dev/platformdirs). Only `pmc-toolkit fetch` and `fetch_files(..., cache_dir=...)` accept `--cache-dir` or the `PMC_TOOLKIT_CACHE` environment variable.
|
|
182
177
|
|
|
@@ -102,22 +102,17 @@ uv run pmc-toolkit fetch PMC11370360.1 --cache-dir ./data
|
|
|
102
102
|
PMC_TOOLKIT_CACHE=./data uv run pmc-toolkit fetch PMC11370360.1
|
|
103
103
|
```
|
|
104
104
|
|
|
105
|
-
|
|
106
|
-
the XML is not already in the cache. The first
|
|
105
|
+
Parse a cached XML file into extracted JSON. Run `fetch --ext xml` first if
|
|
106
|
+
the XML is not already in the cache. The first parse reads XML once,
|
|
107
107
|
writes `<cache-root>/<PMCid.N>/.pmc-extracted-article.json`, and prints the
|
|
108
|
-
extracted JSON; later
|
|
108
|
+
extracted JSON; later parses for the same article version read that JSON
|
|
109
109
|
cache unless `--force` is passed.
|
|
110
110
|
|
|
111
111
|
```bash
|
|
112
112
|
uv run pmc-toolkit fetch PMC11370360.1 --ext xml
|
|
113
|
-
uv run pmc-toolkit
|
|
113
|
+
uv run pmc-toolkit parse PMC11370360.1
|
|
114
114
|
```
|
|
115
115
|
|
|
116
|
-
List the extracted JSON top-level keys:
|
|
117
|
-
|
|
118
|
-
```bash
|
|
119
|
-
uv run pmc-toolkit convert-xml --list-keys PMC11370360.1
|
|
120
|
-
```
|
|
121
116
|
|
|
122
117
|
`article_info.publication_date` currently uses the first publication date found
|
|
123
118
|
in the XML. If downstream consumers need to distinguish date types such as
|
|
@@ -144,7 +139,7 @@ Each resolved article version has a directory `<cache_root>/<PMCid.N>/` containi
|
|
|
144
139
|
|
|
145
140
|
- **`<PMCid.N>.json`** — cached metadata (from S3 `metadata/<PMCid.N>.json`), written after a successful read.
|
|
146
141
|
- **`.pmc-object-keys.json`** — JSON array of S3 object keys under that article’s prefix, written after `list_objects_v2` (or read on cache hit). If this file is missing or not a list of strings, listing or fetch may refetch from S3 or raise `ValueError` for an invalid manifest.
|
|
147
|
-
- **`.pmc-extracted-article.json`** — full extracted JSON produced from the cached XML by `pmc-toolkit
|
|
142
|
+
- **`.pmc-extracted-article.json`** — full extracted JSON produced from the cached XML by `pmc-toolkit parse`; reused by later parses for the same article version.
|
|
148
143
|
|
|
149
144
|
**Cache root selection:** `pmc-toolkit metadata` and `pmc-toolkit files` (and the matching `storage_api` functions) always use the default OS user cache from [`platformdirs`](https://github.com/tox-dev/platformdirs). Only `pmc-toolkit fetch` and `fetch_files(..., cache_dir=...)` accept `--cache-dir` or the `PMC_TOOLKIT_CACHE` environment variable.
|
|
150
145
|
|
|
@@ -32,6 +32,10 @@ deployment if the environment requires it. Smoke test:
|
|
|
32
32
|
uv run --with "pmc-toolkit==${version}" --no-project -- pmc-toolkit --help
|
|
33
33
|
```
|
|
34
34
|
|
|
35
|
+
From **v0.2.0**, the PyPI wheel exposes only the `pmc-toolkit` console script
|
|
36
|
+
(the previous `pmc` script was removed so the binary matches the distribution
|
|
37
|
+
name).
|
|
38
|
+
|
|
35
39
|
Optionally draft a GitHub Release from the tag for user-facing notes.
|
|
36
40
|
|
|
37
41
|
## Troubleshooting
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: pmc-toolkit
|
|
3
|
+
description: Retrieve PubMed Central (PMC) Open Access article data from a PMCID or versioned PMCID, including version discovery, metadata, S3 file listings, downloads, cached XML parsing, structured article JSON, references, figures, tables, affiliations, author notes, funding, acknowledgements, data availability, competing interests, supplementary media, related articles, or custom metadata. Covers PMC Toolkit CLI and Python API workflows. Not for PubMed-only PMID lookup or general literature search without PMC OA article retrieval.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# PMC Toolkit
|
|
7
|
+
|
|
8
|
+
Use this skill to retrieve, download, and parse PMC Open Access article data with PMC Toolkit. Select commands by the data needed to complete the task, not by surface wording in the request.
|
|
9
|
+
|
|
10
|
+
## Operating Rules
|
|
11
|
+
|
|
12
|
+
- Run published-tool commands as `uvx pmc-toolkit ...`.
|
|
13
|
+
- Do not add installation guidance. If `uv` or `uvx` is unavailable, report that PMC Toolkit needs it and stop.
|
|
14
|
+
- Live lookups, listings, and downloads require network access to the PMC Open Access S3 dataset unless the needed data is already cached.
|
|
15
|
+
- Prefer targeted XML, metadata, and media retrieval over whole-PDF parsing. Use PDFs only when the user asks for them or when they need to be downloaded for other reasons.
|
|
16
|
+
|
|
17
|
+
## Quick overview
|
|
18
|
+
|
|
19
|
+
1. Treat a base PMCID such as `<PMCID>` as the normal input. Convert it to an explicit versioned PMCID once, then reuse `<PMCID.N>` for downstream commands.
|
|
20
|
+
2. `versions <PMCID>` lists every published `<PMCID.N>`. Use it to enumerate versions or pin a concrete version for scripts (for example latest via `jq -r '.versions[-1]'`). Details: [references/cli-versions.md](references/cli-versions.md).
|
|
21
|
+
3. `metadata <PMCID>` or `metadata <PMCID.N>` returns bibliographic fields, OA flags, and S3 URL fields (`xml_url`, `pdf_url`, and so on). It is **not** the primary command for **resolving which `<PMCID.N>` exists or picking a version**. Details: [references/cli-metadata.md](references/cli-metadata.md).
|
|
22
|
+
4. `files <PMCID.N>` returns the complete S3 object-key inventory for that article version. It is the discovery step for available XML, PDFs, figures, media, and supplements.
|
|
23
|
+
5. `fetch <PMCID.N>` downloads all or filtered S3 objects into the local article cache. Add `--ext` for specific file types, `--cache-dir` for a task-specific cache, and `--force` to refresh cached files.
|
|
24
|
+
6. `parse <PMCID.N>` transforms cached full-text XML into normalized article JSON with top-level keys `article_info`, `content`, `references`, `figures`, `tables`, and `supporting_info`. Use it after XML is present in the selected cache; add `--force` to rebuild the extracted JSON cache. Details: [references/cli-parse.md](references/cli-parse.md).
|
|
25
|
+
|
|
26
|
+
## Context-first retrieval
|
|
27
|
+
|
|
28
|
+
Choose the smallest useful retrieval path for the question:
|
|
29
|
+
|
|
30
|
+
- Metadata, DOI, license, version, OA status, or file availability: run `versions` if a pinned version matters, then `metadata` or `files`.
|
|
31
|
+
- Abstract, title, content sections, funding grants, authors, affiliations, references, figure captions, tables, or supporting statements: use `parse` to get the article JSON.
|
|
32
|
+
- Figure questions: inspect `figures[]` from extracted JSON first; fetch only the referenced image extensions when the visual itself is needed.
|
|
33
|
+
- Table questions: inspect `tables[]`; if it is empty, report that no structured XML tables were found rather than falling back to PDF parsing automatically.
|
|
34
|
+
- Supplement, data availability, acknowledgements, competing interests, author contribution, or correspondence questions: inspect `supporting_info`.
|
|
35
|
+
- Citation and evidence grounding: use link-aware paragraph fields (`source_id`, `reference_ids`, `figure_ids`, and `table_ids`) to retrieve only linked references, figures, or tables needed for the answer.
|
|
36
|
+
|
|
37
|
+
## Additional resources
|
|
38
|
+
|
|
39
|
+
- [references/cli-versions.md](references/cli-versions.md) — Examples for `versions`, including selecting the latest or a specific `<PMCID.N>`.
|
|
40
|
+
- [references/cli-metadata.md](references/cli-metadata.md) — Examples and field overview for `metadata`.
|
|
41
|
+
- [references/cli-parse.md](references/cli-parse.md) — Examples and field overview for `parse` output.
|
|
42
|
+
|
|
43
|
+
## Gotchas
|
|
44
|
+
|
|
45
|
+
- `versions` rejects versioned IDs; pass only a base PMCID.
|
|
46
|
+
- `metadata`, `files`, `fetch`, and `parse` accept base or versioned IDs. Passing a base ID makes those commands resolve the latest version at run time. Prefer `versions <PMCID>` (then reuse the chosen `<PMCID.N>`) when the work needs an explicit pinned version rather than repeating implicit latest-resolution on each command.
|
|
47
|
+
- `files` has no extension filter. Use `fetch --ext` for filtered downloads.
|
|
48
|
+
- `parse` needs `<cache>/<PMCID.N>/<PMCID.N>.xml`; run `fetch <PMCID.N> --ext xml` first when the XML is absent.
|
|
49
|
+
- `metadata` and `files` use the default OS user cache. `fetch` and `parse` can use `--cache-dir` or `PMC_TOOLKIT_CACHE`.
|
|
50
|
+
- Cache paths are per article version. Keep the same cache root across `fetch` and `parse`.
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# CLI: `metadata`
|
|
2
|
+
|
|
3
|
+
Use `metadata <PMCID>` or `metadata <PMCID.N>` to fetch bibliographic fields, Open Access flags, and S3 URL fields (for example `xml_url`, `pdf_url`, `media_urls`, `text_url`), plus `pmid` and `doi`.
|
|
4
|
+
|
|
5
|
+
**Version resolution:** When the task is to discover which `<PMCID.N>` exist or to **choose** a versioned PMCID, use `versions` first (examples and jq patterns live in `references/cli-versions.md`, linked from the main SKILL), not `metadata`. `metadata` does include a `version` number and versioned URLs for the resolved record, but it is not the right command for enumerating or picking versions.
|
|
6
|
+
|
|
7
|
+
Example:
|
|
8
|
+
```bash
|
|
9
|
+
uvx pmc-toolkit metadata PMCxxxx.N
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
Example output:
|
|
13
|
+
|
|
14
|
+
```json
|
|
15
|
+
{
|
|
16
|
+
"pmcid": "PMCxxxx",
|
|
17
|
+
"version": N,
|
|
18
|
+
"pmid": 12345678,
|
|
19
|
+
"doi": "10.1234/example.doi",
|
|
20
|
+
"mid": null,
|
|
21
|
+
"title": "Example article title",
|
|
22
|
+
"citation": "Journal Name",
|
|
23
|
+
"is_pmc_openaccess": true/false,
|
|
24
|
+
"is_manuscript": true/false,
|
|
25
|
+
"is_historical_ocr": true/false,
|
|
26
|
+
"is_retracted": true/false,
|
|
27
|
+
"license_code": "license code",
|
|
28
|
+
"xml_url": "s3://pmc-oa-opendata/PMCxxxx.N/PMCxxxx.N.xml?md5=<hex>",
|
|
29
|
+
"pdf_url": "s3://pmc-oa-opendata/PMCxxxx.N/PMCxxxx.N.pdf?md5=<hex>",
|
|
30
|
+
"media_urls": [
|
|
31
|
+
"s3://pmc-oa-opendata/PMCxxxx.N/media-1.jpg?md5=<hex>",
|
|
32
|
+
...
|
|
33
|
+
],
|
|
34
|
+
"text_url": "s3://pmc-oa-opendata/PMCxxxx.N/PMCxxxx.N.txt?md5=<hex>"
|
|
35
|
+
}
|
|
36
|
+
```
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
# CLI: `parse`
|
|
2
|
+
|
|
3
|
+
Use `parse <PMCID.N>` after cached full-text XML exists. Run `fetch <PMCID.N> --ext xml` first when `<cache>/<PMCID.N>/<PMCID.N>.xml` is missing. The first run parses XML once, writes `<cache-root>/<PMCID.N>/.pmc-extracted-article.json`, and prints the extracted JSON; later runs reuse that cache unless `--force` is passed.
|
|
4
|
+
|
|
5
|
+
Add `--cache-dir` or `PMC_TOOLKIT_CACHE` when the XML was fetched outside the default OS user cache. Keep the same cache root across `fetch` and `parse`.
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
uvx pmc-toolkit fetch PMCxxxx.N --ext xml
|
|
9
|
+
uvx pmc-toolkit parse PMCxxxx.N
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
## Extracted JSON top-level keys
|
|
13
|
+
|
|
14
|
+
- **article_info** — `journal`, `article_ids`, `title`, `publication_date`, `article_type`, `license`, `keywords`, `authors[]`, `abstract`, `funding_grants[]`
|
|
15
|
+
- **content** — `paragraphs[]` and `sections[]`; items include `source_id`, `section_id`, `title`, `text`, `reference_ids`, `figure_ids`, and `table_ids`
|
|
16
|
+
- **references** — `references[]` with `source_id`, `label`, `text`, `publication_type`, `identifiers`, `article_title`, `source`, `year`, `volume`, `issue`, and `pages`
|
|
17
|
+
- **figures** — `figures[]` with `source_id`, `label`, `caption`, and `graphics`
|
|
18
|
+
- **tables** — `tables[]` with `source_id`, `label`, `caption`, `rows`, and `footnotes`
|
|
19
|
+
- **supporting_info** — `acknowledgements`, `competing_interests`, `data_availability`, `supplementary_media`, `author_notes`, `related_articles`, and `custom_metadata`
|
|
20
|
+
|
|
21
|
+
## Narrow retrieval with jq
|
|
22
|
+
|
|
23
|
+
**Start here.** Full `parse` output is large. Pipe it through jq and load only the slice you need.
|
|
24
|
+
|
|
25
|
+
### Content outline (default first step)
|
|
26
|
+
|
|
27
|
+
`scripts/content-outline.jq` returns a nested section tree: article title plus `section_id` and `title` for each section.
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
uvx pmc-toolkit parse PMCxxxx.N | jq -f scripts/content-outline.jq
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
Example output:
|
|
34
|
+
|
|
35
|
+
```json
|
|
36
|
+
{
|
|
37
|
+
"title": "article title",
|
|
38
|
+
"sections": [
|
|
39
|
+
{
|
|
40
|
+
"section_id": "S1",
|
|
41
|
+
"title": "section title"
|
|
42
|
+
},
|
|
43
|
+
{
|
|
44
|
+
"section_id": "S2",
|
|
45
|
+
"title": "section title",
|
|
46
|
+
"sections": [
|
|
47
|
+
{
|
|
48
|
+
"section_id": "S3",
|
|
49
|
+
"title": "sub-section title"
|
|
50
|
+
}
|
|
51
|
+
]
|
|
52
|
+
}
|
|
53
|
+
]
|
|
54
|
+
}
|
|
55
|
+
```
|
|
56
|
+
Use this to pick relevant sections (based on their titles) before loading detailed information.
|
|
57
|
+
The `section_id` values are XML source IDs (`S1`, `S2`, …) — use them with `scripts/query-id.jq` to fetch detailed section data.
|
|
58
|
+
|
|
59
|
+
### Drill down by ID
|
|
60
|
+
|
|
61
|
+
`scripts/query-id.jq` returns the first object whose `source_id` matches. After the content outline, pass a chosen ID:
|
|
62
|
+
|
|
63
|
+
| Prefix | Meaning | Example |
|
|
64
|
+
| --- | --- | --- |
|
|
65
|
+
| `S*` | Section | `S3` |
|
|
66
|
+
| `P*` | Paragraph | `P9` |
|
|
67
|
+
| `F*` | Figure | `F1` |
|
|
68
|
+
| `R*` | Reference | `R1` |
|
|
69
|
+
| `T*` | Table | `T1` |
|
|
70
|
+
|
|
71
|
+
**Section** — paragraph text and xref links for that section:
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
uvx pmc-toolkit parse PMCxxxx.N | jq --arg id "S3" -f scripts/query-id.jq
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Example output:
|
|
78
|
+
|
|
79
|
+
```json
|
|
80
|
+
{
|
|
81
|
+
"source_id": "S3",
|
|
82
|
+
"section_id": "2.1",
|
|
83
|
+
"title": "sub-section title",
|
|
84
|
+
"paragraphs": [
|
|
85
|
+
{
|
|
86
|
+
"source_id": "P9",
|
|
87
|
+
"text": "paragraph text",
|
|
88
|
+
"reference_ids": ["R1", "R18"],
|
|
89
|
+
"figure_ids": ["F1", "F5"],
|
|
90
|
+
"table_ids": ["T1"]
|
|
91
|
+
}
|
|
92
|
+
],
|
|
93
|
+
"sections": []
|
|
94
|
+
}
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
Some sections are containers only. In the outline, `S2` (Results) has child sections but no paragraphs of its own — the text lives in `S3`, `S4`, etc.
|
|
98
|
+
Query those leaf `S*` IDs (sections with no nested `sections` in the outline), not the parent, to load only the subsection you need.
|
|
99
|
+
|
|
100
|
+
**Figure, table, or reference** — same script, different ID prefix:
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
uvx pmc-toolkit parse PMCxxxx.N | jq --arg id "F1" -f scripts/query-id.jq
|
|
104
|
+
uvx pmc-toolkit parse PMCxxxx.N | jq --arg id "R1" -f scripts/query-id.jq
|
|
105
|
+
uvx pmc-toolkit parse PMCxxxx.N | jq --arg id "T1" -f scripts/query-id.jq
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
Use paragraph `reference_ids`, `figure_ids`, and `table_ids` to fetch linked entries with `scripts/query-id.jq`. Output shapes:
|
|
109
|
+
|
|
110
|
+
- [cli-parse-references.md](cli-parse-references.md) — `R*` lookup
|
|
111
|
+
- [cli-parse-figures.md](cli-parse-figures.md) — `F*` lookup
|
|
112
|
+
- [cli-parse-tables.md](cli-parse-tables.md) — `T*` lookup
|
|
113
|
+
|
|
114
|
+
### Reverse lookup by xref
|
|
115
|
+
|
|
116
|
+
`query-id.jq` resolves an ID to its object. `reverse-lookup-xref.jq` finds every paragraph that cites a given reference, figure, or table. Pass `--arg xref` as `references`, `figures`, or `tables`:
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
uvx pmc-toolkit parse PMCxxxx.N | jq --arg xref references --arg id "R1" -f scripts/reverse-lookup-xref.jq
|
|
120
|
+
uvx pmc-toolkit parse PMCxxxx.N | jq --arg xref figures --arg id "F1" -f scripts/reverse-lookup-xref.jq
|
|
121
|
+
uvx pmc-toolkit parse PMCxxxx.N | jq --arg xref tables --arg id "T1" -f scripts/reverse-lookup-xref.jq
|
|
122
|
+
```
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# CLI: `versions`
|
|
2
|
+
|
|
3
|
+
Use `versions <PMCID>` to list every published versioned PMCID string (`PMCxxxx.1`, `PMCxxxx.2`, …) for a **base** PMCID only. `versions` rejects versioned IDs.
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
uvx pmc-toolkit versions PMCxxxx
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
Example output shape:
|
|
10
|
+
|
|
11
|
+
```json
|
|
12
|
+
{
|
|
13
|
+
"pmcid": "PMCxxxx",
|
|
14
|
+
"versions": [
|
|
15
|
+
"PMCxxxx.1",
|
|
16
|
+
"PMCxxxx.2"
|
|
17
|
+
]
|
|
18
|
+
}
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Pick the latest `<PMCID.N>`
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
uvx pmc-toolkit versions PMCxxxx | jq -r '.versions[-1]'
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Pick a non-latest version
|
|
28
|
+
|
|
29
|
+
Select an element of `.versions` by index (for example `.versions[0]` for the first published version).
|
|
30
|
+
|
|
31
|
+
## Next steps
|
|
32
|
+
|
|
33
|
+
After you have `<PMCID.N>`, continue with `metadata`, `files`, `fetch`, and `parse` as described in the main skill.
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
def drop_empty($o):
|
|
2
|
+
$o | with_entries(select(.value | if type == "array" then length > 0 else . != null end));
|
|
3
|
+
|
|
4
|
+
def section:
|
|
5
|
+
drop_empty({
|
|
6
|
+
section_id: .source_id,
|
|
7
|
+
title: .title,
|
|
8
|
+
sections: [.sections[]? | section]
|
|
9
|
+
});
|
|
10
|
+
|
|
11
|
+
{
|
|
12
|
+
title: .article_info.title,
|
|
13
|
+
sections: [.content.sections[]? | section]
|
|
14
|
+
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
{
|
|
2
|
+
references: "reference_ids",
|
|
3
|
+
figures: "figure_ids",
|
|
4
|
+
tables: "table_ids"
|
|
5
|
+
}[$xref] as $field
|
|
6
|
+
| if $field == null then
|
|
7
|
+
error("xref must be references, figures, or tables")
|
|
8
|
+
else
|
|
9
|
+
[ .. | objects
|
|
10
|
+
| select(.[$field]? | index($id))
|
|
11
|
+
| {source_id, text, ($field): .[$field] }
|
|
12
|
+
]
|
|
13
|
+
end
|
|
@@ -140,8 +140,8 @@ def fetch(
|
|
|
140
140
|
_emit_json(result.model_dump(mode="json"))
|
|
141
141
|
|
|
142
142
|
|
|
143
|
-
@app.command("
|
|
144
|
-
def
|
|
143
|
+
@app.command("parse")
|
|
144
|
+
def parse(
|
|
145
145
|
requested_pmcid: str = typer.Argument(
|
|
146
146
|
...,
|
|
147
147
|
help="PMC accession ID or version ID, e.g. PMC11370360 or PMC11370360.1",
|
|
@@ -158,22 +158,10 @@ def convert_xml(
|
|
|
158
158
|
"-f",
|
|
159
159
|
help="Recreate the extracted JSON cache from the cached XML.",
|
|
160
160
|
),
|
|
161
|
-
list_keys: bool = typer.Option(
|
|
162
|
-
False,
|
|
163
|
-
"--list-keys",
|
|
164
|
-
help="Print available extracted JSON keys and descriptions, then exit.",
|
|
165
|
-
),
|
|
166
161
|
) -> None:
|
|
167
162
|
"""
|
|
168
|
-
|
|
163
|
+
Parse cached PMC full-text XML into cached extracted JSON.
|
|
169
164
|
"""
|
|
170
|
-
if list_keys:
|
|
171
|
-
from pmc_toolkit.xml_parse_utils import EXTRACT_OUTPUT_KEY_DESCRIPTIONS
|
|
172
|
-
|
|
173
|
-
typer.echo("Available extracted JSON keys:")
|
|
174
|
-
for key, description in EXTRACT_OUTPUT_KEY_DESCRIPTIONS.items():
|
|
175
|
-
typer.echo(f"- {key}: {description}")
|
|
176
|
-
return
|
|
177
165
|
|
|
178
166
|
def build_result():
|
|
179
167
|
from pmc_toolkit.xml_parse_api import ensure_extracted_article
|
|
@@ -13,26 +13,6 @@ XMLParser = etree.XMLParser(
|
|
|
13
13
|
remove_blank_text=True,
|
|
14
14
|
)
|
|
15
15
|
REFERENCE_SEPARATOR_PATTERN = re.compile(r"^[\s,;]+$")
|
|
16
|
-
EXTRACT_OUTPUT_KEY_DESCRIPTIONS = {
|
|
17
|
-
"article_info": (
|
|
18
|
-
"article_info.journal, article_ids, title, publication_date, article_type, "
|
|
19
|
-
"license, keywords, authors[], abstract, and funding_grants[]"
|
|
20
|
-
),
|
|
21
|
-
"content": (
|
|
22
|
-
"content.paragraphs[] and content.sections[]; objects include source_id, "
|
|
23
|
-
"section_id, title, text, reference_ids, figure_ids, and table_ids"
|
|
24
|
-
),
|
|
25
|
-
"references": (
|
|
26
|
-
"references[] items with source_id, label, text, publication_type, "
|
|
27
|
-
"identifiers, article_title, source, year, volume, issue, and pages"
|
|
28
|
-
),
|
|
29
|
-
"figures": "figures[] items with source_id, label, caption, and graphics",
|
|
30
|
-
"tables": "tables[] items with source_id, label, caption, rows, and footnotes",
|
|
31
|
-
"supporting_info": (
|
|
32
|
-
"acknowledgements, competing_interests, data_availability, "
|
|
33
|
-
"supplementary_media, author_notes, related_articles, and custom_metadata"
|
|
34
|
-
),
|
|
35
|
-
}
|
|
36
16
|
|
|
37
17
|
|
|
38
18
|
def load_xml(path: Path) -> Any:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|