pmc-toolkit 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/PKG-INFO +6 -11
  2. {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/README.md +5 -10
  3. {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/RELEASING.md +4 -0
  4. {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/pyproject.toml +1 -1
  5. pmc_toolkit-0.3.0/skills/pmc-toolkit/SKILL.md +50 -0
  6. pmc_toolkit-0.3.0/skills/pmc-toolkit/agents/openai.yaml +4 -0
  7. pmc_toolkit-0.3.0/skills/pmc-toolkit/references/cli-metadata.md +36 -0
  8. pmc_toolkit-0.3.0/skills/pmc-toolkit/references/cli-parse-figures.md +3 -0
  9. pmc_toolkit-0.3.0/skills/pmc-toolkit/references/cli-parse-references.md +3 -0
  10. pmc_toolkit-0.3.0/skills/pmc-toolkit/references/cli-parse-tables.md +3 -0
  11. pmc_toolkit-0.3.0/skills/pmc-toolkit/references/cli-parse.md +122 -0
  12. pmc_toolkit-0.3.0/skills/pmc-toolkit/references/cli-versions.md +33 -0
  13. pmc_toolkit-0.3.0/skills/pmc-toolkit/scripts/content-outline.jq +14 -0
  14. pmc_toolkit-0.3.0/skills/pmc-toolkit/scripts/query-id.jq +3 -0
  15. pmc_toolkit-0.3.0/skills/pmc-toolkit/scripts/reverse-lookup-xref.jq +13 -0
  16. {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/src/pmc_toolkit/cli.py +3 -15
  17. {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/src/pmc_toolkit/xml_parse_utils.py +0 -20
  18. {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/uv.lock +1 -1
  19. {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/.github/workflows/ci.yml +0 -0
  20. {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/.github/workflows/release.yml +0 -0
  21. {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/.gitignore +0 -0
  22. {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/.python-version +0 -0
  23. {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/AGENTS.md +0 -0
  24. {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/LICENSE +0 -0
  25. {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/src/pmc_toolkit/__init__.py +0 -0
  26. {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/src/pmc_toolkit/cache.py +0 -0
  27. {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/src/pmc_toolkit/models.py +0 -0
  28. {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/src/pmc_toolkit/storage_api.py +0 -0
  29. {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/src/pmc_toolkit/storage_utils.py +0 -0
  30. {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/src/pmc_toolkit/validators.py +0 -0
  31. {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/src/pmc_toolkit/xml_parse_api.py +0 -0
  32. {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/tests/test_cli.py +0 -0
  33. {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/tests/test_storage.py +0 -0
  34. {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/tests/test_validators.py +0 -0
  35. {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/tests/test_xml_parse_api.py +0 -0
  36. {pmc_toolkit-0.2.0 → pmc_toolkit-0.3.0}/tests/test_xml_parse_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pmc-toolkit
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: Python toolkit and CLI for exploring, downloading, and parsing PMC article data.
5
5
  Project-URL: Homepage, https://github.com/JakaKokosar/pmc-toolkit
6
6
  Project-URL: Repository, https://github.com/JakaKokosar/pmc-toolkit
@@ -134,22 +134,17 @@ uv run pmc-toolkit fetch PMC11370360.1 --cache-dir ./data
134
134
  PMC_TOOLKIT_CACHE=./data uv run pmc-toolkit fetch PMC11370360.1
135
135
  ```
136
136
 
137
- Convert a cached XML file into extracted JSON. Run `fetch --ext xml` first if
138
- the XML is not already in the cache. The first conversion parses XML once,
137
+ Parse a cached XML file into extracted JSON. Run `fetch --ext xml` first if
138
+ the XML is not already in the cache. The first parse reads XML once,
139
139
  writes `<cache-root>/<PMCid.N>/.pmc-extracted-article.json`, and prints the
140
- extracted JSON; later conversions for the same article version read that JSON
140
+ extracted JSON; later parses for the same article version read that JSON
141
141
  cache unless `--force` is passed.
142
142
 
143
143
  ```bash
144
144
  uv run pmc-toolkit fetch PMC11370360.1 --ext xml
145
- uv run pmc-toolkit convert-xml PMC11370360.1
145
+ uv run pmc-toolkit parse PMC11370360.1
146
146
  ```
147
147
 
148
- List the extracted JSON top-level keys:
149
-
150
- ```bash
151
- uv run pmc-toolkit convert-xml --list-keys PMC11370360.1
152
- ```
153
148
 
154
149
  `article_info.publication_date` currently uses the first publication date found
155
150
  in the XML. If downstream consumers need to distinguish date types such as
@@ -176,7 +171,7 @@ Each resolved article version has a directory `<cache_root>/<PMCid.N>/` containi
176
171
 
177
172
  - **`<PMCid.N>.json`** — cached metadata (from S3 `metadata/<PMCid.N>.json`), written after a successful read.
178
173
  - **`.pmc-object-keys.json`** — JSON array of S3 object keys under that article’s prefix, written after `list_objects_v2` (or read on cache hit). If this file is missing or not a list of strings, listing or fetch may refetch from S3 or raise `ValueError` for an invalid manifest.
179
- - **`.pmc-extracted-article.json`** — full extracted JSON produced from the cached XML by `pmc-toolkit convert-xml`; reused by later conversions for the same article version.
174
+ - **`.pmc-extracted-article.json`** — full extracted JSON produced from the cached XML by `pmc-toolkit parse`; reused by later parses for the same article version.
180
175
 
181
176
  **Cache root selection:** `pmc-toolkit metadata` and `pmc-toolkit files` (and the matching `storage_api` functions) always use the default OS user cache from [`platformdirs`](https://github.com/tox-dev/platformdirs). Only `pmc-toolkit fetch` and `fetch_files(..., cache_dir=...)` accept `--cache-dir` or the `PMC_TOOLKIT_CACHE` environment variable.
182
177
 
@@ -102,22 +102,17 @@ uv run pmc-toolkit fetch PMC11370360.1 --cache-dir ./data
102
102
  PMC_TOOLKIT_CACHE=./data uv run pmc-toolkit fetch PMC11370360.1
103
103
  ```
104
104
 
105
- Convert a cached XML file into extracted JSON. Run `fetch --ext xml` first if
106
- the XML is not already in the cache. The first conversion parses XML once,
105
+ Parse a cached XML file into extracted JSON. Run `fetch --ext xml` first if
106
+ the XML is not already in the cache. The first parse reads XML once,
107
107
  writes `<cache-root>/<PMCid.N>/.pmc-extracted-article.json`, and prints the
108
- extracted JSON; later conversions for the same article version read that JSON
108
+ extracted JSON; later parses for the same article version read that JSON
109
109
  cache unless `--force` is passed.
110
110
 
111
111
  ```bash
112
112
  uv run pmc-toolkit fetch PMC11370360.1 --ext xml
113
- uv run pmc-toolkit convert-xml PMC11370360.1
113
+ uv run pmc-toolkit parse PMC11370360.1
114
114
  ```
115
115
 
116
- List the extracted JSON top-level keys:
117
-
118
- ```bash
119
- uv run pmc-toolkit convert-xml --list-keys PMC11370360.1
120
- ```
121
116
 
122
117
  `article_info.publication_date` currently uses the first publication date found
123
118
  in the XML. If downstream consumers need to distinguish date types such as
@@ -144,7 +139,7 @@ Each resolved article version has a directory `<cache_root>/<PMCid.N>/` containi
144
139
 
145
140
  - **`<PMCid.N>.json`** — cached metadata (from S3 `metadata/<PMCid.N>.json`), written after a successful read.
146
141
  - **`.pmc-object-keys.json`** — JSON array of S3 object keys under that article’s prefix, written after `list_objects_v2` (or read on cache hit). If this file is missing or not a list of strings, listing or fetch may refetch from S3 or raise `ValueError` for an invalid manifest.
147
- - **`.pmc-extracted-article.json`** — full extracted JSON produced from the cached XML by `pmc-toolkit convert-xml`; reused by later conversions for the same article version.
142
+ - **`.pmc-extracted-article.json`** — full extracted JSON produced from the cached XML by `pmc-toolkit parse`; reused by later parses for the same article version.
148
143
 
149
144
  **Cache root selection:** `pmc-toolkit metadata` and `pmc-toolkit files` (and the matching `storage_api` functions) always use the default OS user cache from [`platformdirs`](https://github.com/tox-dev/platformdirs). Only `pmc-toolkit fetch` and `fetch_files(..., cache_dir=...)` accept `--cache-dir` or the `PMC_TOOLKIT_CACHE` environment variable.
150
145
 
@@ -32,6 +32,10 @@ deployment if the environment requires it. Smoke test:
32
32
  uv run --with "pmc-toolkit==${version}" --no-project -- pmc-toolkit --help
33
33
  ```
34
34
 
35
+ From **v0.2.0**, the PyPI wheel exposes only the `pmc-toolkit` console script
36
+ (the previous `pmc` script was removed so the binary matches the distribution
37
+ name).
38
+
35
39
  Optionally draft a GitHub Release from the tag for user-facing notes.
36
40
 
37
41
  ## Troubleshooting
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "pmc-toolkit"
7
- version = "0.2.0"
7
+ version = "0.3.0"
8
8
  description = "Python toolkit and CLI for exploring, downloading, and parsing PMC article data."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.11"
@@ -0,0 +1,50 @@
1
+ ---
2
+ name: pmc-toolkit
3
+ description: Retrieve PubMed Central (PMC) Open Access article data from a PMCID or versioned PMCID, including version discovery, metadata, S3 file listings, downloads, cached XML parsing, structured article JSON, references, figures, tables, affiliations, author notes, funding, acknowledgements, data availability, competing interests, supplementary media, related articles, or custom metadata. Covers PMC Toolkit CLI and Python API workflows. Not for PubMed-only PMID lookup or general literature search without PMC OA article retrieval.
4
+ ---
5
+
6
+ # PMC Toolkit
7
+
8
+ Use this skill to retrieve, download, and parse PMC Open Access article data with PMC Toolkit. Select commands by the data needed to complete the task, not by surface wording in the request.
9
+
10
+ ## Operating Rules
11
+
12
+ - Run published-tool commands as `uvx pmc-toolkit ...`.
13
+ - Do not add installation guidance. If `uv` or `uvx` is unavailable, report that PMC Toolkit needs it and stop.
14
+ - Live lookups, listings, and downloads require network access to the PMC Open Access S3 dataset unless the needed data is already cached.
15
+ - Prefer targeted XML, metadata, and media retrieval over whole-PDF parsing. Use PDFs only when the user asks for them or when they need to be downloaded for other reasons.
16
+
17
+ ## Quick overview
18
+
19
+ 1. Treat a base PMCID such as `<PMCID>` as the normal input. Convert it to an explicit versioned PMCID once, then reuse `<PMCID.N>` for downstream commands.
20
+ 2. `versions <PMCID>` lists every published `<PMCID.N>`. Use it to enumerate versions or pin a concrete version for scripts (for example latest via `jq -r '.versions[-1]'`). Details: [references/cli-versions.md](references/cli-versions.md).
21
+ 3. `metadata <PMCID>` or `metadata <PMCID.N>` returns bibliographic fields, OA flags, and S3 URL fields (`xml_url`, `pdf_url`, and so on). It is **not** the primary command for **resolving which `<PMCID.N>` exists or picking a version**. Details: [references/cli-metadata.md](references/cli-metadata.md).
22
+ 4. `files <PMCID.N>` returns the complete S3 object-key inventory for that article version. It is the discovery step for available XML, PDFs, figures, media, and supplements.
23
+ 5. `fetch <PMCID.N>` downloads all or filtered S3 objects into the local article cache. Add `--ext` for specific file types, `--cache-dir` for a task-specific cache, and `--force` to refresh cached files.
24
+ 6. `parse <PMCID.N>` transforms cached full-text XML into normalized article JSON with top-level keys `article_info`, `content`, `references`, `figures`, `tables`, and `supporting_info`. Use it after XML is present in the selected cache; add `--force` to rebuild the extracted JSON cache. Details: [references/cli-parse.md](references/cli-parse.md).
25
+
26
+ ## Context-first retrieval
27
+
28
+ Choose the smallest useful retrieval path for the question:
29
+
30
+ - Metadata, DOI, license, version, OA status, or file availability: run `versions` if a pinned version matters, then `metadata` or `files`.
31
+ - For the abstract, title, content sections, funding grants, authors, affiliations, references, figure captions, tables, or supporting statements, use `parse` to get the article JSON:
32
+ - Figure questions: inspect `figures[]` from extracted JSON first; fetch only the referenced image extensions when the visual itself is needed.
33
+ - Table questions: inspect `tables[]`; if it is empty, report that no structured XML tables were found rather than falling back to PDF parsing automatically.
34
+ - Supplement, data availability, acknowledgements, competing interests, author contribution, or correspondence questions: inspect `supporting_info`.
35
+ - Citation and evidence grounding: use link-aware paragraph fields (`source_id`, `reference_ids`, `figure_ids`, and `table_ids`) to retrieve only linked references, figures, or tables needed for the answer.
36
+
37
+ ## Additional resources
38
+
39
+ - [references/cli-versions.md](references/cli-versions.md) — Examples for `versions`, including selecting the latest or a specific `<PMCID.N>`.
40
+ - [references/cli-metadata.md](references/cli-metadata.md) — Examples and field overview for `metadata`.
41
+ - [references/cli-parse.md](references/cli-parse.md) — Examples and field overview for `parse` output.
42
+
43
+ ## Gotchas
44
+
45
+ - `versions` rejects versioned IDs; pass only a base PMCID.
46
+ - `metadata`, `files`, `fetch`, and `parse` accept base or versioned IDs. Passing a base ID makes those commands resolve the latest version at run time. Prefer `versions <PMCID>` (then reuse the chosen `<PMCID.N>`) when the work needs an explicit pinned version rather than repeating implicit latest-resolution on each command.
47
+ - `files` has no extension filter. Use `fetch --ext` for filtered downloads.
48
+ - `parse` needs `<cache>/<PMCID.N>/<PMCID.N>.xml`; run `fetch <PMCID.N> --ext xml` first when the XML is absent.
49
+ - `metadata` and `files` use the default OS user cache. `fetch` and `parse` can use `--cache-dir` or `PMC_TOOLKIT_CACHE`.
50
+ - Cache paths are per article version. Keep the same cache root across `fetch` and `parse`.
@@ -0,0 +1,4 @@
1
+ interface:
2
+ display_name: "PMC Toolkit"
3
+ short_description: "Targeted PMC OA paper retrieval"
4
+ default_prompt: "Use $pmc-toolkit to retrieve only the PMC paper metadata, text sections, figures, tables, or supporting info needed for this question."
@@ -0,0 +1,36 @@
1
+ # CLI: `metadata`
2
+
3
+ Use `metadata <PMCID>` or `metadata <PMCID.N>` to fetch bibliographic fields, Open Access flags, and S3 URL fields (for example `xml_url`, `pdf_url`, `media_urls`, `text_url`), plus `pmid` and `doi`.
4
+
5
+ **Version resolution:** When the task is to discover which `<PMCID.N>` exist or to **choose** a versioned PMCID, use `versions` first (examples and jq patterns live in `references/cli-versions.md`, linked from the main SKILL), not `metadata`. `metadata` does include a `version` number and versioned URLs for the resolved record, but it is not the right command for enumerating or picking versions.
6
+
7
+ Example:
8
+ ```bash
9
+ uvx pmc-toolkit metadata PMCxxxx.N
10
+ ```
11
+
12
+ Example output:
13
+
14
+ ```json
15
+ {
16
+ "pmcid": "PMCxxxx",
17
+ "version": N,
18
+ "pmid": 12345678,
19
+ "doi": "10.1234/example.doi",
20
+ "mid": null,
21
+ "title": "Example article title",
22
+ "citation": "Journal Name",
23
+ "is_pmc_openaccess": true/false,
24
+ "is_manuscript": true/false,
25
+ "is_historical_ocr": true/false,
26
+ "is_retracted": true/false,
27
+ "license_code": "license code",
28
+ "xml_url": "s3://pmc-oa-opendata/PMCxxxx.N/PMCxxxx.N.xml?md5=<hex>",
29
+ "pdf_url": "s3://pmc-oa-opendata/PMCxxxx.N/PMCxxxx.N.pdf?md5=<hex>",
30
+ "media_urls": [
31
+ "s3://pmc-oa-opendata/PMCxxxx.N/media-1.jpg?md5=<hex>",
32
+ ...
33
+ ],
34
+ "text_url": "s3://pmc-oa-opendata/PMCxxxx.N/PMCxxxx.N.txt?md5=<hex>"
35
+ }
36
+ ```
@@ -0,0 +1,3 @@
1
+ # parse: figures (`F*`)
2
+
3
+ TODO
@@ -0,0 +1,3 @@
1
+ # parse: references (`R*`)
2
+
3
+ TODO
@@ -0,0 +1,3 @@
1
+ # parse: tables (`T*`)
2
+
3
+ TODO
@@ -0,0 +1,122 @@
1
+ # CLI: `parse`
2
+
3
+ Use `parse <PMCID.N>` after cached full-text XML exists. Run `fetch <PMCID.N> --ext xml` first when `<cache>/<PMCID.N>/<PMCID.N>.xml` is missing. The first run parses XML once, writes `<cache-root>/<PMCID.N>/.pmc-extracted-article.json`, and prints the extracted JSON; later runs reuse that cache unless `--force` is passed.
4
+
5
+ Add `--cache-dir` or `PMC_TOOLKIT_CACHE` when the XML was fetched outside the default OS user cache. Keep the same cache root across `fetch` and `parse`.
6
+
7
+ ```bash
8
+ uvx pmc-toolkit fetch PMCxxxx.N --ext xml
9
+ uvx pmc-toolkit parse PMCxxxx.N
10
+ ```
11
+
12
+ ## Extracted JSON top-level keys
13
+
14
+ - **article_info** — `journal`, `article_ids`, `title`, `publication_date`, `article_type`, `license`, `keywords`, `authors[]`, `abstract`, `funding_grants[]`
15
+ - **content** — `paragraphs[]` and `sections[]`; items include `source_id`, `section_id`, `title`, `text`, `reference_ids`, `figure_ids`, and `table_ids`
16
+ - **references** — `references[]` with `source_id`, `label`, `text`, `publication_type`, `identifiers`, `article_title`, `source`, `year`, `volume`, `issue`, and `pages`
17
+ - **figures** — `figures[]` with `source_id`, `label`, `caption`, and `graphics`
18
+ - **tables** — `tables[]` with `source_id`, `label`, `caption`, `rows`, and `footnotes`
19
+ - **supporting_info** — `acknowledgements`, `competing_interests`, `data_availability`, `supplementary_media`, `author_notes`, `related_articles`, and `custom_metadata`
20
+
21
+ ## Narrow retrieval with jq
22
+
23
+ **Start here.** Full `parse` output is large. Pipe it through jq and load only the slice you need.
24
+
25
+ ### Content outline (default first step)
26
+
27
+ `scripts/content-outline.jq` returns a nested section tree: article title plus `section_id` and `title` for each section.
28
+
29
+ ```bash
30
+ uvx pmc-toolkit parse PMCxxxx.N | jq -f scripts/content-outline.jq
31
+ ```
32
+
33
+ Example output:
34
+
35
+ ```json
36
+ {
37
+ "title": "article title",
38
+ "sections": [
39
+ {
40
+ "section_id": "S1",
41
+ "title": "section title"
42
+ },
43
+ {
44
+ "section_id": "S2",
45
+ "title": "section title",
46
+ "sections": [
47
+ {
48
+ "section_id": "S3",
49
+ "title": "sub-section title"
50
+ }
51
+ ]
52
+ }
53
+ ]
54
+ }
55
+ ```
56
+ Use this to pick relevant sections (based on their titles) before loading detailed information.
57
+ The `section_id` values are XML source IDs (`S1`, `S2`, …) — use them with `scripts/query-id.jq` to fetch detailed section data.
58
+
59
+ ### Drill down by ID
60
+
61
+ `scripts/query-id.jq` returns every object whose `source_id` matches (it does not stop at the first match). After the content outline, pass a chosen ID:
62
+
63
+ | Prefix | Meaning | Example |
64
+ | --- | --- | --- |
65
+ | `S*` | Section | `S3` |
66
+ | `P*` | Paragraph | `P9` |
67
+ | `F*` | Figure | `F1` |
68
+ | `R*` | Reference | `R1` |
69
+ | `T*` | Table | `T1` |
70
+
71
+ **Section** — paragraph text and xref links for that section:
72
+
73
+ ```bash
74
+ uvx pmc-toolkit parse PMCxxxx.N | jq --arg id "S3" -f scripts/query-id.jq
75
+ ```
76
+
77
+ Example output:
78
+
79
+ ```json
80
+ {
81
+ "source_id": "S3",
82
+ "section_id": "2.1",
83
+ "title": "sub-section title",
84
+ "paragraphs": [
85
+ {
86
+ "source_id": "P9",
87
+ "text": "paragraph text",
88
+ "reference_ids": ["R1", "R18"],
89
+ "figure_ids": ["F1", "F5"],
90
+ "table_ids": ["T1"]
91
+ }
92
+ ],
93
+ "sections": []
94
+ }
95
+ ```
96
+
97
+ Some sections are containers only. In the outline, `S2` (Results) has child sections but no paragraphs of its own — the text lives in `S3`, `S4`, etc.
98
+ Query those leaf `S*` IDs (sections with no nested `sections` in the outline), not the parent, to load only the subsection you need.
99
+
100
+ **Figure, table, or reference** — same script, different ID prefix:
101
+
102
+ ```bash
103
+ uvx pmc-toolkit parse PMCxxxx.N | jq --arg id "F1" -f scripts/query-id.jq
104
+ uvx pmc-toolkit parse PMCxxxx.N | jq --arg id "R1" -f scripts/query-id.jq
105
+ uvx pmc-toolkit parse PMCxxxx.N | jq --arg id "T1" -f scripts/query-id.jq
106
+ ```
107
+
108
+ Use paragraph `reference_ids`, `figure_ids`, and `table_ids` to fetch linked entries with `scripts/query-id.jq`. Output shapes:
109
+
110
+ - [cli-parse-references.md](cli-parse-references.md) — `R*` lookup
111
+ - [cli-parse-figures.md](cli-parse-figures.md) — `F*` lookup
112
+ - [cli-parse-tables.md](cli-parse-tables.md) — `T*` lookup
113
+
114
+ ### Reverse lookup by xref
115
+
116
+ `query-id.jq` resolves an ID to its object. `reverse-lookup-xref.jq` finds every paragraph that cites a given reference, figure, or table. Pass `--arg xref` as `references`, `figures`, or `tables`:
117
+
118
+ ```bash
119
+ uvx pmc-toolkit parse PMCxxxx.N | jq --arg xref references --arg id "R1" -f scripts/reverse-lookup-xref.jq
120
+ uvx pmc-toolkit parse PMCxxxx.N | jq --arg xref figures --arg id "F1" -f scripts/reverse-lookup-xref.jq
121
+ uvx pmc-toolkit parse PMCxxxx.N | jq --arg xref tables --arg id "T1" -f scripts/reverse-lookup-xref.jq
122
+ ```
@@ -0,0 +1,33 @@
1
+ # CLI: `versions`
2
+
3
+ Use `versions <PMCID>` to list every published versioned PMCID string (`PMCxxxx.1`, `PMCxxxx.2`, …) for a **base** PMCID only. `versions` rejects versioned IDs.
4
+
5
+ ```bash
6
+ uvx pmc-toolkit versions PMCxxxx
7
+ ```
8
+
9
+ Example output shape:
10
+
11
+ ```json
12
+ {
13
+ "pmcid": "PMCxxxx",
14
+ "versions": [
15
+ "PMCxxxx.1",
16
+ "PMCxxxx.2"
17
+ ]
18
+ }
19
+ ```
20
+
21
+ ## Pick the latest `<PMCID.N>`
22
+
23
+ ```bash
24
+ uvx pmc-toolkit versions PMCxxxx | jq -r '.versions[-1]'
25
+ ```
26
+
27
+ ## Pick a non-latest version
28
+
29
+ Select an element of `.versions` by index (for example `.versions[0]` for the first published version).
30
+
31
+ ## Next steps
32
+
33
+ After you have `<PMCID.N>`, continue with `metadata`, `files`, `fetch`, and `parse` as described in the main skill.
@@ -0,0 +1,14 @@
1
+ def drop_empty($o):
2
+ $o | with_entries(select(.value | if type == "array" then length > 0 else . != null end));
3
+
4
+ def section:
5
+ drop_empty({
6
+ section_id: .source_id,
7
+ title: .title,
8
+ sections: [.sections[]? | section]
9
+ });
10
+
11
+ {
12
+ title: .article_info.title,
13
+ sections: [.content.sections[]? | section]
14
+ }
@@ -0,0 +1,3 @@
1
+ ..
2
+ | objects
3
+ | select(.source_id? == $id)
@@ -0,0 +1,13 @@
1
+ {
2
+ references: "reference_ids",
3
+ figures: "figure_ids",
4
+ tables: "table_ids"
5
+ }[$xref] as $field
6
+ | if $field == null then
7
+ error("xref must be references, figures, or tables")
8
+ else
9
+ [ .. | objects
10
+ | select(.[$field]? | index($id))
11
+ | {source_id, text, ($field): .[$field] }
12
+ ]
13
+ end
@@ -140,8 +140,8 @@ def fetch(
140
140
  _emit_json(result.model_dump(mode="json"))
141
141
 
142
142
 
143
- @app.command("convert-xml")
144
- def convert_xml(
143
+ @app.command("parse")
144
+ def parse(
145
145
  requested_pmcid: str = typer.Argument(
146
146
  ...,
147
147
  help="PMC accession ID or version ID, e.g. PMC11370360 or PMC11370360.1",
@@ -158,22 +158,10 @@ def convert_xml(
158
158
  "-f",
159
159
  help="Recreate the extracted JSON cache from the cached XML.",
160
160
  ),
161
- list_keys: bool = typer.Option(
162
- False,
163
- "--list-keys",
164
- help="Print available extracted JSON keys and descriptions, then exit.",
165
- ),
166
161
  ) -> None:
167
162
  """
168
- Convert cached PMC full-text XML into cached extracted JSON.
163
+ Parse cached PMC full-text XML into cached extracted JSON.
169
164
  """
170
- if list_keys:
171
- from pmc_toolkit.xml_parse_utils import EXTRACT_OUTPUT_KEY_DESCRIPTIONS
172
-
173
- typer.echo("Available extracted JSON keys:")
174
- for key, description in EXTRACT_OUTPUT_KEY_DESCRIPTIONS.items():
175
- typer.echo(f"- {key}: {description}")
176
- return
177
165
 
178
166
  def build_result():
179
167
  from pmc_toolkit.xml_parse_api import ensure_extracted_article
@@ -13,26 +13,6 @@ XMLParser = etree.XMLParser(
13
13
  remove_blank_text=True,
14
14
  )
15
15
  REFERENCE_SEPARATOR_PATTERN = re.compile(r"^[\s,;]+$")
16
- EXTRACT_OUTPUT_KEY_DESCRIPTIONS = {
17
- "article_info": (
18
- "article_info.journal, article_ids, title, publication_date, article_type, "
19
- "license, keywords, authors[], abstract, and funding_grants[]"
20
- ),
21
- "content": (
22
- "content.paragraphs[] and content.sections[]; objects include source_id, "
23
- "section_id, title, text, reference_ids, figure_ids, and table_ids"
24
- ),
25
- "references": (
26
- "references[] items with source_id, label, text, publication_type, "
27
- "identifiers, article_title, source, year, volume, issue, and pages"
28
- ),
29
- "figures": "figures[] items with source_id, label, caption, and graphics",
30
- "tables": "tables[] items with source_id, label, caption, rows, and footnotes",
31
- "supporting_info": (
32
- "acknowledgements, competing_interests, data_availability, "
33
- "supplementary_media, author_notes, related_articles, and custom_metadata"
34
- ),
35
- }
36
16
 
37
17
 
38
18
  def load_xml(path: Path) -> Any:
@@ -251,7 +251,7 @@ wheels = [
251
251
 
252
252
  [[package]]
253
253
  name = "pmc-toolkit"
254
- version = "0.2.0"
254
+ version = "0.3.0"
255
255
  source = { editable = "." }
256
256
  dependencies = [
257
257
  { name = "boto3" },
File without changes
File without changes
File without changes
File without changes