pmc-toolkit 0.2.0__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/.gitignore +3 -0
  2. {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/PKG-INFO +14 -11
  3. {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/README.md +13 -10
  4. {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/RELEASING.md +4 -0
  5. {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/pyproject.toml +1 -1
  6. pmc_toolkit-0.4.0/skills/pmc-toolkit/SKILL.md +71 -0
  7. pmc_toolkit-0.4.0/skills/pmc-toolkit/agents/openai.yaml +4 -0
  8. pmc_toolkit-0.4.0/skills/pmc-toolkit/references/cli-files.md +59 -0
  9. pmc_toolkit-0.4.0/skills/pmc-toolkit/references/cli-idconv.md +25 -0
  10. pmc_toolkit-0.4.0/skills/pmc-toolkit/references/cli-metadata.md +34 -0
  11. pmc_toolkit-0.4.0/skills/pmc-toolkit/references/cli-parse.md +116 -0
  12. pmc_toolkit-0.4.0/skills/pmc-toolkit/references/cli-versions.md +35 -0
  13. pmc_toolkit-0.4.0/skills/pmc-toolkit/references/data-locator.md +42 -0
  14. pmc_toolkit-0.4.0/skills/pmc-toolkit/references/workflow-author-contributor-analysis.md +48 -0
  15. pmc_toolkit-0.4.0/skills/pmc-toolkit/references/workflow-evidence-extraction.md +47 -0
  16. pmc_toolkit-0.4.0/skills/pmc-toolkit/references/workflow-figure-image-analysis.md +33 -0
  17. pmc_toolkit-0.4.0/skills/pmc-toolkit/references/workflow-knowledge-extraction.md +38 -0
  18. pmc_toolkit-0.4.0/skills/pmc-toolkit/references/workflow-reporting.md +32 -0
  19. pmc_toolkit-0.4.0/skills/pmc-toolkit/scripts/content-outline.jq +14 -0
  20. pmc_toolkit-0.4.0/skills/pmc-toolkit/scripts/query-id.jq +3 -0
  21. pmc_toolkit-0.4.0/skills/pmc-toolkit/scripts/reverse-lookup-xref.jq +13 -0
  22. {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/src/pmc_toolkit/cli.py +41 -15
  23. pmc_toolkit-0.4.0/src/pmc_toolkit/idconv_api.py +46 -0
  24. {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/src/pmc_toolkit/xml_parse_api.py +1 -12
  25. {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/src/pmc_toolkit/xml_parse_utils.py +0 -20
  26. {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/tests/test_xml_parse_api.py +1 -1
  27. {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/uv.lock +1 -1
  28. {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/.github/workflows/ci.yml +0 -0
  29. {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/.github/workflows/release.yml +0 -0
  30. {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/.python-version +0 -0
  31. {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/AGENTS.md +0 -0
  32. {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/LICENSE +0 -0
  33. {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/src/pmc_toolkit/__init__.py +0 -0
  34. {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/src/pmc_toolkit/cache.py +0 -0
  35. {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/src/pmc_toolkit/models.py +0 -0
  36. {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/src/pmc_toolkit/storage_api.py +0 -0
  37. {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/src/pmc_toolkit/storage_utils.py +0 -0
  38. {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/src/pmc_toolkit/validators.py +0 -0
  39. {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/tests/test_cli.py +0 -0
  40. {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/tests/test_storage.py +0 -0
  41. {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/tests/test_validators.py +0 -0
  42. {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/tests/test_xml_parse_utils.py +0 -0
@@ -8,3 +8,6 @@ wheels/
8
8
 
9
9
  # Virtual environments
10
10
  .venv
11
+
12
+
13
+ .DS_Store
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pmc-toolkit
3
- Version: 0.2.0
3
+ Version: 0.4.0
4
4
  Summary: Python toolkit and CLI for exploring, downloading, and parsing PMC article data.
5
5
  Project-URL: Homepage, https://github.com/JakaKokosar/pmc-toolkit
6
6
  Project-URL: Repository, https://github.com/JakaKokosar/pmc-toolkit
@@ -42,6 +42,7 @@ The project currently supports:
42
42
  - listing available versions for a PMCID
43
43
  - validating PMC identifiers before making requests
44
44
  - retrieving metadata for a PMC identifier, defaulting to the latest version for a base PMCID
45
+ - converting PMID, DOI, PMCID, or MID values to PMC identifiers when PMC has a matching record
45
46
  - listing every object for a resolved article version, using the local cache when available
46
47
  - downloading files for an article version into a local cache (optional `--ext`
47
48
  filters apply only to `fetch`, not to `files`; `--ext` accepts either a
@@ -94,6 +95,13 @@ Fetch metadata for a specific version:
94
95
  uv run pmc-toolkit metadata PMC11370360.1
95
96
  ```
96
97
 
98
+ Convert a PMID or DOI to a PMCID when PMC has a matching record:
99
+
100
+ ```bash
101
+ uv run pmc-toolkit idconv 23193287
102
+ uv run pmc-toolkit idconv 10.1093/nar/gks1195 --idtype doi
103
+ ```
104
+
97
105
  List every object key for an article version (including media and supplements).
98
106
  For unversioned IDs, the CLI resolves the latest version from S3 first; once the
99
107
  version is known, the cached object-key manifest is reused when present. There
@@ -134,22 +142,17 @@ uv run pmc-toolkit fetch PMC11370360.1 --cache-dir ./data
134
142
  PMC_TOOLKIT_CACHE=./data uv run pmc-toolkit fetch PMC11370360.1
135
143
  ```
136
144
 
137
- Convert a cached XML file into extracted JSON. Run `fetch --ext xml` first if
138
- the XML is not already in the cache. The first conversion parses XML once,
145
+ Parse a cached XML file into extracted JSON. Run `fetch --ext xml` first if
146
+ the XML is not already in the cache. The first parse reads XML once,
139
147
  writes `<cache-root>/<PMCid.N>/.pmc-extracted-article.json`, and prints the
140
- extracted JSON; later conversions for the same article version read that JSON
148
+ extracted JSON; later parses for the same article version read that JSON
141
149
  cache unless `--force` is passed.
142
150
 
143
151
  ```bash
144
152
  uv run pmc-toolkit fetch PMC11370360.1 --ext xml
145
- uv run pmc-toolkit convert-xml PMC11370360.1
153
+ uv run pmc-toolkit parse PMC11370360.1
146
154
  ```
147
155
 
148
- List the extracted JSON top-level keys:
149
-
150
- ```bash
151
- uv run pmc-toolkit convert-xml --list-keys PMC11370360.1
152
- ```
153
156
 
154
157
  `article_info.publication_date` currently uses the first publication date found
155
158
  in the XML. If downstream consumers need to distinguish date types such as
@@ -176,7 +179,7 @@ Each resolved article version has a directory `<cache_root>/<PMCid.N>/` containi
176
179
 
177
180
  - **`<PMCid.N>.json`** — cached metadata (from S3 `metadata/<PMCid.N>.json`), written after a successful read.
178
181
  - **`.pmc-object-keys.json`** — JSON array of S3 object keys under that article’s prefix, written after `list_objects_v2` (or read on cache hit). If this file is missing or not a list of strings, listing or fetch may refetch from S3 or raise `ValueError` for an invalid manifest.
179
- - **`.pmc-extracted-article.json`** — full extracted JSON produced from the cached XML by `pmc-toolkit convert-xml`; reused by later conversions for the same article version.
182
+ - **`.pmc-extracted-article.json`** — full extracted JSON produced from the cached XML by `pmc-toolkit parse`; reused by later parses for the same article version.
180
183
 
181
184
  **Cache root selection:** `pmc-toolkit metadata` and `pmc-toolkit files` (and the matching `storage_api` functions) always use the default OS user cache from [`platformdirs`](https://github.com/tox-dev/platformdirs). Only `pmc-toolkit fetch` and `fetch_files(..., cache_dir=...)` accept `--cache-dir` or the `PMC_TOOLKIT_CACHE` environment variable.
182
185
 
@@ -10,6 +10,7 @@ The project currently supports:
10
10
  - listing available versions for a PMCID
11
11
  - validating PMC identifiers before making requests
12
12
  - retrieving metadata for a PMC identifier, defaulting to the latest version for a base PMCID
13
+ - converting PMID, DOI, PMCID, or MID values to PMC identifiers when PMC has a matching record
13
14
  - listing every object for a resolved article version, using the local cache when available
14
15
  - downloading files for an article version into a local cache (optional `--ext`
15
16
  filters apply only to `fetch`, not to `files`; `--ext` accepts either a
@@ -62,6 +63,13 @@ Fetch metadata for a specific version:
62
63
  uv run pmc-toolkit metadata PMC11370360.1
63
64
  ```
64
65
 
66
+ Convert a PMID or DOI to a PMCID when PMC has a matching record:
67
+
68
+ ```bash
69
+ uv run pmc-toolkit idconv 23193287
70
+ uv run pmc-toolkit idconv 10.1093/nar/gks1195 --idtype doi
71
+ ```
72
+
65
73
  List every object key for an article version (including media and supplements).
66
74
  For unversioned IDs, the CLI resolves the latest version from S3 first; once the
67
75
  version is known, the cached object-key manifest is reused when present. There
@@ -102,22 +110,17 @@ uv run pmc-toolkit fetch PMC11370360.1 --cache-dir ./data
102
110
  PMC_TOOLKIT_CACHE=./data uv run pmc-toolkit fetch PMC11370360.1
103
111
  ```
104
112
 
105
- Convert a cached XML file into extracted JSON. Run `fetch --ext xml` first if
106
- the XML is not already in the cache. The first conversion parses XML once,
113
+ Parse a cached XML file into extracted JSON. Run `fetch --ext xml` first if
114
+ the XML is not already in the cache. The first parse reads XML once,
107
115
  writes `<cache-root>/<PMCid.N>/.pmc-extracted-article.json`, and prints the
108
- extracted JSON; later conversions for the same article version read that JSON
116
+ extracted JSON; later parses for the same article version read that JSON
109
117
  cache unless `--force` is passed.
110
118
 
111
119
  ```bash
112
120
  uv run pmc-toolkit fetch PMC11370360.1 --ext xml
113
- uv run pmc-toolkit convert-xml PMC11370360.1
121
+ uv run pmc-toolkit parse PMC11370360.1
114
122
  ```
115
123
 
116
- List the extracted JSON top-level keys:
117
-
118
- ```bash
119
- uv run pmc-toolkit convert-xml --list-keys PMC11370360.1
120
- ```
121
124
 
122
125
  `article_info.publication_date` currently uses the first publication date found
123
126
  in the XML. If downstream consumers need to distinguish date types such as
@@ -144,7 +147,7 @@ Each resolved article version has a directory `<cache_root>/<PMCid.N>/` containi
144
147
 
145
148
  - **`<PMCid.N>.json`** — cached metadata (from S3 `metadata/<PMCid.N>.json`), written after a successful read.
146
149
  - **`.pmc-object-keys.json`** — JSON array of S3 object keys under that article’s prefix, written after `list_objects_v2` (or read on cache hit). If this file is missing or not a list of strings, listing or fetch may refetch from S3 or raise `ValueError` for an invalid manifest.
147
- - **`.pmc-extracted-article.json`** — full extracted JSON produced from the cached XML by `pmc-toolkit convert-xml`; reused by later conversions for the same article version.
150
+ - **`.pmc-extracted-article.json`** — full extracted JSON produced from the cached XML by `pmc-toolkit parse`; reused by later parses for the same article version.
148
151
 
149
152
  **Cache root selection:** `pmc-toolkit metadata` and `pmc-toolkit files` (and the matching `storage_api` functions) always use the default OS user cache from [`platformdirs`](https://github.com/tox-dev/platformdirs). Only `pmc-toolkit fetch` and `fetch_files(..., cache_dir=...)` accept `--cache-dir` or the `PMC_TOOLKIT_CACHE` environment variable.
150
153
 
@@ -32,6 +32,10 @@ deployment if the environment requires it. Smoke test:
32
32
  uv run --with "pmc-toolkit==${version}" --no-project -- pmc-toolkit --help
33
33
  ```
34
34
 
35
+ From **v0.2.0**, the PyPI wheel exposes only the `pmc-toolkit` console script
36
+ (the previous `pmc` script was removed so the binary matches the distribution
37
+ name).
38
+
35
39
  Optionally draft a GitHub Release from the tag for user-facing notes.
36
40
 
37
41
  ## Troubleshooting
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "pmc-toolkit"
7
- version = "0.2.0"
7
+ version = "0.4.0"
8
8
  description = "Python toolkit and CLI for exploring, downloading, and parsing PMC article data."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.11"
@@ -0,0 +1,71 @@
1
+ ---
2
+ name: pmc-toolkit
3
+ description: Work with PubMed Central Open Access articles by PMCID using PMC Toolkit. Use for version resolution, metadata and file inventory, downloads, parsed article evidence extraction, authors and contributor analysis, figures, tables, references, supplements, declarations, knowledge extraction, and report-style summaries. Can convert PMID/DOI to PMCID only to continue PMC full-text workflows; not for keyword literature search or non-PMC article analysis.
4
+ ---
5
+
6
+ # PMC Toolkit
7
+
8
+ Use this skill to retrieve, download, parse, and cite PMC Open Access article data with PMC Toolkit. Select commands by the data needed to complete the task, not by surface wording in the request.
9
+
10
+ ## Operating Rules
11
+
12
+ - Run published-tool commands as `uvx pmc-toolkit ...`.
13
+ - Do not add installation guidance. If `uv` or `uvx` is unavailable, report that PMC Toolkit needs it and stop.
14
+ - Live lookups, listings, and downloads require network access to the PMC Open Access S3 dataset unless the needed data is already cached.
15
+ - Resolve bundled helper paths relative to this skill directory, for example `<SKILL_DIR>/scripts/content-outline.jq`.
16
+ - Do not load, dump, grep, or search raw XML or PDF files directly for article-content tasks. Fetch XML only as parser input, then use `parse` output and bundled JSON helpers for evidence extraction.
17
+ - When piping PMC Toolkit JSON through `jq`, use `jq -c` unless pretty-printed JSON is explicitly needed for human inspection. Prefer compact JSON to avoid bloating context.
18
+ - For simple extraction requests where the command output is already the user-facing answer, do not repeat large text in the final response. Return only a brief label or status plus the exact command output when needed; for long abstracts, tables, or lists, prefer telling the user the command printed the requested value instead of restating it.
19
+ - Do not invent missing declarations, author notes, figures, tables, or references. Report the missing parsed field or empty list.
20
+
21
+ ## Task Router
22
+
23
+ Choose the smallest route that answers the request. Prefer a direct CLI route when one command output is enough; use a workflow route when the task needs multi-step retrieval, synthesis, or evidence reporting. If using a direct CLI route, make the shell command do the final formatting so the assistant response can stay minimal.
24
+
25
+ ### Direct CLI Routes
26
+
27
+ - PMCID availability and version resolution: read [references/cli-versions.md](references/cli-versions.md) for `versions` command details.
28
+ - PMID/DOI to PMCID conversion for continuing PMC workflows: read [references/cli-idconv.md](references/cli-idconv.md) for `idconv` command details.
29
+ - DOI, title, journal, license, OA flags, retraction flags, and S3 URL fields: read [references/cli-metadata.md](references/cli-metadata.md) for `metadata` command details.
30
+ - File inventory or downloads for XML, PDF, text, figures, media, or supplements: read [references/cli-files.md](references/cli-files.md) for `files` and `fetch` command details.
31
+ - Parsed article JSON, body sections, supporting info, parsed authors, figures, tables, references, or helper `jq` usage: run `fetch --ext xml` as needed, then `parse`. Read [references/cli-parse.md](references/cli-parse.md) for `parse` output shape and helper-script usage.
32
+ - If `parse` is needed but the right parsed field is not obvious, read [references/data-locator.md](references/data-locator.md) before retrieving detailed evidence.
33
+ - When a task asks about a referenced/cited article and the parsed reference has `identifiers.pmid` or `identifiers.doi` but no `identifiers.pmcid`, use `idconv` before stopping.
34
+
35
+ ### Workflow Routes
36
+
37
+ - Article-content questions, passage finding, section analysis, support for claims, declarations, supplements, or evidence-grounded answers: read [references/workflow-evidence-extraction.md](references/workflow-evidence-extraction.md).
38
+ - Author, affiliation, ORCID, equal-contribution, corresponding-author, contributor, or author-note tasks: read [references/workflow-author-contributor-analysis.md](references/workflow-author-contributor-analysis.md).
39
+ - Knowledge extraction, claim extraction, evidence matrices, mechanism summaries, or structured fact extraction: read [references/workflow-knowledge-extraction.md](references/workflow-knowledge-extraction.md).
40
+ - Figure interpretation, graphics lookup, panel questions, or visual inspection: read [references/workflow-figure-image-analysis.md](references/workflow-figure-image-analysis.md). Fetch image files only when visual inspection is required.
41
+ - Report-style summaries, author reports, evidence reports, or deliverables combining several data types: read [references/workflow-reporting.md](references/workflow-reporting.md), then load only the source-specific workflow references required by the report.
42
+
43
+ ## Bundled Resources
44
+
45
+ Open references only after choosing a route above, when command-specific details, output shapes, or workflow details are needed.
46
+
47
+ - [references/data-locator.md](references/data-locator.md) - task-to-parsed-JSON-field routing.
48
+ - [references/workflow-evidence-extraction.md](references/workflow-evidence-extraction.md) - detailed evidence retrieval loop and answer contract.
49
+ - [references/workflow-author-contributor-analysis.md](references/workflow-author-contributor-analysis.md) - author, affiliation, correspondence, and contributor-note workflow.
50
+ - [references/workflow-knowledge-extraction.md](references/workflow-knowledge-extraction.md) - generic structured extraction workflow.
51
+ - [references/workflow-figure-image-analysis.md](references/workflow-figure-image-analysis.md) - figure caption, linked text, and visual-inspection workflow.
52
+ - [references/workflow-reporting.md](references/workflow-reporting.md) - report assembly pattern for mixed data tasks.
53
+ - [references/cli-versions.md](references/cli-versions.md) - `versions` examples and version selection.
54
+ - [references/cli-idconv.md](references/cli-idconv.md) - `idconv` examples and missing-PMC handling.
55
+ - [references/cli-metadata.md](references/cli-metadata.md) - `metadata` examples and field overview.
56
+ - [references/cli-files.md](references/cli-files.md) - `files` and `fetch` output shapes.
57
+ - [references/cli-parse.md](references/cli-parse.md) - `parse` output shape and helper-script usage.
58
+ - [references/cli-parse-figures.md](references/cli-parse-figures.md) - figure lookup shape and citation context.
59
+ - [references/cli-parse-tables.md](references/cli-parse-tables.md) - table lookup shape and citation context.
60
+ - [references/cli-parse-references.md](references/cli-parse-references.md) - reference lookup shape and citation context.
61
+ - `<SKILL_DIR>/scripts/content-outline.jq` - paper outline first step for evidence extraction.
62
+ - `<SKILL_DIR>/scripts/query-id.jq` - look up sections, paragraphs, figures, tables, and references by `source_id`.
63
+ - `<SKILL_DIR>/scripts/reverse-lookup-xref.jq` - find paragraphs that cite a figure, table, or reference.
64
+
65
+ ## Gotchas
66
+
67
+ - `files` has no extension filter. Use `fetch --ext` for filtered downloads.
68
+ - Parsed reference records often omit PMCID even when they include PMID or DOI. Use `idconv` to test whether PMC has a matching article before saying PMC full text is unavailable.
69
+ - `parse` needs cached XML; run `fetch <PMCID.N> --ext xml` first when XML is absent.
70
+ - `fetch` and `parse` use the default PMC Toolkit cache unless `--cache-dir` or `PMC_TOOLKIT_CACHE` is provided. Use custom cache paths only when there is a concrete reason.
71
+ - Cache paths are per article version. Keep the same cache root across `fetch` and `parse` if a custom cache is used.
@@ -0,0 +1,4 @@
1
+ interface:
2
+ display_name: "PMC Toolkit"
3
+ short_description: "PMC OA evidence extraction"
4
+ default_prompt: "Use $pmc-toolkit to retrieve PMC paper metadata, files, parsed evidence, authors, figures, tables, or report-ready findings for this PMCID. Convert PMID/DOI to PMCID only when needed to continue PMC full-text work."
@@ -0,0 +1,59 @@
1
+ # CLI: `files` And `fetch`
2
+
3
+ Use `files <PMCID.N>` to list every S3 object key under the article version prefix. Use `fetch <PMCID.N>` to download all objects, or only those with selected extensions, into the local cache.
4
+
5
+ ## `files`
6
+
7
+ `files` has no extension filter.
8
+
9
+ ```bash
10
+ uvx pmc-toolkit files PMCxxxx.N
11
+ ```
12
+
13
+ Example output:
14
+
15
+ ```json
16
+ {
17
+ "versioned_pmcid": "PMCxxxx.N",
18
+ "keys": [
19
+ "PMCxxxx.N/PMCxxxx.N.xml",
20
+ "PMCxxxx.N/PMCxxxx.N.pdf",
21
+ "PMCxxxx.N/media-1.jpg"
22
+ ]
23
+ }
24
+ ```
25
+
26
+ Use `files` for inventory, not for local paths.
27
+
28
+ ## `fetch`
29
+
30
+ Use `fetch` when a file must exist locally for parsing, inspection, or user delivery.
31
+
32
+ Example for downloading all files listed in the `files` output above:
33
+ ```bash
34
+ uvx pmc-toolkit fetch PMCxxxx.N --ext xml,pdf,jpg
35
+ ```
36
+
37
+ Example output:
38
+
39
+ ```json
40
+ {
41
+ "versioned_pmcid": "PMCxxxx.N",
42
+ "cache_dir": "/cache/root/PMCxxxx.N",
43
+ "files": [
44
+ {
45
+ "key": "PMCxxxx.N/PMCxxxx.N.xml",
46
+ "local_path": "/cache/root/PMCxxxx.N/PMCxxxx.N.xml",
47
+ "action": "downloaded"
48
+ }, ...
49
+ ]
50
+ }
51
+ ```
52
+
53
+ Use `local_path` if you need access to the downloaded files.
54
+
55
+ ## Cache Notes
56
+
57
+ - `metadata` and `files` use the default OS user cache for metadata/manifests.
58
+ - `fetch` and `parse` can use `--cache-dir` or `PMC_TOOLKIT_CACHE`; keep the same cache root across both commands.
59
+ - Cache paths are per article version under `<cache_root>/<PMCID.N>/`.
@@ -0,0 +1,25 @@
1
+ # CLI: `idconv`
2
+
3
+ Use `idconv <ID...>` to convert PMID, DOI, PMCID, or MID values to PMC identifiers through the PMC ID Converter API. Use this only as a bridge back into PMC full-text workflows, for example when a parsed reference has `identifiers.pmid` or `identifiers.doi` but no `identifiers.pmcid`.
4
+
5
+ ```bash
6
+ uvx pmc-toolkit idconv 23193287
7
+ uvx pmc-toolkit idconv 10.1093/nar/gks1195 --idtype doi
8
+ ```
9
+
10
+ Example output shape:
11
+
12
+ ```json
13
+ [
14
+ {
15
+ "requested-id": "23193287",
16
+ "pmid": 23193287,
17
+ "pmcid": "PMC3531190",
18
+ "doi": "10.1093/nar/gks1195"
19
+ }
20
+ ]
21
+ ```
22
+
23
+ When a record has `status: "error"` or no `pmcid`, stop the PMC full-text workflow for that referenced article and report that no matching PMC record was found. Do not summarize from the title alone.
24
+
25
+ After a record returns `pmcid`, run `versions <PMCID>` and continue with `metadata`, `fetch`, and `parse`.
@@ -0,0 +1,34 @@
1
+ # CLI: `metadata`
2
+
3
+ Use `metadata <PMCID.N>` to fetch bibliographic fields, Open Access flags, and S3 URL fields (for example `xml_url`, `pdf_url`, `media_urls`, `text_url`), plus `pmid` and `doi`.
4
+
5
+ Example:
6
+ ```bash
7
+ uvx pmc-toolkit metadata PMCxxxx.N
8
+ ```
9
+
10
+ Example output:
11
+
12
+ ```json
13
+ {
14
+ "pmcid": "PMCxxxx",
15
+ "version": N,
16
+ "pmid": 12345678,
17
+ "doi": "10.1234/example.doi",
18
+ "mid": null,
19
+ "title": "Example article title",
20
+ "citation": "Journal Name",
21
+ "is_pmc_openaccess": true/false,
22
+ "is_manuscript": true/false,
23
+ "is_historical_ocr": true/false,
24
+ "is_retracted": true/false,
25
+ "license_code": "license code",
26
+ "xml_url": "s3://pmc-oa-opendata/PMCxxxx.N/PMCxxxx.N.xml?md5=<hex>",
27
+ "pdf_url": "s3://pmc-oa-opendata/PMCxxxx.N/PMCxxxx.N.pdf?md5=<hex>",
28
+ "media_urls": [
29
+ "s3://pmc-oa-opendata/PMCxxxx.N/media-1.jpg?md5=<hex>",
30
+ ...
31
+ ],
32
+ "text_url": "s3://pmc-oa-opendata/PMCxxxx.N/PMCxxxx.N.txt?md5=<hex>"
33
+ }
34
+ ```
@@ -0,0 +1,116 @@
1
+ # CLI: `parse`
2
+
3
+ Use `parse <PMCID.N>` after cached full-text XML exists. Run `fetch <PMCID.N> --ext xml` first when `<cache-root>/<PMCID.N>/<PMCID.N>.xml` is missing. The first run parses XML once, writes `<cache-root>/<PMCID.N>/.pmc-extracted-article.json`, and prints the extracted JSON; later runs reuse that cache unless `--force` is passed.
4
+
5
+ Add `--cache-dir` or `PMC_TOOLKIT_CACHE` when the XML was fetched outside the default OS user cache. Keep the same cache root across `fetch` and `parse`.
6
+
7
+ ```bash
8
+ uvx pmc-toolkit fetch PMCxxxx.N --ext xml
9
+ uvx pmc-toolkit parse PMCxxxx.N
10
+ ```
11
+
12
+ The `parse` command prints the extracted article JSON (`result.data`), not the `fetch` wrapper with `versioned_pmcid`, `cache_dir`, and downloaded `files`.
13
+
14
+ ## Extracted JSON top-level keys
15
+
16
+ - **article_info** - `journal`, `article_ids`, `title`, `publication_date`, `article_type`, `license`, `keywords`, `authors[]`, `abstract`, `funding_grants[]`
17
+ - **content** - `paragraphs[]` and `sections[]`; items include `source_id`, `section_id`, `title`, `text`, `reference_ids`, `figure_ids`, and `table_ids`
18
+ - **references** - `references[]` with `source_id`, `label`, `text`, `publication_type`, `identifiers`, `article_title`, `source`, `year`, `volume`, `issue`, and `pages`
19
+ - **figures** - `figures[]` with `source_id`, `label`, `caption`, and `graphics`
20
+ - **tables** - `tables[]` with `source_id`, `label`, `caption`, `rows`, and `footnotes`
21
+ - **supporting_info** - `acknowledgements`, `competing_interests`, `data_availability`, `supplementary_media`, `author_notes`, `related_articles`, and `custom_metadata`
22
+
23
+ ## Narrow retrieval with jq
24
+
25
+ **Start here.** Full `parse` output is large. Pipe it through jq and load only the slice you need.
26
+
27
+ ### Content outline (default first step)
28
+
29
+ `scripts/content-outline.jq` returns a nested section tree: article title plus `section_id` and `title` for each section.
30
+
31
+ ```bash
32
+ uvx pmc-toolkit parse PMCxxxx.N | jq -c -f <SKILL_DIR>/scripts/content-outline.jq
33
+ ```
34
+
35
+ Example output:
36
+
37
+ ```json
38
+ {
39
+ "title": "article title",
40
+ "sections": [
41
+ {
42
+ "section_id": "S1",
43
+ "title": "section title"
44
+ },
45
+ {
46
+ "section_id": "S2",
47
+ "title": "section title",
48
+ "sections": [
49
+ {
50
+ "section_id": "S3",
51
+ "title": "sub-section title"
52
+ }
53
+ ]
54
+ }
55
+ ]
56
+ }
57
+ ```
58
+ Use this to pick relevant sections (based on their titles) before loading detailed information.
59
+ The `section_id` values in the outline are XML source IDs (`S1`, `S2`, ...). Use them with `<SKILL_DIR>/scripts/query-id.jq` to fetch detailed section data.
60
+
61
+ ### Drill down by ID
62
+
63
+ `scripts/query-id.jq` returns the first object whose `source_id` matches. After the content outline, pass a chosen ID:
64
+
65
+ **Section** - paragraph text and xref links for that section:
66
+
67
+ ```bash
68
+ uvx pmc-toolkit parse PMCxxxx.N | jq -c --arg id "S3" -f <SKILL_DIR>/scripts/query-id.jq
69
+ ```
70
+
71
+ Example output:
72
+
73
+ ```json
74
+ {
75
+ "source_id": "S3",
76
+ "section_id": "2.1",
77
+ "title": "sub-section title",
78
+ "paragraphs": [
79
+ {
80
+ "source_id": "P9",
81
+ "text": "paragraph text",
82
+ "reference_ids": ["R1", "R18"],
83
+ "figure_ids": ["F1", "F5"],
84
+ "table_ids": ["T1"]
85
+ }
86
+ ],
87
+ "sections": []
88
+ }
89
+ ```
90
+
91
+ Some sections are containers only. In the outline, `S2` (Results) has child sections but no paragraphs of its own - the text lives in `S3`, `S4`, etc.
92
+ Query those leaf `S*` IDs (sections with no nested `sections` in the outline), not the parent, to load only the subsection you need.
93
+
94
+ **Figure, table, or reference** - same script, different ID prefix:
95
+
96
+ ```bash
97
+ uvx pmc-toolkit parse PMCxxxx.N | jq -c --arg id "F1" -f <SKILL_DIR>/scripts/query-id.jq
98
+ uvx pmc-toolkit parse PMCxxxx.N | jq -c --arg id "R1" -f <SKILL_DIR>/scripts/query-id.jq
99
+ uvx pmc-toolkit parse PMCxxxx.N | jq -c --arg id "T1" -f <SKILL_DIR>/scripts/query-id.jq
100
+ ```
101
+
102
+ Use paragraph `reference_ids`, `figure_ids`, and `table_ids` to fetch linked entries with `scripts/query-id.jq`. Output shapes:
103
+
104
+ - [cli-parse-references.md](cli-parse-references.md) - `R*` lookup
105
+ - [cli-parse-figures.md](cli-parse-figures.md) - `F*` lookup
106
+ - [cli-parse-tables.md](cli-parse-tables.md) - `T*` lookup
107
+
108
+ ### Reverse lookup by xref
109
+
110
+ `query-id.jq` resolves an ID to its object. `reverse-lookup-xref.jq` finds every paragraph that cites a given reference, figure, or table. Pass `--arg xref` as `references`, `figures`, or `tables`:
111
+
112
+ ```bash
113
+ uvx pmc-toolkit parse PMCxxxx.N | jq -c --arg xref references --arg id "R1" -f <SKILL_DIR>/scripts/reverse-lookup-xref.jq
114
+ uvx pmc-toolkit parse PMCxxxx.N | jq -c --arg xref figures --arg id "F1" -f <SKILL_DIR>/scripts/reverse-lookup-xref.jq
115
+ uvx pmc-toolkit parse PMCxxxx.N | jq -c --arg xref tables --arg id "T1" -f <SKILL_DIR>/scripts/reverse-lookup-xref.jq
116
+ ```
@@ -0,0 +1,35 @@
1
+ # CLI: `versions`
2
+
3
+ Use `versions <PMCID>` to list every published versioned PMCID string (`PMCxxxx.1`, `PMCxxxx.2`, ...) for a **base** PMCID only. `versions` rejects versioned IDs.
4
+
5
+ ```bash
6
+ uvx pmc-toolkit versions PMCxxxx
7
+ ```
8
+
9
+ Example output shape:
10
+
11
+ ```json
12
+ {
13
+ "pmcid": "PMCxxxx",
14
+ "versions": [
15
+ "PMCxxxx.1",
16
+ "PMCxxxx.2"
17
+ ]
18
+ }
19
+ ```
20
+
21
+ If `.versions` is empty, stop for that PMCID and report that no PMC Open Access version was found. Do not continue to `metadata`, `files`, `fetch`, or `parse` for that PMCID.
22
+
23
+ ## Pick the latest `<PMCID.N>`
24
+
25
+ ```bash
26
+ uvx pmc-toolkit versions PMCxxxx | jq -c -r '.versions[-1]'
27
+ ```
28
+
29
+ ## Pick a non-latest version
30
+
31
+ Select an element of `.versions` by index (for example `.versions[0]` for the first published version).
32
+
33
+ ## Next steps
34
+
35
+ After you have `<PMCID.N>`, continue with `metadata`, `files`, `fetch`, and `parse` as described in the main skill.
@@ -0,0 +1,42 @@
1
+ # Parsed Data Locator
2
+
3
+ Use this file after choosing `parse` when the right parsed JSON field is not obvious. It maps task intent to the lowest-cost parsed field or helper command. For command selection before parsing, use the router in `SKILL.md` and the CLI references.
4
+
5
+ ## Parsed JSON Routing
6
+
7
+ Use `uvx pmc-toolkit parse <PMCID.N> | jq -c '<filter>'` for compact retrieval.
8
+
9
+ | Task | First parsed source | Notes |
10
+ | --- | --- | --- |
11
+ | Article identity from XML | `.article_info` | Use when XML-derived identity is needed. For DOI, title, journal, license, OA flags, and S3 URLs alone, prefer `metadata`. |
12
+ | Abstract | `.article_info.abstract` | Use before body sections for high-level study orientation. |
13
+ | Body section discovery | `<SKILL_DIR>/scripts/content-outline.jq` | Always inspect outline before loading body text. |
14
+ | Body passages | `query-id.jq` on selected `content.sections[].source_id` | Prefer leaf sections when parent sections only group subsections. |
15
+ | Standalone body paragraphs | `.content.paragraphs[]?` | Some XML has top-level body paragraphs outside sections. |
16
+ | Authors and affiliations | `.article_info.authors[]` | Authors include resolved affiliation text when available. |
17
+ | ORCID | `.article_info.authors[].orcid` | Report absent ORCIDs as absent, not unknown. |
18
+ | Equal contribution, author notes, correspondence | `.supporting_info.author_notes` | Use with `.article_info.authors[]`; do not infer equal contribution from author order alone. |
19
+ | Funding | `.article_info.funding_grants[]`, then `.supporting_info` | Some articles encode funding in article metadata, some in acknowledgements. |
20
+ | Acknowledgements | `.supporting_info.acknowledgements[]` | Cite paragraph `source_id` when available. |
21
+ | Competing interests | `.supporting_info.competing_interests[]` | Preserve exact statement and report absence if empty. |
22
+ | Data availability | `.supporting_info.data_availability[]` | Preserve accessions, repository names, URLs, and restrictions. |
23
+ | Supplementary media | `.supporting_info.supplementary_media[]` | Use `files` only when local download or object-key inventory is needed. |
24
+ | Related articles | `.supporting_info.related_articles[]` | Useful for linking a preprint to its published article. |
25
+ | Custom PMC/JATS metadata | `.supporting_info.custom_metadata` | Use for PMC properties that are not normal article fields. |
26
+ | Figures | `.figures[]` | Start with label, caption, graphics. Use linked paragraphs before fetching images unless visual inspection is requested. |
27
+ | Tables | `.tables[]` | Contains XML-extracted rows and footnotes only. Do not assume PDF-only tables are available. |
28
+ | References | `.references[]` | Use identifiers and labels. Reverse lookup paragraphs that cite a reference for context. |
29
+ | Figure, table, or reference citation context | `reverse-lookup-xref.jq` | Pass `--arg xref figures`, `tables`, or `references`, together with `--arg id <ID>` for the object being cited. |
30
+
31
+ ## Retrieval Shortcuts
32
+
33
+ - Author summary:
34
+ `uvx pmc-toolkit parse <PMCID.N> | jq -c '{title: .article_info.title, authors: .article_info.authors, author_notes: .supporting_info.author_notes}'`
35
+ - Declarations:
36
+ `uvx pmc-toolkit parse <PMCID.N> | jq -c '.supporting_info | {acknowledgements, competing_interests, data_availability, author_notes}'`
37
+ - Figure inventory:
38
+ `uvx pmc-toolkit parse <PMCID.N> | jq -c '.figures[] | {source_id, label, caption, graphics}'`
39
+ - Table inventory:
40
+ `uvx pmc-toolkit parse <PMCID.N> | jq -c '.tables[] | {source_id, label, caption, rows, footnotes}'`
41
+ - Reference inventory:
42
+ `uvx pmc-toolkit parse <PMCID.N> | jq -c '.references[] | {source_id, label, article_title, source, year, identifiers}'`
@@ -0,0 +1,48 @@
1
+ # Workflow: Author And Contributor Analysis
2
+
3
+ Use this workflow for author lists, affiliations, ORCIDs, corresponding authors, equal-contribution notes, contributor notes, author-focused reports, and author-related declarations.
4
+
5
+ ## Source Priority
6
+
7
+ 1. Use `metadata` for PMCID version, title, DOI, citation, OA flags, and retraction status.
8
+ 2. Use `parse` for author names, resolved affiliations, ORCIDs, and author notes.
9
+ 3. Use `supporting_info.author_notes` for equal contribution, correspondence, and other notes.
10
+ 4. Use `supporting_info.acknowledgements`, `competing_interests`, and `data_availability` only when the author task asks for declarations or report context.
11
+
12
+ ## Retrieval
13
+
14
+ After resolving the version to `<PMCID.N>`, collect metadata, fetch the XML, and parse the author fields:
15
+
16
+ ```bash
17
+ uvx pmc-toolkit metadata <PMCID.N>
18
+ uvx pmc-toolkit fetch <PMCID.N> --ext xml
19
+ uvx pmc-toolkit parse <PMCID.N> | jq -c '{title: .article_info.title, authors: .article_info.authors, author_notes: .supporting_info.author_notes}'
20
+ ```
21
+
22
+ If declarations or report context are requested:
23
+
24
+ ```bash
25
+ uvx pmc-toolkit parse <PMCID.N> | jq -c '.supporting_info | {acknowledgements, competing_interests, data_availability, author_notes}'
26
+ ```
27
+
28
+ ## Interpretation Rules
29
+
30
+ - Preserve author order from `.article_info.authors[]`.
31
+ - Treat missing `orcid` fields as absent ORCIDs. Do not infer ORCIDs.
32
+ - Treat missing affiliation text as absent affiliation data. Do not invent institutional names.
33
+ - Identify equal contribution only from `author_notes`, not from author order or symbols unless the note explains the symbol.
34
+ - Identify corresponding authors only from correspondence entries or explicit author notes.
35
+ - If author notes mention symbols but the parsed author list does not connect symbols to names, report the limitation instead of forcing a mapping.
36
+
37
+ ## Output Patterns
38
+
39
+ For a compact author answer, include:
40
+
41
+ - Selected `<PMCID.N>`.
42
+ - Article title and DOI when available.
43
+ - Ordered author list.
44
+ - Affiliation and ORCID fields when requested or relevant.
45
+ - Author-note evidence with `supporting_info.author_notes` and item `source_id` when available.
46
+ - Clear gaps for missing ORCIDs, affiliations, equal-contribution notes, or correspondence.
47
+
48
+ For an author report, use [workflow-reporting.md](workflow-reporting.md) and include an evidence table for author-note claims.
@@ -0,0 +1,47 @@
1
+ # Workflow: Evidence Extraction
2
+
3
+ Use this workflow for article-content questions, evidence-grounded answers, section analysis, declarations, supplements, and claims that must be tied to PMC article evidence.
4
+
5
+ ## Retrieval Loop
6
+
7
+ 1. Resolve the PMCID to a pinned `<PMCID.N>`.
8
+ 2. Fetch XML if needed:
9
+ `uvx pmc-toolkit fetch <PMCID.N> --ext xml`
10
+ 3. Inspect the outline first:
11
+ `uvx pmc-toolkit parse <PMCID.N> | jq -c -f <SKILL_DIR>/scripts/content-outline.jq`
12
+ 4. Use [data-locator.md](data-locator.md) to decide whether the answer lives in `article_info`, `content`, `supporting_info`, `figures`, `tables`, or `references`.
13
+ 5. State the next retrieval plan before loading detailed evidence when the task needs multiple evidence targets.
14
+ 6. Retrieve narrow evidence:
15
+ `uvx pmc-toolkit parse <PMCID.N> | jq -c --arg id "<SOURCE_ID>" -f <SKILL_DIR>/scripts/query-id.jq`
16
+ 7. For linked support, retrieve cited objects or reverse lookup citation context:
17
+ `uvx pmc-toolkit parse <PMCID.N> | jq -c --arg xref references --arg id "<REFERENCE_ID>" -f <SKILL_DIR>/scripts/reverse-lookup-xref.jq`
18
+ 8. If the user asks about the full text, abstract, authors, figures, tables, or evidence inside a referenced/cited article, inspect that reference's `identifiers`. If it has `pmcid`, continue with that PMCID. If it has `pmid` or `doi` but no `pmcid`, read [cli-idconv.md](cli-idconv.md) and run `idconv` before stopping.
19
+ 9. Decide after each retrieval whether the evidence is sufficient. If not, choose the next source and repeat.
20
+ 10. Stop when the answer is sufficiently supported or when the parsed JSON lacks the needed field. Report gaps explicitly.
21
+
22
+ ## Evidence Selection
23
+
24
+ - Use article title, abstract, and outline to orient.
25
+ - Prefer sections whose titles match the task. Query leaf sections rather than broad parent sections when possible.
26
+ - For claims about results, methods, or discussion, use body paragraphs, not only the abstract.
27
+ - For declarations, use `supporting_info` first.
28
+ - For figure, table, or reference claims, inspect the object and linked paragraph context.
29
+ - For author/contributor claims, use the author workflow.
30
+
31
+ ## Answer Requirements
32
+
33
+ Include in the final answer:
34
+
35
+ - Base PMCID and selected `<PMCID.N>`.
36
+ - Each claim with a human-readable locator.
37
+ - Stable `source_id` when useful for traceability.
38
+ - Short evidence summary or short quote from retrieved parsed JSON.
39
+ - Any gap, conflict, or mismatch.
40
+
41
+ Use these locators:
42
+
43
+ - Body text: `section_id` and section title, plus paragraph `source_id` when needed.
44
+ - Figures: figure `label`.
45
+ - Tables: table `label`.
46
+ - References: reference `label`.
47
+ - Supporting info: supporting-info category plus item `source_id` when available.
@@ -0,0 +1,33 @@
1
+ # Workflow: Figure And Image Analysis
2
+
3
+ Use this workflow for figure captions, figure-linked claims, panel interpretation, graphics files, and visual inspection requests.
4
+
5
+ ## Caption And Text Evidence
6
+
7
+ 1. Resolve to `<PMCID.N>`.
8
+ 2. Fetch and parse XML.
9
+ 3. List figures:
10
+ `uvx pmc-toolkit parse <PMCID.N> | jq -c '.figures[] | {source_id, label, caption, graphics}'`
11
+ 4. Retrieve the selected figure by ID:
12
+ `uvx pmc-toolkit parse <PMCID.N> | jq -c --arg id "<FIGURE_ID>" -f <SKILL_DIR>/scripts/query-id.jq`
13
+ 5. Retrieve paragraphs that cite the figure:
14
+ `uvx pmc-toolkit parse <PMCID.N> | jq -c --arg xref figures --arg id "<FIGURE_ID>" -f <SKILL_DIR>/scripts/reverse-lookup-xref.jq`
15
+
16
+ Use caption plus linked paragraphs for text-grounded figure answers.
17
+
18
+ ## Visual Inspection
19
+
20
+ Fetch image files only when the user asks about the visual itself, a panel, an image feature, or when caption/text evidence is insufficient.
21
+
22
+ 1. Run `files <PMCID.N>` to inspect available image/media object keys.
23
+ 2. Match figure `graphics[]` values to object-key suffixes when possible.
24
+ 3. Fetch only likely image extensions:
25
+ `uvx pmc-toolkit fetch <PMCID.N> --ext jpg,png,tif,tiff,gif`
26
+ 4. Use the returned `local_path` for visual inspection with the available image-viewing tool.
27
+
28
+ ## Output Rules
29
+
30
+ - Cite figure `label` and selected `<PMCID.N>`.
31
+ - Include caption evidence and linked paragraph evidence when used.
32
+ - Distinguish what is visible in the image from what the caption or article text states.
33
+ - If the graphics file cannot be matched or fetched, answer from caption/text evidence and report the visual gap.
@@ -0,0 +1,38 @@
1
+ # Workflow: Knowledge Extraction
2
+
3
+ Use this workflow for broad but structured tasks such as extracting key findings, mechanisms, datasets, claims, measurements, limitations, interventions, outcomes, or article-specific knowledge graphs. Keep the workflow generic; specialize the output schema to the user's task.
4
+
5
+ ## Process
6
+
7
+ 1. Resolve to `<PMCID.N>`.
8
+ 2. Fetch XML and inspect the outline.
9
+ 3. Read the abstract only for orientation:
10
+ `uvx pmc-toolkit parse <PMCID.N> | jq -c '{title: .article_info.title, abstract: .article_info.abstract, keywords: .article_info.keywords}'`
11
+ 4. Choose target sections from the outline. For most research extraction tasks, inspect methods, results, discussion, limitations, and any named domain sections.
12
+ 5. Retrieve selected sections by `source_id` with `query-id.jq`.
13
+ 6. Extract candidate knowledge records from retrieved evidence only.
14
+ 7. If the task involves figures, tables, or references, retrieve those objects and their linked paragraph context.
15
+ 8. Stop when the selected evidence covers the requested schema or when additional sections are unlikely to change the answer. Report uninspected sections when they are relevant but not loaded.
16
+
17
+ ## Record Schema
18
+
19
+ Use or adapt this schema unless the user provides another:
20
+
21
+ - `item`: concise concept, claim, finding, method, variable, dataset, limitation, or outcome.
22
+ - `category`: user-relevant class such as method, result, mechanism, limitation, dataset, or evidence.
23
+ - `evidence_locator`: section title and `section_id`; figure/table/reference label if applicable.
24
+ - `source_id`: paragraph, section, figure, table, or reference ID.
25
+ - `evidence`: short quote or compact summary from retrieved parsed JSON.
26
+ - `confidence`: high, medium, or low based on specificity and directness of evidence.
27
+ - `gap`: missing context, ambiguity, or unsupported inference.
28
+
29
+ ## Rules
30
+
31
+ - Separate article claims from your own synthesis.
32
+ - Do not use uninspected sections as evidence.
33
+ - Prefer direct result/method paragraphs over abstract-only evidence.
34
+ - Keep extraction records small enough to verify. If the task is large, produce a first-pass matrix and state what remains to inspect.
35
+ - Use [workflow-reporting.md](workflow-reporting.md) when the user asks for a polished report rather than raw records.
36
+
37
+
38
+
@@ -0,0 +1,32 @@
1
+ # Workflow: Reporting
2
+
3
+ Use this workflow when the user asks for a report, memo, evidence brief, author report, structured summary, or deliverable that combines multiple PMC Toolkit data sources.
4
+
5
+ ## Process
6
+
7
+ 1. Infer the report scope from the request. Ask a question only when the deliverable cannot be scoped safely.
8
+ 2. Resolve to `<PMCID.N>` and collect metadata for the report header.
9
+ 3. Use the router in `SKILL.md` to choose command and workflow sources. Load [data-locator.md](data-locator.md) only when parsed JSON fields are not obvious.
10
+ 4. Load only the source-specific workflow references required by the report.
11
+ 5. Retrieve evidence in small slices. Keep a scratch list of every claim with its locator.
12
+ 6. Assemble the report from retrieved evidence only.
13
+ 7. Include a limitations or gaps section when data is absent, ambiguous, or not inspected.
14
+
15
+ ## Suggested Sections
16
+
17
+ Use these sections when they fit the task:
18
+
19
+ - Article: PMCID, selected version, title, DOI, journal/citation, date, license/OA status.
20
+ - Scope: what the report covers.
21
+ - Findings: grouped by the user's task.
22
+ - Evidence Table: claim, locator, source ID, short evidence, gap.
23
+ - Files Or Artifacts: available XML/PDF/media/supplements when relevant.
24
+ - Gaps: absent parsed fields, unavailable XML/files, or uninspected sections.
25
+
26
+ ## Rules
27
+
28
+ - Do not make report sections that hide evidence gaps.
29
+ - Do not over-fetch. A report can combine metadata, author notes, and a few body sections without loading the entire parsed article.
30
+ - Use concise quotations only when they add auditability.
31
+ - For author reports, use [workflow-author-contributor-analysis.md](workflow-author-contributor-analysis.md).
32
+ - For broad extraction reports, use [workflow-knowledge-extraction.md](workflow-knowledge-extraction.md).
@@ -0,0 +1,14 @@
1
# Keep only the entries of $o whose value is present:
# null values and empty arrays are removed, everything else is kept.
def drop_empty($o):
  $o
  | to_entries
  | map(select(
      .value
      | if . == null then false
        elif type == "array" then length > 0
        else true
        end
    ))
  | from_entries;

# Outline node for one section: its ID, its title, and the outline nodes of
# its nested subsections. Missing titles and empty subsection lists are omitted.
def section:
  {
    section_id: .source_id,
    title,
    sections: [.sections[]? | section]
  }
  | drop_empty(.);

# Article outline: the article title plus the recursive section tree.
{
  title: .article_info.title,
  sections: [.content.sections[]? | section]
}
@@ -0,0 +1,3 @@
1
# Emit every object, at any depth of the input, whose source_id equals $id.
# Invoke with: jq --arg id "<SOURCE_ID>" -f query-id.jq
recurse
| select(type == "object")
| select(.source_id == $id)
@@ -0,0 +1,13 @@
1
# Translate the --arg xref category into the field that lists cited IDs,
# failing fast on any other category.
(
  if $xref == "references" then "reference_ids"
  elif $xref == "figures" then "figure_ids"
  elif $xref == "tables" then "table_ids"
  else error("xref must be references, figures, or tables")
  end
) as $field
# Collect every object, at any depth, whose ID list contains $id, reduced to
# its source_id, its text, and the matching ID list.
| [
    recurse
    | objects
    | select(.[$field]? | index($id))
    | {source_id, text, ($field): .[$field]}
  ]
@@ -71,6 +71,44 @@ def metadata(
71
71
  _emit_json(result.model_dump(mode="json"))
72
72
 
73
73
 
74
@app.command("idconv")
def idconv(
    identifiers: list[str] = typer.Argument(
        ...,
        help=(
            "PMID, DOI, PMCID, or MID values to convert to PMC identifiers. "
            "Comma-separated values are accepted."
        ),
    ),
    idtype: str | None = typer.Option(
        None,
        "--idtype",
        help="Optional input identifier type: pmid, doi, pmcid, or mid.",
    ),
    email: str | None = typer.Option(
        None,
        "--email",
        envvar="NCBI_EMAIL",
        help="Optional contact email sent to the NCBI ID Converter API.",
    ),
) -> None:
    """
    Convert PMID/DOI identifiers to PMC identifiers when available in PMC.
    """
    # NOTE: the docstring above doubles as the command's help text, so it is
    # kept verbatim.

    def build_result():
        # The API module is imported only when the command actually runs
        # (presumably to keep CLI start-up light; confirm against the other
        # commands in this module).
        from pmc_toolkit import idconv_api

        return idconv_api.convert_to_pmcids(
            identifiers, idtype=idtype, email=email
        )

    # _run_command executes the thunk; only the "records" entry of the
    # converter payload is printed.
    payload = _run_command(build_result)
    records = payload["records"]
    _emit_json(records)
110
+
111
+
74
112
  @app.command("files")
75
113
  def files(
76
114
  requested_pmcid: str = typer.Argument(
@@ -140,8 +178,8 @@ def fetch(
140
178
  _emit_json(result.model_dump(mode="json"))
141
179
 
142
180
 
143
- @app.command("convert-xml")
144
- def convert_xml(
181
+ @app.command("parse")
182
+ def parse(
145
183
  requested_pmcid: str = typer.Argument(
146
184
  ...,
147
185
  help="PMC accession ID or version ID, e.g. PMC11370360 or PMC11370360.1",
@@ -158,22 +196,10 @@ def convert_xml(
158
196
  "-f",
159
197
  help="Recreate the extracted JSON cache from the cached XML.",
160
198
  ),
161
- list_keys: bool = typer.Option(
162
- False,
163
- "--list-keys",
164
- help="Print available extracted JSON keys and descriptions, then exit.",
165
- ),
166
199
  ) -> None:
167
200
  """
168
- Convert cached PMC full-text XML into cached extracted JSON.
201
+ Parse cached PMC full-text XML into cached extracted JSON.
169
202
  """
170
- if list_keys:
171
- from pmc_toolkit.xml_parse_utils import EXTRACT_OUTPUT_KEY_DESCRIPTIONS
172
-
173
- typer.echo("Available extracted JSON keys:")
174
- for key, description in EXTRACT_OUTPUT_KEY_DESCRIPTIONS.items():
175
- typer.echo(f"- {key}: {description}")
176
- return
177
203
 
178
204
  def build_result():
179
205
  from pmc_toolkit.xml_parse_api import ensure_extracted_article
@@ -0,0 +1,46 @@
1
+ """NCBI PMC ID Converter client."""
2
+
3
+ from collections.abc import Sequence
4
+ import json
5
+ from typing import Any
6
+ from urllib.error import HTTPError, URLError
7
+ from urllib.parse import urlencode
8
+ from urllib.request import Request, urlopen
9
+
10
+ ID_CONVERTER_URL = "https://pmc.ncbi.nlm.nih.gov/tools/idconv/api/v1/articles/"
11
+
12
+
13
+ def convert_to_pmcids(
14
+ identifiers: Sequence[str],
15
+ *,
16
+ idtype: str | None = None,
17
+ email: str | None = None,
18
+ timeout: float = 30.0,
19
+ ) -> dict[str, Any]:
20
+ params = {
21
+ "tool": "pmc_toolkit",
22
+ "format": "json",
23
+ "ids": ",".join(identifiers),
24
+ }
25
+ if idtype:
26
+ params["idtype"] = idtype
27
+ if email:
28
+ params["email"] = email
29
+
30
+ url = f"{ID_CONVERTER_URL}?{urlencode(params)}"
31
+ request = Request(url, headers={"User-Agent": "pmc-toolkit"})
32
+ try:
33
+ with urlopen(request, timeout=timeout) as response:
34
+ payload = json.loads(response.read().decode("utf-8"))
35
+ except HTTPError as exc:
36
+ raise RuntimeError(
37
+ f"ID converter request failed with HTTP {exc.code}."
38
+ ) from exc
39
+ except URLError as exc:
40
+ raise RuntimeError(f"ID converter request failed: {exc.reason}.") from exc
41
+ except json.JSONDecodeError as exc:
42
+ raise RuntimeError("ID converter returned invalid JSON.") from exc
43
+
44
+ if not isinstance(payload, dict):
45
+ raise RuntimeError("ID converter returned an unexpected response.")
46
+ return payload
@@ -105,11 +105,7 @@ def _ensure_extracted_article_cache(
105
105
  from pmc_toolkit.xml_parse_utils import extract_article_data, load_xml
106
106
 
107
107
  root = load_xml(paths.xml_path)
108
- parsed = _group_extracted_article(
109
- extract_article_data(root),
110
- versioned_pmcid=paths.versioned_pmcid,
111
- xml_path=paths.xml_path,
112
- )
108
+ parsed = _group_extracted_article(extract_article_data(root))
113
109
  storage_cache.write_cached_extracted_article(
114
110
  paths.cache_root,
115
111
  paths.versioned_pmcid,
@@ -120,15 +116,8 @@ def _ensure_extracted_article_cache(
120
116
 
121
117
  def _group_extracted_article(
122
118
  raw_data: dict[str, Any],
123
- *,
124
- versioned_pmcid: str,
125
- xml_path: Path,
126
119
  ) -> dict[str, Any]:
127
120
  return {
128
- "_meta": {
129
- "versioned_pmcid": versioned_pmcid,
130
- "xml_path": str(xml_path),
131
- },
132
121
  "article_info": _article_info(raw_data),
133
122
  "content": raw_data["content"],
134
123
  "references": raw_data["references"],
@@ -13,26 +13,6 @@ XMLParser = etree.XMLParser(
13
13
  remove_blank_text=True,
14
14
  )
15
15
  REFERENCE_SEPARATOR_PATTERN = re.compile(r"^[\s,;]+$")
16
- EXTRACT_OUTPUT_KEY_DESCRIPTIONS = {
17
- "article_info": (
18
- "article_info.journal, article_ids, title, publication_date, article_type, "
19
- "license, keywords, authors[], abstract, and funding_grants[]"
20
- ),
21
- "content": (
22
- "content.paragraphs[] and content.sections[]; objects include source_id, "
23
- "section_id, title, text, reference_ids, figure_ids, and table_ids"
24
- ),
25
- "references": (
26
- "references[] items with source_id, label, text, publication_type, "
27
- "identifiers, article_title, source, year, volume, issue, and pages"
28
- ),
29
- "figures": "figures[] items with source_id, label, caption, and graphics",
30
- "tables": "tables[] items with source_id, label, caption, rows, and footnotes",
31
- "supporting_info": (
32
- "acknowledgements, competing_interests, data_availability, "
33
- "supplementary_media, author_notes, related_articles, and custom_metadata"
34
- ),
35
- }
36
16
 
37
17
 
38
18
  def load_xml(path: Path) -> Any:
@@ -73,7 +73,7 @@ def test_ensure_extracted_article_reads_xml_and_writes_extracted_cache(
73
73
 
74
74
  assert result.versioned_pmcid == "PMC11370360.1"
75
75
  assert result.xml_path == str(article_dir / "PMC11370360.1.xml")
76
- assert result.data["_meta"]["versioned_pmcid"] == "PMC11370360.1"
76
+ assert "_meta" not in result.data
77
77
  assert result.data["article_info"]["journal"]["name"] == "bioRxiv"
78
78
  assert result.data["article_info"]["journal"]["issn"] == "2692-8205"
79
79
  assert result.data["article_info"]["article_ids"] == {
@@ -251,7 +251,7 @@ wheels = [
251
251
 
252
252
  [[package]]
253
253
  name = "pmc-toolkit"
254
- version = "0.2.0"
254
+ version = "0.4.0"
255
255
  source = { editable = "." }
256
256
  dependencies = [
257
257
  { name = "boto3" },
File without changes
File without changes
File without changes