pmc-toolkit 0.2.0__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/.gitignore +3 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/PKG-INFO +14 -11
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/README.md +13 -10
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/RELEASING.md +4 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/pyproject.toml +1 -1
- pmc_toolkit-0.4.0/skills/pmc-toolkit/SKILL.md +71 -0
- pmc_toolkit-0.4.0/skills/pmc-toolkit/agents/openai.yaml +4 -0
- pmc_toolkit-0.4.0/skills/pmc-toolkit/references/cli-files.md +59 -0
- pmc_toolkit-0.4.0/skills/pmc-toolkit/references/cli-idconv.md +25 -0
- pmc_toolkit-0.4.0/skills/pmc-toolkit/references/cli-metadata.md +34 -0
- pmc_toolkit-0.4.0/skills/pmc-toolkit/references/cli-parse.md +116 -0
- pmc_toolkit-0.4.0/skills/pmc-toolkit/references/cli-versions.md +35 -0
- pmc_toolkit-0.4.0/skills/pmc-toolkit/references/data-locator.md +42 -0
- pmc_toolkit-0.4.0/skills/pmc-toolkit/references/workflow-author-contributor-analysis.md +48 -0
- pmc_toolkit-0.4.0/skills/pmc-toolkit/references/workflow-evidence-extraction.md +47 -0
- pmc_toolkit-0.4.0/skills/pmc-toolkit/references/workflow-figure-image-analysis.md +33 -0
- pmc_toolkit-0.4.0/skills/pmc-toolkit/references/workflow-knowledge-extraction.md +38 -0
- pmc_toolkit-0.4.0/skills/pmc-toolkit/references/workflow-reporting.md +32 -0
- pmc_toolkit-0.4.0/skills/pmc-toolkit/scripts/content-outline.jq +14 -0
- pmc_toolkit-0.4.0/skills/pmc-toolkit/scripts/query-id.jq +3 -0
- pmc_toolkit-0.4.0/skills/pmc-toolkit/scripts/reverse-lookup-xref.jq +13 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/src/pmc_toolkit/cli.py +41 -15
- pmc_toolkit-0.4.0/src/pmc_toolkit/idconv_api.py +46 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/src/pmc_toolkit/xml_parse_api.py +1 -12
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/src/pmc_toolkit/xml_parse_utils.py +0 -20
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/tests/test_xml_parse_api.py +1 -1
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/uv.lock +1 -1
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/.github/workflows/ci.yml +0 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/.github/workflows/release.yml +0 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/.python-version +0 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/AGENTS.md +0 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/LICENSE +0 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/src/pmc_toolkit/__init__.py +0 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/src/pmc_toolkit/cache.py +0 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/src/pmc_toolkit/models.py +0 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/src/pmc_toolkit/storage_api.py +0 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/src/pmc_toolkit/storage_utils.py +0 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/src/pmc_toolkit/validators.py +0 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/tests/test_cli.py +0 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/tests/test_storage.py +0 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/tests/test_validators.py +0 -0
- {pmc_toolkit-0.2.0 → pmc_toolkit-0.4.0}/tests/test_xml_parse_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pmc-toolkit
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: Python toolkit and CLI for exploring, downloading, and parsing PMC article data.
|
|
5
5
|
Project-URL: Homepage, https://github.com/JakaKokosar/pmc-toolkit
|
|
6
6
|
Project-URL: Repository, https://github.com/JakaKokosar/pmc-toolkit
|
|
@@ -42,6 +42,7 @@ The project currently supports:
|
|
|
42
42
|
- listing available versions for a PMCID
|
|
43
43
|
- validating PMC identifiers before making requests
|
|
44
44
|
- retrieving metadata for a PMC identifier, defaulting to the latest version for a base PMCID
|
|
45
|
+
- converting PMID, DOI, PMCID, or MID values to PMC identifiers when PMC has a matching record
|
|
45
46
|
- listing every object for a resolved article version, using the local cache when available
|
|
46
47
|
- downloading files for an article version into a local cache (optional `--ext`
|
|
47
48
|
filters apply only to `fetch`, not to `files`; `--ext` accepts either a
|
|
@@ -94,6 +95,13 @@ Fetch metadata for a specific version:
|
|
|
94
95
|
uv run pmc-toolkit metadata PMC11370360.1
|
|
95
96
|
```
|
|
96
97
|
|
|
98
|
+
Convert a PMID or DOI to a PMCID when PMC has a matching record:
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
uv run pmc-toolkit idconv 23193287
|
|
102
|
+
uv run pmc-toolkit idconv 10.1093/nar/gks1195 --idtype doi
|
|
103
|
+
```
|
|
104
|
+
|
|
97
105
|
List every object key for an article version (including media and supplements).
|
|
98
106
|
For unversioned IDs, the CLI resolves the latest version from S3 first; once the
|
|
99
107
|
version is known, the cached object-key manifest is reused when present. There
|
|
@@ -134,22 +142,17 @@ uv run pmc-toolkit fetch PMC11370360.1 --cache-dir ./data
|
|
|
134
142
|
PMC_TOOLKIT_CACHE=./data uv run pmc-toolkit fetch PMC11370360.1
|
|
135
143
|
```
|
|
136
144
|
|
|
137
|
-
|
|
138
|
-
the XML is not already in the cache. The first
|
|
145
|
+
Parse a cached XML file into extracted JSON. Run `fetch --ext xml` first if
|
|
146
|
+
the XML is not already in the cache. The first parse reads XML once,
|
|
139
147
|
writes `<cache-root>/<PMCid.N>/.pmc-extracted-article.json`, and prints the
|
|
140
|
-
extracted JSON; later
|
|
148
|
+
extracted JSON; later parses for the same article version read that JSON
|
|
141
149
|
cache unless `--force` is passed.
|
|
142
150
|
|
|
143
151
|
```bash
|
|
144
152
|
uv run pmc-toolkit fetch PMC11370360.1 --ext xml
|
|
145
|
-
uv run pmc-toolkit convert-xml PMC11370360.1
|
|
153
|
+
uv run pmc-toolkit parse PMC11370360.1
|
|
146
154
|
```
|
|
147
155
|
|
|
148
|
-
List the extracted JSON top-level keys:
|
|
149
|
-
|
|
150
|
-
```bash
|
|
151
|
-
uv run pmc-toolkit convert-xml --list-keys PMC11370360.1
|
|
152
|
-
```
|
|
153
156
|
|
|
154
157
|
`article_info.publication_date` currently uses the first publication date found
|
|
155
158
|
in the XML. If downstream consumers need to distinguish date types such as
|
|
@@ -176,7 +179,7 @@ Each resolved article version has a directory `<cache_root>/<PMCid.N>/` containi
|
|
|
176
179
|
|
|
177
180
|
- **`<PMCid.N>.json`** — cached metadata (from S3 `metadata/<PMCid.N>.json`), written after a successful read.
|
|
178
181
|
- **`.pmc-object-keys.json`** — JSON array of S3 object keys under that article’s prefix, written after `list_objects_v2` (or read on cache hit). If this file is missing or not a list of strings, listing or fetch may refetch from S3 or raise `ValueError` for an invalid manifest.
|
|
179
|
-
- **`.pmc-extracted-article.json`** — full extracted JSON produced from the cached XML by `pmc-toolkit
|
|
182
|
+
- **`.pmc-extracted-article.json`** — full extracted JSON produced from the cached XML by `pmc-toolkit parse`; reused by later parses for the same article version.
|
|
180
183
|
|
|
181
184
|
**Cache root selection:** `pmc-toolkit metadata` and `pmc-toolkit files` (and the matching `storage_api` functions) always use the default OS user cache from [`platformdirs`](https://github.com/tox-dev/platformdirs). Only `pmc-toolkit fetch` and `fetch_files(..., cache_dir=...)` accept `--cache-dir` or the `PMC_TOOLKIT_CACHE` environment variable.
|
|
182
185
|
|
|
@@ -10,6 +10,7 @@ The project currently supports:
|
|
|
10
10
|
- listing available versions for a PMCID
|
|
11
11
|
- validating PMC identifiers before making requests
|
|
12
12
|
- retrieving metadata for a PMC identifier, defaulting to the latest version for a base PMCID
|
|
13
|
+
- converting PMID, DOI, PMCID, or MID values to PMC identifiers when PMC has a matching record
|
|
13
14
|
- listing every object for a resolved article version, using the local cache when available
|
|
14
15
|
- downloading files for an article version into a local cache (optional `--ext`
|
|
15
16
|
filters apply only to `fetch`, not to `files`; `--ext` accepts either a
|
|
@@ -62,6 +63,13 @@ Fetch metadata for a specific version:
|
|
|
62
63
|
uv run pmc-toolkit metadata PMC11370360.1
|
|
63
64
|
```
|
|
64
65
|
|
|
66
|
+
Convert a PMID or DOI to a PMCID when PMC has a matching record:
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
uv run pmc-toolkit idconv 23193287
|
|
70
|
+
uv run pmc-toolkit idconv 10.1093/nar/gks1195 --idtype doi
|
|
71
|
+
```
|
|
72
|
+
|
|
65
73
|
List every object key for an article version (including media and supplements).
|
|
66
74
|
For unversioned IDs, the CLI resolves the latest version from S3 first; once the
|
|
67
75
|
version is known, the cached object-key manifest is reused when present. There
|
|
@@ -102,22 +110,17 @@ uv run pmc-toolkit fetch PMC11370360.1 --cache-dir ./data
|
|
|
102
110
|
PMC_TOOLKIT_CACHE=./data uv run pmc-toolkit fetch PMC11370360.1
|
|
103
111
|
```
|
|
104
112
|
|
|
105
|
-
|
|
106
|
-
the XML is not already in the cache. The first
|
|
113
|
+
Parse a cached XML file into extracted JSON. Run `fetch --ext xml` first if
|
|
114
|
+
the XML is not already in the cache. The first parse reads XML once,
|
|
107
115
|
writes `<cache-root>/<PMCid.N>/.pmc-extracted-article.json`, and prints the
|
|
108
|
-
extracted JSON; later
|
|
116
|
+
extracted JSON; later parses for the same article version read that JSON
|
|
109
117
|
cache unless `--force` is passed.
|
|
110
118
|
|
|
111
119
|
```bash
|
|
112
120
|
uv run pmc-toolkit fetch PMC11370360.1 --ext xml
|
|
113
|
-
uv run pmc-toolkit convert-xml PMC11370360.1
|
|
121
|
+
uv run pmc-toolkit parse PMC11370360.1
|
|
114
122
|
```
|
|
115
123
|
|
|
116
|
-
List the extracted JSON top-level keys:
|
|
117
|
-
|
|
118
|
-
```bash
|
|
119
|
-
uv run pmc-toolkit convert-xml --list-keys PMC11370360.1
|
|
120
|
-
```
|
|
121
124
|
|
|
122
125
|
`article_info.publication_date` currently uses the first publication date found
|
|
123
126
|
in the XML. If downstream consumers need to distinguish date types such as
|
|
@@ -144,7 +147,7 @@ Each resolved article version has a directory `<cache_root>/<PMCid.N>/` containi
|
|
|
144
147
|
|
|
145
148
|
- **`<PMCid.N>.json`** — cached metadata (from S3 `metadata/<PMCid.N>.json`), written after a successful read.
|
|
146
149
|
- **`.pmc-object-keys.json`** — JSON array of S3 object keys under that article’s prefix, written after `list_objects_v2` (or read on cache hit). If this file is missing or not a list of strings, listing or fetch may refetch from S3 or raise `ValueError` for an invalid manifest.
|
|
147
|
-
- **`.pmc-extracted-article.json`** — full extracted JSON produced from the cached XML by `pmc-toolkit
|
|
150
|
+
- **`.pmc-extracted-article.json`** — full extracted JSON produced from the cached XML by `pmc-toolkit parse`; reused by later parses for the same article version.
|
|
148
151
|
|
|
149
152
|
**Cache root selection:** `pmc-toolkit metadata` and `pmc-toolkit files` (and the matching `storage_api` functions) always use the default OS user cache from [`platformdirs`](https://github.com/tox-dev/platformdirs). Only `pmc-toolkit fetch` and `fetch_files(..., cache_dir=...)` accept `--cache-dir` or the `PMC_TOOLKIT_CACHE` environment variable.
|
|
150
153
|
|
|
@@ -32,6 +32,10 @@ deployment if the environment requires it. Smoke test:
|
|
|
32
32
|
uv run --with "pmc-toolkit==${version}" --no-project -- pmc-toolkit --help
|
|
33
33
|
```
|
|
34
34
|
|
|
35
|
+
From **v0.2.0**, the PyPI wheel exposes only the `pmc-toolkit` console script
|
|
36
|
+
(the previous `pmc` script was removed so the binary matches the distribution
|
|
37
|
+
name).
|
|
38
|
+
|
|
35
39
|
Optionally draft a GitHub Release from the tag for user-facing notes.
|
|
36
40
|
|
|
37
41
|
## Troubleshooting
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: pmc-toolkit
|
|
3
|
+
description: Work with PubMed Central Open Access articles by PMCID using PMC Toolkit. Use for version resolution, metadata and file inventory, downloads, parsed article evidence extraction, authors and contributor analysis, figures, tables, references, supplements, declarations, knowledge extraction, and report-style summaries. Can convert PMID/DOI to PMCID only to continue PMC full-text workflows; not for keyword literature search or non-PMC article analysis.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# PMC Toolkit
|
|
7
|
+
|
|
8
|
+
Use this skill to retrieve, download, parse, and cite PMC Open Access article data with PMC Toolkit. Select commands by the data needed to complete the task, not by surface wording in the request.
|
|
9
|
+
|
|
10
|
+
## Operating Rules
|
|
11
|
+
|
|
12
|
+
- Run published-tool commands as `uvx pmc-toolkit ...`.
|
|
13
|
+
- Do not add installation guidance. If `uv` or `uvx` is unavailable, report that PMC Toolkit needs it and stop.
|
|
14
|
+
- Live lookups, listings, and downloads require network access to the PMC Open Access S3 dataset unless the needed data is already cached.
|
|
15
|
+
- Resolve bundled helper paths relative to this skill directory, for example `<SKILL_DIR>/scripts/content-outline.jq`.
|
|
16
|
+
- Do not load, dump, grep, or search raw XML or PDF files directly for article-content tasks. Fetch XML only as parser input, then use `parse` output and bundled JSON helpers for evidence extraction.
|
|
17
|
+
- When piping PMC Toolkit JSON through `jq`, use `jq -c` unless pretty-printed JSON is explicitly needed for human inspection. Prefer compact JSON to avoid bloating context.
|
|
18
|
+
- For simple extraction requests where the command output is already the user-facing answer, do not repeat large text in the final response. Return only a brief label or status plus the exact command output when needed; for long abstracts, tables, or lists, prefer telling the user the command printed the requested value instead of restating it.
|
|
19
|
+
- Do not invent missing declarations, author notes, figures, tables, or references. Report the missing parsed field or empty list.
|
|
20
|
+
|
|
21
|
+
## Task Router
|
|
22
|
+
|
|
23
|
+
Choose the smallest route that answers the request. Prefer a direct CLI route when one command output is enough; use a workflow route when the task needs multi-step retrieval, synthesis, or evidence reporting. If using a direct CLI route, make the shell command do the final formatting so the assistant response can stay minimal.
|
|
24
|
+
|
|
25
|
+
### Direct CLI Routes
|
|
26
|
+
|
|
27
|
+
- PMCID availability and version resolution: read [references/cli-versions.md](references/cli-versions.md) for `versions` command details.
|
|
28
|
+
- PMID/DOI to PMCID conversion for continuing PMC workflows: read [references/cli-idconv.md](references/cli-idconv.md) for `idconv` command details.
|
|
29
|
+
- DOI, title, journal, license, OA flags, retraction flags, and S3 URL fields: read [references/cli-metadata.md](references/cli-metadata.md) for `metadata` command details.
|
|
30
|
+
- File inventory or downloads for XML, PDF, text, figures, media, or supplements: read [references/cli-files.md](references/cli-files.md) for `files` and `fetch` command details.
|
|
31
|
+
- Parsed article JSON, body sections, supporting info, parsed authors, figures, tables, references, or helper `jq` usage: run `fetch --ext xml` as needed, then `parse`. Read [references/cli-parse.md](references/cli-parse.md) for `parse` output shape and helper-script usage.
|
|
32
|
+
- If `parse` is needed but the right parsed field is not obvious, read [references/data-locator.md](references/data-locator.md) before retrieving detailed evidence.
|
|
33
|
+
- When a task asks about a referenced/cited article and the parsed reference has `identifiers.pmid` or `identifiers.doi` but no `identifiers.pmcid`, use `idconv` before stopping.
|
|
34
|
+
|
|
35
|
+
### Workflow Routes
|
|
36
|
+
|
|
37
|
+
- Article-content questions, passage finding, section analysis, support for claims, declarations, supplements, or evidence-grounded answers: read [references/workflow-evidence-extraction.md](references/workflow-evidence-extraction.md).
|
|
38
|
+
- Author, affiliation, ORCID, equal-contribution, corresponding-author, contributor, or author-note tasks: read [references/workflow-author-contributor-analysis.md](references/workflow-author-contributor-analysis.md).
|
|
39
|
+
- Knowledge extraction, claim extraction, evidence matrices, mechanism summaries, or structured fact extraction: read [references/workflow-knowledge-extraction.md](references/workflow-knowledge-extraction.md).
|
|
40
|
+
- Figure interpretation, graphics lookup, panel questions, or visual inspection: read [references/workflow-figure-image-analysis.md](references/workflow-figure-image-analysis.md). Fetch image files only when visual inspection is required.
|
|
41
|
+
- Report-style summaries, author reports, evidence reports, or deliverables combining several data types: read [references/workflow-reporting.md](references/workflow-reporting.md), then load only the source-specific workflow references required by the report.
|
|
42
|
+
|
|
43
|
+
## Bundled Resources
|
|
44
|
+
|
|
45
|
+
Open references only after choosing a route above, when command-specific details, output shapes, or workflow details are needed.
|
|
46
|
+
|
|
47
|
+
- [references/data-locator.md](references/data-locator.md) - task-to-parsed-JSON-field routing.
|
|
48
|
+
- [references/workflow-evidence-extraction.md](references/workflow-evidence-extraction.md) - detailed evidence retrieval loop and answer contract.
|
|
49
|
+
- [references/workflow-author-contributor-analysis.md](references/workflow-author-contributor-analysis.md) - author, affiliation, correspondence, and contributor-note workflow.
|
|
50
|
+
- [references/workflow-knowledge-extraction.md](references/workflow-knowledge-extraction.md) - generic structured extraction workflow.
|
|
51
|
+
- [references/workflow-figure-image-analysis.md](references/workflow-figure-image-analysis.md) - figure caption, linked text, and visual-inspection workflow.
|
|
52
|
+
- [references/workflow-reporting.md](references/workflow-reporting.md) - report assembly pattern for mixed data tasks.
|
|
53
|
+
- [references/cli-versions.md](references/cli-versions.md) - `versions` examples and version selection.
|
|
54
|
+
- [references/cli-idconv.md](references/cli-idconv.md) - `idconv` examples and missing-PMC handling.
|
|
55
|
+
- [references/cli-metadata.md](references/cli-metadata.md) - `metadata` examples and field overview.
|
|
56
|
+
- [references/cli-files.md](references/cli-files.md) - `files` and `fetch` output shapes.
|
|
57
|
+
- [references/cli-parse.md](references/cli-parse.md) - `parse` output shape and helper-script usage.
|
|
58
|
+
- [references/cli-parse-figures.md](references/cli-parse-figures.md) - figure lookup shape and citation context.
|
|
59
|
+
- [references/cli-parse-tables.md](references/cli-parse-tables.md) - table lookup shape and citation context.
|
|
60
|
+
- [references/cli-parse-references.md](references/cli-parse-references.md) - reference lookup shape and citation context.
|
|
61
|
+
- `<SKILL_DIR>/scripts/content-outline.jq` - paper outline first step for evidence extraction.
|
|
62
|
+
- `<SKILL_DIR>/scripts/query-id.jq` - lookup sections, paragraphs, figures, tables, and references by `source_id`.
|
|
63
|
+
- `<SKILL_DIR>/scripts/reverse-lookup-xref.jq` - find paragraphs that cite a figure, table, or reference.
|
|
64
|
+
|
|
65
|
+
## Gotchas
|
|
66
|
+
|
|
67
|
+
- `files` has no extension filter. Use `fetch --ext` for filtered downloads.
|
|
68
|
+
- Parsed reference records often omit PMCID even when they include PMID or DOI. Use `idconv` to test whether PMC has a matching article before saying PMC full text is unavailable.
|
|
69
|
+
- `parse` needs cached XML; run `fetch <PMCID.N> --ext xml` first when XML is absent.
|
|
70
|
+
- `fetch` and `parse` use the default PMC Toolkit cache unless `--cache-dir` or `PMC_TOOLKIT_CACHE` is provided. Use custom cache paths only when there is a concrete reason.
|
|
71
|
+
- Cache paths are per article version. Keep the same cache root across `fetch` and `parse` if a custom cache is used.
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
interface:
|
|
2
|
+
display_name: "PMC Toolkit"
|
|
3
|
+
short_description: "PMC OA evidence extraction"
|
|
4
|
+
default_prompt: "Use $pmc-toolkit to retrieve PMC paper metadata, files, parsed evidence, authors, figures, tables, or report-ready findings for this PMCID. Convert PMID/DOI to PMCID only when needed to continue PMC full-text work."
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# CLI: `files` And `fetch`
|
|
2
|
+
|
|
3
|
+
Use `files <PMCID.N>` to list every S3 object key under the article version prefix. Use `fetch <PMCID.N>` to download every object, or only the objects with selected extensions, into the local cache.
|
|
4
|
+
|
|
5
|
+
## `files`
|
|
6
|
+
|
|
7
|
+
`files` has no extension filter.
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
uvx pmc-toolkit files PMCxxxx.N
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
Example output:
|
|
14
|
+
|
|
15
|
+
```json
|
|
16
|
+
{
|
|
17
|
+
"versioned_pmcid": "PMCxxxx.N",
|
|
18
|
+
"keys": [
|
|
19
|
+
"PMCxxxx.N/PMCxxxx.N.xml",
|
|
20
|
+
"PMCxxxx.N/PMCxxxx.N.pdf",
|
|
21
|
+
"PMCxxxx.N/media-1.jpg"
|
|
22
|
+
]
|
|
23
|
+
}
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
Use `files` for inventory, not for local paths.
|
|
27
|
+
|
|
28
|
+
## `fetch`
|
|
29
|
+
|
|
30
|
+
Use `fetch` when a file must exist locally for parsing, inspection, or user delivery.
|
|
31
|
+
|
|
32
|
+
Example for downloading all of the files listed in the `files` output above:
|
|
33
|
+
```bash
|
|
34
|
+
uvx pmc-toolkit fetch PMCxxxx.N --ext xml,pdf,jpg
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Example output:
|
|
38
|
+
|
|
39
|
+
```json
|
|
40
|
+
{
|
|
41
|
+
"versioned_pmcid": "PMCxxxx.N",
|
|
42
|
+
"cache_dir": "/cache/root/PMCxxxx.N",
|
|
43
|
+
"files": [
|
|
44
|
+
{
|
|
45
|
+
"key": "PMCxxxx.N/PMCxxxx.N.xml",
|
|
46
|
+
"local_path": "/cache/root/PMCxxxx.N/PMCxxxx.N.xml",
|
|
47
|
+
"action": "downloaded"
|
|
48
|
+
}, ...
|
|
49
|
+
]
|
|
50
|
+
}
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Use `local_path` if you need access to the downloaded files.
|
|
54
|
+
|
|
55
|
+
## Cache Notes
|
|
56
|
+
|
|
57
|
+
- `metadata` and `files` use the default OS user cache for metadata/manifests.
|
|
58
|
+
- `fetch` and `parse` can use `--cache-dir` or `PMC_TOOLKIT_CACHE`; keep the same cache root across both commands.
|
|
59
|
+
- Cache paths are per article version under `<cache_root>/<PMCID.N>/`.
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# CLI: `idconv`
|
|
2
|
+
|
|
3
|
+
Use `idconv <ID...>` to convert PMID, DOI, PMCID, or MID values to PMC identifiers through the PMC ID Converter API. Use this only as a bridge back into PMC full-text workflows, for example when a parsed reference has `identifiers.pmid` or `identifiers.doi` but no `identifiers.pmcid`.
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
uvx pmc-toolkit idconv 23193287
|
|
7
|
+
uvx pmc-toolkit idconv 10.1093/nar/gks1195 --idtype doi
|
|
8
|
+
```
|
|
9
|
+
|
|
10
|
+
Example output shape:
|
|
11
|
+
|
|
12
|
+
```json
|
|
13
|
+
[
|
|
14
|
+
{
|
|
15
|
+
"requested-id": "23193287",
|
|
16
|
+
"pmid": 23193287,
|
|
17
|
+
"pmcid": "PMC3531190",
|
|
18
|
+
"doi": "10.1093/nar/gks1195"
|
|
19
|
+
}
|
|
20
|
+
]
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
When a record has `status: "error"` or no `pmcid`, stop the PMC full-text workflow for that referenced article and report that no matching PMC record was found. Do not summarize from the title alone.
|
|
24
|
+
|
|
25
|
+
After a record returns `pmcid`, run `versions <PMCID>` and continue with `metadata`, `fetch`, and `parse`.
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# CLI: `metadata`
|
|
2
|
+
|
|
3
|
+
Use `metadata <PMCID.N>` to fetch bibliographic fields, Open Access flags, and S3 URL fields (for example `xml_url`, `pdf_url`, `media_urls`, `text_url`), plus `pmid` and `doi`.
|
|
4
|
+
|
|
5
|
+
Example:
|
|
6
|
+
```bash
|
|
7
|
+
uvx pmc-toolkit metadata PMCxxxx.N
|
|
8
|
+
```
|
|
9
|
+
|
|
10
|
+
Example output:
|
|
11
|
+
|
|
12
|
+
```json
|
|
13
|
+
{
|
|
14
|
+
"pmcid": "PMCxxxx",
|
|
15
|
+
"version": N,
|
|
16
|
+
"pmid": 12345678,
|
|
17
|
+
"doi": "10.1234/example.doi",
|
|
18
|
+
"mid": null,
|
|
19
|
+
"title": "Example article title",
|
|
20
|
+
"citation": "Journal Name",
|
|
21
|
+
"is_pmc_openaccess": true/false,
|
|
22
|
+
"is_manuscript": true/false,
|
|
23
|
+
"is_historical_ocr": true/false,
|
|
24
|
+
"is_retracted": true/false,
|
|
25
|
+
"license_code": "license code",
|
|
26
|
+
"xml_url": "s3://pmc-oa-opendata/PMCxxxx.N/PMCxxxx.N.xml?md5=<hex>",
|
|
27
|
+
"pdf_url": "s3://pmc-oa-opendata/PMCxxxx.N/PMCxxxx.N.pdf?md5=<hex>",
|
|
28
|
+
"media_urls": [
|
|
29
|
+
"s3://pmc-oa-opendata/PMCxxxx.N/media-1.jpg?md5=<hex>",
|
|
30
|
+
...
|
|
31
|
+
],
|
|
32
|
+
"text_url": "s3://pmc-oa-opendata/PMCxxxx.N/PMCxxxx.N.txt?md5=<hex>"
|
|
33
|
+
}
|
|
34
|
+
```
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
# CLI: `parse`
|
|
2
|
+
|
|
3
|
+
Use `parse <PMCID.N>` after cached full-text XML exists. Run `fetch <PMCID.N> --ext xml` first when `<cache>/<PMCID.N>/<PMCID.N>.xml` is missing. The first run parses XML once, writes `<cache-root>/<PMCID.N>/.pmc-extracted-article.json`, and prints the extracted JSON; later runs reuse that cache unless `--force` is passed.
|
|
4
|
+
|
|
5
|
+
Add `--cache-dir` or `PMC_TOOLKIT_CACHE` when the XML was fetched outside the default OS user cache. Keep the same cache root across `fetch` and `parse`.
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
uvx pmc-toolkit fetch PMCxxxx.N --ext xml
|
|
9
|
+
uvx pmc-toolkit parse PMCxxxx.N
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
The `parse` command prints the extracted article JSON (`result.data`), not the `fetch` wrapper with `versioned_pmcid`, `cache_dir`, and downloaded `files`.
|
|
13
|
+
|
|
14
|
+
## Extracted JSON top-level keys
|
|
15
|
+
|
|
16
|
+
- **article_info** - `journal`, `article_ids`, `title`, `publication_date`, `article_type`, `license`, `keywords`, `authors[]`, `abstract`, `funding_grants[]`
|
|
17
|
+
- **content** - `paragraphs[]` and `sections[]`; items include `source_id`, `section_id`, `title`, `text`, `reference_ids`, `figure_ids`, and `table_ids`
|
|
18
|
+
- **references** - `references[]` with `source_id`, `label`, `text`, `publication_type`, `identifiers`, `article_title`, `source`, `year`, `volume`, `issue`, and `pages`
|
|
19
|
+
- **figures** - `figures[]` with `source_id`, `label`, `caption`, and `graphics`
|
|
20
|
+
- **tables** - `tables[]` with `source_id`, `label`, `caption`, `rows`, and `footnotes`
|
|
21
|
+
- **supporting_info** - `acknowledgements`, `competing_interests`, `data_availability`, `supplementary_media`, `author_notes`, `related_articles`, and `custom_metadata`
|
|
22
|
+
|
|
23
|
+
## Narrow retrieval with jq
|
|
24
|
+
|
|
25
|
+
**Start here.** Full `parse` output is large. Pipe it through jq and load only the slice you need.
|
|
26
|
+
|
|
27
|
+
### Content outline (default first step)
|
|
28
|
+
|
|
29
|
+
`scripts/content-outline.jq` returns a nested section tree: article title plus `section_id` and `title` for each section.
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
uvx pmc-toolkit parse PMCxxxx.N | jq -c -f <SKILL_DIR>/scripts/content-outline.jq
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
Example output:
|
|
36
|
+
|
|
37
|
+
```json
|
|
38
|
+
{
|
|
39
|
+
"title": "article title",
|
|
40
|
+
"sections": [
|
|
41
|
+
{
|
|
42
|
+
"section_id": "S1",
|
|
43
|
+
"title": "section title"
|
|
44
|
+
},
|
|
45
|
+
{
|
|
46
|
+
"section_id": "S2",
|
|
47
|
+
"title": "section title",
|
|
48
|
+
"sections": [
|
|
49
|
+
{
|
|
50
|
+
"section_id": "S3",
|
|
51
|
+
"title": "sub-section title"
|
|
52
|
+
}
|
|
53
|
+
]
|
|
54
|
+
}
|
|
55
|
+
]
|
|
56
|
+
}
|
|
57
|
+
```
|
|
58
|
+
Use this to pick relevant sections (based on their titles) before loading detailed information.
|
|
59
|
+
The `section_id` values in the outline are XML source IDs (`S1`, `S2`, ...). Use them with `<SKILL_DIR>/scripts/query-id.jq` to fetch detailed section data.
|
|
60
|
+
|
|
61
|
+
### Drill down by ID
|
|
62
|
+
|
|
63
|
+
`scripts/query-id.jq` returns the first object whose `source_id` matches. After the content outline, pass a chosen ID:
|
|
64
|
+
|
|
65
|
+
**Section** - paragraph text and xref links for that section:
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
uvx pmc-toolkit parse PMCxxxx.N | jq -c --arg id "S3" -f <SKILL_DIR>/scripts/query-id.jq
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Example output:
|
|
72
|
+
|
|
73
|
+
```json
|
|
74
|
+
{
|
|
75
|
+
"source_id": "S3",
|
|
76
|
+
"section_id": "2.1",
|
|
77
|
+
"title": "sub-section title",
|
|
78
|
+
"paragraphs": [
|
|
79
|
+
{
|
|
80
|
+
"source_id": "P9",
|
|
81
|
+
"text": "paragraph text",
|
|
82
|
+
"reference_ids": ["R1", "R18"],
|
|
83
|
+
"figure_ids": ["F1", "F5"],
|
|
84
|
+
"table_ids": ["T1"]
|
|
85
|
+
}
|
|
86
|
+
],
|
|
87
|
+
"sections": []
|
|
88
|
+
}
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
Some sections are containers only. In the outline, `S2` (Results) has child sections but no paragraphs of its own - the text lives in `S3`, `S4`, etc.
|
|
92
|
+
Query those leaf `S*` IDs (sections with no nested `sections` in the outline), not the parent, to load only the subsection you need.
|
|
93
|
+
|
|
94
|
+
**Figure, table, or reference** - same script, different ID prefix:
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
uvx pmc-toolkit parse PMCxxxx.N | jq -c --arg id "F1" -f <SKILL_DIR>/scripts/query-id.jq
|
|
98
|
+
uvx pmc-toolkit parse PMCxxxx.N | jq -c --arg id "R1" -f <SKILL_DIR>/scripts/query-id.jq
|
|
99
|
+
uvx pmc-toolkit parse PMCxxxx.N | jq -c --arg id "T1" -f <SKILL_DIR>/scripts/query-id.jq
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
Use paragraph `reference_ids`, `figure_ids`, and `table_ids` to fetch linked entries with `scripts/query-id.jq`. Output shapes:
|
|
103
|
+
|
|
104
|
+
- [cli-parse-references.md](cli-parse-references.md) - `R*` lookup
|
|
105
|
+
- [cli-parse-figures.md](cli-parse-figures.md) - `F*` lookup
|
|
106
|
+
- [cli-parse-tables.md](cli-parse-tables.md) - `T*` lookup
|
|
107
|
+
|
|
108
|
+
### Reverse lookup by xref
|
|
109
|
+
|
|
110
|
+
`query-id.jq` resolves an ID to its object. `reverse-lookup-xref.jq` finds every paragraph that cites a given reference, figure, or table. Pass `--arg xref` as `references`, `figures`, or `tables`:
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
uvx pmc-toolkit parse PMCxxxx.N | jq -c --arg xref references --arg id "R1" -f <SKILL_DIR>/scripts/reverse-lookup-xref.jq
|
|
114
|
+
uvx pmc-toolkit parse PMCxxxx.N | jq -c --arg xref figures --arg id "F1" -f <SKILL_DIR>/scripts/reverse-lookup-xref.jq
|
|
115
|
+
uvx pmc-toolkit parse PMCxxxx.N | jq -c --arg xref tables --arg id "T1" -f <SKILL_DIR>/scripts/reverse-lookup-xref.jq
|
|
116
|
+
```
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# CLI: `versions`
|
|
2
|
+
|
|
3
|
+
Use `versions <PMCID>` to list every published versioned PMCID string (`PMCxxxx.1`, `PMCxxxx.2`, ...) for a **base** PMCID only. `versions` rejects versioned IDs.
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
uvx pmc-toolkit versions PMCxxxx
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
Example output shape:
|
|
10
|
+
|
|
11
|
+
```json
|
|
12
|
+
{
|
|
13
|
+
"pmcid": "PMCxxxx",
|
|
14
|
+
"versions": [
|
|
15
|
+
"PMCxxxx.1",
|
|
16
|
+
"PMCxxxx.2"
|
|
17
|
+
]
|
|
18
|
+
}
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
If `.versions` is empty, stop for that PMCID and report that no PMC Open Access version was found. Do not continue to `metadata`, `files`, `fetch`, or `parse` for that PMCID.
|
|
22
|
+
|
|
23
|
+
## Pick the latest `<PMCID.N>`
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
uvx pmc-toolkit versions PMCxxxx | jq -c -r '.versions[-1]'
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Pick a non-latest version
|
|
30
|
+
|
|
31
|
+
Select an element of `.versions` by index (for example `.versions[0]` for the first published version).
|
|
32
|
+
|
|
33
|
+
## Next steps
|
|
34
|
+
|
|
35
|
+
After you have `<PMCID.N>`, continue with `metadata`, `files`, `fetch`, and `parse` as described in the main skill.
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# Parsed Data Locator
|
|
2
|
+
|
|
3
|
+
Use this file after choosing `parse` when the right parsed JSON field is not obvious. It maps task intent to the lowest-cost parsed field or helper command. For command selection before parsing, use the router in `SKILL.md` and the CLI references.
|
|
4
|
+
|
|
5
|
+
## Parsed JSON Routing
|
|
6
|
+
|
|
7
|
+
Use `uvx pmc-toolkit parse <PMCID.N> | jq -c '<filter>'` for compact retrieval.
|
|
8
|
+
|
|
9
|
+
| Task | First parsed source | Notes |
|
|
10
|
+
| --- | --- | --- |
|
|
11
|
+
| Article identity from XML | `.article_info` | Use when XML-derived identity is needed. For DOI, title, journal, license, OA flags, and S3 URLs alone, prefer `metadata`. |
|
|
12
|
+
| Abstract | `.article_info.abstract` | Use before body sections for high-level study orientation. |
|
|
13
|
+
| Body section discovery | `<SKILL_DIR>/scripts/content-outline.jq` | Always inspect outline before loading body text. |
|
|
14
|
+
| Body passages | `query-id.jq` on selected `content.sections[].source_id` | Prefer leaf sections when parent sections only group subsections. |
|
|
15
|
+
| Standalone body paragraphs | `.content.paragraphs[]?` | Some XML has top-level body paragraphs outside sections. |
|
|
16
|
+
| Authors and affiliations | `.article_info.authors[]` | Authors include resolved affiliation text when available. |
|
|
17
|
+
| ORCID | `.article_info.authors[].orcid` | Report absent ORCIDs as absent, not unknown. |
|
|
18
|
+
| Equal contribution, author notes, correspondence | `.supporting_info.author_notes` | Use with `.article_info.authors[]`; do not infer equal contribution from author order alone. |
|
|
19
|
+
| Funding | `.article_info.funding_grants[]`, then `.supporting_info` | Some articles encode funding in article metadata, some in acknowledgements. |
|
|
20
|
+
| Acknowledgements | `.supporting_info.acknowledgements[]` | Cite paragraph `source_id` when available. |
|
|
21
|
+
| Competing interests | `.supporting_info.competing_interests[]` | Preserve exact statement and report absence if empty. |
|
|
22
|
+
| Data availability | `.supporting_info.data_availability[]` | Preserve accessions, repository names, URLs, and restrictions. |
|
|
23
|
+
| Supplementary media | `.supporting_info.supplementary_media[]` | Use `files` only when local download or object-key inventory is needed. |
|
|
24
|
+
| Related articles | `.supporting_info.related_articles[]` | Useful for linking a preprint to its published article. |
|
|
25
|
+
| Custom PMC/JATS metadata | `.supporting_info.custom_metadata` | Use for PMC properties that are not normal article fields. |
|
|
26
|
+
| Figures | `.figures[]` | Start with label, caption, graphics. Use linked paragraphs before fetching images unless visual inspection is requested. |
|
|
27
|
+
| Tables | `.tables[]` | Contains XML-extracted rows and footnotes only. Do not assume PDF-only tables are available. |
|
|
28
|
+
| References | `.references[]` | Use identifiers and labels. For context, run a reverse lookup to find the paragraphs that cite a reference. |
|
|
29
|
+
| Figure, table, or reference citation context | `reverse-lookup-xref.jq` | Pass `--arg xref` as `figures`, `tables`, or `references`, plus `--arg id` with the ID of the cited object. |
|
|
30
|
+
|
|
31
|
+
## Retrieval Shortcuts
|
|
32
|
+
|
|
33
|
+
- Author summary:
|
|
34
|
+
`uvx pmc-toolkit parse <PMCID.N> | jq -c '{title: .article_info.title, authors: .article_info.authors, author_notes: .supporting_info.author_notes}'`
|
|
35
|
+
- Declarations:
|
|
36
|
+
`uvx pmc-toolkit parse <PMCID.N> | jq -c '.supporting_info | {acknowledgements, competing_interests, data_availability, author_notes}'`
|
|
37
|
+
- Figure inventory:
|
|
38
|
+
`uvx pmc-toolkit parse <PMCID.N> | jq -c '.figures[] | {source_id, label, caption, graphics}'`
|
|
39
|
+
- Table inventory:
|
|
40
|
+
`uvx pmc-toolkit parse <PMCID.N> | jq -c '.tables[] | {source_id, label, caption, rows, footnotes}'`
|
|
41
|
+
- Reference inventory:
|
|
42
|
+
`uvx pmc-toolkit parse <PMCID.N> | jq -c '.references[] | {source_id, label, article_title, source, year, identifiers}'`
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# Workflow: Author And Contributor Analysis
|
|
2
|
+
|
|
3
|
+
Use this workflow for author lists, affiliations, ORCIDs, corresponding authors, equal-contribution notes, contributor notes, author-focused reports, and author-related declarations.
|
|
4
|
+
|
|
5
|
+
## Source Priority
|
|
6
|
+
|
|
7
|
+
1. Use `metadata` for PMCID version, title, DOI, citation, OA flags, and retraction status.
|
|
8
|
+
2. Use `parse` for author names, resolved affiliations, ORCIDs, and author notes.
|
|
9
|
+
3. Use `supporting_info.author_notes` for equal contribution, correspondence, and other notes.
|
|
10
|
+
4. Use `supporting_info.acknowledgements`, `competing_interests`, and `data_availability` only when the author task asks for declarations or report context.
|
|
11
|
+
|
|
12
|
+
## Retrieval
|
|
13
|
+
|
|
14
|
+
With the version already resolved to `<PMCID.N>`, collect metadata, fetch the XML, and parse the author fields:
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
uvx pmc-toolkit metadata <PMCID.N>
|
|
18
|
+
uvx pmc-toolkit fetch <PMCID.N> --ext xml
|
|
19
|
+
uvx pmc-toolkit parse <PMCID.N> | jq -c '{title: .article_info.title, authors: .article_info.authors, author_notes: .supporting_info.author_notes}'
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
If declarations or report context are requested:
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
uvx pmc-toolkit parse <PMCID.N> | jq -c '.supporting_info | {acknowledgements, competing_interests, data_availability, author_notes}'
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Interpretation Rules
|
|
29
|
+
|
|
30
|
+
- Preserve author order from `.article_info.authors[]`.
|
|
31
|
+
- Treat missing `orcid` fields as absent ORCIDs. Do not infer ORCIDs.
|
|
32
|
+
- Treat missing affiliation text as absent affiliation data. Do not invent institutional names.
|
|
33
|
+
- Identify equal contribution only from `author_notes`. Do not infer it from author order, and do not infer it from symbols unless an author note explains the symbol.
|
|
34
|
+
- Identify corresponding authors only from correspondence entries or explicit author notes.
|
|
35
|
+
- If author notes mention symbols but the parsed author list does not connect symbols to names, report the limitation instead of forcing a mapping.
|
|
36
|
+
|
|
37
|
+
## Output Patterns
|
|
38
|
+
|
|
39
|
+
For a compact author answer, include:
|
|
40
|
+
|
|
41
|
+
- Selected `<PMCID.N>`.
|
|
42
|
+
- Article title and DOI when available.
|
|
43
|
+
- Ordered author list.
|
|
44
|
+
- Affiliation and ORCID fields when requested or relevant.
|
|
45
|
+
- Author-note evidence with `supporting_info.author_notes` and item `source_id` when available.
|
|
46
|
+
- Clear gaps for missing ORCIDs, affiliations, equal-contribution notes, or correspondence.
|
|
47
|
+
|
|
48
|
+
For an author report, use [workflow-reporting.md](workflow-reporting.md) and include an evidence table for author-note claims.
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# Workflow: Evidence Extraction
|
|
2
|
+
|
|
3
|
+
Use this workflow for article-content questions, evidence-grounded answers, section analysis, declarations, supplements, and claims that must be tied to PMC article evidence.
|
|
4
|
+
|
|
5
|
+
## Retrieval Loop
|
|
6
|
+
|
|
7
|
+
1. Resolve the PMCID to a pinned `<PMCID.N>`.
|
|
8
|
+
2. Fetch XML if needed:
|
|
9
|
+
`uvx pmc-toolkit fetch <PMCID.N> --ext xml`
|
|
10
|
+
3. Inspect the outline first:
|
|
11
|
+
`uvx pmc-toolkit parse <PMCID.N> | jq -c -f <SKILL_DIR>/scripts/content-outline.jq`
|
|
12
|
+
4. Use [data-locator.md](data-locator.md) to decide whether the answer lives in `article_info`, `content`, `supporting_info`, `figures`, `tables`, or `references`.
|
|
13
|
+
5. State the next retrieval plan before loading detailed evidence when the task needs multiple evidence targets.
|
|
14
|
+
6. Retrieve narrow evidence:
|
|
15
|
+
`uvx pmc-toolkit parse <PMCID.N> | jq -c --arg id "<SOURCE_ID>" -f <SKILL_DIR>/scripts/query-id.jq`
|
|
16
|
+
7. For linked support, retrieve the cited objects or run a reverse lookup to get their citation context:
|
|
17
|
+
`uvx pmc-toolkit parse <PMCID.N> | jq -c --arg xref references --arg id "<REFERENCE_ID>" -f <SKILL_DIR>/scripts/reverse-lookup-xref.jq`
|
|
18
|
+
8. If the user asks about the full text, abstract, authors, figures, tables, or evidence inside a referenced/cited article, inspect that reference's `identifiers`. If it has `pmcid`, continue with that PMCID. If it has `pmid` or `doi` but no `pmcid`, read [cli-idconv.md](cli-idconv.md) and run `idconv` before stopping.
|
|
19
|
+
9. Decide after each retrieval whether the evidence is sufficient. If not, choose the next source and repeat.
|
|
20
|
+
10. Stop when the answer is sufficiently supported or when the parsed JSON lacks the needed field. Report gaps explicitly.
|
|
21
|
+
|
|
22
|
+
## Evidence Selection
|
|
23
|
+
|
|
24
|
+
- Use article title, abstract, and outline to orient.
|
|
25
|
+
- Prefer sections whose titles match the task. Query leaf sections rather than broad parent sections when possible.
|
|
26
|
+
- For claims about results, methods, or discussion, use body paragraphs, not only the abstract.
|
|
27
|
+
- For declarations, use `supporting_info` first.
|
|
28
|
+
- For figure, table, or reference claims, inspect the object and linked paragraph context.
|
|
29
|
+
- For author/contributor claims, use the author workflow.
|
|
30
|
+
|
|
31
|
+
## Answer Requirements
|
|
32
|
+
|
|
33
|
+
Include in the final answer:
|
|
34
|
+
|
|
35
|
+
- Base PMCID and selected `<PMCID.N>`.
|
|
36
|
+
- Each claim with a human-readable locator.
|
|
37
|
+
- Stable `source_id` when useful for traceability.
|
|
38
|
+
- Short evidence summary or short quote from retrieved parsed JSON.
|
|
39
|
+
- Any gap, conflict, or mismatch.
|
|
40
|
+
|
|
41
|
+
Use these locators:
|
|
42
|
+
|
|
43
|
+
- Body text: `section_id` and section title, plus paragraph `source_id` when needed.
|
|
44
|
+
- Figures: figure `label`.
|
|
45
|
+
- Tables: table `label`.
|
|
46
|
+
- References: reference `label`.
|
|
47
|
+
- Supporting info: supporting-info category plus item `source_id` when available.
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# Workflow: Figure And Image Analysis
|
|
2
|
+
|
|
3
|
+
Use this workflow for figure captions, figure-linked claims, panel interpretation, graphics files, and visual inspection requests.
|
|
4
|
+
|
|
5
|
+
## Caption And Text Evidence
|
|
6
|
+
|
|
7
|
+
1. Resolve to `<PMCID.N>`.
|
|
8
|
+
2. Fetch and parse XML.
|
|
9
|
+
3. List figures:
|
|
10
|
+
`uvx pmc-toolkit parse <PMCID.N> | jq -c '.figures[] | {source_id, label, caption, graphics}'`
|
|
11
|
+
4. Retrieve the selected figure by ID:
|
|
12
|
+
`uvx pmc-toolkit parse <PMCID.N> | jq -c --arg id "<FIGURE_ID>" -f <SKILL_DIR>/scripts/query-id.jq`
|
|
13
|
+
5. Retrieve paragraphs that cite the figure:
|
|
14
|
+
`uvx pmc-toolkit parse <PMCID.N> | jq -c --arg xref figures --arg id "<FIGURE_ID>" -f <SKILL_DIR>/scripts/reverse-lookup-xref.jq`
|
|
15
|
+
|
|
16
|
+
Use caption plus linked paragraphs for text-grounded figure answers.
|
|
17
|
+
|
|
18
|
+
## Visual Inspection
|
|
19
|
+
|
|
20
|
+
Fetch image files only when the user asks about the visual itself, a panel, an image feature, or when caption/text evidence is insufficient.
|
|
21
|
+
|
|
22
|
+
1. Run `files <PMCID.N>` to inspect available image/media object keys.
|
|
23
|
+
2. Match figure `graphics[]` values to object-key suffixes when possible.
|
|
24
|
+
3. Fetch only likely image extensions:
|
|
25
|
+
`uvx pmc-toolkit fetch <PMCID.N> --ext jpg,png,tif,tiff,gif`
|
|
26
|
+
4. Use the returned `local_path` for visual inspection with the available image-viewing tool.
|
|
27
|
+
|
|
28
|
+
## Output Rules
|
|
29
|
+
|
|
30
|
+
- Cite figure `label` and selected `<PMCID.N>`.
|
|
31
|
+
- Include caption evidence and linked paragraph evidence when used.
|
|
32
|
+
- Distinguish what is visible in the image from what the caption or article text states.
|
|
33
|
+
- If the graphics file cannot be matched or fetched, answer from caption/text evidence and report the visual gap.
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# Workflow: Knowledge Extraction
|
|
2
|
+
|
|
3
|
+
Use this workflow for broad but structured tasks such as extracting key findings, mechanisms, datasets, claims, measurements, limitations, interventions, outcomes, or article-specific knowledge graphs. Keep the workflow generic; specialize the output schema to the user's task.
|
|
4
|
+
|
|
5
|
+
## Process
|
|
6
|
+
|
|
7
|
+
1. Resolve to `<PMCID.N>`.
|
|
8
|
+
2. Fetch XML and inspect the outline.
|
|
9
|
+
3. Read the abstract only for orientation:
|
|
10
|
+
`uvx pmc-toolkit parse <PMCID.N> | jq -c '{title: .article_info.title, abstract: .article_info.abstract, keywords: .article_info.keywords}'`
|
|
11
|
+
4. Choose target sections from the outline. For most research extraction tasks, inspect methods, results, discussion, limitations, and any named domain sections.
|
|
12
|
+
5. Retrieve selected sections by `source_id` with `query-id.jq`.
|
|
13
|
+
6. Extract candidate knowledge records from retrieved evidence only.
|
|
14
|
+
7. If the task involves figures, tables, or references, retrieve those objects and their linked paragraph context.
|
|
15
|
+
8. Stop when the selected evidence covers the requested schema or when additional sections are unlikely to change the answer. Report uninspected sections when they are relevant but not loaded.
|
|
16
|
+
|
|
17
|
+
## Record Schema
|
|
18
|
+
|
|
19
|
+
Use or adapt this schema unless the user provides another:
|
|
20
|
+
|
|
21
|
+
- `item`: concise concept, claim, finding, method, variable, dataset, limitation, or outcome.
|
|
22
|
+
- `category`: user-relevant class such as method, result, mechanism, limitation, dataset, or evidence.
|
|
23
|
+
- `evidence_locator`: section title and `section_id`; figure/table/reference label if applicable.
|
|
24
|
+
- `source_id`: paragraph, section, figure, table, or reference ID.
|
|
25
|
+
- `evidence`: short quote or compact summary from retrieved parsed JSON.
|
|
26
|
+
- `confidence`: high, medium, or low based on specificity and directness of evidence.
|
|
27
|
+
- `gap`: missing context, ambiguity, or unsupported inference.
|
|
28
|
+
|
|
29
|
+
## Rules
|
|
30
|
+
|
|
31
|
+
- Separate article claims from your own synthesis.
|
|
32
|
+
- Do not use uninspected sections as evidence.
|
|
33
|
+
- Prefer direct result/method paragraphs over abstract-only evidence.
|
|
34
|
+
- Keep extraction records small enough to verify. If the task is large, produce a first-pass matrix and state what remains to inspect.
|
|
35
|
+
- Use [workflow-reporting.md](workflow-reporting.md) when the user asks for a polished report rather than raw records.
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# Workflow: Reporting
|
|
2
|
+
|
|
3
|
+
Use this workflow when the user asks for a report, memo, evidence brief, author report, structured summary, or deliverable that combines multiple PMC Toolkit data sources.
|
|
4
|
+
|
|
5
|
+
## Process
|
|
6
|
+
|
|
7
|
+
1. Infer the report scope from the request. Ask a question only when the deliverable cannot be scoped safely.
|
|
8
|
+
2. Resolve to `<PMCID.N>` and collect metadata for the report header.
|
|
9
|
+
3. Use the router in `SKILL.md` to choose command and workflow sources. Load [data-locator.md](data-locator.md) only when parsed JSON fields are not obvious.
|
|
10
|
+
4. Load only the source-specific workflow references required by the report.
|
|
11
|
+
5. Retrieve evidence in small slices. Keep a scratch list of every claim with its locator.
|
|
12
|
+
6. Assemble the report from retrieved evidence only.
|
|
13
|
+
7. Include a limitations or gaps section when data is absent, ambiguous, or not inspected.
|
|
14
|
+
|
|
15
|
+
## Suggested Sections
|
|
16
|
+
|
|
17
|
+
Use these sections when they fit the task:
|
|
18
|
+
|
|
19
|
+
- Article: PMCID, selected version, title, DOI, journal/citation, date, license/OA status.
|
|
20
|
+
- Scope: what the report covers.
|
|
21
|
+
- Findings: grouped by the user's task.
|
|
22
|
+
- Evidence Table: claim, locator, source ID, short evidence, gap.
|
|
23
|
+
- Files Or Artifacts: available XML/PDF/media/supplements when relevant.
|
|
24
|
+
- Gaps: absent parsed fields, unavailable XML/files, or uninspected sections.
|
|
25
|
+
|
|
26
|
+
## Rules
|
|
27
|
+
|
|
28
|
+
- Do not make report sections that hide evidence gaps.
|
|
29
|
+
- Do not over-fetch. A report can combine metadata, author notes, and a few body sections without loading the entire parsed article.
|
|
30
|
+
- Use concise quotations only when they add auditability.
|
|
31
|
+
- For author reports, use [workflow-author-contributor-analysis.md](workflow-author-contributor-analysis.md).
|
|
32
|
+
- For broad extraction reports, use [workflow-knowledge-extraction.md](workflow-knowledge-extraction.md).
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
def drop_empty($o):
|
|
2
|
+
$o | with_entries(select(.value | if type == "array" then length > 0 else . != null end));
|
|
3
|
+
|
|
4
|
+
def section:
|
|
5
|
+
drop_empty({
|
|
6
|
+
section_id: .source_id,
|
|
7
|
+
title: .title,
|
|
8
|
+
sections: [.sections[]? | section]
|
|
9
|
+
});
|
|
10
|
+
|
|
11
|
+
{
|
|
12
|
+
title: .article_info.title,
|
|
13
|
+
sections: [.content.sections[]? | section]
|
|
14
|
+
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
{
|
|
2
|
+
references: "reference_ids",
|
|
3
|
+
figures: "figure_ids",
|
|
4
|
+
tables: "table_ids"
|
|
5
|
+
}[$xref] as $field
|
|
6
|
+
| if $field == null then
|
|
7
|
+
error("xref must be references, figures, or tables")
|
|
8
|
+
else
|
|
9
|
+
[ .. | objects
|
|
10
|
+
| select(.[$field]? | index($id))
|
|
11
|
+
| {source_id, text, ($field): .[$field] }
|
|
12
|
+
]
|
|
13
|
+
end
|
|
@@ -71,6 +71,44 @@ def metadata(
|
|
|
71
71
|
_emit_json(result.model_dump(mode="json"))
|
|
72
72
|
|
|
73
73
|
|
|
74
|
+
@app.command("idconv")
|
|
75
|
+
def idconv(
|
|
76
|
+
identifiers: list[str] = typer.Argument(
|
|
77
|
+
...,
|
|
78
|
+
help=(
|
|
79
|
+
"PMID, DOI, PMCID, or MID values to convert to PMC identifiers. "
|
|
80
|
+
"Comma-separated values are accepted."
|
|
81
|
+
),
|
|
82
|
+
),
|
|
83
|
+
idtype: str | None = typer.Option(
|
|
84
|
+
None,
|
|
85
|
+
"--idtype",
|
|
86
|
+
help="Optional input identifier type: pmid, doi, pmcid, or mid.",
|
|
87
|
+
),
|
|
88
|
+
email: str | None = typer.Option(
|
|
89
|
+
None,
|
|
90
|
+
"--email",
|
|
91
|
+
envvar="NCBI_EMAIL",
|
|
92
|
+
help="Optional contact email sent to the NCBI ID Converter API.",
|
|
93
|
+
),
|
|
94
|
+
) -> None:
|
|
95
|
+
"""
|
|
96
|
+
Convert PMID/DOI identifiers to PMC identifiers when available in PMC.
|
|
97
|
+
"""
|
|
98
|
+
|
|
99
|
+
def build_result():
|
|
100
|
+
from pmc_toolkit.idconv_api import convert_to_pmcids
|
|
101
|
+
|
|
102
|
+
return convert_to_pmcids(
|
|
103
|
+
identifiers,
|
|
104
|
+
idtype=idtype,
|
|
105
|
+
email=email,
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
result = _run_command(build_result)
|
|
109
|
+
_emit_json(result["records"])
|
|
110
|
+
|
|
111
|
+
|
|
74
112
|
@app.command("files")
|
|
75
113
|
def files(
|
|
76
114
|
requested_pmcid: str = typer.Argument(
|
|
@@ -140,8 +178,8 @@ def fetch(
|
|
|
140
178
|
_emit_json(result.model_dump(mode="json"))
|
|
141
179
|
|
|
142
180
|
|
|
143
|
-
@app.command("
|
|
144
|
-
def
|
|
181
|
+
@app.command("parse")
|
|
182
|
+
def parse(
|
|
145
183
|
requested_pmcid: str = typer.Argument(
|
|
146
184
|
...,
|
|
147
185
|
help="PMC accession ID or version ID, e.g. PMC11370360 or PMC11370360.1",
|
|
@@ -158,22 +196,10 @@ def convert_xml(
|
|
|
158
196
|
"-f",
|
|
159
197
|
help="Recreate the extracted JSON cache from the cached XML.",
|
|
160
198
|
),
|
|
161
|
-
list_keys: bool = typer.Option(
|
|
162
|
-
False,
|
|
163
|
-
"--list-keys",
|
|
164
|
-
help="Print available extracted JSON keys and descriptions, then exit.",
|
|
165
|
-
),
|
|
166
199
|
) -> None:
|
|
167
200
|
"""
|
|
168
|
-
|
|
201
|
+
Parse cached PMC full-text XML into cached extracted JSON.
|
|
169
202
|
"""
|
|
170
|
-
if list_keys:
|
|
171
|
-
from pmc_toolkit.xml_parse_utils import EXTRACT_OUTPUT_KEY_DESCRIPTIONS
|
|
172
|
-
|
|
173
|
-
typer.echo("Available extracted JSON keys:")
|
|
174
|
-
for key, description in EXTRACT_OUTPUT_KEY_DESCRIPTIONS.items():
|
|
175
|
-
typer.echo(f"- {key}: {description}")
|
|
176
|
-
return
|
|
177
203
|
|
|
178
204
|
def build_result():
|
|
179
205
|
from pmc_toolkit.xml_parse_api import ensure_extracted_article
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""NCBI PMC ID Converter client."""
|
|
2
|
+
|
|
3
|
+
from collections.abc import Sequence
|
|
4
|
+
import json
|
|
5
|
+
from typing import Any
|
|
6
|
+
from urllib.error import HTTPError, URLError
|
|
7
|
+
from urllib.parse import urlencode
|
|
8
|
+
from urllib.request import Request, urlopen
|
|
9
|
+
|
|
10
|
+
ID_CONVERTER_URL = "https://pmc.ncbi.nlm.nih.gov/tools/idconv/api/v1/articles/"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def convert_to_pmcids(
|
|
14
|
+
identifiers: Sequence[str],
|
|
15
|
+
*,
|
|
16
|
+
idtype: str | None = None,
|
|
17
|
+
email: str | None = None,
|
|
18
|
+
timeout: float = 30.0,
|
|
19
|
+
) -> dict[str, Any]:
|
|
20
|
+
params = {
|
|
21
|
+
"tool": "pmc_toolkit",
|
|
22
|
+
"format": "json",
|
|
23
|
+
"ids": ",".join(identifiers),
|
|
24
|
+
}
|
|
25
|
+
if idtype:
|
|
26
|
+
params["idtype"] = idtype
|
|
27
|
+
if email:
|
|
28
|
+
params["email"] = email
|
|
29
|
+
|
|
30
|
+
url = f"{ID_CONVERTER_URL}?{urlencode(params)}"
|
|
31
|
+
request = Request(url, headers={"User-Agent": "pmc-toolkit"})
|
|
32
|
+
try:
|
|
33
|
+
with urlopen(request, timeout=timeout) as response:
|
|
34
|
+
payload = json.loads(response.read().decode("utf-8"))
|
|
35
|
+
except HTTPError as exc:
|
|
36
|
+
raise RuntimeError(
|
|
37
|
+
f"ID converter request failed with HTTP {exc.code}."
|
|
38
|
+
) from exc
|
|
39
|
+
except URLError as exc:
|
|
40
|
+
raise RuntimeError(f"ID converter request failed: {exc.reason}.") from exc
|
|
41
|
+
except json.JSONDecodeError as exc:
|
|
42
|
+
raise RuntimeError("ID converter returned invalid JSON.") from exc
|
|
43
|
+
|
|
44
|
+
if not isinstance(payload, dict):
|
|
45
|
+
raise RuntimeError("ID converter returned an unexpected response.")
|
|
46
|
+
return payload
|
|
@@ -105,11 +105,7 @@ def _ensure_extracted_article_cache(
|
|
|
105
105
|
from pmc_toolkit.xml_parse_utils import extract_article_data, load_xml
|
|
106
106
|
|
|
107
107
|
root = load_xml(paths.xml_path)
|
|
108
|
-
parsed = _group_extracted_article(
|
|
109
|
-
extract_article_data(root),
|
|
110
|
-
versioned_pmcid=paths.versioned_pmcid,
|
|
111
|
-
xml_path=paths.xml_path,
|
|
112
|
-
)
|
|
108
|
+
parsed = _group_extracted_article(extract_article_data(root))
|
|
113
109
|
storage_cache.write_cached_extracted_article(
|
|
114
110
|
paths.cache_root,
|
|
115
111
|
paths.versioned_pmcid,
|
|
@@ -120,15 +116,8 @@ def _ensure_extracted_article_cache(
|
|
|
120
116
|
|
|
121
117
|
def _group_extracted_article(
|
|
122
118
|
raw_data: dict[str, Any],
|
|
123
|
-
*,
|
|
124
|
-
versioned_pmcid: str,
|
|
125
|
-
xml_path: Path,
|
|
126
119
|
) -> dict[str, Any]:
|
|
127
120
|
return {
|
|
128
|
-
"_meta": {
|
|
129
|
-
"versioned_pmcid": versioned_pmcid,
|
|
130
|
-
"xml_path": str(xml_path),
|
|
131
|
-
},
|
|
132
121
|
"article_info": _article_info(raw_data),
|
|
133
122
|
"content": raw_data["content"],
|
|
134
123
|
"references": raw_data["references"],
|
|
@@ -13,26 +13,6 @@ XMLParser = etree.XMLParser(
|
|
|
13
13
|
remove_blank_text=True,
|
|
14
14
|
)
|
|
15
15
|
REFERENCE_SEPARATOR_PATTERN = re.compile(r"^[\s,;]+$")
|
|
16
|
-
EXTRACT_OUTPUT_KEY_DESCRIPTIONS = {
|
|
17
|
-
"article_info": (
|
|
18
|
-
"article_info.journal, article_ids, title, publication_date, article_type, "
|
|
19
|
-
"license, keywords, authors[], abstract, and funding_grants[]"
|
|
20
|
-
),
|
|
21
|
-
"content": (
|
|
22
|
-
"content.paragraphs[] and content.sections[]; objects include source_id, "
|
|
23
|
-
"section_id, title, text, reference_ids, figure_ids, and table_ids"
|
|
24
|
-
),
|
|
25
|
-
"references": (
|
|
26
|
-
"references[] items with source_id, label, text, publication_type, "
|
|
27
|
-
"identifiers, article_title, source, year, volume, issue, and pages"
|
|
28
|
-
),
|
|
29
|
-
"figures": "figures[] items with source_id, label, caption, and graphics",
|
|
30
|
-
"tables": "tables[] items with source_id, label, caption, rows, and footnotes",
|
|
31
|
-
"supporting_info": (
|
|
32
|
-
"acknowledgements, competing_interests, data_availability, "
|
|
33
|
-
"supplementary_media, author_notes, related_articles, and custom_metadata"
|
|
34
|
-
),
|
|
35
|
-
}
|
|
36
16
|
|
|
37
17
|
|
|
38
18
|
def load_xml(path: Path) -> Any:
|
|
@@ -73,7 +73,7 @@ def test_ensure_extracted_article_reads_xml_and_writes_extracted_cache(
|
|
|
73
73
|
|
|
74
74
|
assert result.versioned_pmcid == "PMC11370360.1"
|
|
75
75
|
assert result.xml_path == str(article_dir / "PMC11370360.1.xml")
|
|
76
|
-
assert
|
|
76
|
+
assert "_meta" not in result.data
|
|
77
77
|
assert result.data["article_info"]["journal"]["name"] == "bioRxiv"
|
|
78
78
|
assert result.data["article_info"]["journal"]["issn"] == "2692-8205"
|
|
79
79
|
assert result.data["article_info"]["article_ids"] == {
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|