pmc-toolkit 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pmc_toolkit/__init__.py +0 -0
- pmc_toolkit/cache.py +144 -0
- pmc_toolkit/cli.py +196 -0
- pmc_toolkit/models.py +56 -0
- pmc_toolkit/storage_api.py +108 -0
- pmc_toolkit/storage_utils.py +169 -0
- pmc_toolkit/validators.py +37 -0
- pmc_toolkit/xml_parse_api.py +232 -0
- pmc_toolkit/xml_parse_utils.py +856 -0
- pmc_toolkit-0.1.0.dist-info/METADATA +183 -0
- pmc_toolkit-0.1.0.dist-info/RECORD +14 -0
- pmc_toolkit-0.1.0.dist-info/WHEEL +4 -0
- pmc_toolkit-0.1.0.dist-info/entry_points.txt +2 -0
- pmc_toolkit-0.1.0.dist-info/licenses/LICENSE +21 -0
pmc_toolkit/__init__.py
ADDED
|
File without changes
|
pmc_toolkit/cache.py
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
"""Local filesystem cache helpers for PMC metadata, manifests, and downloads."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import TYPE_CHECKING, Any
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from pmc_toolkit.models import PMCMetadata
|
|
10
|
+
|
|
11
|
+
# File name, inside an article's cache directory, of the cached JSON list of S3 object keys.
OBJECT_KEYS_CACHE_FILENAME = ".pmc-object-keys.json"
|
|
12
|
+
# File name, inside an article's cache directory, of the cached extracted-article JSON object.
EXTRACTED_ARTICLE_CACHE_FILENAME = ".pmc-extracted-article.json"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def default_cache_root() -> Path:
    """Return the user cache directory that platformdirs reports for ``pmc-toolkit``."""
    # Imported at call time, so importing this module does not import platformdirs.
    import platformdirs

    location = platformdirs.user_cache_dir("pmc-toolkit", appauthor=False)
    return Path(location)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def resolve_cache_root(cache_dir: Path | None = None) -> Path:
    """Choose the cache root: ``cache_dir`` when supplied, otherwise the default root.

    Args:
        cache_dir: Explicit cache location, or ``None`` to use ``default_cache_root()``.

    Returns:
        The selected cache root as a ``Path``.
    """
    if cache_dir is None:
        return default_cache_root()
    return Path(cache_dir)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def article_cache_dir(cache_root: Path, versioned_pmcid: str) -> Path:
    """Return the directory under ``cache_root`` that belongs to one article version."""
    return cache_root.joinpath(versioned_pmcid)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def metadata_cache_path(cache_root: Path, versioned_pmcid: str) -> Path:
    """Return the cached metadata path: ``<article dir>/<versioned_pmcid>.json``."""
    filename = f"{versioned_pmcid}.json"
    return article_cache_dir(cache_root, versioned_pmcid) / filename
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def object_keys_cache_path(cache_root: Path, versioned_pmcid: str) -> Path:
    """Return the path of the cached object-key listing inside the article directory."""
    article_dir = article_cache_dir(cache_root, versioned_pmcid)
    return article_dir / OBJECT_KEYS_CACHE_FILENAME
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def extracted_article_cache_path(cache_root: Path, versioned_pmcid: str) -> Path:
    """Return the path of the cached extracted-article JSON inside the article directory."""
    article_dir = article_cache_dir(cache_root, versioned_pmcid)
    return article_dir / EXTRACTED_ARTICLE_CACHE_FILENAME
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _read_json_file(path: Path) -> Any | None:
    """Load and parse a UTF-8 JSON file, returning ``None`` when the file is absent.

    Args:
        path: Location of the JSON document.

    Returns:
        The decoded JSON value, or ``None`` if ``path`` does not exist.

    Raises:
        ValueError: If the content is not valid UTF-8 or not valid JSON
            (``UnicodeDecodeError`` and ``json.JSONDecodeError`` are both
            ``ValueError`` subclasses).
    """
    import json

    # Read first and handle absence afterwards: an exists() pre-check leaves a
    # window in which the file can disappear before it is opened.
    try:
        text = path.read_text(encoding="utf-8")
    except (FileNotFoundError, NotADirectoryError):
        # NotADirectoryError covers a parent component that is a regular file,
        # which Path.exists() also reports as "does not exist".
        return None
    return json.loads(text)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def read_cached_metadata(cache_root: Path, versioned_pmcid: str) -> PMCMetadata | None:
    """Load cached metadata for an article version, or ``None`` when nothing is cached.

    The cached JSON is validated through ``PMCMetadata.model_validate``.
    """
    cached_path = metadata_cache_path(cache_root, versioned_pmcid)
    raw = _read_json_file(cached_path)
    if raw is not None:
        # The model class is imported only once there is a payload to validate.
        from pmc_toolkit.models import PMCMetadata

        return PMCMetadata.model_validate(raw)
    return None
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def write_cached_metadata(
    cache_root: Path, versioned_pmcid: str, metadata: PMCMetadata
) -> None:
    """Write ``metadata`` as 2-space-indented UTF-8 JSON into the article's cache directory.

    The article directory is created first if it does not exist yet.
    """
    target = metadata_cache_path(cache_root, versioned_pmcid)
    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = metadata.model_dump_json(indent=2)
    target.write_text(serialized, encoding="utf-8")
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def read_cached_object_keys(cache_root: Path, versioned_pmcid: str) -> list[str] | None:
    """Return the cached object-key listing, sorted, or ``None`` when it is not cached.

    Raises:
        ValueError: If the cached JSON is not a list made up only of strings.
    """
    listing = _read_json_file(object_keys_cache_path(cache_root, versioned_pmcid))
    if listing is None:
        return None

    is_string_list = isinstance(listing, list) and all(
        isinstance(entry, str) for entry in listing
    )
    if not is_string_list:
        raise ValueError(f"Invalid cached file listing for article: {versioned_pmcid}.")

    return sorted(listing)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def write_cached_object_keys(
    cache_root: Path, versioned_pmcid: str, keys: list[str]
) -> None:
    """Store ``keys``, sorted, as 2-space-indented UTF-8 JSON in the article's cache directory."""
    import json

    target = object_keys_cache_path(cache_root, versioned_pmcid)
    target.parent.mkdir(parents=True, exist_ok=True)
    ordered = sorted(keys)
    target.write_text(json.dumps(ordered, indent=2), encoding="utf-8")
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def read_cached_extracted_article(
    cache_root: Path, versioned_pmcid: str
) -> dict[str, Any] | None:
    """Return the cached extracted-article dict, or ``None`` when it cannot be used.

    ``None`` is returned when the cache file is missing, when reading it raises
    ``ValueError``, and when the decoded JSON is not a ``dict``.
    """
    source = extracted_article_cache_path(cache_root, versioned_pmcid)
    try:
        loaded = _read_json_file(source)
    except ValueError:
        # A cache file that cannot be decoded is treated the same as no cache file.
        return None
    return loaded if isinstance(loaded, dict) else None
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def write_cached_extracted_article(
    cache_root: Path, versioned_pmcid: str, data: dict[str, Any]
) -> None:
    """Store ``data`` as 2-space-indented UTF-8 JSON, keeping non-ASCII characters unescaped."""
    import json

    target = extracted_article_cache_path(cache_root, versioned_pmcid)
    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(data, indent=2, ensure_ascii=False)
    target.write_text(serialized, encoding="utf-8")
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def local_object_path(cache_root: Path, versioned_pmcid: str, key: str) -> Path:
    """Map an S3 object key to its resolved path inside the article cache directory.

    Object keys come from the remote listing and are not trusted as filesystem
    paths. A key can carry the expected ``<versioned_pmcid>/`` prefix and still
    point elsewhere through ``..`` segments or an absolute remainder, so the
    resolved destination is required to stay inside the resolved article directory.

    Args:
        cache_root: Root directory of the local cache.
        versioned_pmcid: Article version whose prefix the key must start with.
        key: S3 object key to translate.

    Returns:
        The resolved destination path for the object.

    Raises:
        ValueError: If the key lacks the article prefix, its remainder is an
            absolute path, or the resolved path leaves the article directory.
    """
    article_prefix = f"{versioned_pmcid}/"
    if not key.startswith(article_prefix):
        raise ValueError(
            f"Object key {key!r} does not belong to article: {versioned_pmcid}."
        )

    # The prefix is known to be present, so slicing it off equals removeprefix().
    relative_part = key[len(article_prefix):]
    base_dir = article_cache_dir(cache_root, versioned_pmcid)

    if Path(relative_part).is_absolute():
        raise ValueError(f"Unsafe object key path for article: {versioned_pmcid}.")

    # Compare resolved paths so ".." segments are collapsed before the containment test.
    resolved_base = base_dir.resolve()
    candidate = (base_dir / relative_part).resolve()
    if candidate.is_relative_to(resolved_base):
        return candidate

    raise ValueError(f"Unsafe object key path for article: {versioned_pmcid}.")
|
pmc_toolkit/cli.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
from collections.abc import Callable
|
|
2
|
+
import json
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any, TypeVar
|
|
5
|
+
|
|
6
|
+
import typer
|
|
7
|
+
|
|
8
|
+
# Type variable for the value returned by the callable passed to ``_run_command``.
CommandResult = TypeVar("CommandResult")
|
|
9
|
+
|
|
10
|
+
# Typer application object on which the ``@app.command`` functions in this module register.
app = typer.Typer(
    no_args_is_help=True,
    help="CLI for interacting with the PMC Open Data S3 bucket.",
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _run_command(action: Callable[[], CommandResult]) -> CommandResult:
    """Call ``action`` and convert any exception into an error message plus exit status.

    ``Error: <message>`` is written to stderr, then ``typer.Exit`` is raised with
    code 2 for a ``ValueError`` and code 1 for any other ``Exception``.

    Returns:
        Whatever ``action`` returns when it does not raise.
    """
    try:
        outcome = action()
    except Exception as exc:
        exit_code = 2 if isinstance(exc, ValueError) else 1
        typer.echo(f"Error: {exc}", err=True)
        raise typer.Exit(code=exit_code) from exc
    return outcome
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _emit_json(payload: Any) -> None:
    """Print ``payload`` as 2-space-indented JSON with non-ASCII characters left unescaped."""
    rendered = json.dumps(payload, ensure_ascii=False, indent=2)
    typer.echo(rendered)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@app.command("versions")
def versions(
    pmcid: str = typer.Argument(..., help="PMC accession ID, e.g. PMC11370360"),
) -> None:
    """
    List all versions belonging to a PMCID.
    """

    def lookup():
        # Imports happen inside the wrapped callable, so an import failure is
        # reported through _run_command like any other error.
        from pmc_toolkit.storage_api import list_versions
        from pmc_toolkit.validators import parse_pmcid

        base_id, suffix = parse_pmcid(pmcid)
        if suffix is None:
            return list_versions(base_id)
        raise ValueError(
            "The versions command expects a base PMCID like 'PMC11370360', not a versioned ID."
        )

    listing = _run_command(lookup)
    _emit_json(listing.model_dump(mode="json"))
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@app.command("metadata")
def metadata(
    requested_pmcid: str = typer.Argument(
        ...,
        help="PMC accession ID or version ID, e.g. PMC11370360 or PMC11370360.1",
    ),
) -> None:
    """
    Fetch metadata for a PMC article identifier.
    """

    def load():
        # Imported inside the wrapped callable so import errors go through _run_command.
        from pmc_toolkit import storage_api

        return storage_api.get_metadata(requested_pmcid)

    record = _run_command(load)
    _emit_json(record.model_dump(mode="json"))
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@app.command("files")
def files(
    requested_pmcid: str = typer.Argument(
        ...,
        help="PMC accession ID or version ID, e.g. PMC11370360 or PMC11370360.1",
    ),
) -> None:
    """
    List every object stored under a PMC article version's S3 prefix.
    """

    def enumerate_objects():
        # Imported inside the wrapped callable so import errors go through _run_command.
        from pmc_toolkit import storage_api

        return storage_api.list_files(requested_pmcid)

    listing = _run_command(enumerate_objects)
    _emit_json(listing.model_dump(mode="json"))
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
@app.command("fetch")
def fetch(
    requested_pmcid: str = typer.Argument(
        ...,
        help="PMC accession ID or version ID, e.g. PMC11370360 or PMC11370360.1",
    ),
    extensions: list[str] = typer.Option(
        None,
        "--ext",
        "-e",
        help=(
            "Restrict download to these file extensions. Repeat the option or pass a "
            "comma-separated list, e.g. -e pdf -e xml or -e pdf,xml."
        ),
    ),
    cache_dir: Path = typer.Option(
        None,
        "--cache-dir",
        envvar="PMC_TOOLKIT_CACHE",
        help=(
            "Cache root (default: OS user cache dir for pmc-toolkit, e.g. XDG on Linux, "
            "Library/Caches on macOS, Local AppData on Windows). Files under <cache>/<PMCid.N>/."
        ),
    ),
    force: bool = typer.Option(
        False,
        "--force",
        "-f",
        help="Re-download files even when they already exist in the cache.",
    ),
) -> None:
    """
    Download all (or filtered) files for a PMC article version into a local cache.
    """

    def download():
        # Imported inside the wrapped callable so import errors go through _run_command.
        from pmc_toolkit import storage_api

        return storage_api.fetch_files(
            requested_pmcid,
            extensions=extensions,
            force=force,
            cache_dir=cache_dir,
        )

    outcome = _run_command(download)
    # The result model is dumped in JSON mode before being printed.
    _emit_json(outcome.model_dump(mode="json"))
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
@app.command("convert-xml")
def convert_xml(
    requested_pmcid: str = typer.Argument(
        ...,
        help="PMC accession ID or version ID, e.g. PMC11370360 or PMC11370360.1",
    ),
    cache_dir: Path = typer.Option(
        None,
        "--cache-dir",
        envvar="PMC_TOOLKIT_CACHE",
        help="Cache root containing <PMCid.N>/<PMCid.N>.xml.",
    ),
    force: bool = typer.Option(
        False,
        "--force",
        "-f",
        help="Recreate the extracted JSON cache from the cached XML.",
    ),
    list_keys: bool = typer.Option(
        False,
        "--list-keys",
        help="Print available extracted JSON keys and descriptions, then exit.",
    ),
) -> None:
    """
    Convert cached PMC full-text XML into cached extracted JSON.
    """
    if list_keys:
        # --list-keys only prints the key catalogue; no conversion is attempted.
        from pmc_toolkit import xml_parse_utils

        typer.echo("Available extracted JSON keys:")
        for name, summary in xml_parse_utils.EXTRACT_OUTPUT_KEY_DESCRIPTIONS.items():
            typer.echo(f"- {name}: {summary}")
        return

    def convert():
        # Imported inside the wrapped callable so import errors go through _run_command.
        from pmc_toolkit import xml_parse_api

        return xml_parse_api.ensure_extracted_article(
            requested_pmcid,
            force=force,
            cache_dir=cache_dir,
        )

    extraction = _run_command(convert)
    # Only the extracted payload is printed, not the surrounding result object.
    _emit_json(extraction.data)
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def main() -> None:
    """Run the Typer application."""
    app()


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
|
pmc_toolkit/models.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel, Field
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class PMCVersions(BaseModel):
    # Result model returned by ``storage_api.list_versions``.

    # Base PMCID the listing was requested for.
    pmcid: str
    # Versioned identifiers found for that PMCID.
    versions: list[str]
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class PMCMetadata(BaseModel):
    # Article metadata record; instances are built by validating the JSON read
    # from the S3 metadata object or from the local metadata cache file.

    # Identifiers (only pmcid and version are required).
    pmcid: str
    version: int
    pmid: int | None = None
    doi: str | None = None
    mid: str | None = None

    # Optional descriptive text.
    title: str | None = None
    citation: str | None = None

    # Required boolean flags.
    is_pmc_openaccess: bool
    is_manuscript: bool
    is_historical_ocr: bool
    is_retracted: bool

    license_code: str | None = None

    # Resource locations; xml_url and text_url are required, media_urls defaults to [].
    xml_url: str
    pdf_url: str | None = None
    media_urls: list[str] = Field(default_factory=list)
    text_url: str
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class PMCFiles(BaseModel):
    # Result model returned by ``storage_api.list_files``.

    # Article version the listing belongs to.
    versioned_pmcid: str
    # S3 object keys listed for that article version.
    keys: list[str]
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class FetchAction(str, Enum):
    # What happened to one object during a fetch; members are also ``str`` values.

    # The object was downloaded.
    DOWNLOADED = "downloaded"
    # The object was left untouched.
    SKIPPED = "skipped"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class PMCFetchFile(BaseModel):
    # Per-object entry of a fetch result.

    # S3 object key that was handled.
    key: str
    # Local destination path, stored as a string.
    local_path: str
    # Whether the object was downloaded or skipped.
    action: FetchAction
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class PMCFetchResult(BaseModel):
    # Result model returned by ``storage_api.fetch_files``.

    # Article version that was fetched.
    versioned_pmcid: str
    # Article cache directory, stored as a string.
    cache_dir: str
    # One entry per object key that was considered.
    files: list[PMCFetchFile]
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class PMCExtractResult(BaseModel):
    # Result of an XML extraction; the CLI ``convert-xml`` command prints ``data``.

    versioned_pmcid: str
    # Path of the XML file, stored as a string.
    xml_path: str
    # Extracted content keyed by string.
    data: dict[str, Any]
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"""Public API for the PMC open-access **S3 dataset** and **local download cache**.
|
|
2
|
+
|
|
3
|
+
CLI commands use this API; low-level S3 helpers live in :mod:`pmc_toolkit.storage_utils`
|
|
4
|
+
and local cache helpers live in :mod:`pmc_toolkit.cache`."""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import TYPE_CHECKING
|
|
10
|
+
|
|
11
|
+
from pmc_toolkit import cache as storage_cache
|
|
12
|
+
from pmc_toolkit import storage_utils
|
|
13
|
+
from pmc_toolkit.validators import parse_pmcid
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from pmc_toolkit.models import (
|
|
17
|
+
PMCFetchFile,
|
|
18
|
+
PMCFetchResult,
|
|
19
|
+
PMCFiles,
|
|
20
|
+
PMCMetadata,
|
|
21
|
+
PMCVersions,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def list_versions(pmcid: str) -> PMCVersions:
    """List the versions available for a base PMCID, ordered by version number.

    Args:
        pmcid: Base PMCID; a versioned identifier is rejected.

    Returns:
        A ``PMCVersions`` holding the base PMCID and its de-duplicated versions.

    Raises:
        ValueError: If ``pmcid`` carries a version suffix.
    """
    from pmc_toolkit.models import PMCVersions

    base_pmcid, version = parse_pmcid(pmcid)
    if version is not None:
        raise ValueError(
            "list_versions expects a base PMCID like 'PMC11370360', not a versioned ID."
        )

    found = storage_utils.list_versioned_pmcids(base_pmcid)
    # Drop duplicates, then order numerically by the version suffix.
    ordered = sorted(set(found), key=storage_utils.version_number)
    return PMCVersions(pmcid=base_pmcid, versions=ordered)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def get_metadata(requested_pmcid: str) -> PMCMetadata:
    """Return metadata for an article, using the default cache root as a read-through cache.

    A cached record is returned as-is; otherwise the metadata is read from S3,
    written to the cache and returned.
    """
    cache_root = storage_cache.resolve_cache_root()
    versioned_pmcid = storage_utils.resolve_versioned_pmcid(requested_pmcid)

    hit = storage_cache.read_cached_metadata(cache_root, versioned_pmcid)
    if hit is None:
        fresh = storage_utils.read_metadata(versioned_pmcid)
        storage_cache.write_cached_metadata(cache_root, versioned_pmcid, fresh)
        return fresh
    return hit
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def list_files(requested_pmcid: str) -> PMCFiles:
    """Return the object keys of an article version, cached under the default cache root."""
    from pmc_toolkit.models import PMCFiles

    cache_root = storage_cache.resolve_cache_root()
    versioned_pmcid = storage_utils.resolve_versioned_pmcid(requested_pmcid)
    object_keys = storage_utils.read_or_cache_object_keys(cache_root, versioned_pmcid)
    return PMCFiles(versioned_pmcid=versioned_pmcid, keys=object_keys)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def fetch_files(
    requested_pmcid: str,
    cache_dir: Path | None = None,
    extensions: list[str] | None = None,
    force: bool = False,
) -> PMCFetchResult:
    """Download the objects of an article version into the local cache.

    Args:
        requested_pmcid: Base or versioned PMCID.
        cache_dir: Cache root; ``None`` selects the default cache root.
        extensions: Optional extension filter; ``None`` or empty keeps every key.
        force: When true, objects are downloaded even if the destination exists.

    Returns:
        A ``PMCFetchResult`` with one entry per selected key, marked as
        downloaded or skipped.
    """
    from pmc_toolkit.models import FetchAction, PMCFetchFile, PMCFetchResult

    cache_root = storage_cache.resolve_cache_root(cache_dir)
    versioned_pmcid = storage_utils.resolve_versioned_pmcid(requested_pmcid)
    all_keys = storage_utils.read_or_cache_object_keys(cache_root, versioned_pmcid)

    wanted = storage_utils.normalize_extensions(extensions)
    selected = [
        candidate
        for candidate in all_keys
        if storage_utils.key_matches_extensions(candidate, wanted)
    ]

    article_dir = storage_cache.article_cache_dir(cache_root, versioned_pmcid)
    article_dir.mkdir(parents=True, exist_ok=True)

    outcomes: list[PMCFetchFile] = []
    # Parent directories already created in this call, so mkdir runs once per directory.
    prepared_dirs: set[Path] = set()

    for key in selected:
        destination = storage_cache.local_object_path(cache_root, versioned_pmcid, key)
        folder = destination.parent
        if folder not in prepared_dirs:
            folder.mkdir(parents=True, exist_ok=True)
            prepared_dirs.add(folder)

        already_cached = destination.exists()
        if already_cached and not force:
            action = FetchAction.SKIPPED
        else:
            storage_utils.download_object(key, destination)
            action = FetchAction.DOWNLOADED

        outcomes.append(
            PMCFetchFile(key=key, local_path=str(destination), action=action)
        )

    return PMCFetchResult(
        versioned_pmcid=versioned_pmcid,
        cache_dir=str(article_dir),
        files=outcomes,
    )
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
"""Internal S3 helpers for ``storage_api``; local cache helpers live in
|
|
2
|
+
:mod:`pmc_toolkit.cache`. Not the Python import surface—use
|
|
3
|
+
:mod:`pmc_toolkit.storage_api`."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from functools import cache
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import TYPE_CHECKING
|
|
10
|
+
|
|
11
|
+
from pmc_toolkit import cache as storage_cache
|
|
12
|
+
from pmc_toolkit.validators import parse_pmcid
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from pmc_toolkit.models import PMCMetadata
|
|
16
|
+
from types_boto3_s3.client import S3Client
|
|
17
|
+
|
|
18
|
+
# Name of the S3 bucket that the list/get/download calls in this module target.
BUCKET = "pmc-oa-opendata"
|
|
19
|
+
# AWS region name passed to the boto3 S3 client.
REGION = "us-east-1"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# Shared S3 setup
|
|
23
|
+
@cache
def _get_s3_client() -> "S3Client":
    """Create the unsigned S3 client; ``functools.cache`` reuses it on later calls."""
    import boto3
    from botocore import UNSIGNED
    from botocore.config import Config

    # UNSIGNED disables request signing on this client.
    unsigned_config = Config(signature_version=UNSIGNED)
    return boto3.client("s3", region_name=REGION, config=unsigned_config)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# Version resolution helpers
|
|
35
|
+
def version_number(versioned_pmcid: str) -> int:
    """Return the integer version suffix of a versioned PMCID such as ``PMC11370360.2``.

    Args:
        versioned_pmcid: Identifier whose text after the last ``.`` is the version.

    Returns:
        The version as an ``int``.

    Raises:
        ValueError: If the identifier has no ``.`` separator or the suffix is
            not an integer.
    """
    _, separator, suffix = versioned_pmcid.rpartition(".")
    if not separator:
        # An ID without a dot used to fail with an opaque IndexError; report it
        # as the invalid-input error it is.
        raise ValueError(f"Not a versioned PMCID: {versioned_pmcid!r}.")
    try:
        return int(suffix)
    except ValueError as exc:
        raise ValueError(
            f"Invalid version suffix in PMCID: {versioned_pmcid!r}."
        ) from exc
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def list_versioned_pmcids(pmcid: str) -> list[str]:
    """List the ``<pmcid>.<N>`` prefixes found in the bucket for a base PMCID.

    The bucket is listed with prefix ``<pmcid>.`` and delimiter ``/``; each
    returned common prefix, with trailing slashes stripped, is one result.
    """
    client = _get_s3_client()
    pages = client.get_paginator("list_objects_v2").paginate(
        Bucket=BUCKET,
        Prefix=f"{pmcid}.",
        Delimiter="/",
    )

    stripped = (
        entry["Prefix"].rstrip("/")
        for page in pages
        for entry in page.get("CommonPrefixes", [])
    )
    # Prefixes that are empty after stripping are dropped.
    return [name for name in stripped if name]
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _latest_versioned_pmcid(pmcid: str) -> str:
    """Return the listed version of ``pmcid`` with the highest version number.

    Raises:
        ValueError: If no versions are listed for ``pmcid``.
    """
    candidates = list_versioned_pmcids(pmcid)
    if candidates:
        return max(candidates, key=version_number)
    raise ValueError(f"No versions found for PMCID: {pmcid}.")
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def resolve_versioned_pmcid(requested_pmcid: str) -> str:
    """Turn a PMCID input into an explicit ``<pmcid>.<version>`` identifier.

    A version present in the input is kept; otherwise the latest listed
    version is looked up.
    """
    pmcid, version = parse_pmcid(requested_pmcid)
    if version is None:
        return _latest_versioned_pmcid(pmcid)
    return f"{pmcid}.{version}"
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# Metadata S3 helpers
|
|
81
|
+
def read_metadata(versioned_pmcid: str) -> PMCMetadata:
    """Read ``metadata/<versioned_pmcid>.json`` from the bucket and validate it.

    Raises:
        ValueError: If S3 reports the object as missing (``NoSuchKey`` or ``404``).
        botocore.exceptions.ClientError: For any other S3 client error.
    """
    import json

    from botocore.exceptions import ClientError
    from pmc_toolkit.models import PMCMetadata

    object_key = f"metadata/{versioned_pmcid}.json"
    client = _get_s3_client()

    try:
        response = client.get_object(Bucket=BUCKET, Key=object_key)
    except ClientError as exc:
        code = exc.response.get("Error", {}).get("Code")
        if code not in {"NoSuchKey", "404"}:
            # Anything other than "not found" is re-raised unchanged.
            raise
        raise ValueError(
            f"No metadata found for article: {versioned_pmcid}."
        ) from exc

    raw_body = response["Body"].read()
    return PMCMetadata.model_validate(json.loads(raw_body))
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
# Object-key listing helpers
|
|
107
|
+
def list_object_keys(versioned_pmcid: str) -> list[str]:
    """Return the sorted object keys stored under ``<versioned_pmcid>/`` in the bucket.

    Entries without a key and keys ending in ``/`` are left out.
    """
    client = _get_s3_client()
    pages = client.get_paginator("list_objects_v2").paginate(
        Bucket=BUCKET, Prefix=f"{versioned_pmcid}/"
    )

    found = [
        name
        for page in pages
        for entry in page.get("Contents", [])
        if (name := entry.get("Key")) and not name.endswith("/")
    ]
    found.sort()
    return found
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def read_or_cache_object_keys(cache_root: Path, versioned_pmcid: str) -> list[str]:
    """Return the article's object keys from the local cache, listing S3 on a miss.

    A fresh, non-empty listing is written to the cache before it is returned.

    Raises:
        ValueError: If nothing is cached and the S3 listing is empty.
    """
    cached = storage_cache.read_cached_object_keys(cache_root, versioned_pmcid)
    if cached is None:
        listed = list_object_keys(versioned_pmcid)
        if not listed:
            # An empty listing is reported as an error and is not cached.
            raise ValueError(f"No files found for article: {versioned_pmcid}.")
        storage_cache.write_cached_object_keys(cache_root, versioned_pmcid, listed)
        return listed
    return cached
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
# Fetch filtering and downloads
|
|
139
|
+
def normalize_extensions(extensions: list[str] | None) -> set[str] | None:
    """Flatten extension filters into a set of lowercase suffixes without leading dots.

    Each item may itself be a comma-separated list. Blank pieces are ignored.

    Returns:
        The normalized set, or ``None`` when no filter is given or nothing remains.
    """
    if not extensions:
        return None

    cleaned: set[str] = set()
    for raw in extensions:
        for piece in raw.split(","):
            token = piece.strip()
            if token:
                cleaned.add(token.lower().lstrip("."))

    return cleaned if cleaned else None
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def key_matches_extensions(key: str, extensions: set[str] | None) -> bool:
    """Tell whether an object key passes the extension filter.

    Args:
        key: S3 object key; only the part after the last ``/`` is examined.
        extensions: Lowercase extensions without a leading dot, or ``None``
            for no filtering.

    Returns:
        ``True`` when ``extensions`` is ``None``, or when the file name has a
        non-empty extension after its last ``.`` that, lowercased, is in
        ``extensions``. ``False`` otherwise.
    """
    if extensions is None:
        return True

    filename = key.rsplit("/", 1)[-1]
    _, dot, ext = filename.rpartition(".")
    # rpartition yields ("", "", filename) when there is no dot. Without the
    # separator check a file named "xml" would count as having extension "xml".
    if not dot or not ext:
        return False
    return ext.lower() in extensions
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def download_object(key: str, dest: Path) -> None:
    """Download one object from the bucket to ``dest``.

    Raises:
        ValueError: If S3 reports the object as missing (``NoSuchKey`` or ``404``).
        botocore.exceptions.ClientError: For any other S3 client error.
    """
    from botocore.exceptions import ClientError

    destination = str(dest)
    try:
        _get_s3_client().download_file(BUCKET, key, destination)
    except ClientError as exc:
        code = exc.response.get("Error", {}).get("Code")
        if code not in {"NoSuchKey", "404"}:
            # Anything other than "not found" is re-raised unchanged.
            raise
        raise ValueError(f"No object found for key: {key!r}.") from exc
|