pmc-toolkit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
pmc_toolkit/cache.py ADDED
@@ -0,0 +1,144 @@
1
+ """Local filesystem cache helpers for PMC metadata, manifests, and downloads."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING, Any
7
+
8
+ if TYPE_CHECKING:
9
+ from pmc_toolkit.models import PMCMetadata
10
+
11
# File name, inside an article's cache directory, of the JSON file holding the
# cached object-key listing.
OBJECT_KEYS_CACHE_FILENAME = ".pmc-object-keys.json"
# File name, inside an article's cache directory, of the JSON file holding the
# cached extracted-article payload.
EXTRACTED_ARTICLE_CACHE_FILENAME = ".pmc-extracted-article.json"
13
+
14
+
15
def default_cache_root() -> Path:
    """Return the per-user cache directory that platformdirs reports for pmc-toolkit."""
    # Deferred import: platformdirs is only loaded when the default is needed.
    import platformdirs

    location = platformdirs.user_cache_dir("pmc-toolkit", appauthor=False)
    return Path(location)
19
+
20
+
21
def resolve_cache_root(cache_dir: Path | None = None) -> Path:
    """Return *cache_dir* as a ``Path``, or the default cache root when it is ``None``."""
    if cache_dir is None:
        return default_cache_root()
    return Path(cache_dir)
23
+
24
+
25
def article_cache_dir(cache_root: Path, versioned_pmcid: str) -> Path:
    """Return the directory under *cache_root* that belongs to one article version."""
    article_dir = cache_root / versioned_pmcid
    return article_dir
27
+
28
+
29
def metadata_cache_path(cache_root: Path, versioned_pmcid: str) -> Path:
    """Return the path of the cached metadata JSON file for an article version."""
    filename = f"{versioned_pmcid}.json"
    return article_cache_dir(cache_root, versioned_pmcid) / filename
31
+
32
+
33
def object_keys_cache_path(cache_root: Path, versioned_pmcid: str) -> Path:
    """Return the path of the cached object-key listing for an article version."""
    article_dir = article_cache_dir(cache_root, versioned_pmcid)
    return article_dir / OBJECT_KEYS_CACHE_FILENAME
35
+
36
+
37
def extracted_article_cache_path(cache_root: Path, versioned_pmcid: str) -> Path:
    """Return the path of the cached extracted-article JSON for an article version."""
    article_dir = article_cache_dir(cache_root, versioned_pmcid)
    return article_dir / EXTRACTED_ARTICLE_CACHE_FILENAME
42
+
43
+
44
+ def _read_json_file(path: Path) -> Any | None:
45
+ if not path.exists():
46
+ return None
47
+
48
+ import json
49
+
50
+ return json.loads(path.read_text(encoding="utf-8"))
51
+
52
+
53
def read_cached_metadata(cache_root: Path, versioned_pmcid: str) -> PMCMetadata | None:
    """Return the cached metadata model for an article version, or ``None`` if not cached."""
    cached_path = metadata_cache_path(cache_root, versioned_pmcid)
    raw = _read_json_file(cached_path)
    if raw is not None:
        # Deferred import: the model class is only needed on a cache hit.
        from pmc_toolkit.models import PMCMetadata

        return PMCMetadata.model_validate(raw)
    return None
61
+
62
+
63
def write_cached_metadata(
    cache_root: Path, versioned_pmcid: str, metadata: PMCMetadata
) -> None:
    """Write *metadata* as indented JSON to the article's metadata cache file."""
    target = metadata_cache_path(cache_root, versioned_pmcid)
    # Create the article directory (and any parents) before writing.
    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = metadata.model_dump_json(indent=2)
    target.write_text(serialized, encoding="utf-8")
69
+
70
+
71
def read_cached_object_keys(cache_root: Path, versioned_pmcid: str) -> list[str] | None:
    """Return the sorted cached object-key listing, or ``None`` when nothing is cached.

    Raises:
        ValueError: If the cached payload is not a list of strings.
    """
    cached = _read_json_file(object_keys_cache_path(cache_root, versioned_pmcid))
    if cached is None:
        return None

    is_string_list = isinstance(cached, list) and all(
        isinstance(entry, str) for entry in cached
    )
    if not is_string_list:
        raise ValueError(f"Invalid cached file listing for article: {versioned_pmcid}.")

    return sorted(cached)
81
+
82
+
83
def write_cached_object_keys(
    cache_root: Path, versioned_pmcid: str, keys: list[str]
) -> None:
    """Write the sorted object-key listing as indented JSON to the article cache."""
    import json

    target = object_keys_cache_path(cache_root, versioned_pmcid)
    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(sorted(keys), indent=2)
    target.write_text(serialized, encoding="utf-8")
91
+
92
+
93
def read_cached_extracted_article(
    cache_root: Path, versioned_pmcid: str
) -> dict[str, Any] | None:
    """Return the cached extracted-article mapping, or ``None`` when it is unusable.

    A missing file, content that fails to decode (``ValueError``), and a payload
    that is not a JSON object are all reported as ``None``.
    """
    cached_path = extracted_article_cache_path(cache_root, versioned_pmcid)
    try:
        cached = _read_json_file(cached_path)
    except ValueError:
        return None
    return cached if isinstance(cached, dict) else None
106
+
107
+
108
def write_cached_extracted_article(
    cache_root: Path, versioned_pmcid: str, data: dict[str, Any]
) -> None:
    """Write *data* as indented JSON (non-ASCII kept as-is) to the article cache."""
    import json

    target = extracted_article_cache_path(cache_root, versioned_pmcid)
    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(data, indent=2, ensure_ascii=False)
    target.write_text(serialized, encoding="utf-8")
116
+
117
+
118
def local_object_path(cache_root: Path, versioned_pmcid: str, key: str) -> Path:
    """Map an S3 object key to its resolved path inside the article cache directory.

    Object keys are remote-controlled opaque strings rather than trusted
    filesystem paths. A key can carry the expected article prefix and still
    contain ``..`` or an absolute segment that would leave the article cache
    directory, so the resolved destination is checked to stay inside it.

    Raises:
        ValueError: If *key* lacks the ``<versioned_pmcid>/`` prefix, or if its
            remainder is absolute or resolves outside the article directory.
    """
    expected_prefix = f"{versioned_pmcid}/"
    if not key.startswith(expected_prefix):
        raise ValueError(
            f"Object key {key!r} does not belong to article: {versioned_pmcid}."
        )

    # The prefix is known to be present, so slicing it off is exact.
    relative_part = key[len(expected_prefix):]
    base_dir = article_cache_dir(cache_root, versioned_pmcid)

    # An absolute remainder would replace the base directory when joined.
    if Path(relative_part).is_absolute():
        raise ValueError(f"Unsafe object key path for article: {versioned_pmcid}.")

    # Resolve both sides so ``..`` segments and symlinks are compared canonically.
    resolved_target = (base_dir / relative_part).resolve()
    resolved_base = base_dir.resolve()
    if resolved_target.is_relative_to(resolved_base):
        return resolved_target

    raise ValueError(f"Unsafe object key path for article: {versioned_pmcid}.")
pmc_toolkit/cli.py ADDED
@@ -0,0 +1,196 @@
1
+ from collections.abc import Callable
2
+ import json
3
+ from pathlib import Path
4
+ from typing import Any, TypeVar
5
+
6
+ import typer
7
+
8
# Type variable for the value produced by the callable handed to ``_run_command``.
CommandResult = TypeVar("CommandResult")

# Root Typer application; the commands below register on it via ``@app.command``.
app = typer.Typer(
    help="CLI for interacting with the PMC Open Data S3 bucket.",
    no_args_is_help=True,
)
14
+
15
+
16
def _run_command(action: Callable[[], CommandResult]) -> CommandResult:
    """Call *action*, turning any exception into a stderr message and an exit code.

    A ``ValueError`` exits with status 2 and any other ``Exception`` with
    status 1; the original exception is chained as the cause.
    """
    try:
        outcome = action()
    except Exception as exc:
        exit_code = 2 if isinstance(exc, ValueError) else 1
        typer.echo(f"Error: {exc}", err=True)
        raise typer.Exit(code=exit_code) from exc
    return outcome
25
+
26
+
27
def _emit_json(payload: Any) -> None:
    """Print *payload* as indented JSON, leaving non-ASCII characters unescaped."""
    rendered = json.dumps(payload, indent=2, ensure_ascii=False)
    typer.echo(rendered)
29
+
30
+
31
@app.command("versions")
def versions(
    pmcid: str = typer.Argument(..., help="PMC accession ID, e.g. PMC11370360"),
) -> None:
    """
    List all versions belonging to a PMCID.
    """

    # Project imports live inside the closure so that import and lookup errors
    # are both reported through ``_run_command``.
    def load_versions():
        from pmc_toolkit.storage_api import list_versions
        from pmc_toolkit.validators import parse_pmcid

        base_pmcid, explicit_version = parse_pmcid(pmcid)
        if explicit_version is None:
            return list_versions(base_pmcid)
        raise ValueError(
            "The versions command expects a base PMCID like 'PMC11370360', not a versioned ID."
        )

    listing = _run_command(load_versions)
    _emit_json(listing.model_dump(mode="json"))
52
+
53
+
54
@app.command("metadata")
def metadata(
    requested_pmcid: str = typer.Argument(
        ...,
        help="PMC accession ID or version ID, e.g. PMC11370360 or PMC11370360.1",
    ),
) -> None:
    """
    Fetch metadata for a PMC article identifier.
    """

    # The storage API is imported inside the closure so failures surface
    # through ``_run_command``.
    def load_metadata():
        from pmc_toolkit.storage_api import get_metadata

        return get_metadata(requested_pmcid)

    record = _run_command(load_metadata)
    _emit_json(record.model_dump(mode="json"))
72
+
73
+
74
@app.command("files")
def files(
    requested_pmcid: str = typer.Argument(
        ...,
        help="PMC accession ID or version ID, e.g. PMC11370360 or PMC11370360.1",
    ),
) -> None:
    """
    List every object stored under a PMC article version's S3 prefix.
    """

    # The storage API is imported inside the closure so failures surface
    # through ``_run_command``.
    def load_listing():
        from pmc_toolkit.storage_api import list_files

        return list_files(requested_pmcid)

    listing = _run_command(load_listing)
    _emit_json(listing.model_dump(mode="json"))
92
+
93
+
94
@app.command("fetch")
def fetch(
    requested_pmcid: str = typer.Argument(
        ...,
        help="PMC accession ID or version ID, e.g. PMC11370360 or PMC11370360.1",
    ),
    extensions: list[str] = typer.Option(
        None,
        "--ext",
        "-e",
        help=(
            "Restrict download to these file extensions. Repeat the option or pass a "
            "comma-separated list, e.g. -e pdf -e xml or -e pdf,xml."
        ),
    ),
    cache_dir: Path = typer.Option(
        None,
        "--cache-dir",
        envvar="PMC_TOOLKIT_CACHE",
        help=(
            "Cache root (default: OS user cache dir for pmc-toolkit, e.g. XDG on Linux, "
            "Library/Caches on macOS, Local AppData on Windows). Files under <cache>/<PMCid.N>/."
        ),
    ),
    force: bool = typer.Option(
        False,
        "--force",
        "-f",
        help="Re-download files even when they already exist in the cache.",
    ),
) -> None:
    """
    Download all (or filtered) files for a PMC article version into a local cache.
    """

    # The storage API is imported inside the closure so failures surface
    # through ``_run_command``.
    def run_fetch():
        from pmc_toolkit.storage_api import fetch_files

        options = {
            "cache_dir": cache_dir,
            "extensions": extensions,
            "force": force,
        }
        return fetch_files(requested_pmcid, **options)

    summary = _run_command(run_fetch)
    _emit_json(summary.model_dump(mode="json"))
141
+
142
+
143
@app.command("convert-xml")
def convert_xml(
    requested_pmcid: str = typer.Argument(
        ...,
        help="PMC accession ID or version ID, e.g. PMC11370360 or PMC11370360.1",
    ),
    cache_dir: Path = typer.Option(
        None,
        "--cache-dir",
        envvar="PMC_TOOLKIT_CACHE",
        help="Cache root containing <PMCid.N>/<PMCid.N>.xml.",
    ),
    force: bool = typer.Option(
        False,
        "--force",
        "-f",
        help="Recreate the extracted JSON cache from the cached XML.",
    ),
    list_keys: bool = typer.Option(
        False,
        "--list-keys",
        help="Print available extracted JSON keys and descriptions, then exit.",
    ),
) -> None:
    """
    Convert cached PMC full-text XML into cached extracted JSON.
    """
    if list_keys:
        # Listing mode prints the key catalogue and returns without converting.
        from pmc_toolkit.xml_parse_utils import EXTRACT_OUTPUT_KEY_DESCRIPTIONS

        typer.echo("Available extracted JSON keys:")
        for name, summary in EXTRACT_OUTPUT_KEY_DESCRIPTIONS.items():
            typer.echo(f"- {name}: {summary}")
        return

    # The parsing API is imported inside the closure so failures surface
    # through ``_run_command``.
    def run_extraction():
        from pmc_toolkit.xml_parse_api import ensure_extracted_article

        options = {"cache_dir": cache_dir, "force": force}
        return ensure_extracted_article(requested_pmcid, **options)

    extraction = _run_command(run_extraction)
    _emit_json(extraction.data)
189
+
190
+
191
def main() -> None:
    """Run the Typer application; used by the ``__main__`` guard in this file."""
    app()
193
+
194
+
195
# Run the CLI when this file is executed directly rather than imported.
if __name__ == "__main__":
    main()
pmc_toolkit/models.py ADDED
@@ -0,0 +1,56 @@
1
+ from enum import Enum
2
+ from typing import Any
3
+
4
+ from pydantic import BaseModel, Field
5
+
6
+
7
class PMCVersions(BaseModel):
    # Version listing for one article, as returned by ``list_versions``.
    pmcid: str  # base PMCID the listing was requested for
    versions: list[str]  # versioned IDs; ``list_versions`` orders them by version number
10
+
11
+
12
class PMCMetadata(BaseModel):
    # Article metadata record. Instances are produced by validating the JSON
    # document read from ``metadata/<versioned PMCID>.json`` in the bucket, or
    # the locally cached copy of that document.
    # Fields without a default are required; the others may be omitted.
    pmcid: str
    version: int
    # Optional identifiers and bibliographic strings.
    pmid: int | None = None
    doi: str | None = None
    mid: str | None = None
    title: str | None = None
    citation: str | None = None
    # Required boolean flags.
    # NOTE(review): their exact meaning is defined by the upstream dataset — confirm there.
    is_pmc_openaccess: bool
    is_manuscript: bool
    is_historical_ocr: bool
    is_retracted: bool
    license_code: str | None = None
    # Locations of the article's files; only the PDF and media entries are optional.
    xml_url: str
    pdf_url: str | None = None
    media_urls: list[str] = Field(default_factory=list)  # fresh empty list per instance
    text_url: str
29
+
30
+
31
class PMCFiles(BaseModel):
    # Object listing for one article version, as returned by ``list_files``.
    versioned_pmcid: str  # resolved ID that includes the version
    keys: list[str]  # object keys listed for that article version
34
+
35
+
36
class FetchAction(str, Enum):
    # What ``fetch_files`` did for a single object key. Subclassing ``str``
    # makes each member usable as its plain string value.
    DOWNLOADED = "downloaded"  # the object was downloaded to the cache
    SKIPPED = "skipped"  # the destination already existed and force was not set
39
+
40
+
41
class PMCFetchFile(BaseModel):
    # Outcome for a single object handled by ``fetch_files``.
    key: str  # S3 object key
    local_path: str  # destination path in the local cache, as a string
    action: FetchAction  # whether the object was downloaded or skipped
45
+
46
+
47
class PMCFetchResult(BaseModel):
    # Overall result of ``fetch_files`` for one article version.
    versioned_pmcid: str  # resolved ID that includes the version
    cache_dir: str  # the article's cache directory, as a string
    files: list[PMCFetchFile]  # one entry per selected object key
51
+
52
+
53
class PMCExtractResult(BaseModel):
    # Result object whose ``data`` the ``convert-xml`` CLI command prints as JSON.
    versioned_pmcid: str
    # NOTE(review): presumably the path of the XML file the data came from —
    # confirm in ``xml_parse_api``.
    xml_path: str
    data: dict[str, Any]  # extracted content, emitted by the CLI as JSON
@@ -0,0 +1,108 @@
1
+ """Public API for the PMC open-access **S3 dataset** and **local download cache**.
2
+
3
+ CLI commands use this API; low-level S3 helpers live in :mod:`pmc_toolkit.storage_utils`
4
+ and local cache helpers live in :mod:`pmc_toolkit.cache`."""
5
+
6
+ from __future__ import annotations
7
+
8
+ from pathlib import Path
9
+ from typing import TYPE_CHECKING
10
+
11
+ from pmc_toolkit import cache as storage_cache
12
+ from pmc_toolkit import storage_utils
13
+ from pmc_toolkit.validators import parse_pmcid
14
+
15
+ if TYPE_CHECKING:
16
+ from pmc_toolkit.models import (
17
+ PMCFetchFile,
18
+ PMCFetchResult,
19
+ PMCFiles,
20
+ PMCMetadata,
21
+ PMCVersions,
22
+ )
23
+
24
+
25
def list_versions(pmcid: str) -> PMCVersions:
    """Return the de-duplicated versions of a base PMCID, ordered by version number.

    Raises:
        ValueError: If *pmcid* already carries a version.
    """
    from pmc_toolkit.models import PMCVersions

    base_pmcid, explicit_version = parse_pmcid(pmcid)
    if explicit_version is not None:
        raise ValueError(
            "list_versions expects a base PMCID like 'PMC11370360', not a versioned ID."
        )

    unique_versions = set(storage_utils.list_versioned_pmcids(base_pmcid))
    ordered = sorted(unique_versions, key=storage_utils.version_number)
    return PMCVersions(pmcid=base_pmcid, versions=ordered)
38
+
39
+
40
def get_metadata(requested_pmcid: str) -> PMCMetadata:
    """Return metadata for an article, using the default local cache before S3.

    The identifier is first resolved to an explicit version. On a cache miss
    the metadata is read from S3 and written to the cache before returning.
    """
    cache_root = storage_cache.resolve_cache_root()
    versioned_pmcid = storage_utils.resolve_versioned_pmcid(requested_pmcid)

    cached = storage_cache.read_cached_metadata(cache_root, versioned_pmcid)
    if cached is None:
        fetched = storage_utils.read_metadata(versioned_pmcid)
        storage_cache.write_cached_metadata(cache_root, versioned_pmcid, fetched)
        return fetched
    return cached
51
+
52
+
53
def list_files(requested_pmcid: str) -> PMCFiles:
    """Return the object keys of an article version, using the default cache root."""
    from pmc_toolkit.models import PMCFiles

    versioned_pmcid = storage_utils.resolve_versioned_pmcid(requested_pmcid)
    object_keys = storage_utils.read_or_cache_object_keys(
        storage_cache.resolve_cache_root(), versioned_pmcid
    )
    return PMCFiles(versioned_pmcid=versioned_pmcid, keys=object_keys)
61
+
62
+
63
def fetch_files(
    requested_pmcid: str,
    cache_dir: Path | None = None,
    extensions: list[str] | None = None,
    force: bool = False,
) -> PMCFetchResult:
    """Download an article version's objects into the local cache.

    Args:
        requested_pmcid: Base or versioned PMCID; resolved to an explicit version.
        cache_dir: Cache root; the default cache root is used when ``None``.
        extensions: Optional extension filter applied to the object keys.
        force: Download even when the destination file already exists.

    Returns:
        A result listing every selected key with its local path and whether it
        was downloaded or skipped.
    """
    from pmc_toolkit.models import FetchAction, PMCFetchFile, PMCFetchResult

    cache_root = storage_cache.resolve_cache_root(cache_dir)
    versioned_pmcid = storage_utils.resolve_versioned_pmcid(requested_pmcid)
    all_keys = storage_utils.read_or_cache_object_keys(cache_root, versioned_pmcid)

    wanted = storage_utils.normalize_extensions(extensions)
    selected = [
        candidate
        for candidate in all_keys
        if storage_utils.key_matches_extensions(candidate, wanted)
    ]

    article_dir = storage_cache.article_cache_dir(cache_root, versioned_pmcid)
    article_dir.mkdir(parents=True, exist_ok=True)

    outcomes: list[PMCFetchFile] = []
    # Parent directories already created during this call; avoids repeat mkdirs.
    prepared_dirs: set[Path] = set()

    for key in selected:
        dest = storage_cache.local_object_path(cache_root, versioned_pmcid, key)
        if dest.parent not in prepared_dirs:
            dest.parent.mkdir(parents=True, exist_ok=True)
            prepared_dirs.add(dest.parent)

        already_cached = dest.exists()
        if already_cached and not force:
            action = FetchAction.SKIPPED
        else:
            storage_utils.download_object(key, dest)
            action = FetchAction.DOWNLOADED
        outcomes.append(PMCFetchFile(key=key, local_path=str(dest), action=action))

    return PMCFetchResult(
        versioned_pmcid=versioned_pmcid,
        cache_dir=str(article_dir),
        files=outcomes,
    )
@@ -0,0 +1,169 @@
1
+ """Internal S3 helpers for ``storage_api``; local cache helpers live in
2
+ :mod:`pmc_toolkit.cache`. Not the Python import surface—use
3
+ :mod:`pmc_toolkit.storage_api`."""
4
+
5
+ from __future__ import annotations
6
+
7
+ from functools import cache
8
+ from pathlib import Path
9
+ from typing import TYPE_CHECKING
10
+
11
+ from pmc_toolkit import cache as storage_cache
12
+ from pmc_toolkit.validators import parse_pmcid
13
+
14
+ if TYPE_CHECKING:
15
+ from pmc_toolkit.models import PMCMetadata
16
+ from types_boto3_s3.client import S3Client
17
+
18
# Name of the S3 bucket that every helper in this module reads from.
BUCKET = "pmc-oa-opendata"
# Region name passed to the S3 client.
REGION = "us-east-1"
20
+
21
+
22
+ # Shared S3 setup
23
@cache
def _get_s3_client() -> "S3Client":
    """Return the shared S3 client, creating it on the first call.

    The client is configured to send unsigned requests. ``functools.cache``
    memoises the result, so later calls return the same client object.
    """
    import boto3
    from botocore import UNSIGNED
    from botocore.config import Config

    unsigned_config = Config(signature_version=UNSIGNED)
    return boto3.client("s3", region_name=REGION, config=unsigned_config)
32
+
33
+
34
+ # Version resolution helpers
35
def version_number(versioned_pmcid: str) -> int:
    """Return the integer version suffix of a versioned PMCID.

    Example: ``"PMC11370360.2"`` gives ``2``.

    Args:
        versioned_pmcid: Identifier of the form ``<PMCID>.<version>``.

    Raises:
        ValueError: If the identifier has no ``.``-separated suffix or the
            suffix is not an integer.
    """
    _, separator, suffix = versioned_pmcid.rpartition(".")
    if not separator:
        # Report a missing suffix as ValueError (not IndexError) so every
        # malformed identifier fails with the same exception type.
        raise ValueError(f"Not a versioned PMCID: {versioned_pmcid!r}.")
    try:
        return int(suffix)
    except ValueError as exc:
        raise ValueError(f"Not a versioned PMCID: {versioned_pmcid!r}.") from exc
37
+
38
+
39
def list_versioned_pmcids(pmcid: str) -> list[str]:
    """List the versioned IDs found in the bucket under the ``<pmcid>.`` prefix.

    Uses a delimiter listing, so each result is a common prefix with its
    trailing ``/`` removed; empty results are dropped.
    """
    paginator = _get_s3_client().get_paginator("list_objects_v2")
    pages = paginator.paginate(
        Bucket=BUCKET,
        Prefix=f"{pmcid}.",
        Delimiter="/",
    )

    return [
        trimmed
        for page in pages
        for entry in page.get("CommonPrefixes", [])
        if (trimmed := entry["Prefix"].rstrip("/"))
    ]
60
+
61
+
62
def _latest_versioned_pmcid(pmcid: str) -> str:
    """Return the versioned ID with the highest version number for a base PMCID.

    Raises:
        ValueError: If no versions are listed for *pmcid*.
    """
    candidates = list_versioned_pmcids(pmcid)
    if candidates:
        return max(candidates, key=version_number)
    raise ValueError(f"No versions found for PMCID: {pmcid}.")
70
+
71
+
72
def resolve_versioned_pmcid(requested_pmcid: str) -> str:
    """Return an explicitly versioned ID, using the latest version when none is given."""
    pmcid, version = parse_pmcid(requested_pmcid)
    if version is None:
        return _latest_versioned_pmcid(pmcid)
    return f"{pmcid}.{version}"
78
+
79
+
80
+ # Metadata S3 helpers
81
def read_metadata(versioned_pmcid: str) -> PMCMetadata:
    """Read and validate the metadata JSON object for one article version from S3.

    Raises:
        ValueError: If S3 reports the metadata object as missing
            (error code ``NoSuchKey`` or ``404``).
    """
    metadata_key = f"metadata/{versioned_pmcid}.json"

    import json

    from botocore.exceptions import ClientError
    from pmc_toolkit.models import PMCMetadata

    client = _get_s3_client()

    try:
        response = client.get_object(Bucket=BUCKET, Key=metadata_key)
    except ClientError as exc:
        missing = exc.response.get("Error", {}).get("Code") in {"NoSuchKey", "404"}
        if not missing:
            # Any other client error is passed on unchanged.
            raise
        raise ValueError(
            f"No metadata found for article: {versioned_pmcid}."
        ) from exc

    raw_document = response["Body"].read()
    return PMCMetadata.model_validate(json.loads(raw_document))
104
+
105
+
106
+ # Object-key listing helpers
107
def list_object_keys(versioned_pmcid: str) -> list[str]:
    """Return the sorted object keys stored under ``<versioned_pmcid>/`` in the bucket.

    Entries with no key, or whose key ends with ``/``, are left out.
    """
    paginator = _get_s3_client().get_paginator("list_objects_v2")
    pages = paginator.paginate(Bucket=BUCKET, Prefix=f"{versioned_pmcid}/")

    found = [
        name
        for page in pages
        for entry in page.get("Contents", [])
        if (name := entry.get("Key")) and not name.endswith("/")
    ]
    found.sort()
    return found
122
+
123
+
124
def read_or_cache_object_keys(cache_root: Path, versioned_pmcid: str) -> list[str]:
    """Return the article's object keys from the cache, listing S3 on a miss.

    A fresh listing is written to the cache before it is returned.

    Raises:
        ValueError: If S3 lists no objects for the article version.
    """
    cached = storage_cache.read_cached_object_keys(cache_root, versioned_pmcid)
    if cached is not None:
        return cached

    listed = list_object_keys(versioned_pmcid)
    if not listed:
        # An empty listing is reported as an error and is not cached.
        raise ValueError(f"No files found for article: {versioned_pmcid}.")

    storage_cache.write_cached_object_keys(cache_root, versioned_pmcid, listed)
    return listed
136
+
137
+
138
+ # Fetch filtering and downloads
139
def normalize_extensions(extensions: list[str] | None) -> set[str] | None:
    """Normalize repeated or comma-separated extension filters into a lowercase suffix set.

    Each entry is split on commas; every part is stripped of surrounding
    whitespace, lower-cased and stripped of leading dots. Parts that end up
    empty (for example ``""``, ``" "`` or ``"."``) are dropped.

    Args:
        extensions: Raw filter values, or ``None`` / empty for "no filter".

    Returns:
        The set of normalized suffixes, or ``None`` when no usable suffix
        remains (meaning "do not filter").
    """
    if not extensions:
        return None

    normalized: set[str] = set()
    for raw in extensions:
        for part in raw.split(","):
            suffix = part.strip().lower().lstrip(".")
            # Test emptiness after stripping dots so "." never adds "" to the set.
            if suffix:
                normalized.add(suffix)
    return normalized or None
150
+
151
+
152
def key_matches_extensions(key: str, extensions: set[str] | None) -> bool:
    """Return whether the file name of *key* ends with one of *extensions*.

    Args:
        key: Object key; only the part after the last ``/`` is inspected.
        extensions: Lowercase suffixes without a leading dot, or ``None`` to
            accept every key.

    Returns:
        ``True`` when *extensions* is ``None`` or when the text after the last
        ``.`` of the file name (compared case-insensitively) is in the set.
        A file name with no ``.`` has no extension and never matches.
    """
    if extensions is None:
        return True
    filename = key.rsplit("/", 1)[-1]
    _, dot, ext = filename.rpartition(".")
    # Without a "." rpartition returns the whole name as ``ext``; requiring the
    # separator stops a file called "xml" from matching the "xml" filter.
    return bool(dot and ext) and ext.lower() in extensions
158
+
159
+
160
def download_object(key: str, dest: Path) -> None:
    """Download the bucket object *key* to the local path *dest*.

    Raises:
        ValueError: If S3 reports the object as missing
            (error code ``NoSuchKey`` or ``404``).
    """
    from botocore.exceptions import ClientError

    try:
        _get_s3_client().download_file(BUCKET, key, str(dest))
    except ClientError as exc:
        missing = exc.response.get("Error", {}).get("Code") in {"NoSuchKey", "404"}
        if not missing:
            # Any other client error is passed on unchanged.
            raise
        raise ValueError(f"No object found for key: {key!r}.") from exc