refua-data 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 JJ Ben-Joseph
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,193 @@
1
+ Metadata-Version: 2.4
2
+ Name: refua-data
3
+ Version: 0.6.0
4
+ Summary: Data ingestion, caching, and parquet materialization for the Refua drug discovery ecosystem.
5
+ Author-email: JJ Ben-Joseph <jj@tensorspace.ai>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://agentcures.com/
8
+ Project-URL: Repository, https://github.com/agentcures/refua
9
+ Project-URL: Documentation, https://github.com/agentcures/refua#readme
10
+ Project-URL: Issues, https://github.com/agentcures/refua/issues
11
+ Keywords: drug discovery,data engineering,cheminformatics,bioinformatics,parquet,refua
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Programming Language :: Python :: 3.14
20
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
21
+ Classifier: Topic :: Scientific/Engineering :: Chemistry
22
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
+ Requires-Python: <3.15,>=3.11
24
+ Description-Content-Type: text/markdown
25
+ License-File: LICENSE
26
+ Requires-Dist: pandas>=2.2.2
27
+ Requires-Dist: pyarrow>=18.0.0
28
+ Requires-Dist: requests>=2.32.3
29
+ Requires-Dist: tqdm>=4.66.0
30
+ Provides-Extra: dev
31
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
32
+ Requires-Dist: ruff>=0.6.0; extra == "dev"
33
+ Requires-Dist: mypy>=1.11.0; extra == "dev"
34
+ Requires-Dist: pandas-stubs>=2.2.3.250527; extra == "dev"
35
+ Requires-Dist: types-requests>=2.32.0.20241016; extra == "dev"
36
+ Requires-Dist: build>=1.2.2; extra == "dev"
37
+ Requires-Dist: twine>=6.1.0; extra == "dev"
38
+ Dynamic: license-file
39
+
40
+ # refua-data
41
+
42
+ `refua-data` is the Refua data layer for drug discovery. It provides a curated dataset catalog, intelligent local caching, and parquet materialization optimized for downstream modeling and campaign workflows.
43
+
44
+ ## What it provides
45
+
46
+ - A built-in catalog of useful drug-discovery datasets.
47
+ - Dataset-aware download pipeline with cache reuse and metadata tracking.
48
+ - Pluggable cache backend architecture (filesystem cache by default).
49
+ - API dataset ingestion for paginated JSON endpoints (for example ChEMBL and UniProt).
50
+ - HTTP conditional refresh support (`ETag` / `Last-Modified`) when enabled.
51
+ - Incremental parquet materialization (chunked processing + partitioned parquet parts).
52
+ - CLI for listing, fetching, and materializing datasets.
53
+ - Source health checks via `validate-sources` for CI and environment diagnostics.
54
+ - Rich dataset metadata snapshots (description + usage notes) persisted in cache metadata.
55
+
56
+ ## Included datasets
57
+
58
+ The default catalog includes local-file/HTTP datasets plus API presets useful in drug discovery, including **ZINC**, **ChEMBL**, and **UniProt**.
59
+
60
+ 1. `zinc15_250k` (ZINC)
61
+ 2. `zinc15_tranche_druglike_instock` (ZINC tranche)
62
+ 3. `zinc15_tranche_druglike_agent` (ZINC tranche)
63
+ 4. `zinc15_tranche_druglike_wait_ok` (ZINC tranche)
64
+ 5. `zinc15_tranche_druglike_boutique` (ZINC tranche)
65
+ 6. `zinc15_tranche_druglike_annotated` (ZINC tranche)
66
+ 7. `tox21`
67
+ 8. `bbbp`
68
+ 9. `bace`
69
+ 10. `clintox`
70
+ 11. `sider`
71
+ 12. `hiv`
72
+ 13. `muv`
73
+ 14. `esol`
74
+ 15. `freesolv`
75
+ 16. `lipophilicity`
76
+ 17. `pcba`
77
+ 18. `chembl_activity_ki_human`
78
+ 19. `chembl_activity_ic50_human`
79
+ 20. `chembl_assays_binding_human`
80
+ 21. `chembl_targets_human_single_protein`
81
+ 22. `chembl_molecules_phase3plus`
82
+ 23. `uniprot_human_reviewed`
83
+ 24. `uniprot_human_kinases`
84
+ 25. `uniprot_human_gpcr`
85
+ 26. `uniprot_human_ion_channels`
86
+ 27. `uniprot_human_transporters`
87
+
88
+ Most of these are distributed through MoleculeNet/DeepChem mirrors and retain upstream licensing terms.
89
+ ChEMBL and UniProt presets are fetched through their public REST APIs and cached locally as JSONL.
90
+ ZINC tranche presets aggregate multiple tranche files per dataset (drug-like MW B-K and logP A-K bins,
91
+ reactivity A/B/C/E) into one cached tabular source during fetch.
92
+
93
+ ## Install
94
+
95
+ ```bash
96
+ cd refua-data
97
+ pip install -e .
98
+ ```
99
+
100
+ ## CLI quickstart
101
+
102
+ List datasets:
103
+
104
+ ```bash
105
+ refua-data list
106
+ ```
107
+
108
+ Validate all dataset sources:
109
+
110
+ ```bash
111
+ refua-data validate-sources
112
+ ```
113
+
114
+ Validate a subset and fail CI on probe failures:
115
+
116
+ ```bash
117
+ refua-data validate-sources chembl_activity_ki_human uniprot_human_kinases --fail-on-error
118
+ ```
119
+
120
+ JSON output for automation:
121
+
122
+ ```bash
123
+ refua-data validate-sources --json --fail-on-error
124
+ ```
125
+
126
+ For datasets with multiple mirrors, source validation succeeds when at least one configured source
127
+ is reachable. Failed fallback attempts are included in the result details.
128
+
129
+ Fetch raw data with cache:
130
+
131
+ ```bash
132
+ refua-data fetch zinc15_250k
133
+ ```
134
+
135
+ Fetch API-based presets:
136
+
137
+ ```bash
138
+ refua-data fetch chembl_activity_ki_human
139
+ refua-data fetch uniprot_human_kinases
140
+ ```
141
+
142
+ Materialize parquet:
143
+
144
+ ```bash
145
+ refua-data materialize zinc15_250k
146
+ ```
147
+
148
+ Refresh against remote metadata:
149
+
150
+ ```bash
151
+ refua-data fetch zinc15_250k --refresh
152
+ ```
153
+
154
+ For API datasets, `--refresh` re-runs the API query (with conditional headers on the first page when available).
155
+
156
+ ## Cache layout
157
+
158
+ By default, the cache root is:
159
+
160
+ - `~/.cache/refua-data`
161
+
162
+ Override with:
163
+
164
+ - `REFUA_DATA_HOME=/custom/path`
165
+
166
+ Layout:
167
+
168
+ - `raw/<dataset>/<version>/...` downloaded source files
169
+ - `_meta/raw/<dataset>/<version>/...json` raw metadata (`etag`, `sha256`, API request signature, rows/pages, dataset description/usage metadata)
170
+ - `parquet/<dataset>/<version>/part-*.parquet` materialized parquet parts
171
+ - `_meta/parquet/<dataset>/<version>/manifest.json` parquet manifest metadata with dataset snapshot
172
+
173
+ ## Python API
174
+
175
+ ```python
176
+ from refua_data import DatasetManager
177
+
178
+ manager = DatasetManager()
179
+ manager.fetch("zinc15_250k")
180
+ manager.fetch("chembl_activity_ki_human")
181
+ result = manager.materialize("zinc15_250k")
182
+ print(result.parquet_dir)
183
+ ```
184
+
185
+ `DataCache` is the default cache backend. You can pass a custom backend object that implements
186
+ the same interface (a `root` path attribute plus `ensure`, `raw_file`, `raw_meta`, `parquet_dir`, `parquet_manifest`,
187
+ `read_json`, `write_json`) to make storage pluggable.
188
+
189
+ ## Licensing notes
190
+
191
+ - `refua-data` package code is MIT licensed.
192
+ - Dataset content licenses are dataset-specific and controlled by upstream providers.
193
+ - Always verify dataset licensing and allowed use before redistribution or commercial deployment.
@@ -0,0 +1,154 @@
1
+ # refua-data
2
+
3
+ `refua-data` is the Refua data layer for drug discovery. It provides a curated dataset catalog, intelligent local caching, and parquet materialization optimized for downstream modeling and campaign workflows.
4
+
5
+ ## What it provides
6
+
7
+ - A built-in catalog of useful drug-discovery datasets.
8
+ - Dataset-aware download pipeline with cache reuse and metadata tracking.
9
+ - Pluggable cache backend architecture (filesystem cache by default).
10
+ - API dataset ingestion for paginated JSON endpoints (for example ChEMBL and UniProt).
11
+ - HTTP conditional refresh support (`ETag` / `Last-Modified`) when enabled.
12
+ - Incremental parquet materialization (chunked processing + partitioned parquet parts).
13
+ - CLI for listing, fetching, and materializing datasets.
14
+ - Source health checks via `validate-sources` for CI and environment diagnostics.
15
+ - Rich dataset metadata snapshots (description + usage notes) persisted in cache metadata.
16
+
17
+ ## Included datasets
18
+
19
+ The default catalog includes local-file/HTTP datasets plus API presets useful in drug discovery, including **ZINC**, **ChEMBL**, and **UniProt**.
20
+
21
+ 1. `zinc15_250k` (ZINC)
22
+ 2. `zinc15_tranche_druglike_instock` (ZINC tranche)
23
+ 3. `zinc15_tranche_druglike_agent` (ZINC tranche)
24
+ 4. `zinc15_tranche_druglike_wait_ok` (ZINC tranche)
25
+ 5. `zinc15_tranche_druglike_boutique` (ZINC tranche)
26
+ 6. `zinc15_tranche_druglike_annotated` (ZINC tranche)
27
+ 7. `tox21`
28
+ 8. `bbbp`
29
+ 9. `bace`
30
+ 10. `clintox`
31
+ 11. `sider`
32
+ 12. `hiv`
33
+ 13. `muv`
34
+ 14. `esol`
35
+ 15. `freesolv`
36
+ 16. `lipophilicity`
37
+ 17. `pcba`
38
+ 18. `chembl_activity_ki_human`
39
+ 19. `chembl_activity_ic50_human`
40
+ 20. `chembl_assays_binding_human`
41
+ 21. `chembl_targets_human_single_protein`
42
+ 22. `chembl_molecules_phase3plus`
43
+ 23. `uniprot_human_reviewed`
44
+ 24. `uniprot_human_kinases`
45
+ 25. `uniprot_human_gpcr`
46
+ 26. `uniprot_human_ion_channels`
47
+ 27. `uniprot_human_transporters`
48
+
49
+ Most of these are distributed through MoleculeNet/DeepChem mirrors and retain upstream licensing terms.
50
+ ChEMBL and UniProt presets are fetched through their public REST APIs and cached locally as JSONL.
51
+ ZINC tranche presets aggregate multiple tranche files per dataset (drug-like MW B-K and logP A-K bins,
52
+ reactivity A/B/C/E) into one cached tabular source during fetch.
53
+
54
+ ## Install
55
+
56
+ ```bash
57
+ cd refua-data
58
+ pip install -e .
59
+ ```
60
+
61
+ ## CLI quickstart
62
+
63
+ List datasets:
64
+
65
+ ```bash
66
+ refua-data list
67
+ ```
68
+
69
+ Validate all dataset sources:
70
+
71
+ ```bash
72
+ refua-data validate-sources
73
+ ```
74
+
75
+ Validate a subset and fail CI on probe failures:
76
+
77
+ ```bash
78
+ refua-data validate-sources chembl_activity_ki_human uniprot_human_kinases --fail-on-error
79
+ ```
80
+
81
+ JSON output for automation:
82
+
83
+ ```bash
84
+ refua-data validate-sources --json --fail-on-error
85
+ ```
86
+
87
+ For datasets with multiple mirrors, source validation succeeds when at least one configured source
88
+ is reachable. Failed fallback attempts are included in the result details.
89
+
90
+ Fetch raw data with cache:
91
+
92
+ ```bash
93
+ refua-data fetch zinc15_250k
94
+ ```
95
+
96
+ Fetch API-based presets:
97
+
98
+ ```bash
99
+ refua-data fetch chembl_activity_ki_human
100
+ refua-data fetch uniprot_human_kinases
101
+ ```
102
+
103
+ Materialize parquet:
104
+
105
+ ```bash
106
+ refua-data materialize zinc15_250k
107
+ ```
108
+
109
+ Refresh against remote metadata:
110
+
111
+ ```bash
112
+ refua-data fetch zinc15_250k --refresh
113
+ ```
114
+
115
+ For API datasets, `--refresh` re-runs the API query (with conditional headers on the first page when available).
116
+
117
+ ## Cache layout
118
+
119
+ By default, the cache root is:
120
+
121
+ - `~/.cache/refua-data`
122
+
123
+ Override with:
124
+
125
+ - `REFUA_DATA_HOME=/custom/path`
126
+
127
+ Layout:
128
+
129
+ - `raw/<dataset>/<version>/...` downloaded source files
130
+ - `_meta/raw/<dataset>/<version>/...json` raw metadata (`etag`, `sha256`, API request signature, rows/pages, dataset description/usage metadata)
131
+ - `parquet/<dataset>/<version>/part-*.parquet` materialized parquet parts
132
+ - `_meta/parquet/<dataset>/<version>/manifest.json` parquet manifest metadata with dataset snapshot
133
+
134
+ ## Python API
135
+
136
+ ```python
137
+ from refua_data import DatasetManager
138
+
139
+ manager = DatasetManager()
140
+ manager.fetch("zinc15_250k")
141
+ manager.fetch("chembl_activity_ki_human")
142
+ result = manager.materialize("zinc15_250k")
143
+ print(result.parquet_dir)
144
+ ```
145
+
146
+ `DataCache` is the default cache backend. You can pass a custom backend object that implements
147
+ the same interface (a `root` path attribute plus `ensure`, `raw_file`, `raw_meta`, `parquet_dir`, `parquet_manifest`,
148
+ `read_json`, `write_json`) to make storage pluggable.
149
+
150
+ ## Licensing notes
151
+
152
+ - `refua-data` package code is MIT licensed.
153
+ - Dataset content licenses are dataset-specific and controlled by upstream providers.
154
+ - Always verify dataset licensing and allowed use before redistribution or commercial deployment.
@@ -0,0 +1,84 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "refua-data"
7
+ version = "0.6.0"
8
+ description = "Data ingestion, caching, and parquet materialization for the Refua drug discovery ecosystem."
9
+ readme = "README.md"
10
+ requires-python = ">=3.11,<3.15"
11
+ license = "MIT"
12
+ license-files = ["LICENSE"]
13
+ authors = [{name = "JJ Ben-Joseph", email = "jj@tensorspace.ai"}]
14
+ keywords = [
15
+ "drug discovery",
16
+ "data engineering",
17
+ "cheminformatics",
18
+ "bioinformatics",
19
+ "parquet",
20
+ "refua",
21
+ ]
22
+ classifiers = [
23
+ "Development Status :: 4 - Beta",
24
+ "Intended Audience :: Science/Research",
25
+ "Operating System :: OS Independent",
26
+ "Programming Language :: Python :: 3",
27
+ "Programming Language :: Python :: 3.11",
28
+ "Programming Language :: Python :: 3.12",
29
+ "Programming Language :: Python :: 3.13",
30
+ "Programming Language :: Python :: 3.14",
31
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
32
+ "Topic :: Scientific/Engineering :: Chemistry",
33
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
34
+ ]
35
+ dependencies = [
36
+ "pandas>=2.2.2",
37
+ "pyarrow>=18.0.0",
38
+ "requests>=2.32.3",
39
+ "tqdm>=4.66.0",
40
+ ]
41
+
42
+ [project.urls]
43
+ Homepage = "https://agentcures.com/"
44
+ Repository = "https://github.com/agentcures/refua"
45
+ Documentation = "https://github.com/agentcures/refua#readme"
46
+ Issues = "https://github.com/agentcures/refua/issues"
47
+
48
+ [project.optional-dependencies]
49
+ dev = [
50
+ "pytest>=8.0.0",
51
+ "ruff>=0.6.0",
52
+ "mypy>=1.11.0",
53
+ "pandas-stubs>=2.2.3.250527",
54
+ "types-requests>=2.32.0.20241016",
55
+ "build>=1.2.2",
56
+ "twine>=6.1.0",
57
+ ]
58
+
59
+ [project.scripts]
60
+ refua-data = "refua_data.cli:main"
61
+
62
+ [tool.setuptools.packages.find]
63
+ where = ["src"]
64
+
65
+ [tool.setuptools]
66
+ include-package-data = true
67
+
68
+ [tool.pytest.ini_options]
69
+ pythonpath = ["src"]
70
+ testpaths = ["tests"]
71
+
72
+ [tool.ruff]
73
+ line-length = 100
74
+ target-version = "py311"
75
+
76
+ [tool.ruff.lint]
77
+ select = ["E", "F", "I", "UP", "B"]
78
+
79
+ [tool.mypy]
80
+ python_version = "3.11"
81
+ warn_unused_ignores = true
82
+ disallow_untyped_defs = true
83
+ check_untyped_defs = true
84
+ strict_optional = true
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,20 @@
1
+ """refua-data package API."""
2
+
3
+ from .cache import CacheBackend, DataCache
4
+ from .catalog import DatasetCatalog, get_default_catalog
5
+ from .models import ApiDatasetConfig, DatasetDefinition, FetchResult, MaterializeResult
6
+ from .pipeline import DatasetManager
7
+ from .validation import SourceValidationResult
8
+
9
+ __all__ = [
10
+ "ApiDatasetConfig",
11
+ "CacheBackend",
12
+ "DataCache",
13
+ "DatasetCatalog",
14
+ "DatasetDefinition",
15
+ "DatasetManager",
16
+ "FetchResult",
17
+ "MaterializeResult",
18
+ "SourceValidationResult",
19
+ "get_default_catalog",
20
+ ]
@@ -0,0 +1,6 @@
1
+ """Module runner for `python -m refua_data`."""
2
+
3
+ from .cli import main
4
+
5
+ if __name__ == "__main__":
6
+ raise SystemExit(main())
@@ -0,0 +1,106 @@
1
+ """Filesystem cache primitives for dataset files and metadata."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ import json
7
+ import os
8
+ from pathlib import Path
9
+ from typing import Any, Protocol
10
+
11
+ from .config import default_cache_root
12
+ from .models import DatasetDefinition
13
+
14
+
15
class CacheBackend(Protocol):
    """Structural interface the pipeline expects from a cache backend.

    Any object exposing a ``root`` attribute and the methods below can stand in
    for the filesystem-backed ``DataCache`` defined in this module.
    """

    # Base location under which the backend keeps its artifacts.
    root: Path

    def ensure(self) -> None:
        """Prepare the backend storage so later reads and writes can succeed."""

    def raw_file(self, dataset: DatasetDefinition) -> Path:
        """Return the path of the raw source file for ``dataset``."""

    def raw_meta(self, dataset: DatasetDefinition) -> Path:
        """Return the path of the raw-file metadata document for ``dataset``."""

    def parquet_dir(self, dataset: DatasetDefinition) -> Path:
        """Return the directory holding the parquet output for ``dataset``."""

    def parquet_manifest(self, dataset: DatasetDefinition) -> Path:
        """Return the path of the parquet manifest document for ``dataset``."""

    def read_json(self, path: Path) -> dict[str, Any] | None:
        """Return the JSON document stored at ``path``, or ``None`` if absent."""

    def write_json(self, path: Path, payload: dict[str, Any]) -> None:
        """Persist ``payload`` as a JSON document at ``path``."""
40
+
41
+
42
class DataCache:
    """Filesystem-backed cache backend for raw + parquet artifacts.

    Paths produced under ``root``:

    - ``raw/<dataset_id>/<version>/<filename>``
    - ``_meta/raw/<dataset_id>/<version>/<filename>.json``
    - ``parquet/<dataset_id>/<version>/``
    - ``_meta/parquet/<dataset_id>/<version>/manifest.json``
    """

    def __init__(self, root: Path | None = None):
        """Bind the cache to ``root``, or to ``default_cache_root()`` when omitted.

        The chosen path is user-expanded (``~``) and resolved to an absolute path.
        """
        self.root = (root or default_cache_root()).expanduser().resolve()

    def ensure(self) -> None:
        """Create required cache root directories."""
        self.root.mkdir(parents=True, exist_ok=True)
        for parts in (("raw",), ("parquet",), ("_meta", "raw"), ("_meta", "parquet")):
            self.root.joinpath(*parts).mkdir(parents=True, exist_ok=True)

    def raw_file(self, dataset: DatasetDefinition) -> Path:
        """Return raw file path for a dataset."""
        filename = dataset.preferred_filename()
        return self.root.joinpath("raw", dataset.dataset_id, dataset.version, filename)

    def raw_meta(self, dataset: DatasetDefinition) -> Path:
        """Return raw metadata path for a dataset."""
        filename = f"{dataset.preferred_filename()}.json"
        return self.root.joinpath("_meta", "raw", dataset.dataset_id, dataset.version, filename)

    def parquet_dir(self, dataset: DatasetDefinition) -> Path:
        """Return parquet output directory for a dataset."""
        return self.root.joinpath("parquet", dataset.dataset_id, dataset.version)

    def parquet_manifest(self, dataset: DatasetDefinition) -> Path:
        """Return parquet manifest metadata path for a dataset."""
        return self.root.joinpath(
            "_meta",
            "parquet",
            dataset.dataset_id,
            dataset.version,
            "manifest.json",
        )

    def read_json(self, path: Path) -> dict[str, Any] | None:
        """Read JSON metadata if it exists.

        Returns ``None`` when ``path`` is missing; otherwise returns the decoded
        document. Malformed JSON still raises ``json.JSONDecodeError``.
        """
        # Read first and handle absence afterwards: a separate exists() check
        # would crash if the file were removed between the check and the read.
        try:
            text = path.read_text(encoding="utf-8")
        except (FileNotFoundError, NotADirectoryError):
            return None
        return json.loads(text)

    def write_json(self, path: Path, payload: dict[str, Any]) -> None:
        """Write JSON metadata atomically.

        The document is written to a sibling ``*.tmp`` file and then moved over
        ``path`` with ``os.replace``. If writing or replacing fails, the temp
        file is removed and the original error is re-raised.
        """
        # Serialize before touching the filesystem so an unserializable payload
        # fails without creating anything on disk.
        text = json.dumps(payload, indent=2, sort_keys=True)
        path.parent.mkdir(parents=True, exist_ok=True)
        tmp_path = path.with_suffix(path.suffix + ".tmp")
        try:
            tmp_path.write_text(text, encoding="utf-8")
            os.replace(tmp_path, path)
        except BaseException:
            # Do not leave a stray temp file behind on failure or interruption.
            tmp_path.unlink(missing_ok=True)
            raise
92
+
93
+
94
+ _CHUNK_SIZE = 4 * 1024 * 1024
95
+
96
+
97
+ def sha256_file(path: Path) -> str:
98
+ """Compute the SHA256 checksum of a file."""
99
+ digest = hashlib.sha256()
100
+ with path.open("rb") as handle:
101
+ while True:
102
+ chunk = handle.read(_CHUNK_SIZE)
103
+ if not chunk:
104
+ break
105
+ digest.update(chunk)
106
+ return digest.hexdigest()