refua-data 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- refua_data-0.6.0/LICENSE +21 -0
- refua_data-0.6.0/PKG-INFO +193 -0
- refua_data-0.6.0/README.md +154 -0
- refua_data-0.6.0/pyproject.toml +84 -0
- refua_data-0.6.0/setup.cfg +4 -0
- refua_data-0.6.0/src/refua_data/__init__.py +20 -0
- refua_data-0.6.0/src/refua_data/__main__.py +6 -0
- refua_data-0.6.0/src/refua_data/cache.py +106 -0
- refua_data-0.6.0/src/refua_data/catalog.py +651 -0
- refua_data-0.6.0/src/refua_data/cli.py +354 -0
- refua_data-0.6.0/src/refua_data/config.py +21 -0
- refua_data-0.6.0/src/refua_data/downloader.py +781 -0
- refua_data-0.6.0/src/refua_data/io.py +112 -0
- refua_data-0.6.0/src/refua_data/models.py +160 -0
- refua_data-0.6.0/src/refua_data/pipeline.py +233 -0
- refua_data-0.6.0/src/refua_data/validation.py +346 -0
- refua_data-0.6.0/src/refua_data.egg-info/PKG-INFO +193 -0
- refua_data-0.6.0/src/refua_data.egg-info/SOURCES.txt +26 -0
- refua_data-0.6.0/src/refua_data.egg-info/dependency_links.txt +1 -0
- refua_data-0.6.0/src/refua_data.egg-info/entry_points.txt +2 -0
- refua_data-0.6.0/src/refua_data.egg-info/requires.txt +13 -0
- refua_data-0.6.0/src/refua_data.egg-info/top_level.txt +1 -0
- refua_data-0.6.0/tests/test_cache_backend.py +70 -0
- refua_data-0.6.0/tests/test_catalog.py +13 -0
- refua_data-0.6.0/tests/test_fetch_api.py +172 -0
- refua_data-0.6.0/tests/test_fetch_cache.py +110 -0
- refua_data-0.6.0/tests/test_materialize.py +53 -0
- refua_data-0.6.0/tests/test_validate_sources.py +179 -0
refua_data-0.6.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 JJ Ben-Joseph
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: refua-data
|
|
3
|
+
Version: 0.6.0
|
|
4
|
+
Summary: Data ingestion, caching, and parquet materialization for the Refua drug discovery ecosystem.
|
|
5
|
+
Author-email: JJ Ben-Joseph <jj@tensorspace.ai>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://agentcures.com/
|
|
8
|
+
Project-URL: Repository, https://github.com/agentcures/refua
|
|
9
|
+
Project-URL: Documentation, https://github.com/agentcures/refua#readme
|
|
10
|
+
Project-URL: Issues, https://github.com/agentcures/refua/issues
|
|
11
|
+
Keywords: drug discovery,data engineering,cheminformatics,bioinformatics,parquet,refua
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Chemistry
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
23
|
+
Requires-Python: <3.15,>=3.11
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
License-File: LICENSE
|
|
26
|
+
Requires-Dist: pandas>=2.2.2
|
|
27
|
+
Requires-Dist: pyarrow>=18.0.0
|
|
28
|
+
Requires-Dist: requests>=2.32.3
|
|
29
|
+
Requires-Dist: tqdm>=4.66.0
|
|
30
|
+
Provides-Extra: dev
|
|
31
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
32
|
+
Requires-Dist: ruff>=0.6.0; extra == "dev"
|
|
33
|
+
Requires-Dist: mypy>=1.11.0; extra == "dev"
|
|
34
|
+
Requires-Dist: pandas-stubs>=2.2.3.250527; extra == "dev"
|
|
35
|
+
Requires-Dist: types-requests>=2.32.0.20241016; extra == "dev"
|
|
36
|
+
Requires-Dist: build>=1.2.2; extra == "dev"
|
|
37
|
+
Requires-Dist: twine>=6.1.0; extra == "dev"
|
|
38
|
+
Dynamic: license-file
|
|
39
|
+
|
|
40
|
+
# refua-data
|
|
41
|
+
|
|
42
|
+
`refua-data` is the Refua data layer for drug discovery. It provides a curated dataset catalog, intelligent local caching, and parquet materialization optimized for downstream modeling and campaign workflows.
|
|
43
|
+
|
|
44
|
+
## What it provides
|
|
45
|
+
|
|
46
|
+
- A built-in catalog of useful drug-discovery datasets.
|
|
47
|
+
- Dataset-aware download pipeline with cache reuse and metadata tracking.
|
|
48
|
+
- Pluggable cache backend architecture (filesystem cache by default).
|
|
49
|
+
- API dataset ingestion for paginated JSON endpoints (for example ChEMBL and UniProt).
|
|
50
|
+
- HTTP conditional refresh support (`ETag` / `Last-Modified`) when enabled.
|
|
51
|
+
- Incremental parquet materialization (chunked processing + partitioned parquet parts).
|
|
52
|
+
- CLI for listing, fetching, and materializing datasets.
|
|
53
|
+
- Source health checks via `validate-sources` for CI and environment diagnostics.
|
|
54
|
+
- Rich dataset metadata snapshots (description + usage notes) persisted in cache metadata.
|
|
55
|
+
|
|
56
|
+
## Included datasets
|
|
57
|
+
|
|
58
|
+
The default catalog includes local-file/HTTP datasets plus API presets useful in drug discovery, including **ZINC**, **ChEMBL**, and **UniProt**.
|
|
59
|
+
|
|
60
|
+
1. `zinc15_250k` (ZINC)
|
|
61
|
+
2. `zinc15_tranche_druglike_instock` (ZINC tranche)
|
|
62
|
+
3. `zinc15_tranche_druglike_agent` (ZINC tranche)
|
|
63
|
+
4. `zinc15_tranche_druglike_wait_ok` (ZINC tranche)
|
|
64
|
+
5. `zinc15_tranche_druglike_boutique` (ZINC tranche)
|
|
65
|
+
6. `zinc15_tranche_druglike_annotated` (ZINC tranche)
|
|
66
|
+
7. `tox21`
|
|
67
|
+
8. `bbbp`
|
|
68
|
+
9. `bace`
|
|
69
|
+
10. `clintox`
|
|
70
|
+
11. `sider`
|
|
71
|
+
12. `hiv`
|
|
72
|
+
13. `muv`
|
|
73
|
+
14. `esol`
|
|
74
|
+
15. `freesolv`
|
|
75
|
+
16. `lipophilicity`
|
|
76
|
+
17. `pcba`
|
|
77
|
+
18. `chembl_activity_ki_human`
|
|
78
|
+
19. `chembl_activity_ic50_human`
|
|
79
|
+
20. `chembl_assays_binding_human`
|
|
80
|
+
21. `chembl_targets_human_single_protein`
|
|
81
|
+
22. `chembl_molecules_phase3plus`
|
|
82
|
+
23. `uniprot_human_reviewed`
|
|
83
|
+
24. `uniprot_human_kinases`
|
|
84
|
+
25. `uniprot_human_gpcr`
|
|
85
|
+
26. `uniprot_human_ion_channels`
|
|
86
|
+
27. `uniprot_human_transporters`
|
|
87
|
+
|
|
88
|
+
Most of these are distributed through MoleculeNet/DeepChem mirrors and retain upstream licensing terms.
|
|
89
|
+
ChEMBL and UniProt presets are fetched through their public REST APIs and cached locally as JSONL.
|
|
90
|
+
ZINC tranche presets aggregate multiple tranche files per dataset (drug-like MW B-K and logP A-K bins,
|
|
91
|
+
reactivity A/B/C/E) into one cached tabular source during fetch.
|
|
92
|
+
|
|
93
|
+
## Install
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
cd refua-data
|
|
97
|
+
pip install -e .
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## CLI quickstart
|
|
101
|
+
|
|
102
|
+
List datasets:
|
|
103
|
+
|
|
104
|
+
```bash
|
|
105
|
+
refua-data list
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
Validate all dataset sources:
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
refua-data validate-sources
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
Validate a subset and fail CI on probe failures:
|
|
115
|
+
|
|
116
|
+
```bash
|
|
117
|
+
refua-data validate-sources chembl_activity_ki_human uniprot_human_kinases --fail-on-error
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
JSON output for automation:
|
|
121
|
+
|
|
122
|
+
```bash
|
|
123
|
+
refua-data validate-sources --json --fail-on-error
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
For datasets with multiple mirrors, source validation succeeds when at least one configured source
|
|
127
|
+
is reachable. Failed fallback attempts are included in the result details.
|
|
128
|
+
|
|
129
|
+
Fetch raw data with cache:
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
refua-data fetch zinc15_250k
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
Fetch API-based presets:
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
refua-data fetch chembl_activity_ki_human
|
|
139
|
+
refua-data fetch uniprot_human_kinases
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
Materialize parquet:
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
refua-data materialize zinc15_250k
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
Refresh against remote metadata:
|
|
149
|
+
|
|
150
|
+
```bash
|
|
151
|
+
refua-data fetch zinc15_250k --refresh
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
For API datasets, `--refresh` re-runs the API query (with conditional headers on first page when available).
|
|
155
|
+
|
|
156
|
+
## Cache layout
|
|
157
|
+
|
|
158
|
+
By default, cache root is:
|
|
159
|
+
|
|
160
|
+
- `~/.cache/refua-data`
|
|
161
|
+
|
|
162
|
+
Override with:
|
|
163
|
+
|
|
164
|
+
- `REFUA_DATA_HOME=/custom/path`
|
|
165
|
+
|
|
166
|
+
Layout:
|
|
167
|
+
|
|
168
|
+
- `raw/<dataset>/<version>/...` downloaded source files
|
|
169
|
+
- `_meta/raw/<dataset>/<version>/...json` raw metadata (`etag`, `sha256`, API request signature, rows/pages, dataset description/usage metadata)
|
|
170
|
+
- `parquet/<dataset>/<version>/part-*.parquet` materialized parquet parts
|
|
171
|
+
- `_meta/parquet/<dataset>/<version>/manifest.json` parquet manifest metadata with dataset snapshot
|
|
172
|
+
|
|
173
|
+
## Python API
|
|
174
|
+
|
|
175
|
+
```python
|
|
176
|
+
from refua_data import DatasetManager
|
|
177
|
+
|
|
178
|
+
manager = DatasetManager()
|
|
179
|
+
manager.fetch("zinc15_250k")
|
|
180
|
+
manager.fetch("chembl_activity_ki_human")
|
|
181
|
+
result = manager.materialize("zinc15_250k")
|
|
182
|
+
print(result.parquet_dir)
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
`DataCache` is the default cache backend. You can pass a custom backend object that implements
|
|
186
|
+
the same interface (`ensure`, `raw_file`, `raw_meta`, `parquet_dir`, `parquet_manifest`,
|
|
187
|
+
`read_json`, `write_json`) to make storage pluggable.
|
|
188
|
+
|
|
189
|
+
## Licensing notes
|
|
190
|
+
|
|
191
|
+
- `refua-data` package code is MIT licensed.
|
|
192
|
+
- Dataset content licenses are dataset-specific and controlled by upstream providers.
|
|
193
|
+
- Always verify dataset licensing and allowed use before redistribution or commercial deployment.
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
# refua-data
|
|
2
|
+
|
|
3
|
+
`refua-data` is the Refua data layer for drug discovery. It provides a curated dataset catalog, intelligent local caching, and parquet materialization optimized for downstream modeling and campaign workflows.
|
|
4
|
+
|
|
5
|
+
## What it provides
|
|
6
|
+
|
|
7
|
+
- A built-in catalog of useful drug-discovery datasets.
|
|
8
|
+
- Dataset-aware download pipeline with cache reuse and metadata tracking.
|
|
9
|
+
- Pluggable cache backend architecture (filesystem cache by default).
|
|
10
|
+
- API dataset ingestion for paginated JSON endpoints (for example ChEMBL and UniProt).
|
|
11
|
+
- HTTP conditional refresh support (`ETag` / `Last-Modified`) when enabled.
|
|
12
|
+
- Incremental parquet materialization (chunked processing + partitioned parquet parts).
|
|
13
|
+
- CLI for listing, fetching, and materializing datasets.
|
|
14
|
+
- Source health checks via `validate-sources` for CI and environment diagnostics.
|
|
15
|
+
- Rich dataset metadata snapshots (description + usage notes) persisted in cache metadata.
|
|
16
|
+
|
|
17
|
+
## Included datasets
|
|
18
|
+
|
|
19
|
+
The default catalog includes local-file/HTTP datasets plus API presets useful in drug discovery, including **ZINC**, **ChEMBL**, and **UniProt**.
|
|
20
|
+
|
|
21
|
+
1. `zinc15_250k` (ZINC)
|
|
22
|
+
2. `zinc15_tranche_druglike_instock` (ZINC tranche)
|
|
23
|
+
3. `zinc15_tranche_druglike_agent` (ZINC tranche)
|
|
24
|
+
4. `zinc15_tranche_druglike_wait_ok` (ZINC tranche)
|
|
25
|
+
5. `zinc15_tranche_druglike_boutique` (ZINC tranche)
|
|
26
|
+
6. `zinc15_tranche_druglike_annotated` (ZINC tranche)
|
|
27
|
+
7. `tox21`
|
|
28
|
+
8. `bbbp`
|
|
29
|
+
9. `bace`
|
|
30
|
+
10. `clintox`
|
|
31
|
+
11. `sider`
|
|
32
|
+
12. `hiv`
|
|
33
|
+
13. `muv`
|
|
34
|
+
14. `esol`
|
|
35
|
+
15. `freesolv`
|
|
36
|
+
16. `lipophilicity`
|
|
37
|
+
17. `pcba`
|
|
38
|
+
18. `chembl_activity_ki_human`
|
|
39
|
+
19. `chembl_activity_ic50_human`
|
|
40
|
+
20. `chembl_assays_binding_human`
|
|
41
|
+
21. `chembl_targets_human_single_protein`
|
|
42
|
+
22. `chembl_molecules_phase3plus`
|
|
43
|
+
23. `uniprot_human_reviewed`
|
|
44
|
+
24. `uniprot_human_kinases`
|
|
45
|
+
25. `uniprot_human_gpcr`
|
|
46
|
+
26. `uniprot_human_ion_channels`
|
|
47
|
+
27. `uniprot_human_transporters`
|
|
48
|
+
|
|
49
|
+
Most of these are distributed through MoleculeNet/DeepChem mirrors and retain upstream licensing terms.
|
|
50
|
+
ChEMBL and UniProt presets are fetched through their public REST APIs and cached locally as JSONL.
|
|
51
|
+
ZINC tranche presets aggregate multiple tranche files per dataset (drug-like MW B-K and logP A-K bins,
|
|
52
|
+
reactivity A/B/C/E) into one cached tabular source during fetch.
|
|
53
|
+
|
|
54
|
+
## Install
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
cd refua-data
|
|
58
|
+
pip install -e .
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## CLI quickstart
|
|
62
|
+
|
|
63
|
+
List datasets:
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
refua-data list
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
Validate all dataset sources:
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
refua-data validate-sources
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Validate a subset and fail CI on probe failures:
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
refua-data validate-sources chembl_activity_ki_human uniprot_human_kinases --fail-on-error
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
JSON output for automation:
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
refua-data validate-sources --json --fail-on-error
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
For datasets with multiple mirrors, source validation succeeds when at least one configured source
|
|
88
|
+
is reachable. Failed fallback attempts are included in the result details.
|
|
89
|
+
|
|
90
|
+
Fetch raw data with cache:
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
refua-data fetch zinc15_250k
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
Fetch API-based presets:
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
refua-data fetch chembl_activity_ki_human
|
|
100
|
+
refua-data fetch uniprot_human_kinases
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Materialize parquet:
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
refua-data materialize zinc15_250k
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
Refresh against remote metadata:
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
refua-data fetch zinc15_250k --refresh
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
For API datasets, `--refresh` re-runs the API query (with conditional headers on first page when available).
|
|
116
|
+
|
|
117
|
+
## Cache layout
|
|
118
|
+
|
|
119
|
+
By default, cache root is:
|
|
120
|
+
|
|
121
|
+
- `~/.cache/refua-data`
|
|
122
|
+
|
|
123
|
+
Override with:
|
|
124
|
+
|
|
125
|
+
- `REFUA_DATA_HOME=/custom/path`
|
|
126
|
+
|
|
127
|
+
Layout:
|
|
128
|
+
|
|
129
|
+
- `raw/<dataset>/<version>/...` downloaded source files
|
|
130
|
+
- `_meta/raw/<dataset>/<version>/...json` raw metadata (`etag`, `sha256`, API request signature, rows/pages, dataset description/usage metadata)
|
|
131
|
+
- `parquet/<dataset>/<version>/part-*.parquet` materialized parquet parts
|
|
132
|
+
- `_meta/parquet/<dataset>/<version>/manifest.json` parquet manifest metadata with dataset snapshot
|
|
133
|
+
|
|
134
|
+
## Python API
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
from refua_data import DatasetManager
|
|
138
|
+
|
|
139
|
+
manager = DatasetManager()
|
|
140
|
+
manager.fetch("zinc15_250k")
|
|
141
|
+
manager.fetch("chembl_activity_ki_human")
|
|
142
|
+
result = manager.materialize("zinc15_250k")
|
|
143
|
+
print(result.parquet_dir)
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
`DataCache` is the default cache backend. You can pass a custom backend object that implements
|
|
147
|
+
the same interface (`ensure`, `raw_file`, `raw_meta`, `parquet_dir`, `parquet_manifest`,
|
|
148
|
+
`read_json`, `write_json`) to make storage pluggable.
|
|
149
|
+
|
|
150
|
+
## Licensing notes
|
|
151
|
+
|
|
152
|
+
- `refua-data` package code is MIT licensed.
|
|
153
|
+
- Dataset content licenses are dataset-specific and controlled by upstream providers.
|
|
154
|
+
- Always verify dataset licensing and allowed use before redistribution or commercial deployment.
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "refua-data"
|
|
7
|
+
version = "0.6.0"
|
|
8
|
+
description = "Data ingestion, caching, and parquet materialization for the Refua drug discovery ecosystem."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.11,<3.15"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
license-files = ["LICENSE"]
|
|
13
|
+
authors = [{name = "JJ Ben-Joseph", email = "jj@tensorspace.ai"}]
|
|
14
|
+
keywords = [
|
|
15
|
+
"drug discovery",
|
|
16
|
+
"data engineering",
|
|
17
|
+
"cheminformatics",
|
|
18
|
+
"bioinformatics",
|
|
19
|
+
"parquet",
|
|
20
|
+
"refua",
|
|
21
|
+
]
|
|
22
|
+
classifiers = [
|
|
23
|
+
"Development Status :: 4 - Beta",
|
|
24
|
+
"Intended Audience :: Science/Research",
|
|
25
|
+
"Operating System :: OS Independent",
|
|
26
|
+
"Programming Language :: Python :: 3",
|
|
27
|
+
"Programming Language :: Python :: 3.11",
|
|
28
|
+
"Programming Language :: Python :: 3.12",
|
|
29
|
+
"Programming Language :: Python :: 3.13",
|
|
30
|
+
"Programming Language :: Python :: 3.14",
|
|
31
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
32
|
+
"Topic :: Scientific/Engineering :: Chemistry",
|
|
33
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
34
|
+
]
|
|
35
|
+
dependencies = [
|
|
36
|
+
"pandas>=2.2.2",
|
|
37
|
+
"pyarrow>=18.0.0",
|
|
38
|
+
"requests>=2.32.3",
|
|
39
|
+
"tqdm>=4.66.0",
|
|
40
|
+
]
|
|
41
|
+
|
|
42
|
+
[project.urls]
|
|
43
|
+
Homepage = "https://agentcures.com/"
|
|
44
|
+
Repository = "https://github.com/agentcures/refua"
|
|
45
|
+
Documentation = "https://github.com/agentcures/refua#readme"
|
|
46
|
+
Issues = "https://github.com/agentcures/refua/issues"
|
|
47
|
+
|
|
48
|
+
[project.optional-dependencies]
|
|
49
|
+
dev = [
|
|
50
|
+
"pytest>=8.0.0",
|
|
51
|
+
"ruff>=0.6.0",
|
|
52
|
+
"mypy>=1.11.0",
|
|
53
|
+
"pandas-stubs>=2.2.3.250527",
|
|
54
|
+
"types-requests>=2.32.0.20241016",
|
|
55
|
+
"build>=1.2.2",
|
|
56
|
+
"twine>=6.1.0",
|
|
57
|
+
]
|
|
58
|
+
|
|
59
|
+
[project.scripts]
|
|
60
|
+
refua-data = "refua_data.cli:main"
|
|
61
|
+
|
|
62
|
+
[tool.setuptools.packages.find]
|
|
63
|
+
where = ["src"]
|
|
64
|
+
|
|
65
|
+
[tool.setuptools]
|
|
66
|
+
include-package-data = true
|
|
67
|
+
|
|
68
|
+
[tool.pytest.ini_options]
|
|
69
|
+
pythonpath = ["src"]
|
|
70
|
+
testpaths = ["tests"]
|
|
71
|
+
|
|
72
|
+
[tool.ruff]
|
|
73
|
+
line-length = 100
|
|
74
|
+
target-version = "py311"
|
|
75
|
+
|
|
76
|
+
[tool.ruff.lint]
|
|
77
|
+
select = ["E", "F", "I", "UP", "B"]
|
|
78
|
+
|
|
79
|
+
[tool.mypy]
|
|
80
|
+
python_version = "3.11"
|
|
81
|
+
warn_unused_ignores = true
|
|
82
|
+
disallow_untyped_defs = true
|
|
83
|
+
check_untyped_defs = true
|
|
84
|
+
strict_optional = true
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""refua-data package API."""
|
|
2
|
+
|
|
3
|
+
from .cache import CacheBackend, DataCache
|
|
4
|
+
from .catalog import DatasetCatalog, get_default_catalog
|
|
5
|
+
from .models import ApiDatasetConfig, DatasetDefinition, FetchResult, MaterializeResult
|
|
6
|
+
from .pipeline import DatasetManager
|
|
7
|
+
from .validation import SourceValidationResult
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"ApiDatasetConfig",
|
|
11
|
+
"CacheBackend",
|
|
12
|
+
"DataCache",
|
|
13
|
+
"DatasetCatalog",
|
|
14
|
+
"DatasetDefinition",
|
|
15
|
+
"DatasetManager",
|
|
16
|
+
"FetchResult",
|
|
17
|
+
"MaterializeResult",
|
|
18
|
+
"SourceValidationResult",
|
|
19
|
+
"get_default_catalog",
|
|
20
|
+
]
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""Filesystem cache primitives for dataset files and metadata."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
import json
|
|
7
|
+
import os
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any, Protocol
|
|
10
|
+
|
|
11
|
+
from .config import default_cache_root
|
|
12
|
+
from .models import DatasetDefinition
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class CacheBackend(Protocol):
    """Protocol for pluggable cache backends used by the pipeline.

    Any object implementing these methods (see ``DataCache`` for the default
    filesystem implementation) can be supplied to the pipeline to make
    storage pluggable.
    """

    # Root path under which the backend stores all cache artifacts.
    root: Path

    def ensure(self) -> None:
        """Create any directories/resources the backend needs before use."""
        ...

    def raw_file(self, dataset: DatasetDefinition) -> Path:
        """Return the path of the raw (downloaded) file for *dataset*."""
        ...

    def raw_meta(self, dataset: DatasetDefinition) -> Path:
        """Return the path of the raw-file metadata sidecar for *dataset*."""
        ...

    def parquet_dir(self, dataset: DatasetDefinition) -> Path:
        """Return the directory holding materialized parquet parts for *dataset*."""
        ...

    def parquet_manifest(self, dataset: DatasetDefinition) -> Path:
        """Return the path of the parquet manifest metadata for *dataset*."""
        ...

    def read_json(self, path: Path) -> dict[str, Any] | None:
        """Read JSON metadata at *path*; return ``None`` when it does not exist."""
        ...

    def write_json(self, path: Path, payload: dict[str, Any]) -> None:
        """Persist *payload* as JSON metadata at *path*."""
        ...
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class DataCache:
    """Filesystem-backed cache backend for raw + parquet artifacts.

    Implements the ``CacheBackend`` protocol on top of a single root
    directory with the layout::

        raw/<dataset>/<version>/...                  downloaded source files
        parquet/<dataset>/<version>/part-*.parquet   materialized parquet parts
        _meta/raw/<dataset>/<version>/...json        raw metadata sidecars
        _meta/parquet/<dataset>/<version>/manifest.json
    """

    def __init__(self, root: Path | None = None):
        """Initialize the cache rooted at *root* (default: package cache root).

        The path is expanded and resolved eagerly so later ``joinpath`` calls
        remain stable even if the process changes its working directory.
        """
        self.root = (root or default_cache_root()).expanduser().resolve()

    def ensure(self) -> None:
        """Create required cache root directories (idempotent)."""
        self.root.mkdir(parents=True, exist_ok=True)
        self.root.joinpath("raw").mkdir(parents=True, exist_ok=True)
        self.root.joinpath("parquet").mkdir(parents=True, exist_ok=True)
        self.root.joinpath("_meta", "raw").mkdir(parents=True, exist_ok=True)
        self.root.joinpath("_meta", "parquet").mkdir(parents=True, exist_ok=True)

    def raw_file(self, dataset: DatasetDefinition) -> Path:
        """Return the raw (downloaded) file path for *dataset*."""
        filename = dataset.preferred_filename()
        return self.root.joinpath("raw", dataset.dataset_id, dataset.version, filename)

    def raw_meta(self, dataset: DatasetDefinition) -> Path:
        """Return the raw metadata (JSON sidecar) path for *dataset*."""
        filename = f"{dataset.preferred_filename()}.json"
        return self.root.joinpath("_meta", "raw", dataset.dataset_id, dataset.version, filename)

    def parquet_dir(self, dataset: DatasetDefinition) -> Path:
        """Return the parquet output directory for *dataset*."""
        return self.root.joinpath("parquet", dataset.dataset_id, dataset.version)

    def parquet_manifest(self, dataset: DatasetDefinition) -> Path:
        """Return the parquet manifest metadata path for *dataset*."""
        return self.root.joinpath(
            "_meta",
            "parquet",
            dataset.dataset_id,
            dataset.version,
            "manifest.json",
        )

    def read_json(self, path: Path) -> dict[str, Any] | None:
        """Read JSON metadata from *path*, or return ``None`` if absent.

        Uses EAFP (read-and-catch ``FileNotFoundError``) instead of an
        ``exists()`` pre-check, which avoided a check-then-read race with
        concurrent writers/removers.
        """
        try:
            return json.loads(path.read_text(encoding="utf-8"))
        except FileNotFoundError:
            return None

    def write_json(self, path: Path, payload: dict[str, Any]) -> None:
        """Write JSON metadata atomically (write a temp file, then rename).

        ``os.replace`` performs an atomic same-filesystem rename, so readers
        never observe a partially written file.
        """
        path.parent.mkdir(parents=True, exist_ok=True)
        tmp_path = path.with_suffix(path.suffix + ".tmp")
        tmp_path.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8")
        os.replace(tmp_path, path)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
# Hash in 4 MiB chunks to bound memory use on large artifacts.
_CHUNK_SIZE = 4 * 1024 * 1024


def sha256_file(path: Path) -> str:
    """Compute the SHA256 checksum of a file."""
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        while block := stream.read(_CHUNK_SIZE):
            hasher.update(block)
    return hasher.hexdigest()
|