deriva-ml 1.14.0__py3-none-any.whl → 1.14.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/__init__.py +25 -30
- deriva_ml/core/__init__.py +39 -0
- deriva_ml/core/base.py +1489 -0
- deriva_ml/core/constants.py +36 -0
- deriva_ml/core/definitions.py +74 -0
- deriva_ml/core/enums.py +222 -0
- deriva_ml/core/ermrest.py +288 -0
- deriva_ml/core/exceptions.py +28 -0
- deriva_ml/core/filespec.py +116 -0
- deriva_ml/dataset/__init__.py +4 -0
- deriva_ml/{dataset_aux_classes.py → dataset/aux_classes.py} +16 -12
- deriva_ml/{dataset.py → dataset/dataset.py} +406 -428
- deriva_ml/{dataset_bag.py → dataset/dataset_bag.py} +137 -97
- deriva_ml/{history.py → dataset/history.py} +51 -33
- deriva_ml/{upload.py → dataset/upload.py} +48 -70
- deriva_ml/demo_catalog.py +233 -183
- deriva_ml/execution/environment.py +290 -0
- deriva_ml/{execution.py → execution/execution.py} +365 -252
- deriva_ml/execution/execution_configuration.py +163 -0
- deriva_ml/{execution_configuration.py → execution/workflow.py} +212 -224
- deriva_ml/feature.py +83 -46
- deriva_ml/model/__init__.py +0 -0
- deriva_ml/{deriva_model.py → model/catalog.py} +113 -132
- deriva_ml/{database_model.py → model/database.py} +52 -74
- deriva_ml/model/sql_mapper.py +44 -0
- deriva_ml/run_notebook.py +19 -11
- deriva_ml/schema/__init__.py +3 -0
- deriva_ml/{schema_setup → schema}/annotations.py +31 -22
- deriva_ml/schema/check_schema.py +104 -0
- deriva_ml/{schema_setup → schema}/create_schema.py +151 -104
- deriva_ml/schema/deriva-ml-reference.json +8525 -0
- deriva_ml/schema/table_comments_utils.py +57 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/METADATA +5 -4
- deriva_ml-1.14.27.dist-info/RECORD +40 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/entry_points.txt +1 -0
- deriva_ml/deriva_definitions.py +0 -391
- deriva_ml/deriva_ml_base.py +0 -1046
- deriva_ml/execution_environment.py +0 -139
- deriva_ml/schema_setup/table_comments_utils.py +0 -56
- deriva_ml/test-files/execution-parameters.json +0 -1
- deriva_ml/test-files/notebook-parameters.json +0 -5
- deriva_ml/test_functions.py +0 -141
- deriva_ml/test_notebook.ipynb +0 -197
- deriva_ml-1.14.0.dist-info/RECORD +0 -31
- /deriva_ml/{schema_setup → execution}/__init__.py +0 -0
- /deriva_ml/{schema_setup → schema}/policy.json +0 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/WHEEL +0 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/top_level.txt +0 -0
|
@@ -36,6 +36,7 @@ Here is the directory layout we support:
|
|
|
36
36
|
"""
|
|
37
37
|
|
|
38
38
|
import json
|
|
39
|
+
import os
|
|
39
40
|
from pathlib import Path
|
|
40
41
|
from tempfile import TemporaryDirectory
|
|
41
42
|
from typing import Any, Optional
|
|
@@ -46,49 +47,41 @@ from deriva.core.ermrest_model import Table
|
|
|
46
47
|
from deriva.core.hatrac_store import HatracStore
|
|
47
48
|
from deriva.core.utils import hash_utils, mime_utils
|
|
48
49
|
from deriva.transfer.upload.deriva_upload import GenericUploader
|
|
49
|
-
from pydantic import
|
|
50
|
+
from pydantic import ConfigDict, validate_call
|
|
50
51
|
|
|
51
|
-
from deriva_ml.
|
|
52
|
+
from deriva_ml.core.definitions import (
|
|
52
53
|
RID,
|
|
53
|
-
|
|
54
|
+
DerivaSystemColumns,
|
|
54
55
|
FileUploadState,
|
|
55
56
|
UploadState,
|
|
56
|
-
DerivaSystemColumns,
|
|
57
57
|
)
|
|
58
|
-
from deriva_ml.
|
|
59
|
-
|
|
58
|
+
from deriva_ml.core.exceptions import DerivaMLException
|
|
59
|
+
from deriva_ml.model.catalog import DerivaModel
|
|
60
60
|
|
|
61
61
|
try:
|
|
62
62
|
from icecream import ic
|
|
63
63
|
except ImportError: # Graceful fallback if IceCream isn't installed.
|
|
64
64
|
ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a) # noqa
|
|
65
65
|
|
|
66
|
+
# Use os.path.sep for OS-agnostic paths in regex patterns
|
|
67
|
+
SEP = re.escape(os.path.sep)
|
|
68
|
+
upload_root_regex = f"(?i)^.*{SEP}deriva-ml"
|
|
66
69
|
|
|
67
|
-
|
|
70
|
+
exec_dir_regex = upload_root_regex + f"{SEP}execution{SEP}(?P<execution_rid>[-\\w]+)"
|
|
68
71
|
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
feature_dir_regex = exec_dir_regex + r"/feature"
|
|
72
|
+
feature_dir_regex = exec_dir_regex + f"{SEP}feature"
|
|
72
73
|
feature_table_dir_regex = (
|
|
73
|
-
feature_dir_regex
|
|
74
|
-
+ r"/(?P<schema>[-\w]+)/(?P<target_table>[-\w]+)/(?P<feature_name>[-\w]+)"
|
|
75
|
-
)
|
|
76
|
-
feature_value_regex = (
|
|
77
|
-
feature_table_dir_regex + r"/(?P=feature_name)[.](?P<ext>[(csv|json)]*)$"
|
|
78
|
-
)
|
|
79
|
-
feature_asset_dir_regex = feature_table_dir_regex + r"/asset/(?P<asset_table>[-\w]+)"
|
|
80
|
-
feature_asset_regex = (
|
|
81
|
-
feature_asset_dir_regex + r"/(?P<file>[A-Za-z0-9_-]+)[.](?P<ext>[a-z0-9]*)$"
|
|
74
|
+
feature_dir_regex + f"{SEP}(?P<schema>[-\\w]+){SEP}(?P<target_table>[-\\w]+){SEP}(?P<feature_name>[-\\w]+)"
|
|
82
75
|
)
|
|
76
|
+
feature_value_regex = feature_table_dir_regex + f"{SEP}(?P=feature_name)[.](?P<ext>[(csv|json)]*)$"
|
|
77
|
+
feature_asset_dir_regex = feature_table_dir_regex + f"{SEP}asset{SEP}(?P<asset_table>[-\\w]+)"
|
|
78
|
+
feature_asset_regex = feature_asset_dir_regex + f"{SEP}(?P<file>[A-Za-z0-9_-]+)[.](?P<ext>[a-z0-9]*)$"
|
|
83
79
|
|
|
84
|
-
asset_path_regex = exec_dir_regex +
|
|
80
|
+
asset_path_regex = exec_dir_regex + f"{SEP}asset{SEP}(?P<schema>[-\\w]+){SEP}(?P<asset_table>[-\\w]*)"
|
|
85
81
|
|
|
86
82
|
asset_file_regex = r"(?P<file>[-\w]+)[.](?P<ext>[a-z0-9]*)$"
|
|
87
83
|
|
|
88
|
-
table_regex = (
|
|
89
|
-
exec_dir_regex
|
|
90
|
-
+ r"/table/(?P<schema>[-\w]+)/(?P<table>[-\w]+)/(?P=table)[.](csv|json)$"
|
|
91
|
-
)
|
|
84
|
+
table_regex = exec_dir_regex + f"{SEP}table{SEP}(?P<schema>[-\\w]+){SEP}(?P<table>[-\\w]+){SEP}(?P=table)[.](csv|json)$"
|
|
92
85
|
|
|
93
86
|
|
|
94
87
|
def is_feature_dir(path: Path) -> Optional[re.Match]:
|
|
@@ -97,9 +90,16 @@ def is_feature_dir(path: Path) -> Optional[re.Match]:
|
|
|
97
90
|
|
|
98
91
|
|
|
99
92
|
def normalize_asset_dir(path: str) -> Optional[tuple[str, str]]:
|
|
100
|
-
"""Parse a path to an asset file and return the asset table name and file name
|
|
93
|
+
"""Parse a path to an asset file and return the asset table name and file name.
|
|
94
|
+
|
|
95
|
+
Args:
|
|
96
|
+
path: Path to the asset file
|
|
97
|
+
|
|
98
|
+
Returns:
|
|
99
|
+
Tuple of (schema/table, filename) or None if path doesn't match pattern
|
|
100
|
+
"""
|
|
101
101
|
path = Path(path)
|
|
102
|
-
if not (m := re.match(asset_path_regex, path
|
|
102
|
+
if not (m := re.match(asset_path_regex, str(path))):
|
|
103
103
|
return None
|
|
104
104
|
return f"{m['schema']}/{m['asset_table']}", path.name
|
|
105
105
|
|
|
@@ -138,18 +138,14 @@ def asset_root(prefix: Path | str, exec_rid: str) -> Path:
|
|
|
138
138
|
return path
|
|
139
139
|
|
|
140
140
|
|
|
141
|
-
def feature_dir(
|
|
142
|
-
prefix: Path | str, exec_rid: str, schema: str, target_table: str, feature_name: str
|
|
143
|
-
) -> Path:
|
|
141
|
+
def feature_dir(prefix: Path | str, exec_rid: str, schema: str, target_table: str, feature_name: str) -> Path:
|
|
144
142
|
"""Return the path to the directory in which a named feature for an execution should be placed."""
|
|
145
143
|
path = feature_root(prefix, exec_rid) / schema / target_table / feature_name
|
|
146
144
|
path.mkdir(parents=True, exist_ok=True)
|
|
147
145
|
return path
|
|
148
146
|
|
|
149
147
|
|
|
150
|
-
def feature_value_path(
|
|
151
|
-
prefix: Path | str, exec_rid: str, schema: str, target_table: str, feature_name: str
|
|
152
|
-
) -> Path:
|
|
148
|
+
def feature_value_path(prefix: Path | str, exec_rid: str, schema: str, target_table: str, feature_name: str) -> Path:
|
|
153
149
|
"""Return the path to a JSON Lines (.jsonl) file in which to place feature values that are to be uploaded.
|
|
154
150
|
|
|
155
151
|
Args:
|
|
@@ -162,10 +158,7 @@ def feature_value_path(
|
|
|
162
158
|
Returns:
|
|
163
159
|
Path to the JSON Lines (.jsonl) file in which to place feature values
|
|
164
160
|
"""
|
|
165
|
-
return (
|
|
166
|
-
feature_dir(prefix, exec_rid, schema, target_table, feature_name)
|
|
167
|
-
/ f"{feature_name}.jsonl"
|
|
168
|
-
)
|
|
161
|
+
return feature_dir(prefix, exec_rid, schema, target_table, feature_name) / f"{feature_name}.jsonl"
|
|
169
162
|
|
|
170
163
|
|
|
171
164
|
def table_path(prefix: Path | str, schema: str, table: str) -> Path:
|
|
@@ -185,14 +178,14 @@ def table_path(prefix: Path | str, schema: str, table: str) -> Path:
|
|
|
185
178
|
|
|
186
179
|
|
|
187
180
|
def asset_table_upload_spec(model: DerivaModel, asset_table: str | Table):
|
|
188
|
-
"""Generate
|
|
181
|
+
"""Generate upload specification for an asset table.
|
|
189
182
|
|
|
190
183
|
Args:
|
|
191
|
-
model:
|
|
192
|
-
asset_table:
|
|
184
|
+
model: The DerivaModel instance.
|
|
185
|
+
asset_table: The asset table name or Table object.
|
|
193
186
|
|
|
194
187
|
Returns:
|
|
195
|
-
|
|
188
|
+
A dictionary containing the upload specification for the asset table.
|
|
196
189
|
"""
|
|
197
190
|
metadata_columns = model.asset_metadata(asset_table)
|
|
198
191
|
asset_table = model.name_to_table(asset_table)
|
|
@@ -201,7 +194,9 @@ def asset_table_upload_spec(model: DerivaModel, asset_table: str | Table):
|
|
|
201
194
|
asset_path = f"{exec_dir_regex}/asset/{schema}/{asset_table.name}/{metadata_path}/{asset_file_regex}"
|
|
202
195
|
asset_table = model.name_to_table(asset_table)
|
|
203
196
|
schema = model.name_to_table(asset_table).schema.name
|
|
204
|
-
|
|
197
|
+
|
|
198
|
+
# Create upload specification
|
|
199
|
+
spec = {
|
|
205
200
|
# Upload assets into an asset table.
|
|
206
201
|
"column_map": {
|
|
207
202
|
"MD5": "{md5}",
|
|
@@ -221,6 +216,7 @@ def asset_table_upload_spec(model: DerivaModel, asset_table: str | Table):
|
|
|
221
216
|
},
|
|
222
217
|
"record_query_template": "/entity/{target_table}/MD5={md5}&Filename={file_name}",
|
|
223
218
|
}
|
|
219
|
+
return spec
|
|
224
220
|
|
|
225
221
|
|
|
226
222
|
def bulk_upload_configuration(model: DerivaModel) -> dict[str, Any]:
|
|
@@ -229,9 +225,7 @@ def bulk_upload_configuration(model: DerivaModel) -> dict[str, Any]:
|
|
|
229
225
|
model: Model from which to generate the upload configuration
|
|
230
226
|
"""
|
|
231
227
|
asset_tables_with_metadata = [
|
|
232
|
-
asset_table_upload_spec(model=model, asset_table=t)
|
|
233
|
-
for t in model.find_assets()
|
|
234
|
-
if model.asset_metadata(t)
|
|
228
|
+
asset_table_upload_spec(model=model, asset_table=t) for t in model.find_assets() if model.asset_metadata(t)
|
|
235
229
|
]
|
|
236
230
|
return {
|
|
237
231
|
"asset_mappings": asset_tables_with_metadata
|
|
@@ -246,9 +240,7 @@ def bulk_upload_configuration(model: DerivaModel) -> dict[str, Any]:
|
|
|
246
240
|
},
|
|
247
241
|
"asset_type": "file",
|
|
248
242
|
"target_table": ["{schema}", "{asset_table}"],
|
|
249
|
-
"file_pattern": asset_path_regex
|
|
250
|
-
+ "/"
|
|
251
|
-
+ asset_file_regex, # Sets schema, asset_table, name, ext
|
|
243
|
+
"file_pattern": asset_path_regex + "/" + asset_file_regex, # Sets schema, asset_table, name, ext
|
|
252
244
|
"checksum_types": ["sha256", "md5"],
|
|
253
245
|
"hatrac_options": {"versioned_urls": True},
|
|
254
246
|
"hatrac_templates": {
|
|
@@ -280,9 +272,7 @@ def bulk_upload_configuration(model: DerivaModel) -> dict[str, Any]:
|
|
|
280
272
|
|
|
281
273
|
|
|
282
274
|
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
283
|
-
def upload_directory(
|
|
284
|
-
model: DerivaModel, directory: Path | str
|
|
285
|
-
) -> dict[Any, FileUploadState] | None:
|
|
275
|
+
def upload_directory(model: DerivaModel, directory: Path | str) -> dict[Any, FileUploadState] | None:
|
|
286
276
|
"""Upload assets from a directory. This routine assumes that the current upload specification includes a
|
|
287
277
|
configuration for the specified directory. Every asset in the specified directory is uploaded
|
|
288
278
|
|
|
@@ -294,7 +284,7 @@ def upload_directory(
|
|
|
294
284
|
Results of the upload operation.
|
|
295
285
|
|
|
296
286
|
Raises:
|
|
297
|
-
DerivaMLException: If there is an issue uploading the assets.
|
|
287
|
+
DerivaMLException: If there is an issue with uploading the assets.
|
|
298
288
|
"""
|
|
299
289
|
directory = Path(directory)
|
|
300
290
|
if not directory.is_dir():
|
|
@@ -302,9 +292,9 @@ def upload_directory(
|
|
|
302
292
|
|
|
303
293
|
# Now upload the files by creating an upload spec and then calling the uploader.
|
|
304
294
|
with TemporaryDirectory() as temp_dir:
|
|
305
|
-
spec_file =
|
|
295
|
+
spec_file = Path(temp_dir) / "config.json"
|
|
306
296
|
|
|
307
|
-
with open(
|
|
297
|
+
with spec_file.open("w+") as cfile:
|
|
308
298
|
json.dump(bulk_upload_configuration(model), cfile)
|
|
309
299
|
uploader = GenericUploader(
|
|
310
300
|
server={
|
|
@@ -331,9 +321,7 @@ def upload_directory(
|
|
|
331
321
|
|
|
332
322
|
|
|
333
323
|
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
334
|
-
def upload_asset(
|
|
335
|
-
model: DerivaModel, file: Path | str, table: Table, **kwargs: Any
|
|
336
|
-
) -> dict:
|
|
324
|
+
def upload_asset(model: DerivaModel, file: Path | str, table: Table, **kwargs: Any) -> dict:
|
|
337
325
|
"""Upload the specified file into Hatrac and update the associated asset table.
|
|
338
326
|
|
|
339
327
|
Args:
|
|
@@ -359,9 +347,7 @@ def upload_asset(
|
|
|
359
347
|
credentials=model.catalog.deriva_server.credentials,
|
|
360
348
|
)
|
|
361
349
|
md5_hashes = hash_utils.compute_file_hashes(file, ["md5"])["md5"]
|
|
362
|
-
sanitized_filename = urlquote(
|
|
363
|
-
re.sub("[^a-zA-Z0-9_.-]", "_", md5_hashes[0] + "." + file_name)
|
|
364
|
-
)
|
|
350
|
+
sanitized_filename = urlquote(re.sub("[^a-zA-Z0-9_.-]", "_", md5_hashes[0] + "." + file_name))
|
|
365
351
|
hatrac_path = f"{hatrac_path}{sanitized_filename}"
|
|
366
352
|
|
|
367
353
|
try:
|
|
@@ -377,9 +363,7 @@ def upload_asset(
|
|
|
377
363
|
raise e
|
|
378
364
|
try:
|
|
379
365
|
# Now update the asset table.
|
|
380
|
-
ipath = (
|
|
381
|
-
model.catalog.getPathBuilder().schemas[table.schema.name].tables[table.name]
|
|
382
|
-
)
|
|
366
|
+
ipath = model.catalog.getPathBuilder().schemas[table.schema.name].tables[table.name]
|
|
383
367
|
return list(
|
|
384
368
|
ipath.insert(
|
|
385
369
|
[
|
|
@@ -429,9 +413,7 @@ def asset_file_path(
|
|
|
429
413
|
}.union(set(DerivaSystemColumns))
|
|
430
414
|
asset_metadata = {c.name for c in asset_table.columns} - asset_columns
|
|
431
415
|
if not (asset_metadata >= set(metadata.keys())):
|
|
432
|
-
raise DerivaMLException(
|
|
433
|
-
f"Metadata {metadata} does not match asset metadata {asset_metadata}"
|
|
434
|
-
)
|
|
416
|
+
raise DerivaMLException(f"Metadata {metadata} does not match asset metadata {asset_metadata}")
|
|
435
417
|
|
|
436
418
|
for m in asset_metadata:
|
|
437
419
|
path = path / metadata.get(m, "None")
|
|
@@ -450,10 +432,6 @@ def asset_type_path(prefix: Path | str, exec_rid: RID, asset_table: Table) -> Pa
|
|
|
450
432
|
Returns:
|
|
451
433
|
Path to the file in which to place asset_type values for the named asset.
|
|
452
434
|
"""
|
|
453
|
-
path = (
|
|
454
|
-
execution_root(prefix, exec_rid=exec_rid)
|
|
455
|
-
/ "asset-type"
|
|
456
|
-
/ asset_table.schema.name
|
|
457
|
-
)
|
|
435
|
+
path = execution_root(prefix, exec_rid=exec_rid) / "asset-type" / asset_table.schema.name
|
|
458
436
|
path.mkdir(parents=True, exist_ok=True)
|
|
459
437
|
return path / f"{asset_table.name}.jsonl"
|