deriva-ml 1.13.3__py3-none-any.whl → 1.14.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. deriva_ml/__init__.py +25 -30
  2. deriva_ml/core/__init__.py +39 -0
  3. deriva_ml/core/base.py +1489 -0
  4. deriva_ml/core/constants.py +36 -0
  5. deriva_ml/core/definitions.py +74 -0
  6. deriva_ml/core/enums.py +222 -0
  7. deriva_ml/core/ermrest.py +288 -0
  8. deriva_ml/core/exceptions.py +28 -0
  9. deriva_ml/core/filespec.py +116 -0
  10. deriva_ml/dataset/__init__.py +4 -0
  11. deriva_ml/{dataset_aux_classes.py → dataset/aux_classes.py} +16 -12
  12. deriva_ml/{dataset.py → dataset/dataset.py} +408 -416
  13. deriva_ml/{dataset_bag.py → dataset/dataset_bag.py} +137 -97
  14. deriva_ml/{history.py → dataset/history.py} +52 -33
  15. deriva_ml/{upload.py → dataset/upload.py} +48 -70
  16. deriva_ml/demo_catalog.py +233 -183
  17. deriva_ml/execution/environment.py +290 -0
  18. deriva_ml/{execution.py → execution/execution.py} +365 -252
  19. deriva_ml/execution/execution_configuration.py +163 -0
  20. deriva_ml/{execution_configuration.py → execution/workflow.py} +206 -218
  21. deriva_ml/feature.py +83 -46
  22. deriva_ml/model/__init__.py +0 -0
  23. deriva_ml/{deriva_model.py → model/catalog.py} +113 -132
  24. deriva_ml/{database_model.py → model/database.py} +52 -74
  25. deriva_ml/model/sql_mapper.py +44 -0
  26. deriva_ml/run_notebook.py +19 -11
  27. deriva_ml/schema/__init__.py +3 -0
  28. deriva_ml/{schema_setup → schema}/annotations.py +31 -22
  29. deriva_ml/schema/check_schema.py +104 -0
  30. deriva_ml/{schema_setup → schema}/create_schema.py +151 -104
  31. deriva_ml/schema/deriva-ml-reference.json +8525 -0
  32. deriva_ml/schema/table_comments_utils.py +57 -0
  33. {deriva_ml-1.13.3.dist-info → deriva_ml-1.14.26.dist-info}/METADATA +5 -4
  34. deriva_ml-1.14.26.dist-info/RECORD +40 -0
  35. {deriva_ml-1.13.3.dist-info → deriva_ml-1.14.26.dist-info}/entry_points.txt +1 -0
  36. deriva_ml/deriva_definitions.py +0 -372
  37. deriva_ml/deriva_ml_base.py +0 -1046
  38. deriva_ml/execution_environment.py +0 -139
  39. deriva_ml/schema_setup/table_comments_utils.py +0 -56
  40. deriva_ml/test-files/execution-parameters.json +0 -1
  41. deriva_ml/test-files/notebook-parameters.json +0 -5
  42. deriva_ml/test_functions.py +0 -141
  43. deriva_ml/test_notebook.ipynb +0 -197
  44. deriva_ml-1.13.3.dist-info/RECORD +0 -31
  45. /deriva_ml/{schema_setup → execution}/__init__.py +0 -0
  46. /deriva_ml/{schema_setup → schema}/policy.json +0 -0
  47. {deriva_ml-1.13.3.dist-info → deriva_ml-1.14.26.dist-info}/WHEEL +0 -0
  48. {deriva_ml-1.13.3.dist-info → deriva_ml-1.14.26.dist-info}/licenses/LICENSE +0 -0
  49. {deriva_ml-1.13.3.dist-info → deriva_ml-1.14.26.dist-info}/top_level.txt +0 -0
@@ -36,6 +36,7 @@ Here is the directory layout we support:
36
36
  """
37
37
 
38
38
  import json
39
+ import os
39
40
  from pathlib import Path
40
41
  from tempfile import TemporaryDirectory
41
42
  from typing import Any, Optional
@@ -46,49 +47,41 @@ from deriva.core.ermrest_model import Table
46
47
  from deriva.core.hatrac_store import HatracStore
47
48
  from deriva.core.utils import hash_utils, mime_utils
48
49
  from deriva.transfer.upload.deriva_upload import GenericUploader
49
- from pydantic import validate_call, ConfigDict
50
+ from pydantic import ConfigDict, validate_call
50
51
 
51
- from deriva_ml.deriva_definitions import (
52
+ from deriva_ml.core.definitions import (
52
53
  RID,
53
- DerivaMLException,
54
+ DerivaSystemColumns,
54
55
  FileUploadState,
55
56
  UploadState,
56
- DerivaSystemColumns,
57
57
  )
58
- from deriva_ml.deriva_model import DerivaModel
59
-
58
+ from deriva_ml.core.exceptions import DerivaMLException
59
+ from deriva_ml.model.catalog import DerivaModel
60
60
 
61
61
  try:
62
62
  from icecream import ic
63
63
  except ImportError: # Graceful fallback if IceCream isn't installed.
64
64
  ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a) # noqa
65
65
 
66
+ # Use os.path.sep for OS-agnostic paths in regex patterns
67
+ SEP = re.escape(os.path.sep)
68
+ upload_root_regex = f"(?i)^.*{SEP}deriva-ml"
66
69
 
67
- upload_root_regex = r"(?i)^.*/deriva-ml"
70
+ exec_dir_regex = upload_root_regex + f"{SEP}execution{SEP}(?P<execution_rid>[-\\w]+)"
68
71
 
69
- exec_dir_regex = upload_root_regex + r"/execution/(?P<execution_rid>[-\w]+)"
70
-
71
- feature_dir_regex = exec_dir_regex + r"/feature"
72
+ feature_dir_regex = exec_dir_regex + f"{SEP}feature"
72
73
  feature_table_dir_regex = (
73
- feature_dir_regex
74
- + r"/(?P<schema>[-\w]+)/(?P<target_table>[-\w]+)/(?P<feature_name>[-\w]+)"
75
- )
76
- feature_value_regex = (
77
- feature_table_dir_regex + r"/(?P=feature_name)[.](?P<ext>[(csv|json)]*)$"
78
- )
79
- feature_asset_dir_regex = feature_table_dir_regex + r"/asset/(?P<asset_table>[-\w]+)"
80
- feature_asset_regex = (
81
- feature_asset_dir_regex + r"/(?P<file>[A-Za-z0-9_-]+)[.](?P<ext>[a-z0-9]*)$"
74
+ feature_dir_regex + f"{SEP}(?P<schema>[-\\w]+){SEP}(?P<target_table>[-\\w]+){SEP}(?P<feature_name>[-\\w]+)"
82
75
  )
76
+ feature_value_regex = feature_table_dir_regex + f"{SEP}(?P=feature_name)[.](?P<ext>[(csv|json)]*)$"
77
+ feature_asset_dir_regex = feature_table_dir_regex + f"{SEP}asset{SEP}(?P<asset_table>[-\\w]+)"
78
+ feature_asset_regex = feature_asset_dir_regex + f"{SEP}(?P<file>[A-Za-z0-9_-]+)[.](?P<ext>[a-z0-9]*)$"
83
79
 
84
- asset_path_regex = exec_dir_regex + r"/asset/(?P<schema>[-\w]+)/(?P<asset_table>[-\w]*)"
80
+ asset_path_regex = exec_dir_regex + f"{SEP}asset{SEP}(?P<schema>[-\\w]+){SEP}(?P<asset_table>[-\\w]*)"
85
81
 
86
82
  asset_file_regex = r"(?P<file>[-\w]+)[.](?P<ext>[a-z0-9]*)$"
87
83
 
88
- table_regex = (
89
- exec_dir_regex
90
- + r"/table/(?P<schema>[-\w]+)/(?P<table>[-\w]+)/(?P=table)[.](csv|json)$"
91
- )
84
+ table_regex = exec_dir_regex + f"{SEP}table{SEP}(?P<schema>[-\\w]+){SEP}(?P<table>[-\\w]+){SEP}(?P=table)[.](csv|json)$"
92
85
 
93
86
 
94
87
  def is_feature_dir(path: Path) -> Optional[re.Match]:
@@ -97,9 +90,16 @@ def is_feature_dir(path: Path) -> Optional[re.Match]:
97
90
 
98
91
 
99
92
  def normalize_asset_dir(path: str) -> Optional[tuple[str, str]]:
100
- """Parse a path to an asset file and return the asset table name and file name"""
93
+ """Parse a path to an asset file and return the asset table name and file name.
94
+
95
+ Args:
96
+ path: Path to the asset file
97
+
98
+ Returns:
99
+ Tuple of (schema/table, filename) or None if path doesn't match pattern
100
+ """
101
101
  path = Path(path)
102
- if not (m := re.match(asset_path_regex, path.as_posix())):
102
+ if not (m := re.match(asset_path_regex, str(path))):
103
103
  return None
104
104
  return f"{m['schema']}/{m['asset_table']}", path.name
105
105
 
@@ -138,18 +138,14 @@ def asset_root(prefix: Path | str, exec_rid: str) -> Path:
138
138
  return path
139
139
 
140
140
 
141
- def feature_dir(
142
- prefix: Path | str, exec_rid: str, schema: str, target_table: str, feature_name: str
143
- ) -> Path:
141
+ def feature_dir(prefix: Path | str, exec_rid: str, schema: str, target_table: str, feature_name: str) -> Path:
144
142
  """Return the path to the directory in which a named feature for an execution should be placed."""
145
143
  path = feature_root(prefix, exec_rid) / schema / target_table / feature_name
146
144
  path.mkdir(parents=True, exist_ok=True)
147
145
  return path
148
146
 
149
147
 
150
- def feature_value_path(
151
- prefix: Path | str, exec_rid: str, schema: str, target_table: str, feature_name: str
152
- ) -> Path:
148
+ def feature_value_path(prefix: Path | str, exec_rid: str, schema: str, target_table: str, feature_name: str) -> Path:
153
149
  """Return the path to a JSONL file in which to place feature values that are to be uploaded.
154
150
 
155
151
  Args:
@@ -162,10 +158,7 @@ def feature_value_path(
162
158
  Returns:
163
159
  Path to JSONL file in which to place feature values
164
160
  """
165
- return (
166
- feature_dir(prefix, exec_rid, schema, target_table, feature_name)
167
- / f"{feature_name}.jsonl"
168
- )
161
+ return feature_dir(prefix, exec_rid, schema, target_table, feature_name) / f"{feature_name}.jsonl"
169
162
 
170
163
 
171
164
  def table_path(prefix: Path | str, schema: str, table: str) -> Path:
@@ -185,14 +178,14 @@ def table_path(prefix: Path | str, schema: str, table: str) -> Path:
185
178
 
186
179
 
187
180
  def asset_table_upload_spec(model: DerivaModel, asset_table: str | Table):
188
- """Generate a pattern to an asset table that may include additional metadata columns.
181
+ """Generate upload specification for an asset table.
189
182
 
190
183
  Args:
191
- model:
192
- asset_table:
184
+ model: The DerivaModel instance.
185
+ asset_table: The asset table name or Table object.
193
186
 
194
187
  Returns:
195
-
188
+ A dictionary containing the upload specification for the asset table.
196
189
  """
197
190
  metadata_columns = model.asset_metadata(asset_table)
198
191
  asset_table = model.name_to_table(asset_table)
@@ -201,7 +194,9 @@ def asset_table_upload_spec(model: DerivaModel, asset_table: str | Table):
201
194
  asset_path = f"{exec_dir_regex}/asset/{schema}/{asset_table.name}/{metadata_path}/{asset_file_regex}"
202
195
  asset_table = model.name_to_table(asset_table)
203
196
  schema = model.name_to_table(asset_table).schema.name
204
- return {
197
+
198
+ # Create upload specification
199
+ spec = {
205
200
  # Upload assets into an asset table of an asset table.
206
201
  "column_map": {
207
202
  "MD5": "{md5}",
@@ -221,6 +216,7 @@ def asset_table_upload_spec(model: DerivaModel, asset_table: str | Table):
221
216
  },
222
217
  "record_query_template": "/entity/{target_table}/MD5={md5}&Filename={file_name}",
223
218
  }
219
+ return spec
224
220
 
225
221
 
226
222
  def bulk_upload_configuration(model: DerivaModel) -> dict[str, Any]:
@@ -229,9 +225,7 @@ def bulk_upload_configuration(model: DerivaModel) -> dict[str, Any]:
229
225
  model: Model from which to generate the upload configuration
230
226
  """
231
227
  asset_tables_with_metadata = [
232
- asset_table_upload_spec(model=model, asset_table=t)
233
- for t in model.find_assets()
234
- if model.asset_metadata(t)
228
+ asset_table_upload_spec(model=model, asset_table=t) for t in model.find_assets() if model.asset_metadata(t)
235
229
  ]
236
230
  return {
237
231
  "asset_mappings": asset_tables_with_metadata
@@ -246,9 +240,7 @@ def bulk_upload_configuration(model: DerivaModel) -> dict[str, Any]:
246
240
  },
247
241
  "asset_type": "file",
248
242
  "target_table": ["{schema}", "{asset_table}"],
249
- "file_pattern": asset_path_regex
250
- + "/"
251
- + asset_file_regex, # Sets schema, asset_table, name, ext
243
+ "file_pattern": asset_path_regex + "/" + asset_file_regex, # Sets schema, asset_table, name, ext
252
244
  "checksum_types": ["sha256", "md5"],
253
245
  "hatrac_options": {"versioned_urls": True},
254
246
  "hatrac_templates": {
@@ -280,9 +272,7 @@ def bulk_upload_configuration(model: DerivaModel) -> dict[str, Any]:
280
272
 
281
273
 
282
274
  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
283
- def upload_directory(
284
- model: DerivaModel, directory: Path | str
285
- ) -> dict[Any, FileUploadState] | None:
275
+ def upload_directory(model: DerivaModel, directory: Path | str) -> dict[Any, FileUploadState] | None:
286
276
  """Upload assets from a directory. This routine assumes that the current upload specification includes a
287
277
  configuration for the specified directory. Every asset in the specified directory is uploaded
288
278
 
@@ -294,7 +284,7 @@ def upload_directory(
294
284
  Results of the upload operation.
295
285
 
296
286
  Raises:
297
- DerivaMLException: If there is an issue uploading the assets.
287
+ DerivaMLException: If there is an issue with uploading the assets.
298
288
  """
299
289
  directory = Path(directory)
300
290
  if not directory.is_dir():
@@ -302,9 +292,9 @@ def upload_directory(
302
292
 
303
293
  # Now upload the files by creating an upload spec and then calling the uploader.
304
294
  with TemporaryDirectory() as temp_dir:
305
- spec_file = f"{temp_dir}/config.json"
295
+ spec_file = Path(temp_dir) / "config.json"
306
296
 
307
- with open(spec_file, "w+") as cfile:
297
+ with spec_file.open("w+") as cfile:
308
298
  json.dump(bulk_upload_configuration(model), cfile)
309
299
  uploader = GenericUploader(
310
300
  server={
@@ -331,9 +321,7 @@ def upload_directory(
331
321
 
332
322
 
333
323
  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
334
- def upload_asset(
335
- model: DerivaModel, file: Path | str, table: Table, **kwargs: Any
336
- ) -> dict:
324
+ def upload_asset(model: DerivaModel, file: Path | str, table: Table, **kwargs: Any) -> dict:
337
325
  """Upload the specified file into Hatrac and update the associated asset table.
338
326
 
339
327
  Args:
@@ -359,9 +347,7 @@ def upload_asset(
359
347
  credentials=model.catalog.deriva_server.credentials,
360
348
  )
361
349
  md5_hashes = hash_utils.compute_file_hashes(file, ["md5"])["md5"]
362
- sanitized_filename = urlquote(
363
- re.sub("[^a-zA-Z0-9_.-]", "_", md5_hashes[0] + "." + file_name)
364
- )
350
+ sanitized_filename = urlquote(re.sub("[^a-zA-Z0-9_.-]", "_", md5_hashes[0] + "." + file_name))
365
351
  hatrac_path = f"{hatrac_path}{sanitized_filename}"
366
352
 
367
353
  try:
@@ -377,9 +363,7 @@ def upload_asset(
377
363
  raise e
378
364
  try:
379
365
  # Now update the asset table.
380
- ipath = (
381
- model.catalog.getPathBuilder().schemas[table.schema.name].tables[table.name]
382
- )
366
+ ipath = model.catalog.getPathBuilder().schemas[table.schema.name].tables[table.name]
383
367
  return list(
384
368
  ipath.insert(
385
369
  [
@@ -429,9 +413,7 @@ def asset_file_path(
429
413
  }.union(set(DerivaSystemColumns))
430
414
  asset_metadata = {c.name for c in asset_table.columns} - asset_columns
431
415
  if not (asset_metadata >= set(metadata.keys())):
432
- raise DerivaMLException(
433
- f"Metadata {metadata} does not match asset metadata {asset_metadata}"
434
- )
416
+ raise DerivaMLException(f"Metadata {metadata} does not match asset metadata {asset_metadata}")
435
417
 
436
418
  for m in asset_metadata:
437
419
  path = path / metadata.get(m, "None")
@@ -450,10 +432,6 @@ def asset_type_path(prefix: Path | str, exec_rid: RID, asset_table: Table) -> Pa
450
432
  Returns:
451
433
  Path to the file in which to place asset_type values for the named asset.
452
434
  """
453
- path = (
454
- execution_root(prefix, exec_rid=exec_rid)
455
- / "asset-type"
456
- / asset_table.schema.name
457
- )
435
+ path = execution_root(prefix, exec_rid=exec_rid) / "asset-type" / asset_table.schema.name
458
436
  path.mkdir(parents=True, exist_ok=True)
459
437
  return path / f"{asset_table.name}.jsonl"