deriva-ml 1.17.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. deriva_ml/.DS_Store +0 -0
  2. deriva_ml/__init__.py +79 -0
  3. deriva_ml/bump_version.py +142 -0
  4. deriva_ml/core/__init__.py +39 -0
  5. deriva_ml/core/base.py +1527 -0
  6. deriva_ml/core/config.py +69 -0
  7. deriva_ml/core/constants.py +36 -0
  8. deriva_ml/core/definitions.py +74 -0
  9. deriva_ml/core/enums.py +222 -0
  10. deriva_ml/core/ermrest.py +288 -0
  11. deriva_ml/core/exceptions.py +28 -0
  12. deriva_ml/core/filespec.py +116 -0
  13. deriva_ml/dataset/__init__.py +12 -0
  14. deriva_ml/dataset/aux_classes.py +225 -0
  15. deriva_ml/dataset/dataset.py +1519 -0
  16. deriva_ml/dataset/dataset_bag.py +450 -0
  17. deriva_ml/dataset/history.py +109 -0
  18. deriva_ml/dataset/upload.py +439 -0
  19. deriva_ml/demo_catalog.py +495 -0
  20. deriva_ml/execution/__init__.py +26 -0
  21. deriva_ml/execution/environment.py +290 -0
  22. deriva_ml/execution/execution.py +1180 -0
  23. deriva_ml/execution/execution_configuration.py +147 -0
  24. deriva_ml/execution/workflow.py +413 -0
  25. deriva_ml/feature.py +228 -0
  26. deriva_ml/install_kernel.py +71 -0
  27. deriva_ml/model/__init__.py +0 -0
  28. deriva_ml/model/catalog.py +485 -0
  29. deriva_ml/model/database.py +719 -0
  30. deriva_ml/protocols/dataset.py +19 -0
  31. deriva_ml/run_notebook.py +228 -0
  32. deriva_ml/schema/__init__.py +3 -0
  33. deriva_ml/schema/annotations.py +473 -0
  34. deriva_ml/schema/check_schema.py +104 -0
  35. deriva_ml/schema/create_schema.py +393 -0
  36. deriva_ml/schema/deriva-ml-reference.json +8525 -0
  37. deriva_ml/schema/policy.json +81 -0
  38. deriva_ml/schema/table_comments_utils.py +57 -0
  39. deriva_ml/test.py +94 -0
  40. deriva_ml-1.17.10.dist-info/METADATA +38 -0
  41. deriva_ml-1.17.10.dist-info/RECORD +45 -0
  42. deriva_ml-1.17.10.dist-info/WHEEL +5 -0
  43. deriva_ml-1.17.10.dist-info/entry_points.txt +9 -0
  44. deriva_ml-1.17.10.dist-info/licenses/LICENSE +201 -0
  45. deriva_ml-1.17.10.dist-info/top_level.txt +1 -0
@@ -0,0 +1,439 @@
+ """
+ This module provides functions that help structure local directories for uploading to a DerivaML catalog, and
+ for generating an upload specification for those directories.
+
+ Here is the directory layout we support:
+
+ deriva-ml/
+     execution
+         <execution_rid>
+             execution-asset
+                 <asset_type>
+                     file1, file2, ....   <- Need to update execution_asset association table.
+             execution-metadata
+                 <metadata_type>
+             feature
+                 <schema>
+                     <target_table>
+                         <feature_name>
+                             asset
+                                 <asset_table>
+                                     file1, file2, ...
+                             <feature_name>.jsonl   <- needs to have asset_name column remapped before uploading
+             table
+                 <schema>
+                     <record_table>
+                         record_table.csv
+             asset
+                 <schema>
+                     <asset_table>
+                         <metadata1>
+                             <metadata2>
+                                 file1, file2, ....
+             asset-type
+                 <schema>
+                     file1.jsonl, file2.jsonl
+ """
+
+ import json
+ import os
+ from pathlib import Path
+ from tempfile import TemporaryDirectory
+ from typing import Any, Optional
+
+ import regex as re
+ from deriva.core import urlquote
+ from deriva.core.ermrest_model import Table
+ from deriva.core.hatrac_store import HatracStore
+ from deriva.core.utils import hash_utils, mime_utils
+ from deriva.transfer.upload.deriva_upload import GenericUploader
+ from pydantic import ConfigDict, validate_call
+
+ from deriva_ml.core.definitions import (
+     RID,
+     DerivaSystemColumns,
+     FileUploadState,
+     UploadState,
+ )
+ from deriva_ml.core.exceptions import DerivaMLException
+ from deriva_ml.model.catalog import DerivaModel
+
+ try:
+     from icecream import ic
+ except ImportError:  # Graceful fallback if IceCream isn't installed.
+     ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a)  # noqa
+
+ # Use os.path.sep for OS-agnostic paths in regex patterns.
+ SEP = re.escape(os.path.sep)
+ upload_root_regex = f"(?i)^.*{SEP}deriva-ml"
+
+ exec_dir_regex = upload_root_regex + f"{SEP}execution{SEP}(?P<execution_rid>[-\\w]+)"
+
+ feature_dir_regex = exec_dir_regex + f"{SEP}feature"
+ feature_table_dir_regex = (
+     feature_dir_regex + f"{SEP}(?P<schema>[-\\w]+){SEP}(?P<target_table>[-\\w]+){SEP}(?P<feature_name>[-\\w]+)"
+ )
+ feature_value_regex = feature_table_dir_regex + f"{SEP}(?P=feature_name)[.](?P<ext>csv|jsonl?)$"
+ feature_asset_dir_regex = feature_table_dir_regex + f"{SEP}asset{SEP}(?P<asset_table>[-\\w]+)"
+ feature_asset_regex = feature_asset_dir_regex + f"{SEP}(?P<file>[A-Za-z0-9_-]+)[.](?P<ext>[a-z0-9]*)$"
+
+ asset_path_regex = exec_dir_regex + rf"{SEP}asset{SEP}(?P<schema>[-\w]+){SEP}(?P<asset_table>[-\w]*)"
+
+ asset_file_regex = r"(?P<file>[-\w]+)[.](?P<ext>[a-z0-9]*)$"
+
+ table_regex = exec_dir_regex + rf"{SEP}table{SEP}(?P<schema>[-\w]+){SEP}(?P<table>[-\w]+){SEP}(?P=table)[.](csv|json)$"
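+ # For example, on POSIX systems a path such as
+ # ".../deriva-ml/execution/1-abc/table/my_schema/Image/Image.csv" (illustrative RID and
+ # names) matches table_regex with execution_rid="1-abc", schema="my_schema", table="Image".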
+
+
+ def is_feature_dir(path: Path) -> Optional[re.Match]:
+     """Return a regex match if the path matches the pattern for where the table for a feature would go."""
+     return re.match(feature_table_dir_regex + "$", path.as_posix())
+
+
+ def normalize_asset_dir(path: str) -> Optional[tuple[str, str]]:
+     """Parse a path to an asset file and return the asset table name and file name.
+
+     Args:
+         path: Path to the asset file.
+
+     Returns:
+         Tuple of (schema/table, filename), or None if the path doesn't match the pattern.
+     """
+     path = Path(path)
+     if not (m := re.match(asset_path_regex, str(path))):
+         return None
+     return f"{m['schema']}/{m['asset_table']}", path.name
+
+
+ def upload_root(prefix: Path | str) -> Path:
+     """Return the top-level directory in which to put files to be uploaded."""
+     path = Path(prefix) / "deriva-ml"
+     path.mkdir(exist_ok=True, parents=True)
+     return path
+
+
+ def execution_rids(prefix: Path | str) -> list[RID]:
+     """Return a list of all the execution RIDs that have files waiting to be uploaded."""
+     path = upload_root(prefix) / "execution"
+     return [d.name for d in path.iterdir()]
+
+
+ def execution_root(prefix: Path | str, exec_rid) -> Path:
+     """Return the path to the directory in which to place execution-specific upload files."""
+     path = upload_root(prefix) / "execution" / exec_rid
+     path.mkdir(exist_ok=True, parents=True)
+     return path
+
+
+ def feature_root(prefix: Path | str, exec_rid: str) -> Path:
+     """Return the path to the directory in which features for the specified execution should be placed."""
+     path = execution_root(prefix, exec_rid) / "feature"
+     path.mkdir(parents=True, exist_ok=True)
+     return path
+
+
+ def asset_root(prefix: Path | str, exec_rid: str) -> Path:
+     """Return the path to the directory in which assets for the specified execution should be placed."""
+     path = execution_root(prefix, exec_rid) / "asset"
+     path.mkdir(parents=True, exist_ok=True)
+     return path
+
+
+ def feature_dir(prefix: Path | str, exec_rid: str, schema: str, target_table: str, feature_name: str) -> Path:
+     """Return the path to the directory in which a named feature for an execution should be placed."""
+     path = feature_root(prefix, exec_rid) / schema / target_table / feature_name
+     path.mkdir(parents=True, exist_ok=True)
+     return path
+
+
+ def feature_value_path(prefix: Path | str, exec_rid: str, schema: str, target_table: str, feature_name: str) -> Path:
+     """Return the path to a JSON-lines file in which to place feature values that are to be uploaded.
+
+     Args:
+         prefix: Location of upload root directory.
+         exec_rid: RID of the execution to be associated with this feature.
+         schema: Domain schema name.
+         target_table: Target table name for the feature.
+         feature_name: Name of the feature.
+
+     Returns:
+         Path to the JSONL file in which to place feature values.
+     """
+     return feature_dir(prefix, exec_rid, schema, target_table, feature_name) / f"{feature_name}.jsonl"
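+ # For example (illustrative values):
+ #   feature_value_path("/tmp", "1-abc", "my_schema", "Image", "Quality")
+ #   -> /tmp/deriva-ml/execution/1-abc/feature/my_schema/Image/Quality/Quality.jsonl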
+
+
+ def table_path(prefix: Path | str, schema: str, table: str) -> Path:
+     """Return the path to a CSV file in which to place table values that are to be uploaded.
+
+     Args:
+         prefix: Location of upload root directory.
+         schema: Domain schema.
+         table: Name of the table to be uploaded.
+
+     Returns:
+         Path to the file in which to place table values that are to be uploaded.
+     """
+     path = upload_root(prefix) / "table" / schema / table
+     path.mkdir(parents=True, exist_ok=True)
+     return path / f"{table}.csv"
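+ # For example (illustrative values):
+ #   table_path("/tmp", "my_schema", "Image")
+ #   -> /tmp/deriva-ml/table/my_schema/Image/Image.csv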
+
+
+ def asset_table_upload_spec(model: DerivaModel, asset_table: str | Table):
+     """Generate an upload specification for an asset table.
+
+     Args:
+         model: The DerivaModel instance.
+         asset_table: The asset table name or Table object.
+
+     Returns:
+         A dictionary containing the upload specification for the asset table.
+     """
+     metadata_columns = model.asset_metadata(asset_table)
+     asset_table = model.name_to_table(asset_table)
+     schema = asset_table.schema.name
+
+     # Be careful here, as a metadata value might be a string which can contain special characters.
+     metadata_path = "/".join([rf"(?P<{c}>[-:._ \w]+)" for c in metadata_columns])
+     asset_path = f"{exec_dir_regex}/asset/{schema}/{asset_table.name}/{metadata_path}/{asset_file_regex}"
+
+     # Create the upload specification.
+     spec = {
+         # Upload assets into an asset table that has additional metadata columns.
+         "column_map": {
+             "MD5": "{md5}",
+             "URL": "{URI}",
+             "Length": "{file_size}",
+             "Filename": "{file_name}",
+         }
+         | {c: f"{{{c}}}" for c in metadata_columns},
+         "file_pattern": asset_path,  # Sets schema, asset_table, file
+         "asset_type": "file",
+         "target_table": [schema, asset_table.name],
+         "checksum_types": ["sha256", "md5"],
+         "hatrac_options": {"versioned_urls": True},
+         "hatrac_templates": {
+             "hatrac_uri": f"/hatrac/{asset_table.name}/{{md5}}.{{file_name}}",
+             "content-disposition": "filename*=UTF-8''{file_name}",
+         },
+         "record_query_template": "/entity/{target_table}/MD5={md5}&Filename={file_name}",
+     }
+     return spec
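+ # For an asset table "Image" with a metadata column "Dataset" (illustrative names), the
+ # generated file_pattern matches paths such as
+ # ".../deriva-ml/execution/<rid>/asset/<schema>/Image/<Dataset value>/file1.png".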
+
+
+ def bulk_upload_configuration(model: DerivaModel) -> dict[str, Any]:
+     """Return an upload specification for deriva-ml.
+
+     Args:
+         model: Model from which to generate the upload configuration.
+     """
+     asset_tables_with_metadata = [
+         asset_table_upload_spec(model=model, asset_table=t) for t in model.find_assets() if model.asset_metadata(t)
+     ]
+     return {
+         "asset_mappings": asset_tables_with_metadata
+         + [
+             {
+                 # Upload assets into an asset table that has no metadata columns.
+                 "column_map": {
+                     "MD5": "{md5}",
+                     "URL": "{URI}",
+                     "Length": "{file_size}",
+                     "Filename": "{file_name}",
+                 },
+                 "asset_type": "file",
+                 "target_table": ["{schema}", "{asset_table}"],
+                 "file_pattern": asset_path_regex + "/" + asset_file_regex,  # Sets schema, asset_table, name, ext
+                 "checksum_types": ["sha256", "md5"],
+                 "hatrac_options": {"versioned_urls": True},
+                 "hatrac_templates": {
+                     "hatrac_uri": "/hatrac/{asset_table}/{md5}.{file_name}",
+                     "content-disposition": "filename*=UTF-8''{file_name}",
+                 },
+                 "record_query_template": "/entity/{target_table}/MD5={md5}&Filename={file_name}",
+             },
+             # {
+             #     # Upload feature values into a table.
+             #     "asset_type": "skip",
+             #     "default_columns": ["RID", "RCB", "RMB", "RCT", "RMT"],
+             #     "file_pattern": feature_value_regex,  # Sets schema, table
+             #     "ext_pattern": "^.*[.](?P<file_ext>json|csv)$",
+             #     "target_table": ["{schema}", "{table}"],
+             # },
+             {
+                 # Upload the records into a table.
+                 "asset_type": "table",
+                 "default_columns": ["RID", "RCB", "RMB", "RCT", "RMT"],
+                 "file_pattern": table_regex,  # Sets schema, table
+                 "ext_pattern": "^.*[.](?P<file_ext>json|csv)$",
+                 "target_table": ["{schema}", "{table}"],
+             },
+         ],
+         "version_update_url": "https://github.com/informatics-isi-edu/deriva-client",
+         "version_compatibility": [[">=1.4.0", "<2.0.0"]],
+     }
+
+
+ @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
+ def upload_directory(model: DerivaModel, directory: Path | str) -> dict[Any, FileUploadState] | None:
+     """Upload assets from a directory. This routine assumes that the current upload specification includes a
+     configuration for the specified directory. Every asset in the specified directory is uploaded.
+
+     Args:
+         model: Model to upload assets to.
+         directory: Directory containing the assets and tables to upload.
+
+     Returns:
+         Results of the upload operation.
+
+     Raises:
+         DerivaMLException: If there is an issue with uploading the assets.
+     """
+     directory = Path(directory)
+     if not directory.is_dir():
+         raise DerivaMLException(f"Directory {directory} does not exist")
+
+     # Now upload the files by creating an upload spec and then calling the uploader.
+     with TemporaryDirectory() as temp_dir:
+         spec_file = Path(temp_dir) / "config.json"
+         with spec_file.open("w+") as cfile:
+             json.dump(bulk_upload_configuration(model), cfile)
+         uploader = GenericUploader(
+             server={
+                 "host": model.hostname,
+                 "protocol": "https",
+                 "catalog_id": model.catalog.catalog_id,
+             },
+             config_file=spec_file,
+         )
+         try:
+             uploader.getUpdatedConfig()
+             uploader.scanDirectory(directory, purge_state=True)
+             results = {
+                 path: FileUploadState(
+                     state=UploadState(result["State"]),
+                     status=result["Status"],
+                     result=result["Result"],
+                 )
+                 for path, result in uploader.uploadFiles().items()
+             }
+         finally:
+             uploader.cleanup()
+         return results
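+ # A minimal sketch of intended use (assumes an existing DerivaModel instance):
+ #   results = upload_directory(model, upload_root("/tmp"))
+ #   for file, state in results.items():
+ #       print(file, state.state)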
+
+
+ @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
+ def upload_asset(model: DerivaModel, file: Path | str, table: Table, **kwargs: Any) -> dict:
+     """Upload the specified file into Hatrac and update the associated asset table.
+
+     Args:
+         model: Model to upload assets to.
+         file: Path to the file to upload.
+         table: Name of the asset table.
+         kwargs: Keyword arguments for values of additional columns to be added to the asset table.
+
+     Returns:
+         The newly inserted asset table record.
+     """
+     if not model.is_asset(table):
+         raise DerivaMLException(f"Table {table} is not an asset table.")
+
+     file_path = Path(file)
+     file_name = file_path.name
+     file_size = file_path.stat().st_size
+
+     hatrac_path = f"/hatrac/{table.name}/"
+     hs = HatracStore(
+         "https",
+         server=model.catalog.deriva_server.server,
+         credentials=model.catalog.deriva_server.credentials,
+     )
+     md5_hashes = hash_utils.compute_file_hashes(file, ["md5"])["md5"]
+     sanitized_filename = urlquote(re.sub("[^a-zA-Z0-9_.-]", "_", md5_hashes[0] + "." + file_name))
+     hatrac_path = f"{hatrac_path}{sanitized_filename}"
+
+     # Upload the file to hatrac.
+     hatrac_uri = hs.put_obj(
+         hatrac_path,
+         file,
+         md5=md5_hashes[1],
+         content_type=mime_utils.guess_content_type(file),
+         content_disposition="filename*=UTF-8''" + file_name,
+     )
+
+     # Now update the asset table.
+     ipath = model.catalog.getPathBuilder().schemas[table.schema.name].tables[table.name]
+     return list(
+         ipath.insert(
+             [
+                 {
+                     "URL": hatrac_uri,
+                     "Filename": file_name,
+                     "Length": file_size,
+                     "MD5": md5_hashes[0],
+                 }
+                 | kwargs
+             ]
+         )
+     )[0]
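+ # For example (illustrative table and column names):
+ #   record = upload_asset(model, "plot.png", image_table, Description="Training loss curve")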
+
+
+ def asset_file_path(
+     prefix: Path | str,
+     exec_rid: RID,
+     asset_table: Table,
+     file_name: str,
+     metadata: dict[str, Any],
+ ) -> Path:
+     """Return the path to the file in which an asset of the specified type should be placed for upload.
+
+     Args:
+         prefix: Path prefix to use.
+         exec_rid: RID to use.
+         asset_table: Table in which to place assets.
+         file_name: File name to use.
+         metadata: Any additional metadata to add to the asset.
+
+     Returns:
+         Path to the file in which to place the asset.
+     """
+     schema = asset_table.schema.name
+     asset_name = asset_table.name
+
+     path = execution_root(prefix, exec_rid) / "asset" / schema / asset_name
+     metadata = metadata or {}
+     asset_columns = {
+         "Filename",
+         "URL",
+         "Length",
+         "MD5",
+         "Description",
+     }.union(set(DerivaSystemColumns))
+     asset_metadata = {c.name for c in asset_table.columns} - asset_columns
+
+     if not (asset_metadata >= set(metadata.keys())):
+         raise DerivaMLException(f"Metadata {metadata} does not match asset metadata {asset_metadata}")
+
+     for m in asset_metadata:
+         path = path / str(metadata.get(m, "None"))
+     path.mkdir(parents=True, exist_ok=True)
+     return path / file_name
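+ # For example, with an asset table "Image" having one extra metadata column (illustrative):
+ #   asset_file_path("/tmp", "1-abc", image_table, "img1.png", {"Dataset": "train"})
+ #   -> /tmp/deriva-ml/execution/1-abc/asset/<schema>/Image/train/img1.png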
+
+
+ def asset_type_path(prefix: Path | str, exec_rid: RID, asset_table: Table) -> Path:
+     """Return the path to a JSON-lines file in which to place asset_type information.
+
+     Args:
+         prefix: Location of upload root directory.
+         exec_rid: Execution RID.
+         asset_table: Table in which to place assets.
+
+     Returns:
+         Path to the file in which to place asset_type values for the named asset.
+     """
+     path = execution_root(prefix, exec_rid=exec_rid) / "asset-type" / asset_table.schema.name
+     path.mkdir(parents=True, exist_ok=True)
+     return path / f"{asset_table.name}.jsonl"