deriva-ml 1.17.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. deriva_ml/.DS_Store +0 -0
  2. deriva_ml/__init__.py +79 -0
  3. deriva_ml/bump_version.py +142 -0
  4. deriva_ml/core/__init__.py +39 -0
  5. deriva_ml/core/base.py +1527 -0
  6. deriva_ml/core/config.py +69 -0
  7. deriva_ml/core/constants.py +36 -0
  8. deriva_ml/core/definitions.py +74 -0
  9. deriva_ml/core/enums.py +222 -0
  10. deriva_ml/core/ermrest.py +288 -0
  11. deriva_ml/core/exceptions.py +28 -0
  12. deriva_ml/core/filespec.py +116 -0
  13. deriva_ml/dataset/__init__.py +12 -0
  14. deriva_ml/dataset/aux_classes.py +225 -0
  15. deriva_ml/dataset/dataset.py +1519 -0
  16. deriva_ml/dataset/dataset_bag.py +450 -0
  17. deriva_ml/dataset/history.py +109 -0
  18. deriva_ml/dataset/upload.py +439 -0
  19. deriva_ml/demo_catalog.py +495 -0
  20. deriva_ml/execution/__init__.py +26 -0
  21. deriva_ml/execution/environment.py +290 -0
  22. deriva_ml/execution/execution.py +1180 -0
  23. deriva_ml/execution/execution_configuration.py +147 -0
  24. deriva_ml/execution/workflow.py +413 -0
  25. deriva_ml/feature.py +228 -0
  26. deriva_ml/install_kernel.py +71 -0
  27. deriva_ml/model/__init__.py +0 -0
  28. deriva_ml/model/catalog.py +485 -0
  29. deriva_ml/model/database.py +719 -0
  30. deriva_ml/protocols/dataset.py +19 -0
  31. deriva_ml/run_notebook.py +228 -0
  32. deriva_ml/schema/__init__.py +3 -0
  33. deriva_ml/schema/annotations.py +473 -0
  34. deriva_ml/schema/check_schema.py +104 -0
  35. deriva_ml/schema/create_schema.py +393 -0
  36. deriva_ml/schema/deriva-ml-reference.json +8525 -0
  37. deriva_ml/schema/policy.json +81 -0
  38. deriva_ml/schema/table_comments_utils.py +57 -0
  39. deriva_ml/test.py +94 -0
  40. deriva_ml-1.17.10.dist-info/METADATA +38 -0
  41. deriva_ml-1.17.10.dist-info/RECORD +45 -0
  42. deriva_ml-1.17.10.dist-info/WHEEL +5 -0
  43. deriva_ml-1.17.10.dist-info/entry_points.txt +9 -0
  44. deriva_ml-1.17.10.dist-info/licenses/LICENSE +201 -0
  45. deriva_ml-1.17.10.dist-info/top_level.txt +1 -0
@@ -0,0 +1,473 @@
1
+ import argparse
2
+ import sys
3
+
4
+ from deriva.core.ermrest_model import Model, Table
5
+ from deriva.core.utils.core_utils import tag as deriva_tags
6
+
7
+ from deriva_ml.core.constants import DerivaAssetColumns
8
+ from deriva_ml.dataset.upload import bulk_upload_configuration
9
+ from deriva_ml.model.catalog import DerivaModel
10
+
11
+
12
def catalog_annotation(model: DerivaModel) -> None:
    """Build the catalog-level annotations and push them to the catalog.

    The navigation bar is assembled from the tables currently present in the ML
    schema and the domain schema, so the menu reflects the catalog as configured
    at call time. The resulting annotations are merged into ``model.annotations``
    and then sent to the catalog with ``model.apply()``.

    Args:
        model: A deriva model for the current catalog.
    """
    catalog_id = model.catalog.catalog_id
    ml_schema = model.ml_schema

    def recordset_url(schema_name, table_name):
        """Return the Chaise recordset URL for a table in this catalog."""
        return f"/chaise/recordset/#{catalog_id}/{schema_name}:{table_name}"

    def link(label, url):
        """Return a menu entry; keys are emitted as url first, then name."""
        return {"url": url, "name": label}

    # Fixed entries for the ERMrest client/group/lease tables in the public schema.
    user_info_menu = {
        "name": "User Info",
        "children": [
            link("Users", recordset_url("public", "ERMrest_Client")),
            link("Groups", recordset_url("public", "ERMrest_Group")),
            link("ERMrest RID Lease", recordset_url("public", "ERMrest_RID_Lease")),
        ],
    }

    # The primary tables of the ML schema; the label is the table name with
    # underscores shown as spaces.
    ml_menu = {
        "name": "Deriva-ML",
        "children": [
            link(table.replace("_", " "), recordset_url(ml_schema, table))
            for table in (
                "Workflow",
                "Execution",
                "Execution_Metadata",
                "Execution_Asset",
                "Dataset",
                "Dataset_Version",
            )
        ],
    }

    # The Page and File tables of the WWW schema.
    www_menu = {
        "name": "WWW",
        "children": [link(table, recordset_url("WWW", table)) for table in ("Page", "File")],
    }

    # Domain tables, leaving out controlled vocabularies and association tables.
    domain_menu = {
        "name": model.domain_schema,
        "children": [
            {
                "name": tname,
                "url": recordset_url(model.domain_schema, tname),
            }
            for tname in model.schemas[model.domain_schema].tables
            if not model.is_vocabulary(tname) and not model.is_association(tname, pure=False, max_arity=3)
        ],
    }

    # Controlled vocabularies: a header plus entries for the ML schema, then the
    # same for the domain schema.
    vocabulary_children = [{"name": f"{ml_schema} Vocabularies", "header": True}]
    vocabulary_children.extend(
        link(tname, recordset_url(ml_schema, tname))
        for tname in model.schemas[model.ml_schema].tables
        if model.is_vocabulary(tname)
    )
    vocabulary_children.append(
        {
            "name": f"{model.domain_schema} Vocabularies",
            "header": True,
        }
    )
    vocabulary_children.extend(
        link(tname, recordset_url(model.domain_schema, tname))
        for tname in model.schemas[model.domain_schema].tables
        if model.is_vocabulary(tname)
    )
    vocabulary_menu = {"name": "Vocabulary", "children": vocabulary_children}

    # Asset tables of the ML schema followed by those of the domain schema.
    asset_children = [
        link(tname, recordset_url(ml_schema, tname))
        for tname in model.schemas[model.ml_schema].tables
        if model.is_asset(tname)
    ]
    asset_children.extend(
        link(tname, recordset_url(model.domain_schema, tname))
        for tname in model.schemas[model.domain_schema].tables
        if model.is_asset(tname)
    )
    assets_menu = {"name": "Assets", "children": asset_children}

    registry_entry = link("Catalog Registry", "/chaise/recordset/#0/ermrest:registry@sort(RID)")

    documentation_menu = {
        "name": "Documentation",
        "children": [
            link(
                "ML Notebook Instruction",
                "https://github.com/informatics-isi-edu/deriva-ml/blob/main/docs/ml_workflow_instruction.md",
            ),
            link("Deriva-ML Documentation", "https://informatics-isi-edu.github.io/deriva-ml/"),
        ],
    }

    chaise_config = {
        "headTitle": "Catalog ML",
        "navbarBrandText": "ML Data Browser",
        "systemColumnsDisplayEntry": ["RID"],
        "systemColumnsDisplayCompact": ["RID"],
        "defaultTable": {"table": "Dataset", "schema": "deriva-ml"},
        "deleteRecord": True,
        "showFaceting": True,
        "shareCiteAcls": True,
        "exportConfigsSubmenu": {"acls": {"show": ["*"], "enable": ["*"]}},
        "resolverImplicitCatalog": False,
        "navbarMenu": {
            "newTab": False,
            "children": [
                user_info_menu,
                ml_menu,
                www_menu,
                domain_menu,
                vocabulary_menu,
                assets_menu,
                registry_entry,
                documentation_menu,
            ],
        },
    }

    model.annotations.update(
        {
            deriva_tags.display: {"name_style": {"underline_space": True}},
            deriva_tags.chaise_config: chaise_config,
            deriva_tags.bulk_upload: bulk_upload_configuration(model=model),
        }
    )
    model.apply()
183
+
184
+
185
def asset_annotation(asset_table: Table):
    """Attach display annotations to an asset table and apply them to the catalog.

    The annotations set the row name to the asset's ``Filename`` and define the
    visible columns: the system and asset columns, the asset types reached
    through the ``<asset>_Asset_Type`` association, and finally every remaining
    (metadata) column of the table.

    Args:
        asset_table: The Table object representing the asset table.

    Returns:
        None. The annotations are merged into ``asset_table.annotations`` and
        pushed with ``asset_table.schema.model.apply()``.
    """

    schema = asset_table.schema.name
    asset_name = asset_table.name
    # Keep the metadata columns in table order. Collecting them in a set (as was
    # done previously) gives an arbitrary iteration order, so the order of the
    # visible columns could differ from one run to the next.
    asset_metadata = [c.name for c in asset_table.columns if c.name not in DerivaAssetColumns]

    def fkey_column(column):
        """Map the column name to a FK if a constraint exists on the column"""
        return next(
            (
                (fk.name[0].name, fk.name[1])
                for fk in asset_table.foreign_keys
                if asset_table.columns[column] in fk.column_map
            ),
            column,
        )

    annotations = {
        deriva_tags.table_display: {"row_name": {"row_markdown_pattern": "{{{Filename}}}"}},
        deriva_tags.visible_columns: {
            "*": [
                "RID",
                "RCT",
                "RMT",
                [schema, f"{asset_name}_RCB_fkey"],
                [schema, f"{asset_name}_RMB_fkey"],
                "URL",
                "Filename",
                "Description",
                "Length",
                "MD5",
                {
                    # Asset types, reached through the <asset>_Asset_Type association table.
                    "source": [
                        {
                            "inbound": [
                                schema,
                                f"{asset_name}_Asset_Type_{asset_name}_fkey",
                            ]
                        },
                        {
                            "outbound": [
                                schema,
                                f"{asset_name}_Asset_Type_Asset_Type_fkey",
                            ]
                        },
                        "RID",
                    ],
                    "markdown_name": "Asset Types",
                },
            ]
            # Metadata columns come last; a column covered by a foreign key is
            # shown through that foreign key instead of as a raw value.
            + [fkey_column(c) for c in asset_metadata],
        },
    }
    asset_table.annotations.update(annotations)
    asset_table.schema.model.apply()
248
+
249
+
250
def generate_annotation(model: Model, schema: str) -> dict:
    """Build the Chaise display annotations for the core ML tables.

    Nothing is written to the catalog here; the dictionaries are only assembled
    and returned.

    Args:
        model: Catalog model. Only ``model.catalog.catalog_id`` is read, and the
            value is not used in the generated annotations.
        schema: Schema name placed in most foreign-key references. The Dataset
            type and version references, and the Dataset_Version row-name
            pattern, use the literal ``"deriva-ml"`` instead.

    Returns:
        A dict with the keys ``workflow_annotation``, ``dataset_annotation``,
        ``execution_annotation``, ``schema_annotation`` and
        ``dataset_version_annotation``.
    """
    catalog_id = model.catalog.catalog_id  # read, but not used below

    # Each helper returns a fresh object on every call, so no list or dict is
    # shared between different places in the result.
    def fkey(constraint):
        """Reference a foreign key of ``schema`` as a visible column."""
        return [schema, constraint]

    def outbound(constraint, schema_name=schema):
        """One outbound hop of a source path."""
        return {"outbound": [schema_name, constraint]}

    def inbound(constraint, schema_name=schema):
        """One inbound hop of a source path."""
        return {"inbound": [schema_name, constraint]}

    def through_association(inbound_constraint, outbound_constraint, label, schema_name=schema):
        """Source path across an association table, ending on the far table's RID."""
        return {
            "source": [
                inbound(inbound_constraint, schema_name),
                outbound(outbound_constraint, schema_name),
                "RID",
            ],
            "markdown_name": label,
        }

    def dataset_types():
        """Dataset types via the Dataset_Dataset_Type association (schema name is fixed)."""
        return through_association(
            "Dataset_Dataset_Type_Dataset_fkey",
            "Dataset_Dataset_Type_Dataset_Type_fkey",
            "Dataset Types",
            schema_name="deriva-ml",
        )

    def dataset_version():
        """Version of the dataset's referenced Dataset_Version row (schema name is fixed)."""
        return {
            "source": [outbound("Dataset_Version_fkey", "deriva-ml"), "Version"],
            "markdown_name": "Dataset Version",
        }

    workflow_annotation = {
        deriva_tags.visible_columns: {
            "*": [
                "RID",
                fkey("Workflow_RCB_fkey"),
                fkey("Workflow_RMB_fkey"),
                "Name",
                "Description",
                {
                    "display": {"markdown_pattern": "[{{{URL}}}]({{{URL}}})"},
                    "markdown_name": "URL",
                },
                "Checksum",
                "Version",
                {"source": [outbound("Workflow_Workflow_Type_fkey"), "RID"]},
            ]
        }
    }

    execution_annotation = {
        deriva_tags.visible_columns: {
            "*": [
                "RID",
                fkey("Execution_RCB_fkey"),
                fkey("Execution_RMB_fkey"),
                "RCT",
                "Description",
                {"source": [outbound("Execution_Workflow_fkey"), "RID"]},
                "Duration",
                "Status",
                "Status_Detail",
            ]
        },
        "tag:isrd.isi.edu,2016:visible-foreign-keys": {
            "detailed": [
                through_association(
                    "Dataset_Execution_Execution_fkey",
                    "Dataset_Execution_Dataset_fkey",
                    "Dataset",
                ),
                through_association(
                    "Execution_Asset_Execution_Execution_fkey",
                    "Execution_Asset_Execution_Execution_Asset_fkey",
                    "Execution Asset",
                ),
                through_association(
                    "Execution_Metadata_Execution_Execution_fkey",
                    "Execution_Metadata_Execution_Execution_Metadata_fkey",
                    "Execution Metadata",
                ),
            ]
        },
    }

    dataset_annotation = {
        deriva_tags.visible_columns: {
            "*": [
                "RID",
                "Description",
                fkey("Dataset_RCB_fkey"),
                fkey("Dataset_RMB_fkey"),
                dataset_version(),
            ],
            "detailed": [
                "RID",
                "Description",
                dataset_types(),
                dataset_version(),
                fkey("Dataset_RCB_fkey"),
                fkey("Dataset_RMB_fkey"),
            ],
            "filter": {
                "and": [
                    {"source": "RID"},
                    {"source": "Description"},
                    dataset_types(),
                    {
                        "source": [outbound("Dataset_RCB_fkey"), "RID"],
                        "markdown_name": "Created By",
                    },
                    {
                        "source": [outbound("Dataset_RMB_fkey"), "RID"],
                        "markdown_name": "Modified By",
                    },
                ]
            },
        }
    }

    schema_annotation = {
        "name_style": {"underline_space": True},
    }

    dataset_version_annotation = {
        deriva_tags.visible_columns: {
            "*": [
                "RID",
                "RCT",
                "RMT",
                fkey("Dataset_Version_RCB_fkey"),
                fkey("Dataset_Version_RMB_fkey"),
                {"source": [outbound("Dataset_Version_Dataset_fkey"), "RID"]},
                "Description",
                {
                    "display": {
                        "template_engine": "handlebars",
                        "markdown_pattern": "[{{{Version}}}](https://{{{$location.host}}}/id/{{{$catalog.id}}}/{{{Dataset}}}@{{{Snapshot}}})",
                    },
                    "markdown_name": "Version",
                },
                "Minid",
                {"source": [outbound("Dataset_Version_Execution_fkey"), "RID"]},
            ]
        },
        deriva_tags.visible_foreign_keys: {"*": []},
        deriva_tags.table_display: {
            "row_name": {"row_markdown_pattern": "{{{$fkey_deriva-ml_Dataset_Version_Dataset_fkey.RID}}}:{{{Version}}}"}
        },
    }

    return {
        "workflow_annotation": workflow_annotation,
        "dataset_annotation": dataset_annotation,
        "execution_annotation": execution_annotation,
        "schema_annotation": schema_annotation,
        "dataset_version_annotation": dataset_version_annotation,
    }
454
+
455
+
456
def main():
    """Main entry point for the annotations CLI.

    Parses ``hostname``, ``catalog_id`` and an optional ``schema-name``
    (default ``deriva-ml``), loads the catalog model, generates the ML table
    annotations with ``generate_annotation`` and prints them as JSON. Nothing is
    written back to the catalog by this function.

    Returns:
        None. Executes the CLI.
    """
    # Imported locally so the module's top-level imports stay as they are.
    import json

    from deriva.core import ErmrestCatalog, get_credential

    parser = argparse.ArgumentParser(description="Apply annotations to ML schema")
    parser.add_argument("hostname", help="Hostname for the catalog")
    parser.add_argument("catalog_id", help="Catalog ID")
    # For a positional argument the name is also the attribute name, so a dashed
    # name cannot be read as ``args.schema_name``; the dash is kept only in the
    # help output via ``metavar``. ``nargs="?"`` is what makes the default usable.
    parser.add_argument(
        "schema_name",
        metavar="schema-name",
        nargs="?",
        default="deriva-ml",
        help="Schema name (default: deriva-ml)",
    )
    args = parser.parse_args()

    # generate_annotation reads ``model.catalog.catalog_id``, so it needs a real
    # catalog model rather than the raw catalog id string.
    catalog = ErmrestCatalog(
        "https",
        args.hostname,
        args.catalog_id,
        credentials=get_credential(args.hostname),
    )
    annotations = generate_annotation(catalog.getCatalogModel(), args.schema_name)
    print(json.dumps(annotations, indent=2))


if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,104 @@
1
+ import json
2
+ import re
3
+ from importlib.resources import files
4
+ from pathlib import Path
5
+ from pprint import pprint
6
+
7
+ from deepdiff import DeepDiff
8
+ from deriva.core import AttrDict, BaseCLI, get_credential
9
+ from deriva.core.ermrest_catalog import ErmrestCatalog
10
+
11
+ from deriva_ml.core.definitions import ML_SCHEMA
12
+ from deriva_ml.schema.create_schema import create_ml_catalog
13
+
14
+
15
def normalize_schema(d):
    """Return a copy of a schema document reduced to the parts worth comparing.

    Mappings are rebuilt without their ``acl_bindings``, ``annotations`` and
    ``comment`` entries, lists are normalized element by element, and any other
    value is returned as is. A string that starts with ``<prefix>:{RID}`` is
    replaced by ``reference-catalog:{RID}`` unless the prefix is ``deriva-ml``.
    """
    if isinstance(d, dict) or isinstance(d, AttrDict):
        ignored_keys = ("acl_bindings", "annotations", "comment")
        return {key: normalize_schema(value) for key, value in d.items() if key not in ignored_keys}

    if isinstance(d, list):
        return [normalize_schema(item) for item in d]

    if isinstance(d, str):
        # ID templates for controlled vocabulary
        template = re.match("(?P<s>.*):{RID}", d)
        if template and template["s"] != "deriva-ml":
            return "reference-catalog:{RID}"

    return d
32
+
33
+
34
def check_ml_schema(hostname, catalog_id, schema_file: Path | None = None):
    """Compare a catalog's ML schema with a reference schema and report the differences.

    Both schemas are passed through ``normalize_schema`` before they are
    compared, and list order is ignored in the comparison.

    Args:
        hostname: The hostname of the Deriva catalog.
        catalog_id: The catalog ID to check.
        schema_file: Optional path to a reference schema file. When omitted, the
            ``deriva-ml-reference.json`` resource of ``deriva_ml.schema`` is used.

    Returns:
        The DeepDiff result (tree view) between the reference schema and the
        catalog's schema. The same result is pretty-printed to stdout.
    """
    schema_file = schema_file or files("deriva_ml.schema").joinpath("deriva-ml-reference.json")

    # Reference side: the ML schema section of the JSON document.
    with Path(schema_file).open("r") as reference_fp:
        reference_doc = json.load(reference_fp)
    reference_schema = normalize_schema(reference_doc["schemas"][ML_SCHEMA])

    # Target side: the ML schema of the live catalog.
    credentials = get_credential(hostname)
    catalog = ErmrestCatalog("https", hostname, catalog_id, credentials=credentials)
    live_model = catalog.getCatalogModel()
    target_schema = normalize_schema(live_model.schemas[ML_SCHEMA].prejson())

    diff = DeepDiff(reference_schema, target_schema, ignore_order=True, view="tree")
    print(f"Diff between {schema_file} and {ML_SCHEMA} schema:")
    pprint(diff, indent=2)
    return diff
62
+
63
+
64
def dump_ml_schema(hostname: str, filename: str = "deriva-ml-reference.json") -> None:
    """Write the model of a newly created ML catalog to a JSON file.

    A catalog is created with ``create_ml_catalog(hostname, "reference-catalog")``,
    its model is written to ``filename`` as indented JSON, and the catalog is
    deleted again afterwards, even when the dump fails.

    Args:
        hostname: Host on which the catalog is created.
        filename: Path of the JSON file to write.
    """
    reference_catalog = create_ml_catalog(hostname, "reference-catalog")
    try:
        reference_model = reference_catalog.getCatalogModel()
        output_path = Path(filename)
        print(f"Dumping ML schema to {output_path.resolve()}...")
        with output_path.open("w") as out:
            json.dump(reference_model.prejson(), out, indent=2)
    finally:
        # The catalog only exists to produce the dump.
        reference_catalog.delete_ermrest_catalog(really=True)
74
+
75
+
76
class CheckMLSchemaCLI(BaseCLI):
    """Command-line front end that parses the arguments and runs the schema check or dump."""

    def __init__(self, description, epilog, **kwargs):
        """Set up the base CLI parser and add the ``--catalog`` and ``--dump`` options."""
        BaseCLI.__init__(self, description, epilog, **kwargs)

        self.parser.add_argument("--catalog", default=1, metavar="<1>", help="Catalog number. Default: 1")
        self.parser.add_argument(
            "--dump",
            action="store_true",
            help="Dump the model of a newly created reference catalog to deriva-ml-reference.json "
            "instead of checking a catalog.",
        )

    def main(self):
        """Parse the arguments, then either dump a reference schema or check the catalog."""
        args = self.parse_cli()
        hostname = args.host
        catalog_id = args.catalog

        if args.dump:
            # The second parameter of dump_ml_schema is the output filename, so
            # the catalog id must not be passed here; the default filename is used.
            dump_ml_schema(hostname)
            return

        check_ml_schema(hostname, catalog_id)
96
+
97
+
98
def main():
    """Build the schema-check CLI and run it."""
    CheckMLSchemaCLI(description="Check DerivaML Catalog for Compliance", epilog="").main()


if __name__ == "__main__":
    main()