deriva-ml 1.14.0__py3-none-any.whl → 1.14.27__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (49)
  1. deriva_ml/__init__.py +25 -30
  2. deriva_ml/core/__init__.py +39 -0
  3. deriva_ml/core/base.py +1489 -0
  4. deriva_ml/core/constants.py +36 -0
  5. deriva_ml/core/definitions.py +74 -0
  6. deriva_ml/core/enums.py +222 -0
  7. deriva_ml/core/ermrest.py +288 -0
  8. deriva_ml/core/exceptions.py +28 -0
  9. deriva_ml/core/filespec.py +116 -0
  10. deriva_ml/dataset/__init__.py +4 -0
  11. deriva_ml/{dataset_aux_classes.py → dataset/aux_classes.py} +16 -12
  12. deriva_ml/{dataset.py → dataset/dataset.py} +406 -428
  13. deriva_ml/{dataset_bag.py → dataset/dataset_bag.py} +137 -97
  14. deriva_ml/{history.py → dataset/history.py} +51 -33
  15. deriva_ml/{upload.py → dataset/upload.py} +48 -70
  16. deriva_ml/demo_catalog.py +233 -183
  17. deriva_ml/execution/environment.py +290 -0
  18. deriva_ml/{execution.py → execution/execution.py} +365 -252
  19. deriva_ml/execution/execution_configuration.py +163 -0
  20. deriva_ml/{execution_configuration.py → execution/workflow.py} +212 -224
  21. deriva_ml/feature.py +83 -46
  22. deriva_ml/model/__init__.py +0 -0
  23. deriva_ml/{deriva_model.py → model/catalog.py} +113 -132
  24. deriva_ml/{database_model.py → model/database.py} +52 -74
  25. deriva_ml/model/sql_mapper.py +44 -0
  26. deriva_ml/run_notebook.py +19 -11
  27. deriva_ml/schema/__init__.py +3 -0
  28. deriva_ml/{schema_setup → schema}/annotations.py +31 -22
  29. deriva_ml/schema/check_schema.py +104 -0
  30. deriva_ml/{schema_setup → schema}/create_schema.py +151 -104
  31. deriva_ml/schema/deriva-ml-reference.json +8525 -0
  32. deriva_ml/schema/table_comments_utils.py +57 -0
  33. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/METADATA +5 -4
  34. deriva_ml-1.14.27.dist-info/RECORD +40 -0
  35. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/entry_points.txt +1 -0
  36. deriva_ml/deriva_definitions.py +0 -391
  37. deriva_ml/deriva_ml_base.py +0 -1046
  38. deriva_ml/execution_environment.py +0 -139
  39. deriva_ml/schema_setup/table_comments_utils.py +0 -56
  40. deriva_ml/test-files/execution-parameters.json +0 -1
  41. deriva_ml/test-files/notebook-parameters.json +0 -5
  42. deriva_ml/test_functions.py +0 -141
  43. deriva_ml/test_notebook.ipynb +0 -197
  44. deriva_ml-1.14.0.dist-info/RECORD +0 -31
  45. /deriva_ml/{schema_setup → execution}/__init__.py +0 -0
  46. /deriva_ml/{schema_setup → schema}/policy.json +0 -0
  47. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/WHEEL +0 -0
  48. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/licenses/LICENSE +0 -0
  49. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/top_level.txt +0 -0
@@ -1,20 +1,27 @@
1
1
  import argparse
2
+ import subprocess
2
3
  import sys
3
- from typing import Optional, Any
4
+ from importlib.resources import files
5
+ from typing import Any, Optional
4
6
 
5
- from deriva.core import DerivaServer, get_credential, ErmrestCatalog
6
- from deriva.core.ermrest_model import Model
7
+ from deriva.core import DerivaServer, ErmrestCatalog, get_credential
7
8
  from deriva.core.ermrest_model import (
8
- builtin_types,
9
- Schema,
10
- Table,
11
9
  Column,
12
10
  ForeignKey,
13
11
  Key,
12
+ Model,
13
+ Schema,
14
+ Table,
15
+ builtin_types,
14
16
  )
15
17
 
16
- from deriva_ml import MLVocab
17
- from deriva_ml.schema_setup.annotations import generate_annotation, asset_annotation
18
+ from deriva_ml.core.definitions import ML_SCHEMA, MLTable, MLVocab
19
+ from deriva_ml.schema.annotations import asset_annotation, generate_annotation
20
+
21
+ try:
22
+ from icecream import ic
23
+ except ImportError: # Graceful fallback if IceCream isn't installed.
24
+ ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a) # noqa
18
25
 
19
26
 
20
27
  def create_dataset_table(
@@ -23,10 +30,10 @@ def create_dataset_table(
23
30
  project_name: str,
24
31
  dataset_annotation: Optional[dict] = None,
25
32
  version_annotation: Optional[dict] = None,
26
- ):
33
+ ) -> Table:
27
34
  dataset_table = schema.create_table(
28
35
  Table.define(
29
- tname="Dataset",
36
+ tname=MLTable.dataset,
30
37
  column_defs=[
31
38
  Column.define("Description", builtin_types.markdown),
32
39
  Column.define("Deleted", builtin_types.boolean),
@@ -35,9 +42,8 @@ def create_dataset_table(
35
42
  )
36
43
  )
37
44
 
38
- dataset_type = schema.create_table(
39
- Table.define_vocabulary(MLVocab.dataset_type, f"{project_name}:{{RID}}")
40
- )
45
+ dataset_type = schema.create_table(Table.define_vocabulary(MLVocab.dataset_type, f"{project_name}:{{RID}}"))
46
+
41
47
  schema.create_table(
42
48
  Table.define_association(
43
49
  associates=[
@@ -47,27 +53,31 @@ def create_dataset_table(
47
53
  )
48
54
  )
49
55
 
50
- dataset_version = schema.create_table(
51
- define_table_dataset_version(schema.name, version_annotation)
52
- )
56
+ dataset_version = schema.create_table(define_table_dataset_version(schema.name, version_annotation))
53
57
  dataset_table.create_reference(("Version", True, dataset_version))
54
58
 
55
59
  # Nested datasets.
56
60
  schema.create_table(
57
- Table.define_association(
58
- associates=[("Dataset", dataset_table), ("Nested_Dataset", dataset_table)]
59
- )
61
+ Table.define_association(associates=[("Dataset", dataset_table), ("Nested_Dataset", dataset_table)])
60
62
  )
61
63
  schema.create_table(
62
- Table.define_association(
63
- associates=[("Dataset", dataset_table), ("Execution", execution_table)]
64
- )
64
+ Table.define_association(associates=[("Dataset", dataset_table), ("Execution", execution_table)])
65
65
  )
66
+ return dataset_table
66
67
 
67
68
 
68
69
  def define_table_dataset_version(sname: str, annotation: Optional[dict] = None):
69
- return Table.define(
70
- tname="Dataset_Version",
70
+ """Define the dataset version table in the specified schema.
71
+
72
+ Args:
73
+ sname: The schema name where the table should be created.
74
+ annotation: Optional annotation dictionary for the table.
75
+
76
+ Returns:
77
+ The created Table object.
78
+ """
79
+ table = Table.define(
80
+ tname=MLTable.dataset_version,
71
81
  column_defs=[
72
82
  Column.define(
73
83
  "Version",
@@ -78,9 +88,7 @@ def define_table_dataset_version(sname: str, annotation: Optional[dict] = None):
78
88
  Column.define("Description", builtin_types.markdown),
79
89
  Column.define("Dataset", builtin_types.text, comment="RID of dataset"),
80
90
  Column.define("Execution", builtin_types.text, comment="RID of execution"),
81
- Column.define(
82
- "Minid", builtin_types.text, comment="URL to MINID for dataset"
83
- ),
91
+ Column.define("Minid", builtin_types.text, comment="URL to MINID for dataset"),
84
92
  Column.define(
85
93
  "Snapshot",
86
94
  builtin_types.text,
@@ -94,13 +102,23 @@ def define_table_dataset_version(sname: str, annotation: Optional[dict] = None):
94
102
  ForeignKey.define(["Execution"], sname, "Execution", ["RID"]),
95
103
  ],
96
104
  )
105
+ return table
97
106
 
98
107
 
99
108
  def create_execution_table(schema, annotation: Optional[dict] = None):
109
+ """Create the execution table in the specified schema.
110
+
111
+ Args:
112
+ schema: The schema where the table should be created.
113
+ annotation: Optional annotation dictionary for the table.
114
+
115
+ Returns:
116
+ The created Table object.
117
+ """
100
118
  annotation = annotation if annotation is not None else {}
101
119
  execution = schema.create_table(
102
120
  Table.define(
103
- "Execution",
121
+ MLTable.execution,
104
122
  column_defs=[
105
123
  Column.define("Workflow", builtin_types.text),
106
124
  Column.define("Description", builtin_types.markdown),
@@ -108,9 +126,7 @@ def create_execution_table(schema, annotation: Optional[dict] = None):
108
126
  Column.define("Status", builtin_types.text),
109
127
  Column.define("Status_Detail", builtin_types.text),
110
128
  ],
111
- fkey_defs=[
112
- ForeignKey.define(["Workflow"], schema.name, "Workflow", ["RID"])
113
- ],
129
+ fkey_defs=[ForeignKey.define(["Workflow"], schema.name, "Workflow", ["RID"])],
114
130
  annotations=annotation,
115
131
  )
116
132
  )
@@ -123,6 +139,7 @@ def create_asset_table(
123
139
  execution_table,
124
140
  asset_type_table,
125
141
  asset_role_table,
142
+ use_hatrac: bool = True,
126
143
  ):
127
144
  asset_table = schema.create_table(
128
145
  Table.define_asset(
@@ -153,45 +170,19 @@ def create_asset_table(
153
170
  return asset_table
154
171
 
155
172
 
156
- def create_file_table(
157
- schema: Schema,
158
- execution_table: Table,
159
- project_name: str,
160
- annotation: Optional[dict] = None,
161
- ):
162
- """Define files table structure"""
163
- annotation = annotation or {}
164
- file_table = schema.create_table(
165
- Table.define_asset(sname=schema.name, tname="File")
166
- )
167
-
168
- file_type = schema.create_table(
169
- Table.define_vocabulary(MLVocab.file_type, f"{project_name}:{{RID}}")
170
- )
171
-
172
- schema.create_table(
173
- Table.define_association(
174
- associates=[
175
- ("File", file_table),
176
- (MLVocab.file_type, file_type),
177
- ]
178
- )
179
- )
180
- schema.create_table(
181
- Table.define_association(
182
- [
183
- ("File", file_table),
184
- ("Execution", execution_table),
185
- ]
186
- )
187
- )
173
+ def create_workflow_table(schema: Schema, annotations: Optional[dict[str, Any]] = None):
174
+ """Create the workflow table in the specified schema.
188
175
 
176
+ Args:
177
+ schema: The schema where the table should be created.
178
+ annotations: Optional annotation dictionary for the table.
189
179
 
190
- def create_workflow_table(schema: Schema, annotations: Optional[dict[str, Any]] = None):
191
- annotations = annotations or {}
180
+ Returns:
181
+ The created Table object.
182
+ """
192
183
  workflow_table = schema.create_table(
193
184
  Table.define(
194
- "Workflow",
185
+ tname=MLTable.workflow,
195
186
  column_defs=[
196
187
  Column.define("Name", builtin_types.text),
197
188
  Column.define("Description", builtin_types.markdown),
@@ -203,9 +194,7 @@ def create_workflow_table(schema: Schema, annotations: Optional[dict[str, Any]]
203
194
  )
204
195
  )
205
196
  workflow_table.create_reference(
206
- schema.create_table(
207
- Table.define_vocabulary(MLVocab.workflow_type, f"{schema.name}:{{RID}}")
208
- )
197
+ schema.create_table(Table.define_vocabulary(MLVocab.workflow_type, f"{schema.name}:{{RID}}"))
209
198
  )
210
199
  return workflow_table
211
200
 
@@ -226,39 +215,23 @@ def create_ml_schema(
226
215
 
227
216
  client_annotation = {
228
217
  "tag:misd.isi.edu,2015:display": {"name": "Users"},
229
- "tag:isrd.isi.edu,2016:table-display": {
230
- "row_name": {"row_markdown_pattern": "{{{Full_Name}}}"}
231
- },
232
- "tag:isrd.isi.edu,2016:visible-columns": {
233
- "compact": ["Full_Name", "Display_Name", "Email", "ID"]
234
- },
218
+ "tag:isrd.isi.edu,2016:table-display": {"row_name": {"row_markdown_pattern": "{{{Full_Name}}}"}},
219
+ "tag:isrd.isi.edu,2016:visible-columns": {"compact": ["Full_Name", "Display_Name", "Email", "ID"]},
235
220
  }
236
- model.schemas["public"].tables["ERMrest_Client"].annotations.update(
237
- client_annotation
238
- )
221
+ model.schemas["public"].tables["ERMrest_Client"].annotations.update(client_annotation)
239
222
  model.apply()
240
223
 
241
- schema = model.create_schema(
242
- Schema.define(schema_name, annotations=annotations["schema_annotation"])
243
- )
224
+ schema = model.create_schema(Schema.define(schema_name, annotations=annotations["schema_annotation"]))
244
225
 
245
226
  # Create workflow and execution table.
246
227
 
247
- schema.create_table(
248
- Table.define_vocabulary("Feature_Name", f"{project_name}:{{RID}}")
249
- )
250
- asset_type_table = schema.create_table(
251
- Table.define_vocabulary("Asset_Type", f"{project_name}:{{RID}}")
252
- )
253
- asset_role_table = schema.create_table(
254
- Table.define_vocabulary("Asset_Role", f"{project_name}:{{RID}}")
255
- )
228
+ schema.create_table(Table.define_vocabulary(MLVocab.feature_name, f"{project_name}:{{RID}}"))
229
+ asset_type_table = schema.create_table(Table.define_vocabulary(MLVocab.asset_type, f"{project_name}:{{RID}}"))
230
+ asset_role_table = schema.create_table(Table.define_vocabulary(MLVocab.asset_role, f"{project_name}:{{RID}}"))
256
231
 
257
232
  create_workflow_table(schema, annotations["workflow_annotation"])
258
- execution_table = create_execution_table(
259
- schema, annotations["execution_annotation"]
260
- )
261
- create_dataset_table(
233
+ execution_table = create_execution_table(schema, annotations["execution_annotation"])
234
+ dataset_table = create_dataset_table(
262
235
  schema,
263
236
  execution_table,
264
237
  project_name,
@@ -268,7 +241,7 @@ def create_ml_schema(
268
241
 
269
242
  create_asset_table(
270
243
  schema,
271
- "Execution_Metadata",
244
+ MLTable.execution_metadata,
272
245
  execution_table,
273
246
  asset_type_table,
274
247
  asset_role_table,
@@ -276,21 +249,47 @@ def create_ml_schema(
276
249
 
277
250
  create_asset_table(
278
251
  schema,
279
- "Execution_Asset",
252
+ MLTable.execution_asset,
280
253
  execution_table,
281
254
  asset_type_table,
282
255
  asset_role_table,
283
256
  )
284
257
 
285
258
  # File table
286
- create_file_table(schema, execution_table, project_name)
259
+ file_table = create_asset_table(
260
+ schema,
261
+ MLTable.file,
262
+ execution_table,
263
+ asset_type_table,
264
+ asset_role_table,
265
+ use_hatrac=False,
266
+ )
267
+ # And make Files be part of a dataset.
268
+ schema.create_table(
269
+ Table.define_association(
270
+ associates=[
271
+ ("Dataset", dataset_table),
272
+ (MLTable.file, file_table),
273
+ ]
274
+ )
275
+ )
287
276
 
288
277
  initialize_ml_schema(model, schema_name)
289
278
 
290
279
 
291
280
  def initialize_ml_schema(model: Model, schema_name: str = "deriva-ml"):
281
+ """Initialize the ML schema with all required tables.
282
+
283
+ Args:
284
+ model: The ERMrest model to add the schema to.
285
+ schema_name: The name of the schema to create. Defaults to "deriva-ml".
286
+
287
+ Returns:
288
+ None. Modifies the model in place.
289
+ """
290
+
292
291
  catalog = model.catalog
293
- asset_type = catalog.getPathBuilder().schemas[schema_name].tables["Asset_Type"]
292
+ asset_type = catalog.getPathBuilder().schemas[schema_name].tables[MLVocab.asset_type]
294
293
  asset_type.insert(
295
294
  [
296
295
  {
@@ -309,10 +308,13 @@ def initialize_ml_schema(model: Model, schema_name: str = "deriva-ml"):
309
308
  "Name": "Execution_Asset",
310
309
  "Description": "A file generated by an execution",
311
310
  },
311
+ {"Name": "File", "Description": "A file that is not managed by Hatrac"},
312
+ {"Name": "Model_File", "Description": "The ML model."},
312
313
  ],
313
314
  defaults={"ID", "URI"},
314
315
  )
315
- asset_role = catalog.getPathBuilder().schemas[schema_name].tables["Asset_Role"]
316
+
317
+ asset_role = catalog.getPathBuilder().schemas[schema_name].tables[MLVocab.asset_role]
316
318
  asset_role.insert(
317
319
  [
318
320
  {"Name": "Input", "Description": "Asset used for input of an execution."},
@@ -320,21 +322,66 @@ def initialize_ml_schema(model: Model, schema_name: str = "deriva-ml"):
320
322
  ],
321
323
  defaults={"ID", "URI"},
322
324
  )
325
+ dataset_type = catalog.getPathBuilder().schemas[schema_name].tables[MLVocab.dataset_type]
326
+ dataset_type.insert(
327
+ [{"Name": "File", "Description": "A dataset that contains file assets."}],
328
+ defaults={"ID", "URI"},
329
+ )
330
+
331
+
332
+ def create_ml_catalog(hostname: str, project_name: str) -> ErmrestCatalog:
333
+ server = DerivaServer("https", hostname, credentials=get_credential(hostname))
334
+ catalog = server.create_ermrest_catalog()
335
+ model = catalog.getCatalogModel()
336
+ model.configure_baseline_catalog()
337
+ policy_file = files("deriva_ml.schema").joinpath("policy.json")
338
+ subprocess.run(
339
+ [
340
+ "deriva-acl-config",
341
+ "--host",
342
+ catalog.deriva_server.server,
343
+ "--config-file",
344
+ policy_file,
345
+ catalog.catalog_id,
346
+ ]
347
+ )
348
+ create_ml_schema(catalog, project_name=project_name)
349
+ return catalog
350
+
351
+
352
+ def reset_ml_schema(catalog: ErmrestCatalog, ml_schema=ML_SCHEMA) -> None:
353
+ model = catalog.getCatalogModel()
354
+ schemas = [schema for sname, schema in model.schemas.items() if sname not in ["public", "WWW"]]
355
+ for s in schemas:
356
+ s.drop(cascade=True)
357
+ model = catalog.getCatalogModel()
358
+ create_ml_schema(catalog, ml_schema)
323
359
 
324
360
 
325
361
  def main():
362
+ """Main entry point for the schema creation CLI.
363
+
364
+ Creates ML schema and catalog based on command line arguments.
365
+
366
+ Returns:
367
+ None. Executes the CLI.
368
+ """
326
369
  scheme = "https"
327
- parser = argparse.ArgumentParser()
328
- parser.add_argument("--hostname", type=str, required=True)
329
- parser.add_argument("--schema_name", type=str, required=True)
330
- parser.add_argument("--catalog_id", type=str, required=True)
331
- parser.add_argument("--curie_prefix", type=str, required=True)
370
+ parser = argparse.ArgumentParser(description="Create ML schema and catalog")
371
+ parser.add_argument("hostname", help="Hostname for the catalog")
372
+ parser.add_argument("project_name", help="Project name for the catalog")
373
+ parser.add_argument("schema-name", default="deriva-ml", help="Schema name (default: deriva-ml)")
374
+ parser.add_argument("curie_prefix", type=str, required=True)
375
+
332
376
  args = parser.parse_args()
333
377
  credentials = get_credential(args.hostname)
334
378
  server = DerivaServer(scheme, args.hostname, credentials)
335
379
  model = server.connect_ermrest(args.catalog_id).getCatalogModel()
336
380
  create_ml_schema(model, args.schema_name)
337
381
 
382
+ print(f"Created ML catalog at {args.hostname} with project {args.project_name}")
383
+ print(f"Schema '{args.schema_name}' initialized successfully")
384
+
338
385
 
339
386
  if __name__ == "__main__":
340
387
  sys.exit(main())