deriva-ml 1.17.10 (deriva_ml-1.17.10-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. deriva_ml/.DS_Store +0 -0
  2. deriva_ml/__init__.py +79 -0
  3. deriva_ml/bump_version.py +142 -0
  4. deriva_ml/core/__init__.py +39 -0
  5. deriva_ml/core/base.py +1527 -0
  6. deriva_ml/core/config.py +69 -0
  7. deriva_ml/core/constants.py +36 -0
  8. deriva_ml/core/definitions.py +74 -0
  9. deriva_ml/core/enums.py +222 -0
  10. deriva_ml/core/ermrest.py +288 -0
  11. deriva_ml/core/exceptions.py +28 -0
  12. deriva_ml/core/filespec.py +116 -0
  13. deriva_ml/dataset/__init__.py +12 -0
  14. deriva_ml/dataset/aux_classes.py +225 -0
  15. deriva_ml/dataset/dataset.py +1519 -0
  16. deriva_ml/dataset/dataset_bag.py +450 -0
  17. deriva_ml/dataset/history.py +109 -0
  18. deriva_ml/dataset/upload.py +439 -0
  19. deriva_ml/demo_catalog.py +495 -0
  20. deriva_ml/execution/__init__.py +26 -0
  21. deriva_ml/execution/environment.py +290 -0
  22. deriva_ml/execution/execution.py +1180 -0
  23. deriva_ml/execution/execution_configuration.py +147 -0
  24. deriva_ml/execution/workflow.py +413 -0
  25. deriva_ml/feature.py +228 -0
  26. deriva_ml/install_kernel.py +71 -0
  27. deriva_ml/model/__init__.py +0 -0
  28. deriva_ml/model/catalog.py +485 -0
  29. deriva_ml/model/database.py +719 -0
  30. deriva_ml/protocols/dataset.py +19 -0
  31. deriva_ml/run_notebook.py +228 -0
  32. deriva_ml/schema/__init__.py +3 -0
  33. deriva_ml/schema/annotations.py +473 -0
  34. deriva_ml/schema/check_schema.py +104 -0
  35. deriva_ml/schema/create_schema.py +393 -0
  36. deriva_ml/schema/deriva-ml-reference.json +8525 -0
  37. deriva_ml/schema/policy.json +81 -0
  38. deriva_ml/schema/table_comments_utils.py +57 -0
  39. deriva_ml/test.py +94 -0
  40. deriva_ml-1.17.10.dist-info/METADATA +38 -0
  41. deriva_ml-1.17.10.dist-info/RECORD +45 -0
  42. deriva_ml-1.17.10.dist-info/WHEEL +5 -0
  43. deriva_ml-1.17.10.dist-info/entry_points.txt +9 -0
  44. deriva_ml-1.17.10.dist-info/licenses/LICENSE +201 -0
  45. deriva_ml-1.17.10.dist-info/top_level.txt +1 -0
deriva_ml/schema/create_schema.py
@@ -0,0 +1,393 @@
import argparse
import subprocess
import sys
from importlib.resources import files
from typing import Any, Optional

from deriva.core import DerivaServer, ErmrestCatalog, get_credential
from deriva.core.ermrest_model import (
    Column,
    ForeignKey,
    Key,
    Model,
    Schema,
    Table,
    builtin_types,
)

from deriva_ml.core.definitions import ML_SCHEMA, MLTable, MLVocab
from deriva_ml.schema.annotations import asset_annotation, generate_annotation

try:
    from icecream import ic
except ImportError:  # Graceful fallback if IceCream isn't installed.
    ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a)  # noqa


def create_dataset_table(
    schema: Schema,
    execution_table: Table,
    project_name: str,
    dataset_annotation: Optional[dict] = None,
    version_annotation: Optional[dict] = None,
) -> Table:
    """Create the Dataset table along with its type vocabulary, version table, and association tables."""
    dataset_table = schema.create_table(
        Table.define(
            tname=MLTable.dataset,
            column_defs=[
                Column.define("Description", builtin_types.markdown),
                Column.define("Deleted", builtin_types.boolean),
            ],
            annotations=dataset_annotation if dataset_annotation is not None else {},
        )
    )

    dataset_type = schema.create_table(Table.define_vocabulary(MLVocab.dataset_type, f"{project_name}:{{RID}}"))

    schema.create_table(
        Table.define_association(
            associates=[
                ("Dataset", dataset_table),
                (MLVocab.dataset_type, dataset_type),
            ]
        )
    )

    dataset_version = schema.create_table(define_table_dataset_version(schema.name, version_annotation))
    dataset_table.create_reference(("Version", True, dataset_version))

    # Nested datasets.
    schema.create_table(
        Table.define_association(associates=[("Dataset", dataset_table), ("Nested_Dataset", dataset_table)])
    )
    schema.create_table(
        Table.define_association(associates=[("Dataset", dataset_table), ("Execution", execution_table)])
    )
    return dataset_table


def define_table_dataset_version(sname: str, annotation: Optional[dict] = None):
    """Define the dataset version table in the specified schema.

    Args:
        sname: The schema name where the table should be created.
        annotation: Optional annotation dictionary for the table.

    Returns:
        The table definition, suitable for passing to Schema.create_table().
    """
    table = Table.define(
        tname=MLTable.dataset_version,
        column_defs=[
            Column.define(
                "Version",
                builtin_types.text,
                default="0.1.0",
                comment="Semantic version of dataset",
            ),
            Column.define("Description", builtin_types.markdown),
            Column.define("Dataset", builtin_types.text, comment="RID of dataset"),
            Column.define("Execution", builtin_types.text, comment="RID of execution"),
            Column.define("Minid", builtin_types.text, comment="URL to MINID for dataset"),
            Column.define(
                "Snapshot",
                builtin_types.text,
                comment="Catalog Snapshot ID for dataset",
            ),
        ],
        annotations=annotation,
        key_defs=[Key.define(["Dataset", "Version"])],
        fkey_defs=[
            ForeignKey.define(["Dataset"], sname, "Dataset", ["RID"]),
            ForeignKey.define(["Execution"], sname, "Execution", ["RID"]),
        ],
    )
    return table


def create_execution_table(schema, annotation: Optional[dict] = None):
    """Create the execution table in the specified schema.

    Args:
        schema: The schema where the table should be created.
        annotation: Optional annotation dictionary for the table.

    Returns:
        The created Table object.
    """
    annotation = annotation if annotation is not None else {}
    execution = schema.create_table(
        Table.define(
            MLTable.execution,
            column_defs=[
                Column.define("Workflow", builtin_types.text),
                Column.define("Description", builtin_types.markdown),
                Column.define("Duration", builtin_types.text),
                Column.define("Status", builtin_types.text),
                Column.define("Status_Detail", builtin_types.text),
            ],
            fkey_defs=[ForeignKey.define(["Workflow"], schema.name, "Workflow", ["RID"])],
            annotations=annotation,
        )
    )
    return execution


def create_asset_table(
    schema,
    asset_name: str,
    execution_table,
    asset_type_table,
    asset_role_table,
    use_hatrac: bool = True,  # Accepted but not referenced below; assets always use the hatrac template.
):
    """Create an asset table plus its Asset_Type and Execution association tables."""
    asset_table = schema.create_table(
        Table.define_asset(
            sname=schema.name,
            tname=asset_name,
            hatrac_template="/hatrac/metadata/{{MD5}}.{{Filename}}",
        )
    )
    schema.create_table(
        Table.define_association(
            [
                (asset_name, asset_table),
                ("Asset_Type", asset_type_table),
            ],
        )
    )

    atable = schema.create_table(
        Table.define_association(
            [
                (asset_name, asset_table),
                ("Execution", execution_table),
            ],
        )
    )
    atable.create_reference(asset_role_table)
    asset_annotation(asset_table)
    return asset_table


def create_workflow_table(schema: Schema, annotations: Optional[dict[str, Any]] = None):
    """Create the workflow table in the specified schema.

    Args:
        schema: The schema where the table should be created.
        annotations: Optional annotation dictionary for the table.

    Returns:
        The created Table object.
    """
    workflow_table = schema.create_table(
        Table.define(
            tname=MLTable.workflow,
            column_defs=[
                Column.define("Name", builtin_types.text),
                Column.define("Description", builtin_types.markdown),
                Column.define("URL", builtin_types.ermrest_uri),
                Column.define("Checksum", builtin_types.text),
                Column.define("Version", builtin_types.text),
            ],
            annotations=annotations,
        )
    )
    workflow_table.create_reference(
        schema.create_table(Table.define_vocabulary(MLVocab.workflow_type, f"{schema.name}:{{RID}}"))
    )
    return workflow_table


def create_ml_schema(
    catalog: ErmrestCatalog,
    schema_name: str = "deriva-ml",
    project_name: Optional[str] = None,
):
    """Create the deriva-ml schema in the given catalog, dropping any existing schema of the same name."""
    project_name = project_name or schema_name

    model = catalog.getCatalogModel()
    if model.schemas.get(schema_name):
        model.schemas[schema_name].drop(cascade=True)

    # Get the annotations for the new schema and its tables.
    annotations = generate_annotation(model, schema_name)

    client_annotation = {
        "tag:misd.isi.edu,2015:display": {"name": "Users"},
        "tag:isrd.isi.edu,2016:table-display": {"row_name": {"row_markdown_pattern": "{{{Full_Name}}}"}},
        "tag:isrd.isi.edu,2016:visible-columns": {"compact": ["Full_Name", "Display_Name", "Email", "ID"]},
    }
    model.schemas["public"].tables["ERMrest_Client"].annotations.update(client_annotation)
    model.apply()

    schema = model.create_schema(Schema.define(schema_name, annotations=annotations["schema_annotation"]))

    # Create the controlled vocabularies, then the workflow and execution tables.
    schema.create_table(Table.define_vocabulary(MLVocab.feature_name, f"{project_name}:{{RID}}"))
    asset_type_table = schema.create_table(Table.define_vocabulary(MLVocab.asset_type, f"{project_name}:{{RID}}"))
    asset_role_table = schema.create_table(Table.define_vocabulary(MLVocab.asset_role, f"{project_name}:{{RID}}"))

    create_workflow_table(schema, annotations["workflow_annotation"])
    execution_table = create_execution_table(schema, annotations["execution_annotation"])
    dataset_table = create_dataset_table(
        schema,
        execution_table,
        project_name,
        annotations["dataset_annotation"],
        annotations["dataset_version_annotation"],
    )

    create_asset_table(
        schema,
        MLTable.execution_metadata,
        execution_table,
        asset_type_table,
        asset_role_table,
    )

    create_asset_table(
        schema,
        MLTable.execution_asset,
        execution_table,
        asset_type_table,
        asset_role_table,
    )

    # File table
    file_table = create_asset_table(
        schema,
        MLTable.file,
        execution_table,
        asset_type_table,
        asset_role_table,
        use_hatrac=False,
    )
    # And make Files be part of a dataset.
    schema.create_table(
        Table.define_association(
            associates=[
                ("Dataset", dataset_table),
                (MLTable.file, file_table),
            ]
        )
    )

    initialize_ml_schema(model, schema_name)


def initialize_ml_schema(model: Model, schema_name: str = "deriva-ml"):
    """Populate the ML schema's controlled vocabularies with their initial terms.

    Args:
        model: The ERMrest model containing the schema.
        schema_name: The name of the schema to initialize. Defaults to "deriva-ml".

    Returns:
        None. Inserts initial terms into the asset type, asset role, and dataset type vocabularies.
    """

    catalog = model.catalog
    asset_type = catalog.getPathBuilder().schemas[schema_name].tables[MLVocab.asset_type]
    asset_type.insert(
        [
            {
                "Name": "Execution_Config",
                "Description": "Configuration File for execution metadata",
            },
            {
                "Name": "Runtime_Env",
                "Description": "Information about the runtime environment",
            },
            {
                "Name": "Execution_Metadata",
                "Description": "Information about the execution environment",
            },
            {
                "Name": "Execution_Asset",
                "Description": "A file generated by an execution",
            },
            {"Name": "File", "Description": "A file that is not managed by Hatrac"},
            {"Name": "Input_File", "Description": "A file input to an execution."},
            {"Name": "Output_File", "Description": "A file output from an execution."},
            {"Name": "Model_File", "Description": "The ML model."},
            {
                "Name": "Notebook_Output",
                "Description": "A Jupyter notebook with output cells filled from an execution.",
            },
        ],
        defaults={"ID", "URI"},
    )

    asset_role = catalog.getPathBuilder().schemas[schema_name].tables[MLVocab.asset_role]
    asset_role.insert(
        [
            {"Name": "Input", "Description": "Asset used for input of an execution."},
            {"Name": "Output", "Description": "Asset used for output of an execution."},
        ],
        defaults={"ID", "URI"},
    )
    dataset_type = catalog.getPathBuilder().schemas[schema_name].tables[MLVocab.dataset_type]
    dataset_type.insert(
        [{"Name": "File", "Description": "A dataset that contains file assets."}],
        defaults={"ID", "URI"},
    )


def create_ml_catalog(hostname: str, project_name: str) -> ErmrestCatalog:
    """Create a new ERMrest catalog on the given host, apply the bundled ACL policy, and install the ML schema."""
    server = DerivaServer("https", hostname, credentials=get_credential(hostname))
    catalog = server.create_ermrest_catalog()
    model = catalog.getCatalogModel()
    model.configure_baseline_catalog()
    policy_file = files("deriva_ml.schema").joinpath("policy.json")
    subprocess.run(
        [
            "deriva-acl-config",
            "--host",
            catalog.deriva_server.server,
            "--config-file",
            policy_file,
            catalog.catalog_id,
        ]
    )
    create_ml_schema(catalog, project_name=project_name)
    return catalog


def reset_ml_schema(catalog: ErmrestCatalog, ml_schema=ML_SCHEMA) -> None:
    """Drop all non-system schemas from the catalog and recreate the ML schema."""
    model = catalog.getCatalogModel()
    schemas = [schema for sname, schema in model.schemas.items() if sname not in ["public", "WWW"]]
    for s in schemas:
        s.drop(cascade=True)
    model = catalog.getCatalogModel()
    create_ml_schema(catalog, ml_schema)


def main():
    """Main entry point for the schema creation CLI.

    Creates the ML schema in an existing catalog based on command line arguments.

    Returns:
        None. Executes the CLI.
    """
    scheme = "https"
    parser = argparse.ArgumentParser(description="Create ML schema and catalog")
    parser.add_argument("hostname", help="Hostname for the catalog")
    parser.add_argument("project_name", help="Project name for the catalog")
    parser.add_argument("catalog_id", help="ID of the ERMrest catalog to install the schema into")
    parser.add_argument("--schema-name", default="deriva-ml", help="Schema name (default: deriva-ml)")
    parser.add_argument("--curie-prefix", type=str, required=True)  # Parsed but not used below.

    args = parser.parse_args()
    credentials = get_credential(args.hostname)
    server = DerivaServer(scheme, args.hostname, credentials)
    catalog = server.connect_ermrest(args.catalog_id)
    create_ml_schema(catalog, args.schema_name, args.project_name)

    print(f"Created ML schema at {args.hostname} with project {args.project_name}")
    print(f"Schema '{args.schema_name}' initialized successfully")


if __name__ == "__main__":
    sys.exit(main())
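
For reference, a minimal usage sketch of the module above. The hostname and project name are placeholders, not values from the package, and this assumes a stored credential for the host (the module looks it up with get_credential()) plus the deriva-acl-config tool on PATH, since create_ml_catalog shells out to it.

    # Sketch only: "ml.example.org" and "my-project" are illustrative placeholders.
    from deriva_ml.schema.create_schema import create_ml_catalog, reset_ml_schema

    # Create a brand-new catalog on the server, apply the bundled ACL policy,
    # and install the deriva-ml schema into it.
    catalog = create_ml_catalog("ml.example.org", project_name="my-project")
    print(f"New catalog id: {catalog.catalog_id}")

    # Later, drop all non-system schemas in that catalog and rebuild the ML schema.
    reset_ml_schema(catalog)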