deriva-ml 1.17.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/.DS_Store +0 -0
- deriva_ml/__init__.py +79 -0
- deriva_ml/bump_version.py +142 -0
- deriva_ml/core/__init__.py +39 -0
- deriva_ml/core/base.py +1527 -0
- deriva_ml/core/config.py +69 -0
- deriva_ml/core/constants.py +36 -0
- deriva_ml/core/definitions.py +74 -0
- deriva_ml/core/enums.py +222 -0
- deriva_ml/core/ermrest.py +288 -0
- deriva_ml/core/exceptions.py +28 -0
- deriva_ml/core/filespec.py +116 -0
- deriva_ml/dataset/__init__.py +12 -0
- deriva_ml/dataset/aux_classes.py +225 -0
- deriva_ml/dataset/dataset.py +1519 -0
- deriva_ml/dataset/dataset_bag.py +450 -0
- deriva_ml/dataset/history.py +109 -0
- deriva_ml/dataset/upload.py +439 -0
- deriva_ml/demo_catalog.py +495 -0
- deriva_ml/execution/__init__.py +26 -0
- deriva_ml/execution/environment.py +290 -0
- deriva_ml/execution/execution.py +1180 -0
- deriva_ml/execution/execution_configuration.py +147 -0
- deriva_ml/execution/workflow.py +413 -0
- deriva_ml/feature.py +228 -0
- deriva_ml/install_kernel.py +71 -0
- deriva_ml/model/__init__.py +0 -0
- deriva_ml/model/catalog.py +485 -0
- deriva_ml/model/database.py +719 -0
- deriva_ml/protocols/dataset.py +19 -0
- deriva_ml/run_notebook.py +228 -0
- deriva_ml/schema/__init__.py +3 -0
- deriva_ml/schema/annotations.py +473 -0
- deriva_ml/schema/check_schema.py +104 -0
- deriva_ml/schema/create_schema.py +393 -0
- deriva_ml/schema/deriva-ml-reference.json +8525 -0
- deriva_ml/schema/policy.json +81 -0
- deriva_ml/schema/table_comments_utils.py +57 -0
- deriva_ml/test.py +94 -0
- deriva_ml-1.17.10.dist-info/METADATA +38 -0
- deriva_ml-1.17.10.dist-info/RECORD +45 -0
- deriva_ml-1.17.10.dist-info/WHEEL +5 -0
- deriva_ml-1.17.10.dist-info/entry_points.txt +9 -0
- deriva_ml-1.17.10.dist-info/licenses/LICENSE +201 -0
- deriva_ml-1.17.10.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,393 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import subprocess
|
|
3
|
+
import sys
|
|
4
|
+
from importlib.resources import files
|
|
5
|
+
from typing import Any, Optional
|
|
6
|
+
|
|
7
|
+
from deriva.core import DerivaServer, ErmrestCatalog, get_credential
|
|
8
|
+
from deriva.core.ermrest_model import (
|
|
9
|
+
Column,
|
|
10
|
+
ForeignKey,
|
|
11
|
+
Key,
|
|
12
|
+
Model,
|
|
13
|
+
Schema,
|
|
14
|
+
Table,
|
|
15
|
+
builtin_types,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
from deriva_ml.core.definitions import ML_SCHEMA, MLTable, MLVocab
|
|
19
|
+
from deriva_ml.schema.annotations import asset_annotation, generate_annotation
|
|
20
|
+
|
|
21
|
+
try:
|
|
22
|
+
from icecream import ic
|
|
23
|
+
except ImportError: # Graceful fallback if IceCream isn't installed.
|
|
24
|
+
ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a) # noqa
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def create_dataset_table(
    schema: Schema,
    execution_table: Table,
    project_name: str,
    dataset_annotation: Optional[dict] = None,
    version_annotation: Optional[dict] = None,
) -> Table:
    """Create the dataset table and the tables that hang off it.

    In order, this creates: the dataset table itself, the dataset-type
    vocabulary, the dataset/dataset-type association, the dataset version
    table (plus a ``Version`` reference from the dataset table to it), the
    nested-dataset association and the dataset/execution association.

    Args:
        schema: Schema in which every table is created.
        execution_table: Existing execution table to associate datasets with.
        project_name: Prefix used in the vocabulary CURIE template
            (``"<project_name>:{RID}"``).
        dataset_annotation: Optional annotations for the dataset table;
            ``None`` is replaced by an empty dict.
        version_annotation: Optional annotations forwarded unchanged to
            ``define_table_dataset_version``.

    Returns:
        The newly created dataset table.
    """
    if dataset_annotation is None:
        dataset_annotation = {}

    dataset_columns = [
        Column.define("Description", builtin_types.markdown),
        Column.define("Deleted", builtin_types.boolean),
    ]
    dataset_def = Table.define(
        tname=MLTable.dataset,
        column_defs=dataset_columns,
        annotations=dataset_annotation,
    )
    dataset_table = schema.create_table(dataset_def)

    # Controlled vocabulary of dataset types, linked many-to-many to datasets.
    type_vocab_def = Table.define_vocabulary(MLVocab.dataset_type, f"{project_name}:{{RID}}")
    dataset_type = schema.create_table(type_vocab_def)
    type_link_def = Table.define_association(
        associates=[("Dataset", dataset_table), (MLVocab.dataset_type, dataset_type)]
    )
    schema.create_table(type_link_def)

    # Version history table, referenced from the dataset table's "Version" column.
    version_def = define_table_dataset_version(schema.name, version_annotation)
    dataset_version = schema.create_table(version_def)
    dataset_table.create_reference(("Version", True, dataset_version))

    # Remaining associations: nested datasets first, then executions.
    for partner in (("Nested_Dataset", dataset_table), ("Execution", execution_table)):
        link_def = Table.define_association(associates=[("Dataset", dataset_table), partner])
        schema.create_table(link_def)

    return dataset_table
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def define_table_dataset_version(sname: str, annotation: Optional[dict] = None):
    """Build the definition of the dataset version table.

    This function only produces a definition via ``Table.define``; nothing is
    created in the catalog until the caller hands the result to
    ``Schema.create_table``.

    Args:
        sname: Name of the schema holding the ``Dataset`` and ``Execution``
            tables that the foreign keys point at.
        annotation: Optional annotation dictionary for the table. ``None`` is
            treated as "no annotations".

    Returns:
        The table definition produced by ``Table.define``.
    """
    # Normalize None to an empty dict so this builder behaves like
    # create_dataset_table / create_execution_table, which never pass None
    # as the annotations of a table definition.
    table_annotations = annotation if annotation is not None else {}

    return Table.define(
        tname=MLTable.dataset_version,
        column_defs=[
            Column.define(
                "Version",
                builtin_types.text,
                default="0.1.0",
                comment="Semantic version of dataset",
            ),
            Column.define("Description", builtin_types.markdown),
            Column.define("Dataset", builtin_types.text, comment="RID of dataset"),
            Column.define("Execution", builtin_types.text, comment="RID of execution"),
            Column.define("Minid", builtin_types.text, comment="URL to MINID for dataset"),
            Column.define(
                "Snapshot",
                builtin_types.text,
                comment="Catalog Snapshot ID for dataset",
            ),
        ],
        annotations=table_annotations,
        # A (Dataset, Version) pair is declared as a key of the table.
        key_defs=[Key.define(["Dataset", "Version"])],
        fkey_defs=[
            ForeignKey.define(["Dataset"], sname, "Dataset", ["RID"]),
            ForeignKey.define(["Execution"], sname, "Execution", ["RID"]),
        ],
    )
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def create_execution_table(schema, annotation: Optional[dict] = None):
    """Create the execution table in ``schema``.

    The table gets ``Workflow``, ``Description``, ``Duration``, ``Status`` and
    ``Status_Detail`` columns, and a foreign key from ``Workflow`` to the
    ``RID`` column of the ``Workflow`` table in the same schema.

    Args:
        schema: The schema where the table should be created.
        annotation: Optional annotation dictionary for the table; ``None`` is
            replaced by an empty dict.

    Returns:
        The created Table object.
    """
    if annotation is None:
        annotation = {}

    execution_columns = [
        Column.define("Workflow", builtin_types.text),
        Column.define("Description", builtin_types.markdown),
        Column.define("Duration", builtin_types.text),
        Column.define("Status", builtin_types.text),
        Column.define("Status_Detail", builtin_types.text),
    ]
    workflow_fkey = ForeignKey.define(["Workflow"], schema.name, "Workflow", ["RID"])

    execution_def = Table.define(
        MLTable.execution,
        column_defs=execution_columns,
        fkey_defs=[workflow_fkey],
        annotations=annotation,
    )
    return schema.create_table(execution_def)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def create_asset_table(
    schema,
    asset_name: str,
    execution_table,
    asset_type_table,
    asset_role_table,
    use_hatrac: bool = True,
):
    """Create an asset table together with its type and execution links.

    Args:
        schema: Schema in which the tables are created.
        asset_name: Name of the asset table; also used as the asset side's
            name in both association tables.
        execution_table: Execution table to associate the asset with.
        asset_type_table: Asset-type vocabulary table to associate with.
        asset_role_table: Asset-role table referenced from the
            asset/execution association.
        use_hatrac: NOTE(review): accepted but never read by this function;
            the hatrac template is always supplied. Confirm intended
            behaviour for callers that pass ``False``.

    Returns:
        The created asset table.
    """
    asset_def = Table.define_asset(
        sname=schema.name,
        tname=asset_name,
        hatrac_template="/hatrac/metadata/{{MD5}}.{{Filename}}",
    )
    asset_table = schema.create_table(asset_def)

    # Association between the asset and the asset-type vocabulary.
    type_link_def = Table.define_association(
        [
            (asset_name, asset_table),
            ("Asset_Type", asset_type_table),
        ],
    )
    schema.create_table(type_link_def)

    # Association between the asset and executions; the association row also
    # references the asset-role table.
    execution_link_def = Table.define_association(
        [
            (asset_name, asset_table),
            ("Execution", execution_table),
        ],
    )
    execution_link = schema.create_table(execution_link_def)
    execution_link.create_reference(asset_role_table)

    asset_annotation(asset_table)
    return asset_table
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def create_workflow_table(schema: Schema, annotations: Optional[dict[str, Any]] = None):
    """Create the workflow table and its workflow-type vocabulary.

    The workflow table is created first, then the workflow-type vocabulary
    table, and finally a reference from the workflow table to the vocabulary.

    Args:
        schema: The schema where the tables should be created.
        annotations: Optional annotation dictionary for the workflow table.
            ``None`` is treated as "no annotations".

    Returns:
        The created workflow Table object.
    """
    # Normalize None to an empty dict, matching create_dataset_table and
    # create_execution_table, so a table definition never carries
    # ``annotations=None``.
    table_annotations = annotations if annotations is not None else {}

    workflow_table = schema.create_table(
        Table.define(
            tname=MLTable.workflow,
            column_defs=[
                Column.define("Name", builtin_types.text),
                Column.define("Description", builtin_types.markdown),
                Column.define("URL", builtin_types.ermrest_uri),
                Column.define("Checksum", builtin_types.text),
                Column.define("Version", builtin_types.text),
            ],
            annotations=table_annotations,
        )
    )

    # NOTE(review): this vocabulary's CURIE template is prefixed with the
    # schema name, whereas the other vocabularies in this module use the
    # project name -- confirm that the difference is intentional.
    workflow_type_table = schema.create_table(
        Table.define_vocabulary(MLVocab.workflow_type, f"{schema.name}:{{RID}}")
    )
    workflow_table.create_reference(workflow_type_table)
    return workflow_table
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def create_ml_schema(
    catalog: ErmrestCatalog,
    schema_name: str = "deriva-ml",
    project_name: Optional[str] = None,
):
    """Create the ML schema, its tables and its initial vocabulary terms.

    If a schema called ``schema_name`` already exists in the catalog it is
    dropped (with ``cascade=True``) before being re-created.

    Args:
        catalog: Catalog in which the schema is created.
        schema_name: Name of the schema to create.
        project_name: Prefix for the vocabulary CURIE templates; falls back
            to ``schema_name`` when empty or ``None``.
    """
    if not project_name:
        project_name = schema_name

    model = catalog.getCatalogModel()
    existing_schema = model.schemas.get(schema_name)
    if existing_schema:
        existing_schema.drop(cascade=True)

    # Annotations for the schema and its tables.
    annotations = generate_annotation(model, schema_name)

    # Present the built-in client table as "Users" and apply the change.
    client_table = model.schemas["public"].tables["ERMrest_Client"]
    client_table.annotations.update(
        {
            "tag:misd.isi.edu,2015:display": {"name": "Users"},
            "tag:isrd.isi.edu,2016:table-display": {"row_name": {"row_markdown_pattern": "{{{Full_Name}}}"}},
            "tag:isrd.isi.edu,2016:visible-columns": {"compact": ["Full_Name", "Display_Name", "Email", "ID"]},
        }
    )
    model.apply()

    schema_def = Schema.define(schema_name, annotations=annotations["schema_annotation"])
    schema = model.create_schema(schema_def)

    # Vocabulary tables that the remaining tables depend on.
    schema.create_table(Table.define_vocabulary(MLVocab.feature_name, f"{project_name}:{{RID}}"))
    asset_type_table = schema.create_table(Table.define_vocabulary(MLVocab.asset_type, f"{project_name}:{{RID}}"))
    asset_role_table = schema.create_table(Table.define_vocabulary(MLVocab.asset_role, f"{project_name}:{{RID}}"))

    # Core tables: workflow, then execution, then dataset.
    create_workflow_table(schema, annotations["workflow_annotation"])
    execution_table = create_execution_table(schema, annotations["execution_annotation"])
    dataset_table = create_dataset_table(
        schema,
        execution_table,
        project_name,
        annotations["dataset_annotation"],
        annotations["dataset_version_annotation"],
    )

    # Asset tables attached to executions.
    for asset_name in (MLTable.execution_metadata, MLTable.execution_asset):
        create_asset_table(
            schema,
            asset_name,
            execution_table,
            asset_type_table,
            asset_role_table,
        )

    # File table, and its membership in datasets.
    file_table = create_asset_table(
        schema,
        MLTable.file,
        execution_table,
        asset_type_table,
        asset_role_table,
        use_hatrac=False,
    )
    file_link_def = Table.define_association(
        associates=[
            ("Dataset", dataset_table),
            (MLTable.file, file_table),
        ]
    )
    schema.create_table(file_link_def)

    initialize_ml_schema(model, schema_name)
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def initialize_ml_schema(model: Model, schema_name: str = "deriva-ml"):
    """Insert the built-in vocabulary terms into an existing ML schema.

    No tables are created here: the asset-type, asset-role and dataset-type
    vocabulary tables must already exist in ``schema_name``. Terms are
    inserted in that order.

    Args:
        model: The ERMrest model whose catalog holds the schema.
        schema_name: Name of the schema containing the vocabulary tables.
            Defaults to "deriva-ml".

    Returns:
        None. Rows are inserted into the catalog.
    """
    # Build the path builder once and reuse it for all three tables instead
    # of constructing a new one for every lookup.
    vocab_tables = model.catalog.getPathBuilder().schemas[schema_name].tables

    # Every insert passes defaults={"ID", "URI"}, so only "Name" and
    # "Description" are supplied for each term.
    asset_type = vocab_tables[MLVocab.asset_type]
    asset_type.insert(
        [
            {
                "Name": "Execution_Config",
                "Description": "Configuration File for execution metadata",
            },
            {
                "Name": "Runtime_Env",
                "Description": "Information about the runtime environment",
            },
            {
                "Name": "Execution_Metadata",
                "Description": "Information about the execution environment",
            },
            {
                "Name": "Execution_Asset",
                "Description": "A file generated by an execution",
            },
            {"Name": "File", "Description": "A file that is not managed by Hatrac"},
            {"Name": "Input_File", "Description": "A file input to an execution."},
            {"Name": "Output_File", "Description": "A file output from an execution."},
            {"Name": "Model_File", "Description": "The ML model."},
            {
                "Name": "Notebook_Output",
                "Description": "A Jupyter notebook with output cells filled from an execution.",
            },
        ],
        defaults={"ID", "URI"},
    )

    asset_role = vocab_tables[MLVocab.asset_role]
    asset_role.insert(
        [
            {"Name": "Input", "Description": "Asset used for input of an execution."},
            {"Name": "Output", "Description": "Asset used for output of an execution."},
        ],
        defaults={"ID", "URI"},
    )

    dataset_type = vocab_tables[MLVocab.dataset_type]
    dataset_type.insert(
        [{"Name": "File", "Description": "A dataset that contains file assets."}],
        defaults={"ID", "URI"},
    )
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def create_ml_catalog(hostname: str, project_name: str) -> ErmrestCatalog:
    """Create a new catalog on ``hostname`` and install the ML schema in it.

    The catalog is created, given its baseline configuration, configured with
    the packaged ``policy.json`` through the ``deriva-acl-config`` command,
    and then populated by ``create_ml_schema``.

    Args:
        hostname: Host on which the catalog is created (reached over https).
        project_name: Project name forwarded to ``create_ml_schema``.

    Returns:
        The newly created catalog.
    """
    import logging
    from importlib.resources import as_file

    server = DerivaServer("https", hostname, credentials=get_credential(hostname))
    catalog = server.create_ermrest_catalog()
    model = catalog.getCatalogModel()
    model.configure_baseline_catalog()

    policy_resource = files("deriva_ml.schema").joinpath("policy.json")
    # A packaged resource is not guaranteed to be a real file on disk, and
    # every argv entry must be a string or path; as_file() yields a concrete
    # filesystem path that stays valid for the duration of the command.
    with as_file(policy_resource) as policy_path:
        completed = subprocess.run(
            [
                "deriva-acl-config",
                "--host",
                str(catalog.deriva_server.server),
                "--config-file",
                str(policy_path),
                str(catalog.catalog_id),
            ],
            check=False,
        )

    # Report a failed ACL configuration instead of ignoring it silently;
    # schema creation still proceeds, as it did before.
    if completed.returncode != 0:
        logging.getLogger(__name__).warning(
            "deriva-acl-config exited with status %s for catalog %s on %s",
            completed.returncode,
            catalog.catalog_id,
            hostname,
        )

    create_ml_schema(catalog, project_name=project_name)
    return catalog
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
def reset_ml_schema(catalog: ErmrestCatalog, ml_schema=ML_SCHEMA) -> None:
    """Drop every non-system schema in the catalog and re-create the ML schema.

    All schemas other than ``public`` and ``WWW`` are dropped with
    ``cascade=True`` -- not only the ML schema -- before ``create_ml_schema``
    is called.

    Args:
        catalog: Catalog to reset.
        ml_schema: Name of the ML schema to re-create.
    """
    model = catalog.getCatalogModel()
    # Collect the schemas first so the mapping is not modified while it is
    # being iterated.
    doomed = [schema for sname, schema in model.schemas.items() if sname not in ("public", "WWW")]
    for schema in doomed:
        schema.drop(cascade=True)
    # No model re-fetch is needed here: the previous one was never used, and
    # create_ml_schema obtains its own model from the catalog.
    create_ml_schema(catalog, ml_schema)
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
def main(argv=None):
    """Main entry point for the schema creation CLI.

    Connects to an existing catalog and creates the ML schema in it.

    Usage::

        create_schema HOSTNAME PROJECT_NAME CATALOG_ID
                      [--schema-name NAME] [--curie-prefix PREFIX]

    Args:
        argv: Optional list of command-line arguments. When ``None`` the
            arguments are taken from ``sys.argv[1:]`` by argparse.

    Returns:
        None. Executes the CLI.
    """
    scheme = "https"
    parser = argparse.ArgumentParser(description="Create ML schema and catalog")
    parser.add_argument("hostname", help="Hostname for the catalog")
    parser.add_argument("project_name", help="Project name for the catalog")
    # The catalog to connect to must be declared; it was read from the parsed
    # arguments without ever being defined.
    parser.add_argument("catalog_id", help="ID of the existing catalog to add the schema to")
    # Optional values are real options: argparse rejects ``required`` on
    # positionals, and a positional named "schema-name" cannot be read back
    # as ``args.schema_name``.
    parser.add_argument(
        "--schema-name",
        "--schema_name",
        dest="schema_name",
        default="deriva-ml",
        help="Schema name (default: deriva-ml)",
    )
    parser.add_argument(
        "--curie-prefix",
        "--curie_prefix",
        dest="curie_prefix",
        type=str,
        default=None,
        help="Prefix for vocabulary CURIEs (default: the project name)",
    )

    args = parser.parse_args(argv)
    credentials = get_credential(args.hostname)
    server = DerivaServer(scheme, args.hostname, credentials)
    # create_ml_schema expects the catalog itself (it fetches the model on
    # its own), not the catalog's model.
    catalog = server.connect_ermrest(args.catalog_id)
    create_ml_schema(
        catalog,
        args.schema_name,
        project_name=args.curie_prefix or args.project_name,
    )

    print(f"Created ML catalog at {args.hostname} with project {args.project_name}")
    print(f"Schema '{args.schema_name}' initialized successfully")
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
# Script entry point: run the CLI and exit with main()'s return value.
if __name__ == "__main__":
    raise SystemExit(main())
|