deriva-ml 1.10.1__py3-none-any.whl → 1.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/dataset.py +1 -1
- deriva_ml/dataset_bag.py +10 -3
- deriva_ml/demo_catalog.py +84 -78
- deriva_ml/deriva_definitions.py +2 -2
- deriva_ml/deriva_ml_base.py +85 -121
- deriva_ml/deriva_model.py +25 -0
- deriva_ml/execution.py +386 -309
- deriva_ml/feature.py +1 -2
- deriva_ml/schema_setup/create_schema.py +223 -183
- deriva_ml/upload.py +95 -232
- {deriva_ml-1.10.1.dist-info → deriva_ml-1.11.0.dist-info}/METADATA +2 -1
- deriva_ml-1.11.0.dist-info/RECORD +27 -0
- deriva_ml-1.10.1.dist-info/RECORD +0 -27
- {deriva_ml-1.10.1.dist-info → deriva_ml-1.11.0.dist-info}/WHEEL +0 -0
- {deriva_ml-1.10.1.dist-info → deriva_ml-1.11.0.dist-info}/entry_points.txt +0 -0
- {deriva_ml-1.10.1.dist-info → deriva_ml-1.11.0.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.10.1.dist-info → deriva_ml-1.11.0.dist-info}/top_level.txt +0 -0
deriva_ml/feature.py
CHANGED
|
@@ -25,12 +25,11 @@ class FeatureRecord(BaseModel):
|
|
|
25
25
|
"""
|
|
26
26
|
|
|
27
27
|
# model_dump of this feature should be compatible with feature table columns.
|
|
28
|
-
Execution: str
|
|
28
|
+
Execution: Optional[str] = None
|
|
29
29
|
Feature_Name: str
|
|
30
30
|
feature: ClassVar[Optional["Feature"]] = None
|
|
31
31
|
|
|
32
32
|
class Config:
|
|
33
|
-
|
|
34
33
|
arbitrary_types_allowed = True
|
|
35
34
|
|
|
36
35
|
@classmethod
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import argparse
|
|
2
2
|
import sys
|
|
3
|
-
from typing import Optional
|
|
3
|
+
from typing import Optional, Any
|
|
4
4
|
|
|
5
5
|
from deriva.core import DerivaServer, get_credential
|
|
6
6
|
from deriva.core.ermrest_model import Model
|
|
@@ -19,28 +19,48 @@ from deriva_ml.schema_setup.annotations import generate_annotation
|
|
|
19
19
|
from deriva_ml.deriva_model import DerivaModel
|
|
20
20
|
|
|
21
21
|
|
|
22
|
-
def
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
22
|
+
def create_dataset_table(
|
|
23
|
+
schema: Schema,
|
|
24
|
+
execution_table: Table,
|
|
25
|
+
project_name: str,
|
|
26
|
+
dataset_annotation: Optional[dict] = None,
|
|
27
|
+
):
|
|
28
|
+
dataset_table = schema.create_table(
|
|
29
|
+
Table.define(
|
|
30
|
+
tname="Dataset",
|
|
31
|
+
column_defs=[
|
|
32
|
+
Column.define("Description", builtin_types.markdown),
|
|
33
|
+
Column.define("Deleted", builtin_types.boolean),
|
|
34
|
+
],
|
|
35
|
+
annotations=dataset_annotation if dataset_annotation is not None else {},
|
|
36
|
+
)
|
|
33
37
|
)
|
|
34
38
|
|
|
39
|
+
dataset_type = schema.create_table(
|
|
40
|
+
Table.define_vocabulary(MLVocab.dataset_type, f"{project_name}:{{RID}}")
|
|
41
|
+
)
|
|
42
|
+
schema.create_table(
|
|
43
|
+
Table.define_association(
|
|
44
|
+
associates=[
|
|
45
|
+
("Dataset", dataset_table),
|
|
46
|
+
(MLVocab.dataset_type, dataset_type),
|
|
47
|
+
]
|
|
48
|
+
)
|
|
49
|
+
)
|
|
35
50
|
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
51
|
+
dataset_version = schema.create_table(define_table_dataset_version(schema.name))
|
|
52
|
+
dataset_table.create_reference(("Version", True, dataset_version))
|
|
53
|
+
|
|
54
|
+
# Nested datasets.
|
|
55
|
+
schema.create_table(
|
|
56
|
+
Table.define_association(
|
|
57
|
+
associates=[("Dataset", dataset_table), ("Nested_Dataset", dataset_table)]
|
|
58
|
+
)
|
|
59
|
+
)
|
|
60
|
+
schema.create_table(
|
|
61
|
+
Table.define_association(
|
|
62
|
+
associates=[("Dataset", dataset_table), ("Execution", execution_table)]
|
|
63
|
+
)
|
|
44
64
|
)
|
|
45
65
|
|
|
46
66
|
|
|
@@ -66,92 +86,120 @@ def define_table_dataset_version(sname: str):
|
|
|
66
86
|
)
|
|
67
87
|
|
|
68
88
|
|
|
69
|
-
def
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
89
|
+
def create_execution_table(schema, annotation: Optional[dict] = None):
|
|
90
|
+
annotation = annotation if annotation is not None else {}
|
|
91
|
+
execution = schema.create_table(
|
|
92
|
+
Table.define(
|
|
93
|
+
"Execution",
|
|
94
|
+
column_defs=[
|
|
95
|
+
Column.define("Workflow", builtin_types.text),
|
|
96
|
+
Column.define("Description", builtin_types.markdown),
|
|
97
|
+
Column.define("Duration", builtin_types.text),
|
|
98
|
+
Column.define("Status", builtin_types.text),
|
|
99
|
+
Column.define("Status_Detail", builtin_types.text),
|
|
100
|
+
],
|
|
101
|
+
fkey_defs=[
|
|
102
|
+
ForeignKey.define(["Workflow"], schema.name, "Workflow", ["RID"])
|
|
103
|
+
],
|
|
104
|
+
annotations=annotation,
|
|
105
|
+
)
|
|
81
106
|
)
|
|
82
|
-
return
|
|
107
|
+
return execution
|
|
83
108
|
|
|
84
109
|
|
|
85
|
-
def
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
110
|
+
def create_asset_table(
|
|
111
|
+
schema,
|
|
112
|
+
asset_name: str,
|
|
113
|
+
execution_table,
|
|
114
|
+
asset_type_table,
|
|
115
|
+
asset_role_table,
|
|
116
|
+
annotation: Optional[dict] = None,
|
|
117
|
+
):
|
|
118
|
+
annotation = annotation if annotation is not None else {}
|
|
119
|
+
asset_table = schema.create_table(
|
|
120
|
+
Table.define_asset(
|
|
121
|
+
sname=schema.name,
|
|
122
|
+
tname=asset_name,
|
|
123
|
+
hatrac_template="/hatrac/metadata/{{MD5}}.{{Filename}}",
|
|
124
|
+
annotations=annotation,
|
|
125
|
+
)
|
|
126
|
+
)
|
|
127
|
+
atable = schema.create_table(
|
|
128
|
+
Table.define_association(
|
|
129
|
+
[
|
|
130
|
+
(asset_name, asset_table),
|
|
131
|
+
("Asset_Type", asset_type_table),
|
|
132
|
+
],
|
|
133
|
+
)
|
|
91
134
|
)
|
|
92
135
|
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
136
|
+
atable = schema.create_table(
|
|
137
|
+
Table.define_association(
|
|
138
|
+
[
|
|
139
|
+
(asset_name, asset_table),
|
|
140
|
+
("Execution", execution_table),
|
|
141
|
+
],
|
|
142
|
+
)
|
|
100
143
|
)
|
|
101
|
-
|
|
144
|
+
atable.create_reference(asset_role_table)
|
|
145
|
+
return asset_table
|
|
102
146
|
|
|
103
147
|
|
|
104
|
-
def
|
|
148
|
+
def create_file_table(
|
|
149
|
+
schema: Schema,
|
|
150
|
+
execution_table: Table,
|
|
151
|
+
project_name: str,
|
|
152
|
+
annotation: Optional[dict] = None,
|
|
153
|
+
):
|
|
105
154
|
"""Define files table structure"""
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
tname="File"
|
|
155
|
+
annotation = annotation or {}
|
|
156
|
+
file_table = schema.create_table(
|
|
157
|
+
Table.define_asset(sname=schema.name, tname="File")
|
|
109
158
|
)
|
|
110
159
|
|
|
160
|
+
file_type = schema.create_table(
|
|
161
|
+
Table.define_vocabulary(MLVocab.file_type, f"{project_name}:{{RID}}")
|
|
162
|
+
)
|
|
111
163
|
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
if model.schemas.get("www"):
|
|
119
|
-
model.schemas["www"].drop(cascade=True)
|
|
120
|
-
www_schema = model.create_schema(
|
|
121
|
-
Schema.define(
|
|
122
|
-
"www", comment="Schema for tables that will be displayed as web content"
|
|
164
|
+
schema.create_table(
|
|
165
|
+
Table.define_association(
|
|
166
|
+
associates=[
|
|
167
|
+
("File", file_table),
|
|
168
|
+
(MLVocab.file_type, file_type),
|
|
169
|
+
]
|
|
123
170
|
)
|
|
124
171
|
)
|
|
125
|
-
|
|
172
|
+
schema.create_table(
|
|
173
|
+
Table.define_association(
|
|
174
|
+
[
|
|
175
|
+
("File", file_table),
|
|
176
|
+
("Execution", execution_table),
|
|
177
|
+
]
|
|
178
|
+
)
|
|
179
|
+
)
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def create_workflow_table(schema: Schema, annotations: Optional[dict[str, Any]] = None):
|
|
183
|
+
annotations = annotations or {}
|
|
184
|
+
workflow_table = schema.create_table(
|
|
126
185
|
Table.define(
|
|
127
|
-
"
|
|
186
|
+
"Workflow",
|
|
128
187
|
column_defs=[
|
|
129
|
-
Column.define(
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
),
|
|
135
|
-
Column.define(
|
|
136
|
-
"Content",
|
|
137
|
-
builtin_types.markdown,
|
|
138
|
-
comment="Content of the page in markdown",
|
|
139
|
-
),
|
|
188
|
+
Column.define("Name", builtin_types.text),
|
|
189
|
+
Column.define("Description", builtin_types.markdown),
|
|
190
|
+
Column.define("URL", builtin_types.ermrest_uri),
|
|
191
|
+
Column.define("Checksum", builtin_types.text),
|
|
192
|
+
Column.define("Version", builtin_types.text),
|
|
140
193
|
],
|
|
141
|
-
|
|
142
|
-
annotations={
|
|
143
|
-
chaise_tags.table_display: {
|
|
144
|
-
"detailed": {
|
|
145
|
-
"hide_column_headers": True,
|
|
146
|
-
"collapse_toc_panel": True,
|
|
147
|
-
}
|
|
148
|
-
},
|
|
149
|
-
chaise_tags.visible_foreign_keys: {"detailed": {}},
|
|
150
|
-
chaise_tags.visible_columns: {"detailed": ["Content"]},
|
|
151
|
-
},
|
|
194
|
+
annotations=annotations,
|
|
152
195
|
)
|
|
153
196
|
)
|
|
154
|
-
|
|
197
|
+
workflow_table.create_reference(
|
|
198
|
+
schema.create_table(
|
|
199
|
+
Table.define_vocabulary(MLVocab.workflow_type, f"{schema.name}:{{RID}}")
|
|
200
|
+
)
|
|
201
|
+
)
|
|
202
|
+
return workflow_table
|
|
155
203
|
|
|
156
204
|
|
|
157
205
|
def create_ml_schema(
|
|
@@ -182,124 +230,100 @@ def create_ml_schema(
|
|
|
182
230
|
Schema.define(schema_name, annotations=annotations["schema_annotation"])
|
|
183
231
|
)
|
|
184
232
|
project_name = project_name or schema_name
|
|
185
|
-
# Workflow
|
|
186
|
-
schema.create_table(
|
|
187
|
-
Table.define_vocabulary("Feature_Name", f"{project_name}:{{RID}}")
|
|
188
|
-
)
|
|
189
|
-
|
|
190
|
-
workflow_table = schema.create_table(
|
|
191
|
-
define_table_workflow(annotations["workflow_annotation"])
|
|
192
|
-
)
|
|
193
|
-
workflow_table.create_reference(
|
|
194
|
-
schema.create_table(
|
|
195
|
-
Table.define_vocabulary(MLVocab.workflow_type, f"{schema_name}:{{RID}}")
|
|
196
|
-
)
|
|
197
|
-
)
|
|
198
233
|
|
|
199
|
-
|
|
200
|
-
define_table_execution(schema_name, annotations["execution_annotation"])
|
|
201
|
-
)
|
|
234
|
+
# Create workflow and execution table.
|
|
202
235
|
|
|
203
|
-
dataset_table = schema.create_table(
|
|
204
|
-
define_table_dataset(annotations["dataset_annotation"])
|
|
205
|
-
)
|
|
206
|
-
dataset_type = schema.create_table(
|
|
207
|
-
Table.define_vocabulary(MLVocab.dataset_type, f"{project_name}:{{RID}}")
|
|
208
|
-
)
|
|
209
236
|
schema.create_table(
|
|
210
|
-
Table.
|
|
211
|
-
associates=[
|
|
212
|
-
("Dataset", dataset_table),
|
|
213
|
-
(MLVocab.dataset_type, dataset_type),
|
|
214
|
-
]
|
|
215
|
-
)
|
|
237
|
+
Table.define_vocabulary("Feature_Name", f"{project_name}:{{RID}}")
|
|
216
238
|
)
|
|
217
|
-
schema.create_table(
|
|
218
|
-
Table.
|
|
219
|
-
associates=[("Dataset", dataset_table), ("Execution", execution_table)]
|
|
220
|
-
)
|
|
239
|
+
asset_type_table = schema.create_table(
|
|
240
|
+
Table.define_vocabulary("Asset_Type", f"{project_name}:{{RID}}")
|
|
221
241
|
)
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
dataset_table.create_reference(("Version", True, dataset_version))
|
|
225
|
-
|
|
226
|
-
# Nested datasets.
|
|
227
|
-
schema.create_table(
|
|
228
|
-
Table.define_association(
|
|
229
|
-
associates=[("Dataset", dataset_table), ("Nested_Dataset", dataset_table)]
|
|
230
|
-
)
|
|
242
|
+
asset_role_table = schema.create_table(
|
|
243
|
+
Table.define_vocabulary("Asset_Role", f"{project_name}:{{RID}}")
|
|
231
244
|
)
|
|
232
245
|
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
schema.name, annotations["execution_metadata_annotation"]
|
|
237
|
-
)
|
|
246
|
+
create_workflow_table(schema, annotations["workflow_annotation"])
|
|
247
|
+
execution_table = create_execution_table(
|
|
248
|
+
schema, annotations["execution_annotation"]
|
|
238
249
|
)
|
|
239
|
-
|
|
240
|
-
schema
|
|
241
|
-
Table.define_vocabulary(
|
|
242
|
-
"Execution_Metadata_Type", f"{project_name}:{{RID}}"
|
|
243
|
-
)
|
|
244
|
-
)
|
|
245
|
-
)
|
|
246
|
-
schema.create_table(
|
|
247
|
-
Table.define_association(
|
|
248
|
-
[
|
|
249
|
-
("Execution_Metadata", execution_metadata_table),
|
|
250
|
-
("Execution", execution_table),
|
|
251
|
-
]
|
|
252
|
-
)
|
|
250
|
+
create_dataset_table(
|
|
251
|
+
schema, execution_table, project_name, annotations["dataset_annotation"]
|
|
253
252
|
)
|
|
254
253
|
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
schema.create_table(
|
|
263
|
-
Table.define_vocabulary("Execution_Asset_Type", f"{project_name}:{{RID}}")
|
|
264
|
-
)
|
|
254
|
+
create_asset_table(
|
|
255
|
+
schema,
|
|
256
|
+
"Execution_Metadata",
|
|
257
|
+
execution_table,
|
|
258
|
+
asset_type_table,
|
|
259
|
+
asset_role_table,
|
|
260
|
+
annotations["execution_metadata_annotation"],
|
|
265
261
|
)
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
262
|
+
create_asset_table(
|
|
263
|
+
schema,
|
|
264
|
+
"Execution_Asset",
|
|
265
|
+
execution_table,
|
|
266
|
+
asset_type_table,
|
|
267
|
+
asset_role_table,
|
|
268
|
+
annotations["execution_asset_annotation"],
|
|
270
269
|
)
|
|
271
270
|
|
|
272
271
|
# File table
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
)
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
272
|
+
create_file_table(schema, execution_table, project_name)
|
|
273
|
+
|
|
274
|
+
create_www_schema(model)
|
|
275
|
+
initialize_ml_schema(model, schema_name)
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def create_www_schema(model: Model):
|
|
279
|
+
"""
|
|
280
|
+
Set up a new schema and tables to hold web-page like content. The tables include a page table, and an asset
|
|
281
|
+
table that can have images that are referred to by the web page. Pages are written using markdown.
|
|
282
|
+
:return:
|
|
283
|
+
"""
|
|
284
|
+
if model.schemas.get("www"):
|
|
285
|
+
model.schemas["www"].drop(cascade=True)
|
|
286
|
+
www_schema = model.create_schema(
|
|
287
|
+
Schema.define(
|
|
288
|
+
"www", comment="Schema for tables that will be displayed as web content"
|
|
283
289
|
)
|
|
284
290
|
)
|
|
285
|
-
|
|
286
|
-
Table.
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
(
|
|
290
|
-
|
|
291
|
+
www_schema.create_table(
|
|
292
|
+
Table.define(
|
|
293
|
+
"Page",
|
|
294
|
+
column_defs=[
|
|
295
|
+
Column.define(
|
|
296
|
+
"Title",
|
|
297
|
+
builtin_types.text,
|
|
298
|
+
nullok=False,
|
|
299
|
+
comment="Unique title for the page",
|
|
300
|
+
),
|
|
301
|
+
Column.define(
|
|
302
|
+
"Content",
|
|
303
|
+
builtin_types.markdown,
|
|
304
|
+
comment="Content of the page in markdown",
|
|
305
|
+
),
|
|
306
|
+
],
|
|
307
|
+
key_defs=[Key.define(["Title"])],
|
|
308
|
+
annotations={
|
|
309
|
+
chaise_tags.table_display: {
|
|
310
|
+
"detailed": {
|
|
311
|
+
"hide_column_headers": True,
|
|
312
|
+
"collapse_toc_panel": True,
|
|
313
|
+
}
|
|
314
|
+
},
|
|
315
|
+
chaise_tags.visible_foreign_keys: {"detailed": {}},
|
|
316
|
+
chaise_tags.visible_columns: {"detailed": ["Content"]},
|
|
317
|
+
},
|
|
291
318
|
)
|
|
292
319
|
)
|
|
293
|
-
|
|
294
|
-
initialize_ml_schema(model, schema_name)
|
|
320
|
+
return www_schema
|
|
295
321
|
|
|
296
322
|
|
|
297
323
|
def initialize_ml_schema(model: Model, schema_name: str = "deriva-ml"):
|
|
298
324
|
catalog = model.catalog
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
)
|
|
302
|
-
execution_metadata_type.insert(
|
|
325
|
+
asset_type = catalog.getPathBuilder().schemas[schema_name].tables["Asset_Type"]
|
|
326
|
+
asset_type.insert(
|
|
303
327
|
[
|
|
304
328
|
{
|
|
305
329
|
"Name": "Execution_Config",
|
|
@@ -309,6 +333,22 @@ def initialize_ml_schema(model: Model, schema_name: str = "deriva-ml"):
|
|
|
309
333
|
"Name": "Runtime_Env",
|
|
310
334
|
"Description": "Information about the execution environment",
|
|
311
335
|
},
|
|
336
|
+
{
|
|
337
|
+
"Name": "Execution_Metadata",
|
|
338
|
+
"Description": "Information about the execution environment",
|
|
339
|
+
},
|
|
340
|
+
{
|
|
341
|
+
"Name": "Execution_Asset",
|
|
342
|
+
"Description": "A file generated by an execution",
|
|
343
|
+
},
|
|
344
|
+
],
|
|
345
|
+
defaults={"ID", "URI"},
|
|
346
|
+
)
|
|
347
|
+
asset_role = catalog.getPathBuilder().schemas[schema_name].tables["Asset_Role"]
|
|
348
|
+
asset_role.insert(
|
|
349
|
+
[
|
|
350
|
+
{"Name": "Input", "Description": "Asset used for input of an execution."},
|
|
351
|
+
{"Name": "Output", "Description": "Asset used for output of an execution."},
|
|
312
352
|
],
|
|
313
353
|
defaults={"ID", "URI"},
|
|
314
354
|
)
|