deriva-ml 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/dataset.py +1 -1
- deriva_ml/dataset_bag.py +10 -3
- deriva_ml/demo_catalog.py +84 -78
- deriva_ml/deriva_definitions.py +2 -2
- deriva_ml/deriva_ml_base.py +87 -128
- deriva_ml/deriva_model.py +25 -0
- deriva_ml/execution.py +389 -309
- deriva_ml/execution_configuration.py +16 -6
- deriva_ml/feature.py +1 -2
- deriva_ml/schema_setup/create_schema.py +223 -183
- deriva_ml/upload.py +95 -232
- {deriva_ml-1.10.0.dist-info → deriva_ml-1.11.0.dist-info}/METADATA +2 -1
- deriva_ml-1.11.0.dist-info/RECORD +27 -0
- {deriva_ml-1.10.0.dist-info → deriva_ml-1.11.0.dist-info}/WHEEL +1 -1
- deriva_ml-1.10.0.dist-info/RECORD +0 -27
- {deriva_ml-1.10.0.dist-info → deriva_ml-1.11.0.dist-info}/entry_points.txt +0 -0
- {deriva_ml-1.10.0.dist-info → deriva_ml-1.11.0.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.10.0.dist-info → deriva_ml-1.11.0.dist-info}/top_level.txt +0 -0
deriva_ml/dataset.py
CHANGED

@@ -805,7 +805,7 @@ class Dataset:
         dataset_elements = [
             snapshot_catalog._model.name_to_table(e)
             for e, m in snapshot_catalog.list_dataset_members(
-                dataset_rid=dataset_rid,
+                dataset_rid=dataset_rid,  # limit=1 Limit seems to make things run slow.
             ).items()
             if m
         ]
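The loop above builds `dataset_elements` from whatever `list_dataset_members()` returns for the given dataset RID; the new comment records that passing `limit=1` appeared to slow the query, so no limit is passed. A condensed sketch of the same call shape, with names taken from the hunk (`snapshot_catalog` and `dataset_rid` are assumed to already be in scope, as they are inside `Dataset`):

```python
# Sketch only: mirrors the hunk above; `snapshot_catalog` and `dataset_rid`
# are assumed to exist, as they do inside Dataset.
members = snapshot_catalog.list_dataset_members(dataset_rid=dataset_rid)
# `members` maps element-type name -> member records; element types with no
# members are skipped before resolving names to table objects.
dataset_elements = [
    snapshot_catalog._model.name_to_table(name)
    for name, rows in members.items()
    if rows
]
```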
deriva_ml/dataset_bag.py
CHANGED

@@ -168,7 +168,7 @@ class DatasetBag:
             yield dict(zip(col_names, row))

     @validate_call
-    def list_dataset_members(self, recurse: bool = False) -> dict[str,
+    def list_dataset_members(self, recurse: bool = False) -> dict[str, dict[str, Any]]:
         """Return a list of entities associated with a specific _dataset_table.

         Args:
@@ -206,12 +206,19 @@ class DatasetBag:
             )

             with self.database as db:
+                col_names = [
+                    c[1]
+                    for c in db.execute(f'PRAGMA table_info("{sql_target}")').fetchall()
+                ]
+                select_cols = ",".join([f'"{sql_target}".{c}' for c in col_names])
                 sql_cmd = (
-                    f'SELECT
+                    f'SELECT {select_cols} FROM "{sql_member}" '
                     f'JOIN "{sql_target}" ON "{sql_member}".{member_link[0]} = "{sql_target}".{member_link[1]} '
                     f'WHERE "{self.dataset_rid}" = "{sql_member}".Dataset;'
                 )
-                target_entities =
+                target_entities = [
+                    dict(zip(col_names, e)) for e in db.execute(sql_cmd).fetchall()
+                ]
                 members[target_table.name].extend(target_entities)

             target_entities = []  # path.entities().fetch()
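The net effect of these two hunks is that `DatasetBag.list_dataset_members()` now discovers the target table's column names with `PRAGMA table_info`, selects them explicitly, and returns each member as a dict keyed by column name instead of a bare cursor row (the new annotation reads `dict[str, dict[str, Any]]`, while the body accumulates a list of row dicts per member table). A minimal sketch of consuming the result, assuming `bag` is an already-constructed `DatasetBag` for a downloaded dataset:

```python
# Sketch only: `bag` is assumed to be an existing DatasetBag instance.
members = bag.list_dataset_members()
for table_name, rows in members.items():
    for row in rows:
        # Each row is a plain dict keyed by the member table's column names.
        print(table_name, row.get("RID"), row)
```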
deriva_ml/demo_catalog.py
CHANGED

@@ -2,9 +2,7 @@ import atexit
 from importlib.metadata import version
 from importlib.resources import files
 import logging
-from random import
-import tempfile
-from tempfile import TemporaryDirectory
+from random import randint, random
 from typing import Optional
 import itertools

@@ -12,7 +10,6 @@ from deriva.config.acl_config import AclConfig
 from deriva.core import DerivaServer
 from deriva.core import ErmrestCatalog, get_credential
 from deriva.core.datapath import DataPathException
-from deriva.core.ermrest_model import Model
 from deriva.core.ermrest_model import builtin_types, Schema, Table, Column
 from requests import HTTPError

@@ -35,48 +32,51 @@ TEST_DATASET_SIZE = 4
 def reset_demo_catalog(deriva_ml: DerivaML, sname: str):
     model = deriva_ml.model
     for trial in range(3):
-        for t in [
-            v
-            for v in model.schemas[sname].tables.values()
-            if v.name not in {"Subject", "Image"}
-        ]:
+        for t in [v for v in model.schemas[sname].tables.values()]:
             try:
                 t.drop()
             except HTTPError:
                 pass
-
+    model.schemas[sname].drop()
     # Empty out remaining tables.
     pb = deriva_ml.pathBuilder
     retry = True
     while retry:
-
-
-
-
-
-
-            except DataPathException:  # FK constraint.
-                retry = True
-
+        for t in pb.schemas["deriva-ml"].tables.values():
+            for e in t.entities().fetch():
+                try:
+                    t.filter(t.RID == e["RID"]).delete()
+                except DataPathException:  # FK constraint.
+                    retry = True
     initialize_ml_schema(model, "deriva-ml")
+    create_domain_schema(deriva_ml, sname)


 def populate_demo_catalog(deriva_ml: DerivaML, sname: str) -> None:
     # Delete any vocabularies and features.
-    reset_demo_catalog(deriva_ml, sname)
     domain_schema = deriva_ml.catalog.getPathBuilder().schemas[sname]
     subject = domain_schema.tables["Subject"]
     ss = subject.insert([{"Name": f"Thing{t + 1}"} for t in range(TEST_DATASET_SIZE)])
-
-
-
+    deriva_ml.add_term(
+        MLVocab.workflow_type,
+        "Demo Catalog Creation",
+        description="A workflow demonstrating how to create a demo catalog.",
+    )
+    execution = deriva_ml.create_execution(
+        ExecutionConfiguration(
+            workflow=deriva_ml.create_workflow(
+                name="Demo Catalog", workflow_type="Demo Catalog Creation"
+            )
+        )
+    )
+    with execution.execute() as e:
         for s in ss:
-            image_file =
-                f"test_{s['RID']}.txt",
+            image_file = e.asset_file_path(
+                "Image", f"test_{s['RID']}.txt", Subject=s["RID"]
+            )
             with open(image_file, "w") as f:
                 f.write(f"Hello there {random()}\n")
-
+    execution.upload_execution_outputs()


 def create_demo_datasets(ml_instance: DerivaML) -> tuple[RID, list[RID], list[RID]]:
@@ -84,6 +84,13 @@ def create_demo_datasets(ml_instance: DerivaML) -> tuple[RID, list[RID], list[RID]]:
     ml_instance.add_dataset_element_type("Image")

     type_rid = ml_instance.add_term("Dataset_Type", "TestSet", description="A test")
+    training_rid = ml_instance.add_term(
+        "Dataset_Type", "Training", description="A traing set"
+    )
+    testing_rid = ml_instance.add_term(
+        "Dataset_Type", "Testing", description="A testing set"
+    )
+
     table_path = (
         ml_instance.catalog.getPathBuilder()
         .schemas[ml_instance.domain_schema]
@@ -94,7 +101,7 @@ def create_demo_datasets(ml_instance: DerivaML) -> tuple[RID, list[RID], list[RID]]:
     dataset_rids = []
     for r in subject_rids[0:4]:
         d = ml_instance.create_dataset(
-            type_rid.name,
+            type=[type_rid.name, "Testing"],
             description=f"Dataset {r}",
             version=DatasetVersion(1, 0, 0),
         )
@@ -104,7 +111,7 @@ def create_demo_datasets(ml_instance: DerivaML) -> tuple[RID, list[RID], list[RID]]:
     nested_datasets = []
     for i in range(0, 4, 2):
         nested_dataset = ml_instance.create_dataset(
-            type_rid.name,
+            type=[type_rid.name, "Training"],
             description=f"Nested Dataset {i}",
             version=DatasetVersion(1, 0, 0),
         )
@@ -132,13 +139,11 @@ def create_demo_features(ml_instance):
         "Well",
         description="The subject self reports that they feel well",
     )
-
     ml_instance.create_vocabulary(
         "ImageQuality", "Controlled vocabulary for image quality"
     )
     ml_instance.add_term("ImageQuality", "Good", description="The image is good")
     ml_instance.add_term("ImageQuality", "Bad", description="The image is bad")
-
     box_asset = ml_instance.create_asset(
         "BoundingBox", comment="A file that contains a cropped version of a image"
     )
@@ -150,7 +155,6 @@ def create_demo_features(ml_instance):
         metadata=[ColumnDefinition(name="Scale", type=BuiltinTypes.int2, nullok=True)],
         optional=["Scale"],
     )
-
     ml_instance.create_feature("Image", "BoundingBox", assets=[box_asset])
     ml_instance.create_feature("Image", "Quality", terms=["ImageQuality"])

@@ -158,78 +162,88 @@
     ImageBoundingboxFeature = ml_instance.feature_record_class("Image", "BoundingBox")
     SubjectWellnessFeature = ml_instance.feature_record_class("Subject", "Health")

+    # Get the workflow for this notebook
+
     ml_instance.add_term(
         MLVocab.workflow_type,
-        "
+        "Feature Notebook Workflow",
         description="A Workflow that uses Deriva ML API",
     )
     ml_instance.add_term(
-        MLVocab.
-        "API_Model",
-        description="Model for our API workflow",
+        MLVocab.asset_type, "API_Model", description="Model for our Notebook workflow"
     )
-
-
-        name="API Workflow",
-        workflow_type="API Workflow",
+    notebook_workflow = ml_instance.create_workflow(
+        name="API Workflow", workflow_type="Feature Notebook Workflow"
     )

-
+    feature_execution = ml_instance.create_execution(
         ExecutionConfiguration(
-            workflow=
+            workflow=notebook_workflow, description="Our Sample Workflow instance"
         )
     )

-    with tempfile.TemporaryDirectory() as temp_dir:
-        assetdir = ml_instance.asset_dir("BoundingBox", prefix=temp_dir)
-        for i in range(10):
-            with open(assetdir.path / f"box{i}.txt", "w") as fp:
-                fp.write(f"Hi there {i}")
-        bounding_box_assets = ml_instance.upload_assets(assetdir)
-        bounding_box_rids = [a.result["RID"] for a in bounding_box_assets.values()]
-
-    # Get the IDs of al of the things that we are going to want to attach features to.
     subject_rids = [
         i["RID"] for i in ml_instance.domain_path.tables["Subject"].entities().fetch()
     ]
     image_rids = [
         i["RID"] for i in ml_instance.domain_path.tables["Image"].entities().fetch()
     ]
-
     subject_feature_list = [
         SubjectWellnessFeature(
             Subject=subject_rid,
-            Execution=
+            Execution=feature_execution.execution_rid,
             SubjectHealth=["Well", "Sick"][randint(0, 1)],
             Scale=randint(1, 10),
         )
         for subject_rid in subject_rids
     ]

+    # Create a new set of images. For fun, lets wrap this in an execution so we get status updates
+    bounding_box_files = []
+    for i in range(10):
+        bounding_box_file = feature_execution.asset_file_path(
+            "BoundingBox", f"box{i}.txt"
+        )
+        with open(bounding_box_file, "w") as fp:
+            fp.write(f"Hi there {i}")
+        bounding_box_files.append(bounding_box_file)
+
+    image_bounding_box_feature_list = [
+        ImageBoundingboxFeature(
+            Image=image_rid,
+            BoundingBox=asset_name,
+        )
+        for image_rid, asset_name in zip(
+            image_rids, itertools.cycle(bounding_box_files)
+        )
+    ]
+
     image_quality_feature_list = [
         ImageQualityFeature(
             Image=image_rid,
-            Execution=api_execution.execution_rid,
             ImageQuality=["Good", "Bad"][randint(0, 1)],
         )
         for image_rid in image_rids
     ]

-
-
-
-
-
+    subject_feature_list = [
+        SubjectWellnessFeature(
+            Subject=subject_rid,
+            SubjectHealth=["Well", "Sick"][randint(0, 1)],
+            Scale=randint(1, 10),
         )
-        for
+        for subject_rid in subject_rids
     ]

-
-
-
+    with feature_execution.execute() as execution:
+        feature_execution.add_features(image_bounding_box_feature_list)
+        feature_execution.add_features(image_quality_feature_list)
+        feature_execution.add_features(subject_feature_list)
+
+    feature_execution.upload_execution_outputs()


-def create_domain_schema(
+def create_domain_schema(ml_instance: DerivaML, sname: str) -> None:
     """
     Create a domain schema. Assumes that the ml-schema has already been created.
     :param model:
@@ -238,28 +252,19 @@ def create_domain_schema(model: Model, sname: str) -> None:
     """

     # Make sure that we have a ml schema
-    _ = model.schemas["deriva-ml"]
+    _ = ml_instance.model.schemas["deriva-ml"]

-    if model.schemas.get(sname):
+    if ml_instance.model.schemas.get(sname):
         # Clean out any old junk....
-        model.schemas[sname].drop()
+        ml_instance.model.schemas[sname].drop()

-    domain_schema = model.create_schema(
+    domain_schema = ml_instance.model.model.create_schema(
         Schema.define(sname, annotations={"name_style": {"underline_space": True}})
     )
     subject_table = domain_schema.create_table(
         Table.define("Subject", column_defs=[Column.define("Name", builtin_types.text)])
     )
-
-    image_table = domain_schema.create_table(
-        Table.define_asset(
-            sname=sname,
-            tname="Image",
-            hatrac_template="/hatrac/image_asset/{{MD5}}.{{Filename}}",
-            column_defs=[Column.define("Name", builtin_types.text)],
-        )
-    )
-    image_table.create_reference(subject_table)
+    ml_instance.create_asset("Image", referenced_tables=[subject_table])


 def destroy_demo_catalog(catalog):
@@ -284,13 +289,14 @@ def create_demo_catalog(

     try:
         create_ml_schema(model, project_name=project_name)
-        create_domain_schema(model, domain_schema)
         deriva_ml = DerivaML(
             hostname=hostname,
             catalog_id=test_catalog.catalog_id,
             project_name=project_name,
+            domain_schema=domain_schema,
             logging_level=logging.WARN,
         )
+        create_domain_schema(deriva_ml, domain_schema)
         working_dir = deriva_ml.working_dir
         dataset_table = deriva_ml.dataset_table
         dataset_table.annotations.update(
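The dominant change in demo_catalog.py is that demo data is now produced through the Execution API rather than by writing files into a temporary asset directory and calling `upload_assets`: a workflow type term is registered, a workflow and execution are created, files are allocated with `asset_file_path`, features are attached with `add_features`, and everything is uploaded with `upload_execution_outputs`. A condensed sketch of that pattern, distilled from the hunks above (assume `ml` is a connected `DerivaML` instance and `subject_rid` is the RID of an existing Subject; class and term names follow the diff):

```python
# Sketch only: condenses the execution-centric pattern shown in the diff above.
ml.add_term(
    MLVocab.workflow_type,
    "Demo Catalog Creation",
    description="A workflow demonstrating how to create a demo catalog.",
)
execution = ml.create_execution(
    ExecutionConfiguration(
        workflow=ml.create_workflow(
            name="Demo Catalog", workflow_type="Demo Catalog Creation"
        )
    )
)
with execution.execute() as e:
    # asset_file_path() reserves a local file that is uploaded as an "Image"
    # asset linked to the given Subject when the outputs are uploaded.
    image_file = e.asset_file_path("Image", "example.txt", Subject=subject_rid)
    with open(image_file, "w") as f:
        f.write("Hello there\n")
execution.upload_execution_outputs()
```

Datasets created by `create_demo_datasets` likewise now pass `type=` a list of `Dataset_Type` terms (for example `type=[type_rid.name, "Testing"]`), so one dataset can carry both the generic "TestSet" type and a "Training" or "Testing" role, and the domain Image asset table is now built with `create_asset("Image", referenced_tables=[subject_table])` instead of a hand-rolled `Table.define_asset` call.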
deriva_ml/deriva_definitions.py
CHANGED

@@ -186,9 +186,9 @@ class MLVocab(StrEnum):

     dataset_type = "Dataset_Type"
     workflow_type = "Workflow_Type"
-    execution_asset_type = "Execution_Asset_Type"
-    execution_metadata_type = "Execution_Metadata_Type"
     file_type = "File_Type"
+    asset_type = "Asset_Type"
+    asset_role = "Asset_Role"


 class ExecMetadataVocab(StrEnum):