deriva-ml 1.13.3__py3-none-any.whl → 1.14.26__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
- deriva_ml/__init__.py +25 -30
- deriva_ml/core/__init__.py +39 -0
- deriva_ml/core/base.py +1489 -0
- deriva_ml/core/constants.py +36 -0
- deriva_ml/core/definitions.py +74 -0
- deriva_ml/core/enums.py +222 -0
- deriva_ml/core/ermrest.py +288 -0
- deriva_ml/core/exceptions.py +28 -0
- deriva_ml/core/filespec.py +116 -0
- deriva_ml/dataset/__init__.py +4 -0
- deriva_ml/{dataset_aux_classes.py → dataset/aux_classes.py} +16 -12
- deriva_ml/{dataset.py → dataset/dataset.py} +408 -416
- deriva_ml/{dataset_bag.py → dataset/dataset_bag.py} +137 -97
- deriva_ml/{history.py → dataset/history.py} +52 -33
- deriva_ml/{upload.py → dataset/upload.py} +48 -70
- deriva_ml/demo_catalog.py +233 -183
- deriva_ml/execution/environment.py +290 -0
- deriva_ml/{execution.py → execution/execution.py} +365 -252
- deriva_ml/execution/execution_configuration.py +163 -0
- deriva_ml/{execution_configuration.py → execution/workflow.py} +206 -218
- deriva_ml/feature.py +83 -46
- deriva_ml/model/__init__.py +0 -0
- deriva_ml/{deriva_model.py → model/catalog.py} +113 -132
- deriva_ml/{database_model.py → model/database.py} +52 -74
- deriva_ml/model/sql_mapper.py +44 -0
- deriva_ml/run_notebook.py +19 -11
- deriva_ml/schema/__init__.py +3 -0
- deriva_ml/{schema_setup → schema}/annotations.py +31 -22
- deriva_ml/schema/check_schema.py +104 -0
- deriva_ml/{schema_setup → schema}/create_schema.py +151 -104
- deriva_ml/schema/deriva-ml-reference.json +8525 -0
- deriva_ml/schema/table_comments_utils.py +57 -0
- {deriva_ml-1.13.3.dist-info → deriva_ml-1.14.26.dist-info}/METADATA +5 -4
- deriva_ml-1.14.26.dist-info/RECORD +40 -0
- {deriva_ml-1.13.3.dist-info → deriva_ml-1.14.26.dist-info}/entry_points.txt +1 -0
- deriva_ml/deriva_definitions.py +0 -372
- deriva_ml/deriva_ml_base.py +0 -1046
- deriva_ml/execution_environment.py +0 -139
- deriva_ml/schema_setup/table_comments_utils.py +0 -56
- deriva_ml/test-files/execution-parameters.json +0 -1
- deriva_ml/test-files/notebook-parameters.json +0 -5
- deriva_ml/test_functions.py +0 -141
- deriva_ml/test_notebook.ipynb +0 -197
- deriva_ml-1.13.3.dist-info/RECORD +0 -31
- /deriva_ml/{schema_setup → execution}/__init__.py +0 -0
- /deriva_ml/{schema_setup → schema}/policy.json +0 -0
- {deriva_ml-1.13.3.dist-info → deriva_ml-1.14.26.dist-info}/WHEEL +0 -0
- {deriva_ml-1.13.3.dist-info → deriva_ml-1.14.26.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.13.3.dist-info → deriva_ml-1.14.26.dist-info}/top_level.txt +0 -0
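The headline change is a repackaging: the flat 1.13.3 modules are split into `core`, `dataset`, `execution`, `model`, and `schema` subpackages, the bundled test files and notebook are dropped from the wheel, and a reference schema (`deriva-ml-reference.json`) plus a schema checker (`check_schema.py`) are added. For consumers, the visible effect is new import paths. Below is a sketch of the mapping, using the new-style imports that appear verbatim in the `demo_catalog.py` diff further down; the 1.13.3 paths are partly truncated in this view, so treat the old side as illustrative:

```python
# deriva-ml 1.13.3 (flat layout; illustrative, the old imports are truncated in this diff):
# from deriva_ml import ColumnDefinition, DatasetVersion, RID

# deriva-ml 1.14.26 (subpackage layout, as imported by demo_catalog.py below):
from deriva_ml import DerivaML, MLVocab
from deriva_ml.core.definitions import RID, BuiltinTypes, ColumnDefinition
from deriva_ml.dataset.aux_classes import DatasetVersion
from deriva_ml.execution.execution import Execution
from deriva_ml.execution.execution_configuration import ExecutionConfiguration
from deriva_ml.schema.annotations import catalog_annotation
```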
deriva_ml/demo_catalog.py
CHANGED
@@ -1,150 +1,188 @@
+from __future__ import annotations
+
 import atexit
-from importlib.resources import files
 import itertools
 import logging
-
-from
+import string
+from collections.abc import Iterator, Sequence
+from numbers import Integral
+from pathlib import Path
+from random import choice, randint, random
 from tempfile import TemporaryDirectory
 
-from deriva.core import DerivaServer, get_credential
 from deriva.core import ErmrestCatalog
-from deriva.core.
-from
-from requests import HTTPError
-
-
-from .
-from deriva_ml import
-
-
-
-
-    ColumnDefinition,
-    DatasetVersion,
-    RID,
+from deriva.core.ermrest_model import Column, Schema, Table, builtin_types
+from pydantic import BaseModel, ConfigDict
+from requests.exceptions import HTTPError
+
+from deriva_ml import DerivaML, MLVocab
+from deriva_ml.core.definitions import RID, BuiltinTypes, ColumnDefinition
+from deriva_ml.dataset.aux_classes import DatasetVersion
+from deriva_ml.execution.execution import Execution
+from deriva_ml.execution.execution_configuration import ExecutionConfiguration
+from deriva_ml.schema import (
+    create_ml_catalog,
 )
+from deriva_ml.schema.annotations import catalog_annotation
 
-
-
-
-)
+try:
+    from icecream import ic
+
+    ic.configureOutput(includeContext=True)
+except ImportError:  # Graceful fallback if IceCream isn't installed.
+    ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a)  # noqa
+
+
+TEST_DATASET_SIZE = 12
 
-
-
-
-def reset_demo_catalog(deriva_ml: DerivaML, sname: str):
-    model = deriva_ml.model
-    for trial in range(3):
-        for t in [v for v in model.schemas[sname].tables.values()]:
-            try:
-                t.drop()
-            except HTTPError:
-                pass
-    model.schemas[sname].drop()
-    # Empty out remaining tables.
-    pb = deriva_ml.pathBuilder
-    retry = True
-    while retry:
-        for t in pb.schemas["deriva-ml"].tables.values():
-            for e in t.entities().fetch():
-                try:
-                    t.filter(t.RID == e["RID"]).delete()
-                except DataPathException:  # FK constraint.
-                    retry = True
-    initialize_ml_schema(model, "deriva-ml")
-    create_domain_schema(deriva_ml, sname)
-
-
-def populate_demo_catalog(deriva_ml: DerivaML, sname: str) -> None:
+
+def populate_demo_catalog(ml_instance: DerivaML) -> None:
     # Delete any vocabularies and features.
-    domain_schema =
+    domain_schema = ml_instance.pathBuilder.schemas[ml_instance.domain_schema]
     subject = domain_schema.tables["Subject"]
     ss = subject.insert([{"Name": f"Thing{t + 1}"} for t in range(TEST_DATASET_SIZE)])
-
+
+    ml_instance.add_term(
         MLVocab.workflow_type,
         "Demo Catalog Creation",
         description="A workflow demonstrating how to create a demo catalog.",
     )
-    execution =
+    execution = ml_instance.create_execution(
         ExecutionConfiguration(
-            workflow=
-                name="Demo Catalog", workflow_type="Demo Catalog Creation"
-            )
+            workflow=ml_instance.create_workflow(name="Demo Catalog", workflow_type="Demo Catalog Creation")
         )
     )
     with execution.execute() as e:
         for s in ss:
-            image_file = e.asset_file_path(
-
-            )
-            with open(image_file, "w") as f:
+            image_file = e.asset_file_path("Image", f"test_{s['RID']}.txt", Subject=s["RID"])
+            with image_file.open("w") as f:
                 f.write(f"Hello there {random()}\n")
         execution.upload_execution_outputs()
 
 
-
-
-
+class DatasetDescription(BaseModel):
+    types: list[str]  # Types of the dataset.
+    description: str  # Description.
+    members: dict[
+        str, int | list[DatasetDescription]
+    ]  # Either a list of nested dataset, or then number of elements to add
+    member_rids: dict[str, list[RID]] = {}  # The rids of the members of the dataset.
+    version: DatasetVersion = DatasetVersion(1, 0, 0)  # The initial version.
+    rid: RID = None  # RID of dataset that was created.
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+
+def create_datasets(
+    client: Execution,
+    spec: DatasetDescription,
+    member_rids: dict[str, Iterator[RID]],
+) -> DatasetDescription:
+    """
+    Create a dataset per `spec`, then add child members (either by slicing
+    off pre-generated RIDs or by recursing on nested specs).
+    """
+    dataset_rid = client.create_dataset(
+        dataset_types=spec.types,
+        description=spec.description,
+        version=spec.version,
+    )
 
-
-
-
+    result_spec = DatasetDescription(
+        description=spec.description,
+        members={},
+        types=spec.types,
+        rid=dataset_rid,
+        version=spec.version,
     )
-
-
+    dataset_rids = {}
+    for member_type, value in spec.members.items():
+        if isinstance(value, Sequence) and not isinstance(value, (str, bytes)):
+            nested_specs: list[DatasetDescription] = list(value)
+            rids: list[RID] = []
+            for child_spec in nested_specs:
+                child_ds = create_datasets(client, child_spec, member_rids)
+                result_spec.members.setdefault(member_type, []).append(child_ds)
+                rids.append(child_ds.rid)
+        elif isinstance(value, Integral):
+            count = int(value)
+            # take exactly `count` RIDs (or an empty list if count <= 0)
+            rids = list(itertools.islice(member_rids[member_type], count))
+            assert len(rids) == count, f"Expected {count} RIDs, got {len(rids)}"
+            result_spec.members[member_type] = count
+        else:
+            raise TypeError(
+                f"Expected spec.members['{member_type}'] to be either an int or a list, got {type(value).__name__!r}"
+            )
+
+        # attach and record
+        if rids:
+            dataset_rids[member_type] = rids
+            result_spec.member_rids.setdefault(member_type, []).extend(rids)
+    client.add_dataset_members(dataset_rid, dataset_rids, description="Added by create_datasets")
+
+    return result_spec
+
+
+def dataset_spec() -> DatasetDescription:
+    dataset = DatasetDescription(
+        description="A dataset",
+        members={"Subject": 2},
+        types=[],
+    )
+
+    training_dataset = DatasetDescription(
+        description="A dataset that is nested",
+        members={"Dataset": [dataset, dataset], "Image": 2},
+        types=["Testing"],
+    )
+
+    testing_dataset = DatasetDescription(
+        description="A dataset that is nested",
+        members={"Dataset": [dataset, dataset], "Image": 2},
+        types=["Testing"],
     )
 
-
-
-
-
+    double_nested_dataset = DatasetDescription(
+        description="A dataset that is double nested",
+        members={"Dataset": [training_dataset, testing_dataset]},
+        types=["Complete"],
     )
+    return double_nested_dataset
+
+
+def create_demo_datasets(ml_instance: DerivaML) -> DatasetDescription:
+    """Create datasets from a populated catalog."""
+    ml_instance.add_dataset_element_type("Subject")
+    ml_instance.add_dataset_element_type("Image")
+
+    _type_rid = ml_instance.add_term("Dataset_Type", "Complete", synonyms=["Whole"], description="A test")
+    _training_rid = ml_instance.add_term("Dataset_Type", "Training", synonyms=["Train"], description="A training set")
+    _testing_rid = ml_instance.add_term("Dataset_Type", "Testing", description="A testing set")
+
+    table_path = ml_instance.catalog.getPathBuilder().schemas[ml_instance.domain_schema].tables["Subject"]
     subject_rids = [i["RID"] for i in table_path.entities().fetch()]
+    table_path = ml_instance.catalog.getPathBuilder().schemas[ml_instance.domain_schema].tables["Image"]
+    image_rids = [i["RID"] for i in table_path.entities().fetch()]
 
     ml_instance.add_term(
         MLVocab.workflow_type,
         "Create Dataset Workflow",
         description="A Workflow that creates a new dataset.",
     )
-    dataset_workflow = ml_instance.create_workflow(
-        name="API Workflow", workflow_type="Create Dataset Workflow"
-    )
+    dataset_workflow = ml_instance.create_workflow(name="API Workflow", workflow_type="Create Dataset Workflow")
 
     dataset_execution = ml_instance.create_execution(
         ExecutionConfiguration(workflow=dataset_workflow, description="Create Dataset")
     )
 
     with dataset_execution.execute() as exe:
-
-
-
-            dataset_types=[type_rid.name, "Testing"],
-            description=f"Dataset {r}",
-            version=DatasetVersion(1, 0, 0),
-        )
-        ml_instance.add_dataset_members(d, [r])
-        dataset_rids.append(d)
-
-        nested_datasets = []
-        for i in range(0, 4, 2):
-            nested_dataset = exe.create_dataset(
-                dataset_types=[type_rid.name, "Training"],
-                description=f"Nested Dataset {i}",
-                version=DatasetVersion(1, 0, 0),
-            )
-            exe.add_dataset_members(nested_dataset, dataset_rids[i : i + 2])
-            nested_datasets.append(nested_dataset)
-
-        double_nested_dataset = exe.create_dataset(
-            dataset_types=type_rid.name,
-            description="Double nested dataset",
-            version=DatasetVersion(1, 0, 0),
-        )
-        exe.add_dataset_members(double_nested_dataset, nested_datasets)
-        return double_nested_dataset, nested_datasets, dataset_rids
+        spec = dataset_spec()
+        dataset = create_datasets(exe, spec, {"Subject": iter(subject_rids), "Image": iter(image_rids)})
+    return dataset
 
 
-def create_demo_features(ml_instance):
+def create_demo_features(ml_instance: DerivaML) -> None:
     ml_instance.create_vocabulary("SubjectHealth", "A vocab")
     ml_instance.add_term(
         "SubjectHealth",
@@ -156,14 +194,10 @@ def create_demo_features(ml_instance):
         "Well",
         description="The subject self reports that they feel well",
     )
-    ml_instance.create_vocabulary(
-        "ImageQuality", "Controlled vocabulary for image quality"
-    )
+    ml_instance.create_vocabulary("ImageQuality", "Controlled vocabulary for image quality")
     ml_instance.add_term("ImageQuality", "Good", description="The image is good")
     ml_instance.add_term("ImageQuality", "Bad", description="The image is bad")
-    box_asset = ml_instance.create_asset(
-        "BoundingBox", comment="A file that contains a cropped version of a image"
-    )
+    box_asset = ml_instance.create_asset("BoundingBox", comment="A file that contains a cropped version of a image")
 
     ml_instance.create_feature(
         "Subject",
@@ -186,30 +220,20 @@ def create_demo_features(ml_instance):
         "Feature Notebook Workflow",
         description="A Workflow that uses Deriva ML API",
     )
-    ml_instance.add_term(
-
-    )
-    notebook_workflow = ml_instance.create_workflow(
-        name="API Workflow", workflow_type="Feature Notebook Workflow"
-    )
+    ml_instance.add_term(MLVocab.asset_type, "API_Model", description="Model for our Notebook workflow")
+    notebook_workflow = ml_instance.create_workflow(name="API Workflow", workflow_type="Feature Notebook Workflow")
 
     feature_execution = ml_instance.create_execution(
-        ExecutionConfiguration(
-            workflow=notebook_workflow, description="Our Sample Workflow instance"
-        )
+        ExecutionConfiguration(workflow=notebook_workflow, description="Our Sample Workflow instance")
     )
 
-    subject_rids = [
-
-
-    image_rids = [
-        i["RID"] for i in ml_instance.domain_path.tables["Image"].entities().fetch()
-    ]
-    subject_feature_list = [
+    subject_rids = [i["RID"] for i in ml_instance.domain_path.tables["Subject"].entities().fetch()]
+    image_rids = [i["RID"] for i in ml_instance.domain_path.tables["Image"].entities().fetch()]
+    _subject_feature_list = [
         SubjectWellnessFeature(
             Subject=subject_rid,
             Execution=feature_execution.execution_rid,
-            SubjectHealth=["Well", "Sick"]
+            SubjectHealth=choice(["Well", "Sick"]),
             Scale=randint(1, 10),
         )
         for subject_rid in subject_rids
@@ -218,10 +242,8 @@ def create_demo_features(ml_instance):
     # Create a new set of images. For fun, lets wrap this in an execution so we get status updates
     bounding_box_files = []
     for i in range(10):
-        bounding_box_file = feature_execution.asset_file_path(
-
-        )
-        with open(bounding_box_file, "w") as fp:
+        bounding_box_file = feature_execution.asset_file_path("BoundingBox", f"box{i}.txt")
+        with bounding_box_file.open("w") as fp:
            fp.write(f"Hi there {i}")
         bounding_box_files.append(bounding_box_file)
 
@@ -230,15 +252,13 @@ def create_demo_features(ml_instance):
             Image=image_rid,
             BoundingBox=asset_name,
         )
-        for image_rid, asset_name in zip(
-            image_rids, itertools.cycle(bounding_box_files)
-        )
+        for image_rid, asset_name in zip(image_rids, itertools.cycle(bounding_box_files))
     ]
 
     image_quality_feature_list = [
         ImageQualityFeature(
             Image=image_rid,
-            ImageQuality=["Good", "Bad"]
+            ImageQuality=choice(["Good", "Bad"]),
         )
         for image_rid in image_rids
     ]
@@ -246,99 +266,129 @@ def create_demo_features(ml_instance):
     subject_feature_list = [
         SubjectWellnessFeature(
             Subject=subject_rid,
-            SubjectHealth=["Well", "Sick"]
+            SubjectHealth=choice(["Well", "Sick"]),
             Scale=randint(1, 10),
         )
         for subject_rid in subject_rids
     ]
 
     with feature_execution.execute() as execution:
-
-
-
+        execution.add_features(image_bounding_box_feature_list)
+        execution.add_features(image_quality_feature_list)
+        execution.add_features(subject_feature_list)
 
     feature_execution.upload_execution_outputs()
 
 
-def
+def create_demo_files(ml_instance: DerivaML):
+    """Create demo files for testing purposes.
+
+    Args:
+        ml_instance: The DerivaML instance to create files for.
+
+    Returns:
+        None. Creates files in the working directory.
+    """
+
+    def random_string(length: int) -> str:
+        """Generate a random string of specified length.
+
+        Args:
+            length: The length of the string to generate.
+
+        Returns:
+            A random string of the specified length.
+        """
+        return "".join(random.choice(string.ascii_letters) for _ in range(length))
+
+    test_dir = ml_instance.working_dir / "test_dir"
+    test_dir.mkdir(parents=True, exist_ok=True)
+    d1 = test_dir / "d1"
+    d1.mkdir(parents=True, exist_ok=True)
+    d2 = test_dir / "d2"
+    d2.mkdir(parents=True, exist_ok=True)
+
+    # Create some demo files
+    for d in [test_dir, d1, d2]:
+        for i in range(5):
+            fname = Path(d) / f"file{i}.{random.choice(['txt', 'jpeg'])}"
+            with fname.open("w") as f:
+                f.write(random_string(10))
+    ml_instance.add_term(MLVocab.workflow_type, "File Test Workflow", description="Test workflow")
+
+
+def create_domain_schema(catalog: ErmrestCatalog, sname: str) -> None:
     """
     Create a domain schema. Assumes that the ml-schema has already been created.
-    :param model:
     :param sname:
     :return:
     """
+    model = catalog.getCatalogModel()
+    _ = model.schemas["deriva-ml"]
 
-
-
-
-
-
-
-
-
-
+    try:
+        model.schemas[sname].drop(cascade=True)
+    except KeyError:
+        pass
+    except HTTPError as e:
+        print(e)
+        if f"Schema {sname} does not exist" in str(e):
+            pass
+        else:
+            raise e
+
+    domain_schema = model.create_schema(Schema.define(sname, annotations={"name_style": {"underline_space": True}}))
     subject_table = domain_schema.create_table(
         Table.define("Subject", column_defs=[Column.define("Name", builtin_types.text)])
     )
-
-
-
+    with TemporaryDirectory() as tmpdir:
+        ml_instance = DerivaML(hostname=catalog.deriva_server.server, catalog_id=catalog.catalog_id, working_dir=tmpdir)
+        ml_instance.create_asset("Image", referenced_tables=[subject_table])
+    catalog_annotation(ml_instance.model)
 
 
 def destroy_demo_catalog(catalog):
+    """Destroy the demo catalog and clean up resources.
+
+    Args:
+        catalog: The ErmrestCatalog instance to destroy.
+
+    Returns:
+        None. Destroys the catalog.
+    """
     catalog.delete_ermrest_catalog(really=True)
 
 
 def create_demo_catalog(
     hostname,
-    domain_schema="
+    domain_schema="demo-schema",
     project_name="ml-test",
     populate=True,
     create_features=False,
     create_datasets=False,
     on_exit_delete=True,
+    logging_level=logging.INFO,
 ) -> ErmrestCatalog:
-
-
-    server = DerivaServer("https", hostname, credentials=credential)
-    test_catalog = server.create_ermrest_catalog()
-    model = test_catalog.getCatalogModel()
-    model.configure_baseline_catalog()
-    policy_file = files("deriva_ml.schema_setup").joinpath("policy.json")
-    subprocess.run(
-        [
-            "deriva-acl-config",
-            "--host",
-            test_catalog.deriva_server.server,
-            "--config-file",
-            policy_file,
-            test_catalog.catalog_id,
-        ]
-    )
-
+    test_catalog = create_ml_catalog(hostname, project_name=project_name)
     if on_exit_delete:
         atexit.register(destroy_demo_catalog, test_catalog)
-
     try:
         with TemporaryDirectory() as tmpdir:
-
-
-                hostname
+            create_domain_schema(test_catalog, domain_schema)
+            ml_instance = DerivaML(
+                hostname,
                 catalog_id=test_catalog.catalog_id,
-                project_name=project_name,
                 domain_schema=domain_schema,
-                logging_level=logging.WARN,
                 working_dir=tmpdir,
-
+                logging_level=logging_level,
             )
-            create_domain_schema(deriva_ml, domain_schema)
 
             if populate or create_features or create_datasets:
-                populate_demo_catalog(
+                populate_demo_catalog(ml_instance)
             if create_features:
-                create_demo_features(
+                create_demo_features(ml_instance)
             if create_datasets:
-                create_demo_datasets(
+                create_demo_datasets(ml_instance)
 
     except Exception:
         # on failure, delete catalog and re-raise exception
@@ -352,8 +402,8 @@ class DemoML(DerivaML):
         self,
         hostname,
         catalog_id,
-        cache_dir:
-        working_dir:
+        cache_dir: str | None = None,
+        working_dir: str | None = None,
         use_minid=True,
     ):
         super().__init__(