deriva-ml 1.10.1__py3-none-any.whl → 1.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/dataset.py +1 -1
- deriva_ml/dataset_bag.py +10 -3
- deriva_ml/demo_catalog.py +84 -78
- deriva_ml/deriva_definitions.py +2 -2
- deriva_ml/deriva_ml_base.py +85 -121
- deriva_ml/deriva_model.py +25 -0
- deriva_ml/execution.py +386 -309
- deriva_ml/feature.py +1 -2
- deriva_ml/schema_setup/create_schema.py +223 -183
- deriva_ml/upload.py +95 -232
- {deriva_ml-1.10.1.dist-info → deriva_ml-1.11.0.dist-info}/METADATA +2 -1
- deriva_ml-1.11.0.dist-info/RECORD +27 -0
- deriva_ml-1.10.1.dist-info/RECORD +0 -27
- {deriva_ml-1.10.1.dist-info → deriva_ml-1.11.0.dist-info}/WHEEL +0 -0
- {deriva_ml-1.10.1.dist-info → deriva_ml-1.11.0.dist-info}/entry_points.txt +0 -0
- {deriva_ml-1.10.1.dist-info → deriva_ml-1.11.0.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.10.1.dist-info → deriva_ml-1.11.0.dist-info}/top_level.txt +0 -0
deriva_ml/dataset.py
CHANGED
@@ -805,7 +805,7 @@ class Dataset:
         dataset_elements = [
             snapshot_catalog._model.name_to_table(e)
             for e, m in snapshot_catalog.list_dataset_members(
-                dataset_rid=dataset_rid,
+                dataset_rid=dataset_rid,  # limit=1 Limit seems to make things run slow.
             ).items()
             if m
         ]
deriva_ml/dataset_bag.py
CHANGED
@@ -168,7 +168,7 @@ class DatasetBag:
                 yield dict(zip(col_names, row))
 
     @validate_call
-    def list_dataset_members(self, recurse: bool = False) -> dict[str,
+    def list_dataset_members(self, recurse: bool = False) -> dict[str, dict[str, Any]]:
         """Return a list of entities associated with a specific _dataset_table.
 
         Args:
@@ -206,12 +206,19 @@
             )
 
             with self.database as db:
+                col_names = [
+                    c[1]
+                    for c in db.execute(f'PRAGMA table_info("{sql_target}")').fetchall()
+                ]
+                select_cols = ",".join([f'"{sql_target}".{c}' for c in col_names])
                 sql_cmd = (
-                    f'SELECT
+                    f'SELECT {select_cols} FROM "{sql_member}" '
                     f'JOIN "{sql_target}" ON "{sql_member}".{member_link[0]} = "{sql_target}".{member_link[1]} '
                     f'WHERE "{self.dataset_rid}" = "{sql_member}".Dataset;'
                 )
-                target_entities =
+                target_entities = [
+                    dict(zip(col_names, e)) for e in db.execute(sql_cmd).fetchall()
+                ]
             members[target_table.name].extend(target_entities)
 
             target_entities = []  # path.entities().fetch()
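The net effect of the dataset_bag.py change is that list_dataset_members() now derives its SELECT list from the member table's columns (via PRAGMA table_info) and returns full row dictionaries instead of raw cursor tuples. A minimal sketch of consuming the new return shape follows; it assumes `bag` is an already-constructed DatasetBag built from a downloaded dataset, and that each value in the returned mapping is the list of member rows accumulated per table (the declared annotation reads dict[str, dict[str, Any]], but the method body extends per-table lists).

# Sketch only: `bag` is assumed to be an existing DatasetBag instance.
members = bag.list_dataset_members()
for table_name, rows in members.items():
    print(f"{table_name}: {len(rows)} member rows")
    for row in rows[:3]:
        # Rows are dicts keyed by the member table's column names.
        print("  RID =", row.get("RID"))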
deriva_ml/demo_catalog.py
CHANGED
@@ -2,9 +2,7 @@ import atexit
 from importlib.metadata import version
 from importlib.resources import files
 import logging
-from random import
-import tempfile
-from tempfile import TemporaryDirectory
+from random import randint, random
 from typing import Optional
 import itertools
 
@@ -12,7 +10,6 @@ from deriva.config.acl_config import AclConfig
 from deriva.core import DerivaServer
 from deriva.core import ErmrestCatalog, get_credential
 from deriva.core.datapath import DataPathException
-from deriva.core.ermrest_model import Model
 from deriva.core.ermrest_model import builtin_types, Schema, Table, Column
 from requests import HTTPError
 
@@ -35,48 +32,51 @@ TEST_DATASET_SIZE = 4
 def reset_demo_catalog(deriva_ml: DerivaML, sname: str):
     model = deriva_ml.model
     for trial in range(3):
-        for t in [
-            v
-            for v in model.schemas[sname].tables.values()
-            if v.name not in {"Subject", "Image"}
-        ]:
+        for t in [v for v in model.schemas[sname].tables.values()]:
             try:
                 t.drop()
             except HTTPError:
                 pass
-
+    model.schemas[sname].drop()
     # Empty out remaining tables.
     pb = deriva_ml.pathBuilder
     retry = True
     while retry:
-
-
-
-
-
-
-        except DataPathException:  # FK constraint.
-            retry = True
-
+        for t in pb.schemas["deriva-ml"].tables.values():
+            for e in t.entities().fetch():
+                try:
+                    t.filter(t.RID == e["RID"]).delete()
+                except DataPathException:  # FK constraint.
+                    retry = True
     initialize_ml_schema(model, "deriva-ml")
+    create_domain_schema(deriva_ml, sname)
 
 
 def populate_demo_catalog(deriva_ml: DerivaML, sname: str) -> None:
     # Delete any vocabularies and features.
-    reset_demo_catalog(deriva_ml, sname)
     domain_schema = deriva_ml.catalog.getPathBuilder().schemas[sname]
     subject = domain_schema.tables["Subject"]
     ss = subject.insert([{"Name": f"Thing{t + 1}"} for t in range(TEST_DATASET_SIZE)])
-
-
-
+    deriva_ml.add_term(
+        MLVocab.workflow_type,
+        "Demo Catalog Creation",
+        description="A workflow demonstrating how to create a demo catalog.",
+    )
+    execution = deriva_ml.create_execution(
+        ExecutionConfiguration(
+            workflow=deriva_ml.create_workflow(
+                name="Demo Catalog", workflow_type="Demo Catalog Creation"
+            )
+        )
+    )
+    with execution.execute() as e:
         for s in ss:
-            image_file =
-                f"test_{s['RID']}.txt",
+            image_file = e.asset_file_path(
+                "Image", f"test_{s['RID']}.txt", Subject=s["RID"]
             )
             with open(image_file, "w") as f:
                 f.write(f"Hello there {random()}\n")
-
+    execution.upload_execution_outputs()
 
 
 def create_demo_datasets(ml_instance: DerivaML) -> tuple[RID, list[RID], list[RID]]:
@@ -84,6 +84,13 @@ def create_demo_datasets(ml_instance: DerivaML) -> tuple[RID, list[RID], list[RID]]:
     ml_instance.add_dataset_element_type("Image")
 
     type_rid = ml_instance.add_term("Dataset_Type", "TestSet", description="A test")
+    training_rid = ml_instance.add_term(
+        "Dataset_Type", "Training", description="A traing set"
+    )
+    testing_rid = ml_instance.add_term(
+        "Dataset_Type", "Testing", description="A testing set"
+    )
+
     table_path = (
         ml_instance.catalog.getPathBuilder()
         .schemas[ml_instance.domain_schema]
@@ -94,7 +101,7 @@ def create_demo_datasets(ml_instance: DerivaML) -> tuple[RID, list[RID], list[RID]]:
     dataset_rids = []
     for r in subject_rids[0:4]:
         d = ml_instance.create_dataset(
-            type_rid.name,
+            type=[type_rid.name, "Testing"],
             description=f"Dataset {r}",
             version=DatasetVersion(1, 0, 0),
         )
@@ -104,7 +111,7 @@ def create_demo_datasets(ml_instance: DerivaML) -> tuple[RID, list[RID], list[RID]]:
     nested_datasets = []
     for i in range(0, 4, 2):
         nested_dataset = ml_instance.create_dataset(
-            type_rid.name,
+            type=[type_rid.name, "Training"],
             description=f"Nested Dataset {i}",
             version=DatasetVersion(1, 0, 0),
         )
@@ -132,13 +139,11 @@ def create_demo_features(ml_instance):
         "Well",
         description="The subject self reports that they feel well",
     )
-
     ml_instance.create_vocabulary(
         "ImageQuality", "Controlled vocabulary for image quality"
     )
     ml_instance.add_term("ImageQuality", "Good", description="The image is good")
     ml_instance.add_term("ImageQuality", "Bad", description="The image is bad")
-
     box_asset = ml_instance.create_asset(
         "BoundingBox", comment="A file that contains a cropped version of a image"
     )
@@ -150,7 +155,6 @@ def create_demo_features(ml_instance):
         metadata=[ColumnDefinition(name="Scale", type=BuiltinTypes.int2, nullok=True)],
         optional=["Scale"],
     )
-
     ml_instance.create_feature("Image", "BoundingBox", assets=[box_asset])
     ml_instance.create_feature("Image", "Quality", terms=["ImageQuality"])
 
@@ -158,78 +162,88 @@
     ImageBoundingboxFeature = ml_instance.feature_record_class("Image", "BoundingBox")
     SubjectWellnessFeature = ml_instance.feature_record_class("Subject", "Health")
 
+    # Get the workflow for this notebook
+
     ml_instance.add_term(
         MLVocab.workflow_type,
-        "
+        "Feature Notebook Workflow",
         description="A Workflow that uses Deriva ML API",
     )
     ml_instance.add_term(
-        MLVocab.
-        "API_Model",
-        description="Model for our API workflow",
+        MLVocab.asset_type, "API_Model", description="Model for our Notebook workflow"
     )
-
-
-        name="API Workflow",
-        workflow_type="API Workflow",
+    notebook_workflow = ml_instance.create_workflow(
+        name="API Workflow", workflow_type="Feature Notebook Workflow"
     )
 
-
+    feature_execution = ml_instance.create_execution(
         ExecutionConfiguration(
-            workflow=
+            workflow=notebook_workflow, description="Our Sample Workflow instance"
         )
     )
 
-    with tempfile.TemporaryDirectory() as temp_dir:
-        assetdir = ml_instance.asset_dir("BoundingBox", prefix=temp_dir)
-        for i in range(10):
-            with open(assetdir.path / f"box{i}.txt", "w") as fp:
-                fp.write(f"Hi there {i}")
-        bounding_box_assets = ml_instance.upload_assets(assetdir)
-        bounding_box_rids = [a.result["RID"] for a in bounding_box_assets.values()]
-
-    # Get the IDs of al of the things that we are going to want to attach features to.
     subject_rids = [
         i["RID"] for i in ml_instance.domain_path.tables["Subject"].entities().fetch()
     ]
     image_rids = [
         i["RID"] for i in ml_instance.domain_path.tables["Image"].entities().fetch()
     ]
-
     subject_feature_list = [
         SubjectWellnessFeature(
             Subject=subject_rid,
-            Execution=
+            Execution=feature_execution.execution_rid,
             SubjectHealth=["Well", "Sick"][randint(0, 1)],
             Scale=randint(1, 10),
         )
         for subject_rid in subject_rids
     ]
 
+    # Create a new set of images. For fun, lets wrap this in an execution so we get status updates
+    bounding_box_files = []
+    for i in range(10):
+        bounding_box_file = feature_execution.asset_file_path(
+            "BoundingBox", f"box{i}.txt"
+        )
+        with open(bounding_box_file, "w") as fp:
+            fp.write(f"Hi there {i}")
+        bounding_box_files.append(bounding_box_file)
+
+    image_bounding_box_feature_list = [
+        ImageBoundingboxFeature(
+            Image=image_rid,
+            BoundingBox=asset_name,
+        )
+        for image_rid, asset_name in zip(
+            image_rids, itertools.cycle(bounding_box_files)
+        )
+    ]
+
     image_quality_feature_list = [
         ImageQualityFeature(
             Image=image_rid,
-            Execution=api_execution.execution_rid,
             ImageQuality=["Good", "Bad"][randint(0, 1)],
         )
         for image_rid in image_rids
     ]
 
-
-
-
-
-
+    subject_feature_list = [
+        SubjectWellnessFeature(
+            Subject=subject_rid,
+            SubjectHealth=["Well", "Sick"][randint(0, 1)],
+            Scale=randint(1, 10),
         )
-        for
+        for subject_rid in subject_rids
     ]
 
-
-
-
+    with feature_execution.execute() as execution:
+        feature_execution.add_features(image_bounding_box_feature_list)
+        feature_execution.add_features(image_quality_feature_list)
+        feature_execution.add_features(subject_feature_list)
+
+    feature_execution.upload_execution_outputs()
 
 
-def create_domain_schema(
+def create_domain_schema(ml_instance: DerivaML, sname: str) -> None:
     """
     Create a domain schema. Assumes that the ml-schema has already been created.
     :param model:
@@ -238,28 +252,19 @@ def create_domain_schema(model: Model, sname: str) -> None:
     """
 
     # Make sure that we have a ml schema
-    _ = model.schemas["deriva-ml"]
+    _ = ml_instance.model.schemas["deriva-ml"]
 
-    if model.schemas.get(sname):
+    if ml_instance.model.schemas.get(sname):
         # Clean out any old junk....
-        model.schemas[sname].drop()
+        ml_instance.model.schemas[sname].drop()
 
-    domain_schema = model.create_schema(
+    domain_schema = ml_instance.model.model.create_schema(
         Schema.define(sname, annotations={"name_style": {"underline_space": True}})
     )
     subject_table = domain_schema.create_table(
         Table.define("Subject", column_defs=[Column.define("Name", builtin_types.text)])
     )
-
-    image_table = domain_schema.create_table(
-        Table.define_asset(
-            sname=sname,
-            tname="Image",
-            hatrac_template="/hatrac/image_asset/{{MD5}}.{{Filename}}",
-            column_defs=[Column.define("Name", builtin_types.text)],
-        )
-    )
-    image_table.create_reference(subject_table)
+    ml_instance.create_asset("Image", referenced_tables=[subject_table])
 
 
 def destroy_demo_catalog(catalog):
@@ -284,13 +289,14 @@ def create_demo_catalog(
 
     try:
         create_ml_schema(model, project_name=project_name)
-        create_domain_schema(model, domain_schema)
         deriva_ml = DerivaML(
             hostname=hostname,
             catalog_id=test_catalog.catalog_id,
             project_name=project_name,
+            domain_schema=domain_schema,
             logging_level=logging.WARN,
         )
+        create_domain_schema(deriva_ml, domain_schema)
         working_dir = deriva_ml.working_dir
         dataset_table = deriva_ml.dataset_table
         dataset_table.annotations.update(
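The demo-catalog rewrite shows the 1.11.0 pattern in miniature: instead of staging files with asset_dir() and pushing them with upload_assets(), the demo now registers a workflow type, creates a Workflow and an Execution, writes files to paths handed out by Execution.asset_file_path(), and finishes with upload_execution_outputs(). A condensed sketch of that flow follows; the host name, catalog id, file name, and the ExecutionConfiguration import path are assumptions for illustration, not values taken from this release.

# Sketch of the execution-centric pattern used by the new demo code.
# Connection details below are hypothetical placeholders.
from deriva_ml.deriva_ml_base import DerivaML
from deriva_ml.deriva_definitions import MLVocab
from deriva_ml.execution_configuration import ExecutionConfiguration  # assumed module path

ml = DerivaML(hostname="example.org", catalog_id="1")

ml.add_term(
    MLVocab.workflow_type,
    "Demo Catalog Creation",
    description="A workflow demonstrating how to create a demo catalog.",
)
execution = ml.create_execution(
    ExecutionConfiguration(
        workflow=ml.create_workflow(
            name="Demo Catalog", workflow_type="Demo Catalog Creation"
        )
    )
)
with execution.execute() as e:
    # Files go to paths handed out by the execution rather than asset_dir().
    image_file = e.asset_file_path("Image", "test_image.txt")
    with open(image_file, "w") as f:
        f.write("Hello there\n")
execution.upload_execution_outputs()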
deriva_ml/deriva_definitions.py
CHANGED
@@ -186,9 +186,9 @@ class MLVocab(StrEnum):
 
     dataset_type = "Dataset_Type"
     workflow_type = "Workflow_Type"
-    execution_asset_type = "Execution_Asset_Type"
-    execution_metadata_type = "Execution_Metadata_Type"
     file_type = "File_Type"
+    asset_type = "Asset_Type"
+    asset_role = "Asset_Role"
 
 
 class ExecMetadataVocab(StrEnum):
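In MLVocab, the execution-specific vocabulary handles are replaced by generic asset ones: execution_asset_type and execution_metadata_type are gone, while asset_type and asset_role are new. Code that registered terms against the old members moves to the new ones; a small sketch (the commented add_term call is illustrative and mirrors the updated demo code, assuming `ml` is a connected DerivaML instance):

from deriva_ml.deriva_definitions import MLVocab

# StrEnum members compare equal to their string values.
assert MLVocab.asset_type == "Asset_Type"   # new in 1.11.0
assert MLVocab.asset_role == "Asset_Role"   # new in 1.11.0

# Typical 1.11.0 usage:
# ml.add_term(MLVocab.asset_type, "API_Model", description="Model for our Notebook workflow")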
deriva_ml/deriva_ml_base.py
CHANGED
@@ -31,7 +31,6 @@ from deriva.core.datapath import DataPathException
 from deriva.core.deriva_server import DerivaServer
 from deriva.core.ermrest_catalog import ResolveRidResult
 from deriva.core.ermrest_model import Key, Table
-from deriva.core.hatrac_store import HatracStore
 from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
 from pydantic import validate_call, ConfigDict
 from requests import RequestException
@@ -42,24 +41,17 @@ from .dataset import Dataset
 from .dataset_aux_classes import DatasetSpec
 from .dataset_bag import DatasetBag
 from .deriva_model import DerivaModel
-from .upload import (
-    table_path,
-    execution_rids,
-    execution_metadata_dir,
-    upload_directory,
-    UploadAssetDirectory,
-)
+from .upload import table_path, execution_rids, asset_file_path
 from .deriva_definitions import ColumnDefinition
-from .deriva_definitions import ExecMetadataVocab
 from .deriva_definitions import (
     RID,
     Status,
-    FileUploadState,
     DerivaMLException,
     ML_SCHEMA,
     VocabularyTerm,
     MLVocab,
     FileSpec,
+    TableDefinition,
 )
 
 try:
@@ -346,30 +338,6 @@ class DerivaML(Dataset):
             table=self.model.name_to_table(table).name,
         )
 
-    def asset_dir(
-        self, table: str | Table, prefix: Optional[str | Path] = None
-    ) -> UploadAssetDirectory:
-        """Return a local file path in which to place a files for an asset table. T
-
-        Args:
-            table: Location of where to place files. Defaults to execution_assets_path.
-            prefix: Root path to asset directory.
-
-        Returns:
-            Path to the directory in which asset files should be placed.
-        """
-        table = self.model.name_to_table(table)
-        if not self.model.is_asset(table):
-            raise DerivaMLException(f"The table {table} is not an asset table.")
-
-        prefix = Path(prefix) if prefix else self.working_dir
-        return UploadAssetDirectory(
-            model=self.model,
-            prefix=prefix,
-            schema=table.schema.name,
-            table=table.name,
-        )
-
     def download_dir(self, cached: bool = False) -> Path:
         """Location where downloaded files are placed.
 
@@ -532,10 +500,17 @@ class DerivaML(Dataset):
             )
         )
 
+    def create_table(self, table: TableDefinition) -> Table:
+        """Create a table from a table definition."""
+        return self.model.schemas[self.domain_schema].create_table(table.model_dump())
+
+    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
     def create_asset(
         self,
         asset_name: str,
         column_defs: Optional[Iterable[ColumnDefinition]] = None,
+        fkey_defs: Optional[Iterable[ColumnDefinition]] = None,
+        referenced_tables: Optional[Iterable[Table]] = None,
         comment: str = "",
         schema: Optional[str] = None,
     ) -> Table:
@@ -544,6 +519,8 @@ class DerivaML(Dataset):
         Args:
             asset_name: Name of the asset table.
             column_defs: Iterable of ColumnDefinition objects to provide additional metadata for asset.
+            fkey_defs: Iterable of ForeignKeyDefinition objects to provide additional metadata for asset.
+            referenced_tables: Iterable of Table objects to which asset should provide foreign-key references to.
             comment: Description of the asset table. (Default value = '')
             schema: Schema in which to create the asset table. Defaults to domain_schema.
             asset_name: str:
@@ -553,17 +530,82 @@ class DerivaML(Dataset):
             Table object for the asset table.
         """
         column_defs = column_defs or []
+        fkey_defs = fkey_defs or []
+        referenced_tables = referenced_tables or []
         schema = schema or self.domain_schema
+
+        self.add_term(
+            MLVocab.asset_type, asset_name, description=f"A {asset_name} asset"
+        )
         asset_table = self.model.schemas[schema].create_table(
             Table.define_asset(
                 schema,
                 asset_name,
                 column_defs=[c.model_dump() for c in column_defs],
+                fkey_defs=[fk.model_dump() for fk in fkey_defs],
                 comment=comment,
             )
         )
+
+        self.model.schemas[self.domain_schema].create_table(
+            Table.define_association(
+                [
+                    (asset_table.name, asset_table),
+                    ("Asset_Type", self.model.name_to_table("Asset_Type")),
+                ]
+            )
+        )
+        for t in referenced_tables:
+            asset_table.create_reference(self.model.name_to_table(t))
+        # Create a table to track execution that creates the asset
+        atable = self.model.schemas[self.domain_schema].create_table(
+            Table.define_association(
+                [
+                    (asset_name, asset_table),
+                    (
+                        "Execution",
+                        self.model.schemas[self.ml_schema].tables["Execution"],
+                    ),
+                ]
+            )
+        )
+        atable.create_reference(self.model.name_to_table("Asset_Role"))
         return asset_table
 
+    # @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
+    def list_assets(self, asset_table: Table | str):
+        """Return the contents of an asset table"""
+
+        if not self.model.is_asset(asset_table):
+            raise DerivaMLException(f"Table {asset_table.name} is not an asset")
+        asset_table = self.model.name_to_table(asset_table)
+        pb = self._model.catalog.getPathBuilder()
+        asset_path = pb.schemas[asset_table.schema.name].tables[asset_table.name]
+
+        asset_type_table = self._model.find_association(asset_table, MLVocab.asset_type)
+        type_path = pb.schemas[asset_type_table.schema.name].tables[
+            asset_type_table.name
+        ]
+
+        # Get a list of all the asset_type values associated with this dataset_table.
+        assets = []
+        for asset in asset_path.entities().fetch():
+            asset_types = (
+                type_path.filter(type_path.columns[asset_table.name] == asset["RID"])
+                .attributes(type_path.Asset_Type)
+                .fetch()
+            )
+            assets.append(
+                asset
+                | {
+                    MLVocab.asset_type.value: [
+                        asset_type[MLVocab.asset_type.value]
+                        for asset_type in asset_types
+                    ]
+                }
+            )
+        return assets
+
     @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
     def create_feature(
         self,
@@ -717,24 +759,6 @@ class DerivaML(Dataset):
         """
         return self.model.find_features(table)
 
-    @validate_call
-    def add_features(self, features: Iterable[FeatureRecord]) -> int:
-        """Add a set of new feature values to the catalog.
-
-        Args:
-            features: Iterable[FeatureRecord]:
-
-        Returns:
-            Number of attributes added
-        """
-        features = list(features)
-        feature_table = features[0].feature.feature_table
-        feature_path = self.pathBuilder.schemas[feature_table.schema.name].tables[
-            feature_table.name
-        ]
-        entries = feature_path.insert(f.model_dump() for f in features)
-        return len(entries)
-
     # noinspection PyProtectedMember
     @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
     def list_feature_values(
@@ -838,7 +862,8 @@ class DerivaML(Dataset):
             raise DerivaMLException(f"The table {table} is not a controlled vocabulary")
         schema_name, table_name = vocab_table.schema.name, vocab_table.name
         schema_path = self.catalog.getPathBuilder().schemas[schema_name]
-
+
+        for term in schema_path.tables[table_name].entities().fetch():
             if term_name == term["Name"] or (
                 term["Synonyms"] and term_name in term["Synonyms"]
             ):
@@ -891,65 +916,6 @@ class DerivaML(Dataset):
             snapshot_catalog=DerivaML(self.host_name, self._version_snapshot(dataset)),
         )
 
-    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
-    def download_asset(self, asset_rid: RID, dest_dir: Path) -> Path:
-        """Download an asset from a URL and place it in a local directory.
-
-        Args:
-            asset_rid: URL of the asset.
-            dest_dir: Destination directory for the asset.
-
-        Returns:
-            A Path object to the downloaded asset.
-        """
-        table = self.resolve_rid(asset_rid).table
-        if not self.model.is_asset(table):
-            raise DerivaMLException(f"RID {asset_rid} is not for an asset table.")
-
-        tpath = self.pathBuilder.schemas[table.schema.name].tables[table.name]
-        asset_metadata = list(tpath.filter(tpath.RID == asset_rid).entities())[0]
-        asset_url = asset_metadata["URL"]
-        asset_filename = dest_dir / asset_metadata["Filename"]
-
-        hs = HatracStore("https", self.host_name, self.credential)
-        hs.get_obj(path=asset_url, destfilename=asset_filename.as_posix())
-        return Path(asset_filename)
-
-    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
-    def upload_assets(
-        self,
-        assets_dir: str | Path | UploadAssetDirectory,
-    ) -> dict[Any, FileUploadState] | None:
-        """Upload assets from a directory.
-
-        This routine assumes that the current upload specification includes a configuration for the specified directory.
-        Every asset in the specified directory is uploaded
-
-        Args:
-            assets_dir: Directory containing the assets to upload.
-
-        Returns:
-            Results of the upload operation.
-
-        Raises:
-            DerivaMLException: If there is an issue uploading the assets.
-        """
-
-        def path_to_asset(path: str) -> str:
-            """Pull the asset name out of a path to that asset in the filesystem"""
-            components = path.split("/")
-            return components[
-                components.index("asset") + 2
-            ]  # Look for asset in the path to find the name
-
-        if isinstance(assets_dir, UploadAssetDirectory):
-            assets_dir = assets_dir.path
-
-        if not self.model.is_asset(Path(assets_dir).name):
-            raise DerivaMLException("Directory does not have name of an asset table.")
-        results = upload_directory(self.model, assets_dir)
-        return {path_to_asset(p): r for p, r in results.items()}
-
     def _update_status(
         self, new_status: Status, status_detail: str, execution_rid: RID
     ):
@@ -1205,7 +1171,7 @@ class DerivaML(Dataset):
 
         """
 
-        # Get repo URL from local
+        # Get repo URL from local gitHub repo.
         try:
             result = subprocess.run(
                 ["git", "remote", "get-url", "origin"],
@@ -1261,7 +1227,7 @@ class DerivaML(Dataset):
 
         Args:
             configuration: ExecutionConfiguration:
-
+            dry_run: Do not create an execution record or upload results.
 
         Returns:
             An execution object.
@@ -1283,13 +1249,11 @@ class DerivaML(Dataset):
             raise DerivaMLException(f"Multiple execution RIDs were found {e_rids}.")
 
         execution_rid = e_rids[0]
-        cfile = (
-
-
-
-
-        )
-        / "configuration.json"
+        cfile = asset_file_path(
+            prefix=self.working_dir,
+            exec_rid=execution_rid,
+            file_name="configuration.json",
+            asset_table=self.model.name_to_table("Execution_Metadata"),
         )
         configuration = ExecutionConfiguration.load_configuration(cfile)
         return Execution(configuration, self, reload=execution_rid)
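Taken together, the deriva_ml_base.py changes retire asset_dir(), upload_assets(), download_asset(), and DerivaML.add_features() (feature insertion now happens on the Execution object, as the demo code shows) in favor of an asset model built around Asset_Type terms: create_asset() registers a vocabulary term, can wire foreign keys through referenced_tables, and builds Asset_Type and Execution association tables, while the new list_assets() returns each asset row annotated with its Asset_Type values. A hedged sketch of the new surface follows; the connection, the "Subject" table, and the BuiltinTypes import path are assumptions carried over from the demo code.

# Sketch only: assumes `ml` is a connected DerivaML instance whose domain schema
# already contains a "Subject" table (as in the demo catalog).
from deriva_ml.deriva_definitions import ColumnDefinition, BuiltinTypes  # BuiltinTypes path assumed

box_asset = ml.create_asset(
    "BoundingBox",
    column_defs=[ColumnDefinition(name="Scale", type=BuiltinTypes.int2, nullok=True)],
    referenced_tables=[ml.model.name_to_table("Subject")],
    comment="A file that contains a cropped version of an image",
)

# list_assets() augments each asset row with the list of Asset_Type terms attached to it.
for asset in ml.list_assets(box_asset):
    print(asset["RID"], asset["Asset_Type"])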