deriva-ml 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/dataset.py +1 -1
- deriva_ml/dataset_bag.py +10 -3
- deriva_ml/demo_catalog.py +84 -78
- deriva_ml/deriva_definitions.py +2 -2
- deriva_ml/deriva_ml_base.py +87 -128
- deriva_ml/deriva_model.py +25 -0
- deriva_ml/execution.py +389 -309
- deriva_ml/execution_configuration.py +16 -6
- deriva_ml/feature.py +1 -2
- deriva_ml/schema_setup/create_schema.py +223 -183
- deriva_ml/upload.py +95 -232
- {deriva_ml-1.10.0.dist-info → deriva_ml-1.11.0.dist-info}/METADATA +2 -1
- deriva_ml-1.11.0.dist-info/RECORD +27 -0
- {deriva_ml-1.10.0.dist-info → deriva_ml-1.11.0.dist-info}/WHEEL +1 -1
- deriva_ml-1.10.0.dist-info/RECORD +0 -27
- {deriva_ml-1.10.0.dist-info → deriva_ml-1.11.0.dist-info}/entry_points.txt +0 -0
- {deriva_ml-1.10.0.dist-info → deriva_ml-1.11.0.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.10.0.dist-info → deriva_ml-1.11.0.dist-info}/top_level.txt +0 -0
deriva_ml/execution.py
CHANGED
|
@@ -5,44 +5,37 @@ This module defined the Execution class which is used to interact with the state
|
|
|
5
5
|
from __future__ import annotations
|
|
6
6
|
|
|
7
7
|
from collections import defaultdict
|
|
8
|
-
import csv
|
|
9
8
|
import json
|
|
10
9
|
import logging
|
|
11
10
|
import os
|
|
12
11
|
import shutil
|
|
13
12
|
from datetime import datetime
|
|
14
13
|
from pathlib import Path
|
|
15
|
-
import requests
|
|
16
|
-
from tempfile import NamedTemporaryFile
|
|
17
14
|
from typing import Iterable, Any, Optional
|
|
15
|
+
|
|
18
16
|
from deriva.core import format_exception
|
|
19
|
-
from deriva.core.ermrest_model import Table
|
|
20
17
|
from pydantic import validate_call, ConfigDict
|
|
18
|
+
import sys
|
|
19
|
+
from deriva.core.hatrac_store import HatracStore
|
|
21
20
|
|
|
22
|
-
from .deriva_definitions import
|
|
23
|
-
from .deriva_definitions import
|
|
24
|
-
RID,
|
|
25
|
-
Status,
|
|
26
|
-
FileUploadState,
|
|
27
|
-
UploadState,
|
|
28
|
-
DerivaMLException,
|
|
29
|
-
)
|
|
21
|
+
from .deriva_definitions import ExecMetadataVocab
|
|
22
|
+
from .deriva_definitions import RID, Status, FileUploadState, DerivaMLException, MLVocab
|
|
30
23
|
from .deriva_ml_base import DerivaML, FeatureRecord
|
|
31
24
|
from .dataset_aux_classes import DatasetSpec, DatasetVersion, VersionPart
|
|
32
25
|
from .dataset_bag import DatasetBag
|
|
33
26
|
from .execution_configuration import ExecutionConfiguration, Workflow
|
|
34
27
|
from .execution_environment import get_execution_environment
|
|
35
28
|
from .upload import (
|
|
36
|
-
execution_metadata_dir,
|
|
37
|
-
execution_asset_dir,
|
|
38
29
|
execution_root,
|
|
39
30
|
feature_root,
|
|
40
|
-
|
|
31
|
+
asset_root,
|
|
41
32
|
feature_value_path,
|
|
42
33
|
is_feature_dir,
|
|
43
|
-
is_feature_asset_dir,
|
|
44
34
|
table_path,
|
|
45
35
|
upload_directory,
|
|
36
|
+
normalize_asset_dir,
|
|
37
|
+
asset_file_path,
|
|
38
|
+
asset_type_path,
|
|
46
39
|
)
|
|
47
40
|
|
|
48
41
|
try:
|
|
@@ -59,6 +52,41 @@ except ImportError:
|
|
|
59
52
|
return []
|
|
60
53
|
|
|
61
54
|
|
|
55
|
+
class AssetFilePath(type(Path())):
|
|
56
|
+
"""Derived class of Path that also includes information about a DerivaML asset.
|
|
57
|
+
|
|
58
|
+
An AssetFilePath has all the methods associated with a pathlib.Path object. In addition, it defines additional
|
|
59
|
+
attributes associated with a DerivaML asset.
|
|
60
|
+
|
|
61
|
+
Attributes:
|
|
62
|
+
asset_types: A list of the types associated with this asset. From the Asset_Type controlled vocabulary.
|
|
63
|
+
asset_metadata: A dictionary of names and values of any additional columns associated with this asset.
|
|
64
|
+
asset_name: The name of the asset table
|
|
65
|
+
file_name: The name of the file in the local file system that has the asset contents
|
|
66
|
+
asset_rid: The RID of the asset if it has been uploaded into an asset table
|
|
67
|
+
"""
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def __new__(
|
|
71
|
+
cls,
|
|
72
|
+
asset_path,
|
|
73
|
+
asset_name: str,
|
|
74
|
+
file_name: str,
|
|
75
|
+
asset_metadata: dict[str, Any],
|
|
76
|
+
asset_types: list[str] | str,
|
|
77
|
+
asset_rid: Optional[RID] = None,
|
|
78
|
+
):
|
|
79
|
+
obj = super().__new__(cls, asset_path)
|
|
80
|
+
obj.asset_types = (
|
|
81
|
+
asset_types if isinstance(asset_types, list) else [asset_types]
|
|
82
|
+
)
|
|
83
|
+
obj.asset_metadata = asset_metadata
|
|
84
|
+
obj.asset_name = asset_name
|
|
85
|
+
obj.file_name = file_name
|
|
86
|
+
obj.asset_rid = asset_rid
|
|
87
|
+
return obj
|
|
88
|
+
|
|
89
|
+
|
|
62
90
|
class Execution:
|
|
63
91
|
"""The Execution class is used to capture the context of an activity within DerivaML. While these are primarily
|
|
64
92
|
computational, manual processes can be represented by an execution as well.
|
|
@@ -101,21 +129,24 @@ class Execution:
|
|
|
101
129
|
"""
|
|
102
130
|
|
|
103
131
|
Args:
|
|
104
|
-
configuration:
|
|
105
|
-
ml_object:
|
|
132
|
+
configuration: Execution configuration object that describes the execution.
|
|
133
|
+
ml_object: The DerivaML instance that created the execution.
|
|
106
134
|
reload: RID of previously initialized execution object.
|
|
107
135
|
"""
|
|
108
136
|
self.asset_paths: list[Path] = []
|
|
109
137
|
self.configuration = configuration
|
|
110
138
|
self._ml_object = ml_object
|
|
139
|
+
self._model = ml_object.model
|
|
111
140
|
self._logger = ml_object._logger
|
|
112
141
|
self.start_time = None
|
|
113
142
|
self.stop_time = None
|
|
114
143
|
self.status = Status.created
|
|
115
144
|
self.uploaded_assets: list[Path] = []
|
|
145
|
+
self.configuration.argv = sys.argv
|
|
116
146
|
|
|
117
147
|
self.dataset_rids: list[RID] = []
|
|
118
148
|
self.datasets: list[DatasetBag] = []
|
|
149
|
+
self.parameters = self.configuration.parameters
|
|
119
150
|
|
|
120
151
|
self._working_dir = self._ml_object.working_dir
|
|
121
152
|
self._cache_dir = self._ml_object.cache_dir
|
|
@@ -144,9 +175,7 @@ class Execution:
|
|
|
144
175
|
)
|
|
145
176
|
|
|
146
177
|
for a in self.configuration.assets:
|
|
147
|
-
if not self._ml_object.
|
|
148
|
-
self._ml_object.resolve_rid(a).table.name
|
|
149
|
-
):
|
|
178
|
+
if not self._model.is_asset(self._ml_object.resolve_rid(a).table.name):
|
|
150
179
|
raise DerivaMLException(
|
|
151
180
|
"Asset specified in execution configuration is not a asset table"
|
|
152
181
|
)
|
|
@@ -173,15 +202,12 @@ class Execution:
|
|
|
173
202
|
self._initialize_execution(reload)
|
|
174
203
|
|
|
175
204
|
def _save_runtime_environment(self):
|
|
176
|
-
runtime_env_path =
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
suffix=".txt",
|
|
183
|
-
delete=False,
|
|
184
|
-
) as fp:
|
|
205
|
+
runtime_env_path = self.asset_file_path(
|
|
206
|
+
asset_name="Execution_Metadata",
|
|
207
|
+
file_name=f"environment_snapshot_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
|
|
208
|
+
asset_types=ExecMetadataVocab.runtime_env.value,
|
|
209
|
+
)
|
|
210
|
+
with open(runtime_env_path, "w") as fp:
|
|
185
211
|
json.dump(get_execution_environment(), fp)
|
|
186
212
|
|
|
187
213
|
def _initialize_execution(self, reload: Optional[RID] = None) -> None:
|
|
@@ -203,6 +229,7 @@ class Execution:
|
|
|
203
229
|
)
|
|
204
230
|
self.datasets.append(self.download_dataset_bag(dataset))
|
|
205
231
|
self.dataset_rids.append(dataset.rid)
|
|
232
|
+
|
|
206
233
|
# Update execution info
|
|
207
234
|
schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
|
|
208
235
|
if self.dataset_rids and not (reload or self._dry_run):
|
|
@@ -215,16 +242,29 @@ class Execution:
|
|
|
215
242
|
|
|
216
243
|
# Download assets....
|
|
217
244
|
self.update_status(Status.running, "Downloading assets ...")
|
|
218
|
-
self.asset_paths =
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
245
|
+
self.asset_paths = {}
|
|
246
|
+
for asset_rid in self.configuration.assets:
|
|
247
|
+
asset_table = self._ml_object.resolve_rid(asset_rid).table.name
|
|
248
|
+
dest_dir = (
|
|
249
|
+
execution_root(self._ml_object.working_dir, self.execution_rid)
|
|
250
|
+
/ "downloaded-assets"
|
|
251
|
+
/ asset_table
|
|
252
|
+
)
|
|
253
|
+
dest_dir.mkdir(parents=True, exist_ok=True)
|
|
254
|
+
self.asset_paths.setdefault(asset_table, []).append(
|
|
255
|
+
self.download_asset(
|
|
256
|
+
asset_rid=asset_rid,
|
|
257
|
+
dest_dir=dest_dir,
|
|
258
|
+
update_catalog=not (reload or self._dry_run),
|
|
259
|
+
)
|
|
260
|
+
)
|
|
224
261
|
|
|
225
262
|
# Save configuration details for later upload
|
|
226
|
-
|
|
227
|
-
|
|
263
|
+
cfile = self.asset_file_path(
|
|
264
|
+
asset_name="Execution_Metadata",
|
|
265
|
+
file_name="configuration.json",
|
|
266
|
+
asset_types=ExecMetadataVocab.execution_config.value,
|
|
267
|
+
)
|
|
228
268
|
with open(cfile, "w", encoding="utf-8") as config_file:
|
|
229
269
|
json.dump(self.configuration.model_dump(), config_file)
|
|
230
270
|
|
|
@@ -234,6 +274,42 @@ class Execution:
|
|
|
234
274
|
self.start_time = datetime.now()
|
|
235
275
|
self.update_status(Status.pending, "Initialize status finished.")
|
|
236
276
|
|
|
277
|
+
@property
|
|
278
|
+
def _execution_root(self) -> Path:
|
|
279
|
+
"""
|
|
280
|
+
|
|
281
|
+
Args:
|
|
282
|
+
|
|
283
|
+
Returns:
|
|
284
|
+
:return:
|
|
285
|
+
|
|
286
|
+
"""
|
|
287
|
+
return execution_root(self._working_dir, self.execution_rid)
|
|
288
|
+
|
|
289
|
+
@property
|
|
290
|
+
def _feature_root(self) -> Path:
|
|
291
|
+
"""The root path to all execution specific files.
|
|
292
|
+
:return:
|
|
293
|
+
|
|
294
|
+
Args:
|
|
295
|
+
|
|
296
|
+
Returns:
|
|
297
|
+
|
|
298
|
+
"""
|
|
299
|
+
return feature_root(self._working_dir, self.execution_rid)
|
|
300
|
+
|
|
301
|
+
@property
|
|
302
|
+
def _asset_root(self) -> Path:
|
|
303
|
+
"""The root path to all execution specific files.
|
|
304
|
+
:return:
|
|
305
|
+
|
|
306
|
+
Args:
|
|
307
|
+
|
|
308
|
+
Returns:
|
|
309
|
+
|
|
310
|
+
"""
|
|
311
|
+
return asset_root(self._working_dir, self.execution_rid)
|
|
312
|
+
|
|
237
313
|
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
238
314
|
def download_dataset_bag(self, dataset: DatasetSpec) -> DatasetBag:
|
|
239
315
|
"""Given a RID to a dataset_table, or a MINID to an existing bag, download the bag file, extract it and validate
|
|
@@ -273,27 +349,6 @@ class Execution:
|
|
|
273
349
|
]
|
|
274
350
|
)
|
|
275
351
|
|
|
276
|
-
def _create_notebook_checkpoint(self):
|
|
277
|
-
"""Trigger a checkpoint creation using Jupyter's API."""
|
|
278
|
-
|
|
279
|
-
server, session = self._ml_object._get_notebook_session()
|
|
280
|
-
notebook_name = session["notebook"]["path"]
|
|
281
|
-
notebook_url = f"{server['url']}api/contents/{notebook_name}"
|
|
282
|
-
|
|
283
|
-
# Get notebook content
|
|
284
|
-
response = requests.get(
|
|
285
|
-
notebook_url, headers={"Authorization": f"Token {server['token']}"}
|
|
286
|
-
)
|
|
287
|
-
if response.status_code == 200:
|
|
288
|
-
notebook_content = response.json()["content"]
|
|
289
|
-
# Execution metadata cannot be in a directory, so map path into filename.
|
|
290
|
-
checkpoint_path = (
|
|
291
|
-
self.execution_metadata_path(ExecMetadataVocab.runtime_env.value)
|
|
292
|
-
/ f"{notebook_name.replace('/', '_')}.checkpoint"
|
|
293
|
-
)
|
|
294
|
-
with open(checkpoint_path, "w", encoding="utf-8") as f:
|
|
295
|
-
json.dump(notebook_content, f)
|
|
296
|
-
|
|
297
352
|
def execution_start(self) -> None:
|
|
298
353
|
"""Start an execution, uploading status to catalog"""
|
|
299
354
|
|
|
@@ -315,7 +370,7 @@ class Execution:
|
|
|
315
370
|
self._ml_object.ml_schema
|
|
316
371
|
].Execution.update([{"RID": self.execution_rid, "Duration": duration}])
|
|
317
372
|
|
|
318
|
-
def _upload_execution_dirs(self) -> dict[str,
|
|
373
|
+
def _upload_execution_dirs(self) -> dict[str, list[AssetFilePath]]:
|
|
319
374
|
"""Upload execution assets at _working_dir/Execution_asset.
|
|
320
375
|
|
|
321
376
|
This routine uploads the contents of the
|
|
@@ -329,86 +384,142 @@ class Execution:
|
|
|
329
384
|
DerivaMLException: If there is an issue uploading the assets.
|
|
330
385
|
"""
|
|
331
386
|
|
|
332
|
-
def asset_name(p: str) -> str:
|
|
333
|
-
return Path(*Path(p).parts[-2:]).as_posix()
|
|
334
|
-
|
|
335
387
|
try:
|
|
336
388
|
self.update_status(Status.running, "Uploading execution files...")
|
|
337
|
-
results = upload_directory(self.
|
|
338
|
-
results = {asset_name(k): v for k, v in results.items()}
|
|
339
|
-
|
|
340
|
-
execution_assets = [
|
|
341
|
-
r.result["RID"]
|
|
342
|
-
for r in results.values()
|
|
343
|
-
if r.state == UploadState.success and "Execution_Asset_Type" in r.result
|
|
344
|
-
]
|
|
345
|
-
execution_metadata = [
|
|
346
|
-
r.result["RID"]
|
|
347
|
-
for r in results.values()
|
|
348
|
-
if r.state == UploadState.success
|
|
349
|
-
and "Execution_Metadata_Type" in r.result
|
|
350
|
-
]
|
|
351
|
-
self._update_execution_asset_table(execution_assets)
|
|
352
|
-
self._update_execution_metadata_table(execution_metadata)
|
|
353
|
-
|
|
389
|
+
results = upload_directory(self._model, self._asset_root)
|
|
354
390
|
except Exception as e:
|
|
355
391
|
error = format_exception(e)
|
|
356
392
|
self.update_status(Status.failed, error)
|
|
357
393
|
raise DerivaMLException(f"Fail to upload execution_assets. Error: {error}")
|
|
358
394
|
|
|
395
|
+
asset_map = {}
|
|
396
|
+
for path, status in results.items():
|
|
397
|
+
asset_table, file_name = normalize_asset_dir(path)
|
|
398
|
+
|
|
399
|
+
asset_map.setdefault(asset_table, []).append(
|
|
400
|
+
AssetFilePath(
|
|
401
|
+
asset_path=path,
|
|
402
|
+
asset_name=asset_table,
|
|
403
|
+
file_name=file_name,
|
|
404
|
+
asset_metadata={
|
|
405
|
+
k: v
|
|
406
|
+
for k, v in status.result.items()
|
|
407
|
+
if k in self._model.asset_metadata(asset_table.split("/")[1])
|
|
408
|
+
},
|
|
409
|
+
asset_types=[],
|
|
410
|
+
asset_rid=status.result["RID"],
|
|
411
|
+
)
|
|
412
|
+
)
|
|
413
|
+
|
|
414
|
+
self._update_asset_execution_table(asset_map)
|
|
359
415
|
self.update_status(Status.running, "Updating features...")
|
|
360
416
|
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
Returns:
|
|
370
|
-
|
|
371
|
-
"""
|
|
372
|
-
entries = list(directory.iterdir())
|
|
373
|
-
for entry in entries:
|
|
374
|
-
if entry.is_dir():
|
|
375
|
-
yield from traverse_bottom_up(entry)
|
|
376
|
-
yield directory
|
|
377
|
-
|
|
378
|
-
for p in traverse_bottom_up(self._feature_root):
|
|
379
|
-
if m := is_feature_asset_dir(p):
|
|
380
|
-
try:
|
|
381
|
-
self.update_status(
|
|
382
|
-
Status.running, f"Uploading feature {m['feature_name']}..."
|
|
383
|
-
)
|
|
384
|
-
feature_assets[m["target_table"], m["feature_name"]] = (
|
|
385
|
-
self._ml_object.upload_assets(p)
|
|
386
|
-
)
|
|
387
|
-
results |= feature_assets[m["target_table"], m["feature_name"]]
|
|
388
|
-
except Exception as e:
|
|
389
|
-
error = format_exception(e)
|
|
390
|
-
self.update_status(Status.failed, error)
|
|
391
|
-
raise DerivaMLException(
|
|
392
|
-
f"Fail to upload execution metadata. Error: {error}"
|
|
393
|
-
)
|
|
394
|
-
elif m := is_feature_dir(p):
|
|
395
|
-
files = [f for f in p.iterdir() if f.is_file()]
|
|
396
|
-
if files:
|
|
397
|
-
self._update_feature_table(
|
|
398
|
-
target_table=m["target_table"],
|
|
399
|
-
feature_name=m["feature_name"],
|
|
400
|
-
feature_file=files[0],
|
|
401
|
-
uploaded_files=feature_assets[
|
|
402
|
-
m["target_table"], m["feature_name"]
|
|
403
|
-
],
|
|
404
|
-
)
|
|
417
|
+
for p in self._feature_root.glob("**/*.jsonl"):
|
|
418
|
+
m = is_feature_dir(p.parent)
|
|
419
|
+
self._update_feature_table(
|
|
420
|
+
target_table=m["target_table"],
|
|
421
|
+
feature_name=m["feature_name"],
|
|
422
|
+
feature_file=p,
|
|
423
|
+
uploaded_files=asset_map,
|
|
424
|
+
)
|
|
405
425
|
|
|
406
426
|
self.update_status(Status.running, "Upload assets complete")
|
|
407
|
-
return
|
|
427
|
+
return asset_map
|
|
428
|
+
|
|
429
|
+
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
430
|
+
def download_asset(
|
|
431
|
+
self, asset_rid: RID, dest_dir: Path, update_catalog=True
|
|
432
|
+
) -> AssetFilePath:
|
|
433
|
+
"""Download an asset from a URL and place it in a local directory.
|
|
434
|
+
|
|
435
|
+
Args:
|
|
436
|
+
asset_rid: RID of the asset to download.
|
|
437
|
+
dest_dir: Destination directory for the asset.
|
|
438
|
+
update_catalog: Whether to update the catalog execution information after downloading.
|
|
439
|
+
|
|
440
|
+
Returns:
|
|
441
|
+
An AssetFilePath for the downloaded file, which also carries the asset table name, metadata, asset types and RID.
|
|
442
|
+
"""
|
|
443
|
+
|
|
444
|
+
asset_table = self._ml_object.resolve_rid(asset_rid).table
|
|
445
|
+
if not self._model.is_asset(asset_table):
|
|
446
|
+
raise DerivaMLException(f"RID {asset_rid} is not for an asset table.")
|
|
447
|
+
|
|
448
|
+
asset_record = self._ml_object.retrieve_rid(asset_rid)
|
|
449
|
+
asset_metadata = {
|
|
450
|
+
k: v
|
|
451
|
+
for k, v in asset_record.items()
|
|
452
|
+
if k in self._model.asset_metadata(asset_table)
|
|
453
|
+
}
|
|
454
|
+
asset_url = asset_record["URL"]
|
|
455
|
+
asset_filename = dest_dir / asset_record["Filename"]
|
|
456
|
+
hs = HatracStore("https", self._ml_object.host_name, self._ml_object.credential)
|
|
457
|
+
hs.get_obj(path=asset_url, destfilename=asset_filename.as_posix())
|
|
458
|
+
|
|
459
|
+
asset_type_table = self._model.find_association(asset_table, MLVocab.asset_type)
|
|
460
|
+
type_path = self._ml_object.pathBuilder.schemas[
|
|
461
|
+
asset_type_table.schema.name
|
|
462
|
+
].tables[asset_type_table.name]
|
|
463
|
+
asset_types = [
|
|
464
|
+
asset_type[MLVocab.asset_type.value]
|
|
465
|
+
for asset_type in type_path.filter(
|
|
466
|
+
type_path.columns[asset_table.name] == asset_rid
|
|
467
|
+
)
|
|
468
|
+
.attributes(type_path.Asset_Type)
|
|
469
|
+
.fetch()
|
|
470
|
+
]
|
|
471
|
+
|
|
472
|
+
asset_path = AssetFilePath(
|
|
473
|
+
file_name=asset_filename,
|
|
474
|
+
asset_rid=asset_rid,
|
|
475
|
+
asset_path=asset_filename,
|
|
476
|
+
asset_metadata=asset_metadata,
|
|
477
|
+
asset_name=asset_table.name,
|
|
478
|
+
asset_types=asset_types,
|
|
479
|
+
)
|
|
480
|
+
|
|
481
|
+
if update_catalog:
|
|
482
|
+
self._update_asset_execution_table(
|
|
483
|
+
{f"{asset_table.schema.name}/{asset_table.name}": [asset_path]},
|
|
484
|
+
asset_role="Input",
|
|
485
|
+
)
|
|
486
|
+
return asset_path
|
|
487
|
+
|
|
488
|
+
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
489
|
+
def upload_assets(
|
|
490
|
+
self,
|
|
491
|
+
assets_dir: str | Path,
|
|
492
|
+
) -> dict[Any, FileUploadState] | None:
|
|
493
|
+
"""Upload assets from a directory.
|
|
494
|
+
|
|
495
|
+
This routine assumes that the current upload specification includes a configuration for the specified directory.
|
|
496
|
+
Every asset in the specified directory is uploaded
|
|
497
|
+
|
|
498
|
+
Args:
|
|
499
|
+
assets_dir: Directory containing the assets to upload.
|
|
500
|
+
|
|
501
|
+
Returns:
|
|
502
|
+
Results of the upload operation.
|
|
503
|
+
|
|
504
|
+
Raises:
|
|
505
|
+
DerivaMLException: If there is an issue uploading the assets.
|
|
506
|
+
"""
|
|
507
|
+
|
|
508
|
+
def path_to_asset(path: str) -> str:
|
|
509
|
+
"""Pull the asset name out of a path to that asset in the filesystem"""
|
|
510
|
+
components = path.split("/")
|
|
511
|
+
return components[
|
|
512
|
+
components.index("asset") + 2
|
|
513
|
+
] # Look for asset in the path to find the name
|
|
514
|
+
|
|
515
|
+
if not self._model.is_asset(Path(assets_dir).name):
|
|
516
|
+
raise DerivaMLException("Directory does not have name of an asset table.")
|
|
517
|
+
results = upload_directory(self._model, assets_dir)
|
|
518
|
+
return {path_to_asset(p): r for p, r in results.items()}
|
|
408
519
|
|
|
409
520
|
def upload_execution_outputs(
|
|
410
521
|
self, clean_folder: bool = True
|
|
411
|
-
) -> dict[str,
|
|
522
|
+
) -> dict[str, AssetFilePath]:
|
|
412
523
|
"""Upload all the assets and metadata associated with the current execution.
|
|
413
524
|
|
|
414
525
|
This will include any new assets, features, or table values.
|
|
@@ -434,19 +545,6 @@ class Execution:
|
|
|
434
545
|
self.update_status(Status.failed, error)
|
|
435
546
|
raise e
|
|
436
547
|
|
|
437
|
-
def _asset_dir(self) -> Path:
|
|
438
|
-
"""
|
|
439
|
-
|
|
440
|
-
Args:
|
|
441
|
-
|
|
442
|
-
Returns:
|
|
443
|
-
:return: PathLib path object to model directory.
|
|
444
|
-
|
|
445
|
-
"""
|
|
446
|
-
path = self._working_dir / self.execution_rid / "asset"
|
|
447
|
-
path.mkdir(parents=True, exist_ok=True)
|
|
448
|
-
return path
|
|
449
|
-
|
|
450
548
|
def _clean_folder_contents(self, folder_path: Path):
|
|
451
549
|
"""
|
|
452
550
|
|
|
@@ -469,7 +567,7 @@ class Execution:
|
|
|
469
567
|
target_table: str,
|
|
470
568
|
feature_name: str,
|
|
471
569
|
feature_file: str | Path,
|
|
472
|
-
uploaded_files: dict[str,
|
|
570
|
+
uploaded_files: dict[str, list[AssetFilePath]],
|
|
473
571
|
) -> None:
|
|
474
572
|
"""
|
|
475
573
|
|
|
@@ -477,121 +575,130 @@ class Execution:
|
|
|
477
575
|
target_table: str:
|
|
478
576
|
feature_name: str:
|
|
479
577
|
feature_file: str | Path:
|
|
480
|
-
uploaded_files:
|
|
578
|
+
uploaded_files: Dictionary whose key is an asset table name and whose value is the list of AssetFilePath objects (file name and RID) uploaded to that table.
|
|
481
579
|
"""
|
|
482
580
|
|
|
581
|
+
# Get the column names of all the Feature columns that should be the RID of an asset
|
|
582
|
+
asset_columns = [
|
|
583
|
+
c.name
|
|
584
|
+
for c in self._ml_object.feature_record_class(
|
|
585
|
+
target_table, feature_name
|
|
586
|
+
).feature.asset_columns
|
|
587
|
+
]
|
|
588
|
+
|
|
589
|
+
# Get the names of the columns in the feature that are assets.
|
|
483
590
|
asset_columns = [
|
|
484
591
|
c.name
|
|
485
592
|
for c in self._ml_object.feature_record_class(
|
|
486
593
|
target_table, feature_name
|
|
487
594
|
).feature.asset_columns
|
|
488
595
|
]
|
|
596
|
+
|
|
489
597
|
feature_table = self._ml_object.feature_record_class(
|
|
490
598
|
target_table, feature_name
|
|
491
599
|
).feature.feature_table.name
|
|
600
|
+
asset_map = {
|
|
601
|
+
(asset_table, asset.file_name): asset.asset_rid
|
|
602
|
+
for asset_table, assets in uploaded_files.items()
|
|
603
|
+
for asset in assets
|
|
604
|
+
}
|
|
492
605
|
|
|
493
606
|
def map_path(e):
|
|
494
|
-
"""
|
|
495
|
-
|
|
496
|
-
Args:
|
|
497
|
-
e:
|
|
498
|
-
|
|
499
|
-
Returns:
|
|
500
|
-
|
|
501
|
-
"""
|
|
502
|
-
# Go through the asset columns and replace the file name with the RID for the uploaded file.
|
|
607
|
+
"""Go through the asset columns and replace the file name with the RID for the uploaded file."""
|
|
503
608
|
for c in asset_columns:
|
|
504
|
-
e[c] = asset_map[e[c]]
|
|
609
|
+
e[c] = asset_map[normalize_asset_dir(e[c])]
|
|
505
610
|
return e
|
|
506
611
|
|
|
507
|
-
#
|
|
508
|
-
asset_map = {
|
|
509
|
-
file: asset.result["RID"]
|
|
510
|
-
for file, asset in uploaded_files.items()
|
|
511
|
-
if asset.state == UploadState.success and asset.result
|
|
512
|
-
}
|
|
612
|
+
# Load the JSON file that has the set of records that contain the feature values.
|
|
513
613
|
with open(feature_file, "r") as feature_values:
|
|
514
|
-
entities = [
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
"""Upload execution metadata at _working_dir/Execution_metadata."""
|
|
519
|
-
ml_schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
|
|
520
|
-
entities = [
|
|
521
|
-
{"Execution_Metadata": metadata_rid, "Execution": self.execution_rid}
|
|
522
|
-
for metadata_rid in assets
|
|
523
|
-
]
|
|
524
|
-
ml_schema_path.Execution_Metadata_Execution.insert(entities)
|
|
525
|
-
|
|
526
|
-
def _update_execution_asset_table(self, assets: list[RID]) -> None:
|
|
527
|
-
"""Assets associated with an execution must be linked to an execution entity after they are uploaded into
|
|
528
|
-
the catalog. This routine takes a list of uploaded assets and makes that association.
|
|
529
|
-
|
|
530
|
-
Args:
|
|
531
|
-
assets: list of RIDS for execution assets.:
|
|
532
|
-
"""
|
|
533
|
-
ml_schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
|
|
534
|
-
entities = [
|
|
535
|
-
{"Execution_Asset": asset_rid, "Execution": self.execution_rid}
|
|
536
|
-
for asset_rid in assets
|
|
537
|
-
]
|
|
538
|
-
ml_schema_path.Execution_Asset_Execution.insert(entities)
|
|
539
|
-
|
|
540
|
-
@property
|
|
541
|
-
def _execution_metadata_dir(self) -> Path:
|
|
542
|
-
"""
|
|
543
|
-
|
|
544
|
-
Args:
|
|
545
|
-
|
|
546
|
-
Returns:
|
|
547
|
-
to the catalog by the execution_upload method in an execution object.
|
|
548
|
-
|
|
549
|
-
:return:
|
|
550
|
-
|
|
551
|
-
"""
|
|
552
|
-
return execution_metadata_dir(
|
|
553
|
-
self._working_dir, exec_rid=self.execution_rid, metadata_type=""
|
|
614
|
+
entities = [json.loads(line.strip()) for line in feature_values]
|
|
615
|
+
# Update the asset columns in the feature and add to the catalog.
|
|
616
|
+
self._ml_object.domain_path.tables[feature_table].insert(
|
|
617
|
+
[map_path(e) for e in entities]
|
|
554
618
|
)
|
|
555
619
|
|
|
556
|
-
def
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
metadata_type: Type of metadata to be uploaded. Must be a term in Metadata_Type controlled vocabulary.
|
|
563
|
-
|
|
564
|
-
Returns:
|
|
565
|
-
Path to the directory in which to place files of type metadata_type.
|
|
566
|
-
"""
|
|
567
|
-
self._ml_object.lookup_term(
|
|
568
|
-
MLVocab.execution_metadata_type, metadata_type
|
|
569
|
-
) # Make sure metadata type exists.
|
|
570
|
-
return execution_metadata_dir(
|
|
571
|
-
self._working_dir, exec_rid=self.execution_rid, metadata_type=metadata_type
|
|
572
|
-
)
|
|
573
|
-
|
|
574
|
-
@property
|
|
575
|
-
def _execution_asset_dir(self) -> Path:
|
|
576
|
-
"""
|
|
620
|
+
def _update_asset_execution_table(
|
|
621
|
+
self,
|
|
622
|
+
uploaded_assets: dict[str, list[AssetFilePath]],
|
|
623
|
+
asset_role: str = "Output",
|
|
624
|
+
):
|
|
625
|
+
"""Add entry to association table connecting an asset to an execution RID
|
|
577
626
|
|
|
578
627
|
Args:
|
|
628
|
+
uploaded_assets: Dictionary whose key is the name of an asset table, and whose value is a list of AssetFilePath objects for
|
|
629
|
+
newly added assets to that table.
|
|
630
|
+
asset_role: A term or list of terms from the Asset_Role vocabulary.
|
|
631
|
+
"""
|
|
632
|
+
# Make sure the asset role is in the controlled vocabulary table.
|
|
633
|
+
self._ml_object.lookup_term(MLVocab.asset_role, asset_role)
|
|
634
|
+
|
|
635
|
+
pb = self._ml_object.pathBuilder
|
|
636
|
+
for asset_table, asset_list in uploaded_assets.items():
|
|
637
|
+
asset_table_name = asset_table.split("/")[
|
|
638
|
+
1
|
|
639
|
+
] # Peel off the schema from the asset table
|
|
640
|
+
asset_exe = self._model.find_association(asset_table_name, "Execution")
|
|
641
|
+
asset_exe_path = pb.schemas[asset_exe.schema.name].tables[asset_exe.name]
|
|
642
|
+
asset_exe_path.insert(
|
|
643
|
+
[
|
|
644
|
+
{
|
|
645
|
+
asset_table_name: asset_path.asset_rid,
|
|
646
|
+
"Execution": self.execution_rid,
|
|
647
|
+
"Asset_Role": asset_role,
|
|
648
|
+
}
|
|
649
|
+
for asset_path in asset_list
|
|
650
|
+
]
|
|
651
|
+
)
|
|
579
652
|
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
653
|
+
# Now add in the type names via the asset_asset_type association table.
|
|
654
|
+
# Get the list of types for each file in the asset.
|
|
655
|
+
if asset_role == "Input":
|
|
656
|
+
return
|
|
657
|
+
asset_type_map = {}
|
|
658
|
+
with open(
|
|
659
|
+
asset_type_path(
|
|
660
|
+
self._working_dir,
|
|
661
|
+
self.execution_rid,
|
|
662
|
+
self._model.name_to_table(asset_table_name),
|
|
663
|
+
),
|
|
664
|
+
"r",
|
|
665
|
+
) as f:
|
|
666
|
+
for line in f:
|
|
667
|
+
asset_type_map.update(json.loads(line.strip()))
|
|
668
|
+
for asset_path in asset_list:
|
|
669
|
+
asset_path.asset_types = asset_type_map[asset_path.file_name]
|
|
670
|
+
|
|
671
|
+
asset_asset_type = self._model.find_association(
|
|
672
|
+
asset_table_name, "Asset_Type"
|
|
673
|
+
)
|
|
674
|
+
type_path = pb.schemas[asset_asset_type.schema.name].tables[
|
|
675
|
+
asset_asset_type.name
|
|
676
|
+
]
|
|
677
|
+
type_path.insert(
|
|
678
|
+
[
|
|
679
|
+
{asset_table_name: asset.asset_rid, "Asset_Type": t}
|
|
680
|
+
for asset in asset_list
|
|
681
|
+
for t in asset_type_map[asset.file_name]
|
|
682
|
+
]
|
|
683
|
+
)
|
|
587
684
|
|
|
588
|
-
|
|
685
|
+
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
686
|
+
def asset_file_path(
|
|
687
|
+
self,
|
|
688
|
+
asset_name: str,
|
|
689
|
+
file_name: str,
|
|
690
|
+
asset_types: Optional[list[str] | str] = None,
|
|
691
|
+
**kwargs,
|
|
692
|
+
) -> AssetFilePath:
|
|
589
693
|
"""Return a pathlib Path to the directory in which to place files for the specified execution_asset type.
|
|
590
694
|
|
|
591
695
|
These files are uploaded as part of the upload_execution method in DerivaML class.
|
|
592
696
|
|
|
593
697
|
Args:
|
|
594
|
-
|
|
698
|
+
asset_name: Name of the asset table into which the file will be uploaded.
|
|
699
|
+
asset_types: Type of asset to be uploaded. Defaults to name of the asset.
|
|
700
|
+
file_name: Name of file to be uploaded.
|
|
701
|
+
**kwargs: Any additional metadata values that may be part of the asset table.
|
|
595
702
|
|
|
596
703
|
Returns:
|
|
597
704
|
Path in which to place asset files.
|
|
@@ -599,73 +706,39 @@ class Execution:
|
|
|
599
706
|
Raises:
|
|
600
707
|
DerivaException: If the asset type is not defined.
|
|
601
708
|
"""
|
|
602
|
-
self.
|
|
709
|
+
if not self._model.is_asset(asset_name):
|
|
710
|
+
DerivaMLException(f"Table {asset_name} is not an asset")
|
|
603
711
|
|
|
604
|
-
|
|
605
|
-
self._working_dir, exec_rid=self.execution_rid, asset_type=asset_type
|
|
606
|
-
)
|
|
712
|
+
asset_table = self._model.name_to_table(asset_name)
|
|
607
713
|
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
714
|
+
asset_types = asset_types or kwargs.get("Asset_Type", None) or asset_name
|
|
715
|
+
asset_types = [asset_types] if isinstance(asset_types, str) else asset_types
|
|
716
|
+
for t in asset_types:
|
|
717
|
+
self._ml_object.lookup_term(MLVocab.asset_type, t)
|
|
611
718
|
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
Returns:
|
|
615
|
-
:return:
|
|
616
|
-
|
|
617
|
-
"""
|
|
618
|
-
return execution_root(self._working_dir, self.execution_rid)
|
|
619
|
-
|
|
620
|
-
@property
|
|
621
|
-
def _feature_root(self) -> Path:
|
|
622
|
-
"""The root path to all execution specific files.
|
|
623
|
-
:return:
|
|
624
|
-
|
|
625
|
-
Args:
|
|
626
|
-
|
|
627
|
-
Returns:
|
|
628
|
-
|
|
629
|
-
"""
|
|
630
|
-
return feature_root(self._working_dir, self.execution_rid)
|
|
631
|
-
|
|
632
|
-
def feature_paths(
|
|
633
|
-
self, table: Table | str, feature_name: str
|
|
634
|
-
) -> tuple[Path, dict[str, Path]]:
|
|
635
|
-
"""Return the file path of where to place feature values, and assets for the named feature and table.
|
|
636
|
-
|
|
637
|
-
A side effect of calling this routine is that the directories in which to place the feature values and assets
|
|
638
|
-
will be created
|
|
639
|
-
|
|
640
|
-
Args:
|
|
641
|
-
table: The table with which the feature is associated.
|
|
642
|
-
feature_name: Name of the feature
|
|
643
|
-
|
|
644
|
-
Returns:
|
|
645
|
-
A tuple whose first element is the path for the feature values and whose second element is a dictionary
|
|
646
|
-
of associated asset table names and corresponding paths.
|
|
647
|
-
"""
|
|
648
|
-
feature = self._ml_object.lookup_feature(table, feature_name)
|
|
649
|
-
|
|
650
|
-
tpath = feature_value_path(
|
|
719
|
+
asset_path = asset_file_path(
|
|
651
720
|
self._working_dir,
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
721
|
+
self.execution_rid,
|
|
722
|
+
self._model.name_to_table(asset_name),
|
|
723
|
+
file_name,
|
|
724
|
+
metadata=kwargs,
|
|
725
|
+
)
|
|
726
|
+
|
|
727
|
+
# Persist the asset types into a file
|
|
728
|
+
with open(
|
|
729
|
+
asset_type_path(self._working_dir, self.execution_rid, asset_table),
|
|
730
|
+
"a",
|
|
731
|
+
encoding="utf-8",
|
|
732
|
+
) as f:
|
|
733
|
+
f.write(json.dumps({file_name: asset_types}) + "\n")
|
|
734
|
+
|
|
735
|
+
return AssetFilePath(
|
|
736
|
+
asset_path=asset_path,
|
|
737
|
+
asset_name=asset_name,
|
|
738
|
+
file_name=file_name,
|
|
739
|
+
asset_metadata=kwargs,
|
|
740
|
+
asset_types=asset_types,
|
|
656
741
|
)
|
|
657
|
-
asset_paths = {
|
|
658
|
-
asset_table.name: feature_asset_dir(
|
|
659
|
-
self._working_dir,
|
|
660
|
-
exec_rid=self.execution_rid,
|
|
661
|
-
schema=self._ml_object.domain_schema,
|
|
662
|
-
target_table=feature.target_table.name,
|
|
663
|
-
feature_name=feature_name,
|
|
664
|
-
asset_table=asset_table.name,
|
|
665
|
-
)
|
|
666
|
-
for asset_table in feature.asset_columns
|
|
667
|
-
}
|
|
668
|
-
return tpath, asset_paths
|
|
669
742
|
|
|
670
743
|
def table_path(self, table: str) -> Path:
|
|
671
744
|
"""Return a local file path to a CSV to add values to a table on upload.
|
|
@@ -676,10 +749,7 @@ class Execution:
|
|
|
676
749
|
Returns:
|
|
677
750
|
Pathlib path to the file in which to place table values.
|
|
678
751
|
"""
|
|
679
|
-
if
|
|
680
|
-
table
|
|
681
|
-
not in self._ml_object.model.schemas[self._ml_object.domain_schema].tables
|
|
682
|
-
):
|
|
752
|
+
if table not in self._model.schemas[self._ml_object.domain_schema].tables:
|
|
683
753
|
raise DerivaMLException(
|
|
684
754
|
"Table '{}' not found in domain schema".format(table)
|
|
685
755
|
)
|
|
@@ -693,7 +763,7 @@ class Execution:
|
|
|
693
763
|
return self
|
|
694
764
|
|
|
695
765
|
@validate_call
|
|
696
|
-
def
|
|
766
|
+
def add_features(self, features: Iterable[FeatureRecord]) -> None:
|
|
697
767
|
Given a collection of Feature records, write them to a JSON-lines file in the appropriate assets directory so that this
|
|
698
768
|
feature gets uploaded when the execution is complete.
|
|
699
769
|
|
|
@@ -701,22 +771,28 @@ class Execution:
|
|
|
701
771
|
features: Iterable of Feature records to write.
|
|
702
772
|
"""
|
|
703
773
|
|
|
704
|
-
|
|
705
|
-
|
|
774
|
+
# Make sure feature list is homogeneous:
|
|
775
|
+
sorted_features = defaultdict(list)
|
|
776
|
+
for f in features:
|
|
777
|
+
sorted_features[type(f)].append(f)
|
|
778
|
+
for fs in sorted_features.values():
|
|
779
|
+
self._add_features(fs)
|
|
780
|
+
|
|
781
|
+
def _add_features(self, features: list[FeatureRecord]) -> None:
|
|
782
|
+
# Update feature records to include current execution_rid
|
|
783
|
+
first_row = features[0]
|
|
706
784
|
feature = first_row.feature
|
|
707
|
-
|
|
708
|
-
|
|
785
|
+
json_path = feature_value_path(
|
|
786
|
+
self._working_dir,
|
|
787
|
+
schema=self._ml_object.domain_schema,
|
|
788
|
+
target_table=feature.target_table.name,
|
|
789
|
+
feature_name=feature.feature_name,
|
|
790
|
+
exec_rid=self.execution_rid,
|
|
709
791
|
)
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
with open(csv_path, "w") as f:
|
|
715
|
-
writer = csv.DictWriter(f, fieldnames=fieldnames)
|
|
716
|
-
writer.writeheader()
|
|
717
|
-
writer.writerow(first_row.model_dump())
|
|
718
|
-
for feature in feature_iter:
|
|
719
|
-
writer.writerow(feature.model_dump())
|
|
792
|
+
with open(json_path, "a", encoding="utf-8") as file:
|
|
793
|
+
for feature in features:
|
|
794
|
+
feature.Execution = self.execution_rid
|
|
795
|
+
file.write(json.dumps(feature.model_dump(mode="json")) + "\n")
|
|
720
796
|
|
|
721
797
|
@validate_call
|
|
722
798
|
def create_dataset(self, dataset_types: str | list[str], description: str) -> RID:
|
|
@@ -745,9 +821,13 @@ class Execution:
|
|
|
745
821
|
Add new elements to an existing dataset. In addition to adding new members, the minor version number of the
|
|
746
822
|
dataset is incremented and the description, if provided, is applied to that new version.
|
|
747
823
|
|
|
824
|
+
The RIDs in the list do not all have to be from the same table, but they must be from a table that has
|
|
825
|
+
been configured to be a dataset element type.
|
|
826
|
+
|
|
748
827
|
Args:
|
|
749
828
|
dataset_rid: RID of dataset_table to extend or None if new dataset_table is to be created.
|
|
750
|
-
members: List of RIDs of members to add to the dataset_table.
|
|
829
|
+
members: List of RIDs of members to add to the dataset_table. Each RID must belong to a table that is a
|
|
830
|
+
dataset element type (see DerivaML.add_dataset_element_type).
|
|
751
831
|
validate: Check rid_list to make sure elements are not already in the dataset_table.
|
|
752
832
|
description: Markdown description of the updated dataset.
|
|
753
833
|
"""
|