deriva-ml 1.10.1__py3-none-any.whl → 1.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deriva_ml/execution.py CHANGED
@@ -5,45 +5,37 @@ This module defined the Execution class which is used to interact with the state
5
5
  from __future__ import annotations
6
6
 
7
7
  from collections import defaultdict
8
- import csv
9
8
  import json
10
9
  import logging
11
10
  import os
12
11
  import shutil
13
12
  from datetime import datetime
14
13
  from pathlib import Path
15
- import requests
16
- from tempfile import NamedTemporaryFile
17
14
  from typing import Iterable, Any, Optional
15
+
18
16
  from deriva.core import format_exception
19
- from deriva.core.ermrest_model import Table
20
17
  from pydantic import validate_call, ConfigDict
21
18
  import sys
19
+ from deriva.core.hatrac_store import HatracStore
22
20
 
23
- from .deriva_definitions import MLVocab, ExecMetadataVocab
24
- from .deriva_definitions import (
25
- RID,
26
- Status,
27
- FileUploadState,
28
- UploadState,
29
- DerivaMLException,
30
- )
21
+ from .deriva_definitions import ExecMetadataVocab
22
+ from .deriva_definitions import RID, Status, FileUploadState, DerivaMLException, MLVocab
31
23
  from .deriva_ml_base import DerivaML, FeatureRecord
32
24
  from .dataset_aux_classes import DatasetSpec, DatasetVersion, VersionPart
33
25
  from .dataset_bag import DatasetBag
34
26
  from .execution_configuration import ExecutionConfiguration, Workflow
35
27
  from .execution_environment import get_execution_environment
36
28
  from .upload import (
37
- execution_metadata_dir,
38
- execution_asset_dir,
39
29
  execution_root,
40
30
  feature_root,
41
- feature_asset_dir,
31
+ asset_root,
42
32
  feature_value_path,
43
33
  is_feature_dir,
44
- is_feature_asset_dir,
45
34
  table_path,
46
35
  upload_directory,
36
+ normalize_asset_dir,
37
+ asset_file_path,
38
+ asset_type_path,
47
39
  )
48
40
 
49
41
  try:
@@ -60,6 +52,51 @@ except ImportError:
60
52
  return []
61
53
 
62
54
 
55
+ class AssetFilePath(type(Path())):
56
+ """Derived class of Path that also includes information about a downloaded.
57
+
58
+ An AssetFilePath has all the methods associated with a pathlib.Path object. In addition, it defines additional
59
+ attributes associated with a DerviaML asset.
60
+
61
+ Attributes:
62
+ asset_types: A list of the types associated with this asset. From the Asset_Type controlled vocabulary.
63
+ asset_metadata: A dictionary of names and values of any additional columns associated with this asset.
64
+ asset_name: The name of the asset table
65
+ file_name: The name of the file in the local file system that has the asset contents
66
+ asset_rid: The RID of the asset if it has been uploaded into an asset table
67
+ """
68
+
69
+ def __new__(
70
+ cls,
71
+ asset_path,
72
+ asset_name: str,
73
+ file_name: str,
74
+ asset_metadata: dict[str, Any],
75
+ asset_types: list[str] | str,
76
+ asset_rid: Optional[RID] = None,
77
+ ):
78
+ """
79
+ Create a new Path object that has additional information related to the use of this path as an asset.
80
+
81
+ Args:
82
+ asset_path: Local path to the location of the asset.
83
+ asset_name: The name of the asset in the catalog (e.g. the asset table name).
84
+ file_name: Name of the local file that contains the contents of the asset.
85
+ asset_metadata: Any additional columns associated with this asset beyond the URL, Length, and checksum.
86
+ asset_types: A list of terms from the Asset_Type controlled vocabulary.
87
+ asset_rid: The RID of the asset if it has been uploaded into an asset table
88
+ """
89
+ obj = super().__new__(cls, asset_path)
90
+ obj.asset_types = (
91
+ asset_types if isinstance(asset_types, list) else [asset_types]
92
+ )
93
+ obj.asset_metadata = asset_metadata
94
+ obj.asset_name = asset_name
95
+ obj.file_name = file_name
96
+ obj.asset_rid = asset_rid
97
+ return obj
98
+
99
+
63
100
  class Execution:
64
101
  """The Execution class is used to capture the context of an activity within DerivaML. While these are primarily
65
102
  computational, manual processes can be represented by an execution as well.
@@ -102,18 +139,19 @@ class Execution:
102
139
  """
103
140
 
104
141
  Args:
105
- configuration:
106
- ml_object:
142
+ configuration: Execution configuration object that describes the execution.
143
+ ml_object: The DerivaML instance that created the execution.
107
144
  reload: RID of previously initialized execution object.
108
145
  """
109
- self.asset_paths: list[Path] = []
146
+ self.asset_paths: list[AssetFilePath] = []
110
147
  self.configuration = configuration
111
148
  self._ml_object = ml_object
149
+ self._model = ml_object.model
112
150
  self._logger = ml_object._logger
113
151
  self.start_time = None
114
152
  self.stop_time = None
115
153
  self.status = Status.created
116
- self.uploaded_assets: list[Path] = []
154
+ self.uploaded_assets: Optional[dict[str, list[AssetFilePath]]] = None
117
155
  self.configuration.argv = sys.argv
118
156
 
119
157
  self.dataset_rids: list[RID] = []
@@ -124,6 +162,7 @@ class Execution:
124
162
  self._cache_dir = self._ml_object.cache_dir
125
163
  self._dry_run = dry_run
126
164
 
165
+ # Make sure we have a good workflow.
127
166
  if isinstance(self.configuration.workflow, Workflow):
128
167
  self.workflow_rid = (
129
168
  self._ml_object.add_workflow(self.configuration.workflow)
@@ -140,6 +179,7 @@ class Execution:
140
179
  "Workflow specified in execution configuration is not a Workflow"
141
180
  )
142
181
 
182
+ # Validate the datasets and assets to be valid.
143
183
  for d in self.configuration.datasets:
144
184
  if self._ml_object.resolve_rid(d.rid).table.name != "Dataset":
145
185
  raise DerivaMLException(
@@ -147,9 +187,7 @@ class Execution:
147
187
  )
148
188
 
149
189
  for a in self.configuration.assets:
150
- if not self._ml_object.model.is_asset(
151
- self._ml_object.resolve_rid(a).table.name
152
- ):
190
+ if not self._model.is_asset(self._ml_object.resolve_rid(a).table.name):
153
191
  raise DerivaMLException(
154
192
  "Asset specified in execution configuration is not a asset table"
155
193
  )
@@ -176,15 +214,12 @@ class Execution:
176
214
  self._initialize_execution(reload)
177
215
 
178
216
  def _save_runtime_environment(self):
179
- runtime_env_path = ExecMetadataVocab.runtime_env.value
180
- runtime_env_dir = self.execution_metadata_path(runtime_env_path)
181
- with NamedTemporaryFile(
182
- "w+",
183
- dir=runtime_env_dir,
184
- prefix="environment_snapshot_",
185
- suffix=".txt",
186
- delete=False,
187
- ) as fp:
217
+ runtime_env_path = self.asset_file_path(
218
+ asset_name="Execution_Metadata",
219
+ file_name=f"environment_snapshot_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
220
+ asset_types=ExecMetadataVocab.runtime_env.value,
221
+ )
222
+ with open(runtime_env_path, "w") as fp:
188
223
  json.dump(get_execution_environment(), fp)
189
224
 
190
225
  def _initialize_execution(self, reload: Optional[RID] = None) -> None:
@@ -206,6 +241,7 @@ class Execution:
206
241
  )
207
242
  self.datasets.append(self.download_dataset_bag(dataset))
208
243
  self.dataset_rids.append(dataset.rid)
244
+
209
245
  # Update execution info
210
246
  schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
211
247
  if self.dataset_rids and not (reload or self._dry_run):
@@ -218,17 +254,30 @@ class Execution:
218
254
 
219
255
  # Download assets....
220
256
  self.update_status(Status.running, "Downloading assets ...")
221
- self.asset_paths = [
222
- self._ml_object.download_asset(asset_rid=a, dest_dir=self._asset_dir())
223
- for a in self.configuration.assets
224
- ]
225
- if self.asset_paths and not (reload or self._dry_run):
226
- self._update_execution_asset_table(self.configuration.assets)
257
+ self.asset_paths = {}
258
+ for asset_rid in self.configuration.assets:
259
+ asset_table = self._ml_object.resolve_rid(asset_rid).table.name
260
+ dest_dir = (
261
+ execution_root(self._ml_object.working_dir, self.execution_rid)
262
+ / "downloaded-assets"
263
+ / asset_table
264
+ )
265
+ dest_dir.mkdir(parents=True, exist_ok=True)
266
+ self.asset_paths.setdefault(asset_table, []).append(
267
+ self.download_asset(
268
+ asset_rid=asset_rid,
269
+ dest_dir=dest_dir,
270
+ update_catalog=not (reload or self._dry_run),
271
+ )
272
+ )
227
273
 
228
274
  # Save configuration details for later upload
229
- exec_config_path = ExecMetadataVocab.execution_config.value
230
- cfile = self.execution_metadata_path(exec_config_path) / "configuration.json"
231
- with open(cfile, "w", encoding="utf-8") as config_file:
275
+ cfile = self.asset_file_path(
276
+ asset_name="Execution_Metadata",
277
+ file_name="configuration.json",
278
+ asset_types=ExecMetadataVocab.execution_config.value,
279
+ )
280
+ with open(cfile.as_posix(), "w", encoding="utf-8") as config_file:
232
281
  json.dump(self.configuration.model_dump(), config_file)
233
282
 
234
283
  # save runtime env
@@ -237,6 +286,42 @@ class Execution:
237
286
  self.start_time = datetime.now()
238
287
  self.update_status(Status.pending, "Initialize status finished.")
239
288
 
289
+ @property
290
+ def _execution_root(self) -> Path:
291
+ """
292
+
293
+ Args:
294
+
295
+ Returns:
296
+ :return:
297
+
298
+ """
299
+ return execution_root(self._working_dir, self.execution_rid)
300
+
301
+ @property
302
+ def _feature_root(self) -> Path:
303
+ """The root path to all execution specific files.
304
+ :return:
305
+
306
+ Args:
307
+
308
+ Returns:
309
+
310
+ """
311
+ return feature_root(self._working_dir, self.execution_rid)
312
+
313
+ @property
314
+ def _asset_root(self) -> Path:
315
+ """The root path to all execution specific files.
316
+ :return:
317
+
318
+ Args:
319
+
320
+ Returns:
321
+
322
+ """
323
+ return asset_root(self._working_dir, self.execution_rid)
324
+
240
325
  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
241
326
  def download_dataset_bag(self, dataset: DatasetSpec) -> DatasetBag:
242
327
  """Given a RID to a dataset_table, or a MINID to an existing bag, download the bag file, extract it and validate
@@ -276,27 +361,6 @@ class Execution:
276
361
  ]
277
362
  )
278
363
 
279
- def _create_notebook_checkpoint(self):
280
- """Trigger a checkpoint creation using Jupyter's API."""
281
-
282
- server, session = self._ml_object._get_notebook_session()
283
- notebook_name = session["notebook"]["path"]
284
- notebook_url = f"{server['url']}api/contents/{notebook_name}"
285
-
286
- # Get notebook content
287
- response = requests.get(
288
- notebook_url, headers={"Authorization": f"Token {server['token']}"}
289
- )
290
- if response.status_code == 200:
291
- notebook_content = response.json()["content"]
292
- # Execution metadata cannot be in a directory, so map path into filename.
293
- checkpoint_path = (
294
- self.execution_metadata_path(ExecMetadataVocab.runtime_env.value)
295
- / f"{notebook_name.replace('/', '_')}.checkpoint"
296
- )
297
- with open(checkpoint_path, "w", encoding="utf-8") as f:
298
- json.dump(notebook_content, f)
299
-
300
364
  def execution_start(self) -> None:
301
365
  """Start an execution, uploading status to catalog"""
302
366
 
@@ -318,7 +382,7 @@ class Execution:
318
382
  self._ml_object.ml_schema
319
383
  ].Execution.update([{"RID": self.execution_rid, "Duration": duration}])
320
384
 
321
- def _upload_execution_dirs(self) -> dict[str, FileUploadState]:
385
+ def _upload_execution_dirs(self) -> dict[str, list[AssetFilePath]]:
322
386
  """Upload execution assets at _working_dir/Execution_asset.
323
387
 
324
388
  This routine uploads the contents of the
@@ -332,86 +396,142 @@ class Execution:
332
396
  DerivaMLException: If there is an issue uploading the assets.
333
397
  """
334
398
 
335
- def asset_name(p: str) -> str:
336
- return Path(*Path(p).parts[-2:]).as_posix()
337
-
338
399
  try:
339
400
  self.update_status(Status.running, "Uploading execution files...")
340
- results = upload_directory(self._ml_object.model, self._execution_root)
341
- results = {asset_name(k): v for k, v in results.items()}
342
-
343
- execution_assets = [
344
- r.result["RID"]
345
- for r in results.values()
346
- if r.state == UploadState.success and "Execution_Asset_Type" in r.result
347
- ]
348
- execution_metadata = [
349
- r.result["RID"]
350
- for r in results.values()
351
- if r.state == UploadState.success
352
- and "Execution_Metadata_Type" in r.result
353
- ]
354
- self._update_execution_asset_table(execution_assets)
355
- self._update_execution_metadata_table(execution_metadata)
356
-
357
- except Exception as e:
401
+ results = upload_directory(self._model, self._asset_root)
402
+ except RuntimeError as e:
358
403
  error = format_exception(e)
359
404
  self.update_status(Status.failed, error)
360
405
  raise DerivaMLException(f"Fail to upload execution_assets. Error: {error}")
361
406
 
407
+ asset_map = {}
408
+ for path, status in results.items():
409
+ asset_table, file_name = normalize_asset_dir(path)
410
+
411
+ asset_map.setdefault(asset_table, []).append(
412
+ AssetFilePath(
413
+ asset_path=path,
414
+ asset_name=asset_table,
415
+ file_name=file_name,
416
+ asset_metadata={
417
+ k: v
418
+ for k, v in status.result.items()
419
+ if k in self._model.asset_metadata(asset_table.split("/")[1])
420
+ },
421
+ asset_types=[],
422
+ asset_rid=status.result["RID"],
423
+ )
424
+ )
425
+
426
+ self._update_asset_execution_table(asset_map)
362
427
  self.update_status(Status.running, "Updating features...")
363
428
 
364
- feature_assets = defaultdict(dict)
365
-
366
- def traverse_bottom_up(directory: Path):
367
- """Traverses the directory tree in a bottom-up order.
368
-
369
- Args:
370
- directory: Path:
371
-
372
- Returns:
373
-
374
- """
375
- entries = list(directory.iterdir())
376
- for entry in entries:
377
- if entry.is_dir():
378
- yield from traverse_bottom_up(entry)
379
- yield directory
380
-
381
- for p in traverse_bottom_up(self._feature_root):
382
- if m := is_feature_asset_dir(p):
383
- try:
384
- self.update_status(
385
- Status.running, f"Uploading feature {m['feature_name']}..."
386
- )
387
- feature_assets[m["target_table"], m["feature_name"]] = (
388
- self._ml_object.upload_assets(p)
389
- )
390
- results |= feature_assets[m["target_table"], m["feature_name"]]
391
- except Exception as e:
392
- error = format_exception(e)
393
- self.update_status(Status.failed, error)
394
- raise DerivaMLException(
395
- f"Fail to upload execution metadata. Error: {error}"
396
- )
397
- elif m := is_feature_dir(p):
398
- files = [f for f in p.iterdir() if f.is_file()]
399
- if files:
400
- self._update_feature_table(
401
- target_table=m["target_table"],
402
- feature_name=m["feature_name"],
403
- feature_file=files[0],
404
- uploaded_files=feature_assets[
405
- m["target_table"], m["feature_name"]
406
- ],
407
- )
429
+ for p in self._feature_root.glob("**/*.jsonl"):
430
+ m = is_feature_dir(p.parent)
431
+ self._update_feature_table(
432
+ target_table=m["target_table"],
433
+ feature_name=m["feature_name"],
434
+ feature_file=p,
435
+ uploaded_files=asset_map,
436
+ )
408
437
 
409
438
  self.update_status(Status.running, "Upload assets complete")
410
- return results
439
+ return asset_map
440
+
441
+ @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
442
+ def download_asset(
443
+ self, asset_rid: RID, dest_dir: Path, update_catalog=True
444
+ ) -> AssetFilePath:
445
+ """Download an asset from a URL and place it in a local directory.
446
+
447
+ Args:
448
+ asset_rid: URL of the asset.
449
+ dest_dir: Destination directory for the asset.
450
+ update_catalog: Whether to update the catalog execution information after downloading.
451
+
452
+ Returns:
453
+ A tuple with the name of the asset table and a Path object to the downloaded asset.
454
+ """
455
+
456
+ asset_table = self._ml_object.resolve_rid(asset_rid).table
457
+ if not self._model.is_asset(asset_table):
458
+ raise DerivaMLException(f"RID {asset_rid} is not for an asset table.")
459
+
460
+ asset_record = self._ml_object.retrieve_rid(asset_rid)
461
+ asset_metadata = {
462
+ k: v
463
+ for k, v in asset_record.items()
464
+ if k in self._model.asset_metadata(asset_table)
465
+ }
466
+ asset_url = asset_record["URL"]
467
+ asset_filename = dest_dir / asset_record["Filename"]
468
+ hs = HatracStore("https", self._ml_object.host_name, self._ml_object.credential)
469
+ hs.get_obj(path=asset_url, destfilename=asset_filename.as_posix())
470
+
471
+ asset_type_table = self._model.find_association(asset_table, MLVocab.asset_type)
472
+ type_path = self._ml_object.pathBuilder.schemas[
473
+ asset_type_table.schema.name
474
+ ].tables[asset_type_table.name]
475
+ asset_types = [
476
+ asset_type[MLVocab.asset_type.value]
477
+ for asset_type in type_path.filter(
478
+ type_path.columns[asset_table.name] == asset_rid
479
+ )
480
+ .attributes(type_path.Asset_Type)
481
+ .fetch()
482
+ ]
483
+
484
+ asset_path = AssetFilePath(
485
+ file_name=asset_filename,
486
+ asset_rid=asset_rid,
487
+ asset_path=asset_filename,
488
+ asset_metadata=asset_metadata,
489
+ asset_name=asset_table.name,
490
+ asset_types=asset_types,
491
+ )
492
+
493
+ if update_catalog:
494
+ self._update_asset_execution_table(
495
+ {f"{asset_table.schema.name}/{asset_table.name}": [asset_path]},
496
+ asset_role="Input",
497
+ )
498
+ return asset_path
499
+
500
+ @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
501
+ def upload_assets(
502
+ self,
503
+ assets_dir: str | Path,
504
+ ) -> dict[Any, FileUploadState] | None:
505
+ """Upload assets from a directory.
506
+
507
+ This routine assumes that the current upload specification includes a configuration for the specified directory.
508
+ Every asset in the specified directory is uploaded
509
+
510
+ Args:
511
+ assets_dir: Directory containing the assets to upload.
512
+
513
+ Returns:
514
+ Results of the upload operation.
515
+
516
+ Raises:
517
+ DerivaMLException: If there is an issue uploading the assets.
518
+ """
519
+
520
+ def path_to_asset(path: str) -> str:
521
+ """Pull the asset name out of a path to that asset in the filesystem"""
522
+ components = path.split("/")
523
+ return components[
524
+ components.index("asset") + 2
525
+ ] # Look for asset in the path to find the name
526
+
527
+ if not self._model.is_asset(Path(assets_dir).name):
528
+ raise DerivaMLException("Directory does not have name of an asset table.")
529
+ results = upload_directory(self._model, assets_dir)
530
+ return {path_to_asset(p): r for p, r in results.items()}
411
531
 
412
532
  def upload_execution_outputs(
413
533
  self, clean_folder: bool = True
414
- ) -> dict[str, FileUploadState]:
534
+ ) -> dict[str, list[AssetFilePath]]:
415
535
  """Upload all the assets and metadata associated with the current execution.
416
536
 
417
537
  This will include any new assets, features, or table values.
@@ -427,29 +547,16 @@ class Execution:
427
547
  if self._dry_run:
428
548
  return {}
429
549
  try:
430
- uploaded_assets = self._upload_execution_dirs()
550
+ self.uploaded_assets = self._upload_execution_dirs()
431
551
  self.update_status(Status.completed, "Successfully end the execution.")
432
552
  if clean_folder:
433
553
  self._clean_folder_contents(self._execution_root)
434
- return uploaded_assets
554
+ return self.uploaded_assets
435
555
  except Exception as e:
436
556
  error = format_exception(e)
437
557
  self.update_status(Status.failed, error)
438
558
  raise e
439
559
 
440
- def _asset_dir(self) -> Path:
441
- """
442
-
443
- Args:
444
-
445
- Returns:
446
- :return: PathLib path object to model directory.
447
-
448
- """
449
- path = self._working_dir / self.execution_rid / "asset"
450
- path.mkdir(parents=True, exist_ok=True)
451
- return path
452
-
453
560
  def _clean_folder_contents(self, folder_path: Path):
454
561
  """
455
562
 
@@ -472,7 +579,7 @@ class Execution:
472
579
  target_table: str,
473
580
  feature_name: str,
474
581
  feature_file: str | Path,
475
- uploaded_files: dict[str, FileUploadState],
582
+ uploaded_files: dict[str, list[AssetFilePath]],
476
583
  ) -> None:
477
584
  """
478
585
 
@@ -480,121 +587,140 @@ class Execution:
480
587
  target_table: str:
481
588
  feature_name: str:
482
589
  feature_file: str | Path:
483
- uploaded_files: dict[str: FileUploadState]:
590
+ uploaded_files: Dictionary whose key ia an asset name, file-name pair, and whose value is a filename, RID of that asset.
484
591
  """
485
592
 
593
+ # Get the column names of all the Feature columns that should be the RID of an asset
486
594
  asset_columns = [
487
595
  c.name
488
596
  for c in self._ml_object.feature_record_class(
489
597
  target_table, feature_name
490
598
  ).feature.asset_columns
491
599
  ]
600
+
601
+ # Get the names of the columns in the feature that are assets.
602
+ asset_columns = [
603
+ c.name
604
+ for c in self._ml_object.feature_record_class(
605
+ target_table, feature_name
606
+ ).feature.asset_columns
607
+ ]
608
+
492
609
  feature_table = self._ml_object.feature_record_class(
493
610
  target_table, feature_name
494
611
  ).feature.feature_table.name
612
+ asset_map = {
613
+ (asset_table, asset.file_name): asset.asset_rid
614
+ for asset_table, assets in uploaded_files.items()
615
+ for asset in assets
616
+ }
495
617
 
496
618
  def map_path(e):
497
- """
498
-
499
- Args:
500
- e:
501
-
502
- Returns:
503
-
504
- """
505
- # Go through the asset columns and replace the file name with the RID for the uploaded file.
619
+ """Go through the asset columns and replace the file name with the RID for the uploaded file."""
506
620
  for c in asset_columns:
507
- e[c] = asset_map[e[c]]
621
+ e[c] = asset_map[normalize_asset_dir(e[c])]
508
622
  return e
509
623
 
510
- # Create a map between a file name that appeared in the file to the RID of the uploaded file.
511
- asset_map = {
512
- file: asset.result["RID"]
513
- for file, asset in uploaded_files.items()
514
- if asset.state == UploadState.success and asset.result
515
- }
624
+ # Load the JSON file that has the set of records that contain the feature values.
516
625
  with open(feature_file, "r") as feature_values:
517
- entities = [map_path(e) for e in csv.DictReader(feature_values)]
518
- self._ml_object.domain_path.tables[feature_table].insert(entities)
519
-
520
- def _update_execution_metadata_table(self, assets: list[RID]) -> None:
521
- """Upload execution metadata at _working_dir/Execution_metadata."""
522
- ml_schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
523
- entities = [
524
- {"Execution_Metadata": metadata_rid, "Execution": self.execution_rid}
525
- for metadata_rid in assets
526
- ]
527
- ml_schema_path.Execution_Metadata_Execution.insert(entities)
528
-
529
- def _update_execution_asset_table(self, assets: list[RID]) -> None:
530
- """Assets associated with an execution must be linked to an execution entity after they are uploaded into
531
- the catalog. This routine takes a list of uploaded assets and makes that association.
532
-
533
- Args:
534
- assets: list of RIDS for execution assets.:
535
- """
536
- ml_schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
537
- entities = [
538
- {"Execution_Asset": asset_rid, "Execution": self.execution_rid}
539
- for asset_rid in assets
540
- ]
541
- ml_schema_path.Execution_Asset_Execution.insert(entities)
542
-
543
- @property
544
- def _execution_metadata_dir(self) -> Path:
545
- """
546
-
547
- Args:
548
-
549
- Returns:
550
- to the catalog by the execution_upload method in an execution object.
551
-
552
- :return:
553
-
554
- """
555
- return execution_metadata_dir(
556
- self._working_dir, exec_rid=self.execution_rid, metadata_type=""
626
+ entities = [json.loads(line.strip()) for line in feature_values]
627
+ # Update the asset columns in the feature and add to the catalog.
628
+ self._ml_object.domain_path.tables[feature_table].insert(
629
+ [map_path(e) for e in entities]
557
630
  )
558
631
 
559
- def execution_metadata_path(self, metadata_type: str) -> Path:
560
- """Return a pathlib Path to the directory in which to place files of type metadata_type.
561
-
562
- These files are uploaded to the catalog as part of the execution of the upload_execution method in DerivaML.
632
+ def _update_asset_execution_table(
633
+ self,
634
+ uploaded_assets: dict[str, list[AssetFilePath]],
635
+ asset_role: str = "Output",
636
+ ):
637
+ """Add entry to association table connecting an asset to an execution RID
563
638
 
564
639
  Args:
565
- metadata_type: Type of metadata to be uploaded. Must be a term in Metadata_Type controlled vocabulary.
566
-
567
- Returns:
568
- Path to the directory in which to place files of type metadata_type.
569
- """
570
- self._ml_object.lookup_term(
571
- MLVocab.execution_metadata_type, metadata_type
572
- ) # Make sure metadata type exists.
573
- return execution_metadata_dir(
574
- self._working_dir, exec_rid=self.execution_rid, metadata_type=metadata_type
575
- )
576
-
577
- @property
578
- def _execution_asset_dir(self) -> Path:
579
- """
640
+ uploaded_assets: Dictionary whose key is the name of an asset table, and whose value is a list of RIDs for
641
+ newly added assets to that table.
642
+ asset_role: A term or list of terms from the Asset_Role vocabulary.
643
+ """
644
+ # Make sure the asset role is in the controlled vocabulary table.
645
+ self._ml_object.lookup_term(MLVocab.asset_role, asset_role)
646
+
647
+ pb = self._ml_object.pathBuilder
648
+ for asset_table, asset_list in uploaded_assets.items():
649
+ asset_table_name = asset_table.split("/")[
650
+ 1
651
+ ] # Peel off the schema from the asset table
652
+ asset_exe = self._model.find_association(asset_table_name, "Execution")
653
+ asset_exe_path = pb.schemas[asset_exe.schema.name].tables[asset_exe.name]
654
+ asset_exe_path.insert(
655
+ [
656
+ {
657
+ asset_table_name: asset_path.asset_rid,
658
+ "Execution": self.execution_rid,
659
+ "Asset_Role": asset_role,
660
+ }
661
+ for asset_path in asset_list
662
+ ]
663
+ )
580
664
 
581
- Args:
665
+ # Now add in the type names via the asset_asset_type association table.
666
+ # Get the list of types for each file in the asset.
667
+ if asset_role == "Input":
668
+ return
669
+ asset_type_map = {}
670
+ with open(
671
+ asset_type_path(
672
+ self._working_dir,
673
+ self.execution_rid,
674
+ self._model.name_to_table(asset_table_name),
675
+ ),
676
+ "r",
677
+ ) as f:
678
+ for line in f:
679
+ asset_type_map.update(json.loads(line.strip()))
680
+ for asset_path in asset_list:
681
+ asset_path.asset_types = asset_type_map[asset_path.file_name]
682
+
683
+ asset_asset_type = self._model.find_association(
684
+ asset_table_name, "Asset_Type"
685
+ )
686
+ type_path = pb.schemas[asset_asset_type.schema.name].tables[
687
+ asset_asset_type.name
688
+ ]
689
+ type_path.insert(
690
+ [
691
+ {asset_table_name: asset.asset_rid, "Asset_Type": t}
692
+ for asset in asset_list
693
+ for t in asset_type_map[asset.file_name]
694
+ ]
695
+ )
582
696
 
583
- Returns:
584
- :return:
697
+ @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
698
+ def asset_file_path(
699
+ self,
700
+ asset_name: str,
701
+ file_name: str,
702
+ asset_types: Optional[list[str] | str] = None,
703
+ copy_file=False,
704
+ **kwargs,
705
+ ) -> AssetFilePath:
706
+ """Return a pathlib Path to the directory in which to place files for the specified execution_asset type.
585
707
 
586
- """
587
- return execution_asset_dir(
588
- self._working_dir, exec_rid=self.execution_rid, asset_type=""
589
- )
708
+ Given the name of an asset table, and a file name, register the file for upload, and return a path to that
709
+ file in the upload directory. In addition to the filename, additioal asset metadata and file asset types may
710
+ be specified.
590
711
 
591
- def execution_asset_path(self, asset_type: str) -> Path:
592
- """Return a pathlib Path to the directory in which to place files for the specified execution_asset type.
712
+ This routine has three modes, depending on if file_name refers to an existing file. If it doesn't, a path
713
+ to a new file with the specified name is returned. The caller can then open that file for writing.
593
714
 
594
- These files are uploaded as part of the upload_execution method in DerivaML class.
715
+ If the provided filename refers to an existing file and the copy_file argument is False (the default), then the
716
+ returned path contains a symbolic link to that file. If the copy_file argument is True then the contents of
717
+ file_name are copied into the target directory.
595
718
 
596
719
  Args:
597
- asset_type: Type of asset to be uploaded. Must be a term in Asset_Type controlled vocabulary.
720
+ asset_name: Type of asset to be uploaded. Must be a term in Asset_Type controlled vocabulary.
721
+ file_name: Name of file to be uploaded.
722
+ asset_types: Type of asset to be uploaded. Defaults to name of the asset.
723
+ **kwargs: Any additional metadata values that may be part of the asset table.
598
724
 
599
725
  Returns:
600
726
  Path in which to place asset files.
@@ -602,73 +728,46 @@ class Execution:
602
728
  Raises:
603
729
  DerivaException: If the asset type is not defined.
604
730
  """
605
- self._ml_object.lookup_term(MLVocab.execution_asset_type, asset_type)
606
-
607
- return execution_asset_dir(
608
- self._working_dir, exec_rid=self.execution_rid, asset_type=asset_type
609
- )
610
-
611
- @property
612
- def _execution_root(self) -> Path:
613
- """
614
-
615
- Args:
616
-
617
- Returns:
618
- :return:
619
-
620
- """
621
- return execution_root(self._working_dir, self.execution_rid)
622
-
623
- @property
624
- def _feature_root(self) -> Path:
625
- """The root path to all execution specific files.
626
- :return:
627
-
628
- Args:
629
-
630
- Returns:
631
-
632
- """
633
- return feature_root(self._working_dir, self.execution_rid)
634
-
635
- def feature_paths(
636
- self, table: Table | str, feature_name: str
637
- ) -> tuple[Path, dict[str, Path]]:
638
- """Return the file path of where to place feature values, and assets for the named feature and table.
731
+ if not self._model.is_asset(asset_name):
732
+ DerivaMLException(f"Table {asset_name} is not an asset")
639
733
 
640
- A side effect of calling this routine is that the directories in which to place the feature values and assets
641
- will be created
734
+ asset_table = self._model.name_to_table(asset_name)
642
735
 
643
- Args:
644
- table: The table with which the feature is associated.
645
- feature_name: Name of the feature
646
-
647
- Returns:
648
- A tuple whose first element is the path for the feature values and whose second element is a dictionary
649
- of associated asset table names and corresponding paths.
650
- """
651
- feature = self._ml_object.lookup_feature(table, feature_name)
736
+ asset_types = asset_types or kwargs.get("Asset_Type", None) or asset_name
737
+ asset_types = [asset_types] if isinstance(asset_types, str) else asset_types
738
+ for t in asset_types:
739
+ self._ml_object.lookup_term(MLVocab.asset_type, t)
652
740
 
653
- tpath = feature_value_path(
741
+ file_name = Path(file_name)
742
+ asset_path = asset_file_path(
654
743
  self._working_dir,
655
- schema=self._ml_object.domain_schema,
656
- target_table=feature.target_table.name,
657
- feature_name=feature_name,
658
- exec_rid=self.execution_rid,
744
+ self.execution_rid,
745
+ self._model.name_to_table(asset_name),
746
+ file_name.name,
747
+ metadata=kwargs,
748
+ )
749
+
750
+ if file_name.exists():
751
+ if copy_file:
752
+ asset_path.write_bytes(file_name.read_bytes())
753
+ else:
754
+ asset_path.symlink_to(file_name)
755
+
756
+ # Persist the asset types into a file
757
+ with open(
758
+ asset_type_path(self._working_dir, self.execution_rid, asset_table),
759
+ "a",
760
+ encoding="utf-8",
761
+ ) as f:
762
+ f.write(json.dumps({file_name.name: asset_types}) + "\n")
763
+
764
+ return AssetFilePath(
765
+ asset_path=asset_path,
766
+ asset_name=asset_name,
767
+ file_name=file_name.name,
768
+ asset_metadata=kwargs,
769
+ asset_types=asset_types,
659
770
  )
660
- asset_paths = {
661
- asset_table.name: feature_asset_dir(
662
- self._working_dir,
663
- exec_rid=self.execution_rid,
664
- schema=self._ml_object.domain_schema,
665
- target_table=feature.target_table.name,
666
- feature_name=feature_name,
667
- asset_table=asset_table.name,
668
- )
669
- for asset_table in feature.asset_columns
670
- }
671
- return tpath, asset_paths
672
771
 
673
772
  def table_path(self, table: str) -> Path:
674
773
  """Return a local file path to a CSV to add values to a table on upload.
@@ -679,10 +778,7 @@ class Execution:
679
778
  Returns:
680
779
  Pathlib path to the file in which to place table values.
681
780
  """
682
- if (
683
- table
684
- not in self._ml_object.model.schemas[self._ml_object.domain_schema].tables
685
- ):
781
+ if table not in self._model.schemas[self._ml_object.domain_schema].tables:
686
782
  raise DerivaMLException(
687
783
  "Table '{}' not found in domain schema".format(table)
688
784
  )
@@ -693,10 +789,11 @@ class Execution:
693
789
 
694
790
  def execute(self) -> Execution:
695
791
  """Initiate an execution with provided configuration. Can be used in a context manager."""
792
+ self.execution_start()
696
793
  return self
697
794
 
698
795
  @validate_call
699
- def write_feature_file(self, features: Iterable[FeatureRecord]) -> None:
796
+ def add_features(self, features: Iterable[FeatureRecord]) -> None:
700
797
  """Given a collection of Feature records, write out a CSV file in the appropriate assets directory so that this
701
798
  feature gets uploaded when the execution is complete.
702
799
 
@@ -704,22 +801,28 @@ class Execution:
704
801
  features: Iterable of Feature records to write.
705
802
  """
706
803
 
707
- feature_iter = iter(features)
708
- first_row = next(feature_iter)
804
+ # Make sure feature list is homogeneous:
805
+ sorted_features = defaultdict(list)
806
+ for f in features:
807
+ sorted_features[type(f)].append(f)
808
+ for fs in sorted_features.values():
809
+ self._add_features(fs)
810
+
811
+ def _add_features(self, features: list[FeatureRecord]) -> None:
812
+ # Update feature records to include current execution_rid
813
+ first_row = features[0]
709
814
  feature = first_row.feature
710
- csv_path, _ = self.feature_paths(
711
- feature.target_table.name, feature.feature_name
815
+ json_path = feature_value_path(
816
+ self._working_dir,
817
+ schema=self._ml_object.domain_schema,
818
+ target_table=feature.target_table.name,
819
+ feature_name=feature.feature_name,
820
+ exec_rid=self.execution_rid,
712
821
  )
713
-
714
- fieldnames = {"Execution", "Feature_Name", feature.target_table.name}
715
- fieldnames |= {f.name for f in feature.feature_columns}
716
-
717
- with open(csv_path, "w") as f:
718
- writer = csv.DictWriter(f, fieldnames=fieldnames)
719
- writer.writeheader()
720
- writer.writerow(first_row.model_dump())
721
- for feature in feature_iter:
722
- writer.writerow(feature.model_dump())
822
+ with open(json_path, "a", encoding="utf-8") as file:
823
+ for feature in features:
824
+ feature.Execution = self.execution_rid
825
+ file.write(json.dumps(feature.model_dump(mode="json")) + "\n")
723
826
 
724
827
  @validate_call
725
828
  def create_dataset(self, dataset_types: str | list[str], description: str) -> RID:
@@ -748,9 +851,13 @@ class Execution:
748
851
  Add new elements to an existing dataset. In addition to adding new members, the minor version number of the
749
852
  dataset is incremented and the description, if provided, is applied to that new version.
750
853
 
854
+ The RIDs in the list to not have to be all from the same table, but they must be from a table that has
855
+ been configured to be a dataset element type.
856
+
751
857
  Args:
752
858
  dataset_rid: RID of dataset_table to extend or None if new dataset_table is to be created.
753
- members: List of RIDs of members to add to the dataset_table.
859
+ members: List of RIDs of members to add to the dataset_table. RID must be to a table type that is a
860
+ dataset element type (see DerivaML.add_dataset_element_type).
754
861
  validate: Check rid_list to make sure elements are not already in the dataset_table.
755
862
  description: Markdown description of the updated dataset.
756
863
  """