deriva-ml 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deriva_ml/execution.py CHANGED
@@ -5,44 +5,37 @@ This module defines the Execution class which is used to interact with the state
 from __future__ import annotations
 
 from collections import defaultdict
-import csv
 import json
 import logging
 import os
 import shutil
 from datetime import datetime
 from pathlib import Path
-import requests
-from tempfile import NamedTemporaryFile
 from typing import Iterable, Any, Optional
+
 from deriva.core import format_exception
-from deriva.core.ermrest_model import Table
 from pydantic import validate_call, ConfigDict
+import sys
+from deriva.core.hatrac_store import HatracStore
 
-from .deriva_definitions import MLVocab, ExecMetadataVocab
-from .deriva_definitions import (
-    RID,
-    Status,
-    FileUploadState,
-    UploadState,
-    DerivaMLException,
-)
+from .deriva_definitions import ExecMetadataVocab
+from .deriva_definitions import RID, Status, FileUploadState, DerivaMLException, MLVocab
 from .deriva_ml_base import DerivaML, FeatureRecord
 from .dataset_aux_classes import DatasetSpec, DatasetVersion, VersionPart
 from .dataset_bag import DatasetBag
 from .execution_configuration import ExecutionConfiguration, Workflow
 from .execution_environment import get_execution_environment
 from .upload import (
-    execution_metadata_dir,
-    execution_asset_dir,
     execution_root,
     feature_root,
-    feature_asset_dir,
+    asset_root,
     feature_value_path,
     is_feature_dir,
-    is_feature_asset_dir,
     table_path,
     upload_directory,
+    normalize_asset_dir,
+    asset_file_path,
+    asset_type_path,
 )
 
 try:
@@ -59,6 +52,41 @@ except ImportError:
         return []
 
 
+class AssetFilePath(type(Path())):
+    """Derived class of Path that also includes information about a downloaded asset.
+
+    An AssetFilePath has all the methods associated with a pathlib.Path object. In addition, it defines additional
+    attributes associated with a DerivaML asset.
+
+    Attributes:
+        asset_types: A list of the types associated with this asset. From the Asset_Type controlled vocabulary.
+        asset_metadata: A dictionary of names and values of any additional columns associated with this asset.
+        asset_name: The name of the asset table.
+        file_name: The name of the file in the local file system that has the asset contents.
+        asset_rid: The RID of the asset if it has been uploaded into an asset table.
+    """
+
+    def __new__(
+        cls,
+        asset_path,
+        asset_name: str,
+        file_name: str,
+        asset_metadata: dict[str, Any],
+        asset_types: list[str] | str,
+        asset_rid: Optional[RID] = None,
+    ):
+        obj = super().__new__(cls, asset_path)
+        obj.asset_types = (
+            asset_types if isinstance(asset_types, list) else [asset_types]
+        )
+        obj.asset_metadata = asset_metadata
+        obj.asset_name = asset_name
+        obj.file_name = file_name
+        obj.asset_rid = asset_rid
+        return obj
+
+
 class Execution:
     """The Execution class is used to capture the context of an activity within DerivaML. While these are primarily
     computational, manual processes can be represented by an execution as well.
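
The new AssetFilePath behaves like an ordinary pathlib.Path while carrying catalog bookkeeping alongside it. A minimal sketch of what that enables; the path, metadata, and type names below are hypothetical:

    scan = AssetFilePath(
        "/tmp/downloads/Image/scan_001.png",
        asset_name="Image",
        file_name="scan_001.png",
        asset_metadata={"Description": "example scan"},
        asset_types="Image",          # a bare string is normalized to ["Image"]
        asset_rid=None,               # not yet uploaded, so no RID
    )
    assert scan.suffix == ".png"      # ordinary Path behavior still works
    assert scan.asset_types == ["Image"]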
@@ -101,21 +129,24 @@ class Execution:
         """
 
         Args:
-            configuration:
-            ml_object:
+            configuration: Execution configuration object that describes the execution.
+            ml_object: The DerivaML instance that created the execution.
             reload: RID of previously initialized execution object.
         """
         self.asset_paths: list[Path] = []
         self.configuration = configuration
         self._ml_object = ml_object
+        self._model = ml_object.model
         self._logger = ml_object._logger
         self.start_time = None
         self.stop_time = None
         self.status = Status.created
         self.uploaded_assets: list[Path] = []
+        self.configuration.argv = sys.argv
 
         self.dataset_rids: list[RID] = []
         self.datasets: list[DatasetBag] = []
+        self.parameters = self.configuration.parameters
 
         self._working_dir = self._ml_object.working_dir
         self._cache_dir = self._ml_object.cache_dir
@@ -144,9 +175,7 @@ class Execution:
         )
 
         for a in self.configuration.assets:
-            if not self._ml_object.model.is_asset(
-                self._ml_object.resolve_rid(a).table.name
-            ):
+            if not self._model.is_asset(self._ml_object.resolve_rid(a).table.name):
                 raise DerivaMLException(
                     "Asset specified in execution configuration is not an asset table"
                 )
@@ -173,15 +202,12 @@ class Execution:
         self._initialize_execution(reload)
 
     def _save_runtime_environment(self):
-        runtime_env_path = ExecMetadataVocab.runtime_env.value
-        runtime_env_dir = self.execution_metadata_path(runtime_env_path)
-        with NamedTemporaryFile(
-            "w+",
-            dir=runtime_env_dir,
-            prefix="environment_snapshot_",
-            suffix=".txt",
-            delete=False,
-        ) as fp:
+        runtime_env_path = self.asset_file_path(
+            asset_name="Execution_Metadata",
+            file_name=f"environment_snapshot_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
+            asset_types=ExecMetadataVocab.runtime_env.value,
+        )
+        with open(runtime_env_path, "w") as fp:
             json.dump(get_execution_environment(), fp)
 
     def _initialize_execution(self, reload: Optional[RID] = None) -> None:
@@ -203,6 +229,7 @@ class Execution:
             )
             self.datasets.append(self.download_dataset_bag(dataset))
             self.dataset_rids.append(dataset.rid)
+
         # Update execution info
         schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
         if self.dataset_rids and not (reload or self._dry_run):
@@ -215,16 +242,29 @@ class Execution:
 
         # Download assets....
         self.update_status(Status.running, "Downloading assets ...")
-        self.asset_paths = [
-            self._ml_object.download_asset(asset_rid=a, dest_dir=self._asset_dir())
-            for a in self.configuration.assets
-        ]
-        if self.asset_paths and not (reload or self._dry_run):
-            self._update_execution_asset_table(self.configuration.assets)
+        self.asset_paths = {}
+        for asset_rid in self.configuration.assets:
+            asset_table = self._ml_object.resolve_rid(asset_rid).table.name
+            dest_dir = (
+                execution_root(self._ml_object.working_dir, self.execution_rid)
+                / "downloaded-assets"
+                / asset_table
+            )
+            dest_dir.mkdir(parents=True, exist_ok=True)
+            self.asset_paths.setdefault(asset_table, []).append(
+                self.download_asset(
+                    asset_rid=asset_rid,
+                    dest_dir=dest_dir,
+                    update_catalog=not (reload or self._dry_run),
+                )
+            )
 
         # Save configuration details for later upload
-        exec_config_path = ExecMetadataVocab.execution_config.value
-        cfile = self.execution_metadata_path(exec_config_path) / "configuration.json"
+        cfile = self.asset_file_path(
+            asset_name="Execution_Metadata",
+            file_name="configuration.json",
+            asset_types=ExecMetadataVocab.execution_config.value,
+        )
         with open(cfile, "w", encoding="utf-8") as config_file:
             json.dump(self.configuration.model_dump(), config_file)
 
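With this change, asset_paths is a dictionary keyed by asset table name rather than a flat list of paths. A sketch of the resulting shape; the table names, files, and the execution variable here are hypothetical:

    # execution.asset_paths after _initialize_execution, e.g.:
    # {"Image": [AssetFilePath(...), ...], "Model": [AssetFilePath(...)]}
    for table_name, paths in execution.asset_paths.items():
        for p in paths:
            print(table_name, p.file_name, p.asset_rid, p.asset_types)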
@@ -234,6 +274,42 @@ class Execution:
         self.start_time = datetime.now()
         self.update_status(Status.pending, "Initialize status finished.")
 
+    @property
+    def _execution_root(self) -> Path:
+        """The root path for all files associated with this execution."""
+        return execution_root(self._working_dir, self.execution_rid)
+
+    @property
+    def _feature_root(self) -> Path:
+        """The root path for feature files associated with this execution."""
+        return feature_root(self._working_dir, self.execution_rid)
+
+    @property
+    def _asset_root(self) -> Path:
+        """The root path for asset files associated with this execution."""
+        return asset_root(self._working_dir, self.execution_rid)
+
     @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
     def download_dataset_bag(self, dataset: DatasetSpec) -> DatasetBag:
         """Given a RID to a dataset_table, or a MINID to an existing bag, download the bag file, extract it and validate
@@ -273,27 +349,6 @@ class Execution:
             ]
         )
 
-    def _create_notebook_checkpoint(self):
-        """Trigger a checkpoint creation using Jupyter's API."""
-
-        server, session = self._ml_object._get_notebook_session()
-        notebook_name = session["notebook"]["path"]
-        notebook_url = f"{server['url']}api/contents/{notebook_name}"
-
-        # Get notebook content
-        response = requests.get(
-            notebook_url, headers={"Authorization": f"Token {server['token']}"}
-        )
-        if response.status_code == 200:
-            notebook_content = response.json()["content"]
-            # Execution metadata cannot be in a directory, so map path into filename.
-            checkpoint_path = (
-                self.execution_metadata_path(ExecMetadataVocab.runtime_env.value)
-                / f"{notebook_name.replace('/', '_')}.checkpoint"
-            )
-            with open(checkpoint_path, "w", encoding="utf-8") as f:
-                json.dump(notebook_content, f)
-
     def execution_start(self) -> None:
         """Start an execution, uploading status to catalog"""
 
@@ -315,7 +370,7 @@ class Execution:
             self._ml_object.ml_schema
         ].Execution.update([{"RID": self.execution_rid, "Duration": duration}])
 
-    def _upload_execution_dirs(self) -> dict[str, FileUploadState]:
-        """Upload execution assets at _working_dir/Execution_asset.
+    def _upload_execution_dirs(self) -> dict[str, list[AssetFilePath]]:
+        """Upload execution assets under the execution's asset root.
 
         This routine uploads the contents of the
@@ -329,86 +384,142 @@ class Execution:
             DerivaMLException: If there is an issue uploading the assets.
         """
 
-        def asset_name(p: str) -> str:
-            return Path(*Path(p).parts[-2:]).as_posix()
-
         try:
             self.update_status(Status.running, "Uploading execution files...")
-            results = upload_directory(self._ml_object.model, self._execution_root)
-            results = {asset_name(k): v for k, v in results.items()}
-
-            execution_assets = [
-                r.result["RID"]
-                for r in results.values()
-                if r.state == UploadState.success and "Execution_Asset_Type" in r.result
-            ]
-            execution_metadata = [
-                r.result["RID"]
-                for r in results.values()
-                if r.state == UploadState.success
-                and "Execution_Metadata_Type" in r.result
-            ]
-            self._update_execution_asset_table(execution_assets)
-            self._update_execution_metadata_table(execution_metadata)
-
+            results = upload_directory(self._model, self._asset_root)
         except Exception as e:
            error = format_exception(e)
            self.update_status(Status.failed, error)
            raise DerivaMLException(f"Failed to upload execution_assets. Error: {error}")
 
+        asset_map = {}
+        for path, status in results.items():
+            asset_table, file_name = normalize_asset_dir(path)
+
+            asset_map.setdefault(asset_table, []).append(
+                AssetFilePath(
+                    asset_path=path,
+                    asset_name=asset_table,
+                    file_name=file_name,
+                    asset_metadata={
+                        k: v
+                        for k, v in status.result.items()
+                        if k in self._model.asset_metadata(asset_table.split("/")[1])
+                    },
+                    asset_types=[],
+                    asset_rid=status.result["RID"],
+                )
+            )
+
+        self._update_asset_execution_table(asset_map)
         self.update_status(Status.running, "Updating features...")
 
-        feature_assets = defaultdict(dict)
-
-        def traverse_bottom_up(directory: Path):
-            """Traverses the directory tree in a bottom-up order.
-
-            Args:
-                directory: Path:
-
-            Returns:
-
-            """
-            entries = list(directory.iterdir())
-            for entry in entries:
-                if entry.is_dir():
-                    yield from traverse_bottom_up(entry)
-            yield directory
-
-        for p in traverse_bottom_up(self._feature_root):
-            if m := is_feature_asset_dir(p):
-                try:
-                    self.update_status(
-                        Status.running, f"Uploading feature {m['feature_name']}..."
-                    )
-                    feature_assets[m["target_table"], m["feature_name"]] = (
-                        self._ml_object.upload_assets(p)
-                    )
-                    results |= feature_assets[m["target_table"], m["feature_name"]]
-                except Exception as e:
-                    error = format_exception(e)
-                    self.update_status(Status.failed, error)
-                    raise DerivaMLException(
-                        f"Fail to upload execution metadata. Error: {error}"
-                    )
-            elif m := is_feature_dir(p):
-                files = [f for f in p.iterdir() if f.is_file()]
-                if files:
-                    self._update_feature_table(
-                        target_table=m["target_table"],
-                        feature_name=m["feature_name"],
-                        feature_file=files[0],
-                        uploaded_files=feature_assets[
-                            m["target_table"], m["feature_name"]
-                        ],
-                    )
+        for p in self._feature_root.glob("**/*.jsonl"):
+            m = is_feature_dir(p.parent)
+            self._update_feature_table(
+                target_table=m["target_table"],
+                feature_name=m["feature_name"],
+                feature_file=p,
+                uploaded_files=asset_map,
+            )
 
         self.update_status(Status.running, "Upload assets complete")
-        return results
+        return asset_map
+
+    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
+    def download_asset(
+        self, asset_rid: RID, dest_dir: Path, update_catalog=True
+    ) -> AssetFilePath:
+        """Download an asset from a URL and place it in a local directory.
+
+        Args:
+            asset_rid: RID of the asset.
+            dest_dir: Destination directory for the asset.
+            update_catalog: Whether to update the catalog execution information after downloading.
+
+        Returns:
+            An AssetFilePath for the downloaded asset.
+        """
+
+        asset_table = self._ml_object.resolve_rid(asset_rid).table
+        if not self._model.is_asset(asset_table):
+            raise DerivaMLException(f"RID {asset_rid} is not for an asset table.")
+
+        asset_record = self._ml_object.retrieve_rid(asset_rid)
+        asset_metadata = {
+            k: v
+            for k, v in asset_record.items()
+            if k in self._model.asset_metadata(asset_table)
+        }
+        asset_url = asset_record["URL"]
+        asset_filename = dest_dir / asset_record["Filename"]
+        hs = HatracStore("https", self._ml_object.host_name, self._ml_object.credential)
+        hs.get_obj(path=asset_url, destfilename=asset_filename.as_posix())
+
+        asset_type_table = self._model.find_association(asset_table, MLVocab.asset_type)
+        type_path = self._ml_object.pathBuilder.schemas[
+            asset_type_table.schema.name
+        ].tables[asset_type_table.name]
+        asset_types = [
+            asset_type[MLVocab.asset_type.value]
+            for asset_type in type_path.filter(
+                type_path.columns[asset_table.name] == asset_rid
+            )
+            .attributes(type_path.Asset_Type)
+            .fetch()
+        ]
+
+        asset_path = AssetFilePath(
+            file_name=asset_filename,
+            asset_rid=asset_rid,
+            asset_path=asset_filename,
+            asset_metadata=asset_metadata,
+            asset_name=asset_table.name,
+            asset_types=asset_types,
+        )
+
+        if update_catalog:
+            self._update_asset_execution_table(
+                {f"{asset_table.schema.name}/{asset_table.name}": [asset_path]},
+                asset_role="Input",
+            )
+        return asset_path
+
+    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
+    def upload_assets(
+        self,
+        assets_dir: str | Path,
+    ) -> dict[Any, FileUploadState] | None:
+        """Upload assets from a directory.
+
+        This routine assumes that the current upload specification includes a configuration for the specified directory.
+        Every asset in the specified directory is uploaded.
+
+        Args:
+            assets_dir: Directory containing the assets to upload.
+
+        Returns:
+            Results of the upload operation.
+
+        Raises:
+            DerivaMLException: If there is an issue uploading the assets.
+        """
+
+        def path_to_asset(path: str) -> str:
+            """Pull the asset name out of a path to that asset in the filesystem."""
+            components = path.split("/")
+            return components[
+                components.index("asset") + 2
+            ]  # Look for asset in the path to find the name
+
+        if not self._model.is_asset(Path(assets_dir).name):
+            raise DerivaMLException("Directory does not have the name of an asset table.")
+        results = upload_directory(self._model, assets_dir)
+        return {path_to_asset(p): r for p, r in results.items()}
 
     def upload_execution_outputs(
         self, clean_folder: bool = True
-    ) -> dict[str, FileUploadState]:
+    ) -> dict[str, AssetFilePath]:
         """Upload all the assets and metadata associated with the current execution.
 
         This will include any new assets, features, or table values.
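
A sketch of calling the new download_asset directly; normally _initialize_execution drives this loop, and the RID, directory, and execution variable here are hypothetical:

    from pathlib import Path

    asset = execution.download_asset(
        asset_rid="1-ABC2",                  # hypothetical asset RID
        dest_dir=Path("/tmp/inputs/Image"),  # must already exist
        update_catalog=False,                # skip the Input association, e.g. for a dry run
    )
    print(asset.asset_name, asset.asset_types, asset.asset_metadata)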
@@ -434,19 +545,6 @@ class Execution:
             self.update_status(Status.failed, error)
             raise e
 
-    def _asset_dir(self) -> Path:
-        """
-
-        Args:
-
-        Returns:
-            :return: PathLib path object to model directory.
-
-        """
-        path = self._working_dir / self.execution_rid / "asset"
-        path.mkdir(parents=True, exist_ok=True)
-        return path
-
     def _clean_folder_contents(self, folder_path: Path):
         """
 
@@ -469,7 +567,7 @@ class Execution:
         target_table: str,
         feature_name: str,
         feature_file: str | Path,
-        uploaded_files: dict[str, FileUploadState],
+        uploaded_files: dict[str, list[AssetFilePath]],
     ) -> None:
         """
 
@@ -477,121 +575,130 @@ class Execution:
             target_table: str:
             feature_name: str:
             feature_file: str | Path:
-            uploaded_files: dict[str: FileUploadState]:
+            uploaded_files: Dictionary whose key is an asset table name and whose value is a list of
+                AssetFilePath objects for the uploaded files of that asset table.
         """
 
+        # Get the names of the columns in the feature that are assets.
         asset_columns = [
             c.name
             for c in self._ml_object.feature_record_class(
                 target_table, feature_name
             ).feature.asset_columns
         ]
+
         feature_table = self._ml_object.feature_record_class(
             target_table, feature_name
         ).feature.feature_table.name
+        asset_map = {
+            (asset_table, asset.file_name): asset.asset_rid
+            for asset_table, assets in uploaded_files.items()
+            for asset in assets
+        }
 
         def map_path(e):
-            """
-
-            Args:
-                e:
-
-            Returns:
-
-            """
-            # Go through the asset columns and replace the file name with the RID for the uploaded file.
+            """Go through the asset columns and replace the file name with the RID for the uploaded file."""
             for c in asset_columns:
-                e[c] = asset_map[e[c]]
+                e[c] = asset_map[normalize_asset_dir(e[c])]
             return e
 
-        # Create a map between a file name that appeared in the file to the RID of the uploaded file.
-        asset_map = {
-            file: asset.result["RID"]
-            for file, asset in uploaded_files.items()
-            if asset.state == UploadState.success and asset.result
-        }
+        # Load the JSON file that has the set of records that contain the feature values.
         with open(feature_file, "r") as feature_values:
-            entities = [map_path(e) for e in csv.DictReader(feature_values)]
-            self._ml_object.domain_path.tables[feature_table].insert(entities)
-
-    def _update_execution_metadata_table(self, assets: list[RID]) -> None:
-        """Upload execution metadata at _working_dir/Execution_metadata."""
-        ml_schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
-        entities = [
-            {"Execution_Metadata": metadata_rid, "Execution": self.execution_rid}
-            for metadata_rid in assets
-        ]
-        ml_schema_path.Execution_Metadata_Execution.insert(entities)
-
-    def _update_execution_asset_table(self, assets: list[RID]) -> None:
-        """Assets associated with an execution must be linked to an execution entity after they are uploaded into
-        the catalog. This routine takes a list of uploaded assets and makes that association.
-
-        Args:
-            assets: list of RIDS for execution assets.:
-        """
-        ml_schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
-        entities = [
-            {"Execution_Asset": asset_rid, "Execution": self.execution_rid}
-            for asset_rid in assets
-        ]
-        ml_schema_path.Execution_Asset_Execution.insert(entities)
-
-    @property
-    def _execution_metadata_dir(self) -> Path:
-        """
-
-        Args:
-
-        Returns:
-            to the catalog by the execution_upload method in an execution object.
-
-        :return:
-
-        """
-        return execution_metadata_dir(
-            self._working_dir, exec_rid=self.execution_rid, metadata_type=""
-        )
+            entities = [json.loads(line.strip()) for line in feature_values]
+            # Update the asset columns in the feature and add to the catalog.
+            self._ml_object.domain_path.tables[feature_table].insert(
+                [map_path(e) for e in entities]
+            )
 
-    def execution_metadata_path(self, metadata_type: str) -> Path:
-        """Return a pathlib Path to the directory in which to place files of type metadata_type.
-
-        These files are uploaded to the catalog as part of the execution of the upload_execution method in DerivaML.
-
-        Args:
-            metadata_type: Type of metadata to be uploaded. Must be a term in Metadata_Type controlled vocabulary.
-
-        Returns:
-            Path to the directory in which to place files of type metadata_type.
-        """
-        self._ml_object.lookup_term(
-            MLVocab.execution_metadata_type, metadata_type
-        )  # Make sure metadata type exists.
-        return execution_metadata_dir(
-            self._working_dir, exec_rid=self.execution_rid, metadata_type=metadata_type
-        )
-
-    @property
-    def _execution_asset_dir(self) -> Path:
-        """
+    def _update_asset_execution_table(
+        self,
+        uploaded_assets: dict[str, list[AssetFilePath]],
+        asset_role: str = "Output",
+    ):
+        """Add an entry to the association table connecting an asset to an execution RID.
 
         Args:
+            uploaded_assets: Dictionary whose key is the name of an asset table, and whose value is a list of
+                AssetFilePath objects for newly added assets to that table.
+            asset_role: A term or list of terms from the Asset_Role vocabulary.
+        """
+        # Make sure the asset role is in the controlled vocabulary table.
+        self._ml_object.lookup_term(MLVocab.asset_role, asset_role)
+
+        pb = self._ml_object.pathBuilder
+        for asset_table, asset_list in uploaded_assets.items():
+            asset_table_name = asset_table.split("/")[
+                1
+            ]  # Peel off the schema from the asset table
+            asset_exe = self._model.find_association(asset_table_name, "Execution")
+            asset_exe_path = pb.schemas[asset_exe.schema.name].tables[asset_exe.name]
+            asset_exe_path.insert(
+                [
+                    {
+                        asset_table_name: asset_path.asset_rid,
+                        "Execution": self.execution_rid,
+                        "Asset_Role": asset_role,
+                    }
+                    for asset_path in asset_list
+                ]
+            )
 
-        Returns:
-            :return:
-
-        """
-        return execution_asset_dir(
-            self._working_dir, exec_rid=self.execution_rid, asset_type=""
-        )
+            # Now add in the type names via the asset_asset_type association table.
+            # Get the list of types for each file in the asset.
+            if asset_role == "Input":
+                return
+            asset_type_map = {}
+            with open(
+                asset_type_path(
+                    self._working_dir,
+                    self.execution_rid,
+                    self._model.name_to_table(asset_table_name),
+                ),
+                "r",
+            ) as f:
+                for line in f:
+                    asset_type_map.update(json.loads(line.strip()))
+            for asset_path in asset_list:
+                asset_path.asset_types = asset_type_map[asset_path.file_name]
+
+            asset_asset_type = self._model.find_association(
+                asset_table_name, "Asset_Type"
+            )
+            type_path = pb.schemas[asset_asset_type.schema.name].tables[
+                asset_asset_type.name
+            ]
+            type_path.insert(
+                [
+                    {asset_table_name: asset.asset_rid, "Asset_Type": t}
+                    for asset in asset_list
+                    for t in asset_type_map[asset.file_name]
+                ]
+            )
 
-    def execution_asset_path(self, asset_type: str) -> Path:
-        """Return a pathlib Path to the directory in which to place files for the specified execution_asset type.
+    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
+    def asset_file_path(
+        self,
+        asset_name: str,
+        file_name: str,
+        asset_types: Optional[list[str] | str] = None,
+        **kwargs,
+    ) -> AssetFilePath:
+        """Return a pathlib Path to the file in which to place asset data for the specified asset table.
 
         These files are uploaded as part of the upload_execution method in DerivaML class.
 
         Args:
-            asset_type: Type of asset to be uploaded. Must be a term in Asset_Type controlled vocabulary.
+            asset_name: Name of the asset table in which the file will be placed.
+            asset_types: Type(s) of the asset to be uploaded. Must be terms in the Asset_Type controlled
+                vocabulary. Defaults to the name of the asset.
+            file_name: Name of file to be uploaded.
+            **kwargs: Any additional metadata values that may be part of the asset table.
 
         Returns:
             Path in which to place asset files.
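
A sketch of how asset_file_path is meant to be used from training code; the asset table "Model", its Asset_Type term, the Description column, and the execution variable are hypothetical, but _save_runtime_environment above follows the same pattern:

    weights_path = execution.asset_file_path(
        asset_name="Model",
        file_name="weights.pt",
        asset_types="Model_File",
        Description="trained model weights",    # extra column on the asset table
    )
    with open(weights_path, "wb") as f:
        f.write(serialized_model)               # placeholder for real serialization
    # upload_execution_outputs() later uploads the file and links it to the execution.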
@@ -599,73 +706,39 @@ class Execution:
         Raises:
             DerivaException: If the asset type is not defined.
         """
-        self._ml_object.lookup_term(MLVocab.execution_asset_type, asset_type)
+        if not self._model.is_asset(asset_name):
+            raise DerivaMLException(f"Table {asset_name} is not an asset")
 
-        return execution_asset_dir(
-            self._working_dir, exec_rid=self.execution_rid, asset_type=asset_type
-        )
+        asset_table = self._model.name_to_table(asset_name)
 
-    @property
-    def _execution_root(self) -> Path:
-        """
+        asset_types = asset_types or kwargs.get("Asset_Type", None) or asset_name
+        asset_types = [asset_types] if isinstance(asset_types, str) else asset_types
+        for t in asset_types:
+            self._ml_object.lookup_term(MLVocab.asset_type, t)
 
-        Args:
-
-        Returns:
-            :return:
-
-        """
-        return execution_root(self._working_dir, self.execution_rid)
-
-    @property
-    def _feature_root(self) -> Path:
-        """The root path to all execution specific files.
-        :return:
-
-        Args:
-
-        Returns:
-
-        """
-        return feature_root(self._working_dir, self.execution_rid)
-
-    def feature_paths(
-        self, table: Table | str, feature_name: str
-    ) -> tuple[Path, dict[str, Path]]:
-        """Return the file path of where to place feature values, and assets for the named feature and table.
-
-        A side effect of calling this routine is that the directories in which to place the feature values and assets
-        will be created
-
-        Args:
-            table: The table with which the feature is associated.
-            feature_name: Name of the feature
-
-        Returns:
-            A tuple whose first element is the path for the feature values and whose second element is a dictionary
-            of associated asset table names and corresponding paths.
-        """
-        feature = self._ml_object.lookup_feature(table, feature_name)
-
-        tpath = feature_value_path(
+        asset_path = asset_file_path(
             self._working_dir,
-            schema=self._ml_object.domain_schema,
-            target_table=feature.target_table.name,
-            feature_name=feature_name,
-            exec_rid=self.execution_rid,
+            self.execution_rid,
+            self._model.name_to_table(asset_name),
+            file_name,
+            metadata=kwargs,
+        )
+
+        # Persist the asset types into a file
+        with open(
+            asset_type_path(self._working_dir, self.execution_rid, asset_table),
+            "a",
+            encoding="utf-8",
+        ) as f:
+            f.write(json.dumps({file_name: asset_types}) + "\n")
+
+        return AssetFilePath(
+            asset_path=asset_path,
+            asset_name=asset_name,
+            file_name=file_name,
+            asset_metadata=kwargs,
+            asset_types=asset_types,
         )
-        asset_paths = {
-            asset_table.name: feature_asset_dir(
-                self._working_dir,
-                exec_rid=self.execution_rid,
-                schema=self._ml_object.domain_schema,
-                target_table=feature.target_table.name,
-                feature_name=feature_name,
-                asset_table=asset_table.name,
-            )
-            for asset_table in feature.asset_columns
-        }
-        return tpath, asset_paths
 
     def table_path(self, table: str) -> Path:
         """Return a local file path to a CSV to add values to a table on upload.
@@ -676,10 +749,7 @@ class Execution:
         Returns:
             Pathlib path to the file in which to place table values.
         """
-        if (
-            table
-            not in self._ml_object.model.schemas[self._ml_object.domain_schema].tables
-        ):
+        if table not in self._model.schemas[self._ml_object.domain_schema].tables:
             raise DerivaMLException(
                 "Table '{}' not found in domain schema".format(table)
             )
@@ -693,7 +763,7 @@ class Execution:
         return self
 
     @validate_call
-    def write_feature_file(self, features: Iterable[FeatureRecord]) -> None:
-        """Given a collection of Feature records, write out a CSV file in the appropriate assets directory so that this
+    def add_features(self, features: Iterable[FeatureRecord]) -> None:
+        """Given a collection of Feature records, write them out to a file in the appropriate assets directory so that this
         feature gets uploaded when the execution is complete.
 
@@ -701,22 +771,28 @@ class Execution:
             features: Iterable of Feature records to write.
         """
 
-        feature_iter = iter(features)
-        first_row = next(feature_iter)
+        # Make sure feature list is homogeneous:
+        sorted_features = defaultdict(list)
+        for f in features:
+            sorted_features[type(f)].append(f)
+        for fs in sorted_features.values():
+            self._add_features(fs)
+
+    def _add_features(self, features: list[FeatureRecord]) -> None:
+        # Update feature records to include current execution_rid
+        first_row = features[0]
         feature = first_row.feature
-        csv_path, _ = self.feature_paths(
-            feature.target_table.name, feature.feature_name
+        json_path = feature_value_path(
+            self._working_dir,
+            schema=self._ml_object.domain_schema,
+            target_table=feature.target_table.name,
+            feature_name=feature.feature_name,
+            exec_rid=self.execution_rid,
         )
-
-        fieldnames = {"Execution", "Feature_Name", feature.target_table.name}
-        fieldnames |= {f.name for f in feature.feature_columns}
-
-        with open(csv_path, "w") as f:
-            writer = csv.DictWriter(f, fieldnames=fieldnames)
-            writer.writeheader()
-            writer.writerow(first_row.model_dump())
-            for feature in feature_iter:
-                writer.writerow(feature.model_dump())
+        with open(json_path, "a", encoding="utf-8") as file:
+            for feature in features:
+                feature.Execution = self.execution_rid
+                file.write(json.dumps(feature.model_dump(mode="json")) + "\n")
 
     @validate_call
     def create_dataset(self, dataset_types: str | list[str], description: str) -> RID:
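
A sketch of the renamed entry point; the feature name, column values, and the ml/execution variables are hypothetical, while feature_record_class is the DerivaML factory used elsewhere in this file:

    # Hypothetical feature on an "Image" table with a "Quality" term column.
    ImageQuality = ml.feature_record_class("Image", "Quality")

    execution.add_features(
        [
            ImageQuality(Image="1-XYZ9", Quality="good"),    # Execution is filled in by _add_features
            ImageQuality(Image="1-XYZA", Quality="blurry"),
        ]
    )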
@@ -745,9 +821,13 @@ class Execution:
        Add new elements to an existing dataset. In addition to adding new members, the minor version number of the
        dataset is incremented and the description, if provided, is applied to that new version.
 
+        The RIDs in the list do not have to be all from the same table, but they must be from a table that has
+        been configured to be a dataset element type.
+
         Args:
             dataset_rid: RID of dataset_table to extend or None if new dataset_table is to be created.
-            members: List of RIDs of members to add to the dataset_table.
+            members: List of RIDs of members to add to the dataset_table. Each RID must be for a table that is a
+                dataset element type (see DerivaML.add_dataset_element_type).
             validate: Check rid_list to make sure elements are not already in the dataset_table.
             description: Markdown description of the updated dataset.
         """