deriva-ml 1.13.1__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff compares the contents of two publicly available versions of the package as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
deriva_ml/execution.py CHANGED
@@ -12,13 +12,11 @@ import os
12
12
  from pathlib import Path
13
13
 
14
14
  from pydantic import validate_call, ConfigDict
15
- import regex as re
16
15
  import sys
17
16
  import shutil
18
17
  from typing import Iterable, Any, Optional
19
18
 
20
19
  from deriva.core import format_exception
21
- from deriva.core.datapath import DataPathException
22
20
  from deriva.core.hatrac_store import HatracStore
23
21
  from .deriva_definitions import (
24
22
  RID,
@@ -29,6 +27,7 @@ from .deriva_definitions import (
29
27
  MLAsset,
30
28
  ExecMetadataType,
31
29
  ExecAssetType,
30
+ FileSpec,
32
31
  DRY_RUN_RID,
33
32
  )
34
33
  from .deriva_ml_base import DerivaML, FeatureRecord
@@ -66,29 +65,43 @@ except ImportError:
66
65
  return s
67
66
 
68
67
 
69
- class AssetFilePath(type(Path())):
70
- """Derived class of Path that also includes information about a downloaded.
68
+ # Platform-specific base class
69
+ if sys.version_info >= (3, 12):
71
70
 
72
- An AssetFilePath has all the methods associated with a pathlib.Path object. In addition, it defines additional
73
- attributes associated with a DerviaML asset.
71
+ class AssetFilePath(Path):
72
+ """
73
+ Create a new Path object that has additional information related to the use of this path as an asset.
74
74
 
75
- Attributes:
76
- asset_types: A list of the types associated with this asset. From the Asset_Type controlled vocabulary.
77
- asset_metadata: A dictionary of names and values of any additional columns associated with this asset.
78
- asset_name: The name of the asset table
79
- file_name: The name of the file in the local file system that has the asset contents
80
- asset_rid: The RID of the asset if it has been uploaded into an asset table
81
- """
75
+ Args:
76
+ asset_path: Local path to the location of the asset.
77
+ asset_name: The name of the asset in the catalog (e.g. the asset table name).
78
+ file_name: Name of the local file that contains the contents of the asset.
79
+ asset_metadata: Any additional columns associated with this asset beyond the URL, Length, and checksum.
80
+ asset_types: A list of terms from the Asset_Type controlled vocabulary.
81
+ asset_rid: The RID of the asset if it has been uploaded into an asset table
82
+ """
82
83
 
83
- def __new__(
84
- cls,
85
- asset_path,
86
- asset_name: str,
87
- file_name: str,
88
- asset_metadata: dict[str, Any],
89
- asset_types: list[str] | str,
90
- asset_rid: Optional[RID] = None,
91
- ):
84
+ def __init__(
85
+ self,
86
+ asset_path: str | Path,
87
+ asset_name: str,
88
+ file_name: str,
89
+ asset_metadata: dict[str, Any],
90
+ asset_types: list[str] | str,
91
+ asset_rid: Optional["RID"] = None,
92
+ ):
93
+ super().__init__(asset_path)
94
+ # These assignments happen after __new__ returns the instance
95
+ self.asset_name = asset_name
96
+ self.file_name = file_name
97
+ self.asset_metadata = asset_metadata
98
+ self.asset_types = (
99
+ asset_types if isinstance(asset_types, list) else [asset_types]
100
+ )
101
+ self.asset_rid = asset_rid
102
+ else:
103
+
104
+ class AssetFilePath(type(Path())):
92
105
  """
93
106
  Create a new Path object that has additional information related to the use of this path as an asset.
94
107
 
@@ -100,15 +113,26 @@ class AssetFilePath(type(Path())):
100
113
  asset_types: A list of terms from the Asset_Type controlled vocabulary.
101
114
  asset_rid: The RID of the asset if it has been uploaded into an asset table
102
115
  """
103
- obj = super().__new__(cls, asset_path)
104
- obj.asset_types = (
105
- asset_types if isinstance(asset_types, list) else [asset_types]
106
- )
107
- obj.asset_metadata = asset_metadata
108
- obj.asset_name = asset_name
109
- obj.file_name = file_name
110
- obj.asset_rid = asset_rid
111
- return obj
116
+
117
+ def __new__(
118
+ cls,
119
+ asset_path: str | Path,
120
+ asset_name: str,
121
+ file_name: str,
122
+ asset_metadata: dict[str, Any],
123
+ asset_types: list[str] | str,
124
+ asset_rid: Optional["RID"] = None,
125
+ ):
126
+ # Only pass the path to the base Path class
127
+ obj = super().__new__(cls, asset_path)
128
+ obj.asset_name = asset_name
129
+ obj.file_name = file_name
130
+ obj.asset_metadata = asset_metadata
131
+ obj.asset_types = (
132
+ asset_types if isinstance(asset_types, list) else [asset_types]
133
+ )
134
+ obj.asset_rid = asset_rid
135
+ return obj
112
136
 
113
137
 
114
138
  class Execution:
@@ -155,7 +179,7 @@ class Execution:
155
179
  Args:
156
180
  configuration: Execution configuration object that describes the execution.
157
181
  ml_object: The DerivaML instance that created the execution.
158
- reload: RID of previously initialized execution object.
182
+ reload: RID of a previously initialized execution object.
159
183
  """
160
184
  self.asset_paths: list[AssetFilePath] = []
161
185
  self.configuration = configuration
@@ -476,7 +500,7 @@ class Execution:
476
500
  """Download an asset from a URL and place it in a local directory.
477
501
 
478
502
  Args:
479
- asset_rid: URL of the asset.
503
+ asset_rid: RID of the asset.
480
504
  dest_dir: Destination directory for the asset.
481
505
  update_catalog: Whether to update the catalog execution information after downloading.
482
506
 
@@ -656,20 +680,9 @@ class Execution:
656
680
  with open(feature_file, "r") as feature_values:
657
681
  entities = [json.loads(line.strip()) for line in feature_values]
658
682
  # Update the asset columns in the feature and add to the catalog.
659
- try:
660
- self._ml_object.domain_path.tables[feature_table].insert(
661
- [map_path(e) for e in entities]
662
- )
663
- except DataPathException as e:
664
- if re.match(
665
- rf'DETAIL: +Key +\("Execution", +"{target_table}", +"Feature_Name"\)=\(.*\) already exists',
666
- e.message,
667
- ):
668
- self._logger.info(
669
- f"Skipping reload of feature values for {feature_table}"
670
- )
671
- else:
672
- raise e
683
+ self._ml_object.domain_path.tables[feature_table].insert(
684
+ [map_path(e) for e in entities], on_conflict_skip=True
685
+ )
673
686
 
674
687
  def _update_asset_execution_table(
675
688
  self,
@@ -694,27 +707,17 @@ class Execution:
694
707
  asset_exe = self._model.find_association(asset_table_name, "Execution")
695
708
  asset_exe_path = pb.schemas[asset_exe.schema.name].tables[asset_exe.name]
696
709
 
697
- try:
698
- asset_exe_path.insert(
699
- [
700
- {
701
- asset_table_name: asset_path.asset_rid,
702
- "Execution": self.execution_rid,
703
- "Asset_Role": asset_role,
704
- }
705
- for asset_path in asset_list
706
- ]
707
- )
708
- except DataPathException as e:
709
- if re.match(
710
- rf'DETAIL: +Key +\("{asset_table_name}", +"Execution"\)=\(.*\) already exists',
711
- e.message,
712
- ):
713
- self._logger.info(
714
- f"Skipping reload of execution assocations for {asset_table_name}"
715
- )
716
- else:
717
- raise e
710
+ asset_exe_path.insert(
711
+ [
712
+ {
713
+ asset_table_name: asset_path.asset_rid,
714
+ "Execution": self.execution_rid,
715
+ "Asset_Role": asset_role,
716
+ }
717
+ for asset_path in asset_list
718
+ ],
719
+ on_conflict_skip=True,
720
+ )
718
721
 
719
722
  # Now add in the type names via the asset_asset_type association table.
720
723
  # Get the list of types for each file in the asset.
@@ -740,24 +743,15 @@ class Execution:
740
743
  type_path = pb.schemas[asset_asset_type.schema.name].tables[
741
744
  asset_asset_type.name
742
745
  ]
743
- try:
744
- type_path.insert(
745
- [
746
- {asset_table_name: asset.asset_rid, "Asset_Type": t}
747
- for asset in asset_list
748
- for t in asset_type_map[asset.file_name]
749
- ]
750
- )
751
- except DataPathException as e:
752
- if re.match(
753
- rf'DETAIL: +Key +\("{asset_table_name}", +"Asset_Type"\)=\(.*\) already exists',
754
- e.message,
755
- ):
756
- self._logger.info(
757
- f"Skipping reload of execution asset types for {asset_table_name}"
758
- )
759
- else:
760
- raise e
746
+
747
+ type_path.insert(
748
+ [
749
+ {asset_table_name: asset.asset_rid, "Asset_Type": t}
750
+ for asset in asset_list
751
+ for t in asset_type_map[asset.file_name]
752
+ ],
753
+ on_conflict_skip=True,
754
+ )
761
755
 
762
756
  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
763
757
  def asset_file_path(
@@ -889,19 +883,25 @@ class Execution:
889
883
  feature.Execution = self.execution_rid
890
884
  file.write(json.dumps(feature.model_dump(mode="json")) + "\n")
891
885
 
892
- @validate_call
893
- def create_dataset(self, dataset_types: str | list[str], description: str) -> RID:
886
+ @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
887
+ def create_dataset(
888
+ self,
889
+ dataset_types: str | list[str],
890
+ description: str,
891
+ version: Optional[DatasetVersion] = None,
892
+ ) -> RID:
894
893
  """Create a new dataset with specified types.
895
894
 
896
895
  Args:
897
896
  dataset_types: param description:
898
897
  description: Markdown description of the dataset being created.
898
+ version: Version to assign to the dataset. Defaults to 0.1.0
899
899
 
900
900
  Returns:
901
901
  RID of the newly created dataset.
902
902
  """
903
903
  return self._ml_object.create_dataset(
904
- dataset_types, description, self.execution_rid
904
+ dataset_types, description, self.execution_rid, version=version
905
905
  )
906
906
 
907
907
  def add_dataset_members(
@@ -959,6 +959,19 @@ class Execution:
959
959
  execution_rid=self.execution_rid,
960
960
  )
961
961
 
962
+ @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
963
+ def add_files(
964
+ self,
965
+ files: Iterable[FileSpec],
966
+ file_types: str | list[str],
967
+ ) -> Iterable[RID]:
968
+ """Add files to the file table"""
969
+ return self._ml_object.add_files(
970
+ files=files,
971
+ file_types=file_types,
972
+ execution_rid=self.execution_rid,
973
+ )
974
+
962
975
  def __str__(self):
963
976
  items = [
964
977
  f"caching_dir: {self._cache_dir}",
@@ -325,7 +325,8 @@ class ExecutionConfiguration(BaseModel):
325
325
  should be materialized.
326
326
  assets: List of assets to be downloaded prior to execution. The values must be RIDs in an asset table
327
327
  parameters: Either a dictionary or a path to a JSON file that contains configuration parameters for the execution.
328
- workflow: A RID for a workflow instance. Must have a name, URI to the workflow instance, and a type.
328
+ workflow: Either a Workflow object, or a RID for a workflow instance.
329
+ parameters: Either a dictionary or a path to a JSON file that contains configuration parameters for the execution.
329
330
  description: A description of the execution. Can use Markdown format.
330
331
  """
331
332
 
deriva_ml/history.py CHANGED
@@ -54,6 +54,8 @@ def datetime_epoch_us(dt):
54
54
  # -- --------------------------------------------------------------------------------------
55
55
  # Take the iso format string (same as RMT) and return the version number
56
56
  #
57
+
58
+
57
59
  def iso_to_snap(iso_datetime):
58
60
  rmt = datetime.fromisoformat(iso_datetime)
59
61
  return urlb32_encode(datetime_epoch_us(rmt))