PyPI - deriva-ml - Versions diffs - 1.13.1__py3-none-any.whl → 1.13.3__py3-none-any.whl - Mend

deriva-ml 1.13.1__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

deriva_ml/database_model.py +5 -11
deriva_ml/dataset.py +293 -307
deriva_ml/dataset_aux_classes.py +10 -10
deriva_ml/demo_catalog.py +90 -67
deriva_ml/deriva_definitions.py +43 -4
deriva_ml/deriva_ml_base.py +31 -30
deriva_ml/deriva_model.py +17 -5
deriva_ml/execution.py +102 -89
deriva_ml/execution_configuration.py +2 -1
deriva_ml/history.py +2 -0
deriva_ml/schema_setup/annotations.py +341 -126
deriva_ml/schema_setup/create_schema.py +33 -65
deriva_ml/schema_setup/policy.json +7 -3
deriva_ml/upload.py +3 -3
{deriva_ml-1.13.1.dist-info → deriva_ml-1.13.3.dist-info}/METADATA +2 -2
deriva_ml-1.13.3.dist-info/RECORD +31 -0
{deriva_ml-1.13.1.dist-info → deriva_ml-1.13.3.dist-info}/WHEEL +1 -1
deriva_ml-1.13.1.dist-info/RECORD +0 -31
{deriva_ml-1.13.1.dist-info → deriva_ml-1.13.3.dist-info}/entry_points.txt +0 -0
{deriva_ml-1.13.1.dist-info → deriva_ml-1.13.3.dist-info}/licenses/LICENSE +0 -0
{deriva_ml-1.13.1.dist-info → deriva_ml-1.13.3.dist-info}/top_level.txt +0 -0

deriva_ml/execution.py CHANGED Viewed

@@ -12,13 +12,11 @@ import os
 from pathlib import Path
 from pydantic import validate_call, ConfigDict
-import regex as re
 import sys
 import shutil
 from typing import Iterable, Any, Optional
 from deriva.core import format_exception
-from deriva.core.datapath import DataPathException
 from deriva.core.hatrac_store import HatracStore
 from .deriva_definitions import (
     RID,
@@ -29,6 +27,7 @@ from .deriva_definitions import (
     MLAsset,
     ExecMetadataType,
     ExecAssetType,
+    FileSpec,
     DRY_RUN_RID,
 )
 from .deriva_ml_base import DerivaML, FeatureRecord
@@ -66,29 +65,43 @@ except ImportError:
         return s
-class AssetFilePath(type(Path())):
-    """Derived class of Path that also includes information about a downloaded.
+# Platform-specific base class
+if sys.version_info >= (3, 12):
-    An AssetFilePath has all  the methods associated with a pathlib.Path object. In addition, it defines additional
-    attributes associated with a DerviaML asset.
+    class AssetFilePath(Path):
+        """
+        Create a new Path object that has additional information related to the use of this path as an asset.
-    Attributes:
-        asset_types: A list of the types associated with this asset.  From the Asset_Type controlled vocabulary.
-        asset_metadata: A dictionary of names and values of any additional columns  associated with this asset.
-        asset_name: The name of the asset table
-        file_name: The name of the file in the local file system that has the asset contents
-        asset_rid: The RID of the asset if it has been uploaded into an asset table
-    """
+        Args:
+            asset_path: Local path to the location of the asset.
+            asset_name:  The name of the asset in the catalog (e.g. the asset table name).
+            file_name:  Name of the local file that contains the contents of the asset.
+            asset_metadata: Any additional columns associated with this asset beyond the URL, Length, and checksum.
+            asset_types:  A list of terms from the Asset_Type controlled vocabulary.
+            asset_rid:  The RID of the asset if it has been uploaded into an asset table
+        """
-    def __new__(
-        cls,
-        asset_path,
-        asset_name: str,
-        file_name: str,
-        asset_metadata: dict[str, Any],
-        asset_types: list[str] | str,
-        asset_rid: Optional[RID] = None,
-    ):
+        def __init__(
+            self,
+            asset_path: str | Path,
+            asset_name: str,
+            file_name: str,
+            asset_metadata: dict[str, Any],
+            asset_types: list[str] | str,
+            asset_rid: Optional["RID"] = None,
+        ):
+            super().__init__(asset_path)
+            # These assignments happen after __new__ returns the instance
+            self.asset_name = asset_name
+            self.file_name = file_name
+            self.asset_metadata = asset_metadata
+            self.asset_types = (
+                asset_types if isinstance(asset_types, list) else [asset_types]
+            )
+            self.asset_rid = asset_rid
+else:
+    class AssetFilePath(type(Path())):
         """
         Create a new Path object that has additional information related to the use of this path as an asset.
@@ -100,15 +113,26 @@ class AssetFilePath(type(Path())):
             asset_types:  A list of terms from the Asset_Type controlled vocabulary.
             asset_rid:  The RID of the asset if it has been uploaded into an asset table
         """
-        obj = super().__new__(cls, asset_path)
-        obj.asset_types = (
-            asset_types if isinstance(asset_types, list) else [asset_types]
-        )
-        obj.asset_metadata = asset_metadata
-        obj.asset_name = asset_name
-        obj.file_name = file_name
-        obj.asset_rid = asset_rid
-        return obj
+        def __new__(
+            cls,
+            asset_path: str | Path,
+            asset_name: str,
+            file_name: str,
+            asset_metadata: dict[str, Any],
+            asset_types: list[str] | str,
+            asset_rid: Optional["RID"] = None,
+        ):
+            # Only pass the path to the base Path class
+            obj = super().__new__(cls, asset_path)
+            obj.asset_name = asset_name
+            obj.file_name = file_name
+            obj.asset_metadata = asset_metadata
+            obj.asset_types = (
+                asset_types if isinstance(asset_types, list) else [asset_types]
+            )
+            obj.asset_rid = asset_rid
+            return obj
 class Execution:
@@ -155,7 +179,7 @@ class Execution:
         Args:
             configuration: Execution configuration object that describes the execution.
             ml_object: The DerivaML instance that created the execution.
-            reload: RID of previously initialized execution object.
+            reload: RID of a previously initialized execution object.
         """
         self.asset_paths: list[AssetFilePath] = []
         self.configuration = configuration
@@ -476,7 +500,7 @@ class Execution:
         """Download an asset from a URL and place it in a local directory.
         Args:
-            asset_rid: URL of the asset.
+            asset_rid: RID of the asset.
             dest_dir: Destination directory for the asset.
             update_catalog: Whether to update the catalog execution information after downloading.
@@ -656,20 +680,9 @@ class Execution:
         with open(feature_file, "r") as feature_values:
             entities = [json.loads(line.strip()) for line in feature_values]
         # Update the asset columns in the feature and add to the catalog.
-        try:
-            self._ml_object.domain_path.tables[feature_table].insert(
-                [map_path(e) for e in entities]
-            )
-        except DataPathException as e:
-            if re.match(
-                rf'DETAIL: +Key +\("Execution", +"{target_table}", +"Feature_Name"\)=\(.*\) already exists',
-                e.message,
-            ):
-                self._logger.info(
-                    f"Skipping reload of feature values for {feature_table}"
-                )
-            else:
-                raise e
+        self._ml_object.domain_path.tables[feature_table].insert(
+            [map_path(e) for e in entities], on_conflict_skip=True
+        )
     def _update_asset_execution_table(
         self,
@@ -694,27 +707,17 @@ class Execution:
             asset_exe = self._model.find_association(asset_table_name, "Execution")
             asset_exe_path = pb.schemas[asset_exe.schema.name].tables[asset_exe.name]
-            try:
-                asset_exe_path.insert(
-                    [
-                        {
-                            asset_table_name: asset_path.asset_rid,
-                            "Execution": self.execution_rid,
-                            "Asset_Role": asset_role,
-                        }
-                        for asset_path in asset_list
-                    ]
-                )
-            except DataPathException as e:
-                if re.match(
-                    rf'DETAIL: +Key +\("{asset_table_name}", +"Execution"\)=\(.*\) already exists',
-                    e.message,
-                ):
-                    self._logger.info(
-                        f"Skipping reload of execution assocations for {asset_table_name}"
-                    )
-                else:
-                    raise e
+            asset_exe_path.insert(
+                [
+                    {
+                        asset_table_name: asset_path.asset_rid,
+                        "Execution": self.execution_rid,
+                        "Asset_Role": asset_role,
+                    }
+                    for asset_path in asset_list
+                ],
+                on_conflict_skip=True,
+            )
             # Now add in the type names via the asset_asset_type association table.
             # Get the list of types for each file in the asset.
@@ -740,24 +743,15 @@ class Execution:
             type_path = pb.schemas[asset_asset_type.schema.name].tables[
                 asset_asset_type.name
             ]
-            try:
-                type_path.insert(
-                    [
-                        {asset_table_name: asset.asset_rid, "Asset_Type": t}
-                        for asset in asset_list
-                        for t in asset_type_map[asset.file_name]
-                    ]
-                )
-            except DataPathException as e:
-                if re.match(
-                    rf'DETAIL: +Key +\("{asset_table_name}", +"Asset_Type"\)=\(.*\) already exists',
-                    e.message,
-                ):
-                    self._logger.info(
-                        f"Skipping reload of execution asset types for {asset_table_name}"
-                    )
-                else:
-                    raise e
+            type_path.insert(
+                [
+                    {asset_table_name: asset.asset_rid, "Asset_Type": t}
+                    for asset in asset_list
+                    for t in asset_type_map[asset.file_name]
+                ],
+                on_conflict_skip=True,
+            )
     @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
     def asset_file_path(
@@ -889,19 +883,25 @@ class Execution:
                 feature.Execution = self.execution_rid
                 file.write(json.dumps(feature.model_dump(mode="json")) + "\n")
-    @validate_call
-    def create_dataset(self, dataset_types: str | list[str], description: str) -> RID:
+    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
+    def create_dataset(
+        self,
+        dataset_types: str | list[str],
+        description: str,
+        version: Optional[DatasetVersion] = None,
+    ) -> RID:
         """Create a new dataset with specified types.
         Args:
             dataset_types: param description:
             description: Markdown description of the dataset being created.
+            version: Version to assign to the dataset.  Defaults to 0.1.0
         Returns:
             RID of the newly created dataset.
         """
         return self._ml_object.create_dataset(
-            dataset_types, description, self.execution_rid
+            dataset_types, description, self.execution_rid, version=version
         )
     def add_dataset_members(
@@ -959,6 +959,19 @@ class Execution:
             execution_rid=self.execution_rid,
         )
+    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
+    def add_files(
+        self,
+        files: Iterable[FileSpec],
+        file_types: str | list[str],
+    ) -> Iterable[RID]:
+        """Add files to the file table"""
+        return self._ml_object.add_files(
+            files=files,
+            file_types=file_types,
+            execution_rid=self.execution_rid,
+        )
     def __str__(self):
         items = [
             f"caching_dir: {self._cache_dir}",

deriva_ml/execution_configuration.py CHANGED Viewed

@@ -325,7 +325,8 @@ class ExecutionConfiguration(BaseModel):
             should be materialized.
         assets: List of assets to be downloaded prior to execution.  The values must be RIDs in an asset table
         parameters: Either a dictionary or a path to a JSON file that contains configuration parameters for the execution.
-        workflow: A RID for a workflow instance.  Must have a name, URI to the workflow instance, and a type.
+        workflow: Either a Workflow object, or a RID for a workflow instance.
+        parameters: Either a dictionary or a path to a JSON file that contains configuration parameters for the execution.
         description: A description of the execution.  Can use Markdown format.
     """

deriva_ml/history.py CHANGED Viewed

@@ -54,6 +54,8 @@ def datetime_epoch_us(dt):
 # -- --------------------------------------------------------------------------------------
 # Take the iso format string (same as RMT) and return the version number
 #
 def iso_to_snap(iso_datetime):
     rmt = datetime.fromisoformat(iso_datetime)
     return urlb32_encode(datetime_epoch_us(rmt))

deriva-ml 1.13.1__py3-none-any.whl → 1.13.3__py3-none-any.whl

deriva-ml 1.13.1__py3-none-any.whl → 1.13.3__py3-none-any.whl