deriva-ml 1.16.0__py3-none-any.whl → 1.17.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/.DS_Store +0 -0
- deriva_ml/__init__.py +0 -10
- deriva_ml/core/base.py +18 -6
- deriva_ml/dataset/__init__.py +2 -7
- deriva_ml/dataset/aux_classes.py +2 -10
- deriva_ml/dataset/dataset.py +5 -4
- deriva_ml/dataset/dataset_bag.py +144 -151
- deriva_ml/dataset/upload.py +6 -4
- deriva_ml/demo_catalog.py +16 -2
- deriva_ml/execution/__init__.py +2 -1
- deriva_ml/execution/execution.py +5 -3
- deriva_ml/execution/execution_configuration.py +28 -9
- deriva_ml/execution/workflow.py +8 -0
- deriva_ml/model/catalog.py +55 -50
- deriva_ml/model/database.py +455 -81
- deriva_ml/test.py +94 -0
- {deriva_ml-1.16.0.dist-info → deriva_ml-1.17.1.dist-info}/METADATA +9 -7
- {deriva_ml-1.16.0.dist-info → deriva_ml-1.17.1.dist-info}/RECORD +22 -21
- deriva_ml/model/sql_mapper.py +0 -44
- {deriva_ml-1.16.0.dist-info → deriva_ml-1.17.1.dist-info}/WHEEL +0 -0
- {deriva_ml-1.16.0.dist-info → deriva_ml-1.17.1.dist-info}/entry_points.txt +0 -0
- {deriva_ml-1.16.0.dist-info → deriva_ml-1.17.1.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.16.0.dist-info → deriva_ml-1.17.1.dist-info}/top_level.txt +0 -0
deriva_ml/dataset/upload.py
CHANGED

@@ -77,11 +77,11 @@ feature_value_regex = feature_table_dir_regex + f"{SEP}(?P=feature_name)[.](?P<e
 feature_asset_dir_regex = feature_table_dir_regex + f"{SEP}asset{SEP}(?P<asset_table>[-\\w]+)"
 feature_asset_regex = feature_asset_dir_regex + f"{SEP}(?P<file>[A-Za-z0-9_-]+)[.](?P<ext>[a-z0-9]*)$"
 
-asset_path_regex = exec_dir_regex +
+asset_path_regex = exec_dir_regex + rf"{SEP}asset{SEP}(?P<schema>[-\w]+){SEP}(?P<asset_table>[-\w]*)"
 
 asset_file_regex = r"(?P<file>[-\w]+)[.](?P<ext>[a-z0-9]*)$"
 
-table_regex = exec_dir_regex +
+table_regex = exec_dir_regex + rf"{SEP}table{SEP}(?P<schema>[-\w]+){SEP}(?P<table>[-\w]+){SEP}(?P=table)[.](csv|json)$"
 
 
 def is_feature_dir(path: Path) -> Optional[re.Match]:
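
For orientation, the rewritten patterns resolve schema, table, and asset names through named groups, and table_regex additionally requires the file stem to match the table directory via the (?P=table) backreference. A minimal sketch of the new behavior, using a hypothetical stand-in for exec_dir_regex (the real definition appears earlier in upload.py and is not part of this diff):

    import re

    SEP = "/"  # assumption: separator used when composing these patterns
    exec_dir_regex = r".*/execution/(?P<execution_rid>[-\w]+)"  # hypothetical stand-in

    table_regex = exec_dir_regex + rf"{SEP}table{SEP}(?P<schema>[-\w]+){SEP}(?P<table>[-\w]+){SEP}(?P=table)[.](csv|json)$"

    m = re.match(table_regex, "/tmp/execution/1-abc/table/demo/Subject/Subject.csv")
    assert m and m["schema"] == "demo" and m["table"] == "Subject"
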
@@ -190,7 +190,9 @@ def asset_table_upload_spec(model: DerivaModel, asset_table: str | Table):
     metadata_columns = model.asset_metadata(asset_table)
     asset_table = model.name_to_table(asset_table)
     schema = model.name_to_table(asset_table).schema.name
-
+
+    # Be careful here as a metadata value might be a string with can contain special characters.
+    metadata_path = "/".join([rf"(?P<{c}>[-:._ \w]+)" for c in metadata_columns])
     asset_path = f"{exec_dir_regex}/asset/{schema}/{asset_table.name}/{metadata_path}/{asset_file_regex}"
     asset_table = model.name_to_table(asset_table)
     schema = model.name_to_table(asset_table).schema.name
@@ -417,7 +419,7 @@ def asset_file_path(
         raise DerivaMLException(f"Metadata {metadata} does not match asset metadata {asset_metadata}")
 
     for m in asset_metadata:
-        path = path / metadata.get(m, "None")
+        path = path / str(metadata.get(m, "None"))
     path.mkdir(parents=True, exist_ok=True)
     return path / file_name
 
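
The str() coercion is the companion to the demo_catalog.py change below: asset metadata can now carry non-string values such as datetime objects, and pathlib refuses to join those onto a Path directly. A small sketch of the failure mode, with hypothetical metadata values:

    from datetime import datetime
    from pathlib import Path

    metadata = {"Subject": "1-abc", "Acquisition_Time": datetime.now()}  # hypothetical values

    path = Path("/tmp/assets")
    for m in ("Subject", "Acquisition_Time"):
        # Path / datetime raises TypeError; str() keeps the directory build robust.
        path = path / str(metadata.get(m, "None"))
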
deriva_ml/demo_catalog.py
CHANGED

@@ -5,6 +5,7 @@ import itertools
 import logging
 import string
 from collections.abc import Iterator, Sequence
+from datetime import datetime
 from numbers import Integral
 from pathlib import Path
 from random import choice, randint, random
@@ -54,7 +55,13 @@ def populate_demo_catalog(ml_instance: DerivaML) -> None:
     )
     with execution.execute() as e:
         for s in ss:
-            image_file = e.asset_file_path(
+            image_file = e.asset_file_path(
+                "Image",
+                f"test_{s['RID']}.txt",
+                Subject=s["RID"],
+                Acquisition_Time=datetime.now(),
+                Acquisition_Date=datetime.now().date(),
+            )
             with image_file.open("w") as f:
                 f.write(f"Hello there {random()}\n")
     execution.upload_execution_outputs()
@@ -343,7 +350,14 @@ def create_domain_schema(catalog: ErmrestCatalog, sname: str) -> None:
     )
     with TemporaryDirectory() as tmpdir:
         ml_instance = DerivaML(hostname=catalog.deriva_server.server, catalog_id=catalog.catalog_id, working_dir=tmpdir)
-        ml_instance.create_asset(
+        ml_instance.create_asset(
+            "Image",
+            column_defs=[
+                Column.define("Acquisition_Time", builtin_types.timestamp),
+                Column.define("Acquisition_Date", builtin_types.date),
+            ],
+            referenced_tables=[subject_table],
+        )
         catalog_annotation(ml_instance.model)
 
 
deriva_ml/execution/__init__.py
CHANGED

@@ -1,7 +1,7 @@
 from typing import TYPE_CHECKING
 
 # Safe imports - no circular dependencies
-from deriva_ml.execution.execution_configuration import ExecutionConfiguration
+from deriva_ml.execution.execution_configuration import ExecutionConfiguration, AssetRIDConfig
 from deriva_ml.execution.workflow import Workflow
 
 if TYPE_CHECKING:

@@ -22,4 +22,5 @@ __all__ = [
     "Execution",  # Lazy-loaded
     "ExecutionConfiguration",
     "Workflow",
+    "AssetRIDConfig"
 ]
deriva_ml/execution/execution.py
CHANGED

@@ -583,7 +583,6 @@ class Execution:
                     asset_rid=status.result["RID"],
                 )
             )
-
         self._update_asset_execution_table(asset_map)
         self.update_status(Status.running, "Updating features...")
 

@@ -805,7 +804,7 @@
         self,
         uploaded_assets: dict[str, list[AssetFilePath]],
         asset_role: str = "Output",
-    ):
+    ) -> None:
         """Add entry to the association table connecting an asset to an execution RID
 
         Args:

@@ -814,6 +813,9 @@
             asset_role: A term or list of terms from the Asset_Role vocabulary.
         """
         # Make sure the asset role is in the controlled vocabulary table.
+        if self._dry_run:
+            # Don't do any updates of we are doing a dry run.
+            return
         self._ml_object.lookup_term(MLVocab.asset_role, asset_role)
 
         pb = self._ml_object.pathBuilder

@@ -1098,7 +1100,7 @@
             description: Description of the files.
 
         Returns:
-            RID: Dataset RID that
+            RID: Dataset RID that identifies newly added files. Will be nested to mirror original directory structure
             of the files.
 
         Raises:
deriva_ml/execution/execution_configuration.py
CHANGED

@@ -22,15 +22,17 @@ Typical usage example:
 
 from __future__ import annotations
 
+from dataclasses import dataclass
 import json
 import sys
 from pathlib import Path
 from typing import Any
 
+from hydra_zen import builds
 from pydantic import BaseModel, ConfigDict, Field, field_validator
 
 from deriva_ml.core.definitions import RID
-from deriva_ml.dataset.aux_classes import
+from deriva_ml.dataset.aux_classes import DatasetSpec
 from deriva_ml.execution.workflow import Workflow
 
 
@@ -64,7 +66,7 @@ class ExecutionConfiguration(BaseModel):
     ... )
     """
 
-    datasets: list[DatasetSpec]
+    datasets: list[DatasetSpec] = []
     assets: list[RID] = []
     workflow: RID | Workflow
     description: str = ""
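
With the new default, datasets becomes optional when constructing a configuration. A brief sketch, assuming a bare RID string is accepted for workflow as the type annotation suggests (the RID itself is made up):

    from deriva_ml.execution import ExecutionConfiguration

    config = ExecutionConfiguration(workflow="2-abcd", description="smoke test")
    assert config.datasets == [] and config.assets == []
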
@@ -72,13 +74,13 @@
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
-
-
-
-
-
-
-
+    # @field_validator("datasets", mode="before")
+    # @classmethod
+    # def validate_datasets(cls, value: Any) -> Any:
+    #     if isinstance(value, DatasetList):
+    #         config_list: DatasetList = value
+    #         value = config_list.datasets
+    #     return value
 
     @field_validator("workflow", mode="before")
     @classmethod
@@ -137,3 +139,20 @@ class ExecutionConfiguration(BaseModel):
     # hs = HatracStore("https", self.host_name, self.credential)
     # hs.get_obj(path=configuration["URL"], destfilename=dest_file.name)
     # return ExecutionConfiguration.load_configuration(Path(dest_file.name))
+
+
+@dataclass
+class AssetRID(str):
+    rid: str
+    description: str = ""
+
+    def __new__(cls, rid: str, description: str = ""):
+        obj = super().__new__(cls, rid)
+        obj.description = description
+        return obj
+
+
+AssetRIDConfig = builds(AssetRID, populate_full_signature=True)
+
+
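
AssetRID subclasses str, so call sites that expect a bare RID string keep working while a description rides along; AssetRIDConfig is the hydra_zen structured config built from its signature. A hypothetical usage sketch:

    from hydra_zen import instantiate

    from deriva_ml.execution import AssetRIDConfig
    from deriva_ml.execution.execution_configuration import AssetRID

    rid = AssetRID("1-abc2", description="Training image archive")
    assert isinstance(rid, str) and rid == "1-abc2"
    assert rid.description == "Training image archive"

    # instantiate() on the structured config rebuilds the AssetRID.
    cfg = AssetRIDConfig(rid="1-abc2", description="Training image archive")
    assert instantiate(cfg) == "1-abc2"
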
deriva_ml/execution/workflow.py
CHANGED

@@ -9,6 +9,7 @@ from typing import Any
 import requests
 from pydantic import BaseModel, PrivateAttr, model_validator
 from requests import RequestException
+from setuptools_scm import get_version
 
 from deriva_ml.core.definitions import RID
 from deriva_ml.core.exceptions import DerivaMLException

@@ -129,6 +130,13 @@ class Workflow(BaseModel):
         self.url, self.checksum = Workflow.get_url_and_checksum(path)
         self.git_root = Workflow._get_git_root(path)
 
+        self.version = get_version(
+            root=str(self.git_root or Path.cwd()),
+            search_parent_directories=True,
+            # Optional but recommended: provide a safe fallback when tags are absent
+            fallback_version="0.0",
+        )
+
         self._logger = logging.getLogger("deriva_ml")
         return self
 
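
For context, setuptools_scm derives a PEP 440 version from git metadata rather than from a hard-coded string. A minimal sketch of the call the new Workflow code makes; the root and fallback values here are illustrative:

    from setuptools_scm import get_version

    version = get_version(
        root=".",                        # repo root; Workflow passes git_root or cwd
        search_parent_directories=True,  # walk up until a git checkout is found
        fallback_version="0.0",          # used when no tag/VCS metadata is available
    )
    # e.g. "1.17.1" on a tagged commit, "1.17.2.dev3+g1a2b3c4" a few commits past the tag
    print(version)
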
deriva_ml/model/catalog.py
CHANGED

@@ -8,7 +8,7 @@ ML-specific functionality. It handles schema management, feature definitions, an
 from __future__ import annotations
 
 # Standard library imports
-from collections import Counter
+from collections import Counter, defaultdict
 from graphlib import CycleError, TopologicalSorter
 from typing import Any, Callable, Final, Iterable, NewType, TypeAlias
 

@@ -312,7 +312,10 @@
 
         return [t for a in dataset_table.find_associations() if domain_table(t := a.other_fkeys.pop().pk_table)]
 
-    def _prepare_wide_table(self,
+    def _prepare_wide_table(self,
+                            dataset,
+                            dataset_rid: RID,
+                            include_tables: list[str]) -> tuple[dict[str, Any], list[tuple]]:
         """
         Generates details of a wide table from the model
 

@@ -327,7 +330,7 @@
         # Skip over tables that we don't want to include in the denormalized dataset.
         # Also, strip off the Dataset/Dataset_X part of the path so we don't include dataset columns in the denormalized
         # table.
-        include_tables = set(include_tables)
+        include_tables = set(include_tables)
         for t in include_tables:
             # Check to make sure the table is in the catalog.
             _ = self.name_to_table(t)

@@ -335,8 +338,11 @@
         table_paths = [
             path
             for path in self._schema_to_paths()
-            if
+            if path[-1].name in include_tables and include_tables.intersection({p.name for p in path})
         ]
+        paths_by_element = defaultdict(list)
+        for p in table_paths:
+            paths_by_element[p[2].name].append(p)
 
         # Get the names of all of the tables that can be dataset elements.
         dataset_element_tables = {

@@ -344,58 +350,57 @@
         }
 
         skip_columns = {"RCT", "RMT", "RCB", "RMB"}
-
-
-
-        for
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            (
-
-
+        element_tables = {}
+        for element_table, paths in paths_by_element.items():
+            graph = {}
+            for path in paths:
+                for left, right in zip(path[0:], path[1:]):
+                    graph.setdefault(left.name, set()).add(right.name)
+
+            # New lets remove any cycles that we may have in the graph.
+            # We will use a topological sort to find the order in which we need to join the tables.
+            # If we find a cycle, we will remove the table from the graph and splice in an additional ON clause.
+            # We will then repeat the process until there are no cycles.
+            graph_has_cycles = True
+            element_join_tables = []
+            element_join_conditions = {}
+            while graph_has_cycles:
+                try:
+                    ts = TopologicalSorter(graph)
+                    element_join_tables = list(reversed(list(ts.static_order())))
+                    graph_has_cycles = False
+                except CycleError as e:
+                    cycle_nodes = e.args[1]
+                    if len(cycle_nodes) > 3:
+                        raise DerivaMLException(f"Unexpected cycle found when normalizing dataset {cycle_nodes}")
+                    # Remove cycle from graph and splice in additional ON constraint.
+                    graph[cycle_nodes[1]].remove(cycle_nodes[0])
+
+            # The Dataset_Version table is a special case as it points to dataset and dataset to version.
+            if "Dataset_Version" in element_join_tables:
+                element_join_tables.remove("Dataset_Version")
+
+            for path in paths:
+                for left, right in zip(path[0:], path[1:]):
+                    if right.name == "Dataset_Version":
+                        # The Dataset_Version table is a special case as it points to dataset and dataset to version.
+                        continue
+                    if element_join_tables.index(right.name) < element_join_tables.index(left.name):
+                        continue
+                    table_relationship = self._table_relationship(left, right)
+                    element_join_conditions.setdefault(right.name, set()).add(
+                        (table_relationship[0], table_relationship[1])
+                    )
+            element_tables[element_table] = (element_join_tables, element_join_conditions)
         # Get the list of columns that will appear in the final denormalized dataset.
         denormalized_columns = [
             (table_name, c.name)
-            for table_name in
+            for table_name in include_tables
             if not self.is_association(table_name)  # Don't include association columns in the denormalized view.'
             for c in self.name_to_table(table_name).columns
-            if c.name not in skip_columns
+            if (not include_tables or table_name in include_tables) and (c.name not in skip_columns)
         ]
-
-        # List of dataset ids to include in the denormalized view.
-        dataset_rids = dataset.list_dataset_children(recurse=True)
-        return join_tables, tables, denormalized_columns, dataset_rids, dataset_element_tables
+        return element_tables, denormalized_columns
 
     def _table_relationship(
         self,