deriva-ml 1.16.0__py3-none-any.whl → 1.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/.DS_Store +0 -0
- deriva_ml/__init__.py +0 -10
- deriva_ml/core/base.py +18 -6
- deriva_ml/dataset/__init__.py +2 -7
- deriva_ml/dataset/aux_classes.py +21 -11
- deriva_ml/dataset/dataset.py +5 -4
- deriva_ml/dataset/dataset_bag.py +144 -151
- deriva_ml/dataset/upload.py +6 -4
- deriva_ml/demo_catalog.py +16 -2
- deriva_ml/execution/__init__.py +2 -1
- deriva_ml/execution/execution.py +4 -2
- deriva_ml/execution/execution_configuration.py +28 -9
- deriva_ml/execution/workflow.py +8 -0
- deriva_ml/model/catalog.py +55 -50
- deriva_ml/model/database.py +455 -81
- deriva_ml/test.py +94 -0
- {deriva_ml-1.16.0.dist-info → deriva_ml-1.17.0.dist-info}/METADATA +9 -7
- {deriva_ml-1.16.0.dist-info → deriva_ml-1.17.0.dist-info}/RECORD +22 -21
- deriva_ml/model/sql_mapper.py +0 -44
- {deriva_ml-1.16.0.dist-info → deriva_ml-1.17.0.dist-info}/WHEEL +0 -0
- {deriva_ml-1.16.0.dist-info → deriva_ml-1.17.0.dist-info}/entry_points.txt +0 -0
- {deriva_ml-1.16.0.dist-info → deriva_ml-1.17.0.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.16.0.dist-info → deriva_ml-1.17.0.dist-info}/top_level.txt +0 -0
deriva_ml/dataset/dataset_bag.py
CHANGED
@@ -4,8 +4,6 @@ The module implements the sqllite interface to a set of directories representing

 from __future__ import annotations

-import sqlite3
-
 # Standard library imports
 from collections import defaultdict
 from copy import copy
@@ -16,15 +14,18 @@ import deriva.core.datapath as datapath
 # Third-party imports
 import pandas as pd

+# Local imports
+from deriva.core.ermrest_model import Table
+
 # Deriva imports
-from deriva.core.ermrest_model import Column, Table
 from pydantic import ConfigDict, validate_call
+from sqlalchemy import CompoundSelect, Engine, RowMapping, Select, and_, inspect, select, union
+from sqlalchemy.orm import RelationshipProperty, Session
+from sqlalchemy.orm.util import AliasedClass

-# Local imports
 from deriva_ml.core.definitions import RID, VocabularyTerm
 from deriva_ml.core.exceptions import DerivaMLException, DerivaMLInvalidTerm
 from deriva_ml.feature import Feature
-from deriva_ml.model.sql_mapper import SQLMapper

 if TYPE_CHECKING:
     from deriva_ml.model.database import DatabaseModel
@@ -64,7 +65,8 @@ class DatasetBag:
             dataset_rid: Optional RID for the dataset.
         """
         self.model = database_model
-        self.
+        self.engine = cast(Engine, self.model.engine)
+        self.metadata = self.model.metadata

         self.dataset_rid = dataset_rid or self.model.dataset_rid
         if not self.dataset_rid:
@@ -86,54 +88,48 @@ class DatasetBag:
         """
         return self.model.list_tables()

-
-
-
-
-
-
-
-
-
-                [f'"{table_name}"."{c[1]}"' for c in dbase.execute(f'PRAGMA table_info("{table_name}")').fetchall()]
-            )
+    @staticmethod
+    def _find_relationship_attr(source, target):
+        """
+        Return the relationship attribute (InstrumentedAttribute) on `source`
+        that points to `target`. Works with classes or AliasedClass.
+        Raises LookupError if not found.
+        """
+        src_mapper = inspect(source).mapper
+        tgt_mapper = inspect(target).mapper

-        #
-
-            [f'"{self.dataset_rid}"'] + [f'"{ds.dataset_rid}"' for ds in self.list_dataset_children(recurse=True)]
-        )
+        # collect relationships on the *class* mapper (not on alias)
+        candidates: list[RelationshipProperty] = [rel for rel in src_mapper.relationships if rel.mapper is tgt_mapper]

-
-
-        paths = [
-            (
-                [f'"{self.model.normalize_table_name(t.name)}"' for t in p],
-                [self.model._table_relationship(t1, t2) for t1, t2 in zip(p, p[1:])],
-            )
-            for p in self.model._schema_to_paths()
-            if p[-1].name == table
-        ]
+        if not candidates:
+            raise LookupError(f"No relationship from {src_mapper.class_.__name__} → {tgt_mapper.class_.__name__}")

-
-
+        # Prefer MANYTOONE when multiple paths exist (often best for joins)
+        candidates.sort(key=lambda r: r.direction.name != "MANYTOONE")
+        rel = candidates[0]

-
-
+        # Bind to the actual source (alias or class)
+        return getattr(source, rel.key) if isinstance(source, AliasedClass) else rel.class_attribute

-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def _dataset_table_view(self, table: str) -> CompoundSelect[Any]:
+        """Return a SQL command that will return all of the elements in the specified table that are associated with
+        dataset_rid"""
+        table_class = self.model.get_orm_class_by_name(table)
+        dataset_table_class = self.model.get_orm_class_by_name(self._dataset_table.name)
+        dataset_rids = [self.dataset_rid] + [c.dataset_rid for c in self.list_dataset_children(recurse=True)]
+
+        paths = [[t.name for t in p] for p in self.model._schema_to_paths() if p[-1].name == table]
+        sql_cmds = []
+        for path in paths:
+            path_sql = select(table_class)
+            last_class = self.model.get_orm_class_by_name(path[0])
+            for t in path[1:]:
+                t_class = self.model.get_orm_class_by_name(t)
+                path_sql = path_sql.join(self._find_relationship_attr(last_class, t_class))
+                last_class = t_class
+            path_sql = path_sql.where(dataset_table_class.RID.in_(dataset_rids))
+            sql_cmds.append(path_sql)
+        return union(*sql_cmds)

     def get_table(self, table: str) -> Generator[tuple, None, None]:
         """Retrieve the contents of the specified table. If schema is not provided as part of the table name,
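
Aside (not part of the package diff): the new _find_relationship_attr / _dataset_table_view code leans on SQLAlchemy mapper inspection plus a union() of joined select() statements. Below is a minimal, self-contained sketch of that pattern; the Dataset/Image classes, column names, and RIDs are invented for illustration and are not deriva-ml's actual ORM models.

# Toy models standing in for the classes that DatabaseModel.get_orm_class_by_name() would return.
from sqlalchemy import ForeignKey, create_engine, inspect, select, union
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column, relationship


class Base(DeclarativeBase):
    pass


class Dataset(Base):
    __tablename__ = "Dataset"
    RID: Mapped[str] = mapped_column(primary_key=True)
    images: Mapped[list["Image"]] = relationship(back_populates="dataset")


class Image(Base):
    __tablename__ = "Image"
    RID: Mapped[str] = mapped_column(primary_key=True)
    dataset_rid: Mapped[str] = mapped_column("Dataset", ForeignKey("Dataset.RID"))
    dataset: Mapped["Dataset"] = relationship(back_populates="images")


def find_relationship_attr(source, target):
    # Same idea as DatasetBag._find_relationship_attr (plain classes only, no aliases):
    # pick the relationship on `source` whose target mapper is `target`.
    tgt_mapper = inspect(target)
    candidates = [rel for rel in inspect(source).relationships if rel.mapper is tgt_mapper]
    if not candidates:
        raise LookupError("no relationship found")
    return candidates[0].class_attribute


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
with Session(engine) as session:
    session.add(Dataset(RID="1-0001", images=[Image(RID="1-0002")]))
    session.commit()

    # Join Image back to Dataset through the discovered relationship and restrict
    # to a set of dataset RIDs, as _dataset_table_view does for each schema path.
    stmt = select(Image).join(find_relationship_attr(Image, Dataset)).where(Dataset.RID.in_(["1-0001"]))
    print(session.scalars(stmt).all())           # ORM objects
    print(session.execute(union(stmt)).all())    # same rows through a one-branch UNION
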
@@ -146,9 +142,10 @@ class DatasetBag:
             A generator that yields tuples of column values.

         """
-
-
-
+        with Session(self.engine) as session:
+            result = session.execute(self._dataset_table_view(table))
+            for row in result:
+                yield row

     def get_table_as_dataframe(self, table: str) -> pd.DataFrame:
         """Retrieve the contents of the specified table as a dataframe.
@@ -163,7 +160,7 @@ class DatasetBag:
         Returns:
             A dataframe containing the contents of the specified table.
         """
-        return pd.read_sql(self._dataset_table_view(table), self.
+        return pd.read_sql(self._dataset_table_view(table), self.engine)

     def get_table_as_dict(self, table: str) -> Generator[dict[str, Any], None, None]:
         """Retrieve the contents of the specified table as a dictionary.
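
Aside (not part of the diff): with the move off raw sqlite3, get_table_as_dataframe and denormalize_as_dataframe can hand a SQLAlchemy selectable and the Engine straight to pandas. A small sketch of that call, against an invented in-memory table:

import pandas as pd
from sqlalchemy import Column, MetaData, String, Table, create_engine, insert, select

engine = create_engine("sqlite://")            # in-memory stand-in for the bag's database
metadata = MetaData()
subject = Table("Subject", metadata, Column("RID", String, primary_key=True), Column("Name", String))
metadata.create_all(engine)

with engine.begin() as conn:
    conn.execute(insert(subject), [{"RID": "1-0001", "Name": "demo"}])

# pd.read_sql accepts a SQLAlchemy selectable plus an Engine, which is exactly
# what the rewritten get_table_as_dataframe relies on.
print(pd.read_sql(select(subject), engine))
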
@@ -176,15 +173,12 @@ class DatasetBag:
             A generator producing dictionaries containing the contents of the specified table as name/value pairs.
         """

-
-
-
-
-        result = self.database.execute(self._dataset_table_view(table))
-        while row := result.fetchone():
-            yield mapper.transform_tuple(row)
+        with Session(self.engine) as session:
+            result = session.execute(self._dataset_table_view(table))
+            for row in result.mappings():
+                yield row

-    @validate_call
+    # @validate_call
     def list_dataset_members(self, recurse: bool = False) -> dict[str, list[dict[str, Any]]]:
         """Return a list of entities associated with a specific dataset.

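
Aside (not part of the diff): get_table_as_dict now iterates Result.mappings(), which yields dict-like RowMapping objects keyed by column name, replacing the old cursor/SQLMapper loop. A tiny sketch with an invented table and row:

from sqlalchemy import create_engine, text
from sqlalchemy.orm import Session

engine = create_engine("sqlite://")
with Session(engine) as session:
    session.execute(text("CREATE TABLE Subject (RID TEXT, Name TEXT)"))
    session.execute(text("INSERT INTO Subject VALUES ('1-0001', 'demo')"))
    # .mappings() adapts each row into a read-only mapping of column name -> value.
    for row in session.execute(text("SELECT * FROM Subject")).mappings():
        print(dict(row))   # {'RID': '1-0001', 'Name': 'demo'}
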
@@ -198,39 +192,31 @@ class DatasetBag:
         # Look at each of the element types that might be in the _dataset_table and get the list of rid for them from
         # the appropriate association table.
         members = defaultdict(list)
-
-
-
-
-
-
-
-
-
-        if target_table.schema.name != self.model.domain_schema and not (
-            target_table == self._dataset_table or target_table.name == "File"
-        ):
+
+        dataset_class = self.model.get_orm_class_for_table(self._dataset_table)
+        for element_table in self.model.list_dataset_element_types():
+            element_class = self.model.get_orm_class_for_table(element_table)
+
+            assoc_class, dataset_rel, element_rel = self.model.get_orm_association_class(dataset_class, element_class)
+
+            element_table = inspect(element_class).mapped_table
+            if element_table.schema != self.model.domain_schema and element_table.name not in ["Dataset", "File"]:
                 # Look at domain tables and nested datasets.
                 continue
-            sql_target = self.model.normalize_table_name(target_table.name)
-            sql_member = self.model.normalize_table_name(member_table.name)
-
             # Get the names of the columns that we are going to need for linking
-
-            with self.database as db:
-                col_names = [c[1] for c in db.execute(f'PRAGMA table_info("{sql_target}")').fetchall()]
-                select_cols = ",".join([f'"{sql_target}".{c}' for c in col_names])
+            with Session(self.engine) as session:
                 sql_cmd = (
-
-
-
+                    select(element_class)
+                    .join(element_rel)
+                    .where(self.dataset_rid == assoc_class.__table__.c["Dataset"])
                 )
-
-
-
-
+                # Get back the list of ORM entities and convert them to dictionaries.
+                element_entities = session.scalars(sql_cmd).all()
+                element_rows = [{c.key: getattr(obj, c.key) for c in obj.__table__.columns} for obj in element_entities]
+                members[element_table.name].extend(element_rows)
+                if recurse and (element_table.name == self._dataset_table.name):
                     # Get the members for all the nested datasets and add to the member list.
-                    nested_datasets = [d["RID"] for d in
+                    nested_datasets = [d["RID"] for d in element_rows]
                     for ds in nested_datasets:
                         nested_dataset = self.model.get_dataset(ds)
                         for k, v in nested_dataset.list_dataset_members(recurse=recurse).items():
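
Aside (not part of the diff): the member listing converts ORM entities to plain dictionaries by walking obj.__table__.columns. The same trick in isolation, with an invented Subject class:

from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column


class Base(DeclarativeBase):
    pass


class Subject(Base):
    __tablename__ = "Subject"
    RID: Mapped[str] = mapped_column(primary_key=True)
    Name: Mapped[str] = mapped_column(default="")


obj = Subject(RID="1-0001", Name="demo")
# Each mapped column's key doubles as the attribute name here, so getattr()
# recovers the value without touching the database.
row = {c.key: getattr(obj, c.key) for c in obj.__table__.columns}
print(row)   # {'RID': '1-0001', 'Name': 'demo'}
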
@@ -259,12 +245,10 @@ class DatasetBag:
             Feature values.
         """
         feature = self.model.lookup_feature(table, feature_name)
-
-
-
-
-        sql_cmd = f'SELECT * FROM "{feature_table}"'
-        return cast(datapath._ResultSet, [dict(zip(col_names, r)) for r in db.execute(sql_cmd).fetchall()])
+        feature_class = self.model.get_orm_class_for_table(feature.feature_table)
+        with Session(self.engine) as session:
+            sql_cmd = select(feature_class)
+            return cast(datapath._ResultSet, [row for row in session.execute(sql_cmd).mappings()])

     def list_dataset_element_types(self) -> list[Table]:
         """
@@ -291,18 +275,18 @@ class DatasetBag:
         Returns:
             List of child dataset bags.
         """
-        ds_table = self.model.
-        nds_table = self.model.
-        dv_table = self.model.
-
+        ds_table = self.model.get_orm_class_by_name(f"{self.model.ml_schema}.Dataset")
+        nds_table = self.model.get_orm_class_by_name(f"{self.model.ml_schema}.Dataset_Dataset")
+        dv_table = self.model.get_orm_class_by_name(f"{self.model.ml_schema}.Dataset_Version")
+
+        with Session(self.engine) as session:
             sql_cmd = (
-
-
-
-
-                f'where "{nds_table}".Dataset == "{self.dataset_rid}"'
+                select(nds_table.Nested_Dataset, dv_table.Version)
+                .join_from(ds_table, nds_table, onclause=ds_table.RID == nds_table.Nested_Dataset)
+                .join_from(ds_table, dv_table, onclause=ds_table.Version == dv_table.RID)
+                .where(nds_table.Dataset == self.dataset_rid)
             )
-            nested = [DatasetBag(self.model, r[0]) for r in
+            nested = [DatasetBag(self.model, r[0]) for r in session.execute(sql_cmd).all()]

         result = copy(nested)
         if recurse:
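
Aside (not part of the diff): list_dataset_children now spells out its joins with Select.join_from(..., onclause=...), the explicit form needed when a table, like the nested-dataset association here, has more than one foreign key to the same target. A reduced sketch with invented tables and RIDs:

from sqlalchemy import Column, ForeignKey, MetaData, String, Table, create_engine, insert, select

engine = create_engine("sqlite://")
metadata = MetaData()
dataset = Table("Dataset", metadata, Column("RID", String, primary_key=True))
nested = Table(
    "Dataset_Dataset",
    metadata,
    Column("Dataset", String, ForeignKey("Dataset.RID")),
    Column("Nested_Dataset", String, ForeignKey("Dataset.RID")),
)
metadata.create_all(engine)

with engine.begin() as conn:
    conn.execute(insert(dataset), [{"RID": "1-0001"}, {"RID": "1-0002"}])
    conn.execute(insert(nested), [{"Dataset": "1-0001", "Nested_Dataset": "1-0002"}])
    # join_from names both sides of the join and the exact ON condition, so the
    # two foreign keys back to Dataset cannot make the join ambiguous.
    stmt = (
        select(nested.c.Nested_Dataset)
        .join_from(dataset, nested, onclause=dataset.c.RID == nested.c.Dataset)
        .where(nested.c.Dataset == "1-0001")
    )
    print(conn.execute(stmt).all())   # [('1-0002',)]
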
@@ -336,20 +320,19 @@ class DatasetBag:
             >>> term = ml.lookup_term("tissue_types", "epithelium")
         """
         # Get and validate vocabulary table reference
-        vocab_table = self.model.normalize_table_name(table)
         if not self.model.is_vocabulary(table):
             raise DerivaMLException(f"The table {table} is not a controlled vocabulary")

         # Search for term by name or synonym
-        for term in self.get_table_as_dict(
+        for term in self.get_table_as_dict(table):
             if term_name == term["Name"] or (term["Synonyms"] and term_name in term["Synonyms"]):
                 term["Synonyms"] = list(term["Synonyms"])
                 return VocabularyTerm.model_validate(term)

         # Term not found
-        raise DerivaMLInvalidTerm(
+        raise DerivaMLInvalidTerm(table, term_name)

-    def _denormalize(self, include_tables: list[str]
+    def _denormalize(self, include_tables: list[str]) -> Select:
         """
         Generates an SQL statement for denormalizing the dataset based on the tables to include. Processes cycles in
         graph relationships, ensures proper join order, and generates selected columns for denormalization.
@@ -361,48 +344,57 @@ class DatasetBag:
         Returns:
             str: SQL query string that represents the process of denormalization.
         """
-
-        def column_name(col: Column) -> str:
-            return f'"{self.model.normalize_table_name(col.table.name)}"."{col.name}"'
-
         # Skip over tables that we don't want to include in the denormalized dataset.
         # Also, strip off the Dataset/Dataset_X part of the path so we don't include dataset columns in the denormalized
         # table.

-
+        def find_relationship(table, join_condition):
+            side1 = (join_condition[0].table.name, join_condition[0].name)
+            side2 = (join_condition[1].table.name, join_condition[1].name)
+
+            for relationship in inspect(table).relationships:
+                local_columns = list(relationship.local_columns)[0].table.name, list(relationship.local_columns)[0].name
+                remote_side = list(relationship.remote_side)[0].table.name, list(relationship.remote_side)[0].name
+                if local_columns == side1 and remote_side == side2 or local_columns == side2 and remote_side == side1:
+                    return relationship
+            return None
+
+        join_tables, denormalized_columns = (
             self.model._prepare_wide_table(self, self.dataset_rid, include_tables)
         )

-
-
-
-
+        denormalized_columns = [
+            self.model.get_orm_class_by_name(table_name)
+            .__table__.columns[column_name]
+            .label(f"{table_name}.{column_name}")
             for table_name, column_name in denormalized_columns
         ]
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        sql_statements = []
+        for key, (path, join_conditions) in join_tables.items():
+            sql_statement = select(*denormalized_columns).select_from(
+                self.model.get_orm_class_for_table(self._dataset_table)
+            )
+            for table_name in path[1:]:  # Skip over dataset table
+                table_class = self.model.get_orm_class_by_name(table_name)
+                on_clause = [
+                    getattr(table_class, r.key)
+                    for on_condition in join_conditions[table_name]
+                    if (r := find_relationship(table_class, on_condition))
+                ]
+                sql_statement = sql_statement.join(table_class, onclause=and_(*on_clause))
+            dataset_rid_list = [self.dataset_rid] + self.list_dataset_children(recurse=True)
+            dataset_class = self.model.get_orm_class_by_name(self._dataset_table.name)
+            sql_statement = sql_statement.where(dataset_class.RID.in_(dataset_rid_list))
+            sql_statements.append(sql_statement)
+        return union(*sql_statements)
+
+    def denormalize_as_dataframe(self, include_tables: list[str]) -> pd.DataFrame:
         """
         Denormalize the dataset and return the result as a dataframe.

-
-        the dataset values into a single wide table. The result is returned as a
+        This routine will examine the domain schema for the dataset, determine which tables to include and denormalize
+        the dataset values into a single wide table. The result is returned as a generator that returns a dictionary
+        for each row in the denormalized wide table.

         The optional argument include_tables can be used to specify a subset of tables to include in the denormalized
         view. The tables in this argument can appear anywhere in the dataset schema. The method will determine which
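
Aside (not part of the diff): the wide-table build labels every selected column as "Table.column" so columns from different tables cannot collide in the denormalized result; .label() is the piece doing that. Minimal sketch with an invented table:

from sqlalchemy import Column, MetaData, String, Table, select

metadata = MetaData()
subject = Table("Subject", metadata, Column("RID", String), Column("Name", String))

# Qualify each output column with its table name, as _denormalize does.
labelled = [subject.c[name].label(f"Subject.{name}") for name in ("RID", "Name")]
print(select(*labelled))
# roughly: SELECT "Subject"."RID" AS "Subject.RID", "Subject"."Name" AS "Subject.Name" FROM "Subject"
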
@@ -412,28 +404,27 @@ class DatasetBag:
         The resulting wide table will include a column for every table needed to complete the denormalization process.

         Args:
-            include_tables: List of table names to include in the denormalized dataset.
-                is used.
+            include_tables: List of table names to include in the denormalized dataset.

         Returns:
             Dataframe containing the denormalized dataset.
         """
-        return pd.read_sql(self._denormalize(include_tables=include_tables), self.
+        return pd.read_sql(self._denormalize(include_tables=include_tables), self.engine)

-    def denormalize_as_dict(self, include_tables: list[str]
+    def denormalize_as_dict(self, include_tables: list[str]) -> Generator[RowMapping, None, None]:
         """
-        Denormalize the dataset and return the result as a set of
+        Denormalize the dataset and return the result as a set of dictionary's.

         This routine will examine the domain schema for the dataset, determine which tables to include and denormalize
-        the dataset values into a single wide table. The result is returned as a
-        for each row in the
+        the dataset values into a single wide table. The result is returned as a generator that returns a dictionary
+        for each row in the denormalized wide table.

         The optional argument include_tables can be used to specify a subset of tables to include in the denormalized
         view. The tables in this argument can appear anywhere in the dataset schema. The method will determine which
         additional tables are required to complete the denormalization process. If include_tables is not specified,
         all of the tables in the schema will be included.

-        The resulting wide table will include a column for
+        The resulting wide table will include a only those column for the tables listed in include_columns.

         Args:
             include_tables: List of table names to include in the denormalized dataset. If None, than the entire schema
@@ -442,11 +433,13 @@ class DatasetBag:
         Returns:
             A generator that returns a dictionary representation of each row in the denormalized dataset.
         """
-        with self.
-            cursor =
-
-
-
+        with Session(self.engine) as session:
+            cursor = session.execute(
+                self._denormalize(include_tables=include_tables)
+            )
+            yield from cursor.mappings()
+            for row in cursor.mappings():
+                yield row


 # Add annotations after definition to deal with forward reference issues in pydantic

deriva_ml/dataset/upload.py
CHANGED
@@ -77,11 +77,11 @@ feature_value_regex = feature_table_dir_regex + f"{SEP}(?P=feature_name)[.](?P<e
 feature_asset_dir_regex = feature_table_dir_regex + f"{SEP}asset{SEP}(?P<asset_table>[-\\w]+)"
 feature_asset_regex = feature_asset_dir_regex + f"{SEP}(?P<file>[A-Za-z0-9_-]+)[.](?P<ext>[a-z0-9]*)$"

-asset_path_regex = exec_dir_regex +
+asset_path_regex = exec_dir_regex + rf"{SEP}asset{SEP}(?P<schema>[-\w]+){SEP}(?P<asset_table>[-\w]*)"

 asset_file_regex = r"(?P<file>[-\w]+)[.](?P<ext>[a-z0-9]*)$"

-table_regex = exec_dir_regex +
+table_regex = exec_dir_regex + rf"{SEP}table{SEP}(?P<schema>[-\w]+){SEP}(?P<table>[-\w]+){SEP}(?P=table)[.](csv|json)$"


 def is_feature_dir(path: Path) -> Optional[re.Match]:
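
Aside (not part of the diff): these regexes decompose an upload path into named groups. The sketch below shows the idea; the exec_dir_regex prefix used here is a stand-in (the real one is defined earlier in upload.py and is not part of this diff), and the sample path is invented.

import re

SEP = "/"
exec_dir_regex = r".*/deriva-ml/execution/(?P<execution_rid>[-\w]+)"   # stand-in prefix for this example
asset_path_regex = exec_dir_regex + rf"{SEP}asset{SEP}(?P<schema>[-\w]+){SEP}(?P<asset_table>[-\w]*)"

m = re.match(asset_path_regex, "/tmp/deriva-ml/execution/2-ABC1/asset/demo-schema/Image")
print(m.groupdict())   # {'execution_rid': '2-ABC1', 'schema': 'demo-schema', 'asset_table': 'Image'}
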
@@ -190,7 +190,9 @@ def asset_table_upload_spec(model: DerivaModel, asset_table: str | Table):
     metadata_columns = model.asset_metadata(asset_table)
     asset_table = model.name_to_table(asset_table)
     schema = model.name_to_table(asset_table).schema.name
-
+
+    # Be careful here as a metadata value might be a string with can contain special characters.
+    metadata_path = "/".join([rf"(?P<{c}>[-:._ \w]+)" for c in metadata_columns])
     asset_path = f"{exec_dir_regex}/asset/{schema}/{asset_table.name}/{metadata_path}/{asset_file_regex}"
     asset_table = model.name_to_table(asset_table)
     schema = model.name_to_table(asset_table).schema.name
@@ -417,7 +419,7 @@ def asset_file_path(
         raise DerivaMLException(f"Metadata {metadata} does not match asset metadata {asset_metadata}")

     for m in asset_metadata:
-        path = path / metadata.get(m, "None")
+        path = path / str(metadata.get(m, "None"))
     path.mkdir(parents=True, exist_ok=True)
     return path / file_name

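
Aside (not part of the diff): the small str() wrapper above matters because demo_catalog.py now passes datetime values as asset metadata, and pathlib will not join a Path with a non-string. Sketch (values invented):

from datetime import datetime
from pathlib import Path

metadata = {"Subject": "1-0001", "Acquisition_Time": datetime.now()}
path = Path("/tmp/assets")
for m in ("Subject", "Acquisition_Time"):
    path = path / str(metadata.get(m, "None"))   # without str(), Path / datetime raises TypeError
print(path)
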
deriva_ml/demo_catalog.py
CHANGED
@@ -5,6 +5,7 @@ import itertools
 import logging
 import string
 from collections.abc import Iterator, Sequence
+from datetime import datetime
 from numbers import Integral
 from pathlib import Path
 from random import choice, randint, random
@@ -54,7 +55,13 @@ def populate_demo_catalog(ml_instance: DerivaML) -> None:
     )
     with execution.execute() as e:
         for s in ss:
-            image_file = e.asset_file_path(
+            image_file = e.asset_file_path(
+                "Image",
+                f"test_{s['RID']}.txt",
+                Subject=s["RID"],
+                Acquisition_Time=datetime.now(),
+                Acquisition_Date=datetime.now().date(),
+            )
             with image_file.open("w") as f:
                 f.write(f"Hello there {random()}\n")
     execution.upload_execution_outputs()
@@ -343,7 +350,14 @@ def create_domain_schema(catalog: ErmrestCatalog, sname: str) -> None:
     )
     with TemporaryDirectory() as tmpdir:
         ml_instance = DerivaML(hostname=catalog.deriva_server.server, catalog_id=catalog.catalog_id, working_dir=tmpdir)
-        ml_instance.create_asset(
+        ml_instance.create_asset(
+            "Image",
+            column_defs=[
+                Column.define("Acquisition_Time", builtin_types.timestamp),
+                Column.define("Acquisition_Date", builtin_types.date),
+            ],
+            referenced_tables=[subject_table],
+        )
         catalog_annotation(ml_instance.model)

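
Aside (not part of the diff): Column.define and builtin_types come from deriva-py's deriva.core.ermrest_model; define only builds the JSON-style column document the catalog API consumes, so it can be inspected offline. Sketch, with the printed dict shown only approximately:

from deriva.core.ermrest_model import Column, builtin_types

print(Column.define("Acquisition_Time", builtin_types.timestamp))
# roughly: {'name': 'Acquisition_Time', 'type': {'typename': 'timestamp'}, 'nullok': True, ...}
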
deriva_ml/execution/__init__.py
CHANGED
@@ -1,7 +1,7 @@
 from typing import TYPE_CHECKING

 # Safe imports - no circular dependencies
-from deriva_ml.execution.execution_configuration import ExecutionConfiguration
+from deriva_ml.execution.execution_configuration import ExecutionConfiguration, AssetRIDConfig
 from deriva_ml.execution.workflow import Workflow

 if TYPE_CHECKING:
@@ -22,4 +22,5 @@ __all__ = [
     "Execution",  # Lazy-loaded
     "ExecutionConfiguration",
     "Workflow",
+    "AssetRIDConfig"
 ]

deriva_ml/execution/execution.py
CHANGED
@@ -583,7 +583,6 @@ class Execution:
                     asset_rid=status.result["RID"],
                 )
             )
-
         self._update_asset_execution_table(asset_map)
         self.update_status(Status.running, "Updating features...")

@@ -805,7 +804,7 @@ class Execution:
         self,
         uploaded_assets: dict[str, list[AssetFilePath]],
         asset_role: str = "Output",
-    ):
+    ) -> None:
         """Add entry to the association table connecting an asset to an execution RID

         Args:
@@ -814,6 +813,9 @@ class Execution:
             asset_role: A term or list of terms from the Asset_Role vocabulary.
         """
         # Make sure the asset role is in the controlled vocabulary table.
+        if self._dry_run:
+            # Don't do any updates of we are doing a dry run.
+            return
         self._ml_object.lookup_term(MLVocab.asset_role, asset_role)

         pb = self._ml_object.pathBuilder

deriva_ml/execution/execution_configuration.py
CHANGED

@@ -22,15 +22,17 @@ Typical usage example:

 from __future__ import annotations

+from dataclasses import dataclass
 import json
 import sys
 from pathlib import Path
 from typing import Any

+from hydra_zen import builds
 from pydantic import BaseModel, ConfigDict, Field, field_validator

 from deriva_ml.core.definitions import RID
-from deriva_ml.dataset.aux_classes import
+from deriva_ml.dataset.aux_classes import DatasetSpec
 from deriva_ml.execution.workflow import Workflow


@@ -64,7 +66,7 @@ class ExecutionConfiguration(BaseModel):
         ... )
     """

-    datasets: list[DatasetSpec]
+    datasets: list[DatasetSpec] = []
     assets: list[RID] = []
     workflow: RID | Workflow
     description: str = ""
@@ -72,13 +74,13 @@ class ExecutionConfiguration(BaseModel):

     model_config = ConfigDict(arbitrary_types_allowed=True)

-
-
-
-
-
-
-
+    # @field_validator("datasets", mode="before")
+    # @classmethod
+    # def validate_datasets(cls, value: Any) -> Any:
+    #     if isinstance(value, DatasetList):
+    #         config_list: DatasetList = value
+    #         value = config_list.datasets
+    #     return value

     @field_validator("workflow", mode="before")
     @classmethod
@@ -137,3 +139,20 @@ class ExecutionConfiguration(BaseModel):
     # hs = HatracStore("https", self.host_name, self.credential)
     # hs.get_obj(path=configuration["URL"], destfilename=dest_file.name)
     # return ExecutionConfiguration.load_configuration(Path(dest_file.name))
+
+
+@dataclass
+class AssetRID(str):
+    rid: str
+    description: str = ""
+
+    def __new__(cls, rid: str, description: str = ""):
+        obj = super().__new__(cls, rid)
+        obj.description = description
+        return obj
+
+
+AssetRIDConfig = builds(AssetRID, populate_full_signature=True)
+
+
+
+