deriva-ml 1.14.0__py3-none-any.whl → 1.14.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/__init__.py +25 -30
- deriva_ml/core/__init__.py +39 -0
- deriva_ml/core/base.py +1489 -0
- deriva_ml/core/constants.py +36 -0
- deriva_ml/core/definitions.py +74 -0
- deriva_ml/core/enums.py +222 -0
- deriva_ml/core/ermrest.py +288 -0
- deriva_ml/core/exceptions.py +28 -0
- deriva_ml/core/filespec.py +116 -0
- deriva_ml/dataset/__init__.py +4 -0
- deriva_ml/{dataset_aux_classes.py → dataset/aux_classes.py} +16 -12
- deriva_ml/{dataset.py → dataset/dataset.py} +405 -428
- deriva_ml/{dataset_bag.py → dataset/dataset_bag.py} +137 -97
- deriva_ml/{history.py → dataset/history.py} +51 -33
- deriva_ml/{upload.py → dataset/upload.py} +48 -70
- deriva_ml/demo_catalog.py +233 -183
- deriva_ml/execution/environment.py +290 -0
- deriva_ml/{execution.py → execution/execution.py} +365 -252
- deriva_ml/execution/execution_configuration.py +163 -0
- deriva_ml/{execution_configuration.py → execution/workflow.py} +206 -218
- deriva_ml/feature.py +83 -46
- deriva_ml/model/__init__.py +0 -0
- deriva_ml/{deriva_model.py → model/catalog.py} +113 -132
- deriva_ml/{database_model.py → model/database.py} +52 -74
- deriva_ml/model/sql_mapper.py +44 -0
- deriva_ml/run_notebook.py +19 -11
- deriva_ml/schema/__init__.py +3 -0
- deriva_ml/{schema_setup → schema}/annotations.py +31 -22
- deriva_ml/schema/check_schema.py +104 -0
- deriva_ml/{schema_setup → schema}/create_schema.py +151 -104
- deriva_ml/schema/deriva-ml-reference.json +8525 -0
- deriva_ml/schema/table_comments_utils.py +57 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/METADATA +5 -4
- deriva_ml-1.14.26.dist-info/RECORD +40 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/entry_points.txt +1 -0
- deriva_ml/deriva_definitions.py +0 -391
- deriva_ml/deriva_ml_base.py +0 -1046
- deriva_ml/execution_environment.py +0 -139
- deriva_ml/schema_setup/table_comments_utils.py +0 -56
- deriva_ml/test-files/execution-parameters.json +0 -1
- deriva_ml/test-files/notebook-parameters.json +0 -5
- deriva_ml/test_functions.py +0 -141
- deriva_ml/test_notebook.ipynb +0 -197
- deriva_ml-1.14.0.dist-info/RECORD +0 -31
- /deriva_ml/{schema_setup → execution}/__init__.py +0 -0
- /deriva_ml/{schema_setup → schema}/policy.json +0 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/WHEEL +0 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/top_level.txt +0 -0
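
Most of the change between these two versions is a reorganization of the flat modules into core, dataset, execution, model, and schema subpackages, as the renames above show. The sketch below illustrates imports against the new layout, using only module paths that appear in the renames and in the hunks that follow; it is not a complete map of the public API.

    # Import paths taken from the 1.14.26 sources shown below; a sketch, not an API reference.
    from deriva_ml.core.definitions import RID, VocabularyTerm
    from deriva_ml.core.exceptions import DerivaMLException, DerivaMLInvalidTerm
    from deriva_ml.feature import Feature
    from deriva_ml.model.sql_mapper import SQLMapper
    from deriva_ml.dataset.dataset_bag import DatasetBag  # path inferred from the dataset_bag.py rename above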
deriva_ml/{dataset_bag.py → dataset/dataset_bag.py}

@@ -2,20 +2,32 @@
 The module implements the sqllite interface to a set of directories representing a dataset bag.
 """
 
-from
-
+from __future__ import annotations
+
+import sqlite3
 
+# Standard library imports
 from collections import defaultdict
 from copy import copy
-from typing import Any, Generator,
+from typing import TYPE_CHECKING, Any, Generator, Iterable, cast
+
+import deriva.core.datapath as datapath
 
+# Third-party imports
 import pandas as pd
-
-
-from .
+
+# Deriva imports
+from deriva.core.ermrest_model import Column, Table
+from pydantic import ConfigDict, validate_call
+
+# Local imports
+from deriva_ml.core.definitions import RID, VocabularyTerm
+from deriva_ml.core.exceptions import DerivaMLException, DerivaMLInvalidTerm
+from deriva_ml.feature import Feature
+from deriva_ml.model.sql_mapper import SQLMapper
 
 if TYPE_CHECKING:
-    from .
+    from deriva_ml.model.database import DatabaseModel
 
 try:
     from icecream import ic
@@ -24,41 +36,41 @@ except ImportError: # Graceful fallback if IceCream isn't installed.
 
 
 class DatasetBag:
-    """
-
+    """
+    DatasetBag is a class that manages a materialized bag. It is created from a locally materialized
+    BDBag for a dataset_table, which is created either by DerivaML.create_execution, or directly by
+    calling DerivaML.download_dataset.
 
-    A general a bag may contain multiple datasets, if the dataset is nested. The DatasetBag is used to
-    one of the datasets in the bag.
+    A general a bag may contain multiple datasets, if the dataset is nested. The DatasetBag is used to
+    represent only one of the datasets in the bag.
 
     All the metadata associated with the dataset is stored in a SQLLite database that can be queried using SQL.
 
-    Attributes
+    Attributes:
         dataset_rid (RID): RID for the specified dataset
         version: The version of the dataset
         model (DatabaseModel): The Database model that has all the catalog metadata associated with this dataset.
        database:
-        dbase (Connection): connection to the sqlite database holding table values
+        dbase (sqlite3.Connection): connection to the sqlite database holding table values
        domain_schema (str): Name of the domain schema
    """
 
-
-    def __init__(
-        self, database_model: "DatabaseModel", dataset_rid: Optional[RID]
-    ) -> None:
+    def __init__(self, database_model: DatabaseModel, dataset_rid: RID | None = None) -> None:
        """
        Initialize a DatasetBag instance.
 
        Args:
            database_model: Database version of the bag.
+            dataset_rid: Optional RID for the dataset.
        """
-
        self.model = database_model
-        self.database = self.model.dbase
+        self.database = cast(sqlite3.Connection, self.model.dbase)
 
        self.dataset_rid = dataset_rid or self.model.dataset_rid
-        self.
-
-
+        if not self.dataset_rid:
+            raise DerivaMLException("No dataset RID provided")
+
+        self.model.rid_lookup(self.dataset_rid)  # Check to make sure that this dataset is in the bag.
 
        self.version = self.model.dataset_version(self.dataset_rid)
        self._dataset_table = self.model.dataset_table
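
With the new signature above, a DatasetBag is built from a DatabaseModel and an optional dataset RID, and __init__ now raises DerivaMLException when no RID can be resolved. A minimal sketch, assuming a DatabaseModel has already been obtained from a locally materialized bag (how that object is produced is outside this hunk):

    # Sketch only: `database_model` is assumed to come from a materialized BDBag
    # (see the class docstring above for how bags are downloaded).
    from deriva_ml.dataset.dataset_bag import DatasetBag

    def open_bag(database_model, dataset_rid=None) -> DatasetBag:
        # dataset_rid defaults to the bag's own dataset; __init__ raises
        # DerivaMLException if no RID can be resolved.
        bag = DatasetBag(database_model, dataset_rid)
        print(bag.dataset_rid, bag.version)
        return bag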
@@ -75,20 +87,24 @@ class DatasetBag:
         return self.model.list_tables()
 
     def _dataset_table_view(self, table: str) -> str:
+        """Return a SQL command that will return all of the elements in the specified table that are associated with
+        dataset_rid"""
+
         table_name = self.model.normalize_table_name(table)
+
+        # Get the names of the columns in the table.
         with self.database as dbase:
             select_args = ",".join(
-                [
-                    f'"{table_name}"."{c[1]}"'
-                    for c in dbase.execute(
-                        f'PRAGMA table_info("{table_name}")'
-                    ).fetchall()
-                ]
+                [f'"{table_name}"."{c[1]}"' for c in dbase.execute(f'PRAGMA table_info("{table_name}")').fetchall()]
             )
+
+            # Get the list of datasets in the bag including the dataset itself.
             datasets = ",".join(
-                [f'"{self.dataset_rid}"']
-                + [f'"{ds.dataset_rid}"' for ds in self.list_dataset_children(recurse=True)]
+                [f'"{self.dataset_rid}"'] + [f'"{ds.dataset_rid}"' for ds in self.list_dataset_children(recurse=True)]
             )
+
+            # Find the paths that terminate in the table we are looking for
+            # Assemble the ON clause by looking at each table pair, and looking up the FK columns that connect them.
             paths = [
                 (
                     [f'"{self.model.normalize_table_name(t.name)}"' for t in p],
@@ -99,21 +115,23 @@ class DatasetBag:
             ]
 
             sql = []
-            dataset_table_name = (
-                f'"{self.model.normalize_table_name(self._dataset_table.name)}"'
-            )
+            dataset_table_name = f'"{self.model.normalize_table_name(self._dataset_table.name)}"'
 
             def column_name(col: Column) -> str:
                 return f'"{self.model.normalize_table_name(col.table.name)}"."{col.name}"'
 
             for ts, on in paths:
                 tables = " JOIN ".join(ts)
-                on_expression = " and ".join(
-                    [f"{column_name(left)}={column_name(right)}" for left, right in on]
-                )
+                on_expression = " and ".join([f"{column_name(left)}={column_name(right)}" for left, right in on])
                 sql.append(
-                    f"SELECT {select_args} FROM {tables}
+                    f"SELECT {select_args} FROM {tables} "
+                    f"{'ON ' + on_expression if on_expression else ''} "
+                    f"WHERE {dataset_table_name}.RID IN ({datasets})"
                 )
+            if table_name == self.model.normalize_table_name(self._dataset_table.name):
+                sql.append(
+                    f"SELECT {select_args} FROM {dataset_table_name} WHERE {dataset_table_name}.RID IN ({datasets})"
+                )
             sql = " UNION ".join(sql) if len(sql) > 1 else sql[0]
             return sql
 
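
For orientation, the statement assembled above is a UNION of one SELECT per join path that ends at the requested table, each filtered to the dataset RIDs present in the bag. An illustrative sketch of the shape only, with made-up schema, table, and RID names:

    # Hypothetical shape; "demo:Image", "deriva-ml:Dataset_Image" and the RIDs are
    # invented for illustration, not captured from the package.
    example_sql = (
        'SELECT "demo:Image"."RID","demo:Image"."URL" '
        'FROM "deriva-ml:Dataset" JOIN "deriva-ml:Dataset_Image" JOIN "demo:Image" '
        'ON "deriva-ml:Dataset"."RID"="deriva-ml:Dataset_Image"."Dataset" '
        'and "deriva-ml:Dataset_Image"."Image"="demo:Image"."RID" '
        'WHERE "deriva-ml:Dataset".RID IN ("1-A123","1-A456")'
    )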
@@ -157,120 +175,105 @@ class DatasetBag:
         Returns:
             A generator producing dictionaries containing the contents of the specified table as name/value pairs.
         """
+
         table_name = self.model.normalize_table_name(table)
-
-
-
-                for c in dbase.execute(f'PRAGMA table_info("{table_name}")').fetchall()
-            ]
+        schema, table = table_name.split(":")
+        with self.database as _dbase:
+            mapper = SQLMapper(self.model, table)
         result = self.database.execute(self._dataset_table_view(table))
         while row := result.fetchone():
-            yield
+            yield mapper.transform_tuple(row)
 
     @validate_call
-    def list_dataset_members(self, recurse: bool = False) -> dict[str, dict[str,
-        """Return a list of entities associated with a specific
+    def list_dataset_members(self, recurse: bool = False) -> dict[str, list[dict[str, Any]]]:
+        """Return a list of entities associated with a specific dataset.
 
         Args:
-            recurse:
+            recurse: Whether to include nested datasets.
 
         Returns:
-            Dictionary of entities associated with
-            were taken.
+            Dictionary of entities associated with the dataset.
         """
 
         # Look at each of the element types that might be in the _dataset_table and get the list of rid for them from
         # the appropriate association table.
         members = defaultdict(list)
         for assoc_table in self._dataset_table.find_associations():
-
-
-
+            member_fkey = assoc_table.other_fkeys.pop()
+            if member_fkey.pk_table.name == "Dataset" and member_fkey.foreign_key_columns[0].name != "Nested_Dataset":
+                # Sometimes find_assoc gets confused on Dataset_Dataset.
+                member_fkey = assoc_table.self_fkey
+
+            target_table = member_fkey.pk_table
             member_table = assoc_table.table
 
-            if (
-                target_table.
-                and target_table != self._dataset_table
+            if target_table.schema.name != self.model.domain_schema and not (
+                target_table == self._dataset_table or target_table.name == "File"
             ):
                 # Look at domain tables and nested datasets.
                 continue
-            if target_table == self._dataset_table:
-                # find_assoc gives us the keys in the wrong position, so swap.
-                self_fkey, other_fkey = other_fkey, self_fkey
             sql_target = self.model.normalize_table_name(target_table.name)
             sql_member = self.model.normalize_table_name(member_table.name)
 
             # Get the names of the columns that we are going to need for linking
-            member_link = tuple(
-                c.name for c in next(iter(other_fkey.column_map.items()))
-            )
-
+            member_link = tuple(c.name for c in next(iter(member_fkey.column_map.items())))
             with self.database as db:
-                col_names = [
-                    c[1]
-                    for c in db.execute(f'PRAGMA table_info("{sql_target}")').fetchall()
-                ]
+                col_names = [c[1] for c in db.execute(f'PRAGMA table_info("{sql_target}")').fetchall()]
                 select_cols = ",".join([f'"{sql_target}".{c}' for c in col_names])
                 sql_cmd = (
                     f'SELECT {select_cols} FROM "{sql_member}" '
                     f'JOIN "{sql_target}" ON "{sql_member}".{member_link[0]} = "{sql_target}".{member_link[1]} '
                     f'WHERE "{self.dataset_rid}" = "{sql_member}".Dataset;'
                 )
-
-
-                ]
-                members[target_table.name].extend(target_entities)
-
-                target_entities = []  # path.entities().fetch()
+                mapper = SQLMapper(self.model, sql_target)
+                target_entities = [mapper.transform_tuple(e) for e in db.execute(sql_cmd).fetchall()]
             members[target_table.name].extend(target_entities)
-            if recurse and target_table.name == self._dataset_table:
+            if recurse and (target_table.name == self._dataset_table.name):
                 # Get the members for all the nested datasets and add to the member list.
                 nested_datasets = [d["RID"] for d in target_entities]
                 for ds in nested_datasets:
-
-
-                    ).items():
+                    nested_dataset = self.model.get_dataset(ds)
+                    for k, v in nested_dataset.list_dataset_members(recurse=recurse).items():
                         members[k].extend(v)
         return dict(members)
 
     def find_features(self, table: str | Table) -> Iterable[Feature]:
-        """
+        """Find features for a table.
+
         Args:
             table: The table to find features for.
-            table: Table | str:
 
         Returns:
-            An iterable of
+            An iterable of Feature instances.
         """
         return self.model.find_features(table)
 
-
-
-        self, table: Table | str, feature_name: str
-    ) -> datapath._ResultSet:
-        """Return a datapath ResultSet containing all values of a feature associated with a table.
+    def list_feature_values(self, table: Table | str, feature_name: str) -> datapath._ResultSet:
+        """Return feature values for a table.
 
         Args:
-            table:
-
-            feature_name: str:
+            table: The table to get feature values for.
+            feature_name: Name of the feature.
 
         Returns:
-
+            Feature values.
         """
         feature = self.model.lookup_feature(table, feature_name)
         feature_table = self.model.normalize_table_name(feature.feature_table.name)
+
         with self.database as db:
+            col_names = [c[1] for c in db.execute(f'PRAGMA table_info("{feature_table}")').fetchall()]
             sql_cmd = f'SELECT * FROM "{feature_table}"'
-            return db.execute(sql_cmd).fetchall()
+            return cast(datapath._ResultSet, [dict(zip(col_names, r)) for r in db.execute(sql_cmd).fetchall()])
 
-
-
-        """Given a _dataset_table RID, return a list of RIDs of any nested datasets.
+    def list_dataset_children(self, recurse: bool = False) -> list[DatasetBag]:
+        """Get nested datasets.
 
-
-
+        Args:
+            recurse: Whether to include children of children.
 
+        Returns:
+            List of child dataset bags.
         """
         ds_table = self.model.normalize_table_name("Dataset")
         nds_table = self.model.normalize_table_name("Dataset_Dataset")
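
Together these accessors form a small read-only query API over the bag's SQLite contents. A hedged usage sketch, with the "Image" table name invented for illustration and the method signatures as shown above:

    # Sketch: `bag` is a DatasetBag; "Image" is a hypothetical domain table name.
    def summarize(bag) -> None:
        members = bag.list_dataset_members(recurse=True)   # dict[str, list[dict[str, Any]]]
        for table_name, rows in members.items():
            print(table_name, len(rows))

        for row in bag.get_table_as_dict("Image"):          # generator of column/value dicts
            print(row["RID"])

        for child in bag.list_dataset_children():
            print("nested dataset:", child.dataset_rid, child.version)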
@@ -283,9 +286,7 @@ class DatasetBag:
                 f'"{nds_table}".Nested_Dataset == "{ds_table}".RID '
                 f'where "{nds_table}".Dataset == "{self.dataset_rid}"'
             )
-            nested = [
-                DatasetBag(self.model, r[0]) for r in db.execute(sql_cmd).fetchall()
-            ]
+            nested = [DatasetBag(self.model, r[0]) for r in db.execute(sql_cmd).fetchall()]
 
         result = copy(nested)
         if recurse:
@@ -293,10 +294,49 @@
             result.extend(child.list_dataset_children(recurse))
         return result
 
+    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
+    def lookup_term(self, table: str | Table, term_name: str) -> VocabularyTerm:
+        """Finds a term in a vocabulary table.
+
+        Searches for a term in the specified vocabulary table, matching either the primary name
+        or any of its synonyms.
+
+        Args:
+            table: Vocabulary table to search in (name or Table object).
+            term_name: Name or synonym of the term to find.
+
+        Returns:
+            VocabularyTerm: The matching vocabulary term.
+
+        Raises:
+            DerivaMLVocabularyException: If the table is not a vocabulary table, or term is not found.
+
+        Examples:
+            Look up by primary name:
+            >>> term = ml.lookup_term("tissue_types", "epithelial")
+            >>> print(term.description)
+
+            Look up by synonym:
+            >>> term = ml.lookup_term("tissue_types", "epithelium")
+        """
+        # Get and validate vocabulary table reference
+        vocab_table = self.model.normalize_table_name(table)
+        if not self.model.is_vocabulary(table):
+            raise DerivaMLException(f"The table {table} is not a controlled vocabulary")
+
+        # Search for term by name or synonym
+        for term in self.get_table_as_dict(vocab_table):
+            if term_name == term["Name"] or (term["Synonyms"] and term_name in term["Synonyms"]):
+                term["Synonyms"] = list(term["Synonyms"])
+                return VocabularyTerm.model_validate(term)
+
+        # Term not found
+        raise DerivaMLInvalidTerm(vocab_table, term_name)
+
 
 # Add annotations after definition to deal with forward reference issues in pydantic
 
 DatasetBag.list_dataset_children = validate_call(
-    config=
+    config=ConfigDict(arbitrary_types_allowed=True),
     validate_return=True,
 )(DatasetBag.list_dataset_children)
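
Because lookup_term raises DerivaMLInvalidTerm when a term is missing, callers probing for optional terms may prefer a small wrapper; a sketch reusing the vocabulary names from the docstring example above:

    from deriva_ml.core.exceptions import DerivaMLInvalidTerm

    def term_or_none(bag, vocab_table: str, name: str):
        # Returns the VocabularyTerm for `name`, or None if the table has no such term or synonym.
        try:
            return bag.lookup_term(vocab_table, name)
        except DerivaMLInvalidTerm:
            return None

    # e.g. term_or_none(bag, "tissue_types", "epithelium")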

deriva_ml/{history.py → dataset/history.py}

@@ -1,10 +1,30 @@
+import base64
+import struct
 from datetime import datetime
+
 from dateutil.parser import isoparse
 from deriva.core import urlquote
 
 
 # -- ==============================================================================================
 def get_record_history(server, cid, sname, tname, kvals, kcols=["RID"], snap=None):
+    """Get the history of a record from the catalog.
+
+    Args:
+        server: The server instance.
+        cid: The catalog ID.
+        sname: The schema name.
+        tname: The table name.
+        kvals: The key values to look up.
+        kcols: The key columns. Defaults to ["RID"].
+        snap: Optional snapshot ID.
+
+    Returns:
+        The history data for the record.
+
+    Raises:
+        ValueError: If more than one row is returned.
+    """
     parts = {
         "cid": urlquote(cid),
         "sname": urlquote(sname),
@@ -30,13 +50,13 @@ def get_record_history(server, cid, sname, tname, kvals, kcols=["RID"], snap=None):
     while True:
         url = path % parts
         # sys.stderr.write("%s\n" % url)
-
-        if len(
+        response_data = server.get(url).json()
+        if len(response_data) > 1:
             raise ValueError("got more than one row for %r" % url)
-        if len(
+        if len(response_data) == 0:
             # sys.stderr.write("ERROR: %s: No record found \n" % (url))
             break
-        row =
+        row = response_data[0]
         snap2rows[parts["snap"]] = row
         rows_found.append(row)
         rmt = datetime.fromisoformat(row["RMT"])
@@ -48,8 +68,15 @@
 
 # -- --------------------------------------------------------------------------------------
 def datetime_epoch_us(dt):
-    """
-
+    """Convert datetime to epoch microseconds.
+
+    Args:
+        dt: The datetime object to convert.
+
+    Returns:
+        The epoch time in microseconds.
+    """
+    return int(dt.timestamp() * 1000000)
 
 
 # -- --------------------------------------------------------------------------------------
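
The new datetime_epoch_us body simply scales the POSIX timestamp to microseconds; for example, 2024-01-01T00:00:00Z is 1,704,067,200 seconds after the epoch:

    from datetime import datetime, timezone

    # Same arithmetic as datetime_epoch_us above.
    dt = datetime(2024, 1, 1, tzinfo=timezone.utc)
    assert int(dt.timestamp() * 1000000) == 1_704_067_200_000_000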
@@ -58,34 +85,25 @@ def datetime_epoch_us(dt):
 
 
 def iso_to_snap(iso_datetime):
-
-
+    """Convert ISO datetime string to snapshot format.
+
+    Args:
+        iso_datetime: The ISO datetime string.
+
+    Returns:
+        The snapshot timestamp.
+    """
+    return datetime_epoch_us(isoparse(iso_datetime))
 
 
 # -- --------------------------------------------------------------------------------------
 def urlb32_encode(i):
-    """Encode integer
-
-
-
-
-
-
-
-
-    for d in range(1, 14):
-        if d > 2 and ((d - 1) % 4) == 0:
-            encoded_rev.append("-")
-        code = "0123456789ABCDEFGHJKMNPQRSTVWXYZ"[raw % 32]
-        encoded_rev.append(code)
-        raw = raw // 32
-
-    while encoded_rev and encoded_rev[-1] in {"0", "-"}:
-        del encoded_rev[-1]
-
-    if not encoded_rev:
-        encoded_rev = ["0"]
-
-    encoded = reversed(encoded_rev)
-
-    return "".join(encoded)
+    """Encode an integer to URL-safe base32.
+
+    Args:
+        i: The integer to encode.
+
+    Returns:
+        The URL-safe base32 encoded string.
+    """
+    return base64.urlsafe_b64encode(struct.pack(">Q", i)).decode("ascii").rstrip("=")
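
The rewritten urlb32_encode packs the integer into eight big-endian bytes and URL-safe base64 encodes them with the padding stripped (the function keeps its earlier base32-style name). A quick check of the expression:

    import base64
    import struct

    # Same expression as the new function body; 1 packs to b"\x00" * 7 + b"\x01",
    # which encodes to "AAAAAAAAAAE=" before the padding is removed.
    assert base64.urlsafe_b64encode(struct.pack(">Q", 1)).decode("ascii").rstrip("=") == "AAAAAAAAAAE"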