PyPI - deriva-ml - Versions diffs - 1.14.46__tar.gz → 1.16.0__tar.gz - Mend

deriva-ml 1.14.46tar.gz → 1.16.0tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (116) hide show

{deriva_ml-1.14.46 → deriva_ml-1.16.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deriva-ml
-Version: 1.14.46
+Version: 1.16.0
 Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
 Author-email: ISRD <isrd-dev@isi.edu>
 Requires-Python: >=3.10
@@ -21,6 +21,7 @@ Requires-Dist: nbstripout
 Requires-Dist: papermill
 Requires-Dist: pandas-stubs==2.2.3.250527
 Requires-Dist: pyyaml
+Requires-Dist: hydra_zen
 Dynamic: license-file
 # DerivaML

{deriva_ml-1.14.46 → deriva_ml-1.16.0}/pyproject.toml RENAMED Viewed

@@ -22,7 +22,8 @@ dependencies = [
     "nbstripout",
     "papermill",
     "pandas-stubs==2.2.3.250527",
-    "pyyaml"
+    "pyyaml",
+    "hydra_zen",
 ]
 [project.scripts]

deriva_ml-1.16.0/src/deriva_ml/__init__.py ADDED Viewed

@@ -0,0 +1,87 @@
+from importlib.metadata import PackageNotFoundError, version
+from typing import TYPE_CHECKING
+# Safe imports - no circular dependencies
+from deriva_ml.core.config import DerivaMLConfig
+from deriva_ml.core.definitions import (
+    RID,
+    BuiltinTypes,
+    ColumnDefinition,
+    DerivaAssetColumns,
+    DerivaSystemColumns,
+    ExecAssetType,
+    ExecMetadataType,
+    FileSpec,
+    FileUploadState,
+    ForeignKeyDefinition,
+    KeyDefinition,
+    MLAsset,
+    MLVocab,
+    TableDefinition,
+    UploadState,
+)
+from deriva_ml.core.exceptions import (
+    DerivaMLException,
+    DerivaMLInvalidTerm,
+    DerivaMLTableTypeError,
+)
+from deriva_ml.dataset.aux_classes import DatasetConfig, DatasetConfigList, DatasetSpec, DatasetVersion
+from .execution import Execution, ExecutionConfiguration, Workflow
+# Type-checking only - avoid circular import at runtime
+if TYPE_CHECKING:
+    from deriva_ml.core.base import DerivaML
+# Lazy import function for runtime usage
+def __getattr__(name):
+    """Lazy import to avoid circular dependencies."""
+    if name == "DerivaML":
+        from deriva_ml.core.base import DerivaML
+        return DerivaML
+    elif name == "Execution":
+        from deriva_ml.execution.execution import Execution
+        return Execution
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+__all__ = [
+    "DerivaML",  # Lazy-loaded
+    "DerivaMLConfig",
+    "DatasetConfig",
+    "DatasetConfigList",
+    "DatasetSpec",
+    "DatasetVersion",
+    "Execution",
+    "ExecutionConfiguration",
+    "Workflow",
+    # Exceptions
+    "DerivaMLException",
+    "DerivaMLInvalidTerm",
+    "DerivaMLTableTypeError",
+    # Definitions
+    "RID",
+    "BuiltinTypes",
+    "ColumnDefinition",
+    "DerivaSystemColumns",
+    "DerivaAssetColumns",
+    "ExecAssetType",
+    "ExecMetadataType",
+    "FileSpec",
+    "FileUploadState",
+    "ForeignKeyDefinition",
+    "KeyDefinition",
+    "MLAsset",
+    "MLVocab",
+    "TableDefinition",
+    "UploadState",
+]
+try:
+    __version__ = version("deriva_ml")
+except PackageNotFoundError:
+    # package is not installed
+    pass

{deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/bump_version.py RENAMED Viewed

@@ -105,7 +105,7 @@ def main() -> int:
     # Find latest semver tag with prefix
     tag = latest_semver_tag(prefix)
+    print(f"Latest semver tag: {tag}")
     if not tag:
         seed_initial_tag(f"{prefix}{start}")
         print(f"Seeded {prefix}{start}. Done.")

{deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/core/__init__.py RENAMED Viewed

@@ -1,4 +1,5 @@
 from deriva_ml.core.base import DerivaML
+from deriva_ml.core.config import DerivaMLConfig
 from deriva_ml.core.definitions import (
     RID,
     BuiltinTypes,
@@ -17,12 +18,11 @@ from deriva_ml.core.exceptions import DerivaMLException, DerivaMLInvalidTerm, De
 __all__ = [
     "DerivaML",
+    "DerivaMLConfig",
     # Exceptions
     "DerivaMLException",
     "DerivaMLInvalidTerm",
     "DerivaMLTableTypeError",
     # Definitions
     "RID",
     "BuiltinTypes",

{deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/core/base.py RENAMED Viewed

@@ -15,7 +15,6 @@ from __future__ import annotations  # noqa: I001
 # Standard library imports
 from collections import defaultdict
-import getpass
 import logging
 from datetime import datetime
 from itertools import chain
@@ -29,12 +28,7 @@ import requests
 from pydantic import ConfigDict, validate_call
 # Deriva imports
-from deriva.core import (
-    DEFAULT_SESSION_CONFIG,
-    format_exception,
-    get_credential,
-    urlquote,
-)
+from deriva.core import DEFAULT_SESSION_CONFIG, format_exception, get_credential, urlquote, init_logging
 import deriva.core.datapath as datapath
 from deriva.core.datapath import DataPathException, _SchemaWrapper as SchemaWrapper
@@ -55,6 +49,7 @@ from deriva_ml.core.definitions import (
     TableDefinition,
     VocabularyTerm,
 )
+from deriva_ml.core.config import DerivaMLConfig
 from deriva_ml.core.exceptions import DerivaMLTableTypeError, DerivaMLException
 from deriva_ml.dataset.aux_classes import DatasetSpec
 from deriva_ml.dataset.dataset import Dataset
@@ -116,8 +111,10 @@ class DerivaML(Dataset):
         project_name: str | None = None,
         cache_dir: str | Path | None = None,
         working_dir: str | Path | None = None,
+        hydra_runtime_output_dir: str | Path | None = None,
         ml_schema: str = ML_SCHEMA,
         logging_level=logging.WARNING,
+        deriva_logging_level=logging.WARNING,
         credential=None,
         use_minid: bool = True,
         check_auth: bool = True,
@@ -166,12 +163,10 @@ class DerivaML(Dataset):
         self.model = DerivaModel(self.catalog.getCatalogModel(), domain_schema=domain_schema)
         # Set up working and cache directories
-        default_workdir = self.__class__.__name__ + "_working"
-        self.working_dir = (
-            Path(working_dir) / getpass.getuser() if working_dir else Path.home() / "deriva-ml"
-        ) / default_workdir
+        self.working_dir = DerivaMLConfig.compute_workdir(working_dir)
         self.working_dir.mkdir(parents=True, exist_ok=True)
+        self.hydra_runtime_output_dir = hydra_runtime_output_dir
         self.cache_dir = Path(cache_dir) if cache_dir else self.working_dir / "cache"
         self.cache_dir.mkdir(parents=True, exist_ok=True)
@@ -182,6 +177,11 @@ class DerivaML(Dataset):
         self._logger = logging.getLogger("deriva_ml")
         self._logger.setLevel(logging_level)
+        # Configure deriva logging level
+        init_logging(deriva_logging_level)
+        logging.getLogger("bagit").setLevel(deriva_logging_level)
+        logging.getLogger("bdbag").setLevel(deriva_logging_level)
         # Store instance configuration
         self.host_name = hostname
         self.catalog_id = catalog_id

deriva_ml-1.16.0/src/deriva_ml/core/config.py ADDED Viewed

@@ -0,0 +1,67 @@
+import logging
+from pathlib import Path
+from typing import Any
+from hydra.conf import HydraConf, RunDir
+from hydra.core.hydra_config import HydraConfig
+from hydra_zen import store
+from omegaconf import OmegaConf
+from pydantic import BaseModel, model_validator
+from deriva_ml.core.definitions import ML_SCHEMA
+class DerivaMLConfig(BaseModel):
+    hostname: str
+    catalog_id: str | int = 1
+    domain_schema: str | None = None
+    project_name: str | None = None
+    cache_dir: str | Path | None = None
+    working_dir: str | Path | None = None
+    hydra_runtime_output_dir: str | Path | None = None
+    ml_schema: str = ML_SCHEMA
+    logging_level: Any = logging.WARNING
+    deriva_logging_level: Any = logging.WARNING
+    credential: Any = None
+    use_minid: bool = True
+    check_auth: bool = True
+    @model_validator(mode="after")
+    def init_working_dir(self):
+        """
+        Sets up the working directory for the model.
+        This method configures the working directory, ensuring that all required
+        file operations are performed in the appropriate location. If the user does not
+        specify a directory, a default directory named "deriva-ml" under the user's home
+        directory will be used.
+        This is a repeat of what is in the DerivaML.__init__ but we put this here so that the working
+        directory is available to hydra.
+        Returns:
+            Self: The object instance with the working directory initialized.
+        """
+        self.working_dir = DerivaMLConfig.compute_workdir(self.working_dir)
+        self.hydra_runtime_output_dir = Path(HydraConfig.get().runtime.output_dir)
+        return self
+    @staticmethod
+    def compute_workdir(working_dir) -> Path:
+        # Create a default working directory if none is provided
+        working_dir = Path(working_dir) if working_dir else Path.home() / "deriva-ml"
+        return working_dir.absolute()
+OmegaConf.register_new_resolver("compute_workdir", DerivaMLConfig.compute_workdir, replace=True)
+store(
+    HydraConf(
+        run=RunDir("${compute_workdir:${deriva_ml.working_dir}}/hydra/${now:%Y-%m-%d_%H-%M-%S}"),
+        output_subdir="hydra-config",
+    ),
+    group="hydra",
+    name="config",
+)
+store.add_to_hydra_store()

deriva_ml-1.16.0/src/deriva_ml/dataset/__init__.py ADDED Viewed

@@ -0,0 +1,17 @@
+from typing import Protocol, runtime_checkable
+from deriva_ml.core.definitions import RID
+from .aux_classes import DatasetConfig, DatasetConfigList, DatasetSpec, DatasetVersion, VersionPart
+from .dataset import Dataset
+from .dataset_bag import DatasetBag
+__all__ = [
+    "Dataset",
+    "DatasetSpec",
+    "DatasetConfig",
+    "DatasetConfigList",
+    "DatasetBag",
+    "DatasetVersion",
+    "VersionPart",
+]

{deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/dataset/aux_classes.py RENAMED Viewed

@@ -5,6 +5,7 @@ THis module defines the DataSet class with is used to manipulate n
 from enum import Enum
 from typing import Any, Optional, SupportsInt
+from hydra_zen import hydrated_dataclass
 from pydantic import (
     BaseModel,
     ConfigDict,
@@ -182,8 +183,9 @@ class DatasetSpec(BaseModel):
     """
     rid: RID
-    materialize: bool = True
     version: DatasetVersion | conlist(item_type=int, min_length=3, max_length=3) | tuple[int, int, int] | str
+    materialize: bool = True
+    description: str = ""
     model_config = ConfigDict(arbitrary_types_allowed=True)
@@ -208,3 +210,20 @@ class DatasetSpec(BaseModel):
     @field_serializer("version")
     def serialize_version(self, version: DatasetVersion) -> dict[str, Any]:
         return version.to_dict()
+@hydrated_dataclass(DatasetSpec)
+class DatasetConfig:
+    rid: str
+    version: str
+    materialize: bool = True
+    description: str = ""
+class DatasetList(BaseModel):
+    datasets: list[DatasetSpec]
+    description: str = ""
+@hydrated_dataclass(DatasetList)
+class DatasetConfigList:
+    datasets: list[DatasetConfig]
+    description: str = ""

{deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/dataset/dataset.py RENAMED Viewed

@@ -22,10 +22,11 @@ Typical usage example:
 from __future__ import annotations
-# Standard library imports
 import json
 import logging
 from collections import defaultdict
+# Standard library imports
 from graphlib import TopologicalSorter
 from pathlib import Path
 from tempfile import TemporaryDirectory
@@ -1138,7 +1139,7 @@ class Dataset:
         with TemporaryDirectory() as tmp_dir:
             if self._use_minid:
                 # Get bag from S3
-                archive_path = fetch_single_file(minid.bag_url)
+                archive_path = fetch_single_file(minid.bag_url, output_path=tmp_dir)
             else:
                 exporter = DerivaExport(host=self._model.catalog.deriva_server.server, output_dir=tmp_dir)
                 archive_path = exporter.retrieve_file(minid.bag_url)

{deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/dataset/dataset_bag.py RENAMED Viewed

@@ -266,6 +266,22 @@ class DatasetBag:
             sql_cmd = f'SELECT * FROM "{feature_table}"'
             return cast(datapath._ResultSet, [dict(zip(col_names, r)) for r in db.execute(sql_cmd).fetchall()])
+    def list_dataset_element_types(self) -> list[Table]:
+        """
+        Lists the data types of elements contained within a dataset.
+        This method analyzes the dataset and identifies the data types for all
+        elements within it. It is useful for understanding the structure and
+        content of the dataset and allows for better manipulation and usage of its
+        data.
+        Returns:
+            list[Table]: The tables returned by the model's list_dataset_element_types(),
+            one per type of element found in the dataset.
+        """
+        return self.model.list_dataset_element_types()
     def list_dataset_children(self, recurse: bool = False) -> list[DatasetBag]:
         """Get nested datasets.
@@ -333,6 +349,105 @@ class DatasetBag:
         # Term not found
         raise DerivaMLInvalidTerm(vocab_table, term_name)
+    def _denormalize(self, include_tables: list[str] | None) -> str:
+        """
+        Generates an SQL statement for denormalizing the dataset based on the tables to include. Processes cycles in
+        graph relationships, ensures proper join order, and generates selected columns for denormalization.
+        Args:
+            include_tables (list[str] | None): List of table names to include in the denormalized dataset. If None,
+                all tables from the dataset will be included.
+        Returns:
+            str: SQL query string that represents the process of denormalization.
+        """
+        def column_name(col: Column) -> str:
+            return f'"{self.model.normalize_table_name(col.table.name)}"."{col.name}"'
+        # Skip over tables that we don't want to include in the denormalized dataset.
+        # Also, strip off the Dataset/Dataset_X part of the path so we don't include dataset columns in the denormalized
+        # table.
+        join_tables, tables, denormalized_columns, dataset_rids, dataset_element_tables = (
+            self.model._prepare_wide_table(self, self.dataset_rid, include_tables)
+        )
+        select_args = [
+            # SQLite will strip out the table name from the column in the select statement, so we need to add
+            # an explicit alias to the column name.
+            f'"{self.model.normalize_table_name(table_name)}"."{column_name}" AS "{table_name}.{column_name}"'
+            for table_name, column_name in denormalized_columns
+        ]
+        # First table in the table list is the table specified in the method call.
+        normalized_join_tables = [self.model.normalize_table_name(t) for t in join_tables]
+        sql_statement = f'SELECT {",".join(select_args)} FROM "{normalized_join_tables[0]}"'
+        for t in normalized_join_tables[1:]:
+            on = tables[t]
+            sql_statement += f' LEFT JOIN "{t}" ON '
+            sql_statement += "OR ".join([f"{column_name(o[0])} = {column_name(o[1])}" for o in on])
+        # Select only rows from the datasets you wish to include.
+        dataset_rid_list = ",".join([f'"{self.dataset_rid}"'] + [f'"{b.dataset_rid}"' for b in dataset_rids])
+        sql_statement += f'WHERE  "{self.model.normalize_table_name("Dataset")}"."RID" IN ({dataset_rid_list})'
+        # Only include rows that have actual values in them.
+        real_row = [f'"{self.model.normalize_table_name(t)}".RID IS NOT NULL ' for t in dataset_element_tables]
+        sql_statement += f" AND ({' OR '.join(real_row)})"
+        return sql_statement
+    def denormalize_as_dataframe(self, include_tables: list[str] | None = None) -> pd.DataFrame:
+        """
+        Denormalize the dataset and return the result as a dataframe.
+        This routine will examine the domain schema for the dataset, determine which tables to include and denormalize
+        the dataset values into a single wide table.  The result is returned as a dataframe.
+        The optional argument include_tables can be used to specify a subset of tables to include in the denormalized
+        view.  The tables in this argument can appear anywhere in the dataset schema.  The method will determine which
+        additional tables are required to complete the denormalization process.  If include_tables is not specified,
+        all of the tables in the schema will be included.
+        The resulting wide table will include a column for every table needed to complete the denormalization process.
+        Args:
+            include_tables: List of table names to include in the denormalized dataset. If None, then the entire schema
+            is used.
+        Returns:
+            Dataframe containing the denormalized dataset.
+        """
+        return pd.read_sql(self._denormalize(include_tables=include_tables), self.database)
+    def denormalize_as_dict(self, include_tables: list[str] | None = None) -> Generator[dict[str, Any], None, None]:
+        """
+        Denormalize the dataset and return the result as a set of dictionaries.
+        This routine will examine the domain schema for the dataset, determine which tables to include and denormalize
+        the dataset values into a single wide table.  The result is returned as a generator that returns a dictionary
+        for each row in the denormalized wide table.
+        The optional argument include_tables can be used to specify a subset of tables to include in the denormalized
+        view.  The tables in this argument can appear anywhere in the dataset schema.  The method will determine which
+        additional tables are required to complete the denormalization process.  If include_tables is not specified,
+        all of the tables in the schema will be included.
+        The resulting wide table will include a column for every table needed to complete the denormalization process.
+        Args:
+            include_tables: List of table names to include in the denormalized dataset. If None, then the entire schema
+            is used.
+        Returns:
+            A generator that returns a dictionary representation of each row in the denormalized dataset.
+        """
+        with self.database as dbase:
+            cursor = dbase.execute(self._denormalize(include_tables=include_tables))
+            columns = [desc[0] for desc in cursor.description]
+            for row in cursor:
+                yield dict(zip(columns, row))
 # Add annotations after definition to deal with forward reference issues in pydantic

{deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/dataset/upload.py RENAMED Viewed

@@ -412,6 +412,7 @@ def asset_file_path(
         "Description",
     }.union(set(DerivaSystemColumns))
     asset_metadata = {c.name for c in asset_table.columns} - asset_columns
     if not (asset_metadata >= set(metadata.keys())):
         raise DerivaMLException(f"Metadata {metadata} does not match asset metadata {asset_metadata}")

{deriva_ml-1.14.46 → deriva_ml-1.16.0}/src/deriva_ml/demo_catalog.py RENAMED Viewed

@@ -367,7 +367,7 @@ def create_demo_catalog(
     create_features=False,
     create_datasets=False,
     on_exit_delete=True,
-    logging_level=logging.INFO,
+    logging_level=logging.WARNING,
 ) -> ErmrestCatalog:
     test_catalog = create_ml_catalog(hostname, project_name=project_name)
     if on_exit_delete:

deriva_ml-1.16.0/src/deriva_ml/execution/__init__.py ADDED Viewed

@@ -0,0 +1,25 @@
+from typing import TYPE_CHECKING
+# Safe imports - no circular dependencies
+from deriva_ml.execution.execution_configuration import ExecutionConfiguration
+from deriva_ml.execution.workflow import Workflow
+if TYPE_CHECKING:
+    from deriva_ml.execution.execution import Execution
+# Lazy import for runtime
+def __getattr__(name):
+    """Lazy import to avoid circular dependencies."""
+    if name == "Execution":
+        from deriva_ml.execution.execution import Execution
+        return Execution
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+__all__ = [
+    "Execution",  # Lazy-loaded
+    "ExecutionConfiguration",
+    "Workflow",
+]

deriva-ml 1.14.46__tar.gz → 1.16.0__tar.gz

deriva-ml 1.14.46tar.gz → 1.16.0tar.gz