PyPI - deriva-ml - Versions diffs - 1.17.14__py3-none-any.whl → 1.17.16__py3-none-any.whl - Mend

deriva-ml 1.17.14__py3-none-any.whl → 1.17.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53) hide show

deriva_ml/__init__.py +2 -2
deriva_ml/asset/asset.py +0 -4
deriva_ml/catalog/__init__.py +6 -0
deriva_ml/catalog/clone.py +1591 -38
deriva_ml/catalog/localize.py +66 -29
deriva_ml/core/base.py +12 -9
deriva_ml/core/definitions.py +13 -12
deriva_ml/core/ermrest.py +11 -12
deriva_ml/core/mixins/annotation.py +2 -2
deriva_ml/core/mixins/asset.py +3 -3
deriva_ml/core/mixins/dataset.py +3 -3
deriva_ml/core/mixins/execution.py +1 -0
deriva_ml/core/mixins/feature.py +2 -2
deriva_ml/core/mixins/file.py +2 -2
deriva_ml/core/mixins/path_builder.py +2 -2
deriva_ml/core/mixins/rid_resolution.py +2 -2
deriva_ml/core/mixins/vocabulary.py +2 -2
deriva_ml/core/mixins/workflow.py +3 -3
deriva_ml/dataset/catalog_graph.py +3 -4
deriva_ml/dataset/dataset.py +5 -3
deriva_ml/dataset/dataset_bag.py +0 -2
deriva_ml/dataset/upload.py +2 -2
deriva_ml/demo_catalog.py +0 -1
deriva_ml/execution/__init__.py +8 -8
deriva_ml/execution/base_config.py +2 -2
deriva_ml/execution/execution.py +5 -3
deriva_ml/execution/execution_record.py +0 -1
deriva_ml/execution/model_protocol.py +1 -1
deriva_ml/execution/multirun_config.py +0 -1
deriva_ml/execution/runner.py +3 -3
deriva_ml/experiment/experiment.py +3 -3
deriva_ml/feature.py +2 -2
deriva_ml/interfaces.py +2 -2
deriva_ml/model/__init__.py +45 -24
deriva_ml/model/annotations.py +0 -1
deriva_ml/model/catalog.py +3 -2
deriva_ml/model/data_loader.py +330 -0
deriva_ml/model/data_sources.py +439 -0
deriva_ml/model/database.py +216 -32
deriva_ml/model/fk_orderer.py +379 -0
deriva_ml/model/handles.py +1 -1
deriva_ml/model/schema_builder.py +816 -0
deriva_ml/run_model.py +3 -3
deriva_ml/schema/annotations.py +2 -1
deriva_ml/schema/create_schema.py +1 -1
deriva_ml/schema/validation.py +1 -1
{deriva_ml-1.17.14.dist-info → deriva_ml-1.17.16.dist-info}/METADATA +1 -1
deriva_ml-1.17.16.dist-info/RECORD +81 -0
deriva_ml-1.17.14.dist-info/RECORD +0 -77
{deriva_ml-1.17.14.dist-info → deriva_ml-1.17.16.dist-info}/WHEEL +0 -0
{deriva_ml-1.17.14.dist-info → deriva_ml-1.17.16.dist-info}/entry_points.txt +0 -0
{deriva_ml-1.17.14.dist-info → deriva_ml-1.17.16.dist-info}/licenses/LICENSE +0 -0
{deriva_ml-1.17.14.dist-info → deriva_ml-1.17.16.dist-info}/top_level.txt +0 -0

deriva_ml/execution/execution.py CHANGED Viewed

@@ -39,12 +39,16 @@ import time
 from collections import defaultdict
 from datetime import datetime
 from pathlib import Path
-from typing import Any, Callable, Iterable, List
+from typing import TYPE_CHECKING, Any, Callable, Iterable, List
 from deriva.core import format_exception
+if TYPE_CHECKING:
+    from deriva_ml.asset.asset import Asset
 from deriva.core.hatrac_store import HatracStore
 from pydantic import ConfigDict, validate_call
+from deriva_ml.asset.aux_classes import AssetFilePath
 from deriva_ml.core.base import DerivaML
 from deriva_ml.core.definitions import (
     DRY_RUN_RID,
@@ -58,7 +62,6 @@ from deriva_ml.core.definitions import (
     UploadProgress,
 )
 from deriva_ml.core.exceptions import DerivaMLException
-from deriva_ml.asset.aux_classes import AssetFilePath
 from deriva_ml.dataset.aux_classes import DatasetSpec, DatasetVersion
 from deriva_ml.dataset.dataset import Dataset
 from deriva_ml.dataset.dataset_bag import DatasetBag
@@ -1170,7 +1173,6 @@ class Execution:
             return self._execution_record.list_assets(asset_role=asset_role)
         # Fallback for dry_run mode
-        from deriva_ml.asset.asset import Asset
         pb = self._ml_object.pathBuilder()
         asset_exec = pb.schemas[self._ml_object.ml_schema].Execution_Asset_Execution

deriva_ml/execution/execution_record.py CHANGED Viewed

@@ -533,7 +533,6 @@ class ExecutionRecord(BaseModel):
             >>> for asset in record.list_assets(asset_role="Output"):
             ...     print(f"Output Asset: {asset.asset_rid}")
         """
-        from deriva_ml.asset.asset import Asset
         if self._ml_instance is None:
             raise DerivaMLException("ExecutionRecord is not bound to a catalog")

deriva_ml/execution/model_protocol.py CHANGED Viewed

@@ -82,7 +82,7 @@ The protocol uses @runtime_checkable, so isinstance() checks work at runtime.
 from __future__ import annotations
-from typing import Protocol, Any, runtime_checkable, TYPE_CHECKING
+from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable
 if TYPE_CHECKING:
     from deriva_ml import DerivaML

deriva_ml/execution/multirun_config.py CHANGED Viewed

@@ -37,7 +37,6 @@ Benefits:
 """
 from dataclasses import dataclass, field
-from typing import Any
 @dataclass

deriva_ml/execution/runner.py CHANGED Viewed

@@ -145,7 +145,7 @@ from __future__ import annotations
 import atexit
 import logging
 from pathlib import Path
-from typing import Any, TypeVar, TYPE_CHECKING
+from typing import TYPE_CHECKING, Any, TypeVar
 from hydra.core.hydra_config import HydraConfig
 from hydra_zen import builds
@@ -153,9 +153,9 @@ from hydra_zen import builds
 if TYPE_CHECKING:
     from deriva_ml import DerivaML
     from deriva_ml.core.config import DerivaMLConfig
-    from deriva_ml.dataset import DatasetSpec
-    from deriva_ml.execution import ExecutionConfiguration, Workflow
     from deriva_ml.core.definitions import RID
+    from deriva_ml.dataset import DatasetSpec
+    from deriva_ml.execution import Workflow
 # Type variable for DerivaML and its subclasses

deriva_ml/experiment/experiment.py CHANGED Viewed

@@ -26,10 +26,10 @@ import yaml
 from deriva.core.hatrac_store import HatracStore
 if TYPE_CHECKING:
-    from deriva_ml.core.base import DerivaML
-    from deriva_ml.execution.execution_record import ExecutionRecord
     from deriva_ml.asset.asset import Asset
+    from deriva_ml.core.base import DerivaML
     from deriva_ml.dataset.dataset import Dataset
+    from deriva_ml.execution.execution_record import ExecutionRecord
 @dataclass
@@ -403,7 +403,7 @@ class Experiment:
             >>> exp = ml.lookup_experiment("47BE")
             >>> exp.display_markdown()
         """
-        from IPython.display import display, Markdown
+        from IPython.display import Markdown, display
         display(Markdown(self.to_markdown(show_datasets, show_assets)))

deriva_ml/feature.py CHANGED Viewed

@@ -12,12 +12,12 @@ Typical usage example:
     >>> record = FeatureClass(value="high", confidence=0.95)
 """
+# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
+import importlib
 from pathlib import Path
 from types import UnionType
 from typing import TYPE_CHECKING, ClassVar, Optional, Type
-# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
-import importlib
 _ermrest_model = importlib.import_module("deriva.core.ermrest_model")
 Column = _ermrest_model.Column
 FindAssociationResult = _ermrest_model.FindAssociationResult

deriva_ml/interfaces.py CHANGED Viewed

@@ -59,13 +59,13 @@ Implementation Notes
 from __future__ import annotations
+# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
+import importlib
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Iterable, Protocol, Self, runtime_checkable
 import pandas as pd
-# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
-import importlib
 _deriva_core = importlib.import_module("deriva.core")
 _datapath = importlib.import_module("deriva.core.datapath")
 _ermrest_catalog = importlib.import_module("deriva.core.ermrest_catalog")

deriva_ml/model/__init__.py CHANGED Viewed

@@ -3,47 +3,60 @@
 This module provides catalog and database model classes, as well as
 handle wrappers for ERMrest model objects and annotation builders.
+Key components:
+- DerivaModel: Schema analysis utilities
+- DatabaseModel: SQLite database from BDBag
+- SchemaBuilder/SchemaORM: Create ORM from Deriva Model (Phase 1)
+- DataLoader: Fill database from data source (Phase 2)
+- DataSource: Protocol for data sources (BagDataSource, CatalogDataSource)
+- ForeignKeyOrderer: Compute FK-safe insertion order
 Lazy imports are used for DatabaseModel and DerivaMLDatabase to avoid
 circular imports with the dataset module.
 """
-from deriva_ml.model.catalog import DerivaModel
-from deriva_ml.model.handles import ColumnHandle, TableHandle
 # Annotation builders - import the most common ones for convenience
 from deriva_ml.model.annotations import (
-    # Builders
-    Display,
-    VisibleColumns,
-    VisibleForeignKeys,
-    TableDisplay,
-    TableDisplayOptions,
+    CONTEXT_COMPACT,
+    # Context constants
+    CONTEXT_DEFAULT,
+    CONTEXT_DETAILED,
+    CONTEXT_ENTRY,
+    CONTEXT_FILTER,
+    Aggregate,
+    ArrayUxMode,
     ColumnDisplay,
     ColumnDisplayOptions,
-    PreFormat,
-    PseudoColumn,
-    PseudoColumnDisplay,
+    # Builders
+    Display,
     Facet,
     FacetList,
     FacetRange,
-    SortKey,
-    NameStyle,
+    FacetUxMode,
     # FK helpers
     InboundFK,
+    NameStyle,
     OutboundFK,
-    fk_constraint,
+    PreFormat,
+    PseudoColumn,
+    PseudoColumnDisplay,
+    SortKey,
+    TableDisplay,
+    TableDisplayOptions,
     # Enums
     TemplateEngine,
-    Aggregate,
-    ArrayUxMode,
-    FacetUxMode,
-    # Context constants
-    CONTEXT_DEFAULT,
-    CONTEXT_COMPACT,
-    CONTEXT_DETAILED,
-    CONTEXT_ENTRY,
-    CONTEXT_FILTER,
+    VisibleColumns,
+    VisibleForeignKeys,
+    fk_constraint,
 )
+from deriva_ml.model.catalog import DerivaModel
+from deriva_ml.model.data_loader import DataLoader
+from deriva_ml.model.data_sources import BagDataSource, CatalogDataSource, DataSource
+from deriva_ml.model.fk_orderer import ForeignKeyOrderer
+from deriva_ml.model.handles import ColumnHandle, TableHandle
+# Two-phase ORM creation components
+from deriva_ml.model.schema_builder import SchemaBuilder, SchemaORM
 __all__ = [
     # Core classes
@@ -52,6 +65,14 @@ __all__ = [
     "DerivaMLDatabase",
     "TableHandle",
     "ColumnHandle",
+    # Two-phase ORM creation
+    "SchemaBuilder",
+    "SchemaORM",
+    "DataSource",
+    "BagDataSource",
+    "CatalogDataSource",
+    "DataLoader",
+    "ForeignKeyOrderer",
     # Annotation builders
     "Display",
     "VisibleColumns",

deriva_ml/model/annotations.py CHANGED Viewed

@@ -131,7 +131,6 @@ from dataclasses import dataclass, field
 from enum import Enum
 from typing import Any, Literal
 # =============================================================================
 # Enums for constrained values
 # =============================================================================

deriva_ml/model/catalog.py CHANGED Viewed

@@ -7,13 +7,14 @@ ML-specific functionality. It handles schema management, feature definitions, an
 from __future__ import annotations
+# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
+import importlib
 # Standard library imports
 from collections import Counter, defaultdict
 from graphlib import CycleError, TopologicalSorter
 from typing import Any, Callable, Final, Iterable, NewType, TypeAlias
-# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
-import importlib
 _ermrest_catalog = importlib.import_module("deriva.core.ermrest_catalog")
 _ermrest_model = importlib.import_module("deriva.core.ermrest_model")

deriva_ml/model/data_loader.py ADDED Viewed

@@ -0,0 +1,330 @@
+"""Load data into SQLite database with FK ordering.
+This module provides the DataLoader class which loads data from a
+DataSource into a SchemaORM database. It handles:
+- Automatic FK dependency ordering
+- Batch inserts with conflict handling
+- Progress tracking
+This is Phase 2 of the two-phase pattern:
+1. Phase 1 (SchemaBuilder): Create ORM structure without data
+2. Phase 2 (DataLoader): Fill database from a data source
+Example:
+    # Phase 1: Create ORM
+    orm = SchemaBuilder(model, schemas).build()
+    # Phase 2: Fill with data
+    source = BagDataSource(bag_path)
+    loader = DataLoader(orm, source)
+    counts = loader.load_tables(['Subject', 'Image', 'Diagnosis'])
+    print(f"Loaded {sum(counts.values())} rows")
+"""
+from __future__ import annotations
+import logging
+from typing import Any, Callable
+from deriva.core.ermrest_model import Table as DerivaTable
+from sqlalchemy.dialects.sqlite import insert as sqlite_insert
+from .data_sources import DataSource
+from .fk_orderer import ForeignKeyOrderer
+from .schema_builder import SchemaORM
+logger = logging.getLogger(__name__)
+class DataLoader:
+    """Loads data into a database with FK ordering.
+    Phase 2 of the two-phase database creation pattern. Takes a
+    SchemaORM (from Phase 1) and populates it from a DataSource.
+    Tables are ordered by FK dependencies (via ForeignKeyOrderer) before
+    loading so that referenced tables are populated first.
+    Example:
+        # Phase 1: Create ORM
+        orm = SchemaBuilder(model, schemas).build()
+        # Phase 2: Fill with data from bag
+        source = BagDataSource(bag_path)
+        loader = DataLoader(orm, source)
+        counts = loader.load_tables()  # All tables
+        print(f"Loaded {sum(counts.values())} total rows")
+        # Or load specific tables
+        counts = loader.load_tables(['Subject', 'Image'])
+        # With progress callback
+        def on_progress(table, count, total):
+            print(f"Loaded {table}: {count} rows")
+        loader.load_tables(progress_callback=on_progress)
+    """
+    # Conflict-handling modes accepted by the public load methods.
+    _ON_CONFLICT_MODES = ("ignore", "replace", "error")
+    def __init__(
+        self,
+        schema_orm: SchemaORM,
+        data_source: DataSource,
+    ):
+        """Initialize the loader.
+        Args:
+            schema_orm: ORM structure from SchemaBuilder.
+            data_source: Source of data to load (BagDataSource, CatalogDataSource, etc.).
+        """
+        self.orm = schema_orm
+        self.source = data_source
+        # The orderer works on the same model/schemas the ORM was built from.
+        self.orderer = ForeignKeyOrderer(
+            schema_orm.model,
+            schema_orm.schemas,
+        )
+    def _check_on_conflict(self, on_conflict: str) -> None:
+        """Reject conflict modes other than the documented ones.
+        Without this check an unknown mode would fall through to a plain
+        INSERT whose failures are logged and dropped, silently losing rows.
+        Args:
+            on_conflict: Conflict handling strategy supplied by the caller.
+        Raises:
+            ValueError: If ``on_conflict`` is not "ignore", "replace" or "error".
+        """
+        if on_conflict not in self._ON_CONFLICT_MODES:
+            raise ValueError(
+                f"Invalid on_conflict {on_conflict!r}; expected one of {self._ON_CONFLICT_MODES}"
+            )
+    def load_tables(
+        self,
+        tables: list[str | DerivaTable] | None = None,
+        on_conflict: str = "ignore",
+        batch_size: int = 1000,
+        progress_callback: Callable[[str, int, int], None] | None = None,
+    ) -> dict[str, int]:
+        """Load data into specified tables with FK ordering.
+        Tables are automatically ordered by FK dependencies to ensure
+        referenced tables are populated first. If the ordering cannot be
+        computed (ValueError from the orderer), a warning is logged and the
+        tables that exist in the ORM are loaded in their original order.
+        Args:
+            tables: Tables to load. If None, loads all ORM tables that have
+                data in the source (matched by qualified or unqualified name).
+            on_conflict: How to handle duplicate keys:
+                - "ignore": Skip rows with duplicate keys (default)
+                - "replace": Replace existing rows
+                - "error": Raise error on duplicates
+            batch_size: Number of rows per insert batch.
+            progress_callback: Optional callback(table_name, rows_loaded, total_tables)
+                called after each table is loaded.
+        Returns:
+            Dict mapping "schema.table" names to the row count reported for
+            that table (rows submitted; see ``_insert_batch``).
+        Raises:
+            ValueError: If ``on_conflict`` is not one of the documented modes.
+        """
+        self._check_on_conflict(on_conflict)
+        # Determine tables to load
+        if tables is None:
+            # Tables that have data in the source
+            available = set(self.source.list_available_tables())
+            # Walk the ORM tables in the order the ORM reports them
+            # (dict.fromkeys de-duplicates without losing that order), so the
+            # candidate list - and therefore the fallback load order below -
+            # is deterministic instead of depending on set iteration order.
+            tables_to_load = [
+                orm_table
+                for orm_table in dict.fromkeys(self.orm.list_tables())
+                # Check both qualified and unqualified names
+                if orm_table in available or orm_table.split(".")[-1] in available
+            ]
+        else:
+            tables_to_load = [
+                t if isinstance(t, str) else f"{t.schema.name}.{t.name}"
+                for t in tables
+            ]
+        # Compute insertion order
+        try:
+            ordered_tables = self.orderer.get_insertion_order(tables_to_load)
+        except ValueError as e:
+            # Some tables might not be in the model, just use original order
+            logger.warning(f"Could not compute FK ordering: {e}")
+            # tables_to_load holds only names at this point, so every entry
+            # is resolved through the orderer.
+            ordered_tables = [
+                self.orderer._to_table(t)
+                for t in tables_to_load
+                if self._table_exists(t)
+            ]
+        # Load in order
+        counts: dict[str, int] = {}
+        total_tables = len(ordered_tables)
+        for table in ordered_tables:
+            table_key = f"{table.schema.name}.{table.name}"
+            count = self._load_table(table, on_conflict, batch_size)
+            counts[table_key] = count
+            if progress_callback:
+                progress_callback(table_key, count, total_tables)
+            if count > 0:
+                logger.info(f"Loaded {count} rows into {table_key}")
+        return counts
+    def _table_exists(self, table: str | DerivaTable) -> bool:
+        """Check if table exists in ORM.
+        Args:
+            table: Table name, or a table object (looked up as "schema.table").
+        Returns:
+            True if ``self.orm.find_table`` resolves the table, False if it
+            raises KeyError.
+        """
+        try:
+            if isinstance(table, str):
+                self.orm.find_table(table)
+            else:
+                self.orm.find_table(f"{table.schema.name}.{table.name}")
+            return True
+        except KeyError:
+            return False
+    def _load_table(
+        self,
+        table: DerivaTable,
+        on_conflict: str,
+        batch_size: int,
+    ) -> int:
+        """Load a single table.
+        Rows are streamed from the data source and inserted in batches, all
+        inside one ``engine.begin()`` block for the table.
+        Args:
+            table: Table to load.
+            on_conflict: Conflict handling strategy.
+            batch_size: Rows per batch.
+        Returns:
+            Number of rows loaded; 0 if the table is missing from the ORM or
+            the source has no data for it.
+        """
+        table_key = f"{table.schema.name}.{table.name}"
+        # Find SQL table
+        try:
+            sql_table = self.orm.find_table(table_key)
+        except KeyError:
+            logger.warning(f"Table {table_key} not found in ORM")
+            return 0
+        # Check if source has data
+        if not self.source.has_table(table):
+            logger.debug(f"No data for {table_key} in source")
+            return 0
+        # Get data from source
+        rows_loaded = 0
+        batch: list[dict[str, Any]] = []
+        with self.orm.engine.begin() as conn:
+            for row in self.source.get_table_data(table):
+                batch.append(row)
+                if len(batch) >= batch_size:
+                    rows_loaded += self._insert_batch(
+                        conn, sql_table, batch, on_conflict
+                    )
+                    batch = []
+            # Insert remaining rows
+            if batch:
+                rows_loaded += self._insert_batch(
+                    conn, sql_table, batch, on_conflict
+                )
+        return rows_loaded
+    def _insert_batch(
+        self,
+        conn: Any,
+        sql_table: Any,
+        rows: list[dict[str, Any]],
+        on_conflict: str,
+    ) -> int:
+        """Insert a batch of rows.
+        Args:
+            conn: Database connection.
+            sql_table: SQLAlchemy table.
+            rows: List of row dictionaries.
+            on_conflict: Conflict handling strategy.
+        Returns:
+            ``len(rows)`` when the statement executes, i.e. the number of rows
+            submitted (rows skipped or replaced by conflict handling are still
+            counted). 0 for an empty batch, or when the insert fails and
+            ``on_conflict`` is not "error".
+        Raises:
+            Exception: Whatever the insert raised, re-raised only when
+                ``on_conflict`` is "error"; otherwise the failure is logged.
+        """
+        if not rows:
+            return 0
+        try:
+            if on_conflict == "ignore":
+                stmt = sqlite_insert(sql_table).on_conflict_do_nothing()
+            elif on_conflict == "replace":
+                # For SQLite, we need to specify all columns for upsert
+                stmt = sqlite_insert(sql_table)
+                update_cols = {
+                    c.name: c for c in stmt.excluded
+                    if c.name not in ("RID",)  # Don't update primary key
+                }
+                # The conflict target is hard-coded to the RID column.
+                stmt = stmt.on_conflict_do_update(
+                    index_elements=["RID"],
+                    set_=update_cols,
+                )
+            else:
+                stmt = sql_table.insert()
+            conn.execute(stmt, rows)
+            return len(rows)
+        except Exception as e:
+            # Best effort: outside "error" mode a failed batch is logged and
+            # reported as 0 rows so the remaining batches still get a chance.
+            logger.error(f"Error inserting into {sql_table.name}: {e}")
+            if on_conflict == "error":
+                raise
+            return 0
+    def load_table(
+        self,
+        table: str | DerivaTable,
+        on_conflict: str = "ignore",
+        batch_size: int = 1000,
+    ) -> int:
+        """Load a single table (without FK ordering).
+        Use this when you know the dependencies are already satisfied
+        or for loading a single table.
+        Args:
+            table: Table to load. A name is resolved through the orderer.
+            on_conflict: Conflict handling strategy ("ignore", "replace" or "error").
+            batch_size: Rows per batch.
+        Returns:
+            Number of rows loaded.
+        Raises:
+            ValueError: If ``on_conflict`` is not one of the documented modes.
+        """
+        self._check_on_conflict(on_conflict)
+        if isinstance(table, str):
+            table = self.orderer._to_table(table)
+        return self._load_table(table, on_conflict, batch_size)
+    def get_load_order(
+        self,
+        tables: list[str | DerivaTable] | None = None,
+    ) -> list[str]:
+        """Get the FK-safe load order for tables without loading.
+        Useful for previewing or manually controlling load order. Unlike
+        ``load_tables``, an ordering failure is not caught here and
+        propagates to the caller.
+        Args:
+            tables: Tables to order. If None, orders every table the source
+                lists that also exists in the ORM.
+        Returns:
+            List of "schema.table" names in safe insertion order.
+        """
+        if tables is None:
+            available = self.source.list_available_tables()
+            tables = [t for t in available if self._table_exists(t)]
+        ordered = self.orderer.get_insertion_order(tables)
+        return [f"{t.schema.name}.{t.name}" for t in ordered]
+    def validate_load_order(
+        self,
+        tables: list[str | DerivaTable],
+    ) -> list[tuple[str, str, str]]:
+        """Validate that tables can be loaded in the given order.
+        Delegates to ``ForeignKeyOrderer.validate_insertion_order``.
+        Args:
+            tables: Ordered list of tables.
+        Returns:
+            List of FK violations as (table, missing_dep, fk_name) tuples.
+            Empty if order is valid.
+        """
+        return self.orderer.validate_insertion_order(tables)

deriva-ml 1.17.14__py3-none-any.whl → 1.17.16__py3-none-any.whl

deriva-ml 1.17.14__py3-none-any.whl → 1.17.16__py3-none-any.whl