deriva-ml 1.17.9__py3-none-any.whl → 1.17.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. deriva_ml/__init__.py +43 -1
  2. deriva_ml/asset/__init__.py +17 -0
  3. deriva_ml/asset/asset.py +357 -0
  4. deriva_ml/asset/aux_classes.py +100 -0
  5. deriva_ml/bump_version.py +254 -11
  6. deriva_ml/catalog/__init__.py +21 -0
  7. deriva_ml/catalog/clone.py +1199 -0
  8. deriva_ml/catalog/localize.py +426 -0
  9. deriva_ml/core/__init__.py +29 -0
  10. deriva_ml/core/base.py +817 -1067
  11. deriva_ml/core/config.py +169 -21
  12. deriva_ml/core/constants.py +120 -19
  13. deriva_ml/core/definitions.py +123 -13
  14. deriva_ml/core/enums.py +47 -73
  15. deriva_ml/core/ermrest.py +226 -193
  16. deriva_ml/core/exceptions.py +297 -14
  17. deriva_ml/core/filespec.py +99 -28
  18. deriva_ml/core/logging_config.py +225 -0
  19. deriva_ml/core/mixins/__init__.py +42 -0
  20. deriva_ml/core/mixins/annotation.py +915 -0
  21. deriva_ml/core/mixins/asset.py +384 -0
  22. deriva_ml/core/mixins/dataset.py +237 -0
  23. deriva_ml/core/mixins/execution.py +408 -0
  24. deriva_ml/core/mixins/feature.py +365 -0
  25. deriva_ml/core/mixins/file.py +263 -0
  26. deriva_ml/core/mixins/path_builder.py +145 -0
  27. deriva_ml/core/mixins/rid_resolution.py +204 -0
  28. deriva_ml/core/mixins/vocabulary.py +400 -0
  29. deriva_ml/core/mixins/workflow.py +322 -0
  30. deriva_ml/core/validation.py +389 -0
  31. deriva_ml/dataset/__init__.py +2 -1
  32. deriva_ml/dataset/aux_classes.py +20 -4
  33. deriva_ml/dataset/catalog_graph.py +575 -0
  34. deriva_ml/dataset/dataset.py +1242 -1008
  35. deriva_ml/dataset/dataset_bag.py +1311 -182
  36. deriva_ml/dataset/history.py +27 -14
  37. deriva_ml/dataset/upload.py +225 -38
  38. deriva_ml/demo_catalog.py +186 -105
  39. deriva_ml/execution/__init__.py +46 -2
  40. deriva_ml/execution/base_config.py +639 -0
  41. deriva_ml/execution/execution.py +545 -244
  42. deriva_ml/execution/execution_configuration.py +26 -11
  43. deriva_ml/execution/execution_record.py +592 -0
  44. deriva_ml/execution/find_caller.py +298 -0
  45. deriva_ml/execution/model_protocol.py +175 -0
  46. deriva_ml/execution/multirun_config.py +153 -0
  47. deriva_ml/execution/runner.py +595 -0
  48. deriva_ml/execution/workflow.py +224 -35
  49. deriva_ml/experiment/__init__.py +8 -0
  50. deriva_ml/experiment/experiment.py +411 -0
  51. deriva_ml/feature.py +6 -1
  52. deriva_ml/install_kernel.py +143 -6
  53. deriva_ml/interfaces.py +862 -0
  54. deriva_ml/model/__init__.py +99 -0
  55. deriva_ml/model/annotations.py +1278 -0
  56. deriva_ml/model/catalog.py +286 -60
  57. deriva_ml/model/database.py +144 -649
  58. deriva_ml/model/deriva_ml_database.py +308 -0
  59. deriva_ml/model/handles.py +14 -0
  60. deriva_ml/run_model.py +319 -0
  61. deriva_ml/run_notebook.py +507 -38
  62. deriva_ml/schema/__init__.py +18 -2
  63. deriva_ml/schema/annotations.py +62 -33
  64. deriva_ml/schema/create_schema.py +169 -69
  65. deriva_ml/schema/validation.py +601 -0
  66. {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/METADATA +4 -5
  67. deriva_ml-1.17.11.dist-info/RECORD +77 -0
  68. {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/WHEEL +1 -1
  69. {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/entry_points.txt +2 -0
  70. deriva_ml/protocols/dataset.py +0 -19
  71. deriva_ml/test.py +0 -94
  72. deriva_ml-1.17.9.dist-info/RECORD +0 -45
  73. {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/licenses/LICENSE +0 -0
  74. {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/top_level.txt +0 -0
@@ -1,13 +1,44 @@
1
- """
2
- The module implements the sqllite interface to a set of directories representing a dataset bag.
1
+ """SQLite-backed dataset access for downloaded BDBags.
2
+
3
+ This module provides the DatasetBag class, which allows querying and navigating
4
+ downloaded dataset bags using SQLite. When a dataset is downloaded from a Deriva
5
+ catalog, it is stored as a BDBag (Big Data Bag) containing:
6
+
7
+ - CSV files with table data
8
+ - Asset files (images, documents, etc.)
9
+ - A schema.json describing the catalog structure
10
+ - A fetch.txt manifest of referenced files
11
+
12
+ The DatasetBag class provides a read-only interface to this data, mirroring
13
+ the Dataset class API where possible. This allows code to work uniformly
14
+ with both live catalog datasets and downloaded bags.
15
+
16
+ Key concepts:
17
+ - DatasetBag wraps a single dataset within a downloaded bag
18
+ - A bag may contain multiple datasets (nested/hierarchical)
19
+ - All operations are read-only (bags are immutable snapshots)
20
+ - Queries use SQLite via SQLAlchemy ORM
21
+ - Table-level access (get_table_as_dict, lookup_term) is on the catalog (DerivaMLDatabase)
22
+
23
+ Typical usage:
24
+ >>> # Download a dataset from a catalog
25
+ >>> bag = ml.download_dataset_bag(dataset_spec)
26
+ >>> # List dataset members by type
27
+ >>> members = bag.list_dataset_members(recurse=True)
28
+ >>> for image in members.get("Image", []):
29
+ ... print(image["Filename"])
3
30
  """
4
31
 
5
32
  from __future__ import annotations
6
33
 
7
34
  # Standard library imports
35
+ import logging
36
+ import shutil
8
37
  from collections import defaultdict
9
38
  from copy import copy
10
- from typing import TYPE_CHECKING, Any, Generator, Iterable, cast
39
+ from dataclasses import dataclass, field
40
+ from pathlib import Path
41
+ from typing import TYPE_CHECKING, Any, Callable, Generator, Iterable, Self, cast
11
42
 
12
43
  import deriva.core.datapath as datapath
13
44
 
@@ -18,17 +49,17 @@ import pandas as pd
18
49
  from deriva.core.ermrest_model import Table
19
50
 
20
51
  # Deriva imports
21
- from pydantic import ConfigDict, validate_call
22
- from sqlalchemy import CompoundSelect, Engine, RowMapping, Select, and_, inspect, select, union
52
+ from sqlalchemy import CompoundSelect, Engine, Select, and_, inspect, select, union
23
53
  from sqlalchemy.orm import RelationshipProperty, Session
24
54
  from sqlalchemy.orm.util import AliasedClass
25
55
 
26
- from deriva_ml.core.definitions import RID, VocabularyTerm
27
- from deriva_ml.core.exceptions import DerivaMLException, DerivaMLInvalidTerm
28
- from deriva_ml.feature import Feature
56
+ from deriva_ml.core.definitions import RID
57
+ from deriva_ml.core.exceptions import DerivaMLException
58
+ from deriva_ml.dataset.aux_classes import DatasetHistory, DatasetVersion
59
+ from deriva_ml.feature import Feature, FeatureRecord
29
60
 
30
61
  if TYPE_CHECKING:
31
- from deriva_ml.model.database import DatabaseModel
62
+ from deriva_ml.model.deriva_ml_database import DerivaMLDatabase
32
63
 
33
64
  try:
34
65
  from icecream import ic
@@ -36,69 +67,235 @@ except ImportError: # Graceful fallback if IceCream isn't installed.
36
67
  ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a) # noqa
37
68
 
38
69
 
39
- class DatasetBag:
70
+ @dataclass
71
+ class FeatureValueRecord:
72
+ """A feature value record with execution provenance.
73
+
74
+ This class represents a single feature value assigned to an asset,
75
+ including the execution that created it. Used by restructure_assets
76
+ when a value_selector function needs to choose between multiple
77
+ feature values for the same asset.
78
+
79
+ The raw_record attribute contains the complete feature table row as
80
+ a dictionary, which can be used to access all columns including any
81
+ additional metadata or columns beyond the primary value.
82
+
83
+ Attributes:
84
+ target_rid: RID of the asset/entity this feature value applies to.
85
+ feature_name: Name of the feature.
86
+ value: The feature value (typically a vocabulary term name).
87
+ execution_rid: RID of the execution that created this feature value, if any.
88
+ Use this to distinguish between values from different executions.
89
+ raw_record: The complete raw record from the feature table as a dictionary.
90
+ Access all columns via dict keys, e.g., record.raw_record["MyColumn"].
91
+
92
+ Example:
93
+ Using a value_selector to choose the most recent feature value::
94
+
95
+ def select_by_execution(records: list[FeatureValueRecord]) -> FeatureValueRecord:
96
+ # Select value from most recent execution (assuming RIDs are sortable)
97
+ return max(records, key=lambda r: r.execution_rid or "")
98
+
99
+ bag.restructure_assets(
100
+ output_dir="./ml_data",
101
+ group_by=["Diagnosis"],
102
+ value_selector=select_by_execution,
103
+ )
104
+
105
+ Accessing raw record data::
106
+
107
+ def select_by_confidence(records: list[FeatureValueRecord]) -> FeatureValueRecord:
108
+ # Select value with highest confidence score from raw record
109
+ return max(records, key=lambda r: r.raw_record.get("Confidence", 0))
40
110
  """
41
- DatasetBag is a class that manages a materialized bag. It is created from a locally materialized
42
- BDBag for a dataset_table, which is created either by DerivaML.create_execution, or directly by
43
- calling DerivaML.download_dataset.
111
+ target_rid: RID
112
+ feature_name: str
113
+ value: Any
114
+ execution_rid: RID | None = None
115
+ raw_record: dict[str, Any] = field(default_factory=dict)
116
+
117
+ def __repr__(self) -> str:
118
+ return (f"FeatureValueRecord(target_rid='{self.target_rid}', "
119
+ f"feature_name='{self.feature_name}', value='{self.value}', "
120
+ f"execution_rid='{self.execution_rid}')")
121
+
122
+
123
+ class DatasetBag:
124
+ """Read-only interface to a downloaded dataset bag.
125
+
126
+ DatasetBag manages access to a materialized BDBag (Big Data Bag) that contains
127
+ a snapshot of dataset data from a Deriva catalog. It provides methods for:
44
128
 
45
- A general a bag may contain multiple datasets, if the dataset is nested. The DatasetBag is used to
46
- represent only one of the datasets in the bag.
129
+ - Listing dataset members and their attributes
130
+ - Navigating dataset relationships (parents, children)
131
+ - Accessing feature values
132
+ - Denormalizing data across related tables
47
133
 
48
- All the metadata associated with the dataset is stored in a SQLLite database that can be queried using SQL.
134
+ A bag may contain multiple datasets when nested datasets are involved. Each
135
+ DatasetBag instance represents a single dataset within the bag - use
136
+ list_dataset_children() to navigate to nested datasets.
137
+
138
+ For catalog-level operations like querying arbitrary tables or looking up
139
+ vocabulary terms, use the DerivaMLDatabase class instead.
140
+
141
+ The class implements the DatasetLike protocol, providing the same read interface
142
+ as the Dataset class. This allows code to work with both live catalogs and
143
+ downloaded bags interchangeably.
49
144
 
50
145
  Attributes:
51
- dataset_rid (RID): RID for the specified dataset
52
- version: The version of the dataset
53
- model (DatabaseModel): The Database model that has all the catalog metadata associated with this dataset.
54
- database:
55
- dbase (sqlite3.Connection): connection to the sqlite database holding table values
56
- domain_schema (str): Name of the domain schema
146
+ dataset_rid (RID): The unique Resource Identifier for this dataset.
147
+ dataset_types (list[str]): List of vocabulary terms describing the dataset type.
148
+ description (str): Human-readable description of the dataset.
149
+ execution_rid (RID | None): RID of the execution associated with this dataset version, if any.
150
+ model (DatabaseModel): The DatabaseModel providing SQLite access to bag data.
151
+ engine (Engine): SQLAlchemy engine for database queries.
152
+ metadata (MetaData): SQLAlchemy metadata with table definitions.
153
+
154
+ Example:
155
+ >>> # Download a dataset
156
+ >>> bag = dataset.download_dataset_bag(version="1.0.0")
157
+ >>> # List members by type
158
+ >>> members = bag.list_dataset_members()
159
+ >>> for image in members.get("Image", []):
160
+ ... print(f"File: {image['Filename']}")
161
+ >>> # Navigate to nested datasets
162
+ >>> for child in bag.list_dataset_children():
163
+ ... print(f"Nested: {child.dataset_rid}")
57
164
  """
58
165
 
59
- def __init__(self, database_model: DatabaseModel, dataset_rid: RID | None = None) -> None:
60
- """
61
- Initialize a DatasetBag instance.
166
+ def __init__(
167
+ self,
168
+ catalog: "DerivaMLDatabase",
169
+ dataset_rid: RID | None = None,
170
+ dataset_types: str | list[str] | None = None,
171
+ description: str = "",
172
+ execution_rid: RID | None = None,
173
+ ):
174
+ """Initialize a DatasetBag instance for a dataset within a downloaded bag.
175
+
176
+ This mirrors the Dataset class initialization pattern, where both classes
177
+ take a catalog-like object as their first argument for consistency.
62
178
 
63
179
  Args:
64
- database_model: Database version of the bag.
65
- dataset_rid: Optional RID for the dataset.
180
+ catalog: The DerivaMLDatabase instance providing access to the bag's data.
181
+ This implements the DerivaMLCatalog protocol.
182
+ dataset_rid: The RID of the dataset to wrap. If None, uses the primary
183
+ dataset RID from the bag.
184
+ dataset_types: One or more dataset type terms. Can be a single string
185
+ or list of strings.
186
+ description: Human-readable description of the dataset.
187
+ execution_rid: RID of the execution associated with this dataset version.
188
+ If None, will be looked up from the Dataset_Version table.
189
+
190
+ Raises:
191
+ DerivaMLException: If no dataset_rid is provided and none can be
192
+ determined from the bag, or if the RID doesn't exist in the bag.
66
193
  """
67
- self.model = database_model
194
+ # Store reference to the catalog and extract the underlying model
195
+ self._catalog = catalog
196
+ self.model = catalog.model
68
197
  self.engine = cast(Engine, self.model.engine)
69
198
  self.metadata = self.model.metadata
70
199
 
200
+ # Use provided RID or fall back to the bag's primary dataset
71
201
  self.dataset_rid = dataset_rid or self.model.dataset_rid
202
+ self.description = description
203
+ self.execution_rid = execution_rid or (
204
+ self.model._get_dataset_execution(self.dataset_rid) or {}
205
+ ).get("Execution")
206
+
207
+ # Normalize dataset_types to always be a list of strings for consistency
208
+ # with the Dataset class interface
209
+ if dataset_types is None:
210
+ self.dataset_types: list[str] = []
211
+ elif isinstance(dataset_types, str):
212
+ self.dataset_types: list[str] = [dataset_types]
213
+ else:
214
+ self.dataset_types: list[str] = list(dataset_types)
215
+
72
216
  if not self.dataset_rid:
73
217
  raise DerivaMLException("No dataset RID provided")
74
218
 
75
- self.model.rid_lookup(self.dataset_rid) # Check to make sure that this dataset is in the bag.
219
+ # Validate that this dataset exists in the bag
220
+ self.model.rid_lookup(self.dataset_rid)
76
221
 
77
- self.version = self.model.dataset_version(self.dataset_rid)
222
+ # Cache the version and dataset table reference
223
+ self._current_version = self.model.dataset_version(self.dataset_rid)
78
224
  self._dataset_table = self.model.dataset_table
79
225
 
80
226
  def __repr__(self) -> str:
81
- return f"<deriva_ml.DatasetBag object {self.dataset_rid} at {hex(id(self))}>"
227
+ """Return a string representation of the DatasetBag for debugging."""
228
+ return (f"<deriva_ml.DatasetBag object at {hex(id(self))}: rid='{self.dataset_rid}', "
229
+ f"version='{self.current_version}', types={self.dataset_types}>")
230
+
231
+ @property
232
+ def current_version(self) -> DatasetVersion:
233
+ """Get the version of the dataset at the time the bag was downloaded.
234
+
235
+ For a DatasetBag, this is the version that was current when the bag was
236
+ created. Unlike the live Dataset class, this value is immutable since
237
+ bags are read-only snapshots.
238
+
239
+ Returns:
240
+ DatasetVersion: The semantic version (major.minor.patch) of this dataset.
241
+ """
242
+ return self._current_version
82
243
 
83
244
  def list_tables(self) -> list[str]:
84
- """List the names of the tables in the catalog
245
+ """List all tables available in the bag's SQLite database.
246
+
247
+ Returns the fully-qualified names of all tables (e.g., "domain.Image",
248
+ "deriva-ml.Dataset") that were exported in this bag.
85
249
 
86
250
  Returns:
87
- A list of table names. These names are all qualified with the Deriva schema name.
251
+ list[str]: Table names in "schema.table" format, sorted alphabetically.
88
252
  """
89
253
  return self.model.list_tables()
90
254
 
255
+ def get_table_as_dict(self, table: str) -> Generator[dict[str, Any], None, None]:
256
+ """Get table contents as dictionaries.
257
+
258
+ Convenience method that delegates to the underlying catalog. This provides
259
+ access to all rows in a table, not just those belonging to this dataset.
260
+ For dataset-filtered results, use list_dataset_members() instead.
261
+
262
+ Args:
263
+ table: Name of the table to retrieve (e.g., "Subject", "Image").
264
+
265
+ Yields:
266
+ dict: Dictionary for each row in the table.
267
+
268
+ Example:
269
+ >>> for subject in bag.get_table_as_dict("Subject"):
270
+ ... print(subject["Name"])
271
+ """
272
+ return self._catalog.get_table_as_dict(table)
273
+
91
274
  @staticmethod
92
275
  def _find_relationship_attr(source, target):
93
- """
94
- Return the relationship attribute (InstrumentedAttribute) on `source`
95
- that points to `target`. Works with classes or AliasedClass.
96
- Raises LookupError if not found.
276
+ """Find the SQLAlchemy relationship attribute connecting two ORM classes.
277
+
278
+ Searches for a relationship on `source` that points to `target`, which is
279
+ needed to construct proper JOIN clauses in SQL queries.
280
+
281
+ Args:
282
+ source: Source ORM class or AliasedClass.
283
+ target: Target ORM class or AliasedClass.
284
+
285
+ Returns:
286
+ InstrumentedAttribute: The relationship attribute on source pointing to target.
287
+
288
+ Raises:
289
+ LookupError: If no relationship exists between the two classes.
290
+
291
+ Note:
292
+ When multiple relationships exist, prefers MANYTOONE direction as this
293
+ is typically the more natural join direction for denormalization.
97
294
  """
98
295
  src_mapper = inspect(source).mapper
99
296
  tgt_mapper = inspect(target).mapper
100
297
 
101
- # collect relationships on the *class* mapper (not on alias)
298
+ # Collect all relationships on the source mapper that point to target
102
299
  candidates: list[RelationshipProperty] = [rel for rel in src_mapper.relationships if rel.mapper is tgt_mapper]
103
300
 
104
301
  if not candidates:
@@ -108,86 +305,117 @@ class DatasetBag:
108
305
  candidates.sort(key=lambda r: r.direction.name != "MANYTOONE")
109
306
  rel = candidates[0]
110
307
 
111
- # Bind to the actual source (alias or class)
308
+ # Return the bound attribute (handles AliasedClass properly)
112
309
  return getattr(source, rel.key) if isinstance(source, AliasedClass) else rel.class_attribute
113
310
 
114
311
  def _dataset_table_view(self, table: str) -> CompoundSelect[Any]:
115
- """Return a SQL command that will return all of the elements in the specified table that are associated with
116
- dataset_rid"""
312
+ """Build a SQL query for all rows in a table that belong to this dataset.
313
+
314
+ Creates a UNION of queries that traverse all possible paths from the
315
+ Dataset table to the target table, filtering by this dataset's RID
316
+ (and any nested dataset RIDs).
317
+
318
+ This is necessary because table data may be linked to datasets through
319
+ different relationship paths (e.g., Image might be linked directly to
320
+ Dataset or through an intermediate Subject table).
321
+
322
+ Args:
323
+ table: Name of the table to query.
324
+
325
+ Returns:
326
+ CompoundSelect: A SQLAlchemy UNION query selecting all matching rows.
327
+ """
117
328
  table_class = self.model.get_orm_class_by_name(table)
118
329
  dataset_table_class = self.model.get_orm_class_by_name(self._dataset_table.name)
330
+
331
+ # Include this dataset and all nested datasets in the query
119
332
  dataset_rids = [self.dataset_rid] + [c.dataset_rid for c in self.list_dataset_children(recurse=True)]
120
333
 
334
+ # Find all paths from Dataset to the target table
121
335
  paths = [[t.name for t in p] for p in self.model._schema_to_paths() if p[-1].name == table]
336
+
337
+ # Build a SELECT query for each path and UNION them together
122
338
  sql_cmds = []
123
339
  for path in paths:
124
340
  path_sql = select(table_class)
125
341
  last_class = self.model.get_orm_class_by_name(path[0])
342
+ # Join through each table in the path
126
343
  for t in path[1:]:
127
344
  t_class = self.model.get_orm_class_by_name(t)
128
345
  path_sql = path_sql.join(self._find_relationship_attr(last_class, t_class))
129
346
  last_class = t_class
347
+ # Filter to only rows belonging to our dataset(s)
130
348
  path_sql = path_sql.where(dataset_table_class.RID.in_(dataset_rids))
131
349
  sql_cmds.append(path_sql)
132
350
  return union(*sql_cmds)
133
351
 
134
- def get_table(self, table: str) -> Generator[tuple, None, None]:
135
- """Retrieve the contents of the specified table. If schema is not provided as part of the table name,
136
- the method will attempt to locate the schema for the table.
352
+ def dataset_history(self) -> list[DatasetHistory]:
353
+ """Retrieves the version history of a dataset.
137
354
 
138
- Args:
139
- table: return: A generator that yields tuples of column values.
355
+ Returns a chronological list of dataset versions, including their version numbers,
356
+ creation times, and associated metadata.
140
357
 
141
358
  Returns:
142
- A generator that yields tuples of column values.
359
+ list[DatasetHistory]: List of history entries, each containing:
360
+ - dataset_version: Version number (major.minor.patch)
361
+ - minid: Minimal Viable Identifier
362
+ - snapshot: Catalog snapshot time
363
+ - dataset_rid: Dataset Resource Identifier
364
+ - version_rid: Version Resource Identifier
365
+ - description: Version description
366
+ - execution_rid: Associated execution RID
143
367
 
144
- """
145
- with Session(self.engine) as session:
146
- result = session.execute(self._dataset_table_view(table))
147
- for row in result:
148
- yield row
149
-
150
- def get_table_as_dataframe(self, table: str) -> pd.DataFrame:
151
- """Retrieve the contents of the specified table as a dataframe.
152
-
153
-
154
- If schema is not provided as part of the table name,
155
- the method will attempt to locate the schema for the table.
156
-
157
- Args:
158
- table: Table to retrieve data from.
159
-
160
- Returns:
161
- A dataframe containing the contents of the specified table.
162
- """
163
- return pd.read_sql(self._dataset_table_view(table), self.engine)
164
-
165
- def get_table_as_dict(self, table: str) -> Generator[dict[str, Any], None, None]:
166
- """Retrieve the contents of the specified table as a dictionary.
167
-
168
- Args:
169
- table: Table to retrieve data from. f schema is not provided as part of the table name,
170
- the method will attempt to locate the schema for the table.
368
+ Raises:
369
+ DerivaMLException: If dataset_rid is not a valid dataset RID.
171
370
 
172
- Returns:
173
- A generator producing dictionaries containing the contents of the specified table as name/value pairs.
371
+ Example:
372
+ >>> history = bag.dataset_history()
373
+ >>> for entry in history:
374
+ ... print(f"Version {entry.dataset_version}: {entry.description}")
174
375
  """
376
+ # Query Dataset_Version table directly via the model
377
+ return [
378
+ DatasetHistory(
379
+ dataset_version=DatasetVersion.parse(v["Version"]),
380
+ minid=v["Minid"],
381
+ snapshot=v["Snapshot"],
382
+ dataset_rid=self.dataset_rid,
383
+ version_rid=v["RID"],
384
+ description=v["Description"],
385
+ execution_rid=v["Execution"],
386
+ )
387
+ for v in self.model._get_table_contents("Dataset_Version")
388
+ if v["Dataset"] == self.dataset_rid
389
+ ]
175
390
 
176
- with Session(self.engine) as session:
177
- result = session.execute(self._dataset_table_view(table))
178
- for row in result.mappings():
179
- yield row
180
-
181
- # @validate_call
182
- def list_dataset_members(self, recurse: bool = False) -> dict[str, list[dict[str, Any]]]:
391
+ def list_dataset_members(
392
+ self,
393
+ recurse: bool = False,
394
+ limit: int | None = None,
395
+ _visited: set[RID] | None = None,
396
+ version: Any = None,
397
+ **kwargs: Any,
398
+ ) -> dict[str, list[dict[str, Any]]]:
183
399
  """Return a list of entities associated with a specific dataset.
184
400
 
185
401
  Args:
186
- recurse: Whether to include nested datasets.
402
+ recurse: Whether to include members of nested datasets.
403
+ limit: Maximum number of members to return per type. None for no limit.
404
+ _visited: Internal parameter to track visited datasets and prevent infinite recursion.
405
+ version: Ignored (bags are immutable snapshots).
406
+ **kwargs: Additional arguments (ignored, for protocol compatibility).
187
407
 
188
408
  Returns:
189
- Dictionary of entities associated with the dataset.
409
+ Dictionary mapping member types to lists of member records.
190
410
  """
411
+ # Initialize visited set for recursion guard
412
+ if _visited is None:
413
+ _visited = set()
414
+
415
+ # Prevent infinite recursion by checking if we've already visited this dataset
416
+ if self.dataset_rid in _visited:
417
+ return {}
418
+ _visited.add(self.dataset_rid)
191
419
 
192
420
  # Look at each of the element types that might be in the _dataset_table and get the list of rid for them from
193
421
  # the appropriate association table.
@@ -200,16 +428,29 @@ class DatasetBag:
200
428
  assoc_class, dataset_rel, element_rel = self.model.get_orm_association_class(dataset_class, element_class)
201
429
 
202
430
  element_table = inspect(element_class).mapped_table
203
- if element_table.schema != self.model.domain_schema and element_table.name not in ["Dataset", "File"]:
431
+ if not self.model.is_domain_schema(element_table.schema) and element_table.name not in ["Dataset", "File"]:
204
432
  # Look at domain tables and nested datasets.
205
433
  continue
434
+
206
435
  # Get the names of the columns that we are going to need for linking
207
436
  with Session(self.engine) as session:
208
- sql_cmd = (
209
- select(element_class)
210
- .join(element_rel)
211
- .where(self.dataset_rid == assoc_class.__table__.c["Dataset"])
212
- )
437
+ # For Dataset_Dataset, use Nested_Dataset column to find nested datasets
438
+ # (similar to how the live catalog does it in Dataset.list_dataset_members)
439
+ if element_table.name == "Dataset":
440
+ sql_cmd = (
441
+ select(element_class)
442
+ .join(assoc_class, element_class.RID == assoc_class.__table__.c["Nested_Dataset"])
443
+ .where(self.dataset_rid == assoc_class.__table__.c["Dataset"])
444
+ )
445
+ else:
446
+ # For other tables, use the original join via element_rel
447
+ sql_cmd = (
448
+ select(element_class)
449
+ .join(element_rel)
450
+ .where(self.dataset_rid == assoc_class.__table__.c["Dataset"])
451
+ )
452
+ if limit is not None:
453
+ sql_cmd = sql_cmd.limit(limit)
213
454
  # Get back the list of ORM entities and convert them to dictionaries.
214
455
  element_entities = session.scalars(sql_cmd).all()
215
456
  element_rows = [{c.key: getattr(obj, c.key) for c in obj.__table__.columns} for obj in element_entities]
@@ -218,8 +459,8 @@ class DatasetBag:
218
459
  # Get the members for all the nested datasets and add to the member list.
219
460
  nested_datasets = [d["RID"] for d in element_rows]
220
461
  for ds in nested_datasets:
221
- nested_dataset = self.model.get_dataset(ds)
222
- for k, v in nested_dataset.list_dataset_members(recurse=recurse).items():
462
+ nested_dataset = self._catalog.lookup_dataset(ds)
463
+ for k, v in nested_dataset.list_dataset_members(recurse=recurse, limit=limit, _visited=_visited).items():
223
464
  members[k].extend(v)
224
465
  return dict(members)
225
466
 
@@ -234,25 +475,63 @@ class DatasetBag:
234
475
  """
235
476
  return self.model.find_features(table)
236
477
 
237
- def list_feature_values(self, table: Table | str, feature_name: str) -> datapath._ResultSet:
238
- """Return feature values for a table.
478
+ def list_feature_values(
479
+ self, table: Table | str, feature_name: str
480
+ ) -> Iterable[FeatureRecord]:
481
+ """Retrieves all values for a feature as typed FeatureRecord instances.
482
+
483
+ Returns an iterator of dynamically-generated FeatureRecord objects for each
484
+ feature value. Each record is an instance of a Pydantic model specific to
485
+ this feature, with typed attributes for all columns including the Execution
486
+ that created the feature value.
239
487
 
240
488
  Args:
241
- table: The table to get feature values for.
242
- feature_name: Name of the feature.
489
+ table: The table containing the feature, either as name or Table object.
490
+ feature_name: Name of the feature to retrieve values for.
243
491
 
244
492
  Returns:
245
- Feature values.
493
+ Iterable[FeatureRecord]: An iterator of FeatureRecord instances.
494
+ Each instance has:
495
+ - Execution: RID of the execution that created this feature value
496
+ - Feature_Name: Name of the feature
497
+ - All feature-specific columns as typed attributes
498
+ - model_dump() method to convert back to a dictionary
499
+
500
+ Raises:
501
+ DerivaMLException: If the feature doesn't exist or cannot be accessed.
502
+
503
+ Example:
504
+ >>> # Get typed feature records
505
+ >>> for record in bag.list_feature_values("Image", "Quality"):
506
+ ... print(f"Image {record.Image}: {record.ImageQuality}")
507
+ ... print(f"Created by execution: {record.Execution}")
508
+
509
+ >>> # Convert records to dictionaries
510
+ >>> records = list(bag.list_feature_values("Image", "Quality"))
511
+ >>> dicts = [r.model_dump() for r in records]
246
512
  """
513
+ # Get table and feature
247
514
  feature = self.model.lookup_feature(table, feature_name)
248
- feature_class = self.model.get_orm_class_for_table(feature.feature_table)
515
+
516
+ # Get the dynamically-generated FeatureRecord subclass for this feature
517
+ record_class = feature.feature_record_class()
518
+
519
+ # Query raw values from SQLite
520
+ feature_table = self.model.find_table(feature.feature_table.name)
249
521
  with Session(self.engine) as session:
250
- sql_cmd = select(feature_class)
251
- return cast(datapath._ResultSet, [row for row in session.execute(sql_cmd).mappings()])
522
+ sql_cmd = select(feature_table)
523
+ result = session.execute(sql_cmd)
524
+ rows = [dict(row._mapping) for row in result]
252
525
 
253
- def list_dataset_element_types(self) -> list[Table]:
254
- """
255
- Lists the data types of elements contained within a dataset.
526
+ # Convert to typed records
527
+ for raw_value in rows:
528
+ # Filter to only include fields that the record class expects
529
+ field_names = set(record_class.model_fields.keys())
530
+ filtered_data = {k: v for k, v in raw_value.items() if k in field_names}
531
+ yield record_class(**filtered_data)
532
+
533
+ def list_dataset_element_types(self) -> Iterable[Table]:
534
+ """List the types of elements that can be contained in datasets.
256
535
 
257
536
  This method analyzes the dataset and identifies the data types for all
258
537
  elements within it. It is useful for understanding the structure and
@@ -266,15 +545,33 @@ class DatasetBag:
266
545
  """
267
546
  return self.model.list_dataset_element_types()
268
547
 
269
- def list_dataset_children(self, recurse: bool = False) -> list[DatasetBag]:
548
+ def list_dataset_children(
549
+ self,
550
+ recurse: bool = False,
551
+ _visited: set[RID] | None = None,
552
+ version: Any = None,
553
+ **kwargs: Any,
554
+ ) -> list[Self]:
270
555
  """Get nested datasets.
271
556
 
272
557
  Args:
273
558
  recurse: Whether to include children of children.
559
+ _visited: Internal parameter to track visited datasets and prevent infinite recursion.
560
+ version: Ignored (bags are immutable snapshots).
561
+ **kwargs: Additional arguments (ignored, for protocol compatibility).
274
562
 
275
563
  Returns:
276
564
  List of child dataset bags.
277
565
  """
566
+ # Initialize visited set for recursion guard
567
+ if _visited is None:
568
+ _visited = set()
569
+
570
+ # Prevent infinite recursion by checking if we've already visited this dataset
571
+ if self.dataset_rid in _visited:
572
+ return []
573
+ _visited.add(self.dataset_rid)
574
+
278
575
  ds_table = self.model.get_orm_class_by_name(f"{self.model.ml_schema}.Dataset")
279
576
  nds_table = self.model.get_orm_class_by_name(f"{self.model.ml_schema}.Dataset_Dataset")
280
577
  dv_table = self.model.get_orm_class_by_name(f"{self.model.ml_schema}.Dataset_Version")
@@ -286,63 +583,102 @@ class DatasetBag:
286
583
  .join_from(ds_table, dv_table, onclause=ds_table.Version == dv_table.RID)
287
584
  .where(nds_table.Dataset == self.dataset_rid)
288
585
  )
289
- nested = [DatasetBag(self.model, r[0]) for r in session.execute(sql_cmd).all()]
586
+ nested = [self._catalog.lookup_dataset(r[0]) for r in session.execute(sql_cmd).all()]
290
587
 
291
588
  result = copy(nested)
292
589
  if recurse:
293
590
  for child in nested:
294
- result.extend(child.list_dataset_children(recurse))
591
+ result.extend(child.list_dataset_children(recurse=recurse, _visited=_visited))
295
592
  return result
296
593
 
297
- @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
298
- def lookup_term(self, table: str | Table, term_name: str) -> VocabularyTerm:
299
- """Finds a term in a vocabulary table.
300
-
301
- Searches for a term in the specified vocabulary table, matching either the primary name
302
- or any of its synonyms.
594
+ def list_dataset_parents(
595
+ self,
596
+ recurse: bool = False,
597
+ _visited: set[RID] | None = None,
598
+ version: Any = None,
599
+ **kwargs: Any,
600
+ ) -> list[Self]:
601
+ """Given a dataset_table RID, return a list of RIDs of the parent datasets if this is included in a
602
+ member, if any.
303
603
 
304
604
  Args:
305
- table: Vocabulary table to search in (name or Table object).
306
- term_name: Name or synonym of the term to find.
605
+ recurse: If True, recursively return all ancestor datasets.
606
+ _visited: Internal parameter to track visited datasets and prevent infinite recursion.
607
+ version: Ignored (bags are immutable snapshots).
608
+ **kwargs: Additional arguments (ignored, for protocol compatibility).
307
609
 
308
610
  Returns:
309
- VocabularyTerm: The matching vocabulary term.
611
+ List of parent dataset bags.
612
+ """
613
+ # Initialize visited set for recursion guard
614
+ if _visited is None:
615
+ _visited = set()
310
616
 
311
- Raises:
312
- DerivaMLVocabularyException: If the table is not a vocabulary table, or term is not found.
617
+ # Prevent infinite recursion by checking if we've already visited this dataset
618
+ if self.dataset_rid in _visited:
619
+ return []
620
+ _visited.add(self.dataset_rid)
313
621
 
314
- Examples:
315
- Look up by primary name:
316
- >>> term = ml.lookup_term("tissue_types", "epithelial")
317
- >>> print(term.description)
622
+ nds_table = self.model.get_orm_class_by_name(f"{self.model.ml_schema}.Dataset_Dataset")
318
623
 
319
- Look up by synonym:
320
- >>> term = ml.lookup_term("tissue_types", "epithelium")
321
- """
322
- # Get and validate vocabulary table reference
323
- if not self.model.is_vocabulary(table):
324
- raise DerivaMLException(f"The table {table} is not a controlled vocabulary")
624
+ with Session(self.engine) as session:
625
+ sql_cmd = select(nds_table.Dataset).where(nds_table.Nested_Dataset == self.dataset_rid)
626
+ parents = [self._catalog.lookup_dataset(r[0]) for r in session.execute(sql_cmd).all()]
325
627
 
326
- # Search for term by name or synonym
327
- for term in self.get_table_as_dict(table):
328
- if term_name == term["Name"] or (term["Synonyms"] and term_name in term["Synonyms"]):
329
- term["Synonyms"] = list(term["Synonyms"])
330
- return VocabularyTerm.model_validate(term)
628
+ if recurse:
629
+ for parent in parents.copy():
630
+ parents.extend(parent.list_dataset_parents(recurse=True, _visited=_visited))
631
+ return parents
331
632
 
332
- # Term not found
333
- raise DerivaMLInvalidTerm(table, term_name)
633
+ def list_executions(self) -> list[RID]:
634
+ """List all execution RIDs associated with this dataset.
334
635
 
335
- def _denormalize(self, include_tables: list[str]) -> Select:
636
+ Returns all executions that used this dataset as input. This is
637
+ tracked through the Dataset_Execution association table.
638
+
639
+ Note:
640
+ Unlike the live Dataset class which returns Execution objects,
641
+ DatasetBag returns a list of execution RIDs since the bag is
642
+ an offline snapshot and cannot look up live execution objects.
643
+
644
+ Returns:
645
+ List of execution RIDs associated with this dataset.
646
+
647
+ Example:
648
+ >>> bag = ml.download_dataset_bag(dataset_spec)
649
+ >>> execution_rids = bag.list_executions()
650
+ >>> for rid in execution_rids:
651
+ ... print(f"Associated execution: {rid}")
336
652
  """
337
- Generates an SQL statement for denormalizing the dataset based on the tables to include. Processes cycles in
338
- graph relationships, ensures proper join order, and generates selected columns for denormalization.
653
+ de_table = self.model.get_orm_class_by_name(f"{self.model.ml_schema}.Dataset_Execution")
654
+
655
+ with Session(self.engine) as session:
656
+ sql_cmd = select(de_table.Execution).where(de_table.Dataset == self.dataset_rid)
657
+ return [r[0] for r in session.execute(sql_cmd).all()]
658
+
659
+ def _denormalize(self, include_tables: list[str]) -> Select:
660
+ """Build a SQL query that joins multiple tables into a denormalized view.
661
+
662
+ This method creates a "wide table" by joining related tables together,
663
+ producing a single query that returns columns from all specified tables.
664
+ This is useful for machine learning pipelines that need flat data.
665
+
666
+ The method:
667
+ 1. Analyzes the schema to find join paths between tables
668
+ 2. Determines the correct join order based on foreign key relationships
669
+ 3. Builds SELECT statements with properly aliased columns
670
+ 4. Creates a UNION if multiple paths exist to the same tables
339
671
 
340
672
  Args:
341
- include_tables (list[str] | None): List of table names to include in the denormalized dataset. If None,
342
- all tables from the dataset will be included.
673
+ include_tables: List of table names to include in the output. Additional
674
+ tables may be included if they're needed to join the requested tables.
343
675
 
344
676
  Returns:
345
- str: SQL query string that represents the process of denormalization.
677
+ Select: A SQLAlchemy query that produces the denormalized result.
678
+
679
+ Note:
680
+ Column names in the result are prefixed with the table name to avoid
681
+ collisions (e.g., "Image.Filename", "Subject.RID").
346
682
  """
347
683
  # Skip over tables that we don't want to include in the denormalized dataset.
348
684
  # Also, strip off the Dataset/Dataset_X part of the path so we don't include dataset columns in the denormalized
@@ -359,9 +695,7 @@ class DatasetBag:
359
695
  return relationship
360
696
  return None
361
697
 
362
- join_tables, denormalized_columns = (
363
- self.model._prepare_wide_table(self, self.dataset_rid, include_tables)
364
- )
698
+ join_tables, denormalized_columns = self.model._prepare_wide_table(self, self.dataset_rid, include_tables)
365
699
 
366
700
  denormalized_columns = [
367
701
  self.model.get_orm_class_by_name(table_name)
@@ -382,69 +716,864 @@ class DatasetBag:
382
716
  if (r := find_relationship(table_class, on_condition))
383
717
  ]
384
718
  sql_statement = sql_statement.join(table_class, onclause=and_(*on_clause))
385
- dataset_rid_list = [self.dataset_rid] + self.list_dataset_children(recurse=True)
719
+ dataset_rid_list = [self.dataset_rid] + [c.dataset_rid for c in self.list_dataset_children(recurse=True)]
386
720
  dataset_class = self.model.get_orm_class_by_name(self._dataset_table.name)
387
721
  sql_statement = sql_statement.where(dataset_class.RID.in_(dataset_rid_list))
388
722
  sql_statements.append(sql_statement)
389
723
  return union(*sql_statements)
390
724
 
391
- def denormalize_as_dataframe(self, include_tables: list[str]) -> pd.DataFrame:
725
+ def _denormalize_from_members(
726
+ self,
727
+ include_tables: list[str],
728
+ ) -> Generator[dict[str, Any], None, None]:
729
+ """Denormalize dataset members by joining related tables.
730
+
731
+ This method creates a "wide table" view by joining related tables together,
732
+ using list_dataset_members() as the data source. This ensures consistency
733
+ with the catalog-based denormalize implementation. The result has outer join
734
+ semantics - tables without FK relationships are included with NULL values.
735
+
736
+ The method:
737
+ 1. Gets the list of dataset members for each included table via list_dataset_members
738
+ 2. For each member in the first table, follows foreign key relationships to
739
+ get related records from other tables
740
+ 3. Tables without FK connections to the first table are included with NULLs
741
+ 4. Includes nested dataset members recursively
742
+
743
+ Args:
744
+ include_tables: List of table names to include in the output.
745
+
746
+ Yields:
747
+ dict[str, Any]: Rows with column names prefixed by table name (e.g., "Image.Filename").
748
+ Unrelated tables have NULL values for their columns.
749
+
750
+ Note:
751
+ Column names in the result are prefixed with the table name to avoid
752
+ collisions (e.g., "Image.Filename", "Subject.RID").
392
753
  """
393
- Denormalize the dataset and return the result as a dataframe.
754
+ # Skip system columns in output
755
+ skip_columns = {"RCT", "RMT", "RCB", "RMB"}
756
+
757
+ # Get all members for the included tables (recursively includes nested datasets)
758
+ members = self.list_dataset_members(recurse=True)
759
+
760
+ # Build a lookup of columns for each table
761
+ table_columns: dict[str, list[str]] = {}
762
+ for table_name in include_tables:
763
+ table = self.model.name_to_table(table_name)
764
+ table_columns[table_name] = [
765
+ c.name for c in table.columns if c.name not in skip_columns
766
+ ]
767
+
768
+ # Find the primary table (first non-empty table in include_tables)
769
+ primary_table = None
770
+ for table_name in include_tables:
771
+ if table_name in members and members[table_name]:
772
+ primary_table = table_name
773
+ break
774
+
775
+ if primary_table is None:
776
+ # No data at all
777
+ return
778
+
779
+ primary_table_obj = self.model.name_to_table(primary_table)
780
+
781
+ for member in members[primary_table]:
782
+ # Build the row with all columns from all tables
783
+ row: dict[str, Any] = {}
784
+
785
+ # Add primary table columns
786
+ for col_name in table_columns[primary_table]:
787
+ prefixed_name = f"{primary_table}.{col_name}"
788
+ row[prefixed_name] = member.get(col_name)
789
+
790
+ # For each other table, try to join or add NULL values
791
+ for other_table_name in include_tables:
792
+ if other_table_name == primary_table:
793
+ continue
794
+
795
+ other_table = self.model.name_to_table(other_table_name)
796
+ other_cols = table_columns[other_table_name]
797
+
798
+ # Initialize all columns to None (outer join behavior)
799
+ for col_name in other_cols:
800
+ prefixed_name = f"{other_table_name}.{col_name}"
801
+ row[prefixed_name] = None
802
+
803
+ # Try to find FK relationship and join
804
+ if other_table_name in members:
805
+ try:
806
+ relationship = self.model._table_relationship(
807
+ primary_table_obj, other_table
808
+ )
809
+ fk_col, pk_col = relationship
810
+
811
+ # Look up the related record
812
+ fk_value = member.get(fk_col.name)
813
+ if fk_value:
814
+ for other_member in members.get(other_table_name, []):
815
+ if other_member.get(pk_col.name) == fk_value:
816
+ for col_name in other_cols:
817
+ prefixed_name = f"{other_table_name}.{col_name}"
818
+ row[prefixed_name] = other_member.get(col_name)
819
+ break
820
+ except DerivaMLException:
821
+ # No FK relationship - columns remain NULL (outer join)
822
+ pass
823
+
824
+ yield row
825
+
826
+ def denormalize_as_dataframe(
827
+ self,
828
+ include_tables: list[str],
829
+ version: Any = None,
830
+ **kwargs: Any,
831
+ ) -> pd.DataFrame:
832
+ """Denormalize the dataset bag into a single wide table (DataFrame).
833
+
834
+ Denormalization transforms normalized relational data into a single "wide table"
835
+ (also called a "flat table" or "denormalized table") by joining related tables
836
+ together. This produces a DataFrame where each row contains all related information
837
+ from multiple source tables, with columns from each table combined side-by-side.
838
+
839
+ Wide tables are the standard input format for most machine learning frameworks,
840
+ which expect all features for a single observation to be in one row. This method
841
+ bridges the gap between normalized database schemas and ML-ready tabular data.
842
+
843
+ **How it works:**
844
+
845
+ Tables are joined based on their foreign key relationships stored in the bag's
846
+ schema. For example, if Image has a foreign key to Subject, denormalizing
847
+ ["Subject", "Image"] produces rows where each image appears with its subject's
848
+ metadata.
849
+
850
+ **Column naming:**
851
+
852
+ Column names are prefixed with the source table name using dots to avoid
853
+ collisions (e.g., "Image.Filename", "Subject.RID"). This differs from the
854
+ live Dataset class which uses underscores.
855
+
856
+ Args:
857
+ include_tables: List of table names to include in the output. Tables
858
+ are joined based on their foreign key relationships.
859
+ Order doesn't matter - the join order is determined automatically.
860
+ version: Ignored (bags are immutable snapshots of a specific version).
861
+ **kwargs: Additional arguments (ignored, for protocol compatibility).
862
+
863
+ Returns:
864
+ pd.DataFrame: Wide table with columns from all included tables.
865
+
866
+ Example:
867
+ Create a training dataset from a downloaded bag::
868
+
869
+ >>> # Download and materialize the dataset
870
+ >>> bag = ml.download_dataset_bag(spec, materialize=True)
871
+
872
+ >>> # Denormalize into a wide table
873
+ >>> df = bag.denormalize_as_dataframe(["Image", "Diagnosis"])
874
+ >>> print(df.columns.tolist())
875
+ ['Image.RID', 'Image.Filename', 'Image.URL', 'Diagnosis.RID',
876
+ 'Diagnosis.Label', 'Diagnosis.Confidence']
877
+
878
+ >>> # Access local file paths for images
879
+ >>> for _, row in df.iterrows():
880
+ ... local_path = bag.get_asset_path("Image", row["Image.RID"])
881
+ ... label = row["Diagnosis.Label"]
882
+ ... # Train on local_path with label
883
+
884
+ See Also:
885
+ denormalize_as_dict: Generator version for memory-efficient processing.
886
+ """
887
+ rows = list(self._denormalize_from_members(include_tables=include_tables))
888
+ return pd.DataFrame(rows)
889
+
890
+ def denormalize_as_dict(
891
+ self,
892
+ include_tables: list[str],
893
+ version: Any = None,
894
+ **kwargs: Any,
895
+ ) -> Generator[dict[str, Any], None, None]:
896
+ """Denormalize the dataset bag and yield rows as dictionaries.
897
+
898
+ This is a memory-efficient alternative to denormalize_as_dataframe() that
899
+ yields one row at a time as a dictionary instead of loading all data into
900
+ a DataFrame. Use this when processing large datasets that may not fit in
901
+ memory, or when you want to process rows incrementally.
394
902
 
395
- This routine will examine the domain schema for the dataset, determine which tables to include and denormalize
396
- the dataset values into a single wide table. The result is returned as a generator that returns a dictionary
397
- for each row in the denormalized wide table.
903
+ Like denormalize_as_dataframe(), this produces a "wide table" representation
904
+ where each yielded dictionary contains all columns from the joined tables.
905
+ See denormalize_as_dataframe() for detailed explanation of how denormalization
906
+ works.
398
907
 
399
- The optional argument include_tables can be used to specify a subset of tables to include in the denormalized
400
- view. The tables in this argument can appear anywhere in the dataset schema. The method will determine which
401
- additional tables are required to complete the denormalization process. If include_tables is not specified,
402
- all of the tables in the schema will be included.
908
+ **Column naming:**
403
909
 
404
- The resulting wide table will include a column for every table needed to complete the denormalization process.
910
+ Column names are prefixed with the source table name using dots to avoid
911
+ collisions (e.g., "Image.Filename", "Subject.RID"). This differs from the
912
+ live Dataset class which uses underscores.
405
913
 
406
914
  Args:
407
- include_tables: List of table names to include in the denormalized dataset.
915
+ include_tables: List of table names to include in the output.
916
+ Tables are joined based on their foreign key relationships.
917
+ version: Ignored (bags are immutable snapshots of a specific version).
918
+ **kwargs: Additional arguments (ignored, for protocol compatibility).
919
+
920
+ Yields:
921
+ dict[str, Any]: Dictionary representing one row of the wide table.
922
+ Keys are column names in "Table.Column" format.
923
+
924
+ Example:
925
+ Stream through a large dataset for training::
926
+
927
+ >>> bag = ml.download_dataset_bag(spec, materialize=True)
928
+ >>> for row in bag.denormalize_as_dict(["Image", "Diagnosis"]):
929
+ ... # Get local file path for this image
930
+ ... local_path = bag.get_asset_path("Image", row["Image.RID"])
931
+ ... label = row["Diagnosis.Label"]
932
+ ... # Process image and label...
933
+
934
+ Build a PyTorch dataset efficiently::
935
+
936
+ >>> class BagDataset(torch.utils.data.IterableDataset):
937
+ ... def __init__(self, bag, tables):
938
+ ... self.bag = bag
939
+ ... self.tables = tables
940
+ ... def __iter__(self):
941
+ ... for row in self.bag.denormalize_as_dict(self.tables):
942
+ ... img_path = self.bag.get_asset_path("Image", row["Image.RID"])
943
+ ... yield load_image(img_path), row["Diagnosis.Label"]
944
+
945
+ See Also:
946
+ denormalize_as_dataframe: Returns all data as a pandas DataFrame.
947
+ """
948
+ yield from self._denormalize_from_members(include_tables=include_tables)
949
+
950
+
951
+ # =========================================================================
952
+ # Asset Restructuring Methods
953
+ # =========================================================================
954
+
955
+ def _build_dataset_type_path_map(
956
+ self,
957
+ type_selector: Callable[[list[str]], str] | None = None,
958
+ ) -> dict[RID, list[str]]:
959
+ """Build a mapping from dataset RID to its type path in the hierarchy.
960
+
961
+ Recursively traverses nested datasets to create a mapping where each
962
+ dataset RID maps to its hierarchical type path (e.g., ["complete", "training"]).
963
+
964
+ Args:
965
+ type_selector: Function to select type when dataset has multiple types.
966
+ Receives list of type names, returns selected type name.
967
+ Defaults to selecting the first type, or "Testing" if the dataset has no types.
408
968
 
409
969
  Returns:
410
- Dataframe containing the denormalized dataset.
970
+ Dictionary mapping dataset RID to list of type names from root to leaf.
971
+ e.g., {"4-ABC": ["complete", "training"], "4-DEF": ["complete", "testing"]}
411
972
  """
412
- return pd.read_sql(self._denormalize(include_tables=include_tables), self.engine)
973
+ if type_selector is None:
974
+ type_selector = lambda types: types[0] if types else "Testing"
975
+
976
+ type_paths: dict[RID, list[str]] = {}
977
+
978
+ def traverse(dataset: DatasetBag, parent_path: list[str], visited: set[RID]) -> None:
979
+ if dataset.dataset_rid in visited:
980
+ return
981
+ visited.add(dataset.dataset_rid)
982
+
983
+ current_type = type_selector(dataset.dataset_types)
984
+ current_path = parent_path + [current_type]
985
+ type_paths[dataset.dataset_rid] = current_path
986
+
987
+ for child in dataset.list_dataset_children():
988
+ traverse(child, current_path, visited)
989
+
990
+ traverse(self, [], set())
991
+ return type_paths
992
+
993
+ def _get_asset_dataset_mapping(self, asset_table: str) -> dict[RID, RID]:
994
+ """Map asset RIDs to their containing dataset RID.
995
+
996
+ For each asset in the specified table, determines which dataset it belongs to.
997
+ This uses _dataset_table_view to find assets reachable through any FK path
998
+ from the dataset, not just directly associated assets.
413
999
 
414
- def denormalize_as_dict(self, include_tables: list[str]) -> Generator[RowMapping, None, None]:
1000
+ Assets are mapped to their most specific (leaf) dataset in the hierarchy.
1001
+ For example, if a Split dataset contains Training and Testing children,
1002
+ and images are members of Training, the images map to Training (not Split).
1003
+
1004
+ Args:
1005
+ asset_table: Name of the asset table (e.g., "Image")
1006
+
1007
+ Returns:
1008
+ Dictionary mapping asset RID to the dataset RID that contains it.
415
1009
  """
416
- Denormalize the dataset and return the result as a set of dictionary's.
1010
+ asset_to_dataset: dict[RID, RID] = {}
1011
+
1012
+ def collect_from_dataset(dataset: DatasetBag, visited: set[RID]) -> None:
1013
+ if dataset.dataset_rid in visited:
1014
+ return
1015
+ visited.add(dataset.dataset_rid)
1016
+
1017
+ # Process children FIRST (depth-first) so leaf datasets get priority
1018
+ # This ensures assets are mapped to their most specific dataset
1019
+ for child in dataset.list_dataset_children():
1020
+ collect_from_dataset(child, visited)
1021
+
1022
+ # Then process this dataset's assets
1023
+ # Only set if not already mapped (child/leaf dataset wins)
1024
+ for asset in dataset._get_reachable_assets(asset_table):
1025
+ if asset["RID"] not in asset_to_dataset:
1026
+ asset_to_dataset[asset["RID"]] = dataset.dataset_rid
417
1027
 
418
- This routine will examine the domain schema for the dataset, determine which tables to include and denormalize
419
- the dataset values into a single wide table. The result is returned as a generator that returns a dictionary
420
- for each row in the denormalized wide table.
1028
+ collect_from_dataset(self, set())
1029
+ return asset_to_dataset
421
1030
 
422
- The optional argument include_tables can be used to specify a subset of tables to include in the denormalized
423
- view. The tables in this argument can appear anywhere in the dataset schema. The method will determine which
424
- additional tables are required to complete the denormalization process. If include_tables is not specified,
425
- all of the tables in the schema will be included.
1031
+ def _get_reachable_assets(self, asset_table: str) -> list[dict[str, Any]]:
1032
+ """Get all assets reachable from this dataset through any FK path.
426
1033
 
427
- The resulting wide table will include a only those column for the tables listed in include_columns.
1034
+ Unlike list_dataset_members which only returns directly associated entities,
1035
+ this method traverses foreign key relationships to find assets that are
1036
+ indirectly connected to the dataset. For example, if a dataset contains
1037
+ Subjects, and Subject -> Encounter -> Image, this method will find those
1038
+ Images even though they're not directly in the Dataset_Image association table.
428
1039
 
429
1040
  Args:
430
- include_tables: List of table names to include in the denormalized dataset. If None, than the entire schema
431
- is used.
1041
+ asset_table: Name of the asset table (e.g., "Image")
432
1042
 
433
1043
  Returns:
434
- A generator that returns a dictionary representation of each row in the denormalized dataset.
1044
+ List of asset records as dictionaries.
435
1045
  """
1046
+ # Use the _dataset_table_view query which traverses all FK paths
1047
+ sql_query = self._dataset_table_view(asset_table)
1048
+
436
1049
  with Session(self.engine) as session:
437
- cursor = session.execute(
438
- self._denormalize(include_tables=include_tables)
439
- )
440
- yield from cursor.mappings()
441
- for row in cursor.mappings():
442
- yield row
1050
+ result = session.execute(sql_query)
1051
+ # Convert rows to dictionaries
1052
+ rows = [dict(row._mapping) for row in result]
1053
+
1054
+ return rows
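A short sketch of how the FK traversal above could be exercised, assuming the Subject -> Encounter -> Image schema described in the docstring and a bag instance named `bag`::

    # Returns Image rows reachable from the dataset, including ones linked
    # only indirectly (Dataset -> Subject -> Encounter -> Image).
    images = bag._get_reachable_assets("Image")
    for row in images:
        print(row["RID"], row.get("Filename"))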
1055
+
1056
+ def _load_feature_values_cache(
1057
+ self,
1058
+ asset_table: str,
1059
+ group_keys: list[str],
1060
+ enforce_vocabulary: bool = True,
1061
+ value_selector: Callable[[list[FeatureValueRecord]], FeatureValueRecord] | None = None,
1062
+ ) -> dict[str, dict[RID, Any]]:
1063
+ """Load feature values into a cache for efficient lookup.
1064
+
1065
+ Pre-loads feature values for any group_keys that are feature names,
1066
+ organizing them by target entity RID for fast lookup.
1067
+
1068
+ Args:
1069
+ asset_table: The asset table name to find features for.
1070
+ group_keys: List of potential feature names to cache. Supports two formats:
1071
+ - "FeatureName": Uses the first term column (default behavior)
1072
+ - "FeatureName.column_name": Uses the specified column from the feature table
1073
+ enforce_vocabulary: If True (default), only allow features with
1074
+ controlled vocabulary term columns and raise an error if an
1075
+ asset has multiple values. If False, allow any feature type
1076
+ and use the first value found when multiple exist.
1077
+ value_selector: Optional function to select which feature value to use
1078
+ when an asset has multiple values for the same feature. Receives a
1079
+ list of FeatureValueRecord objects (each with execution_rid for
1080
+ provenance) and returns the selected one. If not provided and
1081
+ multiple values exist, raises DerivaMLException when
1082
+ enforce_vocabulary=True or uses the first value when False.
1083
+
1084
+ Returns:
1085
+ Dictionary mapping group_key -> {target_rid -> feature_value}
1086
+ Only includes entries for keys that are actually features.
1087
+
1088
+ Raises:
1089
+ DerivaMLException: If enforce_vocabulary is True and:
1090
+ - A feature has no term columns (not vocabulary-based), or
1091
+ - An asset has multiple different vocabulary term values for the same feature
1092
+ and no value_selector is provided.
1093
+ """
1094
+ from deriva_ml.core.exceptions import DerivaMLException
1095
+
1096
+ cache: dict[str, dict[RID, Any]] = {}
1097
+ # Store all feature value records for later selection when there are multiples
1098
+ records_cache: dict[str, dict[RID, list[FeatureValueRecord]]] = {}
1099
+ logger = logging.getLogger("deriva_ml")
1100
+
1101
+ # Parse group_keys to extract feature names and optional column specifications
1102
+ # Format: "FeatureName" or "FeatureName.column_name"
1103
+ feature_column_map: dict[str, str | None] = {} # group_key -> specific column or None
1104
+ feature_names_to_check: set[str] = set()
1105
+ for key in group_keys:
1106
+ if "." in key:
1107
+ parts = key.split(".", 1)
1108
+ feature_name = parts[0]
1109
+ column_name = parts[1]
1110
+ feature_column_map[key] = column_name
1111
+ feature_names_to_check.add(feature_name)
1112
+ else:
1113
+ feature_column_map[key] = None
1114
+ feature_names_to_check.add(key)
1115
+
1116
+ def process_feature(feat: Any, table_name: str, group_key: str, specific_column: str | None) -> None:
1117
+ """Process a single feature and add its values to the cache."""
1118
+ term_cols = [c.name for c in feat.term_columns]
1119
+ value_cols = [c.name for c in feat.value_columns]
1120
+ all_cols = term_cols + value_cols
1121
+
1122
+ # Determine which column to use for the value
1123
+ if specific_column:
1124
+ # User specified a specific column
1125
+ if specific_column not in all_cols:
1126
+ raise DerivaMLException(
1127
+ f"Column '{specific_column}' not found in feature '{feat.feature_name}'. "
1128
+ f"Available columns: {all_cols}"
1129
+ )
1130
+ use_column = specific_column
1131
+ elif term_cols:
1132
+ # Use first term column (default behavior)
1133
+ use_column = term_cols[0]
1134
+ elif not enforce_vocabulary and value_cols:
1135
+ # Fall back to value columns if allowed
1136
+ use_column = value_cols[0]
1137
+ else:
1138
+ if enforce_vocabulary:
1139
+ raise DerivaMLException(
1140
+ f"Feature '{feat.feature_name}' on table '{table_name}' has no "
1141
+ f"controlled vocabulary term columns. Only vocabulary-based features "
1142
+ f"can be used for grouping when enforce_vocabulary=True. "
1143
+ f"Set enforce_vocabulary=False to allow non-vocabulary features."
1144
+ )
1145
+ return
1146
+
1147
+ records_cache[group_key] = defaultdict(list)
1148
+ feature_values = self.list_feature_values(table_name, feat.feature_name)
1149
+
1150
+ for fv in feature_values:
1151
+ # Convert FeatureRecord to dict for easier access
1152
+ fv_dict = fv.model_dump()
1153
+ target_col = table_name
1154
+ if target_col not in fv_dict:
1155
+ continue
1156
+
1157
+ target_rid = fv_dict[target_col]
1158
+
1159
+ # Get the value from the specified column
1160
+ value = fv_dict.get(use_column)
1161
+
1162
+ if value is None:
1163
+ continue
1164
+
1165
+ # Create a FeatureValueRecord with execution provenance
1166
+ record = FeatureValueRecord(
1167
+ target_rid=target_rid,
1168
+ feature_name=feat.feature_name,
1169
+ value=value,
1170
+ execution_rid=fv_dict.get("Execution"),
1171
+ raw_record=fv_dict,
1172
+ )
1173
+ records_cache[group_key][target_rid].append(record)
1174
+
1175
+ # Find all features on tables that this asset table references
1176
+ asset_table_obj = self.model.name_to_table(asset_table)
1177
+
1178
+ # Check features on the asset table itself
1179
+ for feature in self.find_features(asset_table):
1180
+ if feature.feature_name in feature_names_to_check:
1181
+ # Find all group_keys that reference this feature
1182
+ for group_key, specific_col in feature_column_map.items():
1183
+ # Check if this group_key references this feature
1184
+ key_feature = group_key.split(".")[0] if "." in group_key else group_key
1185
+ if key_feature == feature.feature_name:
1186
+ try:
1187
+ process_feature(feature, asset_table, group_key, specific_col)
1188
+ except DerivaMLException:
1189
+ raise
1190
+ except Exception as e:
1191
+ logger.warning(f"Could not load feature {feature.feature_name}: {e}")
1192
+
1193
+ # Also check features on referenced tables (via foreign keys)
1194
+ for fk in asset_table_obj.foreign_keys:
1195
+ target_table = fk.pk_table
1196
+ for feature in self.find_features(target_table):
1197
+ if feature.feature_name in feature_names_to_check:
1198
+ # Find all group_keys that reference this feature
1199
+ for group_key, specific_col in feature_column_map.items():
1200
+ # Check if this group_key references this feature
1201
+ key_feature = group_key.split(".")[0] if "." in group_key else group_key
1202
+ if key_feature == feature.feature_name:
1203
+ try:
1204
+ process_feature(feature, target_table.name, group_key, specific_col)
1205
+ except DerivaMLException:
1206
+ raise
1207
+ except Exception as e:
1208
+ logger.warning(f"Could not load feature {feature.feature_name}: {e}")
1209
+
1210
+ # Now resolve multiple values using value_selector or error handling
1211
+ for group_key, target_records in records_cache.items():
1212
+ cache[group_key] = {}
1213
+ for target_rid, records in target_records.items():
1214
+ if len(records) == 1:
1215
+ # Single value - straightforward
1216
+ cache[group_key][target_rid] = records[0].value
1217
+ elif len(records) > 1:
1218
+ # Multiple values - need to resolve
1219
+ unique_values = set(r.value for r in records)
1220
+ if len(unique_values) == 1:
1221
+ # All records have same value, use it
1222
+ cache[group_key][target_rid] = records[0].value
1223
+ elif value_selector:
1224
+ # Use provided selector function
1225
+ selected = value_selector(records)
1226
+ cache[group_key][target_rid] = selected.value
1227
+ elif enforce_vocabulary:
1228
+ # Multiple different values without selector - error
1229
+ values_str = ", ".join(f"'{r.value}' (exec: {r.execution_rid})" for r in records)
1230
+ raise DerivaMLException(
1231
+ f"Asset '{target_rid}' has multiple different values for "
1232
+ f"feature '{records[0].feature_name}': {values_str}. "
1233
+ f"Provide a value_selector function to choose between values, "
1234
+ f"or set enforce_vocabulary=False to use the first value."
1235
+ )
1236
+ else:
1237
+ # Not enforcing - use first value
1238
+ cache[group_key][target_rid] = records[0].value
1239
+
1240
+ return cache
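A sketch of the two group-key forms and a value selector, mirroring the selector pattern shown later in the restructure_assets examples; "Image", "Diagnosis", and "Classification.Label" are hypothetical names::

    def prefer_latest(records: list[FeatureValueRecord]) -> FeatureValueRecord:
        # Pick the value recorded by the most recent execution.
        return max(records, key=lambda r: r.execution_rid or "")

    cache = bag._load_feature_values_cache(
        asset_table="Image",
        group_keys=["Diagnosis", "Classification.Label"],  # plain and Feature.column forms
        value_selector=prefer_latest,
    )
    # cache["Diagnosis"] maps each target RID to its Diagnosis term.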
1241
+
1242
+ def _resolve_grouping_value(
1243
+ self,
1244
+ asset: dict[str, Any],
1245
+ group_key: str,
1246
+ feature_cache: dict[str, dict[RID, Any]],
1247
+ ) -> str:
1248
+ """Resolve a grouping value for an asset.
1249
+
1250
+ First checks if group_key is a direct column on the asset record,
1251
+ then checks if it's a feature name in the feature cache.
1252
+
1253
+ Args:
1254
+ asset: The asset record dictionary.
1255
+ group_key: Column name or feature name to group by.
1256
+ feature_cache: Pre-loaded feature values keyed by feature name -> target RID -> value.
1257
+
1258
+ Returns:
1259
+ The resolved value as a string, or "Unknown" if not found or None.
1260
+ Uses "Unknown" (capitalized) to match vocabulary term naming conventions.
1261
+ """
1262
+ # First check if it's a direct column on the asset table
1263
+ if group_key in asset:
1264
+ value = asset[group_key]
1265
+ if value is not None:
1266
+ return str(value)
1267
+ return "Unknown"
1268
+
1269
+ # Check if it's a feature name
1270
+ if group_key in feature_cache:
1271
+ feature_values = feature_cache[group_key]
1272
+ # Check each column in the asset that might be a FK to the feature target
1273
+ for column_name, column_value in asset.items():
1274
+ if column_value and column_value in feature_values:
1275
+ return str(feature_values[column_value])
1276
+ # Also check if the asset's own RID is in the feature values
1277
+ if asset.get("RID") in feature_values:
1278
+ return str(feature_values[asset["RID"]])
1279
+
1280
+ return "Unknown"
1281
+
1282
+ def _detect_asset_table(self) -> str | None:
1283
+ """Auto-detect the asset table from dataset members.
1284
+
1285
+ Searches for asset tables in the dataset members by examining
1286
+ the schema. Returns the first asset table found, or None if
1287
+ no asset tables are in the dataset.
1288
+
1289
+ Returns:
1290
+ Name of the detected asset table, or None if not found.
1291
+ """
1292
+ members = self.list_dataset_members(recurse=True)
1293
+ for table_name in members:
1294
+ if table_name == "Dataset":
1295
+ continue
1296
+ # Check if this table is an asset table
1297
+ try:
1298
+ table = self.model.name_to_table(table_name)
1299
+ if self.model.is_asset(table):
1300
+ return table_name
1301
+ except (KeyError, AttributeError):
1302
+ continue
1303
+ return None
1304
+
1305
+ def _validate_dataset_types(self) -> list[str] | None:
1306
+ """Validate that the dataset or its children have Training/Testing types.
1307
+
1308
+ Checks if this dataset is of type Training or Testing, or if it has
1309
+ nested children of those types. Returns the valid types found.
1310
+
1311
+ Returns:
1312
+ List of Training/Testing type names found, or None if validation fails.
1313
+ """
1314
+ valid_types = {"Training", "Testing"}
1315
+ found_types: set[str] = set()
1316
+
1317
+ def check_dataset(ds: DatasetBag, visited: set[RID]) -> None:
1318
+ if ds.dataset_rid in visited:
1319
+ return
1320
+ visited.add(ds.dataset_rid)
1321
+
1322
+ for dtype in ds.dataset_types:
1323
+ if dtype in valid_types:
1324
+ found_types.add(dtype)
1325
+
1326
+ for child in ds.list_dataset_children():
1327
+ check_dataset(child, visited)
1328
+
1329
+ check_dataset(self, set())
1330
+ return list(found_types) if found_types else None
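A brief sketch combining the two helpers above as a pre-flight check before restructuring (hypothetical usage)::

    table = bag._detect_asset_table()            # e.g. "Image", or None if none found
    split_types = bag._validate_dataset_types()  # e.g. ["Training", "Testing"], or None
    if table is None:
        raise ValueError("no asset table found; pass asset_table explicitly")
    if split_types is None:
        print("no Training/Testing types; data will be treated as Testing")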
1331
+
1332
+ def restructure_assets(
1333
+ self,
1334
+ output_dir: Path | str,
1335
+ asset_table: str | None = None,
1336
+ group_by: list[str] | None = None,
1337
+ use_symlinks: bool = True,
1338
+ type_selector: Callable[[list[str]], str] | None = None,
1339
+ type_to_dir_map: dict[str, str] | None = None,
1340
+ enforce_vocabulary: bool = True,
1341
+ value_selector: Callable[[list[FeatureValueRecord]], FeatureValueRecord] | None = None,
1342
+ ) -> Path:
1343
+ """Restructure downloaded assets into a directory hierarchy.
1344
+
1345
+ Creates a directory structure organizing assets by dataset types and
1346
+ grouping values. This is useful for ML workflows that expect data
1347
+ organized in conventional folder structures (e.g., PyTorch ImageFolder).
1348
+
1349
+ The dataset should be of type Training or Testing, or have nested
1350
+ children of those types. The top-level directory name is determined
1351
+ by the dataset type (e.g., "Training" -> "training").
1352
+
1353
+ **Finding assets through foreign key relationships:**
1354
+
1355
+ Assets are found by traversing all foreign key paths from the dataset,
1356
+ not just direct associations. For example, if a dataset contains Subjects,
1357
+ and the schema has Subject -> Encounter -> Image relationships, this method
1358
+ will find all Images reachable through those paths even though they are
1359
+ not directly in a Dataset_Image association table.
1360
+
1361
+ **Handling datasets without types (prediction scenarios):**
1362
+
1363
+ If a dataset has no type defined, it is treated as Testing. This is
1364
+ common for prediction/inference scenarios where you want to apply a
1365
+ trained model to new unlabeled data.
1366
+
1367
+ **Handling missing labels:**
1368
+
1369
+ If an asset doesn't have a value for a group_by key (e.g., no label
1370
+ assigned), it is placed in an "Unknown" directory. This allows
1371
+ restructure_assets to work with unlabeled data for prediction.
1372
+
1373
+ Args:
1374
+ output_dir: Base directory for restructured assets.
1375
+ asset_table: Name of the asset table (e.g., "Image"). If None,
1376
+ auto-detects from dataset members (the first asset table found is
1377
+ used). Raises DerivaMLException if none can be detected.
1378
+ group_by: Names to group assets by. Each name creates a subdirectory
1379
+ level after the dataset type path. Names can be:
1380
+
1381
+ - **Column names**: Direct columns on the asset table. The column
1382
+ value becomes the subdirectory name.
1383
+ - **Feature names**: Features defined on the asset table (or tables
1384
+ it references via foreign keys). The feature's vocabulary term
1385
+ value becomes the subdirectory name.
1386
+ - **Feature.column**: Specify a particular column from a multi-term
1387
+ feature (e.g., "Classification.Label" to use the Label column).
1388
+
1389
+ Column names are checked first, then feature names. If a value
1390
+ is not found, "unknown" is used as the subdirectory name.
1391
+
1392
+ use_symlinks: If True (default), create symlinks to original files.
1393
+ If False, copy files. Symlinks save disk space but require
1394
+ the original bag to remain in place.
1395
+ type_selector: Function to select type when dataset has multiple types.
1396
+ Receives list of type names, returns selected type name.
1397
+ Defaults to selecting the first type; datasets with no types are treated as Testing.
1398
+ type_to_dir_map: Optional mapping from dataset type names to directory
1399
+ names. Defaults to {"Training": "training", "Testing": "testing",
1400
+ "Unknown": "unknown"}. Use this to customize directory names or
1401
+ add new type mappings.
1402
+ enforce_vocabulary: If True (default), only allow features that have
1403
+ controlled vocabulary term columns, and raise an error if an asset
1404
+ has multiple different values for the same feature without a
1405
+ value_selector. This ensures clean, unambiguous directory structures.
1406
+ If False, allow any feature type and use the first value found
1407
+ when multiple values exist.
1408
+ value_selector: Optional function to select which feature value to use
1409
+ when an asset has multiple values for the same feature. Receives a
1410
+ list of FeatureValueRecord objects (each containing target_rid,
1411
+ feature_name, value, execution_rid, and raw_record) and returns
1412
+ the selected FeatureValueRecord. Use execution_rid to distinguish
1413
+ between values from different executions.
1414
+
1415
+ Returns:
1416
+ Path to the output directory.
1417
+
1418
+ Raises:
1419
+ DerivaMLException: If asset_table cannot be determined (multiple
1420
+ asset tables exist without specification), if no valid dataset
1421
+ types (Training/Testing) are found, or if enforce_vocabulary
1422
+ is True and a feature has multiple values without value_selector.
1423
+
1424
+ Examples:
1425
+ Basic restructuring with auto-detected asset table::
1426
+
1427
+ bag.restructure_assets(
1428
+ output_dir="./ml_data",
1429
+ group_by=["Diagnosis"],
1430
+ )
1431
+ # Creates:
1432
+ # ./ml_data/training/Normal/image1.jpg
1433
+ # ./ml_data/testing/Abnormal/image2.jpg
1434
+
1435
+ Custom type-to-directory mapping::
1436
+
1437
+ bag.restructure_assets(
1438
+ output_dir="./ml_data",
1439
+ group_by=["Diagnosis"],
1440
+ type_to_dir_map={"Training": "train", "Testing": "test"},
1441
+ )
1442
+ # Creates:
1443
+ # ./ml_data/train/Normal/image1.jpg
1444
+ # ./ml_data/test/Abnormal/image2.jpg
1445
+
1446
+ Select specific feature column for multi-term features::
1447
+
1448
+ bag.restructure_assets(
1449
+ output_dir="./ml_data",
1450
+ group_by=["Classification.Label"], # Use Label column
1451
+ )
1452
+
1453
+ Handle multiple feature values with a selector::
1454
+
1455
+ def select_latest(records: list[FeatureValueRecord]) -> FeatureValueRecord:
1456
+ # Select value from most recent execution
1457
+ return max(records, key=lambda r: r.execution_rid or "")
1458
+
1459
+ bag.restructure_assets(
1460
+ output_dir="./ml_data",
1461
+ group_by=["Diagnosis"],
1462
+ value_selector=select_latest,
1463
+ )
1464
+
1465
+ Prediction scenario with unlabeled data::
1466
+
1467
+ # Dataset has no type - treated as Testing
1468
+ # Assets have no labels - placed in Unknown directory
1469
+ bag.restructure_assets(
1470
+ output_dir="./prediction_data",
1471
+ group_by=["Diagnosis"],
1472
+ )
1473
+ # Creates:
1474
+ # ./prediction_data/testing/Unknown/image1.jpg
1475
+ # ./prediction_data/testing/Unknown/image2.jpg
1476
+ """
1477
+ logger = logging.getLogger("deriva_ml")
1478
+ group_by = group_by or []
1479
+ output_dir = Path(output_dir)
1480
+ output_dir.mkdir(parents=True, exist_ok=True)
1481
+
1482
+ # Default type-to-directory mapping
1483
+ if type_to_dir_map is None:
1484
+ type_to_dir_map = {"Training": "training", "Testing": "testing", "Unknown": "unknown"}
1485
+
1486
+ # Auto-detect asset table if not provided
1487
+ if asset_table is None:
1488
+ asset_table = self._detect_asset_table()
1489
+ if asset_table is None:
1490
+ raise DerivaMLException(
1491
+ "Could not auto-detect asset table. No asset tables found in dataset members. "
1492
+ "Specify the asset_table parameter explicitly."
1493
+ )
1494
+ logger.info(f"Auto-detected asset table: {asset_table}")
1495
+
1496
+ # Step 1: Build dataset type path map with directory name mapping
1497
+ def map_type_to_dir(types: list[str]) -> str:
1498
+ """Map dataset types to directory name using type_to_dir_map.
1499
+
1500
+ If dataset has no types, treat it as Testing (prediction use case).
1501
+ """
1502
+ if not types:
1503
+ # No types defined - treat as Testing for prediction scenarios
1504
+ return type_to_dir_map.get("Testing", "testing")
1505
+ if type_selector:
1506
+ selected_type = type_selector(types)
1507
+ else:
1508
+ selected_type = types[0]
1509
+ return type_to_dir_map.get(selected_type, selected_type.lower())
1510
+
1511
+ type_path_map = self._build_dataset_type_path_map(map_type_to_dir)
1512
+
1513
+ # Step 2: Get asset-to-dataset mapping
1514
+ asset_dataset_map = self._get_asset_dataset_mapping(asset_table)
1515
+
1516
+ # Step 3: Load feature values cache for relevant features
1517
+ feature_cache = self._load_feature_values_cache(
1518
+ asset_table, group_by, enforce_vocabulary, value_selector
1519
+ )
1520
+
1521
+ # Step 4: Get all assets reachable through FK paths
1522
+ # This uses _get_reachable_assets which traverses FK relationships,
1523
+ # so assets connected via Subject -> Encounter -> Image are found
1524
+ # even if the dataset only contains Subjects directly.
1525
+ assets = self._get_reachable_assets(asset_table)
1526
+
1527
+ if not assets:
1528
+ logger.warning(f"No assets found in table '{asset_table}'")
1529
+ return output_dir
1530
+
1531
+ # Step 5: Process each asset
1532
+ for asset in assets:
1533
+ # Get source file path
1534
+ filename = asset.get("Filename")
1535
+ if not filename:
1536
+ logger.warning(f"Asset {asset.get('RID')} has no Filename")
1537
+ continue
1538
+
1539
+ source_path = Path(filename)
1540
+ if not source_path.exists():
1541
+ logger.warning(f"Asset file not found: {source_path}")
1542
+ continue
1543
+
1544
+ # Get dataset type path
1545
+ dataset_rid = asset_dataset_map.get(asset["RID"])
1546
+ type_path = type_path_map.get(dataset_rid, ["unknown"])
1547
+
1548
+ # Resolve grouping values
1549
+ group_path = []
1550
+ for key in group_by:
1551
+ value = self._resolve_grouping_value(asset, key, feature_cache)
1552
+ group_path.append(value)
1553
+
1554
+ # Build target directory
1555
+ target_dir = output_dir.joinpath(*type_path, *group_path)
1556
+ target_dir.mkdir(parents=True, exist_ok=True)
1557
+
1558
+ # Create link or copy
1559
+ target_path = target_dir / source_path.name
1560
+
1561
+ # Handle existing files
1562
+ if target_path.exists() or target_path.is_symlink():
1563
+ target_path.unlink()
1564
+
1565
+ if use_symlinks:
1566
+ try:
1567
+ target_path.symlink_to(source_path.resolve())
1568
+ except OSError as e:
1569
+ # Fall back to copy on platforms that don't support symlinks
1570
+ logger.warning(f"Symlink failed, falling back to copy: {e}")
1571
+ shutil.copy2(source_path, target_path)
1572
+ else:
1573
+ shutil.copy2(source_path, target_path)
443
1574
 
1575
+ return output_dir
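A usage sketch feeding the restructured tree to PyTorch, assuming torchvision is installed and a "Diagnosis" vocabulary feature exists (both assumptions, not part of this package)::

    from torchvision import datasets, transforms  # assumed to be installed

    root = bag.restructure_assets(
        output_dir="./ml_data",
        group_by=["Diagnosis"],   # hypothetical vocabulary feature
        use_symlinks=True,
    )
    train_ds = datasets.ImageFolder(
        str(root / "training"),   # default type_to_dir_map directory name
        transform=transforms.ToTensor(),
    )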
444
1576
 
445
- # Add annotations after definition to deal with forward reference issues in pydantic
446
1577
 
447
- DatasetBag.list_dataset_children = validate_call(
448
- config=ConfigDict(arbitrary_types_allowed=True),
449
- validate_return=True,
450
- )(DatasetBag.list_dataset_children)
1578
+ # Note: validate_call decorators with Self return types were removed because
1579
+ # Pydantic doesn't support typing.Self in validate_call contexts.