deriva-ml 1.17.10__py3-none-any.whl → 1.17.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. deriva_ml/__init__.py +43 -1
  2. deriva_ml/asset/__init__.py +17 -0
  3. deriva_ml/asset/asset.py +357 -0
  4. deriva_ml/asset/aux_classes.py +100 -0
  5. deriva_ml/bump_version.py +254 -11
  6. deriva_ml/catalog/__init__.py +21 -0
  7. deriva_ml/catalog/clone.py +1199 -0
  8. deriva_ml/catalog/localize.py +426 -0
  9. deriva_ml/core/__init__.py +29 -0
  10. deriva_ml/core/base.py +817 -1067
  11. deriva_ml/core/config.py +169 -21
  12. deriva_ml/core/constants.py +120 -19
  13. deriva_ml/core/definitions.py +123 -13
  14. deriva_ml/core/enums.py +47 -73
  15. deriva_ml/core/ermrest.py +226 -193
  16. deriva_ml/core/exceptions.py +297 -14
  17. deriva_ml/core/filespec.py +99 -28
  18. deriva_ml/core/logging_config.py +225 -0
  19. deriva_ml/core/mixins/__init__.py +42 -0
  20. deriva_ml/core/mixins/annotation.py +915 -0
  21. deriva_ml/core/mixins/asset.py +384 -0
  22. deriva_ml/core/mixins/dataset.py +237 -0
  23. deriva_ml/core/mixins/execution.py +408 -0
  24. deriva_ml/core/mixins/feature.py +365 -0
  25. deriva_ml/core/mixins/file.py +263 -0
  26. deriva_ml/core/mixins/path_builder.py +145 -0
  27. deriva_ml/core/mixins/rid_resolution.py +204 -0
  28. deriva_ml/core/mixins/vocabulary.py +400 -0
  29. deriva_ml/core/mixins/workflow.py +322 -0
  30. deriva_ml/core/validation.py +389 -0
  31. deriva_ml/dataset/__init__.py +2 -1
  32. deriva_ml/dataset/aux_classes.py +20 -4
  33. deriva_ml/dataset/catalog_graph.py +575 -0
  34. deriva_ml/dataset/dataset.py +1242 -1008
  35. deriva_ml/dataset/dataset_bag.py +1311 -182
  36. deriva_ml/dataset/history.py +27 -14
  37. deriva_ml/dataset/upload.py +225 -38
  38. deriva_ml/demo_catalog.py +126 -110
  39. deriva_ml/execution/__init__.py +46 -2
  40. deriva_ml/execution/base_config.py +639 -0
  41. deriva_ml/execution/execution.py +543 -242
  42. deriva_ml/execution/execution_configuration.py +26 -11
  43. deriva_ml/execution/execution_record.py +592 -0
  44. deriva_ml/execution/find_caller.py +298 -0
  45. deriva_ml/execution/model_protocol.py +175 -0
  46. deriva_ml/execution/multirun_config.py +153 -0
  47. deriva_ml/execution/runner.py +595 -0
  48. deriva_ml/execution/workflow.py +223 -34
  49. deriva_ml/experiment/__init__.py +8 -0
  50. deriva_ml/experiment/experiment.py +411 -0
  51. deriva_ml/feature.py +6 -1
  52. deriva_ml/install_kernel.py +143 -6
  53. deriva_ml/interfaces.py +862 -0
  54. deriva_ml/model/__init__.py +99 -0
  55. deriva_ml/model/annotations.py +1278 -0
  56. deriva_ml/model/catalog.py +286 -60
  57. deriva_ml/model/database.py +144 -649
  58. deriva_ml/model/deriva_ml_database.py +308 -0
  59. deriva_ml/model/handles.py +14 -0
  60. deriva_ml/run_model.py +319 -0
  61. deriva_ml/run_notebook.py +507 -38
  62. deriva_ml/schema/__init__.py +18 -2
  63. deriva_ml/schema/annotations.py +62 -33
  64. deriva_ml/schema/create_schema.py +169 -69
  65. deriva_ml/schema/validation.py +601 -0
  66. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/METADATA +4 -4
  67. deriva_ml-1.17.11.dist-info/RECORD +77 -0
  68. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/WHEEL +1 -1
  69. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/entry_points.txt +1 -0
  70. deriva_ml/protocols/dataset.py +0 -19
  71. deriva_ml/test.py +0 -94
  72. deriva_ml-1.17.10.dist-info/RECORD +0 -45
  73. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/licenses/LICENSE +0 -0
  74. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/top_level.txt +0 -0
@@ -14,10 +14,16 @@ DerivaML class instances.
 
 Typical usage example:
 >>> ml = DerivaML('deriva.example.org', 'my_catalog')
->>> dataset_rid = ml.create_dataset('experiment', 'Experimental data')
->>> ml.add_dataset_members(dataset_rid=dataset_rid, members=['1-abc123', '1-def456'])
->>> ml.increment_dataset_version(datset_rid=dataset_rid, component=VersionPart.minor,
-...     description='Added new samples')
+>>> with ml.create_execution(config) as exe:
+...     dataset = exe.create_dataset(
+...         dataset_types=['experiment'],
+...         description='Experimental data'
+...     )
+...     dataset.add_dataset_members(members=['1-abc123', '1-def456'])
+...     dataset.increment_dataset_version(
+...         component=VersionPart.minor,
+...         description='Added new samples'
+...     )
 """
 
 from __future__ import annotations
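The headline change in this hunk is that dataset creation, membership, and version bumps move from DerivaML-level calls that take a dataset_rid to methods on an execution-scoped Dataset object. A minimal sketch of the new flow, pieced together from the docstring above; the host, catalog id, member RIDs, and the value of config are placeholders, and the top-level DerivaML import is an assumption:

# Sketch only: method names come from the updated docstring; everything else is illustrative.
from deriva_ml import DerivaML  # assumed top-level export
from deriva_ml.dataset.aux_classes import VersionPart

ml = DerivaML('deriva.example.org', 'my_catalog')
config = ...  # an execution configuration; how to build one is not shown in this diff

with ml.create_execution(config) as exe:
    # Datasets are now created through the execution, which records provenance.
    dataset = exe.create_dataset(
        dataset_types=['experiment'],
        description='Experimental data',
    )
    # Membership and versioning are methods on the Dataset object itself,
    # so no dataset_rid argument is threaded through each call.
    dataset.add_dataset_members(members=['1-abc123', '1-def456'])
    dataset.increment_dataset_version(
        component=VersionPart.minor,
        description='Added new samples',
    )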
@@ -29,21 +35,23 @@ from collections import defaultdict
 # Standard library imports
 from graphlib import TopologicalSorter
 from pathlib import Path
+
+# Local imports
+from pprint import pformat
 from tempfile import TemporaryDirectory
-from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator
+from typing import Any, Generator, Iterable, Self
 from urllib.parse import urlparse
 
+# Deriva imports
 import deriva.core.utils.hash_utils as hash_utils
-import requests
 
 # Third-party imports
+import pandas as pd
+import requests
 from bdbag import bdbag_api as bdb
 from bdbag.fetch.fetcher import fetch_single_file
-
-# Deriva imports
 from deriva.core.ermrest_model import Table
 from deriva.core.utils.core_utils import format_exception
-from deriva.core.utils.core_utils import tag as deriva_tags
 from deriva.transfer.download.deriva_download import (
     DerivaDownloadAuthenticationError,
     DerivaDownloadAuthorizationError,
@@ -54,22 +62,25 @@ from deriva.transfer.download.deriva_download import (
 from deriva.transfer.download.deriva_export import DerivaExport
 from pydantic import ConfigDict, validate_call
 
-# Local imports
 try:
     from icecream import ic
 
-    ic.configureOutput(includeContext=True)
+    ic.configureOutput(
+        includeContext=True,
+        argToStringFunction=lambda x: pformat(x.model_dump() if hasattr(x, "model_dump") else x, width=80, depth=10),
+    )
+
 except ImportError:  # Graceful fallback if IceCream isn't installed.
     ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a)  # noqa
 
 from deriva_ml.core.constants import RID
 from deriva_ml.core.definitions import (
     DRY_RUN_RID,
-    ML_SCHEMA,
     MLVocab,
     Status,
+    VocabularyTerm,
 )
-from deriva_ml.core.exceptions import DerivaMLException, DerivaMLTableTypeError
+from deriva_ml.core.exceptions import DerivaMLException
 from deriva_ml.dataset.aux_classes import (
     DatasetHistory,
     DatasetMinid,
@@ -77,18 +88,12 @@ from deriva_ml.dataset.aux_classes import (
     DatasetVersion,
     VersionPart,
 )
+from deriva_ml.dataset.catalog_graph import CatalogGraph
 from deriva_ml.dataset.dataset_bag import DatasetBag
-from deriva_ml.model.catalog import DerivaModel
+from deriva_ml.feature import Feature
+from deriva_ml.interfaces import DerivaMLCatalog
 from deriva_ml.model.database import DatabaseModel
 
-from .history import iso_to_snap
-
-# Stop pycharm from complaining about undefined reference in docstring....
-ml: DerivaML
-
-if TYPE_CHECKING:
-    from deriva_ml.core.base import DerivaML
-
 
 class Dataset:
     """Manages dataset operations in a Deriva catalog.
@@ -96,139 +101,368 @@ class Dataset:
     The Dataset class provides functionality for creating, modifying, and tracking datasets
     in a Deriva catalog. It handles versioning, relationships between datasets, and data export.
 
+    A Dataset is a versioned collection of related data elements. Each dataset:
+    - Has a unique RID (Resource Identifier) within the catalog
+    - Maintains a version history using semantic versioning (major.minor.patch)
+    - Can contain nested datasets, forming a hierarchy
+    - Can be exported as a BDBag for offline use or sharing
+
+    The class implements the DatasetLike protocol, allowing code to work uniformly
+    with both live catalog datasets and downloaded DatasetBag objects.
+
     Attributes:
-        dataset_table (Table): ERMrest table storing dataset information.
-        _model (DerivaModel): Catalog model instance.
-        _ml_schema (str): Schema name for ML-specific tables.
-        _cache_dir (Path): Directory for caching downloaded datasets.
-        _working_dir (Path): Directory for working data.
-        _use_minid (bool): Whether to use MINID service for dataset identification.
-
-    Note:
-        This class is typically used as a base class, with its methods accessed through
-        DerivaML class instances rather than directly.
+        dataset_rid (RID): The unique Resource Identifier for this dataset.
+        dataset_types (list[str]): List of vocabulary terms describing the dataset type.
+        description (str): Human-readable description of the dataset.
+        execution_rid (RID | None): Optional RID of the execution that created this dataset.
+        _ml_instance (DerivaMLCatalog): Reference to the catalog containing this dataset.
+
+    Example:
+        >>> # Create a new dataset via an execution
+        >>> with ml.create_execution(config) as exe:
+        ...     dataset = exe.create_dataset(
+        ...         dataset_types=["training_data"],
+        ...         description="Image classification training set"
+        ...     )
+        ...     # Add members to the dataset
+        ...     dataset.add_dataset_members(members=["1-abc", "1-def"])
+        ...     # Increment version after changes
+        ...     new_version = dataset.increment_dataset_version(VersionPart.minor, "Added samples")
+        >>> # Download for offline use
+        >>> bag = dataset.download_dataset_bag(version=new_version)
     """
 
-    _Logger = logging.getLogger("deriva_ml")
-
+    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
     def __init__(
         self,
-        model: DerivaModel,
-        cache_dir: Path,
-        working_dir: Path,
-        use_minid: bool = True,
+        catalog: DerivaMLCatalog,
+        dataset_rid: RID,
+        description: str = "",
+        execution_rid: RID | None = None,
     ):
-        """Initializes a Dataset instance.
+        """Initialize a Dataset object from an existing dataset in the catalog.
+
+        This constructor wraps an existing dataset record. To create a new dataset
+        in the catalog, use the static method Dataset.create_dataset() instead.
 
         Args:
-            model: DerivaModel instance representing the catalog.
-            cache_dir: Directory path for caching downloaded datasets.
-            working_dir: Directory path for working data.
-            use_minid: Whether to use MINID service for dataset identification.
+            catalog: The DerivaMLCatalog instance containing this dataset.
+            dataset_rid: The RID of the existing dataset record.
+            description: Human-readable description of the dataset's purpose and contents.
+            execution_rid: Optional execution RID that created or is associated with this dataset.
+
+        Example:
+            >>> # Wrap an existing dataset
+            >>> dataset = Dataset(catalog=ml, dataset_rid="4HM")
        """
-        self._model = model
-        self._ml_schema = ML_SCHEMA
-        self._cache_dir = cache_dir
-        self._working_dir = working_dir
         self._logger = logging.getLogger("deriva_ml")
-        self._use_minid = use_minid
+        self.dataset_rid = dataset_rid
+        self.execution_rid = execution_rid
+        self._ml_instance = catalog
+        self.description = description
+
+    def __repr__(self) -> str:
+        """Return a string representation of the Dataset for debugging."""
+        return (f"<deriva_ml.Dataset object at {hex(id(self))}: rid='{self.dataset_rid}', "
+                f"version='{self.current_version}', types={self.dataset_types}>")
+
+    def __hash__(self) -> int:
+        """Return hash based on dataset RID for use in sets and as dict keys.
+
+        This allows Dataset objects to be stored in sets and used as dictionary keys.
+        Two Dataset objects with the same RID will hash to the same value.
+        """
+        return hash(self.dataset_rid)
+
+    def __eq__(self, other: object) -> bool:
+        """Check equality based on dataset RID.
+
+        Two Dataset objects are considered equal if they reference the same
+        dataset RID, regardless of other attributes like version or types.
+
+        Args:
+            other: Object to compare with.
+
+        Returns:
+            True if other is a Dataset with the same RID, False otherwise.
+            Returns NotImplemented for non-Dataset objects.
+        """
+        if not isinstance(other, Dataset):
+            return NotImplemented
+        return self.dataset_rid == other.dataset_rid
+
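Because __hash__ and __eq__ key purely on the RID, two Dataset handles that wrap the same catalog record deduplicate in sets and collide as dict keys, which is what the docstrings above promise. A small illustrative sketch; the lookup_dataset call and the "4HM" RID are borrowed from docstring examples elsewhere in this diff, and the rest is hypothetical:

# Two independent lookups of the same record behave as one key.
a = ml.lookup_dataset("4HM")
b = ml.lookup_dataset("4HM")
assert a == b
assert len({a, b}) == 1          # set deduplicates by RID

per_dataset_status = {a: "trained"}
print(per_dataset_status[b])     # "trained" -- same RID, same key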
194
+ def _get_dataset_type_association_table(self) -> tuple[str, Any]:
195
+ """Get the association table for dataset types.
196
+
197
+ Returns:
198
+ Tuple of (table_name, table_path) for the Dataset-Dataset_Type association table.
199
+ """
200
+ associations = list(
201
+ self._ml_instance.model.schemas[self._ml_instance.ml_schema]
202
+ .tables[MLVocab.dataset_type]
203
+ .find_associations()
204
+ )
205
+ atable_name = associations[0].name if associations else None
206
+ pb = self._ml_instance.pathBuilder()
207
+ atable_path = pb.schemas[self._ml_instance.ml_schema].tables[atable_name]
208
+ return atable_name, atable_path
135
209
 
136
210
  @property
137
- def _dataset_table(self):
138
- return self._model.schemas[self._ml_schema].tables["Dataset"]
211
+ def dataset_types(self) -> list[str]:
212
+ """Get the dataset types from the catalog.
139
213
 
140
- def _is_dataset_rid(self, dataset_rid: RID, deleted: bool = False) -> bool:
141
- try:
142
- rid_info = self._model.catalog.resolve_rid(dataset_rid, self._model.model)
143
- except KeyError as _e:
144
- raise DerivaMLException(f"Invalid RID {dataset_rid}")
145
- if rid_info.table != self._dataset_table:
146
- return False
147
- elif deleted:
148
- # Got a dataset rid. Now check to see if its deleted or not.
149
- return True
150
- else:
151
- return not list(rid_info.datapath.entities().fetch())[0]["Deleted"]
214
+ This property fetches the current dataset types directly from the catalog,
215
+ ensuring consistency when multiple Dataset instances reference the same
216
+ dataset or when types are modified externally.
152
217
 
153
- def _insert_dataset_versions(
154
- self,
155
- dataset_list: list[DatasetSpec],
156
- description: str | None = "",
157
- execution_rid: RID | None = None,
158
- ) -> None:
159
- schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
160
- # determine snapshot after changes were made
218
+ Returns:
219
+ List of dataset type term names from the Dataset_Type vocabulary.
220
+ """
221
+ _, atable_path = self._get_dataset_type_association_table()
222
+ ds_types = (
223
+ atable_path.filter(atable_path.Dataset == self.dataset_rid)
224
+ .attributes(atable_path.Dataset_Type)
225
+ .fetch()
226
+ )
227
+ return [ds[MLVocab.dataset_type] for ds in ds_types]
161
228
 
162
- # Construct version records for insert
163
- version_records = schema_path.tables["Dataset_Version"].insert(
229
+ @staticmethod
230
+ @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
231
+ def create_dataset(
232
+ ml_instance: DerivaMLCatalog,
233
+ execution_rid: RID,
234
+ dataset_types: str | list[str] | None = None,
235
+ description: str = "",
236
+ version: DatasetVersion | None = None,
237
+ ) -> Self:
238
+ """Creates a new dataset in the catalog.
239
+
240
+ Creates a dataset with specified types and description. The dataset must be
241
+ associated with an execution for provenance tracking.
242
+
243
+ Args:
244
+ ml_instance: DerivaMLCatalog instance.
245
+ execution_rid: Execution RID to associate with dataset creation (required).
246
+ dataset_types: One or more dataset type terms from Dataset_Type vocabulary.
247
+ description: Description of the dataset's purpose and contents.
248
+ version: Optional initial version number. Defaults to 0.1.0.
249
+
250
+ Returns:
251
+ Dataset: The newly created dataset.
252
+
253
+ Raises:
254
+ DerivaMLException: If dataset_types are invalid or creation fails.
255
+
256
+ Example:
257
+ >>> with ml.create_execution(config) as exe:
258
+ ... dataset = exe.create_dataset(
259
+ ... dataset_types=["experiment", "raw_data"],
260
+ ... description="RNA sequencing experiment data",
261
+ ... version=DatasetVersion(1, 0, 0)
262
+ ... )
263
+ """
264
+
265
+ version = version or DatasetVersion(0, 1, 0)
266
+
267
+ # Validate dataset types
268
+ ds_types = [dataset_types] if isinstance(dataset_types, str) else dataset_types
269
+ dataset_types = [ml_instance.lookup_term(MLVocab.dataset_type, t) for t in ds_types]
270
+
271
+ # Create the entry for the new dataset_table and get its RID.
272
+ pb = ml_instance.pathBuilder()
273
+ dataset_table_path = pb.schemas[ml_instance._dataset_table.schema.name].tables[ml_instance._dataset_table.name]
274
+ dataset_rid = dataset_table_path.insert(
164
275
  [
165
276
  {
166
- "Dataset": dataset.rid,
167
- "Version": str(dataset.version),
168
277
  "Description": description,
169
- "Execution": execution_rid,
278
+ "Deleted": False,
170
279
  }
171
- for dataset in dataset_list
172
280
  ]
281
+ )[0]["RID"]
282
+
283
+ pb.schemas[ml_instance.model.ml_schema].Dataset_Execution.insert(
284
+ [{"Dataset": dataset_rid, "Execution": execution_rid}]
173
285
  )
174
- version_records = list(version_records)
175
- snap = self._model.catalog.get("/").json()["snaptime"]
176
- schema_path.tables["Dataset_Version"].update(
177
- [{"RID": v["RID"], "Dataset": v["Dataset"], "Snapshot": snap} for v in version_records]
286
+ Dataset._insert_dataset_versions(
287
+ ml_instance=ml_instance,
288
+ dataset_list=[DatasetSpec(rid=dataset_rid, version=version)],
289
+ execution_rid=execution_rid,
290
+ description="Initial dataset creation.",
291
+ )
292
+ dataset = Dataset(
293
+ catalog=ml_instance,
294
+ dataset_rid=dataset_rid,
295
+ description=description,
178
296
  )
179
297
 
180
- # And update the dataset records.
181
- schema_path.tables["Dataset"].update([{"Version": v["RID"], "RID": v["Dataset"]} for v in version_records])
298
+ # Skip version increment during initial creation (version already set above)
299
+ dataset.add_dataset_types(dataset_types, _skip_version_increment=True)
300
+ return dataset
182
301
 
183
- def _bootstrap_versions(self):
184
- datasets = [ds["RID"] for ds in self.find_datasets()]
185
- ds_version = [
186
- {
187
- "Dataset": d,
188
- "Version": "0.1.0",
189
- "Description": "Dataset at the time of conversion to versioned datasets",
190
- }
191
- for d in datasets
192
- ]
193
- schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
194
- version_path = schema_path.tables["Dataset_Version"]
195
- dataset_path = schema_path.tables["Dataset"]
196
- history = list(version_path.insert(ds_version))
197
- dataset_versions = [{"RID": h["Dataset"], "Version": h["Version"]} for h in history]
198
- dataset_path.update(dataset_versions)
199
-
200
- def _synchronize_dataset_versions(self):
201
- datasets = [ds["RID"] for ds in self.find_datasets()]
202
- for ds in datasets:
203
- self.dataset_version(ds)
204
- schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
205
- dataset_version_path = schema_path.tables["Dataset_Version"]
206
- # Get the maximum version number for each dataset.
207
- versions = {}
208
- for v in dataset_version_path.entities().fetch():
209
- if v["Version"] > versions.get("Dataset", DatasetVersion(0, 0, 0)):
210
- versions[v["Dataset"]] = v
211
- dataset_path = schema_path.tables["Dataset"]
212
-
213
- dataset_path.update([{"RID": dataset, "Version": version["RID"]} for dataset, version in versions.items()])
214
-
215
- def _set_version_snapshot(self):
216
- """Update the Snapshot column of the Dataset_Version table to the correct time."""
217
- dataset_version_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema].tables["Dataset_Version"]
218
- versions = dataset_version_path.entities().fetch()
219
- dataset_version_path.update(
220
- [{"RID": h["RID"], "Snapshot": iso_to_snap(h["RCT"])} for h in versions if not h["Snapshot"]]
221
- )
302
+ def add_dataset_type(
303
+ self,
304
+ dataset_type: str | VocabularyTerm,
305
+ _skip_version_increment: bool = False,
306
+ ) -> None:
307
+ """Add a dataset type to this dataset.
308
+
309
+ Adds a type term to this dataset if it's not already present. The term must
310
+ exist in the Dataset_Type vocabulary. Also increments the dataset's minor
311
+ version to reflect the metadata change.
312
+
313
+ Args:
314
+ dataset_type: Term name (string) or VocabularyTerm object from Dataset_Type vocabulary.
315
+ _skip_version_increment: Internal parameter to skip version increment when
316
+ called from add_dataset_types (which handles versioning itself).
317
+
318
+ Raises:
319
+ DerivaMLInvalidTerm: If the term doesn't exist in the Dataset_Type vocabulary.
320
+
321
+ Example:
322
+ >>> dataset.add_dataset_type("Training")
323
+ >>> dataset.add_dataset_type("Validation")
324
+ """
325
+ # Convert to VocabularyTerm if needed (validates the term exists)
326
+ if isinstance(dataset_type, VocabularyTerm):
327
+ vocab_term = dataset_type
328
+ else:
329
+ vocab_term = self._ml_instance.lookup_term(MLVocab.dataset_type, dataset_type)
330
+
331
+ # Check if already present
332
+ if vocab_term.name in self.dataset_types:
333
+ return
334
+
335
+ # Insert into association table
336
+ _, atable_path = self._get_dataset_type_association_table()
337
+ atable_path.insert([{MLVocab.dataset_type: vocab_term.name, "Dataset": self.dataset_rid}])
338
+
339
+ # Increment minor version to reflect metadata change (unless called from add_dataset_types)
340
+ if not _skip_version_increment:
341
+ self.increment_dataset_version(
342
+ VersionPart.minor,
343
+ description=f"Added dataset type: {vocab_term.name}",
344
+ )
345
+
346
+ def remove_dataset_type(self, dataset_type: str | VocabularyTerm) -> None:
347
+ """Remove a dataset type from this dataset.
348
+
349
+ Removes a type term from this dataset if it's currently associated. The term
350
+ must exist in the Dataset_Type vocabulary.
351
+
352
+ Args:
353
+ dataset_type: Term name (string) or VocabularyTerm object from Dataset_Type vocabulary.
354
+
355
+ Raises:
356
+ DerivaMLInvalidTerm: If the term doesn't exist in the Dataset_Type vocabulary.
357
+
358
+ Example:
359
+ >>> dataset.remove_dataset_type("Training")
360
+ """
361
+ # Convert to VocabularyTerm if needed (validates the term exists)
362
+ if isinstance(dataset_type, VocabularyTerm):
363
+ vocab_term = dataset_type
364
+ else:
365
+ vocab_term = self._ml_instance.lookup_term(MLVocab.dataset_type, dataset_type)
366
+
367
+ # Check if present
368
+ if vocab_term.name not in self.dataset_types:
369
+ return
370
+
371
+ # Delete from association table
372
+ _, atable_path = self._get_dataset_type_association_table()
373
+ atable_path.filter(
374
+ (atable_path.Dataset == self.dataset_rid) & (atable_path.Dataset_Type == vocab_term.name)
375
+ ).delete()
376
+
377
+ def add_dataset_types(
378
+ self,
379
+ dataset_types: str | VocabularyTerm | list[str | VocabularyTerm],
380
+ _skip_version_increment: bool = False,
381
+ ) -> None:
382
+ """Add one or more dataset types to this dataset.
383
+
384
+ Convenience method for adding multiple types at once. Each term must exist
385
+ in the Dataset_Type vocabulary. Types that are already associated with the
386
+ dataset are silently skipped. Increments the dataset's minor version once
387
+ after all types are added.
222
388
 
223
- def dataset_history(self, dataset_rid: RID) -> list[DatasetHistory]:
389
+ Args:
390
+ dataset_types: Single term or list of terms. Can be strings (term names)
391
+ or VocabularyTerm objects.
392
+ _skip_version_increment: Internal parameter to skip version increment
393
+ (used during initial dataset creation).
394
+
395
+ Raises:
396
+ DerivaMLInvalidTerm: If any term doesn't exist in the Dataset_Type vocabulary.
397
+
398
+ Example:
399
+ >>> dataset.add_dataset_types(["Training", "Image"])
400
+ >>> dataset.add_dataset_types("Testing")
401
+ """
402
+ # Normalize input to a list
403
+ types_to_add = [dataset_types] if not isinstance(dataset_types, list) else dataset_types
404
+
405
+ # Track which types were actually added (not already present)
406
+ added_types: list[str] = []
407
+ for term in types_to_add:
408
+ # Get term name before calling add_dataset_type
409
+ if isinstance(term, VocabularyTerm):
410
+ term_name = term.name
411
+ else:
412
+ term_name = self._ml_instance.lookup_term(MLVocab.dataset_type, term).name
413
+
414
+ # Check if already present before adding
415
+ if term_name not in self.dataset_types:
416
+ self.add_dataset_type(term, _skip_version_increment=True)
417
+ added_types.append(term_name)
418
+
419
+ # Increment version once for all added types (if any were added)
420
+ if added_types and not _skip_version_increment:
421
+ type_names = ", ".join(added_types)
422
+ self.increment_dataset_version(
423
+ VersionPart.minor,
424
+ description=f"Added dataset type(s): {type_names}",
425
+ )
426
+
427
+ @property
428
+ def _dataset_table(self) -> Table:
429
+ """Get the Dataset table from the catalog schema.
430
+
431
+ Returns:
432
+ Table: The Deriva Table object for the Dataset table in the ML schema.
433
+ """
434
+ return self._ml_instance.model.schemas[self._ml_instance.ml_schema].tables["Dataset"]
435
+
436
+ # ==================== Read Interface Methods ====================
437
+ # These methods implement the DatasetLike protocol for read operations.
438
+ # They delegate to the catalog instance for actual data retrieval.
439
+ # This allows Dataset and DatasetBag to share a common interface.
440
+
441
+ def list_dataset_element_types(self) -> Iterable[Table]:
442
+ """List the types of elements that can be contained in this dataset.
443
+
444
+ Returns:
445
+ Iterable of Table objects representing element types.
446
+ """
447
+ return self._ml_instance.list_dataset_element_types()
448
+
449
+ def find_features(self, table: str | Table) -> Iterable[Feature]:
450
+ """Find features associated with a table.
451
+
452
+ Args:
453
+ table: Table to find features for.
454
+
455
+ Returns:
456
+ Iterable of Feature objects.
457
+ """
458
+ return self._ml_instance.find_features(table)
459
+
460
+ def dataset_history(self) -> list[DatasetHistory]:
224
461
  """Retrieves the version history of a dataset.
225
462
 
226
463
  Returns a chronological list of dataset versions, including their version numbers,
227
464
  creation times, and associated metadata.
228
465
 
229
- Args:
230
- dataset_rid: Resource Identifier of the dataset.
231
-
232
466
  Returns:
233
467
  list[DatasetHistory]: List of history entries, each containing:
234
468
  - dataset_version: Version number (major.minor.patch)
@@ -248,38 +482,36 @@ class Dataset:
248
482
  ... print(f"Version {entry.dataset_version}: {entry.description}")
249
483
  """
250
484
 
251
- if not self._is_dataset_rid(dataset_rid):
252
- raise DerivaMLException(f"RID is not for a data set: {dataset_rid}")
253
- version_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema].tables["Dataset_Version"]
485
+ if not self._ml_instance.model.is_dataset_rid(self.dataset_rid):
486
+ raise DerivaMLException(f"RID is not for a data set: {self.dataset_rid}")
487
+ version_path = self._ml_instance.pathBuilder().schemas[self._ml_instance.ml_schema].tables["Dataset_Version"]
254
488
  return [
255
489
  DatasetHistory(
256
490
  dataset_version=DatasetVersion.parse(v["Version"]),
257
491
  minid=v["Minid"],
258
492
  snapshot=v["Snapshot"],
259
- dataset_rid=dataset_rid,
493
+ dataset_rid=self.dataset_rid,
260
494
  version_rid=v["RID"],
261
495
  description=v["Description"],
262
496
  execution_rid=v["Execution"],
263
497
  )
264
- for v in version_path.filter(version_path.Dataset == dataset_rid).entities().fetch()
498
+ for v in version_path.filter(version_path.Dataset == self.dataset_rid).entities().fetch()
265
499
  ]
266
500
 
267
- @validate_call
268
- def dataset_version(self, dataset_rid: RID) -> DatasetVersion:
501
+ @property
502
+ @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
503
+ def current_version(self) -> DatasetVersion:
269
504
  """Retrieve the current version of the specified dataset_table.
270
505
 
271
- Given a rid, return the most recent version of the dataset. It is important to remember that this version
506
+ Return the most recent version of the dataset. It is important to remember that this version
272
507
  captures the state of the catalog at the time the version was created, not the current state of the catalog.
273
508
  This means that its possible that the values associated with an object in the catalog may be different
274
509
  from the values of that object in the dataset.
275
510
 
276
- Args:
277
- dataset_rid: The RID of the dataset to retrieve the version for.
278
-
279
511
  Returns:
280
512
  A tuple with the semantic version of the dataset_table.
281
513
  """
282
- history = self.dataset_history(dataset_rid)
514
+ history = self.dataset_history()
283
515
  if not history:
284
516
  return DatasetVersion(0, 1, 0)
285
517
  else:
@@ -287,28 +519,129 @@ class Dataset:
287
519
  versions = [h.dataset_version for h in history]
288
520
  return max(versions) if versions else DatasetVersion(0, 1, 0)
289
521
 
290
- def _build_dataset_graph(self, dataset_rid: RID) -> Iterable[RID]:
522
+ def get_chaise_url(self) -> str:
523
+ """Get the Chaise URL for viewing this dataset in the browser.
524
+
525
+ Returns:
526
+ URL string for the dataset record in Chaise.
527
+ """
528
+ return (
529
+ f"https://{self._ml_instance.host_name}/chaise/record/"
530
+ f"#{self._ml_instance.catalog_id}/deriva-ml:Dataset/RID={self.dataset_rid}"
531
+ )
532
+
533
+ def to_markdown(self, show_children: bool = False, indent: int = 0) -> str:
534
+ """Generate a markdown representation of this dataset.
535
+
536
+ Returns a formatted markdown string with a link to the dataset,
537
+ version, types, and description. Optionally includes nested children.
538
+
539
+ Args:
540
+ show_children: If True, include direct child datasets.
541
+ indent: Number of indent levels (each level is 2 spaces).
542
+
543
+ Returns:
544
+ Markdown-formatted string.
545
+
546
+ Example:
547
+ >>> ds = ml.lookup_dataset("4HM")
548
+ >>> print(ds.to_markdown())
549
+ """
550
+ prefix = " " * indent
551
+ version = str(self.current_version) if self.current_version else "n/a"
552
+ types = ", ".join(self.dataset_types) if self.dataset_types else ""
553
+ desc = self.description or ""
554
+
555
+ line = f"{prefix}- [{self.dataset_rid}]({self.get_chaise_url()}) v{version}"
556
+ if types:
557
+ line += f" [{types}]"
558
+ if desc:
559
+ line += f": {desc}"
560
+
561
+ lines = [line]
562
+
563
+ if show_children:
564
+ children = self.list_dataset_children(recurse=False)
565
+ for child in children:
566
+ lines.append(child.to_markdown(show_children=False, indent=indent + 1))
567
+
568
+ return "\n".join(lines)
569
+
570
+ def display_markdown(self, show_children: bool = False, indent: int = 0) -> None:
571
+ """Display a formatted markdown representation of this dataset in Jupyter.
572
+
573
+ Convenience method that calls to_markdown() and displays the result
574
+ using IPython.display.Markdown.
575
+
576
+ Args:
577
+ show_children: If True, include direct child datasets.
578
+ indent: Number of indent levels (each level is 2 spaces).
579
+
580
+ Example:
581
+ >>> ds = ml.lookup_dataset("4HM")
582
+ >>> ds.display_markdown(show_children=True)
583
+ """
584
+ from IPython.display import display, Markdown
585
+
586
+ display(Markdown(self.to_markdown(show_children, indent)))
587
+
588
+ def _build_dataset_graph(self) -> Iterable[Dataset]:
589
+ """Build a dependency graph of all related datasets and return in topological order.
590
+
591
+ This method is used when incrementing dataset versions. Because datasets can be
592
+ nested (parent-child relationships), changing the version of one dataset may
593
+ require updating related datasets.
594
+
595
+ The topological sort ensures that children are processed before parents,
596
+ so version updates propagate correctly through the hierarchy.
597
+
598
+ Returns:
599
+ Iterable[Dataset]: Datasets in topological order (children before parents).
600
+
601
+ Example:
602
+ If dataset A contains nested dataset B, which contains C:
603
+ A -> B -> C
604
+ The returned order would be [C, B, A], ensuring C's version is
605
+ updated before B's, and B's before A's.
606
+ """
291
607
  ts: TopologicalSorter = TopologicalSorter()
292
- self._build_dataset_graph_1(dataset_rid, ts, set())
608
+ self._build_dataset_graph_1(ts, set())
293
609
  return ts.static_order()
294
610
 
295
- def _build_dataset_graph_1(self, dataset_rid: RID, ts: TopologicalSorter, visited) -> None:
296
- """Use topological sort to return bottom up list of nested datasets"""
297
- ts.add(dataset_rid)
298
- if dataset_rid not in visited:
299
- visited.add(dataset_rid)
300
- children = self.list_dataset_children(dataset_rid=dataset_rid)
301
- parents = self.list_dataset_parents(dataset_rid=dataset_rid)
302
- for parent in parents:
303
- # Convert string to RID type
304
- self._build_dataset_graph_1(RID(parent), ts, visited)
305
- for child in children:
306
- self._build_dataset_graph_1(child, ts, visited)
611
+ def _build_dataset_graph_1(self, ts: TopologicalSorter, visited: set[str]) -> None:
612
+ """Recursively build the dataset dependency graph.
613
+
614
+ Uses topological sort where parents depend on their children, ensuring
615
+ children are processed before parents in the resulting order.
616
+
617
+ Args:
618
+ ts: TopologicalSorter instance to add nodes and dependencies to.
619
+ visited: Set of already-visited dataset RIDs to avoid cycles.
620
+ """
621
+ if self.dataset_rid in visited:
622
+ return
623
+
624
+ visited.add(self.dataset_rid)
625
+ # Use current catalog state for graph traversal, not version snapshot.
626
+ # Parent/child relationships need to reflect current state for version updates.
627
+ children = self._list_dataset_children_current()
628
+ parents = self._list_dataset_parents_current()
629
+
630
+ # Add this node with its children as dependencies.
631
+ # This means: self depends on children, so children will be ordered before self.
632
+ ts.add(self, *children)
633
+
634
+ # Recursively process children
635
+ for child in children:
636
+ child._build_dataset_graph_1(ts, visited)
637
+
638
+ # Recursively process parents (they will depend on this node)
639
+ for parent in parents:
640
+ parent._build_dataset_graph_1(ts, visited)
307
641
 
308
642
  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
309
643
  def increment_dataset_version(
310
644
  self,
311
- dataset_rid: RID,
312
645
  component: VersionPart,
313
646
  description: str | None = "",
314
647
  execution_rid: RID | None = None,
@@ -320,7 +653,6 @@ class Dataset:
320
653
  and execution reference.
321
654
 
322
655
  Args:
323
- dataset_rid: Resource Identifier of the dataset to version.
324
656
  component: Which version component to increment ('major', 'minor', or 'patch').
325
657
  description: Optional description of the changes in this version.
326
658
  execution_rid: Optional execution RID to associate with this version.
@@ -341,190 +673,27 @@ class Dataset:
341
673
  """
342
674
 
343
675
  # Find all the datasets that are reachable from this dataset and determine their new version numbers.
344
- related_datasets = list(self._build_dataset_graph(dataset_rid=dataset_rid))
676
+ related_datasets = list(self._build_dataset_graph())
345
677
  version_update_list = [
346
678
  DatasetSpec(
347
- rid=ds_rid,
348
- version=self.dataset_version(ds_rid).increment_version(component),
679
+ rid=ds.dataset_rid,
680
+ version=ds.current_version.increment_version(component),
349
681
  )
350
- for ds_rid in related_datasets
682
+ for ds in related_datasets
351
683
  ]
352
- self._insert_dataset_versions(version_update_list, description=description, execution_rid=execution_rid)
353
- return next((d.version for d in version_update_list if d.rid == dataset_rid))
354
-
355
- @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
356
- def create_dataset(
357
- self,
358
- dataset_types: str | list[str] | None = None,
359
- description: str = "",
360
- execution_rid: RID | None = None,
361
- version: DatasetVersion | None = None,
362
- ) -> RID:
363
- """Creates a new dataset in the catalog.
364
-
365
- Creates a dataset with specified types and description. The dataset can be associated
366
- with an execution and initialized with a specific version.
367
-
368
- Args:
369
- dataset_types: One or more dataset type terms from Dataset_Type vocabulary.
370
- description: Description of the dataset's purpose and contents.
371
- execution_rid: Optional execution RID to associate with dataset creation.
372
- version: Optional initial version number. Defaults to 0.1.0.
373
-
374
- Returns:
375
- RID: Resource Identifier of the newly created dataset.
376
-
377
- Raises:
378
- DerivaMLException: If dataset_types are invalid or creation fails.
379
-
380
- Example:
381
- >>> rid = ml.create_dataset(
382
- ... dataset_types=["experiment", "raw_data"],
383
- ... description="RNA sequencing experiment data",
384
- ... version=DatasetVersion(1, 0, 0)
385
- ... )
386
- """
387
-
388
- version = version or DatasetVersion(0, 1, 0)
389
- dataset_types = dataset_types or []
390
-
391
- type_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema].tables[MLVocab.dataset_type.value]
392
- defined_types = list(type_path.entities().fetch())
393
-
394
- def check_dataset_type(dtype: str) -> bool:
395
- for term in defined_types:
396
- if dtype == term["Name"] or (term["Synonyms"] and ds_type in term["Synonyms"]):
397
- return True
398
- return False
399
-
400
- # Create the entry for the new dataset_table and get its RID.
401
- ds_types = [dataset_types] if isinstance(dataset_types, str) else dataset_types
402
- pb = self._model.catalog.getPathBuilder()
403
- for ds_type in ds_types:
404
- if not check_dataset_type(ds_type):
405
- raise DerivaMLException("Dataset type must be a vocabulary term.")
406
- dataset_table_path = pb.schemas[self._dataset_table.schema.name].tables[self._dataset_table.name]
407
- dataset_rid = dataset_table_path.insert(
408
- [
409
- {
410
- "Description": description,
411
- "Deleted": False,
412
- }
413
- ]
414
- )[0]["RID"]
415
-
416
- # Get the name of the association table between dataset_table and dataset_type.
417
- associations = list(self._model.schemas[self._ml_schema].tables[MLVocab.dataset_type].find_associations())
418
- atable = associations[0].name if associations else None
419
- pb.schemas[self._ml_schema].tables[atable].insert(
420
- [{MLVocab.dataset_type: ds_type, "Dataset": dataset_rid} for ds_type in ds_types]
684
+ Dataset._insert_dataset_versions(
685
+ self._ml_instance, version_update_list, description=description, execution_rid=execution_rid
421
686
  )
422
- if execution_rid is not None:
423
- pb.schemas[self._ml_schema].Dataset_Execution.insert([{"Dataset": dataset_rid, "Execution": execution_rid}])
424
- self._insert_dataset_versions(
425
- [DatasetSpec(rid=dataset_rid, version=version)],
426
- execution_rid=execution_rid,
427
- description="Initial dataset creation.",
428
- )
429
- return dataset_rid
430
-
431
- @validate_call
432
- def delete_dataset(self, dataset_rid: RID, recurse: bool = False) -> None:
433
- """Delete a dataset_table from the catalog.
434
-
435
- Args:
436
- dataset_rid: RID of the dataset_table to delete.
437
- recurse: If True, delete the dataset_table along with any nested datasets. (Default value = False)
438
- """
439
- # Get association table entries for this dataset_table
440
- # Delete association table entries
441
- if not self._is_dataset_rid(dataset_rid):
442
- raise DerivaMLException("Dataset_rid is not a dataset.")
443
-
444
- if parents := self.list_dataset_parents(dataset_rid):
445
- raise DerivaMLException(f'Dataset_rid "{dataset_rid}" is in a nested dataset: {parents}.')
446
-
447
- pb = self._model.catalog.getPathBuilder()
448
- dataset_path = pb.schemas[self._dataset_table.schema.name].tables[self._dataset_table.name]
449
-
450
- rid_list = [dataset_rid] + (self.list_dataset_children(dataset_rid=dataset_rid) if recurse else [])
451
- dataset_path.update([{"RID": r, "Deleted": True} for r in rid_list])
452
-
453
- def find_datasets(self, deleted: bool = False) -> Iterable[dict[str, Any]]:
454
- """Returns a list of currently available datasets.
455
-
456
- Arguments:
457
- deleted: If True, included the datasets that have been deleted.
458
-
459
- Returns:
460
- list of currently available datasets.
461
- """
462
- # Get datapath to all the tables we will need: Dataset, DatasetType and the association table.
463
- pb = self._model.catalog.getPathBuilder()
464
- dataset_path = pb.schemas[self._dataset_table.schema.name].tables[self._dataset_table.name]
465
- associations = list(self._model.schemas[self._ml_schema].tables[MLVocab.dataset_type].find_associations())
466
- atable = associations[0].name if associations else None
467
- ml_path = pb.schemas[self._ml_schema]
468
- atable_path = ml_path.tables[atable]
469
-
470
- if deleted:
471
- filtered_path = dataset_path
472
- else:
473
- filtered_path = dataset_path.filter(
474
- (dataset_path.Deleted == False) | (dataset_path.Deleted == None) # noqa: E711, E712
475
- )
476
-
477
- # Get a list of all the dataset_type values associated with this dataset_table.
478
- datasets = []
479
- for dataset in filtered_path.entities().fetch():
480
- ds_types = (
481
- atable_path.filter(atable_path.Dataset == dataset["RID"]).attributes(atable_path.Dataset_Type).fetch()
482
- )
483
- datasets.append(dataset | {MLVocab.dataset_type: [ds[MLVocab.dataset_type] for ds in ds_types]})
484
- return datasets
485
-
486
- def list_dataset_element_types(self) -> Iterable[Table]:
487
- """List the types of entities that can be added to a dataset_table.
488
-
489
- Returns:
490
- :return: An iterable of Table objects that can be included as an element of a dataset_table.
491
- """
492
-
493
- def domain_table(table: Table) -> bool:
494
- return table.schema.name == self._model.domain_schema or table.name == self._dataset_table.name
495
-
496
- return [t for a in self._dataset_table.find_associations() if domain_table(t := a.other_fkeys.pop().pk_table)]
687
+ return next((d.version for d in version_update_list if d.rid == self.dataset_rid))
497
688
 
498
689
  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
499
- def add_dataset_element_type(self, element: str | Table) -> Table:
500
- """A dataset_table is a heterogeneous collection of objects, each of which comes from a different table. This
501
- routine makes it possible to add objects from the specified table to a dataset_table.
502
-
503
- Args:
504
- element: Name of the table or table object that is to be added to the dataset_table.
505
-
506
- Returns:
507
- The table object that was added to the dataset_table.
508
- """
509
- # Add table to map
510
- element_table = self._model.name_to_table(element)
511
- atable_def = Table.define_association([self._dataset_table, element_table])
512
- try:
513
- table = self._model.schemas[self._model.domain_schema].create_table(atable_def)
514
- except ValueError as e:
515
- if "already exists" in str(e):
516
- table = self._model.name_to_table(atable_def["table_name"])
517
- else:
518
- raise e
519
-
520
- # self.model = self.catalog.getCatalogModel()
521
- self._dataset_table.annotations.update(self._generate_dataset_download_annotations())
522
- self._model.model.apply()
523
- return table
524
-
525
- # @validate_call
526
690
  def list_dataset_members(
527
- self, dataset_rid: RID, recurse: bool = False, limit: int | None = None
691
+ self,
692
+ recurse: bool = False,
693
+ limit: int | None = None,
694
+ _visited: set[RID] | None = None,
695
+ version: DatasetVersion | str | None = None,
696
+ **kwargs: Any,
528
697
  ) -> dict[str, list[dict[str, Any]]]:
529
698
  """Lists members of a dataset.
530
699
 
@@ -532,9 +701,11 @@ class Dataset:
532
701
  recurse through nested datasets and limit the number of results.
533
702
 
534
703
  Args:
535
- dataset_rid: Resource Identifier of the dataset.
536
704
  recurse: Whether to include members of nested datasets. Defaults to False.
537
705
  limit: Maximum number of members to return per type. None for no limit.
706
+ _visited: Internal parameter to track visited datasets and prevent infinite recursion.
707
+ version: Dataset version to list members from. Defaults to the current version.
708
+ **kwargs: Additional arguments (ignored, for protocol compatibility).
538
709
 
539
710
  Returns:
540
711
  dict[str, list[dict[str, Any]]]: Dictionary mapping member types to lists of members.
@@ -548,21 +719,27 @@ class Dataset:
548
719
  >>> for type_name, records in members.items():
549
720
  ... print(f"{type_name}: {len(records)} records")
550
721
  """
722
+ # Initialize visited set for recursion guard
723
+ if _visited is None:
724
+ _visited = set()
551
725
 
552
- if not self._is_dataset_rid(dataset_rid):
553
- raise DerivaMLException(f"RID is not for a dataset_table: {dataset_rid}")
726
+ # Prevent infinite recursion by checking if we've already visited this dataset
727
+ if self.dataset_rid in _visited:
728
+ return {}
729
+ _visited.add(self.dataset_rid)
554
730
 
555
731
  # Look at each of the element types that might be in the dataset_table and get the list of rid for them from
556
732
  # the appropriate association table.
557
733
  members = defaultdict(list)
558
- pb = self._model.catalog.getPathBuilder()
734
+ version_snapshot_catalog = self._version_snapshot_catalog(version)
735
+ pb = version_snapshot_catalog.pathBuilder()
559
736
  for assoc_table in self._dataset_table.find_associations():
560
737
  other_fkey = assoc_table.other_fkeys.pop()
561
738
  target_table = other_fkey.pk_table
562
739
  member_table = assoc_table.table
563
740
 
564
741
  # Look at domain tables and nested datasets.
565
- if target_table.schema.name != self._model.domain_schema and not (
742
+ if not self._ml_instance.model.is_domain_schema(target_table.schema.name) and not (
566
743
  target_table == self._dataset_table or target_table.name == "File"
567
744
  ):
568
745
  continue
@@ -573,7 +750,7 @@ class Dataset:
573
750
  target_path = pb.schemas[target_table.schema.name].tables[target_table.name]
574
751
  member_path = pb.schemas[member_table.schema.name].tables[member_table.name]
575
752
 
576
- path = member_path.filter(member_path.Dataset == dataset_rid).link(
753
+ path = member_path.filter(member_path.Dataset == self.dataset_rid).link(
577
754
  target_path,
578
755
  on=(member_path.columns[member_column] == target_path.columns["RID"]),
579
756
  )
@@ -582,15 +759,241 @@ class Dataset:
582
759
  if recurse and target_table == self._dataset_table:
583
760
  # Get the members for all the nested datasets and add to the member list.
584
761
  nested_datasets = [d["RID"] for d in target_entities]
585
- for ds in nested_datasets:
586
- for k, v in self.list_dataset_members(ds, recurse=recurse).items():
762
+ for ds_rid in nested_datasets:
763
+ ds = version_snapshot_catalog.lookup_dataset(ds_rid)
764
+ for k, v in ds.list_dataset_members(version=version, recurse=recurse, _visited=_visited).items():
587
765
  members[k].extend(v)
588
766
  return dict(members)
589
767
 
590
- @validate_call
768
+ def _denormalize_datapath(
769
+ self,
770
+ include_tables: list[str],
771
+ version: DatasetVersion | str | None = None,
772
+ ) -> Generator[dict[str, Any], None, None]:
773
+ """Denormalize dataset members by joining related tables.
774
+
775
+ This method creates a "wide table" view by joining related tables together using
776
+ the Deriva datapath API, producing rows that contain columns from all specified
777
+ tables. The result has outer join semantics - rows from tables without FK
778
+ relationships are included with NULL values for unrelated columns.
779
+
780
+ The method:
781
+ 1. Gets the list of dataset members for each included table
782
+ 2. For each member in the first table, follows foreign key relationships to
783
+ get related records from other tables
784
+ 3. Tables without FK connections to the first table are included with NULLs
785
+ 4. Includes nested dataset members recursively
786
+
787
+ Args:
788
+ include_tables: List of table names to include in the output.
789
+ version: Dataset version to query. Defaults to current version.
790
+
791
+ Yields:
792
+ dict[str, Any]: Rows with column names prefixed by table name (e.g., "Image_Filename").
793
+ Unrelated tables have NULL values for their columns.
794
+
795
+ Note:
796
+ Column names in the result are prefixed with the table name to avoid
797
+ collisions (e.g., "Image_Filename", "Subject_RID").
798
+ """
799
+ # Skip system columns in output
800
+ skip_columns = {"RCT", "RMT", "RCB", "RMB"}
801
+
802
+ # Get all members for the included tables (recursively includes nested datasets)
803
+ members = self.list_dataset_members(version=version, recurse=True)
804
+
805
+ # Build a lookup of columns for each table
806
+ table_columns: dict[str, list[str]] = {}
807
+ for table_name in include_tables:
808
+ table = self._ml_instance.model.name_to_table(table_name)
809
+ table_columns[table_name] = [
810
+ c.name for c in table.columns if c.name not in skip_columns
811
+ ]
812
+
813
+ # Find the primary table (first non-empty table in include_tables)
814
+ primary_table = None
815
+ for table_name in include_tables:
816
+ if table_name in members and members[table_name]:
817
+ primary_table = table_name
818
+ break
819
+
820
+ if primary_table is None:
821
+ # No data at all
822
+ return
823
+
824
+ primary_table_obj = self._ml_instance.model.name_to_table(primary_table)
825
+
826
+ for member in members[primary_table]:
827
+ # Build the row with all columns from all tables
828
+ row: dict[str, Any] = {}
829
+
830
+ # Add primary table columns
831
+ for col_name in table_columns[primary_table]:
832
+ prefixed_name = f"{primary_table}_{col_name}"
833
+ row[prefixed_name] = member.get(col_name)
834
+
835
+ # For each other table, try to join or add NULL values
836
+ for other_table_name in include_tables:
837
+ if other_table_name == primary_table:
838
+ continue
839
+
840
+ other_table = self._ml_instance.model.name_to_table(other_table_name)
841
+ other_cols = table_columns[other_table_name]
842
+
843
+ # Initialize all columns to None (outer join behavior)
844
+ for col_name in other_cols:
845
+ prefixed_name = f"{other_table_name}_{col_name}"
846
+ row[prefixed_name] = None
847
+
848
+ # Try to find FK relationship and join
849
+ if other_table_name in members:
850
+ try:
851
+ relationship = self._ml_instance.model._table_relationship(
852
+ primary_table_obj, other_table
853
+ )
854
+ fk_col, pk_col = relationship
855
+
856
+ # Look up the related record
857
+ fk_value = member.get(fk_col.name)
858
+ if fk_value:
859
+ for other_member in members.get(other_table_name, []):
860
+ if other_member.get(pk_col.name) == fk_value:
861
+ for col_name in other_cols:
862
+ prefixed_name = f"{other_table_name}_{col_name}"
863
+ row[prefixed_name] = other_member.get(col_name)
864
+ break
865
+ except DerivaMLException:
866
+ # No FK relationship - columns remain NULL (outer join)
867
+ pass
868
+
869
+ yield row
870
+
871
+ def denormalize_as_dataframe(
872
+ self,
873
+ include_tables: list[str],
874
+ version: DatasetVersion | str | None = None,
875
+ **kwargs: Any,
876
+ ) -> pd.DataFrame:
877
+ """Denormalize the dataset into a single wide table (DataFrame).
878
+
879
+ Denormalization transforms normalized relational data into a single "wide table"
880
+ (also called a "flat table" or "denormalized table") by joining related tables
881
+ together. This produces a DataFrame where each row contains all related information
882
+ from multiple source tables, with columns from each table combined side-by-side.
883
+
884
+ Wide tables are the standard input format for most machine learning frameworks,
885
+ which expect all features for a single observation to be in one row. This method
886
+ bridges the gap between normalized database schemas and ML-ready tabular data.
887
+
888
+ **How it works:**
889
+
890
+ Tables are joined based on their foreign key relationships. For example, if
891
+ Image has a foreign key to Subject, and Diagnosis has a foreign key to Image,
892
+ then denormalizing ["Subject", "Image", "Diagnosis"] produces rows where each
893
+ image appears with its subject's metadata and any associated diagnoses.
894
+
895
+ **Column naming:**
896
+
897
+ Column names are prefixed with the source table name using underscores
898
+ to avoid collisions (e.g., "Image_Filename", "Subject_RID").
899
+
900
+ Args:
901
+ include_tables: List of table names to include in the output. Tables
902
+ are joined based on their foreign key relationships.
903
+ Order doesn't matter - the join order is determined automatically.
904
+ version: Dataset version to query. Defaults to current version.
905
+ Use this to get a reproducible snapshot of the data.
906
+ **kwargs: Additional arguments (ignored, for protocol compatibility).
907
+
908
+ Returns:
909
+ pd.DataFrame: Wide table with columns from all included tables.
910
+
911
+ Example:
912
+ Create a training dataset with images and their labels::
913
+
914
+ >>> # Get all images with their diagnoses in one table
915
+ >>> df = dataset.denormalize_as_dataframe(["Image", "Diagnosis"])
916
+ >>> print(df.columns.tolist())
917
+ ['Image_RID', 'Image_Filename', 'Image_URL', 'Diagnosis_RID',
918
+ 'Diagnosis_Label', 'Diagnosis_Confidence']
919
+
920
+ >>> # Use with scikit-learn
921
+ >>> X = df[["Image_Filename"]] # Features
922
+ >>> y = df["Diagnosis_Label"] # Labels
923
+
924
+ Include subject metadata for stratified splitting::
925
+
926
+ >>> df = dataset.denormalize_as_dataframe(
927
+ ... ["Subject", "Image", "Diagnosis"]
928
+ ... )
929
+ >>> # Now df has Subject_Age, Subject_Gender, etc.
930
+ >>> # for stratified train/test splits by subject
931
+
932
+ See Also:
933
+ denormalize_as_dict: Generator version for memory-efficient processing.
934
+ """
935
+ rows = list(self._denormalize_datapath(include_tables, version))
936
+ return pd.DataFrame(rows)
937
+
938
+ def denormalize_as_dict(
939
+ self,
940
+ include_tables: list[str],
941
+ version: DatasetVersion | str | None = None,
942
+ **kwargs: Any,
943
+ ) -> Generator[dict[str, Any], None, None]:
944
+ """Denormalize the dataset and yield rows as dictionaries.
945
+
946
+ This is a memory-efficient alternative to denormalize_as_dataframe() that
947
+ yields one row at a time as a dictionary instead of loading all data into
948
+ a DataFrame. Use this when processing large datasets that may not fit in
949
+ memory, or when you want to process rows incrementally.
950
+
951
+ Like denormalize_as_dataframe(), this produces a "wide table" representation
952
+ where each yielded dictionary contains all columns from the joined tables.
953
+ See denormalize_as_dataframe() for detailed explanation of how denormalization
954
+ works.
955
+
956
+ **Column naming:**
957
+
958
+ Column names are prefixed with the source table name using underscores
959
+ to avoid collisions (e.g., "Image_Filename", "Subject_RID").
960
+
961
+ Args:
962
+ include_tables: List of table names to include in the output.
963
+ Tables are joined based on their foreign key relationships.
964
+ version: Dataset version to query. Defaults to current version.
965
+ **kwargs: Additional arguments (ignored, for protocol compatibility).
966
+
967
+ Yields:
968
+ dict[str, Any]: Dictionary representing one row of the wide table.
969
+ Keys are column names in "Table_Column" format.
970
+
971
+ Example:
972
+ Process images one at a time for training::
973
+
974
+ >>> for row in dataset.denormalize_as_dict(["Image", "Diagnosis"]):
975
+ ... # Load and preprocess each image
976
+ ... img = load_image(row["Image_Filename"])
977
+ ... label = row["Diagnosis_Label"]
978
+ ... yield img, label # Feed to training loop
979
+
980
+ Count labels without loading all data into memory::
981
+
982
+ >>> from collections import Counter
983
+ >>> labels = Counter()
984
+ >>> for row in dataset.denormalize_as_dict(["Image", "Diagnosis"]):
985
+ ... labels[row["Diagnosis_Label"]] += 1
986
+ >>> print(labels)
987
+ Counter({'Normal': 450, 'Abnormal': 150})
988
+
989
+ See Also:
990
+ denormalize_as_dataframe: Returns all data as a pandas DataFrame.
991
+ """
992
+ yield from self._denormalize_datapath(include_tables, version)
993
+
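A minimal sketch of streaming the generator straight to a CSV manifest with the standard-library csv module, assuming the `dataset` handle and example tables from the docstring and at least one yielded row; column names are discovered from the first row.

    import csv

    rows = dataset.denormalize_as_dict(["Image", "Diagnosis"])
    first = next(rows)  # peek at one row to learn the column names
    with open("training_manifest.csv", "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=list(first.keys()))
        writer.writeheader()
        writer.writerow(first)
        for row in rows:
            writer.writerow(row)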
994
+ @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
591
995
  def add_dataset_members(
592
996
  self,
593
- dataset_rid: RID,
594
997
  members: list[RID] | dict[str, list[RID]],
595
998
  validate: bool = True,
596
999
  description: str | None = "",
@@ -598,30 +1001,58 @@ class Dataset:
598
1001
  ) -> None:
599
1002
  """Adds members to a dataset.
600
1003
 
601
- Associates one or more records with a dataset. Can optionally validate member types
602
- and create a new dataset version to track the changes.
1004
+ Associates one or more records with a dataset. Members can be provided in two forms:
1005
+
1006
+ **List of RIDs (simpler but slower):**
1007
+ When `members` is a list of RIDs, each RID is resolved to determine which table
1008
+ it belongs to. This uses batch RID resolution for efficiency, but still requires
1009
+ querying the catalog to identify each RID's table.
1010
+
1011
+ **Dictionary by table name (faster, recommended for large datasets):**
1012
+ When `members` is a dict mapping table names to lists of RIDs, no RID resolution
1013
+ is needed. The RIDs are inserted directly into the dataset. Use this form when
1014
+ you already know which table each RID belongs to.
1015
+
1016
+ **Important:** Members can only be added from tables that have been registered as
1017
+ dataset element types. Use :meth:`DerivaML.add_dataset_element_type` to register
1018
+ a table before adding its records to datasets.
1019
+
1020
+ Adding members automatically increments the dataset's minor version.
603
1021
 
604
1022
  Args:
605
- dataset_rid: Resource Identifier of the dataset.
606
- members: List of RIDs to add as dataset members. Can be orginized into a dictionary that indicates the
607
- table that the member rids belong to.
608
- validate: Whether to validate member types. Defaults to True.
1023
+ members: Either:
1024
+ - list[RID]: List of RIDs to add. Each RID will be resolved to find its table.
1025
+ - dict[str, list[RID]]: Mapping of table names to RID lists. Skips resolution.
1026
+ validate: Whether to validate that members don't already exist. Defaults to True.
609
1027
  description: Optional description of the member additions.
610
1028
  execution_rid: Optional execution RID to associate with changes.
611
1029
 
612
1030
  Raises:
613
1031
  DerivaMLException: If:
614
- - dataset_rid is invalid
615
- - members are invalid or of wrong type
616
- - adding members would create a cycle
617
- - validation fails
618
-
619
- Example:
620
- >>> ml.add_dataset_members(
621
- ... dataset_rid="1-abc123",
622
- ... members=["1-def456", "1-ghi789"],
623
- ... description="Added sample data"
624
- ... )
1032
+ - Any RID is invalid or cannot be resolved
1033
+ - Any RID belongs to a table that isn't registered as a dataset element type
1034
+ - Adding members would create a cycle (for nested datasets)
1035
+ - Validation finds duplicate members (when validate=True)
1036
+
1037
+ See Also:
1038
+ :meth:`DerivaML.add_dataset_element_type`: Register a table as a dataset element type.
1039
+ :meth:`DerivaML.list_dataset_element_types`: List registered dataset element types.
1040
+
1041
+ Examples:
1042
+ Using a list of RIDs (simpler):
1043
+ >>> dataset.add_dataset_members(
1044
+ ... members=["1-ABC", "1-DEF", "1-GHI"],
1045
+ ... description="Added sample images"
1046
+ ... )
1047
+
1048
+ Using a dict by table name (faster for large datasets):
1049
+ >>> dataset.add_dataset_members(
1050
+ ... members={
1051
+ ... "Image": ["1-ABC", "1-DEF"],
1052
+ ... "Subject": ["2-XYZ"]
1053
+ ... },
1054
+ ... description="Added images and subjects"
1055
+ ... )
625
1056
  """
626
1057
  description = description or "Updated dataset via add_dataset_members"
627
1058
 
@@ -635,410 +1066,535 @@ class Dataset:
635
1066
  Returns:
636
1067
 
637
1068
  """
638
- path = path or set(dataset_rid)
1069
+ path = path or {self.dataset_rid}
639
1070
  return member_rid in path
640
1071
 
641
1072
  if validate:
642
- existing_rids = set(m["RID"] for ms in self.list_dataset_members(dataset_rid).values() for m in ms)
1073
+ existing_rids = set(m["RID"] for ms in self.list_dataset_members().values() for m in ms)
643
1074
  if overlap := set(existing_rids).intersection(members):
644
- raise DerivaMLException(f"Attempting to add existing member to dataset_table {dataset_rid}: {overlap}")
1075
+ raise DerivaMLException(
1076
+ f"Attempting to add existing member to dataset_table {self.dataset_rid}: {overlap}"
1077
+ )
645
1078
 
646
1079
  # Now go through every rid to be added to the data set and sort them based on what association table entries
647
1080
  # need to be made.
648
- dataset_elements = {}
649
- association_map = {
650
- a.other_fkeys.pop().pk_table.name: a.table.name for a in self._dataset_table.find_associations()
651
- }
1081
+ dataset_elements: dict[str, list[RID]] = {}
1082
+
1083
+ # Build map of valid element tables to their association tables
1084
+ associations = list(self._dataset_table.find_associations())
1085
+ association_map = {a.other_fkeys.pop().pk_table.name: a.table.name for a in associations}
652
1086
 
653
1087
  # Get a list of all the object types that can be linked to a dataset_table.
654
1088
  if type(members) is list:
655
1089
  members = set(members)
656
- for m in members:
657
- try:
658
- rid_info = self._model.catalog.resolve_rid(m)
659
- except KeyError:
660
- raise DerivaMLException(f"Invalid RID: {m}")
661
- if rid_info.table.name not in association_map:
662
- raise DerivaMLException(f"RID table: {rid_info.table.name} not part of dataset_table")
1090
+
1091
+ # Get candidate tables for batch resolution (only tables that can be dataset elements)
1092
+ candidate_tables = [
1093
+ self._ml_instance.model.name_to_table(table_name) for table_name in association_map.keys()
1094
+ ]
1095
+
1096
+ # Batch resolve all RIDs at once instead of one-by-one
1097
+ rid_results = self._ml_instance.resolve_rids(members, candidate_tables=candidate_tables)
1098
+
1099
+ # Group by table and validate
1100
+ for rid, rid_info in rid_results.items():
1101
+ if rid_info.table_name not in association_map:
1102
+ raise DerivaMLException(f"RID table: {rid_info.table_name} not part of dataset_table")
663
1103
  if rid_info.table == self._dataset_table and check_dataset_cycle(rid_info.rid):
664
1104
  raise DerivaMLException("Creating cycle of datasets is not allowed")
665
- dataset_elements.setdefault(rid_info.table.name, []).append(rid_info.rid)
1105
+ dataset_elements.setdefault(rid_info.table_name, []).append(rid_info.rid)
666
1106
  else:
667
- dataset_elements = {t: set(ms) for t, ms in members.items()}
1107
+ dataset_elements = {t: list(set(ms)) for t, ms in members.items()}
668
1108
  # Now make the entries into the association tables.
669
- pb = self._model.catalog.getPathBuilder()
1109
+ pb = self._ml_instance.pathBuilder()
670
1110
  for table, elements in dataset_elements.items():
671
- schema_path = pb.schemas[
672
- self._ml_schema if (table == "Dataset" or table == "File") else self._model.domain_schema
673
- ]
1111
+ # Determine schema: ML schema for Dataset/File, otherwise use the table's actual schema
1112
+ if table == "Dataset" or table == "File":
1113
+ schema_name = self._ml_instance.ml_schema
1114
+ else:
1115
+ # Find the table and use its schema
1116
+ table_obj = self._ml_instance.model.name_to_table(table)
1117
+ schema_name = table_obj.schema.name
1118
+ schema_path = pb.schemas[schema_name]
674
1119
  fk_column = "Nested_Dataset" if table == "Dataset" else table
675
1120
  if len(elements):
676
1121
  # Find out the name of the column in the association table.
677
1122
  schema_path.tables[association_map[table]].insert(
678
- [{"Dataset": dataset_rid, fk_column: e} for e in elements]
1123
+ [{"Dataset": self.dataset_rid, fk_column: e} for e in elements]
679
1124
  )
680
1125
  self.increment_dataset_version(
681
- dataset_rid,
682
1126
  VersionPart.minor,
683
1127
  description=description,
684
1128
  execution_rid=execution_rid,
685
1129
  )
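A sketch of the register-then-add flow referenced in the docstring's Important note, assuming an `ml` DerivaML instance whose domain schema has an Image table; the add_dataset_element_type signature is inferred from the docstring reference and the RIDs are placeholders.

    ml.add_dataset_element_type("Image")  # one-time registration of the table

    dataset.add_dataset_members(
        members={"Image": ["1-ABC", "1-DEF"]},  # dict form: no RID resolution needed
        description="Initial image cohort",
    )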
686
1130
 
687
- @validate_call
1131
+ @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
688
1132
  def delete_dataset_members(
689
1133
  self,
690
- dataset_rid: RID,
691
1134
  members: list[RID],
692
1135
  description: str = "",
693
1136
  execution_rid: RID | None = None,
694
1137
  ) -> None:
695
- """Remove elements to an existing dataset_table.
1138
+ """Remove members from this dataset.
696
1139
 
697
- Delete elements from an existing dataset. In addition to deleting members, the minor version number of the
698
- dataset is incremented and the description, if provide is applied to that new version.
1140
+ Removes the specified members from the dataset. In addition to removing members,
1141
+ the minor version number of the dataset is incremented and the description,
1142
+ if provided, is applied to that new version.
699
1143
 
700
1144
  Args:
701
- dataset_rid: RID of dataset_table to extend or None if a new dataset_table is to be created.
702
- members: List of member RIDs to add to the dataset_table.
703
- description: Markdown description of the updated dataset.
1145
+ members: List of member RIDs to remove from the dataset.
1146
+ description: Optional description of the removal operation.
704
1147
  execution_rid: Optional RID of execution associated with this operation.
705
- """
706
1148
 
1149
+ Raises:
1150
+ DerivaMLException: If any RID is invalid or not part of this dataset.
1151
+
1152
+ Example:
1153
+ >>> dataset.delete_dataset_members(
1154
+ ... members=["1-ABC", "1-DEF"],
1155
+ ... description="Removed corrupted samples"
1156
+ ... )
1157
+ """
707
1158
  members = set(members)
708
- description = description or "Deletes dataset members"
1159
+ description = description or "Deleted dataset members"
709
1160
 
710
- # Now go through every rid to be added to the data set and sort them based on what association table entries
711
- # need to be made.
1161
+ # Go through every rid to be deleted and sort them based on what association table entries
1162
+ # need to be removed.
712
1163
  dataset_elements = {}
713
1164
  association_map = {
714
1165
  a.other_fkeys.pop().pk_table.name: a.table.name for a in self._dataset_table.find_associations()
715
1166
  }
716
- # Get a list of all the object types that can be linked to a dataset_table.
1167
+ # Get a list of all the object types that can be linked to a dataset.
717
1168
  for m in members:
718
1169
  try:
719
- rid_info = self._model.catalog.resolve_rid(m)
1170
+ rid_info = self._ml_instance.resolve_rid(m)
720
1171
  except KeyError:
721
1172
  raise DerivaMLException(f"Invalid RID: {m}")
722
1173
  if rid_info.table.name not in association_map:
723
- raise DerivaMLException(f"RID table: {rid_info.table.name} not part of dataset_table")
1174
+ raise DerivaMLException(f"RID table: {rid_info.table.name} not part of dataset")
724
1175
  dataset_elements.setdefault(rid_info.table.name, []).append(rid_info.rid)
725
- # Now make the entries into the association tables.
726
- pb = self._model.catalog.getPathBuilder()
1176
+
1177
+ # Delete the entries from the association tables.
1178
+ pb = self._ml_instance.pathBuilder()
727
1179
  for table, elements in dataset_elements.items():
728
- schema_path = pb.schemas[self._ml_schema if table == "Dataset" else self._model.domain_schema]
1180
+ # Determine schema: ML schema for Dataset, otherwise use the table's actual schema
1181
+ if table == "Dataset":
1182
+ schema_name = self._ml_instance.ml_schema
1183
+ else:
1184
+ # Find the table and use its schema
1185
+ table_obj = self._ml_instance.model.name_to_table(table)
1186
+ schema_name = table_obj.schema.name
1187
+ schema_path = pb.schemas[schema_name]
729
1188
  fk_column = "Nested_Dataset" if table == "Dataset" else table
730
1189
 
731
1190
  if len(elements):
732
1191
  atable_path = schema_path.tables[association_map[table]]
733
- # Find out the name of the column in the association table.
734
1192
  for e in elements:
735
1193
  entity = atable_path.filter(
736
- (atable_path.Dataset == dataset_rid) & (atable_path.columns[fk_column] == e),
1194
+ (atable_path.Dataset == self.dataset_rid) & (atable_path.columns[fk_column] == e),
737
1195
  )
738
1196
  entity.delete()
1197
+
739
1198
  self.increment_dataset_version(
740
- dataset_rid,
741
1199
  VersionPart.minor,
742
1200
  description=description,
743
1201
  execution_rid=execution_rid,
744
1202
  )
745
1203
 
746
- @validate_call
747
- def list_dataset_parents(self, dataset_rid: RID) -> list[str]:
1204
+ @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
1205
+ def list_dataset_parents(
1206
+ self,
1207
+ recurse: bool = False,
1208
+ _visited: set[RID] | None = None,
1209
+ version: DatasetVersion | str | None = None,
1210
+ **kwargs: Any,
1211
+ ) -> list[Self]:
748
1212
  """Given a dataset_table RID, return a list of RIDs of the parent datasets if this is included in a
749
1213
  nested dataset.
750
1214
 
751
1215
  Args:
752
- dataset_rid: return: RID of the parent dataset_table.
1216
+ recurse: If True, recursively return all ancestor datasets.
1217
+ _visited: Internal parameter to track visited datasets and prevent infinite recursion.
1218
+ version: Dataset version to list parents from. Defaults to the current version.
1219
+ **kwargs: Additional arguments (ignored, for protocol compatibility).
753
1220
 
754
1221
  Returns:
755
- RID of the parent dataset_table.
1222
+ List of parent datasets.
756
1223
  """
757
- if not self._is_dataset_rid(dataset_rid):
758
- raise DerivaMLException(f"RID: {dataset_rid} does not belong to dataset_table {self._dataset_table.name}")
1224
+ # Initialize visited set for recursion guard
1225
+ if _visited is None:
1226
+ _visited = set()
1227
+
1228
+ # Prevent infinite recursion by checking if we've already visited this dataset
1229
+ if self.dataset_rid in _visited:
1230
+ return []
1231
+ _visited.add(self.dataset_rid)
1232
+
759
1233
  # Get association table for nested datasets
760
- pb = self._model.catalog.getPathBuilder()
761
- atable_path = pb.schemas[self._ml_schema].Dataset_Dataset
762
- return [p["Dataset"] for p in atable_path.filter(atable_path.Nested_Dataset == dataset_rid).entities().fetch()]
1234
+ version_snapshot_catalog = self._version_snapshot_catalog(version)
1235
+ pb = version_snapshot_catalog.pathBuilder()
1236
+ atable_path = pb.schemas[self._ml_instance.ml_schema].Dataset_Dataset
1237
+ parents = [
1238
+ version_snapshot_catalog.lookup_dataset(p["Dataset"])
1239
+ for p in atable_path.filter(atable_path.Nested_Dataset == self.dataset_rid).entities().fetch()
1240
+ ]
1241
+ if recurse:
1242
+ for parent in parents.copy():
1243
+ parents.extend(parent.list_dataset_parents(recurse=True, _visited=_visited, version=version))
1244
+ return parents
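A small usage sketch, assuming a `dataset` handle; the `_visited` guard above makes the recursive walk safe even if the nesting graph is malformed.

    ancestors = dataset.list_dataset_parents(recurse=True)
    if not ancestors:
        print(f"{dataset.dataset_rid} is a top-level dataset")
    for parent in ancestors:
        print("contained in", parent.dataset_rid)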
763
1245
 
764
- @validate_call
765
- def list_dataset_children(self, dataset_rid: RID, recurse: bool = False) -> list[RID]:
1246
+ @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
1247
+ def list_dataset_children(
1248
+ self,
1249
+ recurse: bool = False,
1250
+ _visited: set[RID] | None = None,
1251
+ version: DatasetVersion | str | None = None,
1252
+ **kwargs: Any,
1253
+ ) -> list[Self]:
766
1254
  """Given a dataset_table RID, return a list of RIDs for any nested datasets.
767
1255
 
768
1256
  Args:
769
- dataset_rid: A dataset_table RID.
770
1257
  recurse: If True, recursively include datasets nested at any depth.
1258
+ _visited: Internal parameter to track visited datasets and prevent infinite recursion.
1259
+ version: Dataset version to list children from. Defaults to the current version.
1260
+ **kwargs: Additional arguments (ignored, for protocol compatibility).
771
1261
 
772
1262
  Returns:
773
1263
  List of the nested datasets.
774
1264
 
775
1265
  """
776
- dataset_dataset_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema].tables["Dataset_Dataset"]
1266
+ # Initialize visited set for recursion guard
1267
+ if _visited is None:
1268
+ _visited = set()
1269
+
1270
+ version = DatasetVersion.parse(version) if isinstance(version, str) else version
1271
+ version_snapshot_catalog = self._version_snapshot_catalog(version)
1272
+ dataset_dataset_path = (
1273
+ version_snapshot_catalog.pathBuilder().schemas[self._ml_instance.ml_schema].tables["Dataset_Dataset"]
1274
+ )
777
1275
  nested_datasets = list(dataset_dataset_path.entities().fetch())
778
1276
 
779
- def find_children(rid: RID):
1277
+ def find_children(rid: RID) -> list[RID]:
1278
+ # Prevent infinite recursion by checking if we've already visited this dataset
1279
+ if rid in _visited:
1280
+ return []
1281
+ _visited.add(rid)
1282
+
780
1283
  children = [child["Nested_Dataset"] for child in nested_datasets if child["Dataset"] == rid]
781
1284
  if recurse:
782
1285
  for child in children.copy():
783
1286
  children.extend(find_children(child))
784
1287
  return children
785
1288
 
786
- return find_children(dataset_rid)
1289
+ return [version_snapshot_catalog.lookup_dataset(rid) for rid in find_children(self.dataset_rid)]
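A small usage sketch for the recursive form, assuming a `dataset` handle; the version string is a placeholder and pins the listing to that version's catalog snapshot.

    children = dataset.list_dataset_children(recurse=True, version="1.2.0")
    print(f"{dataset.dataset_rid} contains {len(children)} nested dataset(s)")
    for child in children:
        print("  nested:", child.dataset_rid)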
787
1290
 
788
- def _export_vocabulary(self, writer: Callable[[str, str, Table], list[dict[str, Any]]]) -> list[dict[str, Any]]:
789
- """
1291
+ def _list_dataset_parents_current(self) -> list[Self]:
1292
+ """Return parent datasets using current catalog state (not version snapshot).
790
1293
 
791
- Args:
792
- writer: Callable[[list[Table]]: list[dict[str: Any]]]:
1294
+ Used by _build_dataset_graph_1 to find all related datasets for version updates.
1295
+ """
1296
+ pb = self._ml_instance.pathBuilder()
1297
+ atable_path = pb.schemas[self._ml_instance.ml_schema].Dataset_Dataset
1298
+ return [
1299
+ self._ml_instance.lookup_dataset(p["Dataset"])
1300
+ for p in atable_path.filter(atable_path.Nested_Dataset == self.dataset_rid).entities().fetch()
1301
+ ]
793
1302
 
794
- Returns:
1303
+ def _list_dataset_children_current(self) -> list[Self]:
1304
+ """Return child datasets using current catalog state (not version snapshot).
795
1305
 
1306
+ Used by _build_dataset_graph_1 to find all related datasets for version updates.
796
1307
  """
797
- vocabs = [
798
- table
799
- for s in self._model.schemas.values()
800
- for table in s.tables.values()
801
- if self._model.is_vocabulary(table)
802
- ]
803
- return [o for table in vocabs for o in writer(f"{table.schema.name}:{table.name}", table.name, table)]
1308
+ dataset_dataset_path = (
1309
+ self._ml_instance.pathBuilder().schemas[self._ml_instance.ml_schema].tables["Dataset_Dataset"]
1310
+ )
1311
+ nested_datasets = list(dataset_dataset_path.entities().fetch())
804
1312
 
805
- def _table_paths(
806
- self,
807
- dataset: DatasetSpec | None = None,
808
- snapshot_catalog: DerivaML | None = None,
809
- ) -> Iterator[tuple[str, str, Table]]:
810
- paths = self._collect_paths(dataset and dataset.rid, snapshot_catalog)
811
-
812
- def source_path(path: tuple[Table, ...]) -> list[str]:
813
- """Convert a tuple representing a path into a source path component with FK linkage"""
814
- path = list(path)
815
- p = [f"{self._model.ml_schema}:Dataset/RID={{RID}}"]
816
- for table in path[1:]:
817
- if table.name == "Dataset_Dataset":
818
- p.append("(RID)=(deriva-ml:Dataset_Dataset:Dataset)")
819
- elif table.name == "Dataset":
820
- p.append("(Nested_Dataset)=(deriva-ml:Dataset:RID)")
821
- elif table.name == "Dataset_Version":
822
- p.append(f"(RID)=({self._model.ml_schema}:Dataset_Version:Dataset)")
823
- else:
824
- p.append(f"{table.schema.name}:{table.name}")
825
- return p
826
-
827
- src_paths = ["/".join(source_path(p)) for p in paths]
828
- dest_paths = ["/".join([t.name for t in p]) for p in paths]
829
- target_tables = [p[-1] for p in paths]
830
- return zip(src_paths, dest_paths, target_tables)
831
-
832
- def _collect_paths(
833
- self,
834
- dataset_rid: RID | None = None,
835
- snapshot: Dataset | None = None,
836
- dataset_nesting_depth: int | None = None,
837
- ) -> set[tuple[Table, ...]]:
838
- snapshot_catalog = snapshot if snapshot else self
839
-
840
- dataset_table = snapshot_catalog._model.schemas[self._ml_schema].tables["Dataset"]
841
- dataset_dataset = snapshot_catalog._model.schemas[self._ml_schema].tables["Dataset_Dataset"]
842
-
843
- # Figure out what types of elements the dataset contains.
844
- dataset_associations = [
845
- a
846
- for a in self._dataset_table.find_associations()
847
- if a.table.schema.name != self._ml_schema or a.table.name == "Dataset_Dataset"
848
- ]
849
- if dataset_rid:
850
- # Get a list of the members of the dataset so we can figure out which tables to query.
851
- dataset_elements = [
852
- snapshot_catalog._model.name_to_table(e)
853
- for e, m in snapshot_catalog.list_dataset_members(
854
- dataset_rid=dataset_rid, # limit=1 Limit seems to make things run slow.
855
- ).items()
856
- if m
857
- ]
858
- included_associations = [
859
- a.table for a in dataset_table.find_associations() if a.other_fkeys.pop().pk_table in dataset_elements
860
- ]
861
- else:
862
- included_associations = dataset_associations
863
-
864
- # Get the paths through the schema and filter out all the dataset paths not used by this dataset.
865
- paths = {
866
- tuple(p)
867
- for p in snapshot_catalog._model._schema_to_paths()
868
- if (len(p) == 1)
869
- or (p[1] not in dataset_associations) # Tables in the domain schema
870
- or (p[1] in included_associations) # Tables that include members of the dataset
871
- }
872
- # Now get paths for nested datasets
873
- nested_paths = set()
874
- if dataset_rid:
875
- for c in snapshot_catalog.list_dataset_children(dataset_rid=dataset_rid):
876
- nested_paths |= self._collect_paths(c, snapshot=snapshot_catalog)
877
- else:
878
- # Initialize nesting depth if not already provided.
879
- dataset_nesting_depth = (
880
- self._dataset_nesting_depth() if dataset_nesting_depth is None else dataset_nesting_depth
881
- )
882
- if dataset_nesting_depth:
883
- nested_paths = self._collect_paths(dataset_nesting_depth=dataset_nesting_depth - 1)
884
- if nested_paths:
885
- paths |= {
886
- tuple([dataset_table]),
887
- (dataset_table, dataset_dataset),
888
- }
889
- paths |= {(self._dataset_table, dataset_dataset) + p for p in nested_paths}
890
- return paths
891
-
892
- def _dataset_nesting_depth(self, dataset_rid: RID | None = None) -> int:
893
- """Determine the maximum dataset nesting depth in the current catalog.
1313
+ def find_children(rid: RID) -> list[RID]:
1314
+ return [child["Nested_Dataset"] for child in nested_datasets if child["Dataset"] == rid]
1315
+
1316
+ return [self._ml_instance.lookup_dataset(rid) for rid in find_children(self.dataset_rid)]
1317
+
1318
+ def list_executions(self) -> list["Execution"]:
1319
+ """List all executions associated with this dataset.
1320
+
1321
+ Returns all executions that used this dataset as input. This is
1322
+ tracked through the Dataset_Execution association table.
894
1323
 
895
1324
  Returns:
1325
+ List of Execution objects associated with this dataset.
896
1326
 
1327
+ Example:
1328
+ >>> dataset = ml.lookup_dataset("1-abc123")
1329
+ >>> executions = dataset.list_executions()
1330
+ >>> for exe in executions:
1331
+ ... print(f"Execution {exe.execution_rid}: {exe.status}")
897
1332
  """
1333
+ # Import here to avoid circular dependency
1334
+ from deriva_ml.execution.execution import Execution
898
1335
 
899
- def children_depth(dataset_rid: RID, nested_datasets: dict[str, list[str]]) -> int:
900
- """Return the number of nested datasets for the dataset_rid if provided, otherwise in the current catalog"""
901
- try:
902
- children = nested_datasets[dataset_rid]
903
- return max(map(lambda x: children_depth(x, nested_datasets), children)) + 1 if children else 1
904
- except KeyError:
905
- return 0
1336
+ pb = self._ml_instance.pathBuilder()
1337
+ dataset_execution_path = pb.schemas[self._ml_instance.ml_schema].Dataset_Execution
1338
+
1339
+ # Query for all executions associated with this dataset
1340
+ records = list(
1341
+ dataset_execution_path.filter(dataset_execution_path.Dataset == self.dataset_rid)
1342
+ .entities()
1343
+ .fetch()
1344
+ )
1345
+
1346
+ return [self._ml_instance.lookup_execution(record["Execution"]) for record in records]
1347
+
1348
+ @staticmethod
1349
+ def _insert_dataset_versions(
1350
+ ml_instance: DerivaMLCatalog,
1351
+ dataset_list: list[DatasetSpec],
1352
+ description: str | None = "",
1353
+ execution_rid: RID | None = None,
1354
+ ) -> None:
1355
+ """Insert new version records for a list of datasets.
1356
+
1357
+ This internal method creates Dataset_Version records in the catalog for
1358
+ each dataset in the list. It also captures a catalog snapshot timestamp
1359
+ to associate with these versions.
1360
+
1361
+ The version record links:
1362
+ - The dataset RID to its new version number
1363
+ - An optional description of what changed
1364
+ - An optional execution that triggered the version change
1365
+ - The catalog snapshot time for reproducibility
1366
+
1367
+ Args:
1368
+ ml_instance: The catalog instance to insert versions into.
1369
+ dataset_list: List of DatasetSpec objects containing RID and version info.
1370
+ description: Optional description of the version change.
1371
+ execution_rid: Optional execution RID to associate with the version.
1372
+ """
1373
+ schema_path = ml_instance.pathBuilder().schemas[ml_instance.ml_schema]
906
1374
 
907
- # Build up the dataset_table nesting graph...
908
- pb = self._model.catalog.getPathBuilder().schemas[self._ml_schema].tables["Dataset_Dataset"]
909
- dataset_children = (
1375
+ # Insert version records for all datasets in the list
1376
+ version_records = schema_path.tables["Dataset_Version"].insert(
910
1377
  [
911
1378
  {
912
- "Dataset": dataset_rid,
913
- "Nested_Dataset": c,
914
- } # Make uniform with return from datapath
915
- for c in self.list_dataset_children(dataset_rid=dataset_rid)
1379
+ "Dataset": dataset.rid,
1380
+ "Version": str(dataset.version),
1381
+ "Description": description,
1382
+ "Execution": execution_rid,
1383
+ }
1384
+ for dataset in dataset_list
916
1385
  ]
917
- if dataset_rid
918
- else pb.entities().fetch()
919
1386
  )
920
- nested_dataset = defaultdict(list)
921
- for ds in dataset_children:
922
- nested_dataset[ds["Dataset"]].append(ds["Nested_Dataset"])
923
- return max(map(lambda d: children_depth(d, dict(nested_dataset)), nested_dataset)) if nested_dataset else 0
1387
+ version_records = list(version_records)
924
1388
 
925
- def _dataset_specification(
926
- self,
927
- writer: Callable[[str, str, Table], list[dict[str, Any]]],
928
- dataset: DatasetSpec | None = None,
929
- snapshot_catalog: DerivaML | None = None,
930
- ) -> list[dict[str, Any]]:
931
- """Output a download/export specification for a dataset_table. Each element of the dataset_table
932
- will be placed in its own directory.
933
- The top level data directory of the resulting BDBag will have one subdirectory for element type.
934
- The subdirectory will contain the CSV indicating which elements of that type are present in the
935
- dataset_table, and then there will be a subdirectory for each object that is reachable from the
936
- dataset_table members.
937
-
938
- To simplify reconstructing the relationship between tables, the CVS for each element is included.
939
- The top level data directory will also contain a subdirectory for any controlled vocabularies used in
940
- the dataset_table. All assets will be placed into a directory named asset in a subdirectory with the
941
- asset table name.
942
-
943
- For example, consider a dataset_table that consists of two element types, T1 and T2. T1 has foreign
944
- key relationships to objects in tables T3 and T4. There are also two controlled vocabularies, CV1 and
945
- CV2. T2 is an asset table which has two assets in it. The layout of the resulting bdbag would be:
946
- data
947
- CV1/
948
- cv1.csv
949
- CV2/
950
- cv2.csv
951
- Dataset/
952
- T1/
953
- t1.csv
954
- T3/
955
- t3.csv
956
- T4/
957
- t4.csv
958
- T2/
959
- t2.csv
960
- asset/
961
- T2
962
- f1
963
- f2
1389
+ # Capture the current catalog snapshot timestamp. This allows us to
1390
+ # recreate the exact state of the catalog when this version was created.
1391
+ snap = ml_instance.catalog.get("/").json()["snaptime"]
964
1392
 
965
- Args:
966
- writer: Callable[[list[Table]]: list[dict[str: Any]]]:
1393
+ # Update version records with the snapshot timestamp
1394
+ schema_path.tables["Dataset_Version"].update(
1395
+ [{"RID": v["RID"], "Dataset": v["Dataset"], "Snapshot": snap} for v in version_records]
1396
+ )
967
1397
 
968
- Returns:
969
- A dataset_table specification.
970
- """
971
- element_spec = self._export_vocabulary(writer)
972
- for path in self._table_paths(dataset=dataset, snapshot_catalog=snapshot_catalog):
973
- element_spec.extend(writer(*path))
974
- return element_spec
1398
+ # Update each dataset's current version pointer to the new version record
1399
+ schema_path.tables["Dataset"].update([{"Version": v["RID"], "RID": v["Dataset"]} for v in version_records])
975
1400
 
976
- def _download_dataset_bag(
1401
+ @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
1402
+ def download_dataset_bag(
977
1403
  self,
978
- dataset: DatasetSpec,
979
- execution_rid: RID | None = None,
980
- snapshot_catalog: DerivaML | None = None,
1404
+ version: DatasetVersion | str,
1405
+ materialize: bool = True,
1406
+ use_minid: bool = False,
981
1407
  ) -> DatasetBag:
982
- """Download a dataset onto the local file system. Create a MINID for the dataset if one doesn't already exist.
1408
+ """Downloads a dataset to the local filesystem and optionally creates a MINID.
1409
+
1410
+ Downloads a dataset to the local file system. If the dataset has a version set, that version is used.
1411
+ If the dataset has a version and a version is provided, the version specified takes precedence.
983
1412
 
984
1413
  Args:
985
- dataset: Specification of the dataset to be downloaded.
986
- execution_rid: Execution RID for the dataset.
987
- snapshot_catalog: Snapshot catalog for the dataset version if specified.
1414
+ version: Dataset version to download, given as a DatasetVersion or a version string (e.g., "1.0.0").
1415
+ materialize: If True, materialize the dataset after downloading.
1416
+ use_minid: If True, upload the bag to S3 and create a MINID for the dataset.
1417
+ Requires s3_bucket to be configured on the catalog. Defaults to False.
988
1418
 
989
1419
  Returns:
990
- Tuple consisting of the path to the dataset, the RID of the dataset that was downloaded and the MINID
991
- for the dataset.
1420
+ DatasetBag: Object containing:
1421
+ - path: Local filesystem path to downloaded dataset
1422
+ - rid: Dataset's Resource Identifier
1423
+ - minid: Dataset's Minimal Viable Identifier (if use_minid=True)
1424
+
1425
+ Raises:
1426
+ DerivaMLException: If use_minid=True but s3_bucket is not configured on the catalog.
1427
+
1428
+ Examples:
1429
+ Download without MINID (default):
1430
+ >>> bag = dataset.download_dataset_bag(version="1.0.0")
1431
+ >>> print(f"Downloaded to {bag.path}")
1432
+
1433
+ Download with MINID (requires s3_bucket configured):
1434
+ >>> # Catalog must be created with s3_bucket="s3://my-bucket"
1435
+ >>> bag = dataset.download_dataset_bag(version="1.0.0", use_minid=True)
992
1436
  """
993
- if (
994
- execution_rid
995
- and execution_rid != DRY_RUN_RID
996
- and self._model.catalog.resolve_rid(execution_rid).table.name != "Execution"
997
- ):
998
- raise DerivaMLException(f"RID {execution_rid} is not an execution")
999
- minid = self._get_dataset_minid(dataset, snapshot_catalog=snapshot_catalog)
1437
+ if isinstance(version, str):
1438
+ version = DatasetVersion.parse(version)
1439
+
1440
+ # Validate use_minid requires s3_bucket configuration
1441
+ if use_minid and not self._ml_instance.s3_bucket:
1442
+ raise DerivaMLException(
1443
+ "Cannot use use_minid=True without s3_bucket configured. "
1444
+ "Configure s3_bucket when creating the DerivaML instance to enable MINID support."
1445
+ )
1446
+
1447
+ minid = self._get_dataset_minid(version, create=True, use_minid=use_minid)
1000
1448
 
1001
1449
  bag_path = (
1002
- self._materialize_dataset_bag(minid, execution_rid=execution_rid)
1003
- if dataset.materialize
1004
- else self._download_dataset_minid(minid)
1450
+ self._materialize_dataset_bag(minid, use_minid=use_minid)
1451
+ if materialize
1452
+ else self._download_dataset_minid(minid, use_minid)
1005
1453
  )
1006
- return DatabaseModel(minid, bag_path, self._working_dir).get_dataset()
1454
+ from deriva_ml.model.deriva_ml_database import DerivaMLDatabase
1455
+ db_model = DatabaseModel(minid, bag_path, self._ml_instance.working_dir)
1456
+ return DerivaMLDatabase(db_model).lookup_dataset(self.dataset_rid)
1457
+
1458
+ def _version_snapshot_catalog(self, dataset_version: DatasetVersion | str | None) -> DerivaMLCatalog:
1459
+ """Get a catalog instance bound to a specific version's snapshot.
1460
+
1461
+ Dataset versions are associated with catalog snapshots, which represent
1462
+ the exact state of the catalog at the time the version was created.
1463
+ This method returns a catalog instance that queries against that snapshot,
1464
+ ensuring reproducible access to historical data.
1465
+
1466
+ Args:
1467
+ dataset_version: The version to get a snapshot for, or None to use
1468
+ the current catalog state.
1469
+
1470
+ Returns:
1471
+ DerivaMLCatalog: Either a snapshot-bound catalog or the current catalog.
1472
+ """
1473
+ if isinstance(dataset_version, str) and dataset_version:
1474
+ dataset_version = DatasetVersion.parse(dataset_version)
1475
+ if dataset_version:
1476
+ return self._ml_instance.catalog_snapshot(self._version_snapshot_catalog_id(dataset_version))
1477
+ else:
1478
+ return self._ml_instance
1479
+
1480
+ def _version_snapshot_catalog_id(self, version: DatasetVersion | str) -> str:
1481
+ """Get the catalog ID with snapshot suffix for a specific version.
1482
+
1483
+ Constructs a catalog identifier in the format "catalog_id@snapshot_time"
1484
+ that can be used to access the catalog state at the time the version
1485
+ was created.
1486
+
1487
+ Args:
1488
+ version: The dataset version to get the snapshot for.
1489
+
1490
+ Returns:
1491
+ str: Catalog ID with snapshot suffix (e.g., "1@2023-01-15T10:30:00").
1007
1492
 
1008
- def _version_snapshot(self, dataset: DatasetSpec) -> str:
1009
- """Return a catalog with snapshot for the specified dataset version"""
1493
+ Raises:
1494
+ DerivaMLException: If the specified version doesn't exist.
1495
+ """
1496
+ version = str(version)
1010
1497
  try:
1011
- version_record = next(
1012
- h for h in self.dataset_history(dataset_rid=dataset.rid) if h.dataset_version == dataset.version
1013
- )
1498
+ version_record = next(h for h in self.dataset_history() if h.dataset_version == version)
1014
1499
  except StopIteration:
1015
- raise DerivaMLException(f"Dataset version {dataset.version} not found for dataset {dataset.rid}")
1016
- return f"{self._model.catalog.catalog_id}@{version_record.snapshot}"
1500
+ raise DerivaMLException(f"Dataset version {version} not found for dataset {self.dataset_rid}")
1501
+ return (
1502
+ f"{self._ml_instance.catalog.catalog_id}@{version_record.snapshot}"
1503
+ if version_record.snapshot
1504
+ else self._ml_instance.catalog.catalog_id
1505
+ )
1506
+
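A sketch of how the snapshot-qualified identifier is assembled and split apart, using placeholder values in the format shown in the docstring.

    catalog_id, snapshot = "1", "2023-01-15T10:30:00"
    snapshot_id = f"{catalog_id}@{snapshot}" if snapshot else catalog_id  # "1@2023-01-15T10:30:00"
    base_id, _, snaptime = snapshot_id.partition("@")  # back to ("1", "2023-01-15T10:30:00")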
1507
+ def _download_dataset_minid(self, minid: DatasetMinid, use_minid: bool) -> Path:
1508
+ """Download and extract a dataset bag from a MINID or direct URL.
1509
+
1510
+ This method handles the download of a BDBag archive, either from S3 storage
1511
+ (if using MINIDs) or directly from the catalog server. Downloaded bags are
1512
+ cached by checksum to avoid redundant downloads.
1513
+
1514
+ Args:
1515
+ minid: DatasetMinid containing the bag URL and metadata.
1516
+ use_minid: If True, download from S3 using the MINID URL.
1517
+ If False, download directly from the catalog server.
1518
+
1519
+ Returns:
1520
+ Path: The path to the extracted and validated bag directory.
1521
+
1522
+ Note:
1523
+ Bags are cached in the cache_dir with the naming convention:
1524
+ "{dataset_rid}_{checksum}/Dataset_{dataset_rid}"
1525
+ """
1526
+
1527
+ # Check to see if we have an existing idempotent materialization of the desired bag. If so, then reuse
1528
+ # it. If not, then we need to extract the contents of the archive into our cache directory.
1529
+ bag_dir = self._ml_instance.cache_dir / f"{minid.dataset_rid}_{minid.checksum}"
1530
+ if bag_dir.exists():
1531
+ self._logger.info(f"Using cached bag for {minid.dataset_rid} Version:{minid.dataset_version}")
1532
+ return Path(bag_dir / f"Dataset_{minid.dataset_rid}")
1533
+
1534
+ # Either bag hasn't been downloaded yet, or we are not using a Minid, so we don't know the checksum yet.
1535
+ with TemporaryDirectory() as tmp_dir:
1536
+ if use_minid:
1537
+ # Get bag from S3
1538
+ bag_path = Path(tmp_dir) / Path(urlparse(minid.bag_url).path).name
1539
+ archive_path = fetch_single_file(minid.bag_url, output_path=bag_path)
1540
+ else:
1541
+ exporter = DerivaExport(host=self._ml_instance.catalog.deriva_server.server, output_dir=tmp_dir)
1542
+ archive_path = exporter.retrieve_file(minid.bag_url)
1543
+ hashes = hash_utils.compute_file_hashes(archive_path, hashes=["md5", "sha256"])
1544
+ checksum = hashes["sha256"][0]
1545
+ bag_dir = self._ml_instance.cache_dir / f"{minid.dataset_rid}_{checksum}"
1546
+ if bag_dir.exists():
1547
+ self._logger.info(f"Using cached bag for {minid.dataset_rid} Version:{minid.dataset_version}")
1548
+ return Path(bag_dir / f"Dataset_{minid.dataset_rid}")
1549
+ bag_path = bdb.extract_bag(archive_path, bag_dir.as_posix())
1550
+ bdb.validate_bag_structure(bag_path)
1551
+ return Path(bag_path)
1552
+
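A sketch of the checksum-keyed cache layout described in the Note above, with a placeholder cache location, RID, and checksum.

    from pathlib import Path

    cache_dir = Path("~/.deriva-ml/cache").expanduser()  # placeholder location
    dataset_rid, checksum = "1-ABC", "deadbeef"  # placeholder sha256 of the bag archive
    bag_dir = cache_dir / f"{dataset_rid}_{checksum}"
    extracted_bag = bag_dir / f"Dataset_{dataset_rid}"
    if extracted_bag.exists():
        print("reusing cached bag at", extracted_bag)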
1553
+ def _create_dataset_minid(self, version: DatasetVersion, use_minid: bool = True) -> str:
1554
+ """Create a new MINID (Minimal Viable Identifier) for the dataset.
1555
+
1556
+ This method generates a BDBag export of the dataset and optionally
1557
+ registers it with a MINID service for persistent identification.
1558
+ The bag is uploaded to S3 storage when using MINIDs.
1559
+
1560
+ Args:
1561
+ version: The dataset version to create a MINID for.
1562
+ use_minid: If True, register with MINID service and upload to S3.
1563
+ If False, just generate the bag and return a local URL.
1017
1564
 
1018
- def _create_dataset_minid(self, dataset: DatasetSpec, snapshot_catalog: DerivaML | None = None) -> str:
1565
+ Returns:
1566
+ str: URL to the MINID landing page (if use_minid=True) or
1567
+ the direct bag download URL.
1568
+ """
1019
1569
  with TemporaryDirectory() as tmp_dir:
1020
1570
  # Generate a download specification file for the current catalog schema. By default, this spec
1021
1571
  # will generate a minid and place the bag into S3 storage.
1022
1572
  spec_file = Path(tmp_dir) / "download_spec.json"
1573
+ version_snapshot_catalog = self._version_snapshot_catalog(version)
1023
1574
  with spec_file.open("w", encoding="utf-8") as ds:
1024
- json.dump(self._generate_dataset_download_spec(dataset, snapshot_catalog), ds)
1575
+ downloader = CatalogGraph(
1576
+ version_snapshot_catalog,
1577
+ s3_bucket=self._ml_instance.s3_bucket,
1578
+ use_minid=use_minid,
1579
+ )
1580
+ json.dump(downloader.generate_dataset_download_spec(self), ds)
1025
1581
  try:
1026
1582
  self._logger.info(
1027
1583
  "Downloading dataset %s for catalog: %s@%s"
1028
1584
  % (
1029
- "minid" if self._use_minid else "bag",
1030
- dataset.rid,
1031
- str(dataset.version),
1585
+ "minid" if use_minid else "bag",
1586
+ self.dataset_rid,
1587
+ str(version),
1032
1588
  )
1033
1589
  )
1034
1590
  # Generate the bag and put into S3 storage.
1035
1591
  exporter = DerivaExport(
1036
- host=self._model.catalog.deriva_server.server,
1592
+ host=self._ml_instance.catalog.deriva_server.server,
1037
1593
  config_file=spec_file,
1038
1594
  output_dir=tmp_dir,
1039
1595
  defer_download=True,
1040
1596
  timeout=(10, 610),
1041
- envars={"RID": dataset.rid},
1597
+ envars={"RID": self.dataset_rid},
1042
1598
  )
1043
1599
  minid_page_url = exporter.export()[0] # Get the MINID launch page
1044
1600
  except (
@@ -1050,131 +1606,117 @@ class Dataset:
1050
1606
  ) as e:
1051
1607
  raise DerivaMLException(format_exception(e))
1052
1608
  # Update version table with MINID.
1053
- if self._use_minid:
1054
- version_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema].tables["Dataset_Version"]
1055
- version_rid = [
1056
- h for h in self.dataset_history(dataset_rid=dataset.rid) if h.dataset_version == dataset.version
1057
- ][0].version_rid
1609
+ if use_minid:
1610
+ version_path = (
1611
+ self._ml_instance.pathBuilder().schemas[self._ml_instance.ml_schema].tables["Dataset_Version"]
1612
+ )
1613
+ version_rid = [h for h in self.dataset_history() if h.dataset_version == version][0].version_rid
1058
1614
  version_path.update([{"RID": version_rid, "Minid": minid_page_url}])
1059
1615
  return minid_page_url
1060
1616
 
1061
1617
  def _get_dataset_minid(
1062
1618
  self,
1063
- dataset: DatasetSpec,
1064
- snapshot_catalog: DerivaML | None = None,
1065
- create: bool = True,
1619
+ version: DatasetVersion,
1620
+ create: bool,
1621
+ use_minid: bool,
1066
1622
  ) -> DatasetMinid | None:
1067
- """Return a MINID for the specified dataset. If no version is specified, use the latest.
1623
+ """Get or create a MINID for the specified dataset version.
1624
+
1625
+ This method retrieves the MINID associated with a specific dataset version,
1626
+ optionally creating one if it doesn't exist.
1068
1627
 
1069
1628
  Args:
1070
- dataset: Specification of the dataset.
1071
- snapshot_catalog: Snapshot catalog for the dataset version if specified.
1072
- create: Create a new MINID if one doesn't already exist.
1629
+ version: The dataset version to get the MINID for.
1630
+ create: If True, create a new MINID if one doesn't already exist.
1631
+ If False, raise an exception if no MINID exists.
1632
+ use_minid: If True, use the MINID service for persistent identification.
1633
+ If False, generate a direct download URL without MINID registration.
1073
1634
 
1074
1635
  Returns:
1075
- New or existing MINID for the dataset.
1076
- """
1077
- rid = dataset.rid
1078
-
1079
- # Case 1: RID is already a MINID or direct URL
1080
- if rid.startswith("minid"):
1081
- return self._fetch_minid_metadata(f"https://identifiers.org/{rid}", dataset.version)
1082
- if rid.startswith("http"):
1083
- return self._fetch_minid_metadata(rid, dataset.version)
1636
+ DatasetMinid: Object containing the MINID URL, checksum, and metadata.
1084
1637
 
1085
- # Case 2: RID is a dataset RID – validate existence
1086
- if not any(rid == ds["RID"] for ds in self.find_datasets()):
1087
- raise DerivaMLTableTypeError("Dataset", rid)
1638
+ Raises:
1639
+ DerivaMLException: If the version doesn't exist, or if create=False
1640
+ and no MINID exists.
1641
+ """
1088
1642
 
1089
1643
  # Find dataset version record
1090
- version_str = str(dataset.version)
1091
- history = self.dataset_history(rid)
1644
+ version_str = str(version)
1645
+ history = self.dataset_history()
1092
1646
  try:
1093
1647
  version_record = next(v for v in history if v.dataset_version == version_str)
1094
1648
  except StopIteration:
1095
- raise DerivaMLException(f"Version {version_str} does not exist for RID {rid}")
1649
+ raise DerivaMLException(f"Version {version_str} does not exist for RID {self.dataset_rid}")
1096
1650
 
1097
1651
  # Check or create MINID
1098
1652
  minid_url = version_record.minid
1099
1653
  # If we either don't have a MINID, or we have a MINID, but we don't want to use it, generate a new one.
1100
- if (not minid_url) or (not self._use_minid):
1654
+ if (not minid_url) or (not use_minid):
1101
1655
  if not create:
1102
- raise DerivaMLException(f"Minid for dataset {rid} doesn't exist")
1103
- if self._use_minid:
1104
- self._logger.info("Creating new MINID for dataset %s", rid)
1105
- minid_url = self._create_dataset_minid(dataset, snapshot_catalog)
1656
+ raise DerivaMLException(f"Minid for dataset {self.dataset_rid} doesn't exist")
1657
+ if use_minid:
1658
+ self._logger.info("Creating new MINID for dataset %s", self.dataset_rid)
1659
+ minid_url = self._create_dataset_minid(version, use_minid=use_minid)
1106
1660
 
1107
1661
  # Return based on MINID usage
1108
- if self._use_minid:
1109
- return self._fetch_minid_metadata(minid_url, dataset.version)
1662
+ if use_minid:
1663
+ return self._fetch_minid_metadata(version, minid_url)
1110
1664
  return DatasetMinid(
1111
- dataset_version=dataset.version,
1112
- RID=f"{rid}@{version_record.snapshot}",
1665
+ dataset_version=version,
1666
+ RID=f"{self.dataset_rid}@{version_record.snapshot}",
1113
1667
  location=minid_url,
1114
1668
  )
1115
1669
 
1116
- def _fetch_minid_metadata(self, url: str, version: DatasetVersion) -> DatasetMinid:
1117
- r = requests.get(url, headers={"accept": "application/json"})
1118
- r.raise_for_status()
1119
- return DatasetMinid(dataset_version=version, **r.json())
1120
-
1121
- def _download_dataset_minid(self, minid: DatasetMinid) -> Path:
1122
- """Given a RID to a dataset_table, or a MINID to an existing bag, download the bag file, extract it, and
1123
- validate that all the metadata is correct
1670
+ def _fetch_minid_metadata(self, version: DatasetVersion, url: str) -> DatasetMinid:
1671
+ """Fetch MINID metadata from the MINID service.
1124
1672
 
1125
1673
  Args:
1126
- minid: The RID of a dataset_table or a minid to an existing bag.
1127
- Returns:
1128
- the location of the unpacked and validated dataset_table bag and the RID of the bag and the bag MINID
1129
- """
1674
+ version: The dataset version associated with this MINID.
1675
+ url: The MINID landing page URL.
1130
1676
 
1131
- # Check to see if we have an existing idempotent materialization of the desired bag. If so, then reuse
1132
- # it. If not, then we need to extract the contents of the archive into our cache directory.
1133
- bag_dir = self._cache_dir / f"{minid.dataset_rid}_{minid.checksum}"
1134
- if bag_dir.exists():
1135
- self._logger.info(f"Using cached bag for {minid.dataset_rid} Version:{minid.dataset_version}")
1136
- return Path(bag_dir / f"Dataset_{minid.dataset_rid}")
1677
+ Returns:
1678
+ DatasetMinid: Parsed metadata including bag URL, checksum, and identifiers.
1137
1679
 
1138
- # Either bag hasn't been downloaded yet, or we are not using a Minid, so we don't know the checksum yet.
1139
- with TemporaryDirectory() as tmp_dir:
1140
- if self._use_minid:
1141
- # Get bag from S3
1142
- bag_path = Path(tmp_dir) / Path(urlparse(minid.bag_url).path).name
1143
- archive_path = fetch_single_file(minid.bag_url, output_path=bag_path)
1144
- else:
1145
- exporter = DerivaExport(host=self._model.catalog.deriva_server.server, output_dir=tmp_dir)
1146
- archive_path = exporter.retrieve_file(minid.bag_url)
1147
- hashes = hash_utils.compute_file_hashes(archive_path, hashes=["md5", "sha256"])
1148
- checksum = hashes["sha256"][0]
1149
- bag_dir = self._cache_dir / f"{minid.dataset_rid}_{checksum}"
1150
- if bag_dir.exists():
1151
- self._logger.info(f"Using cached bag for {minid.dataset_rid} Version:{minid.dataset_version}")
1152
- return Path(bag_dir / f"Dataset_{minid.dataset_rid}")
1153
- bag_path = bdb.extract_bag(archive_path, bag_dir.as_posix())
1154
- bdb.validate_bag_structure(bag_path)
1155
- return Path(bag_path)
1680
+ Raises:
1681
+ requests.HTTPError: If the MINID service request fails.
1682
+ """
1683
+ r = requests.get(url, headers={"accept": "application/json"})
1684
+ r.raise_for_status()
1685
+ return DatasetMinid(dataset_version=version, **r.json())
1156
1686
 
1157
1687
  def _materialize_dataset_bag(
1158
1688
  self,
1159
1689
  minid: DatasetMinid,
1160
- execution_rid: RID | None = None,
1690
+ use_minid: bool,
1161
1691
  ) -> Path:
1162
- """Materialize a dataset_table bag into a local directory
1692
+ """Materialize a dataset bag by downloading all referenced files.
1693
+
1694
+ This method downloads a BDBag and then "materializes" it by fetching
1695
+ all files referenced in the bag's fetch.txt manifest. This includes
1696
+ data files, assets, and any other content referenced by the bag.
1697
+
1698
+ Progress is reported through callbacks that update the execution status
1699
+ if this download is associated with an execution.
1163
1700
 
1164
1701
  Args:
1165
- minid: A MINID to an existing bag or a RID of the dataset_table that should be downloaded.
1702
+ minid: DatasetMinid containing the bag URL and metadata.
1703
+ use_minid: If True, download from S3 using the MINID URL.
1166
1704
 
1167
1705
  Returns:
1168
- A tuple containing the path to the bag, the RID of the bag, and the MINID to the bag.
1706
+ Path: The path to the fully materialized bag directory.
1707
+
1708
+ Note:
1709
+ Materialization status is cached via a 'validated_check.txt' marker
1710
+ file to avoid re-downloading already-materialized bags.
1169
1711
  """
1170
1712
 
1171
1713
  def update_status(status: Status, msg: str) -> None:
1172
1714
  """Update the current status for this execution in the catalog"""
1173
- if execution_rid and execution_rid != DRY_RUN_RID:
1174
- self._model.catalog.getPathBuilder().schemas[self._ml_schema].Execution.update(
1715
+ if self.execution_rid and self.execution_rid != DRY_RUN_RID:
1716
+ self._ml_instance.pathBuilder().schemas[self._ml_instance.ml_schema].Execution.update(
1175
1717
  [
1176
1718
  {
1177
- "RID": execution_rid,
1719
+ "RID": self.execution_rid,
1178
1720
  "Status": status.value,
1179
1721
  "Status_Detail": msg,
1180
1722
  }
@@ -1184,18 +1726,18 @@ class Dataset:
1184
1726
 
1185
1727
  def fetch_progress_callback(current, total):
1186
1728
  msg = f"Materializing bag: {current} of {total} file(s) downloaded."
1187
- if execution_rid:
1729
+ if self.execution_rid:
1188
1730
  update_status(Status.running, msg)
1189
1731
  return True
1190
1732
 
1191
1733
  def validation_progress_callback(current, total):
1192
1734
  msg = f"Validating bag: {current} of {total} file(s) validated."
1193
- if execution_rid:
1735
+ if self.execution_rid:
1194
1736
  update_status(Status.running, msg)
1195
1737
  return True
1196
1738
 
1197
1739
  # request metadata
1198
- bag_path = self._download_dataset_minid(minid)
1740
+ bag_path = self._download_dataset_minid(minid, use_minid)
1199
1741
  bag_dir = bag_path.parent
1200
1742
  validated_check = bag_dir / "validated_check.txt"
1201
1743
 
@@ -1209,311 +1751,3 @@ class Dataset:
1209
1751
  )
1210
1752
  validated_check.touch()
1211
1753
  return Path(bag_path)
1212
-
1213
- def _export_annotation(
1214
- self,
1215
- snapshot_catalog: DerivaML | None = None,
1216
- ) -> list[dict[str, Any]]:
1217
- """Return and output specification for the datasets in the provided model
1218
-
1219
- Returns:
1220
- An export specification suitable for Chaise.
1221
- """
1222
-
1223
- # Export specification is a specification for the datasets, plus any controlled vocabulary
1224
- return [
1225
- {
1226
- "source": {"api": False, "skip_root_path": True},
1227
- "destination": {"type": "env", "params": {"query_keys": ["snaptime"]}},
1228
- },
1229
- {
1230
- "source": {"api": "entity"},
1231
- "destination": {
1232
- "type": "env",
1233
- "params": {"query_keys": ["RID", "Description"]},
1234
- },
1235
- },
1236
- {
1237
- "source": {"api": "schema", "skip_root_path": True},
1238
- "destination": {"type": "json", "name": "schema"},
1239
- },
1240
- ] + self._dataset_specification(
1241
- self._export_annotation_dataset_element,
1242
- None,
1243
- snapshot_catalog=snapshot_catalog,
1244
- )
1245
-
1246
- def _export_specification(
1247
- self, dataset: DatasetSpec, snapshot_catalog: DerivaML | None = None
1248
- ) -> list[dict[str, Any]]:
1249
- """
1250
- Generate a specification for export engine for specific dataset.
1251
-
1252
- Returns:
1253
- a download specification for the datasets in the provided model.
1254
-
1255
- """
1256
-
1257
- # Download spec is the spec for any controlled vocabulary and for the dataset_table.
1258
- return [
1259
- {
1260
- "processor": "json",
1261
- "processor_params": {"query_path": "/schema", "output_path": "schema"},
1262
- }
1263
- ] + self._dataset_specification(self._export_specification_dataset_element, dataset, snapshot_catalog)
1264
-
1265
- @staticmethod
1266
- def _export_specification_dataset_element(spath: str, dpath: str, table: Table) -> list[dict[str, Any]]:
1267
- """Return the download specification for the data object indicated by a path through the data model.
1268
-
1269
- Args:
1270
- spath: Source path
1271
- dpath: Destination path
1272
- table: Table referenced to by the path
1273
-
1274
- Returns:
1275
- The download specification that will retrieve that data from the catalog and place it into a BDBag.
1276
- """
1277
- exports = [
1278
- {
1279
- "processor": "csv",
1280
- "processor_params": {
1281
- "query_path": f"/entity/{spath}",
1282
- "output_path": dpath,
1283
- },
1284
- }
1285
- ]
1286
-
1287
- # If this table is an asset table, then we need to output the files associated with the asset.
1288
- asset_columns = {"Filename", "URL", "Length", "MD5", "Description"}
1289
- if asset_columns.issubset({c.name for c in table.columns}):
1290
- exports.append(
1291
- {
1292
- "processor": "fetch",
1293
- "processor_params": {
1294
- "query_path": f"/attribute/{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5,asset_rid:=RID",
1295
- "output_path": "asset/{asset_rid}/" + table.name,
1296
- },
1297
- }
1298
- )
1299
- return exports
1300
-
1301
- def _export_annotation_dataset_element(self, spath: str, dpath: str, table: Table) -> list[dict[str, Any]]:
1302
- """Given a path in the data model, output an export specification for the path taken to get to the
1303
- current table.
1304
-
1305
- Args:
1306
- spath: Source path
1307
- dpath: Destination path
1308
- table: Table referenced to by the path
1309
-
1310
- Returns:
1311
- The export specification that will retrieve that data from the catalog and place it into a BDBag.
1312
- """
1313
- # The table is the last element of the path. Generate the ERMRest query by converting the list of tables
1314
- # into a path in the form of /S:T1/S:T2/S:Table
1315
- # Generate the destination path in the file system using just the table names.
1316
-
1317
- skip_root_path = False
1318
- if spath.startswith(f"{self._ml_schema}:Dataset/"):
1319
- # Chaise will add table name and RID filter, so strip it off.
1320
- spath = "/".join(spath.split("/")[2:])
1321
- if spath == "":
1322
- # This path is to just the dataset table.
1323
- return []
1324
- else:
1325
- # A vocabulary table, so we don't want the root_path.
1326
- skip_root_path = True
1327
- exports = [
1328
- {
1329
- "source": {
1330
- "api": "entity",
1331
- "path": spath,
1332
- "skip_root_path": skip_root_path,
1333
- },
1334
- "destination": {"name": dpath, "type": "csv"},
1335
- }
1336
- ]
1337
-
1338
- # If this table is an asset table, then we need to output the files associated with the asset.
1339
- asset_columns = {"Filename", "URL", "Length", "MD5", "Description"}
1340
- if asset_columns.issubset({c.name for c in table.columns}):
1341
- exports.append(
1342
- {
1343
- "source": {
1344
- "skip_root_path": False,
1345
- "api": "attribute",
1346
- "path": f"{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5, asset_rid:=RID",
1347
- },
1348
- "destination": {"name": "asset/{asset_rid}/" + table.name, "type": "fetch"},
1349
- }
1350
- )
1351
- return exports
1352
-
1353
-    def _generate_dataset_download_spec(
-        self, dataset: DatasetSpec, snapshot_catalog: DerivaML | None = None
-    ) -> dict[str, Any]:
-        """
-        Generate a specification for downloading a specific dataset.
-
-        This routine creates a download specification that can be used by the Deriva export processor to download
-        a specific dataset as a MINID.
-        Returns:
-        """
-        s3_target = "s3://eye-ai-shared"
-        minid_test = False
-
-        catalog_id = self._version_snapshot(dataset)
-        post_processors = (
-            {
-                "post_processors": [
-                    {
-                        "processor": "cloud_upload",
-                        "processor_params": {
-                            "acl": "public-read",
-                            "target_url": s3_target,
-                        },
-                    },
-                    {
-                        "processor": "identifier",
-                        "processor_params": {
-                            "test": minid_test,
-                            "env_column_map": {
-                                "RID": "{RID}@{snaptime}",
-                                "Description": "{Description}",
-                            },
-                        },
-                    },
-                ]
-            }
-            if self._use_minid
-            else {}
-        )
-        return post_processors | {
-            "env": {"RID": "{RID}"},
-            "bag": {
-                "bag_name": "Dataset_{RID}",
-                "bag_algorithms": ["md5"],
-                "bag_archiver": "zip",
-                "bag_metadata": {},
-                "bag_idempotent": True,
-            },
-            "catalog": {
-                "host": f"{self._model.catalog.deriva_server.scheme}://{self._model.catalog.deriva_server.server}",
-                "catalog_id": catalog_id,
-                "query_processors": [
-                    {
-                        "processor": "env",
-                        "processor_params": {
-                            "output_path": "Dataset",
-                            "query_keys": ["snaptime"],
-                            "query_path": "/",
-                        },
-                    },
-                    {
-                        "processor": "env",
-                        "processor_params": {
-                            "query_path": "/entity/M:=deriva-ml:Dataset/RID={RID}",
-                            "output_path": "Dataset",
-                            "query_keys": ["RID", "Description"],
-                        },
-                    },
-                ]
-                + self._export_specification(dataset, snapshot_catalog),
-            },
-        }
-
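The removed method assembles its result with a conditional dict merge: when MINID minting is disabled the post-processor block is an empty dict, so the "|" union leaves the base specification untouched. A minimal sketch, with use_minid and the trimmed dicts standing in for self._use_minid and the full spec:

    use_minid = False  # stand-in for self._use_minid

    post_processors = (
        {"post_processors": [{"processor": "cloud_upload"}, {"processor": "identifier"}]}
        if use_minid
        else {}
    )
    spec = post_processors | {
        "env": {"RID": "{RID}"},
        "bag": {"bag_name": "Dataset_{RID}", "bag_archiver": "zip"},
    }
    print("post_processors" in spec)  # False -- no MINID/cloud-upload steps when the flag is off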
-    def _generate_dataset_download_annotations(self) -> dict[str, Any]:
-        post_processors = (
-            {
-                "type": "BAG",
-                "outputs": [{"fragment_key": "dataset_export_outputs"}],
-                "displayname": "BDBag to Cloud",
-                "bag_idempotent": True,
-                "postprocessors": [
-                    {
-                        "processor": "cloud_upload",
-                        "processor_params": {
-                            "acl": "public-read",
-                            "target_url": "s3://eye-ai-shared/",
-                        },
-                    },
-                    {
-                        "processor": "identifier",
-                        "processor_params": {
-                            "test": False,
-                            "env_column_map": {
-                                "RID": "{RID}@{snaptime}",
-                                "Description": "{Description}",
-                            },
-                        },
-                    },
-                ],
-            }
-            if self._use_minid
-            else {}
-        )
-        return {
-            deriva_tags.export_fragment_definitions: {"dataset_export_outputs": self._export_annotation()},
-            deriva_tags.visible_foreign_keys: self._dataset_visible_fkeys(),
-            deriva_tags.export_2019: {
-                "detailed": {
-                    "templates": [
-                        {
-                            "type": "BAG",
-                            "outputs": [{"fragment_key": "dataset_export_outputs"}],
-                            "displayname": "BDBag Download",
-                            "bag_idempotent": True,
-                        }
-                        | post_processors
-                    ]
-                }
-            },
-        }
-
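The same conditional-merge pattern drives the export_2019 template in the method above: the base BDBag template is always emitted, and when MINIDs are enabled the merged block layers on the post-processors and overrides the display name, since the right-hand operand of "|" wins. Roughly, with an illustrative flag and trimmed dicts:

    use_minid = True  # stand-in for self._use_minid

    post_processors = (
        {"displayname": "BDBag to Cloud", "postprocessors": [{"processor": "cloud_upload"}]}
        if use_minid
        else {}
    )
    template = {
        "type": "BAG",
        "outputs": [{"fragment_key": "dataset_export_outputs"}],
        "displayname": "BDBag Download",
        "bag_idempotent": True,
    } | post_processors
    print(template["displayname"])  # 'BDBag to Cloud' when use_minid, otherwise 'BDBag Download'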
-    def _dataset_visible_fkeys(self) -> dict[str, Any]:
-        def fkey_name(fk):
-            return [fk.name[0].name, fk.name[1]]
-
-        dataset_table = self._model.schemas["deriva-ml"].tables["Dataset"]
-
-        source_list = [
-            {
-                "source": [
-                    {"inbound": ["deriva-ml", "Dataset_Version_Dataset_fkey"]},
-                    "RID",
-                ],
-                "markdown_name": "Previous Versions",
-                "entity": True,
-            },
-            {
-                "source": [
-                    {"inbound": ["deriva-ml", "Dataset_Dataset_Nested_Dataset_fkey"]},
-                    {"outbound": ["deriva-ml", "Dataset_Dataset_Dataset_fkey"]},
-                    "RID",
-                ],
-                "markdown_name": "Parent Datasets",
-            },
-            {
-                "source": [
-                    {"inbound": ["deriva-ml", "Dataset_Dataset_Dataset_fkey"]},
-                    {"outbound": ["deriva-ml", "Dataset_Dataset_Nested_Dataset_fkey"]},
-                    "RID",
-                ],
-                "markdown_name": "Child Datasets",
-            },
-        ]
-        source_list.extend(
-            [
-                {
-                    "source": [
-                        {"inbound": fkey_name(fkey.self_fkey)},
-                        {"outbound": fkey_name(other_fkey := fkey.other_fkeys.pop())},
-                        "RID",
-                    ],
-                    "markdown_name": other_fkey.pk_table.name,
-                }
-                for fkey in dataset_table.find_associations(max_arity=3, pure=False)
-            ]
-        )
-        return {"detailed": source_list}