deriva-ml 1.13.3__py3-none-any.whl → 1.14.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. deriva_ml/__init__.py +25 -30
  2. deriva_ml/core/__init__.py +39 -0
  3. deriva_ml/core/base.py +1489 -0
  4. deriva_ml/core/constants.py +36 -0
  5. deriva_ml/core/definitions.py +74 -0
  6. deriva_ml/core/enums.py +222 -0
  7. deriva_ml/core/ermrest.py +288 -0
  8. deriva_ml/core/exceptions.py +28 -0
  9. deriva_ml/core/filespec.py +116 -0
  10. deriva_ml/dataset/__init__.py +4 -0
  11. deriva_ml/{dataset_aux_classes.py → dataset/aux_classes.py} +16 -12
  12. deriva_ml/{dataset.py → dataset/dataset.py} +408 -416
  13. deriva_ml/{dataset_bag.py → dataset/dataset_bag.py} +137 -97
  14. deriva_ml/{history.py → dataset/history.py} +52 -33
  15. deriva_ml/{upload.py → dataset/upload.py} +48 -70
  16. deriva_ml/demo_catalog.py +233 -183
  17. deriva_ml/execution/environment.py +290 -0
  18. deriva_ml/{execution.py → execution/execution.py} +365 -252
  19. deriva_ml/execution/execution_configuration.py +163 -0
  20. deriva_ml/{execution_configuration.py → execution/workflow.py} +206 -218
  21. deriva_ml/feature.py +83 -46
  22. deriva_ml/model/__init__.py +0 -0
  23. deriva_ml/{deriva_model.py → model/catalog.py} +113 -132
  24. deriva_ml/{database_model.py → model/database.py} +52 -74
  25. deriva_ml/model/sql_mapper.py +44 -0
  26. deriva_ml/run_notebook.py +19 -11
  27. deriva_ml/schema/__init__.py +3 -0
  28. deriva_ml/{schema_setup → schema}/annotations.py +31 -22
  29. deriva_ml/schema/check_schema.py +104 -0
  30. deriva_ml/{schema_setup → schema}/create_schema.py +151 -104
  31. deriva_ml/schema/deriva-ml-reference.json +8525 -0
  32. deriva_ml/schema/table_comments_utils.py +57 -0
  33. {deriva_ml-1.13.3.dist-info → deriva_ml-1.14.26.dist-info}/METADATA +5 -4
  34. deriva_ml-1.14.26.dist-info/RECORD +40 -0
  35. {deriva_ml-1.13.3.dist-info → deriva_ml-1.14.26.dist-info}/entry_points.txt +1 -0
  36. deriva_ml/deriva_definitions.py +0 -372
  37. deriva_ml/deriva_ml_base.py +0 -1046
  38. deriva_ml/execution_environment.py +0 -139
  39. deriva_ml/schema_setup/table_comments_utils.py +0 -56
  40. deriva_ml/test-files/execution-parameters.json +0 -1
  41. deriva_ml/test-files/notebook-parameters.json +0 -5
  42. deriva_ml/test_functions.py +0 -141
  43. deriva_ml/test_notebook.ipynb +0 -197
  44. deriva_ml-1.13.3.dist-info/RECORD +0 -31
  45. /deriva_ml/{schema_setup → execution}/__init__.py +0 -0
  46. /deriva_ml/{schema_setup → schema}/policy.json +0 -0
  47. {deriva_ml-1.13.3.dist-info → deriva_ml-1.14.26.dist-info}/WHEEL +0 -0
  48. {deriva_ml-1.13.3.dist-info → deriva_ml-1.14.26.dist-info}/licenses/LICENSE +0 -0
  49. {deriva_ml-1.13.3.dist-info → deriva_ml-1.14.26.dist-info}/top_level.txt +0 -0
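The renames above reorganize the flat deriva_ml modules into core/, dataset/, execution/, model/, and schema/ subpackages. A minimal sketch of how the internal import paths move in 1.14.26, based only on the renames listed above and the import hunks shown below; whether the old flat paths still resolve depends on deriva_ml/__init__.py, which this release also changes:

    # Hedged sketch of the module moves between 1.13.3 and 1.14.26.
    # 1.13.3 internal paths (modules now removed):
    #   from deriva_ml.deriva_ml_base import DerivaML
    #   from deriva_ml.deriva_definitions import RID, ML_SCHEMA, MLVocab, Status, DerivaMLException
    #   from deriva_ml.deriva_model import DerivaModel
    #   from deriva_ml.database_model import DatabaseModel

    # 1.14.26 internal paths, as they appear in the import hunks below:
    from deriva_ml.core.base import DerivaML
    from deriva_ml.core.constants import RID
    from deriva_ml.core.definitions import ML_SCHEMA, MLVocab, Status
    from deriva_ml.core.exceptions import DerivaMLException
    from deriva_ml.dataset.dataset_bag import DatasetBag
    from deriva_ml.model.catalog import DerivaModel
    from deriva_ml.model.database import DatabaseModel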
@@ -1,74 +1,110 @@
1
- """
2
- This module defines the DataSet class with is used to manipulate datasets in DerivaML.
3
- The intended use of this class is as a base class in DerivaML, so all the methods documented here are
4
- accessible via a DerivaML class instance.
5
-
1
+ """Dataset management for DerivaML.
2
+
3
+ This module provides functionality for managing datasets in DerivaML. A dataset represents a collection
4
+ of related data that can be versioned, downloaded, and tracked. The module includes:
5
+
6
+ - Dataset class: Core class for dataset operations
7
+ - Version management: Track and update dataset versions
8
+ - History tracking: Record dataset changes over time
9
+ - Download capabilities: Export datasets as BDBags
10
+ - Relationship management: Handle dataset dependencies and hierarchies
11
+
12
+ The Dataset class serves as a base class in DerivaML, making its methods accessible through
13
+ DerivaML class instances.
14
+
15
+ Typical usage example:
16
+ >>> ml = DerivaML('deriva.example.org', 'my_catalog')
17
+ >>> dataset_rid = ml.create_dataset('experiment', 'Experimental data')
18
+ >>> ml.add_dataset_members(dataset_rid=dataset_rid, members=['1-abc123', '1-def456'])
19
+ >>> ml.increment_dataset_version(dataset_rid=dataset_rid, component=VersionPart.minor,
20
+ ... description='Added new samples')
6
21
  """
7
22
 
8
23
  from __future__ import annotations
9
- from bdbag import bdbag_api as bdb
10
- from bdbag.fetch.fetcher import fetch_single_file
11
- from collections import defaultdict
12
- from graphlib import TopologicalSorter
24
+
25
+ # Standard library imports
13
26
  import json
14
27
  import logging
28
+ from collections import defaultdict
29
+ from graphlib import TopologicalSorter
15
30
  from pathlib import Path
16
- from pydantic import (
17
- validate_call,
18
- ConfigDict,
19
- )
20
- import requests
21
31
  from tempfile import TemporaryDirectory
22
- from typing import Any, Callable, Optional, Iterable, Iterator, TYPE_CHECKING
32
+ from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator
23
33
 
34
+ import deriva.core.utils.hash_utils as hash_utils
35
+ import requests
24
36
 
37
+ # Third-party imports
38
+ from bdbag import bdbag_api as bdb
39
+ from bdbag.fetch.fetcher import fetch_single_file
40
+
41
+ # Deriva imports
25
42
  from deriva.core.ermrest_model import Table
26
- from deriva.core.utils.core_utils import tag as deriva_tags, format_exception
27
- import deriva.core.utils.hash_utils as hash_utils
28
- from deriva.transfer.download.deriva_export import DerivaExport
43
+ from deriva.core.utils.core_utils import format_exception
44
+ from deriva.core.utils.core_utils import tag as deriva_tags
29
45
  from deriva.transfer.download.deriva_download import (
30
- DerivaDownloadConfigurationError,
31
- DerivaDownloadError,
32
46
  DerivaDownloadAuthenticationError,
33
47
  DerivaDownloadAuthorizationError,
48
+ DerivaDownloadConfigurationError,
49
+ DerivaDownloadError,
34
50
  DerivaDownloadTimeoutError,
35
51
  )
52
+ from deriva.transfer.download.deriva_export import DerivaExport
53
+ from pydantic import ConfigDict, validate_call
36
54
 
37
-
55
+ # Local imports
38
56
  try:
39
57
  from icecream import ic
58
+
59
+ ic.configureOutput(includeContext=True)
40
60
  except ImportError: # Graceful fallback if IceCream isn't installed.
41
61
  ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a) # noqa
42
62
 
43
- from deriva_ml import DatasetBag
44
- from .deriva_definitions import (
63
+ from deriva_ml.core.constants import RID
64
+ from deriva_ml.core.definitions import (
65
+ DRY_RUN_RID,
45
66
  ML_SCHEMA,
46
- DerivaMLException,
47
67
  MLVocab,
48
68
  Status,
49
- RID,
50
- DRY_RUN_RID,
51
69
  )
52
- from .deriva_model import DerivaModel
53
- from .database_model import DatabaseModel
54
- from .dataset_aux_classes import (
55
- DatasetVersion,
56
- DatasetMinid,
70
+ from deriva_ml.core.exceptions import DerivaMLException, DerivaMLTableTypeError
71
+ from deriva_ml.dataset.aux_classes import (
57
72
  DatasetHistory,
58
- VersionPart,
73
+ DatasetMinid,
59
74
  DatasetSpec,
75
+ DatasetVersion,
76
+ VersionPart,
60
77
  )
78
+ from deriva_ml.dataset.dataset_bag import DatasetBag
79
+ from deriva_ml.model.catalog import DerivaModel
80
+ from deriva_ml.model.database import DatabaseModel
81
+
82
+ from .history import iso_to_snap
83
+
84
+ # Stop pycharm from complaining about undefined reference in docstring....
85
+ ml: DerivaML
61
86
 
62
87
  if TYPE_CHECKING:
63
- from .deriva_ml_base import DerivaML
88
+ from deriva_ml.core.base import DerivaML
64
89
 
65
90
 
66
91
  class Dataset:
67
- """
68
- Class to manipulate a dataset.
92
+ """Manages dataset operations in a Deriva catalog.
93
+
94
+ The Dataset class provides functionality for creating, modifying, and tracking datasets
95
+ in a Deriva catalog. It handles versioning, relationships between datasets, and data export.
69
96
 
70
97
  Attributes:
71
- dataset_table (Table): ERMRest table holding dataset information.
98
+ dataset_table (Table): ERMrest table storing dataset information.
99
+ _model (DerivaModel): Catalog model instance.
100
+ _ml_schema (str): Schema name for ML-specific tables.
101
+ _cache_dir (Path): Directory for caching downloaded datasets.
102
+ _working_dir (Path): Directory for working data.
103
+ _use_minid (bool): Whether to use MINID service for dataset identification.
104
+
105
+ Note:
106
+ This class is typically used as a base class, with its methods accessed through
107
+ DerivaML class instances rather than directly.
72
108
  """
73
109
 
74
110
  _Logger = logging.getLogger("deriva_ml")
@@ -80,20 +116,31 @@ class Dataset:
80
116
  working_dir: Path,
81
117
  use_minid: bool = True,
82
118
  ):
119
+ """Initializes a Dataset instance.
120
+
121
+ Args:
122
+ model: DerivaModel instance representing the catalog.
123
+ cache_dir: Directory path for caching downloaded datasets.
124
+ working_dir: Directory path for working data.
125
+ use_minid: Whether to use MINID service for dataset identification.
126
+ """
83
127
  self._model = model
84
128
  self._ml_schema = ML_SCHEMA
85
- self.dataset_table = self._model.schemas[self._ml_schema].tables["Dataset"]
86
129
  self._cache_dir = cache_dir
87
130
  self._working_dir = working_dir
88
131
  self._logger = logging.getLogger("deriva_ml")
89
132
  self._use_minid = use_minid
90
133
 
134
+ @property
135
+ def _dataset_table(self):
136
+ return self._model.schemas[self._ml_schema].tables["Dataset"]
137
+
91
138
  def _is_dataset_rid(self, dataset_rid: RID, deleted: bool = False) -> bool:
92
139
  try:
93
140
  rid_info = self._model.catalog.resolve_rid(dataset_rid, self._model.model)
94
141
  except KeyError as _e:
95
142
  raise DerivaMLException(f"Invalid RID {dataset_rid}")
96
- if rid_info.table != self.dataset_table:
143
+ if rid_info.table != self._dataset_table:
97
144
  return False
98
145
  elif deleted:
99
146
  # Got a dataset rid. Now check to see if its deleted or not.
@@ -104,12 +151,12 @@ class Dataset:
104
151
  def _insert_dataset_versions(
105
152
  self,
106
153
  dataset_list: list[DatasetSpec],
107
- description: Optional[str] = "",
108
- execution_rid: Optional[RID] = None,
154
+ description: str | None = "",
155
+ execution_rid: RID | None = None,
109
156
  ) -> None:
110
157
  schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
111
158
  # determine snapshot after changes were made
112
- snap = self._model.catalog.get("/").json()["snaptime"]
159
+
113
160
  # Construct version records for insert
114
161
  version_records = schema_path.tables["Dataset_Version"].insert(
115
162
  [
@@ -118,16 +165,18 @@ class Dataset:
118
165
  "Version": str(dataset.version),
119
166
  "Description": description,
120
167
  "Execution": execution_rid,
121
- "Snapshot": snap,
122
168
  }
123
169
  for dataset in dataset_list
124
170
  ]
125
171
  )
172
+ version_records = list(version_records)
173
+ snap = self._model.catalog.get("/").json()["snaptime"]
174
+ schema_path.tables["Dataset_Version"].update(
175
+ [{"RID": v["RID"], "Dataset": v["Dataset"], "Snapshot": snap} for v in version_records]
176
+ )
126
177
 
127
178
  # And update the dataset records.
128
- schema_path.tables["Dataset"].update(
129
- [{"Version": v["RID"], "RID": v["Dataset"]} for v in version_records]
130
- )
179
+ schema_path.tables["Dataset"].update([{"Version": v["RID"], "RID": v["Dataset"]} for v in version_records])
131
180
 
132
181
  def _bootstrap_versions(self):
133
182
  datasets = [ds["RID"] for ds in self.find_datasets()]
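The hunk above also changes when the catalog snapshot is recorded: the Dataset_Version rows are inserted first, the snapshot time is read afterwards, and the rows are then patched, so the recorded snapshot includes the version rows themselves. A minimal sketch of that insert-then-patch datapath pattern, assuming an already-reachable catalog; the host, catalog id, schema name, RID, and version string are illustrative:

    from deriva.core import ErmrestCatalog

    # Connect to an example catalog (host and catalog id are illustrative).
    catalog = ErmrestCatalog("https", "deriva.example.org", "1")
    ml_schema = catalog.getPathBuilder().schemas["deriva-ml"]  # ML schema name is illustrative
    version_table = ml_schema.tables["Dataset_Version"]

    # 1. Insert the new version rows; no Snapshot value yet.
    version_records = list(
        version_table.insert([{"Dataset": "1-abc123", "Version": "0.2.0", "Description": "example update"}])
    )

    # 2. Read the catalog snapshot time *after* the insert so it covers the rows just added.
    snap = catalog.get("/").json()["snaptime"]

    # 3. Patch the version rows with the snapshot, then point each Dataset at its new version row.
    version_table.update(
        [{"RID": v["RID"], "Dataset": v["Dataset"], "Snapshot": snap} for v in version_records]
    )
    ml_schema.tables["Dataset"].update(
        [{"RID": v["Dataset"], "Version": v["RID"]} for v in version_records]
    )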
@@ -143,9 +192,7 @@ class Dataset:
143
192
  version_path = schema_path.tables["Dataset_Version"]
144
193
  dataset_path = schema_path.tables["Dataset"]
145
194
  history = list(version_path.insert(ds_version))
146
- dataset_versions = [
147
- {"RID": h["Dataset"], "Version": h["Version"]} for h in history
148
- ]
195
+ dataset_versions = [{"RID": h["Dataset"], "Version": h["Version"]} for h in history]
149
196
  dataset_path.update(dataset_versions)
150
197
 
151
198
  def _synchronize_dataset_versions(self):
@@ -161,30 +208,46 @@ class Dataset:
161
208
  versions[v["Dataset"]] = v
162
209
  dataset_path = schema_path.tables["Dataset"]
163
210
 
164
- dataset_path.update(
165
- [
166
- {"RID": dataset, "Version": version["RID"]}
167
- for dataset, version in versions.items()
168
- ]
211
+ dataset_path.update([{"RID": dataset, "Version": version["RID"]} for dataset, version in versions.items()])
212
+
213
+ def _set_version_snapshot(self):
214
+ dataset_version_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema].tables["Dataset_Version"]
215
+ versions = dataset_version_path.entities().fetch()
216
+ dataset_version_path.update(
217
+ [{"RID": h["RID"], "Snapshot": iso_to_snap(h["RCT"])} for h in versions if not h["Snapshot"]]
169
218
  )
170
219
 
171
220
  def dataset_history(self, dataset_rid: RID) -> list[DatasetHistory]:
172
- """Return a list of DatasetHistory objects representing the dataset
221
+ """Retrieves the version history of a dataset.
222
+
223
+ Returns a chronological list of dataset versions, including their version numbers,
224
+ creation times, and associated metadata.
173
225
 
174
226
  Args:
175
- dataset_rid: A RID to the dataset for which history is to be fetched.
227
+ dataset_rid: Resource Identifier of the dataset.
176
228
 
177
229
  Returns:
178
- A list of DatasetHistory objects which indicate the version-number, creation time, and bag instantiation of the dataset.
230
+ list[DatasetHistory]: List of history entries, each containing:
231
+ - dataset_version: Version number (major.minor.patch)
232
+ - minid: Minimal Viable Identifier
233
+ - snapshot: Catalog snapshot time
234
+ - dataset_rid: Dataset Resource Identifier
235
+ - version_rid: Version Resource Identifier
236
+ - description: Version description
237
+ - execution_rid: Associated execution RID
238
+
239
+ Raises:
240
+ DerivaMLException: If dataset_rid is not a valid dataset RID.
241
+
242
+ Example:
243
+ >>> history = ml.dataset_history("1-abc123")
244
+ >>> for entry in history:
245
+ ... print(f"Version {entry.dataset_version}: {entry.description}")
179
246
  """
180
247
 
181
248
  if not self._is_dataset_rid(dataset_rid):
182
249
  raise DerivaMLException(f"RID is not for a data set: {dataset_rid}")
183
- version_path = (
184
- self._model.catalog.getPathBuilder()
185
- .schemas[self._ml_schema]
186
- .tables["Dataset_Version"]
187
- )
250
+ version_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema].tables["Dataset_Version"]
188
251
  return [
189
252
  DatasetHistory(
190
253
  dataset_version=DatasetVersion.parse(v["Version"]),
@@ -195,9 +258,7 @@ class Dataset:
195
258
  description=v["Description"],
196
259
  execution_rid=v["Execution"],
197
260
  )
198
- for v in version_path.filter(version_path.Dataset == dataset_rid)
199
- .entities()
200
- .fetch()
261
+ for v in version_path.filter(version_path.Dataset == dataset_rid).entities().fetch()
201
262
  ]
202
263
 
203
264
  @validate_call
@@ -219,14 +280,16 @@ class Dataset:
219
280
  if not history:
220
281
  return DatasetVersion(0, 1, 0)
221
282
  else:
222
- return max([h.dataset_version for h in self.dataset_history(dataset_rid)])
283
+ # Ensure we return a DatasetVersion, not a string
284
+ versions = [h.dataset_version for h in history]
285
+ return max(versions) if versions else DatasetVersion(0, 1, 0)
223
286
 
224
287
  def _build_dataset_graph(self, dataset_rid: RID) -> Iterable[RID]:
225
- ts = TopologicalSorter()
288
+ ts: TopologicalSorter = TopologicalSorter()
226
289
  self._build_dataset_graph_1(dataset_rid, ts, set())
227
290
  return ts.static_order()
228
291
 
229
- def _build_dataset_graph_1(self, dataset_rid: RID, ts, visited) -> None:
292
+ def _build_dataset_graph_1(self, dataset_rid: RID, ts: TopologicalSorter, visited) -> None:
230
293
  """Use topological sort to return bottom up list of nested datasets"""
231
294
  ts.add(dataset_rid)
232
295
  if dataset_rid not in visited:
@@ -234,7 +297,8 @@ class Dataset:
234
297
  children = self.list_dataset_children(dataset_rid=dataset_rid)
235
298
  parents = self.list_dataset_parents(dataset_rid=dataset_rid)
236
299
  for parent in parents:
237
- self._build_dataset_graph_1(parent, ts, visited)
300
+ # Convert string to RID type
301
+ self._build_dataset_graph_1(RID(parent), ts, visited)
238
302
  for child in children:
239
303
  self._build_dataset_graph_1(child, ts, visited)
240
304
 
@@ -243,22 +307,34 @@ class Dataset:
243
307
  self,
244
308
  dataset_rid: RID,
245
309
  component: VersionPart,
246
- description: Optional[str] = "",
247
- execution_rid: Optional[RID] = None,
310
+ description: str | None = "",
311
+ execution_rid: RID | None = None,
248
312
  ) -> DatasetVersion:
249
- """Increment the version of the specified dataset_table.
313
+ """Increments a dataset's version number.
314
+
315
+ Creates a new version of the dataset by incrementing the specified version component
316
+ (major, minor, or patch). The new version is recorded with an optional description
317
+ and execution reference.
250
318
 
251
319
  Args:
252
- dataset_rid: RID of the dataset whose version is to be incremented.
253
- component: Which version of the dataset_table to increment. Major, Minor, or Patch
254
- description: Description of the version update of the dataset_table.
255
- execution_rid: Which execution is performing increment.
320
+ dataset_rid: Resource Identifier of the dataset to version.
321
+ component: Which version component to increment ('major', 'minor', or 'patch').
322
+ description: Optional description of the changes in this version.
323
+ execution_rid: Optional execution RID to associate with this version.
256
324
 
257
325
  Returns:
258
- new semantic version of the dataset_table as a 3-tuple
326
+ DatasetVersion: The new version number.
259
327
 
260
328
  Raises:
261
- DerivaMLException: if provided, RID is not to a dataset_table.
329
+ DerivaMLException: If dataset_rid is invalid or version increment fails.
330
+
331
+ Example:
332
+ >>> new_version = ml.increment_dataset_version(
333
+ ... dataset_rid="1-abc123",
334
+ ... component="minor",
335
+ ... description="Added new samples"
336
+ ... )
337
+ >>> print(f"New version: {new_version}") # e.g., "1.2.0"
262
338
  """
263
339
 
264
340
  # Find all the datasets that are reachable from this dataset and determine their new version numbers.
@@ -270,46 +346,51 @@ class Dataset:
270
346
  )
271
347
  for ds_rid in related_datasets
272
348
  ]
273
- self._insert_dataset_versions(
274
- version_update_list, description=description, execution_rid=execution_rid
275
- )
276
- return [d.version for d in version_update_list if d.rid == dataset_rid][0]
349
+ self._insert_dataset_versions(version_update_list, description=description, execution_rid=execution_rid)
350
+ return next((d.version for d in version_update_list if d.rid == dataset_rid))
277
351
 
278
352
  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
279
353
  def create_dataset(
280
354
  self,
281
- dataset_types: str | list[str],
282
- description: str,
283
- execution_rid: Optional[RID] = None,
284
- version: Optional[DatasetVersion] = None,
355
+ dataset_types: str | list[str] | None = None,
356
+ description: str = "",
357
+ execution_rid: RID | None = None,
358
+ version: DatasetVersion | None = None,
285
359
  ) -> RID:
286
- """Create a new dataset_table from the specified list of RIDs.
360
+ """Creates a new dataset in the catalog.
361
+
362
+ Creates a dataset with specified types and description. The dataset can be associated
363
+ with an execution and initialized with a specific version.
287
364
 
288
365
  Args:
289
- dataset_types: One or more dataset_table types. Must be a term from the DatasetType controlled vocabulary.
290
- description: Description of the dataset_table.
291
- execution_rid: Execution under which the dataset_table will be created.
292
- version: Version of the dataset_table.
366
+ dataset_types: One or more dataset type terms from Dataset_Type vocabulary.
367
+ description: Description of the dataset's purpose and contents.
368
+ execution_rid: Optional execution RID to associate with dataset creation.
369
+ version: Optional initial version number. Defaults to 0.1.0.
293
370
 
294
371
  Returns:
295
- New dataset_table RID.
372
+ RID: Resource Identifier of the newly created dataset.
296
373
 
374
+ Raises:
375
+ DerivaMLException: If dataset_types are invalid or creation fails.
376
+
377
+ Example:
378
+ >>> rid = ml.create_dataset(
379
+ ... dataset_types=["experiment", "raw_data"],
380
+ ... description="RNA sequencing experiment data",
381
+ ... version=DatasetVersion(1, 0, 0)
382
+ ... )
297
383
  """
298
384
 
299
385
  version = version or DatasetVersion(0, 1, 0)
386
+ dataset_types = dataset_types or []
300
387
 
301
- type_path = (
302
- self._model.catalog.getPathBuilder()
303
- .schemas[self._ml_schema]
304
- .tables[MLVocab.dataset_type.value]
305
- )
388
+ type_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema].tables[MLVocab.dataset_type.value]
306
389
  defined_types = list(type_path.entities().fetch())
307
390
 
308
391
  def check_dataset_type(dtype: str) -> bool:
309
392
  for term in defined_types:
310
- if dtype == term["Name"] or (
311
- term["Synonyms"] and ds_type in term["Synonyms"]
312
- ):
393
+ if dtype == term["Name"] or (term["Synonyms"] and ds_type in term["Synonyms"]):
313
394
  return True
314
395
  return False
315
396
 
@@ -319,9 +400,7 @@ class Dataset:
319
400
  for ds_type in ds_types:
320
401
  if not check_dataset_type(ds_type):
321
402
  raise DerivaMLException("Dataset type must be a vocabulary term.")
322
- dataset_table_path = pb.schemas[self.dataset_table.schema.name].tables[
323
- self.dataset_table.name
324
- ]
403
+ dataset_table_path = pb.schemas[self._dataset_table.schema.name].tables[self._dataset_table.name]
325
404
  dataset_rid = dataset_table_path.insert(
326
405
  [
327
406
  {
@@ -332,21 +411,13 @@ class Dataset:
332
411
  )[0]["RID"]
333
412
 
334
413
  # Get the name of the association table between dataset_table and dataset_type.
335
- atable = next(
336
- self._model.schemas[self._ml_schema]
337
- .tables[MLVocab.dataset_type]
338
- .find_associations()
339
- ).name
414
+ associations = list(self._model.schemas[self._ml_schema].tables[MLVocab.dataset_type].find_associations())
415
+ atable = associations[0].name if associations else None
340
416
  pb.schemas[self._ml_schema].tables[atable].insert(
341
- [
342
- {MLVocab.dataset_type: ds_type, "Dataset": dataset_rid}
343
- for ds_type in ds_types
344
- ]
417
+ [{MLVocab.dataset_type: ds_type, "Dataset": dataset_rid} for ds_type in ds_types]
345
418
  )
346
419
  if execution_rid is not None:
347
- pb.schemas[self._ml_schema].Dataset_Execution.insert(
348
- [{"Dataset": dataset_rid, "Execution": execution_rid}]
349
- )
420
+ pb.schemas[self._ml_schema].Dataset_Execution.insert([{"Dataset": dataset_rid, "Execution": execution_rid}])
350
421
  self._insert_dataset_versions(
351
422
  [DatasetSpec(rid=dataset_rid, version=version)],
352
423
  execution_rid=execution_rid,
@@ -368,18 +439,12 @@ class Dataset:
368
439
  raise DerivaMLException("Dataset_rid is not a dataset.")
369
440
 
370
441
  if parents := self.list_dataset_parents(dataset_rid):
371
- raise DerivaMLException(
372
- f'Dataset_rid "{dataset_rid}" is in a nested dataset: {parents}.'
373
- )
442
+ raise DerivaMLException(f'Dataset_rid "{dataset_rid}" is in a nested dataset: {parents}.')
374
443
 
375
444
  pb = self._model.catalog.getPathBuilder()
376
- dataset_path = pb.schemas[self.dataset_table.schema.name].tables[
377
- self.dataset_table.name
378
- ]
445
+ dataset_path = pb.schemas[self._dataset_table.schema.name].tables[self._dataset_table.name]
379
446
 
380
- rid_list = [dataset_rid] + (
381
- self.list_dataset_children(dataset_rid) if recurse else []
382
- )
447
+ rid_list = [dataset_rid] + (self.list_dataset_children(dataset_rid=dataset_rid) if recurse else [])
383
448
  dataset_path.update([{"RID": r, "Deleted": True} for r in rid_list])
384
449
 
385
450
  def find_datasets(self, deleted: bool = False) -> Iterable[dict[str, Any]]:
@@ -393,14 +458,9 @@ class Dataset:
393
458
  """
394
459
  # Get datapath to all the tables we will need: Dataset, DatasetType and the association table.
395
460
  pb = self._model.catalog.getPathBuilder()
396
- dataset_path = pb.schemas[self.dataset_table.schema.name].tables[
397
- self.dataset_table.name
398
- ]
399
- atable = next(
400
- self._model.schemas[self._ml_schema]
401
- .tables[MLVocab.dataset_type]
402
- .find_associations()
403
- ).name
461
+ dataset_path = pb.schemas[self._dataset_table.schema.name].tables[self._dataset_table.name]
462
+ associations = list(self._model.schemas[self._ml_schema].tables[MLVocab.dataset_type].find_associations())
463
+ atable = associations[0].name if associations else None
404
464
  ml_path = pb.schemas[self._ml_schema]
405
465
  atable_path = ml_path.tables[atable]
406
466
 
@@ -408,21 +468,16 @@ class Dataset:
408
468
  filtered_path = dataset_path
409
469
  else:
410
470
  filtered_path = dataset_path.filter(
411
- (dataset_path.Deleted == False) | (dataset_path.Deleted == None) # noqa: E712
471
+ (dataset_path.Deleted == False) | (dataset_path.Deleted == None) # noqa: E711, E712
412
472
  )
413
473
 
414
474
  # Get a list of all the dataset_type values associated with this dataset_table.
415
475
  datasets = []
416
476
  for dataset in filtered_path.entities().fetch():
417
477
  ds_types = (
418
- atable_path.filter(atable_path.Dataset == dataset["RID"])
419
- .attributes(atable_path.Dataset_Type)
420
- .fetch()
421
- )
422
- datasets.append(
423
- dataset
424
- | {MLVocab.dataset_type: [ds[MLVocab.dataset_type] for ds in ds_types]}
478
+ atable_path.filter(atable_path.Dataset == dataset["RID"]).attributes(atable_path.Dataset_Type).fetch()
425
479
  )
480
+ datasets.append(dataset | {MLVocab.dataset_type: [ds[MLVocab.dataset_type] for ds in ds_types]})
426
481
  return datasets
427
482
 
428
483
  def list_dataset_element_types(self) -> Iterable[Table]:
@@ -433,16 +488,9 @@ class Dataset:
433
488
  """
434
489
 
435
490
  def domain_table(table: Table) -> bool:
436
- return (
437
- table.schema.name == self._model.domain_schema
438
- or table.name == self.dataset_table.name
439
- )
491
+ return table.schema.name == self._model.domain_schema or table.name == self._dataset_table.name
440
492
 
441
- return [
442
- t
443
- for a in self.dataset_table.find_associations()
444
- if domain_table(t := a.other_fkeys.pop().pk_table)
445
- ]
493
+ return [t for a in self._dataset_table.find_associations() if domain_table(t := a.other_fkeys.pop().pk_table)]
446
494
 
447
495
  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
448
496
  def add_dataset_element_type(self, element: str | Table) -> Table:
@@ -457,31 +505,45 @@ class Dataset:
457
505
  """
458
506
  # Add table to map
459
507
  element_table = self._model.name_to_table(element)
460
- table = self._model.schemas[self._model.domain_schema].create_table(
461
- Table.define_association([self.dataset_table, element_table])
462
- )
508
+ atable_def = Table.define_association([self._dataset_table, element_table])
509
+ try:
510
+ table = self._model.schemas[self._model.domain_schema].create_table(atable_def)
511
+ except ValueError as e:
512
+ if "already exists" in str(e):
513
+ table = self._model.name_to_table(atable_def["table_name"])
514
+ else:
515
+ raise e
463
516
 
464
517
  # self.model = self.catalog.getCatalogModel()
465
- self.dataset_table.annotations.update(
466
- self._generate_dataset_download_annotations()
467
- )
518
+ self._dataset_table.annotations.update(self._generate_dataset_download_annotations())
468
519
  self._model.model.apply()
469
520
  return table
470
521
 
471
522
  # @validate_call
472
523
  def list_dataset_members(
473
- self, dataset_rid: RID, recurse: bool = False, limit: Optional[int] = None
524
+ self, dataset_rid: RID, recurse: bool = False, limit: int | None = None
474
525
  ) -> dict[str, list[dict[str, Any]]]:
475
- """Return a list of entities associated with a specific dataset_table.
526
+ """Lists members of a dataset.
527
+
528
+ Returns a dictionary mapping member types to lists of member records. Can optionally
529
+ recurse through nested datasets and limit the number of results.
476
530
 
477
531
  Args:
478
- dataset_rid: param recurse: If this is a nested dataset_table, list the members of the contained datasets
479
- recurse: (Default value = False)
480
- limit: If provided, the maximum number of members to return for each element type.
532
+ dataset_rid: Resource Identifier of the dataset.
533
+ recurse: Whether to include members of nested datasets. Defaults to False.
534
+ limit: Maximum number of members to return per type. None for no limit.
481
535
 
482
536
  Returns:
483
- Dictionary of entities associated with a specific dataset_table. Key is the table from which the elements
484
- were taken.
537
+ dict[str, list[dict[str, Any]]]: Dictionary mapping member types to lists of members.
538
+ Each member is a dictionary containing the record's attributes.
539
+
540
+ Raises:
541
+ DerivaMLException: If dataset_rid is invalid.
542
+
543
+ Example:
544
+ >>> members = ml.list_dataset_members("1-abc123", recurse=True)
545
+ >>> for type_name, records in members.items():
546
+ ... print(f"{type_name}: {len(records)} records")
485
547
  """
486
548
 
487
549
  if not self._is_dataset_rid(dataset_rid):
@@ -491,21 +553,18 @@ class Dataset:
491
553
  # the appropriate association table.
492
554
  members = defaultdict(list)
493
555
  pb = self._model.catalog.getPathBuilder()
494
- for assoc_table in self.dataset_table.find_associations():
556
+ for assoc_table in self._dataset_table.find_associations():
495
557
  other_fkey = assoc_table.other_fkeys.pop()
496
558
  target_table = other_fkey.pk_table
497
559
  member_table = assoc_table.table
498
560
 
499
561
  # Look at domain tables and nested datasets.
500
- if (
501
- target_table.schema.name != self._model.domain_schema
502
- and target_table != self.dataset_table
562
+ if target_table.schema.name != self._model.domain_schema and not (
563
+ target_table == self._dataset_table or target_table.name == "File"
503
564
  ):
504
565
  continue
505
566
  member_column = (
506
- "Nested_Dataset"
507
- if target_table == self.dataset_table
508
- else other_fkey.foreign_key_columns[0].name
567
+ "Nested_Dataset" if target_table == self._dataset_table else other_fkey.foreign_key_columns[0].name
509
568
  )
510
569
 
511
570
  target_path = pb.schemas[target_table.schema.name].tables[target_table.name]
@@ -515,15 +574,13 @@ class Dataset:
515
574
  target_path,
516
575
  on=(member_path.columns[member_column] == target_path.columns["RID"]),
517
576
  )
518
- target_entities = list(
519
- path.entities().fetch(limit=limit) if limit else path.entities().fetch()
520
- )
577
+ target_entities = list(path.entities().fetch(limit=limit) if limit else path.entities().fetch())
521
578
  members[target_table.name].extend(target_entities)
522
- if recurse and target_table == self.dataset_table:
579
+ if recurse and target_table == self._dataset_table:
523
580
  # Get the members for all the nested datasets and add to the member list.
524
581
  nested_datasets = [d["RID"] for d in target_entities]
525
582
  for ds in nested_datasets:
526
- for k, v in self.list_dataset_members(ds, recurse=False).items():
583
+ for k, v in self.list_dataset_members(ds, recurse=recurse).items():
527
584
  members[k].extend(v)
528
585
  return dict(members)
529
586
 
@@ -531,24 +588,38 @@ class Dataset:
531
588
  def add_dataset_members(
532
589
  self,
533
590
  dataset_rid: RID,
534
- members: list[RID],
591
+ members: list[RID] | dict[str, list[RID]],
535
592
  validate: bool = True,
536
- description: Optional[str] = "",
537
- execution_rid: Optional[RID] = None,
593
+ description: str | None = "",
594
+ execution_rid: RID | None = None,
538
595
  ) -> None:
539
- """Add additional elements to an existing dataset_table.
596
+ """Adds members to a dataset.
540
597
 
541
- Add new elements to an existing dataset. In addition to adding new members, the minor version number of the
542
- dataset is incremented and the description, if provide is applied to that new version.
598
+ Associates one or more records with a dataset. Can optionally validate member types
599
+ and create a new dataset version to track the changes.
543
600
 
544
601
  Args:
545
- dataset_rid: RID of dataset_table to extend or None if a new dataset_table is to be created.
546
- members: List of member RIDs to add to the dataset_table.
547
- validate: Check rid_list to make sure elements are not already in the dataset_table.
548
- description: Markdown description of the updated dataset.
549
- execution_rid: Optional RID of execution associated with this dataset.
602
+ dataset_rid: Resource Identifier of the dataset.
603
+ members: List of RIDs to add as dataset members. Can be organized into a dictionary that indicates the
604
+ table that the member RIDs belong to.
605
+ validate: Whether to validate member types. Defaults to True.
606
+ description: Optional description of the member additions.
607
+ execution_rid: Optional execution RID to associate with changes.
608
+
609
+ Raises:
610
+ DerivaMLException: If:
611
+ - dataset_rid is invalid
612
+ - members are invalid or of wrong type
613
+ - adding members would create a cycle
614
+ - validation fails
615
+
616
+ Example:
617
+ >>> ml.add_dataset_members(
618
+ ... dataset_rid="1-abc123",
619
+ ... members=["1-def456", "1-ghi789"],
620
+ ... description="Added sample data"
621
+ ... )
550
622
  """
551
- members = set(members)
552
623
  description = description or "Updated dataset via add_dataset_members"
553
624
 
554
625
  def check_dataset_cycle(member_rid, path=None):
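As the updated signature and docstring above indicate, members may now be either a flat list of RIDs or a dictionary keyed by table name; the dict form skips the per-RID resolve_rid lookups. A minimal usage sketch mirroring the docstring example; the host, catalog name, table names "Image" and "Subject", and all RIDs are illustrative:

    from deriva_ml.core.base import DerivaML

    # Host and catalog mirror the module docstring example; all RIDs are illustrative.
    ml = DerivaML("deriva.example.org", "my_catalog")

    # Flat list form: each RID is resolved to discover which table it belongs to.
    ml.add_dataset_members(dataset_rid="1-abc123", members=["1-def456", "1-ghi789"])

    # Dict form: member RIDs grouped by table name ("Image" and "Subject" are hypothetical tables).
    ml.add_dataset_members(
        dataset_rid="1-abc123",
        members={"Image": ["1-def456", "1-ghi789"], "Subject": ["1-jkl012"]},
        description="Added images and subjects grouped by table",
    )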
@@ -565,43 +636,37 @@ class Dataset:
565
636
  return member_rid in path
566
637
 
567
638
  if validate:
568
- existing_rids = set(
569
- m["RID"]
570
- for ms in self.list_dataset_members(dataset_rid).values()
571
- for m in ms
572
- )
639
+ existing_rids = set(m["RID"] for ms in self.list_dataset_members(dataset_rid).values() for m in ms)
573
640
  if overlap := set(existing_rids).intersection(members):
574
- raise DerivaMLException(
575
- f"Attempting to add existing member to dataset_table {dataset_rid}: {overlap}"
576
- )
641
+ raise DerivaMLException(f"Attempting to add existing member to dataset_table {dataset_rid}: {overlap}")
577
642
 
578
643
  # Now go through every rid to be added to the data set and sort them based on what association table entries
579
644
  # need to be made.
580
645
  dataset_elements = {}
581
646
  association_map = {
582
- a.other_fkeys.pop().pk_table.name: a.table.name
583
- for a in self.dataset_table.find_associations()
647
+ a.other_fkeys.pop().pk_table.name: a.table.name for a in self._dataset_table.find_associations()
584
648
  }
649
+
585
650
  # Get a list of all the object types that can be linked to a dataset_table.
586
- for m in members:
587
- try:
588
- rid_info = self._model.catalog.resolve_rid(m)
589
- except KeyError:
590
- raise DerivaMLException(f"Invalid RID: {m}")
591
- if rid_info.table.name not in association_map:
592
- raise DerivaMLException(
593
- f"RID table: {rid_info.table.name} not part of dataset_table"
594
- )
595
- if rid_info.table == self.dataset_table and check_dataset_cycle(
596
- rid_info.rid
597
- ):
598
- raise DerivaMLException("Creating cycle of datasets is not allowed")
599
- dataset_elements.setdefault(rid_info.table.name, []).append(rid_info.rid)
651
+ if type(members) is list:
652
+ members = set(members)
653
+ for m in members:
654
+ try:
655
+ rid_info = self._model.catalog.resolve_rid(m)
656
+ except KeyError:
657
+ raise DerivaMLException(f"Invalid RID: {m}")
658
+ if rid_info.table.name not in association_map:
659
+ raise DerivaMLException(f"RID table: {rid_info.table.name} not part of dataset_table")
660
+ if rid_info.table == self._dataset_table and check_dataset_cycle(rid_info.rid):
661
+ raise DerivaMLException("Creating cycle of datasets is not allowed")
662
+ dataset_elements.setdefault(rid_info.table.name, []).append(rid_info.rid)
663
+ else:
664
+ dataset_elements = {t: set(ms) for t, ms in members.items()}
600
665
  # Now make the entries into the association tables.
601
666
  pb = self._model.catalog.getPathBuilder()
602
667
  for table, elements in dataset_elements.items():
603
668
  schema_path = pb.schemas[
604
- self._ml_schema if table == "Dataset" else self._model.domain_schema
669
+ self._ml_schema if (table == "Dataset" or table == "File") else self._model.domain_schema
605
670
  ]
606
671
  fk_column = "Nested_Dataset" if table == "Dataset" else table
607
672
  if len(elements):
@@ -622,7 +687,7 @@ class Dataset:
622
687
  dataset_rid: RID,
623
688
  members: list[RID],
624
689
  description: str = "",
625
- execution_rid: Optional[RID] = None,
690
+ execution_rid: RID | None = None,
626
691
  ) -> None:
627
692
  """Remove elements to an existing dataset_table.
628
693
 
@@ -643,8 +708,7 @@ class Dataset:
643
708
  # need to be made.
644
709
  dataset_elements = {}
645
710
  association_map = {
646
- a.other_fkeys.pop().pk_table.name: a.table.name
647
- for a in self.dataset_table.find_associations()
711
+ a.other_fkeys.pop().pk_table.name: a.table.name for a in self._dataset_table.find_associations()
648
712
  }
649
713
  # Get a list of all the object types that can be linked to a dataset_table.
650
714
  for m in members:
@@ -653,16 +717,12 @@ class Dataset:
653
717
  except KeyError:
654
718
  raise DerivaMLException(f"Invalid RID: {m}")
655
719
  if rid_info.table.name not in association_map:
656
- raise DerivaMLException(
657
- f"RID table: {rid_info.table.name} not part of dataset_table"
658
- )
720
+ raise DerivaMLException(f"RID table: {rid_info.table.name} not part of dataset_table")
659
721
  dataset_elements.setdefault(rid_info.table.name, []).append(rid_info.rid)
660
722
  # Now make the entries into the association tables.
661
723
  pb = self._model.catalog.getPathBuilder()
662
724
  for table, elements in dataset_elements.items():
663
- schema_path = pb.schemas[
664
- self._ml_schema if table == "Dataset" else self._model.domain_schema
665
- ]
725
+ schema_path = pb.schemas[self._ml_schema if table == "Dataset" else self._model.domain_schema]
666
726
  fk_column = "Nested_Dataset" if table == "Dataset" else table
667
727
 
668
728
  if len(elements):
@@ -670,8 +730,7 @@ class Dataset:
670
730
  # Find out the name of the column in the association table.
671
731
  for e in elements:
672
732
  entity = atable_path.filter(
673
- (atable_path.Dataset == dataset_rid)
674
- & (atable_path.columns[fk_column] == e),
733
+ (atable_path.Dataset == dataset_rid) & (atable_path.columns[fk_column] == e),
675
734
  )
676
735
  entity.delete()
677
736
  self.increment_dataset_version(
@@ -693,21 +752,14 @@ class Dataset:
693
752
  RID of the parent dataset_table.
694
753
  """
695
754
  if not self._is_dataset_rid(dataset_rid):
696
- raise DerivaMLException(
697
- f"RID: {dataset_rid} does not belong to dataset_table {self.dataset_table.name}"
698
- )
755
+ raise DerivaMLException(f"RID: {dataset_rid} does not belong to dataset_table {self._dataset_table.name}")
699
756
  # Get association table for nested datasets
700
757
  pb = self._model.catalog.getPathBuilder()
701
758
  atable_path = pb.schemas[self._ml_schema].Dataset_Dataset
702
- return [
703
- p["Dataset"]
704
- for p in atable_path.filter(atable_path.Nested_Dataset == dataset_rid)
705
- .entities()
706
- .fetch()
707
- ]
759
+ return [p["Dataset"] for p in atable_path.filter(atable_path.Nested_Dataset == dataset_rid).entities().fetch()]
708
760
 
709
761
  @validate_call
710
- def list_dataset_children(self, dataset_rid: RID, recurse=False) -> list[RID]:
762
+ def list_dataset_children(self, dataset_rid: RID, recurse: bool = False) -> list[RID]:
711
763
  """Given a dataset_table RID, return a list of RIDs for any nested datasets.
712
764
 
713
765
  Args:
@@ -718,19 +770,11 @@ class Dataset:
718
770
  list of nested dataset RIDs.
719
771
 
720
772
  """
721
- dataset_dataset_path = (
722
- self._model.catalog.getPathBuilder()
723
- .schemas[self._ml_schema]
724
- .tables["Dataset_Dataset"]
725
- )
773
+ dataset_dataset_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema].tables["Dataset_Dataset"]
726
774
  nested_datasets = list(dataset_dataset_path.entities().fetch())
727
775
 
728
776
  def find_children(rid: RID):
729
- children = [
730
- child["Nested_Dataset"]
731
- for child in nested_datasets
732
- if child["Dataset"] == rid
733
- ]
777
+ children = [child["Nested_Dataset"] for child in nested_datasets if child["Dataset"] == rid]
734
778
  if recurse:
735
779
  for child in children.copy():
736
780
  children.extend(find_children(child))
@@ -738,9 +782,7 @@ class Dataset:
738
782
 
739
783
  return find_children(dataset_rid)
740
784
 
741
- def _export_vocabulary(
742
- self, writer: Callable[[str, str, Table], list[dict[str, Any]]]
743
- ) -> list[dict[str, Any]]:
785
+ def _export_vocabulary(self, writer: Callable[[str, str, Table], list[dict[str, Any]]]) -> list[dict[str, Any]]:
744
786
  """
745
787
 
746
788
  Args:
@@ -755,16 +797,12 @@ class Dataset:
755
797
  for table in s.tables.values()
756
798
  if self._model.is_vocabulary(table)
757
799
  ]
758
- return [
759
- o
760
- for table in vocabs
761
- for o in writer(f"{table.schema.name}:{table.name}", table.name, table)
762
- ]
800
+ return [o for table in vocabs for o in writer(f"{table.schema.name}:{table.name}", table.name, table)]
763
801
 
764
802
  def _table_paths(
765
803
  self,
766
- dataset: Optional[DatasetSpec] = None,
767
- snapshot_catalog: Optional[DerivaML] = None,
804
+ dataset: DatasetSpec | None = None,
805
+ snapshot_catalog: DerivaML | None = None,
768
806
  ) -> Iterator[tuple[str, str, Table]]:
769
807
  paths = self._collect_paths(dataset and dataset.rid, snapshot_catalog)
770
808
 
@@ -790,25 +828,20 @@ class Dataset:
790
828
 
791
829
  def _collect_paths(
792
830
  self,
793
- dataset_rid: Optional[RID] = None,
794
- snapshot: Optional[Dataset] = None,
795
- dataset_nesting_depth: Optional[int] = None,
831
+ dataset_rid: RID | None = None,
832
+ snapshot: Dataset | None = None,
833
+ dataset_nesting_depth: int | None = None,
796
834
  ) -> set[tuple[Table, ...]]:
797
835
  snapshot_catalog = snapshot if snapshot else self
798
836
 
799
- dataset_table = snapshot_catalog._model.schemas[self._ml_schema].tables[
800
- "Dataset"
801
- ]
802
- dataset_dataset = snapshot_catalog._model.schemas[self._ml_schema].tables[
803
- "Dataset_Dataset"
804
- ]
837
+ dataset_table = snapshot_catalog._model.schemas[self._ml_schema].tables["Dataset"]
838
+ dataset_dataset = snapshot_catalog._model.schemas[self._ml_schema].tables["Dataset_Dataset"]
805
839
 
806
840
  # Figure out what types of elements the dataset contains.
807
841
  dataset_associations = [
808
842
  a
809
- for a in self.dataset_table.find_associations()
810
- if a.table.schema.name != self._ml_schema
811
- or a.table.name == "Dataset_Dataset"
843
+ for a in self._dataset_table.find_associations()
844
+ if a.table.schema.name != self._ml_schema or a.table.name == "Dataset_Dataset"
812
845
  ]
813
846
  if dataset_rid:
814
847
  # Get a list of the members of the dataset so we can figure out which tables to query.
@@ -820,9 +853,7 @@ class Dataset:
820
853
  if m
821
854
  ]
822
855
  included_associations = [
823
- a.table
824
- for a in dataset_table.find_associations()
825
- if a.other_fkeys.pop().pk_table in dataset_elements
856
+ a.table for a in dataset_table.find_associations() if a.other_fkeys.pop().pk_table in dataset_elements
826
857
  ]
827
858
  else:
828
859
  included_associations = dataset_associations
@@ -833,9 +864,7 @@ class Dataset:
833
864
  for p in snapshot_catalog._model._schema_to_paths()
834
865
  if (len(p) == 1)
835
866
  or (p[1] not in dataset_associations) # Tables in the domain schema
836
- or (
837
- p[1] in included_associations
838
- ) # Tables that include members of the dataset
867
+ or (p[1] in included_associations) # Tables that include members of the dataset
839
868
  }
840
869
  # Now get paths for nested datasets
841
870
  nested_paths = set()
@@ -845,56 +874,42 @@ class Dataset:
845
874
  else:
846
875
  # Initialize nesting depth if not already provided.
847
876
  dataset_nesting_depth = (
848
- self._dataset_nesting_depth()
849
- if dataset_nesting_depth is None
850
- else dataset_nesting_depth
877
+ self._dataset_nesting_depth() if dataset_nesting_depth is None else dataset_nesting_depth
851
878
  )
852
879
  if dataset_nesting_depth:
853
- nested_paths = self._collect_paths(
854
- dataset_nesting_depth=dataset_nesting_depth - 1
855
- )
880
+ nested_paths = self._collect_paths(dataset_nesting_depth=dataset_nesting_depth - 1)
856
881
  if nested_paths:
857
882
  paths |= {
858
883
  tuple([dataset_table]),
859
884
  (dataset_table, dataset_dataset),
860
885
  }
861
- paths |= {(self.dataset_table, dataset_dataset) + p for p in nested_paths}
886
+ paths |= {(self._dataset_table, dataset_dataset) + p for p in nested_paths}
862
887
  return paths
863
888
 
864
- def _dataset_nesting_depth(self, dataset_rid: Optional[RID] = None) -> int:
889
+ def _dataset_nesting_depth(self, dataset_rid: RID | None = None) -> int:
865
890
  """Determine the maximum dataset nesting depth in the current catalog.
866
891
 
867
892
  Returns:
868
893
 
869
894
  """
870
895
 
871
- def children_depth(
872
- dataset_rid: RID, nested_datasets: dict[str, list[str]]
873
- ) -> int:
896
+ def children_depth(dataset_rid: RID, nested_datasets: dict[str, list[str]]) -> int:
874
897
  """Return the number of nested datasets for the dataset_rid if provided, otherwise in the current catalog"""
875
898
  try:
876
899
  children = nested_datasets[dataset_rid]
877
- return (
878
- max(map(lambda x: children_depth(x, nested_datasets), children)) + 1
879
- if children
880
- else 1
881
- )
900
+ return max(map(lambda x: children_depth(x, nested_datasets), children)) + 1 if children else 1
882
901
  except KeyError:
883
902
  return 0
884
903
 
885
904
  # Build up the dataset_table nesting graph...
886
- pb = (
887
- self._model.catalog.getPathBuilder()
888
- .schemas[self._ml_schema]
889
- .tables["Dataset_Dataset"]
890
- )
905
+ pb = self._model.catalog.getPathBuilder().schemas[self._ml_schema].tables["Dataset_Dataset"]
891
906
  dataset_children = (
892
907
  [
893
908
  {
894
909
  "Dataset": dataset_rid,
895
910
  "Nested_Dataset": c,
896
911
  } # Make uniform with return from datapath
897
- for c in self.list_dataset_children(dataset_rid)
912
+ for c in self.list_dataset_children(dataset_rid=dataset_rid)
898
913
  ]
899
914
  if dataset_rid
900
915
  else pb.entities().fetch()
@@ -902,30 +917,29 @@ class Dataset:
902
917
  nested_dataset = defaultdict(list)
903
918
  for ds in dataset_children:
904
919
  nested_dataset[ds["Dataset"]].append(ds["Nested_Dataset"])
905
- return (
906
- max(map(lambda d: children_depth(d, dict(nested_dataset)), nested_dataset))
907
- if nested_dataset
908
- else 0
909
- )
920
+ return max(map(lambda d: children_depth(d, dict(nested_dataset)), nested_dataset)) if nested_dataset else 0
910
921
 
911
922
  def _dataset_specification(
912
923
  self,
913
924
  writer: Callable[[str, str, Table], list[dict[str, Any]]],
914
- dataset: Optional[DatasetSpec] = None,
915
- snapshot_catalog: Optional[DerivaML] = None,
925
+ dataset: DatasetSpec | None = None,
926
+ snapshot_catalog: DerivaML | None = None,
916
927
  ) -> list[dict[str, Any]]:
917
- """Output a download/export specification for a dataset_table. Each element of the dataset_table will be placed in its own dir
918
- The top level data directory of the resulting BDBag will have one subdirectory for element type. The subdirectory
919
- will contain the CSV indicating which elements of that type are present in the dataset_table, and then there will be a
920
- subdirectory for each object that is reachable from the dataset_table members.
921
-
922
- To simplify reconstructing the relationship between tables, the CVS for each
923
- The top level data directory will also contain a subdirectory for any controlled vocabularies used in the dataset_table.
924
- All assets will be placed into a directory named asset in a subdirectory with the asset table name.
925
-
926
- For example, consider a dataset_table that consists of two element types, T1 and T2. T1 has foreign key relationships to
927
- objects in tables T3 and T4. There are also two controlled vocabularies, CV1 and CV2. T2 is an asset table
928
- which has two assets in it. The layout of the resulting bdbag would be:
928
+ """Output a download/export specification for a dataset_table. Each element of the dataset_table
929
+ will be placed in its own directory.
930
+ The top-level data directory of the resulting BDBag will have one subdirectory for each element type.
931
+ The subdirectory will contain the CSV indicating which elements of that type are present in the
932
+ dataset_table, and then there will be a subdirectory for each object that is reachable from the
933
+ dataset_table members.
934
+
935
+ To simplify reconstructing the relationship between tables, the CSV for each element is included.
936
+ The top level data directory will also contain a subdirectory for any controlled vocabularies used in
937
+ the dataset_table. All assets will be placed into a directory named asset in a subdirectory with the
938
+ asset table name.
939
+
940
+ For example, consider a dataset_table that consists of two element types, T1 and T2. T1 has foreign
941
+ key relationships to objects in tables T3 and T4. There are also two controlled vocabularies, CV1 and
942
+ CV2. T2 is an asset table which has two assets in it. The layout of the resulting bdbag would be:
929
943
  data
930
944
  CV1/
931
945
  cv1.csv
@@ -952,17 +966,15 @@ class Dataset:
952
966
  A dataset_table specification.
953
967
  """
954
968
  element_spec = self._export_vocabulary(writer)
955
- for path in self._table_paths(
956
- dataset=dataset, snapshot_catalog=snapshot_catalog
957
- ):
969
+ for path in self._table_paths(dataset=dataset, snapshot_catalog=snapshot_catalog):
958
970
  element_spec.extend(writer(*path))
959
971
  return element_spec
960
972
 
961
973
  def _download_dataset_bag(
962
974
  self,
963
975
  dataset: DatasetSpec,
964
- execution_rid: Optional[RID] = None,
965
- snapshot_catalog: Optional[DerivaML] = None,
976
+ execution_rid: RID | None = None,
977
+ snapshot_catalog: DerivaML | None = None,
966
978
  ) -> DatasetBag:
967
979
  """Download a dataset onto the local file system. Create a MINID for the dataset if one doesn't already exist.
968
980
 
@@ -992,27 +1004,29 @@ class Dataset:
992
1004
 
993
1005
  def _version_snapshot(self, dataset: DatasetSpec) -> str:
994
1006
  """Return a catalog with snapshot for the specified dataset version"""
995
- version_record = [
996
- h
997
- for h in self.dataset_history(dataset_rid=dataset.rid)
998
- if h.dataset_version == dataset.version
999
- ][0]
1007
+ try:
1008
+ version_record = next(
1009
+ h for h in self.dataset_history(dataset_rid=dataset.rid) if h.dataset_version == dataset.version
1010
+ )
1011
+ except StopIteration:
1012
+ raise DerivaMLException(f"Dataset version {dataset.version} not found for dataset {dataset.rid}")
1000
1013
  return f"{self._model.catalog.catalog_id}@{version_record.snapshot}"
1001
1014
 
1002
- def _create_dataset_minid(
1003
- self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
1004
- ) -> str:
1015
+ def _create_dataset_minid(self, dataset: DatasetSpec, snapshot_catalog: DerivaML | None = None) -> str:
1005
1016
  with TemporaryDirectory() as tmp_dir:
1006
1017
  # Generate a download specification file for the current catalog schema. By default, this spec
1007
1018
  # will generate a minid and place the bag into S3 storage.
1008
- spec_file = f"{tmp_dir}/download_spec.json"
1009
- with open(spec_file, "w", encoding="utf-8") as ds:
1010
- json.dump(
1011
- self._generate_dataset_download_spec(dataset, snapshot_catalog), ds
1012
- )
1019
+ spec_file = Path(tmp_dir) / "download_spec.json"
1020
+ with spec_file.open("w", encoding="utf-8") as ds:
1021
+ json.dump(self._generate_dataset_download_spec(dataset, snapshot_catalog), ds)
1013
1022
  try:
1014
1023
  self._logger.info(
1015
- f"Downloading dataset {'minid' if self._use_minid else 'bag'} for catalog: {dataset.rid}@{str(dataset.version)}"
1024
+ "Downloading dataset %s for catalog: %s@%s"
1025
+ % (
1026
+ 'minid' if self._use_minid else 'bag',
1027
+ dataset.rid,
1028
+ str(dataset.version),
1029
+ )
1016
1030
  )
1017
1031
  # Generate the bag and put into S3 storage.
1018
1032
  exporter = DerivaExport(
@@ -1035,15 +1049,9 @@ class Dataset:
1035
1049
  raise DerivaMLException(format_exception(e))
1036
1050
  # Update version table with MINID.
1037
1051
  if self._use_minid:
1038
- version_path = (
1039
- self._model.catalog.getPathBuilder()
1040
- .schemas[self._ml_schema]
1041
- .tables["Dataset_Version"]
1042
- )
1052
+ version_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema].tables["Dataset_Version"]
1043
1053
  version_rid = [
1044
- h
1045
- for h in self.dataset_history(dataset_rid=dataset.rid)
1046
- if h.dataset_version == dataset.version
1054
+ h for h in self.dataset_history(dataset_rid=dataset.rid) if h.dataset_version == dataset.version
1047
1055
  ][0].version_rid
1048
1056
  version_path.update([{"RID": version_rid, "Minid": minid_page_url}])
1049
1057
  return minid_page_url
@@ -1051,10 +1059,10 @@ class Dataset:
1051
1059
  def _get_dataset_minid(
1052
1060
  self,
1053
1061
  dataset: DatasetSpec,
1054
- snapshot_catalog: Optional[DerivaML] = None,
1062
+ snapshot_catalog: DerivaML | None = None,
1055
1063
  create: bool = True,
1056
- ) -> DatasetMinid:
1057
- """Return a MINID to the specified dataset. If no version is specified, use the latest.
1064
+ ) -> DatasetMinid | None:
1065
+ """Return a MINID for the specified dataset. If no version is specified, use the latest.
1058
1066
 
1059
1067
  Args:
1060
1068
  dataset: Specification of the dataset.
@@ -1064,50 +1072,53 @@ class Dataset:
1064
1072
  Returns:
1065
1073
  New or existing MINID for the dataset.
1066
1074
  """
1067
- if dataset.rid.startswith("minid"):
1068
- minid_url = f"https://identifiers.org/{dataset.rid}"
1069
- elif dataset.rid.startswith("http"):
1070
- minid_url = dataset.rid
1071
- else:
1072
- if not any([dataset.rid == ds["RID"] for ds in self.find_datasets()]):
1073
- raise DerivaMLException(f"RID {dataset.rid} is not a dataset_table")
1074
-
1075
- # Get the history record for the version we are looking for.
1076
- dataset_version_record = [
1077
- v
1078
- for v in self.dataset_history(dataset.rid)
1079
- if v.dataset_version == str(dataset.version)
1080
- ][0]
1081
- if not dataset_version_record:
1082
- raise DerivaMLException(
1083
- f"Version {str(dataset.version)} does not exist for RID {dataset.rid}"
1084
- )
1085
- minid_url = dataset_version_record.minid
1086
- if not minid_url:
1087
- if not create:
1088
- raise DerivaMLException(
1089
- f"Minid for dataset {dataset.rid} doesn't exist"
1090
- )
1091
- if self._use_minid:
1092
- self._logger.info("Creating new MINID for dataset %s", dataset.rid)
1093
- minid_url = self._create_dataset_minid(dataset, snapshot_catalog)
1094
- # If provided a MINID, use the MINID metadata to get the checksum and download the bag.
1075
+ rid = dataset.rid
1076
+
1077
+ # Case 1: RID is already a MINID or direct URL
1078
+ if rid.startswith("minid"):
1079
+ return self._fetch_minid_metadata(f"https://identifiers.org/{rid}", dataset.version)
1080
+ if rid.startswith("http"):
1081
+ return self._fetch_minid_metadata(rid, dataset.version)
1082
+
1083
+ # Case 2: RID is a dataset RID; validate existence
1084
+ if not any(rid == ds["RID"] for ds in self.find_datasets()):
1085
+ raise DerivaMLTableTypeError("Dataset", rid)
1086
+
1087
+ # Find dataset version record
1088
+ version_str = str(dataset.version)
1089
+ history = self.dataset_history(rid)
1090
+ try:
1091
+ version_record = next(v for v in history if v.dataset_version == version_str)
1092
+ except StopIteration:
1093
+ raise DerivaMLException(f"Version {version_str} does not exist for RID {rid}")
1094
+
1095
+ # Check or create MINID
1096
+ minid_url = version_record.minid
1097
+ if not minid_url:
1098
+ if not create:
1099
+ raise DerivaMLException(f"Minid for dataset {rid} doesn't exist")
1095
1100
  if self._use_minid:
1096
- r = requests.get(minid_url, headers={"accept": "application/json"})
1097
- dataset_minid = DatasetMinid(
1098
- dataset_version=dataset.version, **r.json()
1099
- )
1100
- else:
1101
- dataset_minid = DatasetMinid(
1102
- dataset_version=dataset.version,
1103
- RID=f"{dataset.rid}@{dataset_version_record.snapshot}",
1104
- location=minid_url,
1105
- )
1106
- return dataset_minid
1101
+ self._logger.info("Creating new MINID for dataset %s", rid)
1102
+ minid_url = self._create_dataset_minid(dataset, snapshot_catalog)
1103
+
1104
+ # Return based on MINID usage
1105
+ if self._use_minid:
1106
+ return self._fetch_minid_metadata(minid_url, dataset.version)
1107
+
1108
+ return DatasetMinid(
1109
+ dataset_version=dataset.version,
1110
+ RID=f"{rid}@{version_record.snapshot}",
1111
+ location=minid_url,
1112
+ )
1113
+
1114
+ def _fetch_minid_metadata(self, url: str, version: DatasetVersion) -> DatasetMinid:
1115
+ r = requests.get(url, headers={"accept": "application/json"})
1116
+ r.raise_for_status()
1117
+ return DatasetMinid(dataset_version=version, **r.json())
1107
1118
 
1108
1119
  def _download_dataset_minid(self, minid: DatasetMinid) -> Path:
1109
- """Given a RID to a dataset_table, or a MINID to an existing bag, download the bag file, extract it, and validate
1110
- that all the metadata is correct
1120
+ """Given a RID to a dataset_table, or a MINID to an existing bag, download the bag file, extract it, and
1121
+ validate that all the metadata is correct
1111
1122
 
1112
1123
  Args:
1113
1124
  minid: The RID of a dataset_table or a minid to an existing bag.
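The refactor above replaces the inline `requests` call with a `_fetch_minid_metadata` helper that resolves a MINID through identifiers.org and builds a `DatasetMinid` from the JSON response. A minimal standalone sketch of that lookup, using only `requests`; the example identifier and the response fields read at the end are illustrative assumptions, not part of deriva-ml:

```python
import requests


def fetch_minid_record(identifier: str) -> dict:
    """Resolve a MINID (or a direct URL) and return its JSON landing-page metadata."""
    # identifiers.org redirects "minid:..." CURIEs to the hosting landing page.
    url = identifier if identifier.startswith("http") else f"https://identifiers.org/{identifier}"
    response = requests.get(url, headers={"accept": "application/json"}, timeout=30)
    response.raise_for_status()  # fail loudly instead of parsing an HTML error page
    return response.json()


# Hypothetical usage (identifier and fields are illustrative):
# record = fetch_minid_record("minid:b9XXXX")
# print(record.get("location"), record.get("checksums"))
```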
@@ -1119,9 +1130,7 @@ class Dataset:
         # it. If not, then we need to extract the contents of the archive into our cache directory.
         bag_dir = self._cache_dir / f"{minid.dataset_rid}_{minid.checksum}"
         if bag_dir.exists():
-            self._logger.info(
-                f"Using cached bag for {minid.dataset_rid} Version:{minid.dataset_version}"
-            )
+            self._logger.info(f"Using cached bag for {minid.dataset_rid} Version:{minid.dataset_version}")
             return Path(bag_dir / f"Dataset_{minid.dataset_rid}")

         # Either bag hasn't been downloaded yet, or we are not using a Minid, so we don't know the checksum yet.
@@ -1130,19 +1139,13 @@ class Dataset:
             # Get bag from S3
             archive_path = fetch_single_file(minid.bag_url)
         else:
-            exporter = DerivaExport(
-                host=self._model.catalog.deriva_server.server, output_dir=tmp_dir
-            )
+            exporter = DerivaExport(host=self._model.catalog.deriva_server.server, output_dir=tmp_dir)
             archive_path = exporter.retrieve_file(minid.bag_url)
-        hashes = hash_utils.compute_file_hashes(
-            archive_path, hashes=["md5", "sha256"]
-        )
+        hashes = hash_utils.compute_file_hashes(archive_path, hashes=["md5", "sha256"])
         checksum = hashes["sha256"][0]
         bag_dir = self._cache_dir / f"{minid.dataset_rid}_{checksum}"
         if bag_dir.exists():
-            self._logger.info(
-                f"Using cached bag for {minid.dataset_rid} Version:{minid.dataset_version}"
-            )
+            self._logger.info(f"Using cached bag for {minid.dataset_rid} Version:{minid.dataset_version}")
             return Path(bag_dir / f"Dataset_{minid.dataset_rid}")
         bag_path = bdb.extract_bag(archive_path, bag_dir.as_posix())
         bdb.validate_bag_structure(bag_path)
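`_download_dataset_minid` keys its cache directory on the archive's SHA-256 checksum, so a bag that has already been extracted for that exact content is reused. A stdlib-only sketch of the same caching scheme (the cache root and RID below are illustrative assumptions; deriva-ml itself uses `hash_utils.compute_file_hashes`):

```python
import hashlib
from pathlib import Path


def cached_bag_dir(cache_root: Path, dataset_rid: str, archive: Path) -> tuple[Path, bool]:
    """Return the checksum-keyed cache directory for an archive and whether it already exists."""
    digest = hashlib.sha256()
    with archive.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            digest.update(chunk)
    bag_dir = cache_root / f"{dataset_rid}_{digest.hexdigest()}"
    return bag_dir, bag_dir.exists()


# Hypothetical usage:
# bag_dir, cached = cached_bag_dir(Path.home() / ".deriva-cache", "1-ABCD", Path("Dataset_1-ABCD.zip"))
# if not cached:
#     ...extract the downloaded archive into bag_dir...
```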
@@ -1151,7 +1154,7 @@ class Dataset:
     def _materialize_dataset_bag(
         self,
         minid: DatasetMinid,
-        execution_rid: Optional[RID] = None,
+        execution_rid: RID | None = None,
     ) -> Path:
         """Materialize a dataset_table bag into a local directory

@@ -1165,9 +1168,7 @@ class Dataset:
         def update_status(status: Status, msg: str) -> None:
             """Update the current status for this execution in the catalog"""
             if execution_rid and execution_rid != DRY_RUN_RID:
-                self._model.catalog.getPathBuilder().schemas[
-                    self._ml_schema
-                ].Execution.update(
+                self._model.catalog.getPathBuilder().schemas[self._ml_schema].Execution.update(
                     [
                         {
                             "RID": execution_rid,
@@ -1197,9 +1198,7 @@ class Dataset:

         # If this bag has already been validated, our work is done. Otherwise, materialize the bag.
         if not validated_check.exists():
-            self._logger.info(
-                f"Materializing bag {minid.dataset_rid} Version:{minid.dataset_version}"
-            )
+            self._logger.info(f"Materializing bag {minid.dataset_rid} Version:{minid.dataset_version}")
             bdb.materialize(
                 bag_path.as_posix(),
                 fetch_callback=fetch_progress_callback,
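Materialization itself is delegated to bdbag, as the hunk above shows (`bdb` is the `bdbag_api` module). A short sketch of the extract / validate-structure / materialize sequence with default callbacks; the exact default fetch behavior is an assumption here:

```python
from bdbag import bdbag_api as bdb


def fetch_and_validate_bag(archive_path: str, output_dir: str) -> str:
    """Unpack a downloaded BDBag archive, then fetch its remote payload and validate it."""
    bag_path = bdb.extract_bag(archive_path, output_dir)  # unpack the .zip/.tgz archive
    bdb.validate_bag_structure(bag_path)                  # cheap structural check before fetching
    bdb.materialize(bag_path)                             # resolve fetch.txt entries and verify checksums
    return bag_path
```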
@@ -1210,7 +1209,7 @@ class Dataset:

     def _export_annotation(
         self,
-        snapshot_catalog: Optional[DerivaML] = None,
+        snapshot_catalog: DerivaML | None = None,
     ) -> list[dict[str, Any]]:
         """Return and output specification for the datasets in the provided model

@@ -1242,7 +1241,7 @@ class Dataset:
         )

     def _export_specification(
-        self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
+        self, dataset: DatasetSpec, snapshot_catalog: DerivaML | None = None
     ) -> list[dict[str, Any]]:
         """
         Generate a specification for export engine for specific dataset.
@@ -1258,14 +1257,10 @@ class Dataset:
                 "processor": "json",
                 "processor_params": {"query_path": "/schema", "output_path": "schema"},
             }
-        ] + self._dataset_specification(
-            self._export_specification_dataset_element, dataset, snapshot_catalog
-        )
+        ] + self._dataset_specification(self._export_specification_dataset_element, dataset, snapshot_catalog)

     @staticmethod
-    def _export_specification_dataset_element(
-        spath: str, dpath: str, table: Table
-    ) -> list[dict[str, Any]]:
+    def _export_specification_dataset_element(spath: str, dpath: str, table: Table) -> list[dict[str, Any]]:
         """Return the download specification for the data object indicated by a path through the data model.

         Args:
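For orientation, the export specification assembled by `_export_specification` is simply a list of processor entries; the hunk above shows its fixed first element, with per-table entries appended by `_dataset_specification`. A sketch of that shape (the list contents beyond the schema entry are produced elsewhere):

```python
# Fixed first entry of the export specification, mirroring the hunk above;
# subsequent entries describing dataset tables are appended by _dataset_specification().
export_spec: list[dict] = [
    {
        "processor": "json",
        "processor_params": {"query_path": "/schema", "output_path": "schema"},
    }
]
```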
@@ -1300,10 +1295,9 @@ class Dataset:
         )
         return exports

-    def _export_annotation_dataset_element(
-        self, spath: str, dpath: str, table: Table
-    ) -> list[dict[str, Any]]:
-        """Given a path in the data model, output an export specification for the path taken to get to the current table.
+    def _export_annotation_dataset_element(self, spath: str, dpath: str, table: Table) -> list[dict[str, Any]]:
+        """Given a path in the data model, output an export specification for the path taken to get to the
+        current table.

         Args:
             spath: Source path
@@ -1354,7 +1348,7 @@ class Dataset:
         return exports

     def _generate_dataset_download_spec(
-        self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
+        self, dataset: DatasetSpec, snapshot_catalog: DerivaML | None = None
     ) -> dict[str, Any]:
         """
         Generate a specification for downloading a specific dataset.
@@ -1457,9 +1451,7 @@ class Dataset:
             else {}
         )
         return {
-            deriva_tags.export_fragment_definitions: {
-                "dataset_export_outputs": self._export_annotation()
-            },
+            deriva_tags.export_fragment_definitions: {"dataset_export_outputs": self._export_annotation()},
             deriva_tags.visible_foreign_keys: self._dataset_visible_fkeys(),
             deriva_tags.export_2019: {
                 "detailed": {