deriva-ml 1.13.2__py3-none-any.whl → 1.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deriva_ml/dataset.py CHANGED
@@ -1,18 +1,30 @@
1
1
  """
2
- This module defines the DataSet class with is used to manipulate datasets in DerivaML,
3
- The intended use of this class is as a base class in DerivaML so all the methods documented here are
2
+ This module defines the DataSet class which is used to manipulate datasets in DerivaML.
3
+ The intended use of this class is as a base class in DerivaML, so all the methods documented here are
4
4
  accessible via a DerivaML class instance.
5
5
 
6
-
7
6
  """
8
7
 
9
8
  from __future__ import annotations
10
- from bdbag.fetch.fetcher import fetch_single_file
11
9
  from bdbag import bdbag_api as bdb
10
+ from bdbag.fetch.fetcher import fetch_single_file
12
11
  from collections import defaultdict
12
+ from graphlib import TopologicalSorter
13
+ import json
14
+ import logging
15
+ from pathlib import Path
16
+ from pydantic import (
17
+ validate_call,
18
+ ConfigDict,
19
+ )
20
+ import requests
21
+ from tempfile import TemporaryDirectory
22
+ from typing import Any, Callable, Optional, Iterable, Iterator, TYPE_CHECKING
13
23
 
24
+ from .history import iso_to_snap
14
25
  from deriva.core.ermrest_model import Table
15
26
  from deriva.core.utils.core_utils import tag as deriva_tags, format_exception
27
+ import deriva.core.utils.hash_utils as hash_utils
16
28
  from deriva.transfer.download.deriva_export import DerivaExport
17
29
  from deriva.transfer.download.deriva_download import (
18
30
  DerivaDownloadConfigurationError,
@@ -22,24 +34,12 @@ from deriva.transfer.download.deriva_download import (
22
34
  DerivaDownloadTimeoutError,
23
35
  )
24
36
 
37
+
25
38
  try:
26
39
  from icecream import ic
27
40
  except ImportError: # Graceful fallback if IceCream isn't installed.
28
41
  ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a) # noqa
29
42
 
30
- from graphlib import TopologicalSorter
31
- import json
32
- import logging
33
- from pathlib import Path
34
- from pydantic import (
35
- validate_call,
36
- ConfigDict,
37
- )
38
- import requests
39
-
40
- from tempfile import TemporaryDirectory, NamedTemporaryFile
41
- from typing import Any, Callable, Optional, Iterable, Iterator, TYPE_CHECKING
42
-
43
43
  from deriva_ml import DatasetBag
44
44
  from .deriva_definitions import (
45
45
  ML_SCHEMA,
@@ -49,7 +49,6 @@ from .deriva_definitions import (
49
49
  RID,
50
50
  DRY_RUN_RID,
51
51
  )
52
- from .history import iso_to_snap
53
52
  from .deriva_model import DerivaModel
54
53
  from .database_model import DatabaseModel
55
54
  from .dataset_aux_classes import (
@@ -74,13 +73,20 @@ class Dataset:
74
73
 
75
74
  _Logger = logging.getLogger("deriva_ml")
76
75
 
77
- def __init__(self, model: DerivaModel, cache_dir: Path, working_dir: Path):
76
+ def __init__(
77
+ self,
78
+ model: DerivaModel,
79
+ cache_dir: Path,
80
+ working_dir: Path,
81
+ use_minid: bool = True,
82
+ ):
78
83
  self._model = model
79
84
  self._ml_schema = ML_SCHEMA
80
85
  self.dataset_table = self._model.schemas[self._ml_schema].tables["Dataset"]
81
86
  self._cache_dir = cache_dir
82
87
  self._working_dir = working_dir
83
88
  self._logger = logging.getLogger("deriva_ml")
89
+ self._use_minid = use_minid
84
90
 
85
91
  def _is_dataset_rid(self, dataset_rid: RID, deleted: bool = False) -> bool:
86
92
  try:
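The constructor gains a `use_minid` flag (default `True`) that controls whether exported bags are registered as MINIDs and pushed to cloud storage. A minimal sketch of direct construction, assuming an existing `DerivaModel` instance named `model`; DerivaML normally builds this object internally:

```python
from pathlib import Path
from deriva_ml.dataset import Dataset  # module shown in this diff

# Hypothetical direct use; cache/working paths are placeholders.
ds = Dataset(
    model=model,                          # assumed DerivaModel instance
    cache_dir=Path.home() / "deriva-cache",
    working_dir=Path.home() / "deriva-work",
    use_minid=False,                      # skip MINID registration and S3 upload
)
```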
@@ -100,27 +106,28 @@ class Dataset:
100
106
  dataset_list: list[DatasetSpec],
101
107
  description: Optional[str] = "",
102
108
  execution_rid: Optional[RID] = None,
103
- ) -> list[dict[str, Any]]:
109
+ ) -> None:
104
110
  schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
105
-
111
+ # determine snapshot after changes were made
112
+ snap = self._model.catalog.get("/").json()["snaptime"]
106
113
  # Construct version records for insert
107
- version_records = [
108
- {
109
- "Dataset": dataset.rid,
110
- "Version": str(dataset.version),
111
- "Description": description,
112
- "Execution": execution_rid,
113
- }
114
- for dataset in dataset_list
115
- ]
114
+ version_records = schema_path.tables["Dataset_Version"].insert(
115
+ [
116
+ {
117
+ "Dataset": dataset.rid,
118
+ "Version": str(dataset.version),
119
+ "Description": description,
120
+ "Execution": execution_rid,
121
+ "Snapshot": snap,
122
+ }
123
+ for dataset in dataset_list
124
+ ]
125
+ )
116
126
 
117
- # Insert version records and construct entities for updating the dataset version column.
118
- version_rids = [
119
- {"Version": v["RID"], "RID": v["Dataset"]}
120
- for v in schema_path.tables["Dataset_Version"].insert(version_records)
121
- ]
122
- schema_path.tables["Dataset"].update(version_rids)
123
- return version_rids
127
+ # And update the dataset records.
128
+ schema_path.tables["Dataset"].update(
129
+ [{"Version": v["RID"], "RID": v["Dataset"]} for v in version_records]
130
+ )
124
131
 
125
132
  def _bootstrap_versions(self):
126
133
  datasets = [ds["RID"] for ds in self.find_datasets()]
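Dataset_Version rows now carry a Snapshot column captured immediately after the insert. A small sketch of how that snapshot id is read from ERMrest, using the same call as above; `catalog` stands for the bound ErmrestCatalog:

```python
# The catalog root reports the current snapshot id ("snaptime"), which the
# version records use to pin each dataset version to a point in time.
snap = catalog.get("/").json()["snaptime"]
print(f"Dataset versions inserted at snapshot {snap}")
```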
@@ -161,6 +168,21 @@ class Dataset:
161
168
  ]
162
169
  )
163
170
 
171
+ def _set_version_snapshot(self):
172
+ dataset_version_path = (
173
+ self._model.catalog.getPathBuilder()
174
+ .schemas[self._ml_schema]
175
+ .tables["Dataset_Version"]
176
+ )
177
+ versions = dataset_version_path.entities().fetch()
178
+ dataset_version_path.update(
179
+ [
180
+ {"RID": h["RID"], "Snapshot": iso_to_snap(h["RCT"])}
181
+ for h in versions
182
+ if not h["Snapshot"]
183
+ ]
184
+ )
185
+
164
186
  def dataset_history(self, dataset_rid: RID) -> list[DatasetHistory]:
165
187
  """Return a list of DatasetHistory objects representing the dataset
166
188
 
@@ -170,6 +192,9 @@ class Dataset:
170
192
  Returns:
171
193
  A list of DatasetHistory objects which indicate the version-number, creation time, and bag instantiation of the dataset.
172
194
  """
195
+
196
+ if not self._is_dataset_rid(dataset_rid):
197
+ raise DerivaMLException(f"RID is not for a data set: {dataset_rid}")
173
198
  version_path = (
174
199
  self._model.catalog.getPathBuilder()
175
200
  .schemas[self._ml_schema]
@@ -179,7 +204,7 @@ class Dataset:
179
204
  DatasetHistory(
180
205
  dataset_version=DatasetVersion.parse(v["Version"]),
181
206
  minid=v["Minid"],
182
- timestamp=v["RCT"],
207
+ snapshot=v["Snapshot"],
183
208
  dataset_rid=dataset_rid,
184
209
  version_rid=v["RID"],
185
210
  description=v["Description"],
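DatasetHistory entries now expose the catalog `snapshot` for each version instead of a raw timestamp. A usage sketch, assuming `ml` is a DerivaML instance and `"1-ABCD"` is a placeholder dataset RID:

```python
# Walk the recorded versions of a dataset and show the snapshot and MINID
# (if any) associated with each one.
for h in ml.dataset_history(dataset_rid="1-ABCD"):
    print(h.dataset_version, h.snapshot, h.minid, h.description)
```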
@@ -240,7 +265,7 @@ class Dataset:
240
265
 
241
266
  Args:
242
267
  dataset_rid: RID of the dataset whose version is to be incremented.
243
- component: Which version of the dataset_table to increment. Major, Minor or Patch
268
+ component: Which version of the dataset_table to increment. Major, Minor, or Patch
244
269
  description: Description of the version update of the dataset_table.
245
270
  execution_rid: Which execution is performing increment.
246
271
 
@@ -248,7 +273,7 @@ class Dataset:
248
273
  new semantic version of the dataset_table as a 3-tuple
249
274
 
250
275
  Raises:
251
- DerivaMLException: if provided RID is not to a dataset_table.
276
+ DerivaMLException: if the provided RID does not refer to a dataset_table.
252
277
  """
253
278
 
254
279
  # Find all the datasets that are reachable from this dataset and determine their new version numbers.
@@ -268,7 +293,7 @@ class Dataset:
268
293
  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
269
294
  def create_dataset(
270
295
  self,
271
- type: str | list[str],
296
+ dataset_types: str | list[str],
272
297
  description: str,
273
298
  execution_rid: Optional[RID] = None,
274
299
  version: Optional[DatasetVersion] = None,
@@ -276,7 +301,7 @@ class Dataset:
276
301
  """Create a new dataset_table from the specified list of RIDs.
277
302
 
278
303
  Args:
279
- type: One or more dataset_table types. Must be a term from the DatasetType controlled vocabulary.
304
+ dataset_types: One or more dataset_table types. Must be a term from the DatasetType controlled vocabulary.
280
305
  description: Description of the dataset_table.
281
306
  execution_rid: Execution under which the dataset_table will be created.
282
307
  version: Version of the dataset_table.
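The first parameter is renamed from `type` to `dataset_types`, avoiding the shadowed builtin. A hedged example of the new call form, assuming `ml` is a DerivaML instance and the listed terms already exist in the DatasetType vocabulary:

```python
# Create a dataset tagged with one or more DatasetType terms (names are
# illustrative only).
dataset_rid = ml.create_dataset(
    dataset_types=["Training", "Image"],
    description="Training images for the v2 classifier",
)
```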
@@ -304,7 +329,7 @@ class Dataset:
304
329
  return False
305
330
 
306
331
  # Create the entry for the new dataset_table and get its RID.
307
- ds_types = [type] if isinstance(type, str) else type
332
+ ds_types = [dataset_types] if isinstance(dataset_types, str) else dataset_types
308
333
  pb = self._model.catalog.getPathBuilder()
309
334
  for ds_type in ds_types:
310
335
  if not check_dataset_type(ds_type):
@@ -452,7 +477,9 @@ class Dataset:
452
477
  )
453
478
 
454
479
  # self.model = self.catalog.getCatalogModel()
455
- self.dataset_table.annotations.update(self._generate_dataset_annotations())
480
+ self.dataset_table.annotations.update(
481
+ self._generate_dataset_download_annotations()
482
+ )
456
483
  self._model.model.apply()
457
484
  return table
458
485
 
@@ -464,7 +491,7 @@ class Dataset:
464
491
 
465
492
  Args:
466
493
  dataset_rid: param recurse: If this is a nested dataset_table, list the members of the contained datasets
467
- recurse: (Default value = False)
494
+ recurse: (Default value = False)
468
495
  limit: If provided, the maximum number of members to return for each element type.
469
496
 
470
497
  Returns:
@@ -530,8 +557,8 @@ class Dataset:
530
557
  dataset is incremented and the description, if provided, is applied to that new version.
531
558
 
532
559
  Args:
533
- dataset_rid: RID of dataset_table to extend or None if new dataset_table is to be created.
534
- members: List of RIDs of members to add to the dataset_table.
560
+ dataset_rid: RID of dataset_table to extend or None if a new dataset_table is to be created.
561
+ members: List of member RIDs to add to the dataset_table.
535
562
  validate: Check rid_list to make sure elements are not already in the dataset_table.
536
563
  description: Markdown description of the updated dataset.
537
564
  execution_rid: Optional RID of execution associated with this dataset.
@@ -544,7 +571,7 @@ class Dataset:
544
571
 
545
572
  Args:
546
573
  member_rid:
547
- path: (Default value = None)
574
+ path: (Default value = None)
548
575
 
549
576
  Returns:
550
577
 
@@ -570,7 +597,7 @@ class Dataset:
570
597
  a.other_fkeys.pop().pk_table.name: a.table.name
571
598
  for a in self.dataset_table.find_associations()
572
599
  }
573
- # Get a list of all the types of objects that can be linked to a dataset_table.
600
+ # Get a list of all the object types that can be linked to a dataset_table.
574
601
  for m in members:
575
602
  try:
576
603
  rid_info = self._model.catalog.resolve_rid(m)
@@ -618,8 +645,8 @@ class Dataset:
618
645
  dataset is incremented and the description, if provided, is applied to that new version.
619
646
 
620
647
  Args:
621
- dataset_rid: RID of dataset_table to extend or None if new dataset_table is to be created.
622
- members: List of RIDs of members to add to the dataset_table.
648
+ dataset_rid: RID of dataset_table to extend or None if a new dataset_table is to be created.
649
+ members: List of member RIDs to add to the dataset_table.
623
650
  description: Markdown description of the updated dataset.
624
651
  execution_rid: Optional RID of execution associated with this operation.
625
652
  """
@@ -634,7 +661,7 @@ class Dataset:
634
661
  a.other_fkeys.pop().pk_table.name: a.table.name
635
662
  for a in self.dataset_table.find_associations()
636
663
  }
637
- # Get a list of all the types of objects that can be linked to a dataset_table.
664
+ # Get a list of all the object types that can be linked to a dataset_table.
638
665
  for m in members:
639
666
  try:
640
667
  rid_info = self._model.catalog.resolve_rid(m)
@@ -670,7 +697,7 @@ class Dataset:
670
697
  )
671
698
 
672
699
  @validate_call
673
- def list_dataset_parents(self, dataset_rid: RID) -> list[RID]:
700
+ def list_dataset_parents(self, dataset_rid: RID) -> list[str]:
674
701
  """Given a dataset_table RID, return a list of RIDs of the parent datasets if this is included in a
675
702
  nested dataset.
676
703
 
@@ -696,14 +723,14 @@ class Dataset:
696
723
 
697
724
  @validate_call
698
725
  def list_dataset_children(self, dataset_rid: RID, recurse=False) -> list[RID]:
699
- """Given a dataset_table RID, return a list of RIDs of any nested datasets.
726
+ """Given a dataset_table RID, return a list of RIDs for any nested datasets.
700
727
 
701
728
  Args:
702
729
  dataset_rid: A dataset_table RID.
703
- recurse: If True, return a list of RIDs of any nested datasets.
730
+ recurse: If True, return a list of nested dataset RIDs.
704
731
 
705
732
  Returns:
706
- list of RIDs of nested datasets.
733
+ list of nested dataset RIDs.
707
734
 
708
735
  """
709
736
  dataset_dataset_path = (
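A short sketch of navigating nested datasets with the two listing methods shown above; `ml` is assumed to be a DerivaML instance and `"1-ABCD"` a placeholder RID:

```python
# Immediate children, the full transitive closure, and the parents of a dataset.
children = ml.list_dataset_children(dataset_rid="1-ABCD")
descendants = ml.list_dataset_children(dataset_rid="1-ABCD", recurse=True)
parents = ml.list_dataset_parents(dataset_rid="1-ABCD")
```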
@@ -726,7 +753,7 @@ class Dataset:
726
753
 
727
754
  return find_children(dataset_rid)
728
755
 
729
- def _vocabulary_specification(
756
+ def _export_vocabulary(
730
757
  self, writer: Callable[[str, str, Table], list[dict[str, Any]]]
731
758
  ) -> list[dict[str, Any]]:
732
759
  """
@@ -756,10 +783,10 @@ class Dataset:
756
783
  ) -> Iterator[tuple[str, str, Table]]:
757
784
  paths = self._collect_paths(dataset and dataset.rid, snapshot_catalog)
758
785
 
759
- def source_path(path: tuple[Table, ...]):
786
+ def source_path(path: tuple[Table, ...]) -> list[str]:
760
787
  """Convert a tuple representing a path into a source path component with FK linkage"""
761
788
  path = list(path)
762
- p = [f"{self._model.ml_schema}:Dataset/RID={{Dataset_RID}}"]
789
+ p = [f"{self._model.ml_schema}:Dataset/RID={{RID}}"]
763
790
  for table in path[1:]:
764
791
  if table.name == "Dataset_Dataset":
765
792
  p.append("(RID)=(deriva-ml:Dataset_Dataset:Dataset)")
@@ -803,7 +830,7 @@ class Dataset:
803
830
  dataset_elements = [
804
831
  snapshot_catalog._model.name_to_table(e)
805
832
  for e, m in snapshot_catalog.list_dataset_members(
806
- dataset_rid=dataset_rid, # limit=1 Limit seems to make things run slow.
833
+ dataset_rid=dataset_rid, # limit=1 Limit seems to make things run slow.
807
834
  ).items()
808
835
  if m
809
836
  ]
@@ -857,7 +884,7 @@ class Dataset:
857
884
  """
858
885
 
859
886
  def children_depth(
860
- dataset_rid: RID, nested_datasets: dict[RID, list[RID]]
887
+ dataset_rid: RID, nested_datasets: dict[str, list[str]]
861
888
  ) -> int:
862
889
  """Return the number of nested datasets for the dataset_rid if provided, otherwise in the current catalog"""
863
890
  try:
@@ -899,13 +926,13 @@ class Dataset:
899
926
  def _dataset_specification(
900
927
  self,
901
928
  writer: Callable[[str, str, Table], list[dict[str, Any]]],
902
- dataset: DatasetSpec,
929
+ dataset: Optional[DatasetSpec] = None,
903
930
  snapshot_catalog: Optional[DerivaML] = None,
904
931
  ) -> list[dict[str, Any]]:
905
932
  """Output a download/export specification for a dataset_table. Each element of the dataset_table will be placed in its own dir
906
- The top level data directory of the resulting BDBag will have one subdirectory for element type. the subdirectory
933
+ The top level data directory of the resulting BDBag will have one subdirectory for each element type. The subdirectory
907
934
  will contain the CSV indicating which elements of that type are present in the dataset_table, and then there will be a
908
- subdirectories for each object that is reachable from the dataset_table members.
935
+ subdirectory for each object that is reachable from the dataset_table members.
909
936
 
910
937
  To simplify reconstructing the relationship between tables, the CSV for each
911
938
  The top level data directory will also contain a subdirectory for any controlled vocabularies used in the dataset_table.
@@ -913,7 +940,7 @@ class Dataset:
913
940
 
914
941
  For example, consider a dataset_table that consists of two element types, T1 and T2. T1 has foreign key relationships to
915
942
  objects in tables T3 and T4. There are also two controlled vocabularies, CV1 and CV2. T2 is an asset table
916
- which has two asset in it. The layout of the resulting bdbag would be:
943
+ which has two assets in it. The layout of the resulting bdbag would be:
917
944
  data
918
945
  CV1/
919
946
  cv1.csv
@@ -939,12 +966,12 @@ class Dataset:
939
966
  Returns:
940
967
  A dataset_table specification.
941
968
  """
942
- element_spec = []
969
+ element_spec = self._export_vocabulary(writer)
943
970
  for path in self._table_paths(
944
971
  dataset=dataset, snapshot_catalog=snapshot_catalog
945
972
  ):
946
973
  element_spec.extend(writer(*path))
947
- return self._vocabulary_specification(writer) + element_spec
974
+ return element_spec
948
975
 
949
976
  def _download_dataset_bag(
950
977
  self,
@@ -985,7 +1012,7 @@ class Dataset:
985
1012
  for h in self.dataset_history(dataset_rid=dataset.rid)
986
1013
  if h.dataset_version == dataset.version
987
1014
  ][0]
988
- return f"{self._model.catalog.catalog_id}@{iso_to_snap(version_record.timestamp.isoformat())}"
1015
+ return f"{self._model.catalog.catalog_id}@{version_record.snapshot}"
989
1016
 
990
1017
  def _create_dataset_minid(
991
1018
  self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
@@ -1000,7 +1027,7 @@ class Dataset:
1000
1027
  )
1001
1028
  try:
1002
1029
  self._logger.info(
1003
- f"Downloading dataset minid for catalog: {dataset.rid}@{str(dataset.version)}"
1030
+ f"Downloading dataset {'minid' if self._use_minid else 'bag'} for catalog: {dataset.rid}@{str(dataset.version)}"
1004
1031
  )
1005
1032
  # Generate the bag and put into S3 storage.
1006
1033
  exporter = DerivaExport(
@@ -1009,9 +1036,10 @@ class Dataset:
1009
1036
  output_dir=tmp_dir,
1010
1037
  defer_download=True,
1011
1038
  timeout=(10, 610),
1012
- envars={"Dataset_RID": dataset.rid},
1039
+ envars={"RID": dataset.rid},
1013
1040
  )
1014
1041
  minid_page_url = exporter.export()[0] # Get the MINID launch page
1042
+
1015
1043
  except (
1016
1044
  DerivaDownloadError,
1017
1045
  DerivaDownloadConfigurationError,
@@ -1021,17 +1049,18 @@ class Dataset:
1021
1049
  ) as e:
1022
1050
  raise DerivaMLException(format_exception(e))
1023
1051
  # Update version table with MINID.
1024
- version_path = (
1025
- self._model.catalog.getPathBuilder()
1026
- .schemas[self._ml_schema]
1027
- .tables["Dataset_Version"]
1028
- )
1029
- version_rid = [
1030
- h
1031
- for h in self.dataset_history(dataset_rid=dataset.rid)
1032
- if h.dataset_version == dataset.version
1033
- ][0].version_rid
1034
- version_path.update([{"RID": version_rid, "Minid": minid_page_url}])
1052
+ if self._use_minid:
1053
+ version_path = (
1054
+ self._model.catalog.getPathBuilder()
1055
+ .schemas[self._ml_schema]
1056
+ .tables["Dataset_Version"]
1057
+ )
1058
+ version_rid = [
1059
+ h
1060
+ for h in self.dataset_history(dataset_rid=dataset.rid)
1061
+ if h.dataset_version == dataset.version
1062
+ ][0].version_rid
1063
+ version_path.update([{"RID": version_rid, "Minid": minid_page_url}])
1035
1064
  return minid_page_url
1036
1065
 
1037
1066
  def _get_dataset_minid(
@@ -1074,14 +1103,25 @@ class Dataset:
1074
1103
  raise DerivaMLException(
1075
1104
  f"Minid for dataset {dataset.rid} doesn't exist"
1076
1105
  )
1077
- self._logger.info("Creating new MINID for dataset %s", dataset.rid)
1106
+ if self._use_minid:
1107
+ self._logger.info("Creating new MINID for dataset %s", dataset.rid)
1078
1108
  minid_url = self._create_dataset_minid(dataset, snapshot_catalog)
1079
1109
  # If provided a MINID, use the MINID metadata to get the checksum and download the bag.
1080
- r = requests.get(minid_url, headers={"accept": "application/json"})
1081
- return DatasetMinid(dataset_version=dataset.version, **r.json())
1110
+ if self._use_minid:
1111
+ r = requests.get(minid_url, headers={"accept": "application/json"})
1112
+ dataset_minid = DatasetMinid(
1113
+ dataset_version=dataset.version, **r.json()
1114
+ )
1115
+ else:
1116
+ dataset_minid = DatasetMinid(
1117
+ dataset_version=dataset.version,
1118
+ RID=f"{dataset.rid}@{dataset_version_record.snapshot}",
1119
+ location=minid_url,
1120
+ )
1121
+ return dataset_minid
1082
1122
 
1083
1123
  def _download_dataset_minid(self, minid: DatasetMinid) -> Path:
1084
- """Given a RID to a dataset_table, or a MINID to an existing bag, download the bag file, extract it and validate
1124
+ """Given a RID to a dataset_table, or a MINID to an existing bag, download the bag file, extract it, and validate
1085
1125
  that all the metadata is correct
1086
1126
 
1087
1127
  Args:
@@ -1090,19 +1130,37 @@ class Dataset:
1090
1130
  the location of the unpacked and validated dataset_table bag and the RID of the bag and the bag MINID
1091
1131
  """
1092
1132
 
1093
- # Check to see if we have an existing idempotent materialization of the desired bag. If so, then just reuse
1133
+ # Check to see if we have an existing idempotent materialization of the desired bag. If so, then reuse
1094
1134
  # it. If not, then we need to extract the contents of the archive into our cache directory.
1095
1135
  bag_dir = self._cache_dir / f"{minid.dataset_rid}_{minid.checksum}"
1096
1136
  if bag_dir.exists():
1097
- bag_path = (bag_dir / f"Dataset_{minid.dataset_rid}").as_posix()
1098
- else:
1099
- bag_dir.mkdir(parents=True, exist_ok=True)
1100
- with NamedTemporaryFile(
1101
- delete=False, suffix=f"Dataset_{minid.dataset_rid}.zip"
1102
- ) as zip_file:
1103
- archive_path = fetch_single_file(minid.bag_url, zip_file.name)
1104
- bag_path = bdb.extract_bag(archive_path, bag_dir.as_posix())
1105
- bdb.validate_bag_structure(bag_path)
1137
+ self._logger.info(
1138
+ f"Using cached bag for {minid.dataset_rid} Version:{minid.dataset_version}"
1139
+ )
1140
+ return Path(bag_dir / f"Dataset_{minid.dataset_rid}")
1141
+
1142
+ # Either bag hasn't been downloaded yet, or we are not using a Minid, so we don't know the checksum yet.
1143
+ with TemporaryDirectory() as tmp_dir:
1144
+ if self._use_minid:
1145
+ # Get bag from S3
1146
+ archive_path = fetch_single_file(minid.bag_url)
1147
+ else:
1148
+ exporter = DerivaExport(
1149
+ host=self._model.catalog.deriva_server.server, output_dir=tmp_dir
1150
+ )
1151
+ archive_path = exporter.retrieve_file(minid.bag_url)
1152
+ hashes = hash_utils.compute_file_hashes(
1153
+ archive_path, hashes=["md5", "sha256"]
1154
+ )
1155
+ checksum = hashes["sha256"][0]
1156
+ bag_dir = self._cache_dir / f"{minid.dataset_rid}_{checksum}"
1157
+ if bag_dir.exists():
1158
+ self._logger.info(
1159
+ f"Using cached bag for {minid.dataset_rid} Version:{minid.dataset_version}"
1160
+ )
1161
+ return Path(bag_dir / f"Dataset_{minid.dataset_rid}")
1162
+ bag_path = bdb.extract_bag(archive_path, bag_dir.as_posix())
1163
+ bdb.validate_bag_structure(bag_path)
1106
1164
  return Path(bag_path)
1107
1165
 
1108
1166
  def _materialize_dataset_bag(
@@ -1154,6 +1212,9 @@ class Dataset:
1154
1212
 
1155
1213
  # If this bag has already been validated, our work is done. Otherwise, materialize the bag.
1156
1214
  if not validated_check.exists():
1215
+ self._logger.info(
1216
+ f"Materializing bag {minid.dataset_rid} Version:{minid.dataset_version}"
1217
+ )
1157
1218
  bdb.materialize(
1158
1219
  bag_path.as_posix(),
1159
1220
  fetch_callback=fetch_progress_callback,
@@ -1162,9 +1223,8 @@ class Dataset:
1162
1223
  validated_check.touch()
1163
1224
  return Path(bag_path)
1164
1225
 
1165
- def _export_outputs(
1226
+ def _export_annotation(
1166
1227
  self,
1167
- dataset: Optional[DatasetSpec] = None,
1168
1228
  snapshot_catalog: Optional[DerivaML] = None,
1169
1229
  ) -> list[dict[str, Any]]:
1170
1230
  """Return and output specification for the datasets in the provided model
@@ -1173,19 +1233,6 @@ class Dataset:
1173
1233
  An export specification suitable for Chaise.
1174
1234
  """
1175
1235
 
1176
- def writer(spath: str, dpath: str, table: Table) -> list[dict[str, Any]]:
1177
- """
1178
-
1179
- Args:
1180
- spath: list[Table]:
1181
- dpath: list[Table]:
1182
- table: Table
1183
-
1184
- Returns:
1185
- An export specification suitable for Chaise.
1186
- """
1187
- return self._export_dataset_element(spath, dpath, table)
1188
-
1189
1236
  # Export specification is a specification for the datasets, plus any controlled vocabulary
1190
1237
  return [
1191
1238
  {
@@ -1204,41 +1251,34 @@ class Dataset:
1204
1251
  "destination": {"type": "json", "name": "schema"},
1205
1252
  },
1206
1253
  ] + self._dataset_specification(
1207
- writer, dataset, snapshot_catalog=snapshot_catalog
1254
+ self._export_annotation_dataset_element,
1255
+ None,
1256
+ snapshot_catalog=snapshot_catalog,
1208
1257
  )
1209
1258
 
1210
- def _processor_params(
1259
+ def _export_specification(
1211
1260
  self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
1212
1261
  ) -> list[dict[str, Any]]:
1213
1262
  """
1263
+ Generate a specification for the export engine for a specific dataset.
1264
+
1214
1265
  Returns:
1215
1266
  a download specification for the datasets in the provided model.
1216
1267
 
1217
1268
  """
1218
1269
 
1219
- def writer(spath: str, dpath: str, table: Table) -> list[dict[str, Any]]:
1220
- """
1221
-
1222
- Args:
1223
- spath:
1224
- dpath:
1225
- table: Table
1226
-
1227
- Returns:
1228
-
1229
- """
1230
- return self._download_dataset_element(spath, dpath, table)
1231
-
1232
1270
  # Download spec is the spec for any controlled vocabulary and for the dataset_table.
1233
1271
  return [
1234
1272
  {
1235
1273
  "processor": "json",
1236
1274
  "processor_params": {"query_path": "/schema", "output_path": "schema"},
1237
1275
  }
1238
- ] + self._dataset_specification(writer, dataset, snapshot_catalog)
1276
+ ] + self._dataset_specification(
1277
+ self._export_specification_dataset_element, dataset, snapshot_catalog
1278
+ )
1239
1279
 
1240
1280
  @staticmethod
1241
- def _download_dataset_element(
1281
+ def _export_specification_dataset_element(
1242
1282
  spath: str, dpath: str, table: Table
1243
1283
  ) -> list[dict[str, Any]]:
1244
1284
  """Return the download specification for the data object indicated by a path through the data model.
@@ -1255,7 +1295,7 @@ class Dataset:
1255
1295
  {
1256
1296
  "processor": "csv",
1257
1297
  "processor_params": {
1258
- "query_path": f"/entity/{spath}?limit=none",
1298
+ "query_path": f"/entity/{spath}",
1259
1299
  "output_path": dpath,
1260
1300
  },
1261
1301
  }
@@ -1268,16 +1308,15 @@ class Dataset:
1268
1308
  {
1269
1309
  "processor": "fetch",
1270
1310
  "processor_params": {
1271
- "query_path": f"/attribute/{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5?limit=none",
1311
+ "query_path": f"/attribute/{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5",
1272
1312
  "output_path": f"asset/{table.name}",
1273
1313
  },
1274
1314
  }
1275
1315
  )
1276
1316
  return exports
1277
1317
 
1278
- @staticmethod
1279
- def _export_dataset_element(
1280
- spath: str, dpath: str, table: Table
1318
+ def _export_annotation_dataset_element(
1319
+ self, spath: str, dpath: str, table: Table
1281
1320
  ) -> list[dict[str, Any]]:
1282
1321
  """Given a path in the data model, output an export specification for the path taken to get to the current table.
1283
1322
 
@@ -1293,9 +1332,23 @@ class Dataset:
1293
1332
  # into a path in the form of /S:T1/S:T2/S:Table
1294
1333
  # Generate the destination path in the file system using just the table names.
1295
1334
 
1335
+ skip_root_path = False
1336
+ if spath.startswith(f"{self._ml_schema}:Dataset/"):
1337
+ # Chaise will add table name and RID filter, so strip it off.
1338
+ spath = "/".join(spath.split("/")[2:])
1339
+ if spath == "":
1340
+ # This path is to just the dataset table.
1341
+ return []
1342
+ else:
1343
+ # A vocabulary table, so we don't want the root_path.
1344
+ skip_root_path = True
1296
1345
  exports = [
1297
1346
  {
1298
- "source": {"api": "entity", "path": spath},
1347
+ "source": {
1348
+ "api": "entity",
1349
+ "path": spath,
1350
+ "skip_root_path": skip_root_path,
1351
+ },
1299
1352
  "destination": {"name": dpath, "type": "csv"},
1300
1353
  }
1301
1354
  ]
@@ -1306,6 +1359,7 @@ class Dataset:
1306
1359
  exports.append(
1307
1360
  {
1308
1361
  "source": {
1362
+ "skip_root_path": False,
1309
1363
  "api": "attribute",
1310
1364
  "path": f"{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5",
1311
1365
  },
@@ -1315,44 +1369,53 @@ class Dataset:
1315
1369
  return exports
1316
1370
 
1317
1371
  def _generate_dataset_download_spec(
1318
- self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML]
1372
+ self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
1319
1373
  ) -> dict[str, Any]:
1320
1374
  """
1375
+ Generate a specification for downloading a specific dataset.
1321
1376
 
1377
+ This routine creates a download specification that can be used by the Deriva export processor to download
1378
+ a specific dataset as a MINID.
1322
1379
  Returns:
1323
1380
  """
1324
1381
  s3_target = "s3://eye-ai-shared"
1325
1382
  minid_test = False
1326
1383
 
1327
1384
  catalog_id = self._version_snapshot(dataset)
1328
- return {
1329
- "env": {"Dataset_RID": "{Dataset_RID}"},
1385
+ post_processors = (
1386
+ {
1387
+ "post_processors": [
1388
+ {
1389
+ "processor": "cloud_upload",
1390
+ "processor_params": {
1391
+ "acl": "public-read",
1392
+ "target_url": s3_target,
1393
+ },
1394
+ },
1395
+ {
1396
+ "processor": "identifier",
1397
+ "processor_params": {
1398
+ "test": minid_test,
1399
+ "env_column_map": {
1400
+ "RID": "{RID}@{snaptime}",
1401
+ "Description": "{Description}",
1402
+ },
1403
+ },
1404
+ },
1405
+ ]
1406
+ }
1407
+ if self._use_minid
1408
+ else {}
1409
+ )
1410
+ return post_processors | {
1411
+ "env": {"RID": "{RID}"},
1330
1412
  "bag": {
1331
- "bag_name": "Dataset_{Dataset_RID}",
1413
+ "bag_name": "Dataset_{RID}",
1332
1414
  "bag_algorithms": ["md5"],
1333
1415
  "bag_archiver": "zip",
1334
1416
  "bag_metadata": {},
1335
1417
  "bag_idempotent": True,
1336
1418
  },
1337
- "post_processors": [
1338
- {
1339
- "processor": "cloud_upload",
1340
- "processor_params": {
1341
- "acl": "public-read",
1342
- "target_url": s3_target,
1343
- },
1344
- },
1345
- {
1346
- "processor": "identifier",
1347
- "processor_params": {
1348
- "test": minid_test,
1349
- "env_column_map": {
1350
- "Dataset_RID": "{RID}@{snaptime}",
1351
- "Description": "{Description}",
1352
- },
1353
- },
1354
- },
1355
- ],
1356
1419
  "catalog": {
1357
1420
  "host": f"{self._model.catalog.deriva_server.scheme}://{self._model.catalog.deriva_server.server}",
1358
1421
  "catalog_id": catalog_id,
@@ -1368,125 +1431,50 @@ class Dataset:
1368
1431
  {
1369
1432
  "processor": "env",
1370
1433
  "processor_params": {
1371
- "query_path": "/entity/M:=deriva-ml:Dataset/RID={Dataset_RID}?limit=none",
1434
+ "query_path": "/entity/M:=deriva-ml:Dataset/RID={RID}",
1372
1435
  "output_path": "Dataset",
1373
1436
  "query_keys": ["RID", "Description"],
1374
1437
  },
1375
1438
  },
1376
1439
  ]
1377
- + self._processor_params(dataset, snapshot_catalog),
1440
+ + self._export_specification(dataset, snapshot_catalog),
1378
1441
  },
1379
1442
  }
1380
1443
 
1381
- def dataset_visible_columns(self) -> dict[str, Any]:
1382
- dataset_table = self._model.schemas["deriva-ml"].tables["Dataset"]
1383
- rcb_name = next(
1384
- [fk.name[0].name, fk.name[1]]
1385
- for fk in dataset_table.foreign_keys
1386
- if fk.name[1] == "Dataset_RCB_fkey"
1387
- )
1388
- rmb_name = next(
1389
- [fk.name[0].name, fk.name[1]]
1390
- for fk in dataset_table.foreign_keys
1391
- if fk.name[1] == "Dataset_RMB_fkey"
1392
- )
1393
- return {
1394
- "*": [
1395
- "RID",
1396
- "Description",
1397
- {
1398
- "display": {
1399
- "markdown_pattern": "[Annotate Dataset](https://www.eye-ai.org/apps/grading-interface/main?dataset_rid={{{RID}}}){: .btn}"
1400
- },
1401
- "markdown_name": "Annotation App",
1402
- },
1403
- rcb_name,
1404
- rmb_name,
1405
- ],
1406
- "detailed": [
1407
- "RID",
1408
- "Description",
1409
- {
1410
- "source": [
1411
- {"inbound": ["deriva-ml", "Dataset_Dataset_Type_Dataset_fkey"]},
1412
- {
1413
- "outbound": [
1414
- "deriva-ml",
1415
- "Dataset_Dataset_Type_Dataset_Type_fkey",
1416
- ]
1444
+ def _generate_dataset_download_annotations(self) -> dict[str, Any]:
1445
+ post_processors = (
1446
+ {
1447
+ "type": "BAG",
1448
+ "outputs": [{"fragment_key": "dataset_export_outputs"}],
1449
+ "displayname": "BDBag to Cloud",
1450
+ "bag_idempotent": True,
1451
+ "postprocessors": [
1452
+ {
1453
+ "processor": "cloud_upload",
1454
+ "processor_params": {
1455
+ "acl": "public-read",
1456
+ "target_url": "s3://eye-ai-shared/",
1417
1457
  },
1418
- "RID",
1419
- ],
1420
- "markdown_name": "Dataset Types",
1421
- },
1422
- {
1423
- "display": {
1424
- "markdown_pattern": "[Annotate Dataset](https://www.eye-ai.org/apps/grading-interface/main?dataset_rid={{{RID}}}){: .btn}"
1425
1458
  },
1426
- "markdown_name": "Annotation App",
1427
- },
1428
- rcb_name,
1429
- rmb_name,
1430
- ],
1431
- "filter": {
1432
- "and": [
1433
- {"source": "RID"},
1434
- {"source": "Description"},
1435
1459
  {
1436
- "source": [
1437
- {
1438
- "inbound": [
1439
- "deriva-ml",
1440
- "Dataset_Dataset_Type_Dataset_fkey",
1441
- ]
1442
- },
1443
- {
1444
- "outbound": [
1445
- "deriva-ml",
1446
- "Dataset_Dataset_Type_Dataset_Type_fkey",
1447
- ]
1460
+ "processor": "identifier",
1461
+ "processor_params": {
1462
+ "test": False,
1463
+ "env_column_map": {
1464
+ "RID": "{RID}@{snaptime}",
1465
+ "Description": "{Description}",
1448
1466
  },
1449
- "RID",
1450
- ],
1451
- "markdown_name": "Dataset Types",
1452
- },
1453
- {
1454
- "source": [{"outbound": rcb_name}, "RID"],
1455
- "markdown_name": "Created By",
1456
- },
1457
- {
1458
- "source": [{"outbound": rmb_name}, "RID"],
1459
- "markdown_name": "Modified By",
1467
+ },
1460
1468
  },
1461
- ]
1462
- },
1463
- }
1464
-
1465
- def _dataset_visible_fkeys(self) -> dict[str, Any]:
1466
- def fkey_name(fk):
1467
- return [fk.name[0].name, fk.name[1]]
1468
-
1469
- dataset_table = self._model.schemas["deriva-ml"].tables["Dataset"]
1470
-
1471
- source_list = [
1472
- {
1473
- "source": [
1474
- {"inbound": fkey_name(fkey.self_fkey)},
1475
- {"outbound": fkey_name(other_fkey := fkey.other_fkeys.pop())},
1476
- "RID",
1477
1469
  ],
1478
- "markdown_name": other_fkey.pk_table.name,
1479
1470
  }
1480
- for fkey in dataset_table.find_associations(max_arity=3, pure=False)
1481
- ]
1482
- return {"detailed": source_list}
1483
-
1484
- def _generate_dataset_annotations(self) -> dict[str, Any]:
1471
+ if self._use_minid
1472
+ else {}
1473
+ )
1485
1474
  return {
1486
1475
  deriva_tags.export_fragment_definitions: {
1487
- "dataset_export_outputs": self._export_outputs()
1476
+ "dataset_export_outputs": self._export_annotation()
1488
1477
  },
1489
- deriva_tags.visible_columns: self.dataset_visible_columns(),
1490
1478
  deriva_tags.visible_foreign_keys: self._dataset_visible_fkeys(),
1491
1479
  deriva_tags.export_2019: {
1492
1480
  "detailed": {
@@ -1496,45 +1484,56 @@ class Dataset:
1496
1484
  "outputs": [{"fragment_key": "dataset_export_outputs"}],
1497
1485
  "displayname": "BDBag Download",
1498
1486
  "bag_idempotent": True,
1499
- "postprocessors": [
1500
- {
1501
- "processor": "identifier",
1502
- "processor_params": {
1503
- "test": False,
1504
- "env_column_map": {
1505
- "Dataset_RID": "{RID}@{snaptime}",
1506
- "Description": "{Description}",
1507
- },
1508
- },
1509
- }
1510
- ],
1511
- },
1512
- {
1513
- "type": "BAG",
1514
- "outputs": [{"fragment_key": "dataset_export_outputs"}],
1515
- "displayname": "BDBag to Cloud",
1516
- "bag_idempotent": True,
1517
- "postprocessors": [
1518
- {
1519
- "processor": "cloud_upload",
1520
- "processor_params": {
1521
- "acl": "public-read",
1522
- "target_url": "s3://eye-ai-shared/",
1523
- },
1524
- },
1525
- {
1526
- "processor": "identifier",
1527
- "processor_params": {
1528
- "test": False,
1529
- "env_column_map": {
1530
- "Dataset_RID": "{RID}@{snaptime}",
1531
- "Description": "{Description}",
1532
- },
1533
- },
1534
- },
1535
- ],
1536
- },
1487
+ }
1488
+ | post_processors
1537
1489
  ]
1538
1490
  }
1539
1491
  },
1540
1492
  }
1493
+
1494
+ def _dataset_visible_fkeys(self) -> dict[str, Any]:
1495
+ def fkey_name(fk):
1496
+ return [fk.name[0].name, fk.name[1]]
1497
+
1498
+ dataset_table = self._model.schemas["deriva-ml"].tables["Dataset"]
1499
+
1500
+ source_list = [
1501
+ {
1502
+ "source": [
1503
+ {"inbound": ["deriva-ml", "Dataset_Version_Dataset_fkey"]},
1504
+ "RID",
1505
+ ],
1506
+ "markdown_name": "Previous Versions",
1507
+ "entity": True,
1508
+ },
1509
+ {
1510
+ "source": [
1511
+ {"inbound": ["deriva-ml", "Dataset_Dataset_Nested_Dataset_fkey"]},
1512
+ {"outbound": ["deriva-ml", "Dataset_Dataset_Dataset_fkey"]},
1513
+ "RID",
1514
+ ],
1515
+ "markdown_name": "Parent Datasets",
1516
+ },
1517
+ {
1518
+ "source": [
1519
+ {"inbound": ["deriva-ml", "Dataset_Dataset_Dataset_fkey"]},
1520
+ {"outbound": ["deriva-ml", "Dataset_Dataset_Nested_Dataset_fkey"]},
1521
+ "RID",
1522
+ ],
1523
+ "markdown_name": "Child Datasets",
1524
+ },
1525
+ ]
1526
+ source_list.extend(
1527
+ [
1528
+ {
1529
+ "source": [
1530
+ {"inbound": fkey_name(fkey.self_fkey)},
1531
+ {"outbound": fkey_name(other_fkey := fkey.other_fkeys.pop())},
1532
+ "RID",
1533
+ ],
1534
+ "markdown_name": other_fkey.pk_table.name,
1535
+ }
1536
+ for fkey in dataset_table.find_associations(max_arity=3, pure=False)
1537
+ ]
1538
+ )
1539
+ return {"detailed": source_list}