deriva-ml 1.6.8__py3-none-any.whl → 1.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deriva_ml/dataset.py CHANGED
@@ -9,6 +9,7 @@ accessible via a DerivaML class instance.
  from bdbag.fetch.fetcher import fetch_single_file
  from bdbag import bdbag_api as bdb
  from collections import defaultdict
+
  from deriva.core.ermrest_model import Table
  from deriva.core.utils.core_utils import tag as deriva_tags, format_exception
  from deriva.transfer.download.deriva_export import DerivaExport
@@ -25,6 +26,7 @@ try:
  except ImportError: # Graceful fallback if IceCream isn't installed.
  ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a) # noqa

+ from graphlib import TopologicalSorter
  import json
  import logging
  from pathlib import Path
@@ -35,7 +37,7 @@ from pydantic import (
  import requests

  from tempfile import TemporaryDirectory, NamedTemporaryFile
- from typing import Any, Callable, Optional, Iterable
+ from typing import Any, Callable, Optional, Iterable, Iterator

  from deriva_ml import DatasetBag
  from .deriva_definitions import ML_SCHEMA, DerivaMLException, MLVocab, Status, RID
@@ -73,9 +75,10 @@ class Dataset:
  rid_info = self._model.catalog.resolve_rid(dataset_rid, self._model.model)
  except KeyError as _e:
  raise DerivaMLException(f"Invalid RID {dataset_rid}")
-
- # Got a dataset rid. Now check to see if its deleted or not.
- if deleted:
+ if rid_info.table != self.dataset_table:
+ return False
+ elif deleted:
+ # Got a dataset rid. Now check to see if its deleted or not.
  return True
  else:
  return not list(rid_info.datapath.entities().fetch())[0]["Deleted"]
@@ -85,6 +88,7 @@ class Dataset:
  dataset_rid: RID,
  dataset_version: DatasetVersion,
  description: Optional[str] = "",
+ execution_rid: Optional[RID] = None,
  ) -> RID:
  schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
  version_path = schema_path.tables["Dataset_Version"]
@@ -94,6 +98,7 @@ class Dataset:
  "Dataset": dataset_rid,
  "Version": str(dataset_version),
  "Description": description,
+ "Execution": execution_rid,
  }
  ]
  )[0]["RID"]
@@ -163,6 +168,7 @@ class Dataset:
  dataset_rid=dataset_rid,
  version_rid=v["RID"],
  description=v["Description"],
+ execution_rid=v["Execution"],
  )
  for v in version_path.filter(version_path.Dataset == dataset_rid)
  .entities()
@@ -190,11 +196,30 @@ class Dataset:
  else:
  return max([h.dataset_version for h in self.dataset_history(dataset_rid)])

+ def _build_dataset_graph(self, dataset_rid: RID) -> Iterable[RID]:
+ ts = TopologicalSorter()
+ self._build_dataset_graph_1(dataset_rid, ts, set())
+ return ts.static_order()
+
+ def _build_dataset_graph_1(self, dataset_rid: RID, ts, visited) -> None:
+ """Use topological sort to return bottom up list of nested datasets"""
+ ts.add(dataset_rid)
+ if dataset_rid not in visited:
+ visited.add(dataset_rid)
+ children = self.list_dataset_children(dataset_rid=dataset_rid)
+ parents = self.list_dataset_parents(dataset_rid=dataset_rid)
+ for parent in parents:
+ self._build_dataset_graph_1(parent, ts, visited)
+ for child in children:
+ self._build_dataset_graph_1(child, ts, visited)
+
+ @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
  def increment_dataset_version(
  self,
  dataset_rid: RID,
  component: VersionPart,
  description: Optional[str] = "",
+ execution_rid: Optional[RID] = None,
  ) -> DatasetVersion:
  """Increment the version of the specified dataset_table.

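The new _build_dataset_graph helper relies on graphlib.TopologicalSorter (standard library, Python 3.9+) to collect a dataset together with its parents and children and then emit them in one pass via static_order(). As a rough illustration of the standard-library API it builds on, with invented RID strings and an invented edge structure (not taken from a real catalog):

    from graphlib import TopologicalSorter

    # Hypothetical nesting: "root" contains "child-a" and "child-b",
    # and "child-a" contains "grandchild".
    ts = TopologicalSorter()
    ts.add("root")                     # node with no recorded predecessors
    ts.add("child-a", "root")          # "child-a" lists "root" as a predecessor
    ts.add("child-b", "root")
    ts.add("grandchild", "child-a")

    # static_order() yields each node only after all of its recorded
    # predecessors, so "root" appears before the datasets nested under it.
    print(list(ts.static_order()))     # one valid order: ['root', 'child-a', 'child-b', 'grandchild']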
@@ -204,6 +229,7 @@ class Dataset:
  dataset_rid: RID of the dataset whose version is to be incremented.
  component: Major, Minor or Patch
  description: Description of the version update of the dataset_table.
+ execution_rid: Which execution is performing increment.

  Returns:
  new semantic version of the dataset_table as a 3-tuple
@@ -211,16 +237,16 @@ class Dataset:
  Raises:
  DerivaMLException: if provided RID is not to a dataset_table.
  """
- for ds in self.list_dataset_children(dataset_rid):
- self.increment_dataset_version(
- ds,
- component,
- description=f"Increment version of nested dataset: {description}",
+ for dataset in self._build_dataset_graph(dataset_rid=dataset_rid):
+ version = self.dataset_version(dataset)
+ new_version = version.increment_version(component)
+ self._insert_dataset_version(
+ dataset,
+ new_version,
+ description=description,
+ execution_rid=execution_rid,
  )
- version = self.dataset_version(dataset_rid)
- new_version = version.increment_version(component)
- self._insert_dataset_version(dataset_rid, new_version, description=description)
- return new_version
+ return self.dataset_version(dataset_rid)

  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
  def create_dataset(
@@ -268,7 +294,7 @@ class Dataset:
  pb = self._model.catalog.getPathBuilder()
  for ds_type in ds_types:
  if not check_dataset_type(ds_type):
- raise DerivaMLException(f"Dataset type must be a vocabulary term.")
+ raise DerivaMLException("Dataset type must be a vocabulary term.")
  dataset_table_path = pb.schemas[self.dataset_table.schema.name].tables[
  self.dataset_table.name
  ]
@@ -297,7 +323,12 @@ class Dataset:
  pb.schemas[self._ml_schema].Dataset_Execution.insert(
  [{"Dataset": dataset_rid, "Execution": execution_rid}]
  )
- self._insert_dataset_version(dataset_rid, version)
+ self._insert_dataset_version(
+ dataset_rid,
+ dataset_version=version,
+ execution_rid=execution_rid,
+ description="Initial dataset creation.",
+ )
  return dataset_rid

  @validate_call
@@ -414,7 +445,7 @@ class Dataset:
  self._model.model.apply()
  return table

- @validate_call
+ # @validate_call
  def list_dataset_members(
  self, dataset_rid: RID, recurse: bool = False
  ) -> dict[str, list[dict[str, Any]]]:
@@ -439,34 +470,27 @@ class Dataset:
  pb = self._model.catalog.getPathBuilder()
  for assoc_table in self.dataset_table.find_associations():
  other_fkey = assoc_table.other_fkeys.pop()
- self_fkey = assoc_table.self_fkey
  target_table = other_fkey.pk_table
  member_table = assoc_table.table

+ # Look at domain tables and nested datasets.
  if (
  target_table.schema.name != self._model.domain_schema
  and target_table != self.dataset_table
  ):
- # Look at domain tables and nested datasets.
  continue
- if target_table == self.dataset_table:
- # find_assoc gives us the keys in the wrong position, so swap.
- self_fkey, other_fkey = other_fkey, self_fkey
+ member_column = (
+ "Nested_Dataset"
+ if target_table == self.dataset_table
+ else other_fkey.foreign_key_columns[0].name
+ )

  target_path = pb.schemas[target_table.schema.name].tables[target_table.name]
  member_path = pb.schemas[member_table.schema.name].tables[member_table.name]
- # Get the names of the columns that we are going to need for linking
- member_link = tuple(
- c.name for c in next(iter(other_fkey.column_map.items()))
- )
- path = pb.schemas[member_table.schema.name].tables[member_table.name].path
- path.filter(member_path.Dataset == dataset_rid)
- path.link(
+
+ path = member_path.filter(member_path.Dataset == dataset_rid).link(
  target_path,
- on=(
- member_path.columns[member_link[0]]
- == target_path.columns[member_link[1]]
- ),
+ on=(member_path.columns[member_column] == target_path.columns["RID"]),
  )
  target_entities = list(path.entities().fetch())
  members[target_table.name].extend(target_entities)
@@ -485,6 +509,7 @@ class Dataset:
  members: list[RID],
  validate: bool = True,
  description: Optional[str] = "",
+ execution_rid: Optional[RID] = None,
  ) -> None:
  """Add additional elements to an existing dataset_table.

@@ -496,6 +521,7 @@ class Dataset:
  members: List of RIDs of members to add to the dataset_table.
  validate: Check rid_list to make sure elements are not already in the dataset_table.
  description: Markdown description of the updated dataset.
+ execution_rid: Optional RID of execution associated with this dataset.
  """
  members = set(members)
  description = description or "Updated dataset via add_dataset_members"
@@ -559,12 +585,19 @@ class Dataset:
  [{"Dataset": dataset_rid, fk_column: e} for e in elements]
  )
  self.increment_dataset_version(
- dataset_rid, VersionPart.minor, description=description
+ dataset_rid,
+ VersionPart.minor,
+ description=description,
+ execution_rid=execution_rid,
  )

  @validate_call
  def delete_dataset_members(
- self, dataset_rid: RID, members: list[RID], description=""
+ self,
+ dataset_rid: RID,
+ members: list[RID],
+ description: str = "",
+ execution_rid: Optional[RID] = None,
  ) -> None:
  """Remove elements to an existing dataset_table.

@@ -575,6 +608,7 @@ class Dataset:
  dataset_rid: RID of dataset_table to extend or None if new dataset_table is to be created.
  members: List of RIDs of members to add to the dataset_table.
  description: Markdown description of the updated dataset.
+ execution_rid: Optional RID of execution associated with this operation.
  """

  members = set(members)
@@ -616,7 +650,10 @@ class Dataset:
  )
  entity.delete()
  self.increment_dataset_version(
- dataset_rid, VersionPart.minor, description=description
+ dataset_rid,
+ VersionPart.minor,
+ description=description,
+ execution_rid=execution_rid,
  )

  @validate_call
@@ -663,44 +700,6 @@ class Dataset:
  children.extend(self.list_dataset_children(child, recurse=recurse))
  return children

- @staticmethod
- def _download_dataset_element(
- spath: str, dpath: str, table: Table
- ) -> list[dict[str, Any]]:
- """Return the download specification for the data object indicated by a path through the data model.
-
- Args:
- spath: Source path
- dpath: Destination path
- table: Table referenced to by the path
-
- Returns:
- The download specification that will retrieve that data from the catalog and place it into a BDBag.
- """
- exports = [
- {
- "processor": "csv",
- "processor_params": {
- "query_path": f"/entity/{spath}?limit=none",
- "output_path": dpath,
- },
- }
- ]
-
- # If this table is an asset table, then we need to output the files associated with the asset.
- asset_columns = {"Filename", "URL", "Length", "MD5", "Description"}
- if asset_columns.issubset({c.name for c in table.columns}):
- exports.append(
- {
- "processor": "fetch",
- "processor_params": {
- "query_path": f"/attribute/{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5?limit=none",
- "output_path": f"asset/{table.name}",
- },
- }
- )
- return exports
-
  def _vocabulary_specification(
  self, writer: Callable[[str, str, Table], list[dict[str, Any]]]
  ) -> list[dict[str, Any]]:
@@ -724,82 +723,38 @@ class Dataset:
  for o in writer(f"{table.schema.name}:{table.name}", table.name, table)
  ]

- def _domain_table_paths(
- self,
- graph: dict[Table, list[dict[Table, Any]]],
- spath: str = None,
- dpath: str = None,
- sprefix: str = "deriva-ml:Dataset/RID={Dataset_RID}",
- dprefix: str = "Dataset",
- nested: bool = False,
- ) -> list[tuple[str, str, Table]]:
- """Recursively walk over the domain schema graph and extend the current path.
+ def _table_paths(self) -> Iterator[tuple[list[str], list[str], list[Table]]]:

- Args:
- graph: An undirected, acyclic graph of schema. Represented as a dictionary whose name is the table name.
- and whose values are the child nodes of the table.
- spath: Source path so far
- dpath: Destination path so far
- sprefix: Initial path to be included. Allows for nested datasets
- dprefix: Initial path to be included. Allows for nested datasets
- nested: If true, skip initial data segment.
+ dataset_dataset = self._model.schemas[self._ml_schema].tables["Dataset_Dataset"]
+ paths = self._model._schema_to_paths()
+ nested_paths = paths

- Returns:
- A list of all the paths through the graph. Each path is a list of tables.
-
- """
- source_path = spath or sprefix
- dest_path = dpath or dprefix
- paths = []
- for node, children in graph.items():
- if node.name == "Dataset":
- paths.append(
- (
- f"{sprefix}/(RID)=({self._ml_schema}:Dataset_Version:Dataset)",
- f"{dprefix}/Dataset_Version",
- self._model.schemas[self._ml_schema].tables["Dataset_Version"],
- )
- )
- new_spath = sprefix
- new_dpath = dprefix
-
- if not nested:
- paths.append((new_spath, new_dpath, node))
- else:
- new_spath = source_path + f"/{node.schema.name}:{node.name}"
- new_dpath = dest_path + f"/{node.name}"
- paths.append((new_spath, new_dpath, node))
- for child in children:
- paths.extend(
- self._domain_table_paths(child, new_spath, new_dpath, nested=nested)
- )
- return paths
-
- def _table_paths(self, graph) -> list[tuple[str, str, Table]]:
- sprefix = "deriva-ml:Dataset/RID={Dataset_RID}"
- dprefix = "Dataset"
- dataset_dataset_table = self._model.schemas[self._ml_schema].tables[
- "Dataset_Dataset"
- ]
- table_paths = self._domain_table_paths(
- graph=graph, sprefix=sprefix, dprefix=dprefix
- )
- nested_sprefix = sprefix
- nested_dprefix = dprefix
  for i in range(self._dataset_nesting_depth()):
- nested_sprefix += f"/(RID)=(deriva-ml:Dataset_Dataset:Dataset)"
- nested_dprefix += f"/Dataset_Dataset"
- table_paths.append((nested_sprefix, nested_dprefix, dataset_dataset_table))
- nested_sprefix += f"/(Nested_Dataset)=(deriva-ml:Dataset:RID)"
- nested_dprefix += f"/Dataset"
- table_paths.append((nested_sprefix, nested_dprefix, self.dataset_table))
- # Get CSV for nested datasets.
- table_paths.extend(
- self._domain_table_paths(
- graph, sprefix=nested_sprefix, dprefix=nested_dprefix, nested=True
- )
- )
- return table_paths
+ if i == 0:
+ paths.extend([[self.dataset_table, dataset_dataset]])
+ nested_paths = [
+ [self.dataset_table, dataset_dataset] + p for p in nested_paths
+ ]
+ paths.extend(nested_paths)
+
+ def source_path(path):
+ p = [f"{self._model.ml_schema}:Dataset/RID={{Dataset_RID}}"]
+ for table in path[1:]:
+ if table == dataset_dataset:
+ p.append("(RID)=(deriva-ml:Dataset_Dataset:Dataset)")
+ elif table == self.dataset_table:
+ p.append("(Nested_Dataset)=(deriva-ml:Dataset:RID)")
+ elif table.name == "Dataset_Version":
+ p.append(f"(RID)=({self._model.ml_schema}:Dataset_Version:Dataset)")
+ else:
+ p.append(f"{table.schema.name}:{table.name}")
+ return p
+
+ src_paths = ["/".join(source_path(p)) for p in paths]
+ dest_paths = ["/".join([t.name for t in p]) for p in paths]
+ target_tables = [p[-1] for p in paths]
+
+ return zip(src_paths, dest_paths, target_tables)

  def _dataset_nesting_depth(self):
  """Determine the maximum dataset nesting depth in the current catalog.
@@ -811,6 +766,7 @@ class Dataset:
  def children_depth(
  dataset_rid: RID, nested_datasets: dict[RID, list[RID]]
  ) -> int:
+ """Return the number of nested datasets in the current catalog"""
  try:
  children = nested_datasets[dataset_rid]
  return (
@@ -836,50 +792,6 @@ class Dataset:
  else 0
  )

- def _schema_graph(
- self, node: Table, visited_nodes: Optional[set] = None
- ) -> dict[Table, list[dict[Table, list]]]:
- """Generate an undirected, acyclic graph of domain schema. We do this by traversing the schema foreign key
- relationships. We stop when we hit the deriva-ml schema or when we reach a node that we have already seen.
-
- Nested datasets need to be unfolded
-
- Args:
- node: Current (starting) node in the graph.
- visited_nodes: param nested_dataset: Are we in a nested dataset_table, (i.e. have we seen the DataSet table)?
-
- Returns:
- Graph of the schema, starting from node.
- """
-
- visited_nodes = visited_nodes or set()
- graph = {node: []}
-
- def include_node(child: Table) -> bool:
- """Indicate if the table should be included in the graph.
-
- Include node in the graph if it's not a loopback from fk<-> referred_by, you have not already been to the
- node.
- """
- return (
- child != node
- and child not in visited_nodes
- and child.schema.name == self._model.domain_schema
- )
-
- # Get all the tables reachable from the end of the path avoiding loops from T1<->T2 via referenced_by
- nodes = {fk.pk_table for fk in node.foreign_keys if include_node(fk.pk_table)}
- nodes |= {fk.table for fk in node.referenced_by if include_node(fk.table)}
- for t in nodes:
- new_visited_nodes = visited_nodes.copy()
- new_visited_nodes.add(t)
- if self._model.is_vocabulary(t):
- # If the end of the path is a vocabulary table, we are at a terminal node in the ERD, so stop
- continue
- # Get all the paths that extend the current path
- graph[node].append(self._schema_graph(t, new_visited_nodes))
- return graph
-
  def _dataset_specification(
  self, writer: Callable[[str, str, Table], list[dict[str, Any]]]
  ) -> list[dict[str, Any]]:
@@ -921,7 +833,7 @@ class Dataset:
  A dataset_table specification.
  """
  element_spec = []
- for path in self._table_paths(self._schema_graph(self.dataset_table)):
+ for path in self._table_paths():
  element_spec.extend(writer(*path))
  return self._vocabulary_specification(writer) + element_spec

@@ -953,7 +865,7 @@ class Dataset:
  if dataset.materialize
  else self._download_dataset_bag(minid)
  )
- return DatabaseModel.register(minid, bag_path).get_dataset()
+ return DatabaseModel(minid, bag_path).get_dataset()

  def _version_snapshot(self, dataset: DatasetSpec) -> str:
  version_record = [
@@ -1089,6 +1001,7 @@ class Dataset:
  """

  def update_status(status: Status, msg: str) -> None:
+ """Update the current status for this execution in the catalog"""
  self._model.catalog.getPathBuilder().schemas[
  self._ml_schema
  ].Execution.update(
@@ -1192,10 +1105,48 @@ class Dataset:
  return [
  {
  "processor": "json",
- "processor_params": {"query_path": f"/schema", "output_path": "schema"},
+ "processor_params": {"query_path": "/schema", "output_path": "schema"},
  }
  ] + self._dataset_specification(writer)

+ @staticmethod
+ def _download_dataset_element(
+ spath: str, dpath: str, table: Table
+ ) -> list[dict[str, Any]]:
+ """Return the download specification for the data object indicated by a path through the data model.
+
+ Args:
+ spath: Source path
+ dpath: Destination path
+ table: Table referenced to by the path
+
+ Returns:
+ The download specification that will retrieve that data from the catalog and place it into a BDBag.
+ """
+ exports = [
+ {
+ "processor": "csv",
+ "processor_params": {
+ "query_path": f"/entity/{spath}?limit=none",
+ "output_path": dpath,
+ },
+ }
+ ]
+
+ # If this table is an asset table, then we need to output the files associated with the asset.
+ asset_columns = {"Filename", "URL", "Length", "MD5", "Description"}
+ if asset_columns.issubset({c.name for c in table.columns}):
+ exports.append(
+ {
+ "processor": "fetch",
+ "processor_params": {
+ "query_path": f"/attribute/{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5?limit=none",
+ "output_path": f"asset/{table.name}",
+ },
+ }
+ )
+ return exports
+
  @staticmethod
  def _export_dataset_element(
  spath: str, dpath: str, table: Table
@@ -104,6 +104,7 @@ class DatasetHistory(BaseModel):
  dataset_version: DatasetVersion
  dataset_rid: RID
  version_rid: RID
+ execution_rid: Optional[RID] = None
  description: str = ""
  minid: Optional[str] = None
  timestamp: Optional[datetime] = None
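Taken together, the 1.8.0 changes thread an optional execution_rid through dataset creation, membership changes, and version increments, so the responsible execution is recorded alongside each Dataset_Version row. A hedged usage sketch, assuming these Dataset methods are reachable from a connected DerivaML instance (ml below), with placeholder RIDs; VersionPart is the enum referenced in the diff:

    dataset_rid = "1-ABCD"       # placeholder dataset RID
    execution_rid = "1-WXYZ"     # placeholder execution RID

    # Adding members records the execution and bumps the minor version itself.
    ml.add_dataset_members(
        dataset_rid=dataset_rid,
        members=["1-1111", "1-2222"],
        description="Add two new members",
        execution_rid=execution_rid,
    )

    # An explicit version bump can also carry the execution RID; it now walks
    # the nested-dataset graph and returns the dataset's resulting version.
    new_version = ml.increment_dataset_version(
        dataset_rid,
        VersionPart.minor,
        description="Manual minor bump",
        execution_rid=execution_rid,
    )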