deriva-ml 1.6.7__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/VERSION.py +1 -1
- deriva_ml/database_model.py +23 -80
- deriva_ml/dataset.py +128 -171
- deriva_ml/dataset_aux_classes.py +1 -0
- deriva_ml/dataset_bag.py +101 -7
- deriva_ml/demo_catalog.py +93 -12
- deriva_ml/deriva_definitions.py +43 -32
- deriva_ml/deriva_ml_base.py +133 -10
- deriva_ml/deriva_model.py +98 -2
- deriva_ml/execution.py +122 -248
- deriva_ml/execution_configuration.py +3 -2
- deriva_ml/execution_environment.py +2 -0
- deriva_ml/feature.py +0 -3
- deriva_ml/history.py +1 -2
- deriva_ml/schema_setup/create_schema.py +1 -0
- deriva_ml/test_functions.py +1 -17
- deriva_ml/upload.py +1 -1
- {deriva_ml-1.6.7.dist-info → deriva_ml-1.7.0.dist-info}/METADATA +1 -1
- deriva_ml-1.7.0.dist-info/RECORD +34 -0
- {deriva_ml-1.6.7.dist-info → deriva_ml-1.7.0.dist-info}/WHEEL +1 -1
- deriva_ml-1.6.7.dist-info/RECORD +0 -34
- {deriva_ml-1.6.7.dist-info → deriva_ml-1.7.0.dist-info}/LICENSE +0 -0
- {deriva_ml-1.6.7.dist-info → deriva_ml-1.7.0.dist-info}/entry_points.txt +0 -0
- {deriva_ml-1.6.7.dist-info → deriva_ml-1.7.0.dist-info}/top_level.txt +0 -0
deriva_ml/VERSION.py
CHANGED
@@ -1 +1 @@
-__version__ = "1.6.7"
+__version__ = "1.7.0"
deriva_ml/database_model.py
CHANGED
@@ -1,14 +1,15 @@
+"""This module contains the definition of the DatabaseModel class. The role of this class is to provide an interface between the BDBag representation
+of a dataset and a SQLite database in which the contents of the bag are stored.
+"""
 import logging
 import sqlite3
 
 from csv import reader
 from pathlib import Path
-from typing import Any,
+from typing import Any, Optional
 from urllib.parse import urlparse
 
-import pandas as pd
 from deriva.core.ermrest_model import Model
-from pydantic import validate_call
 
 from .deriva_definitions import ML_SCHEMA, MLVocab, RID, DerivaMLException
 from .dataset_aux_classes import DatasetVersion, DatasetMinid
@@ -16,7 +17,21 @@ from .deriva_model import DerivaModel
 from .dataset_bag import DatasetBag
 
 
-class DatabaseModel(DerivaModel):
+class DatabaseModelMeta(type):
+    """Use metaclass to ensure that there is only one instance per path"""
+
+    _paths_loaded: dict[Path:"DatabaseModel"] = {}
+
+    def __call__(cls, *args, **kwargs):
+        logger = logging.getLogger("deriva_ml")
+        bag_path: Path = args[1]
+        if bag_path.as_posix() not in cls._paths_loaded:
+            logger.info(f"Loading {bag_path}")
+            cls._paths_loaded[bag_path] = super().__call__(*args, **kwargs)
+        return cls._paths_loaded[bag_path]
+
+
+class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
     """Read in the contents of a BDBag and create a local SQLite database.
 
     As part of its initialization, this routine will create a sqlite database that has the contents of all the tables
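The `register()` classmethod is gone in 1.7.0; instance caching now happens in the new `DatabaseModelMeta.__call__`, so constructing a `DatabaseModel` twice for the same bag path yields the same object. A minimal, self-contained sketch of that metaclass pattern (generic names, not the real deriva-ml classes):

    from pathlib import Path

    class CachedByPath(type):
        """Metaclass that returns the cached instance when the same path is seen again."""

        _instances: dict[str, object] = {}

        def __call__(cls, path: Path):
            key = path.as_posix()
            if key not in cls._instances:
                # Only here does type.__call__ run __new__/__init__ and build a real instance.
                cls._instances[key] = super().__call__(path)
            return cls._instances[key]

    class Bag(metaclass=CachedByPath):
        def __init__(self, path: Path):
            self.path = path

    assert Bag(Path("/tmp/bag1")) is Bag(Path("/tmp/bag1"))       # cached: __init__ ran once
    assert Bag(Path("/tmp/bag1")) is not Bag(Path("/tmp/bag2"))   # different path, new instance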
@@ -32,6 +47,9 @@ class DatabaseModel(DerivaModel):
     Because of nested datasets, it's possible that more than one dataset rid is in a bag, or that a dataset rid might
     appear in more than one database. To help manage this, a global list of all the datasets that have been loaded
     into DatabaseModels, is kept in the class variable `_rid_map`.
+
+    Because you can load different versions of a dataset simultaneously, the dataset RID and version number are tracked, and a new
+    SQLite instance is created for every new dataset version present.
 
     Attributes:
         bag_path (Path): path to the local copy of the BDBag
@@ -42,29 +60,9 @@ class DatabaseModel(DerivaModel):
         dataset_table (Table): the dataset table in the ERMRest model.
     """
 
-    # Keep track of what databases we have loaded.
-    _paths_loaded: dict[Path:"DatabaseModel"] = {}
-
     # Maintain a global map of RIDS to versions and databases.
     _rid_map: dict[RID, list[tuple[DatasetVersion, "DatabaseModel"]]] = {}
 
-    @classmethod
-    @validate_call
-    def register(cls, minid: DatasetMinid, bag_path: Path):
-        """Register a new minid in the list of local databases if it's new, otherwise, return an existing DatabaseModel.
-
-        Args:
-            minid: MINID to the databag that is to be loaded.
-            bag_path: Path to the bag on the local filesystem.
-
-        Returns:
-            A DatabaseModel instance to the loaded bag.
-        """
-        o = cls._paths_loaded.get(bag_path.as_posix())
-        if o:
-            return o
-        return cls(minid, bag_path)
-
     @staticmethod
     def rid_lookup(dataset_rid: RID) -> list[tuple[DatasetVersion, "DatabaseModel"]]:
         """Return a list of DatasetVersion/DatabaseModel instances corresponding to the given RID.
@@ -84,13 +82,12 @@ class DatabaseModel(DerivaModel):
         raise DerivaMLException(f"Dataset {dataset_rid} not found")
 
     def __init__(self, minid: DatasetMinid, bag_path: Path):
-        """Create a new DatabaseModel.
+        """Create a new DatabaseModel.
 
         Args:
            minid: Minid for the specified bag.
           bag_path: Path to the local copy of the BDBag.
        """
-        DatabaseModel._paths_loaded[bag_path.as_posix()] = self
 
         self.bag_path = bag_path
         self.minid = minid
@@ -342,60 +339,6 @@
         except KeyError:
             raise DerivaMLException(f'Table name "{table}" does not exist.')
 
-    def get_table(self, table: str) -> Generator[tuple, None, None]:
-        """Retrieve the contents of the specified table. If schema is not provided as part of the table name,
-        the method will attempt to locate the schema for the table.
-
-        Args:
-            table: return: A generator that yields tuples of column values.
-
-        Returns:
-            A generator that yields tuples of column values.
-
-        """
-        table_name = self.normalize_table_name(table)
-        result = self.dbase.execute(f'SELECT * FROM "{table_name}"')
-        while row := result.fetchone():
-            yield row
-
-    def get_table_as_dataframe(self, table: str) -> pd.DataFrame:
-        """Retrieve the contents of the specified table as a dataframe.
-
-
-        If schema is not provided as part of the table name,
-        the method will attempt to locate the schema for the table.
-
-        Args:
-            table: Table to retrieve data from.
-
-        Returns:
-            A dataframe containing the contents of the specified table.
-        """
-        table_name = self.normalize_table_name(table)
-        return pd.read_sql(f'SELECT * FROM "{table_name}"', con=self.dbase)
-
-    def get_table_as_dict(self, table: str) -> Generator[dict[str, Any], None, None]:
-        """Retrieve the contents of the specified table as a dictionary.
-
-        Args:
-            table: Table to retrieve data from. f schema is not provided as part of the table name,
-                the method will attempt to locate the schema for the table.
-
-        Returns:
-            A generator producing dictionaries containing the contents of the specified table as name/value pairs.
-        """
-        table_name = self.normalize_table_name(table)
-        with self.dbase:
-            col_names = [
-                c[1]
-                for c in self.dbase.execute(
-                    f'PRAGMA table_info("{table_name}")'
-                ).fetchall()
-            ]
-            result = self.dbase.execute(f'SELECT * FROM "{table_name}"')
-            while row := result.fetchone():
-                yield dict(zip(col_names, row))
-
     def delete_database(self):
         """
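The `get_table*` accessors removed above were thin wrappers over the bag's local SQLite connection; dataset_bag.py grows by roughly a hundred lines in this release, so they appear to live on the DatasetBag side now. For reference, the same reads can be reproduced with the standard sqlite3 and pandas APIs, assuming the bag's database already exists (file and table names below are illustrative, not deriva-ml API):

    import sqlite3
    import pandas as pd

    con = sqlite3.connect("dataset.db")              # illustrative path to the bag's SQLite file
    table = "my-schema:Image"                        # illustrative schema-qualified table name

    rows = con.execute(f'SELECT * FROM "{table}"').fetchall()             # like get_table()
    cols = [c[1] for c in con.execute(f'PRAGMA table_info("{table}")')]
    dicts = [dict(zip(cols, r)) for r in rows]                            # like get_table_as_dict()
    df = pd.read_sql(f'SELECT * FROM "{table}"', con=con)                 # like get_table_as_dataframe()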
deriva_ml/dataset.py
CHANGED
@@ -9,6 +9,7 @@ accessible via a DerivaML class instance.
 from bdbag.fetch.fetcher import fetch_single_file
 from bdbag import bdbag_api as bdb
 from collections import defaultdict
+
 from deriva.core.ermrest_model import Table
 from deriva.core.utils.core_utils import tag as deriva_tags, format_exception
 from deriva.transfer.download.deriva_export import DerivaExport
@@ -25,6 +26,7 @@ try:
 except ImportError:  # Graceful fallback if IceCream isn't installed.
     ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a)  # noqa
 
+from graphlib import TopologicalSorter
 import json
 import logging
 from pathlib import Path
@@ -35,7 +37,7 @@ from pydantic import (
 import requests
 
 from tempfile import TemporaryDirectory, NamedTemporaryFile
-from typing import Any, Callable, Optional, Iterable
+from typing import Any, Callable, Optional, Iterable, Iterator
 
 from deriva_ml import DatasetBag
 from .deriva_definitions import ML_SCHEMA, DerivaMLException, MLVocab, Status, RID
@@ -85,6 +87,7 @@ class Dataset:
         dataset_rid: RID,
         dataset_version: DatasetVersion,
         description: Optional[str] = "",
+        execution_rid: Optional[RID] = None,
     ) -> RID:
         schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
         version_path = schema_path.tables["Dataset_Version"]
@@ -94,6 +97,7 @@ class Dataset:
                     "Dataset": dataset_rid,
                     "Version": str(dataset_version),
                     "Description": description,
+                    "Execution": execution_rid,
                 }
             ]
         )[0]["RID"]
@@ -163,6 +167,7 @@ class Dataset:
                 dataset_rid=dataset_rid,
                 version_rid=v["RID"],
                 description=v["Description"],
+                execution_rid=v["Execution"],
             )
             for v in version_path.filter(version_path.Dataset == dataset_rid)
             .entities()
@@ -190,11 +195,30 @@ class Dataset:
         else:
             return max([h.dataset_version for h in self.dataset_history(dataset_rid)])
 
+    def _build_dataset_graph(self, dataset_rid: RID) -> Iterable[RID]:
+        ts = TopologicalSorter()
+        self._build_dataset_graph_1(dataset_rid, ts, set())
+        return ts.static_order()
+
+    def _build_dataset_graph_1(self, dataset_rid: RID, ts, visited) -> None:
+        """Use topological sort to return bottom up list of nested datasets"""
+        ts.add(dataset_rid)
+        if dataset_rid not in visited:
+            visited.add(dataset_rid)
+            children = self.list_dataset_children(dataset_rid=dataset_rid)
+            parents = self.list_dataset_parents(dataset_rid=dataset_rid)
+            for parent in parents:
+                self._build_dataset_graph_1(parent, ts, visited)
+            for child in children:
+                self._build_dataset_graph_1(child, ts, visited)
+
     @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
     def increment_dataset_version(
         self,
         dataset_rid: RID,
         component: VersionPart,
         description: Optional[str] = "",
+        execution_rid: Optional[RID] = None,
     ) -> DatasetVersion:
         """Increment the version of the specified dataset_table.
 
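`increment_dataset_version` now walks the entire nested-dataset graph, built with `graphlib.TopologicalSorter` from the standard library (Python 3.9+). A self-contained illustration of the ordering primitive being used here, with made-up dataset names:

    from graphlib import TopologicalSorter

    # "root" contains "child-a" and "child-b"; "child-a" contains "leaf".
    # add(node, *predecessors): a node is emitted only after all of its predecessors.
    ts = TopologicalSorter()
    ts.add("root", "child-a", "child-b")
    ts.add("child-a", "leaf")
    print(list(ts.static_order()))   # one valid order: ['child-b', 'leaf', 'child-a', 'root']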
@@ -204,6 +228,7 @@ class Dataset:
             dataset_rid: RID of the dataset whose version is to be incremented.
             component: Major, Minor or Patch
             description: Description of the version update of the dataset_table.
+            execution_rid: Which execution is performing increment.
 
         Returns:
             new semantic version of the dataset_table as a 3-tuple
@@ -211,16 +236,16 @@ class Dataset:
         Raises:
             DerivaMLException: if provided RID is not to a dataset_table.
         """
-        for
-        self.
-
-
-
+        for dataset in self._build_dataset_graph(dataset_rid=dataset_rid):
+            version = self.dataset_version(dataset)
+            new_version = version.increment_version(component)
+            self._insert_dataset_version(
+                dataset,
+                new_version,
+                description=description,
+                execution_rid=execution_rid,
             )
-
-        new_version = version.increment_version(component)
-        self._insert_dataset_version(dataset_rid, new_version, description=description)
-        return new_version
+        return self.dataset_version(dataset_rid)
 
     @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
     def create_dataset(
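The rewritten body bumps the version of every dataset reachable in that graph rather than only the RID that was passed in. The version arithmetic itself is ordinary semantic versioning; a simplified, self-contained sketch of what `increment_version(component)` amounts to (these types stand in for the real DatasetVersion/VersionPart classes):

    from dataclasses import dataclass
    from enum import Enum

    class Part(Enum):
        major = "major"
        minor = "minor"
        patch = "patch"

    @dataclass(frozen=True)
    class SemVer:
        major: int
        minor: int
        patch: int

        def increment(self, part: Part) -> "SemVer":
            if part is Part.major:
                return SemVer(self.major + 1, 0, 0)
            if part is Part.minor:
                return SemVer(self.major, self.minor + 1, 0)
            return SemVer(self.major, self.minor, self.patch + 1)

    assert SemVer(1, 6, 7).increment(Part.minor) == SemVer(1, 7, 0)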
@@ -297,7 +322,12 @@ class Dataset:
         pb.schemas[self._ml_schema].Dataset_Execution.insert(
             [{"Dataset": dataset_rid, "Execution": execution_rid}]
         )
-        self._insert_dataset_version(
+        self._insert_dataset_version(
+            dataset_rid,
+            dataset_version=version,
+            execution_rid=execution_rid,
+            description="Initial dataset creation.",
+        )
         return dataset_rid
 
     @validate_call
@@ -485,6 +515,7 @@ class Dataset:
         members: list[RID],
         validate: bool = True,
         description: Optional[str] = "",
+        execution_rid: Optional[RID] = None,
     ) -> None:
         """Add additional elements to an existing dataset_table.
 
@@ -496,6 +527,7 @@ class Dataset:
             members: List of RIDs of members to add to the dataset_table.
             validate: Check rid_list to make sure elements are not already in the dataset_table.
             description: Markdown description of the updated dataset.
+            execution_rid: Optional RID of execution associated with this dataset.
         """
         members = set(members)
         description = description or "Updated dataset via add_dataset_members"
@@ -559,12 +591,19 @@ class Dataset:
             [{"Dataset": dataset_rid, fk_column: e} for e in elements]
         )
         self.increment_dataset_version(
-            dataset_rid,
+            dataset_rid,
+            VersionPart.minor,
+            description=description,
+            execution_rid=execution_rid,
         )
 
     @validate_call
     def delete_dataset_members(
-        self,
+        self,
+        dataset_rid: RID,
+        members: list[RID],
+        description: str = "",
+        execution_rid: Optional[RID] = None,
     ) -> None:
         """Remove elements to an existing dataset_table.
 
@@ -575,6 +614,7 @@ class Dataset:
             dataset_rid: RID of dataset_table to extend or None if new dataset_table is to be created.
             members: List of RIDs of members to add to the dataset_table.
             description: Markdown description of the updated dataset.
+            execution_rid: Optional RID of execution associated with this operation.
         """
 
         members = set(members)
@@ -616,7 +656,10 @@ class Dataset:
         )
         entity.delete()
         self.increment_dataset_version(
-            dataset_rid,
+            dataset_rid,
+            VersionPart.minor,
+            description=description,
+            execution_rid=execution_rid,
         )
 
     @validate_call
@@ -663,44 +706,6 @@ class Dataset:
             children.extend(self.list_dataset_children(child, recurse=recurse))
         return children
 
-    @staticmethod
-    def _download_dataset_element(
-        spath: str, dpath: str, table: Table
-    ) -> list[dict[str, Any]]:
-        """Return the download specification for the data object indicated by a path through the data model.
-
-        Args:
-            spath: Source path
-            dpath: Destination path
-            table: Table referenced to by the path
-
-        Returns:
-            The download specification that will retrieve that data from the catalog and place it into a BDBag.
-        """
-        exports = [
-            {
-                "processor": "csv",
-                "processor_params": {
-                    "query_path": f"/entity/{spath}?limit=none",
-                    "output_path": dpath,
-                },
-            }
-        ]
-
-        # If this table is an asset table, then we need to output the files associated with the asset.
-        asset_columns = {"Filename", "URL", "Length", "MD5", "Description"}
-        if asset_columns.issubset({c.name for c in table.columns}):
-            exports.append(
-                {
-                    "processor": "fetch",
-                    "processor_params": {
-                        "query_path": f"/attribute/{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5?limit=none",
-                        "output_path": f"asset/{table.name}",
-                    },
-                }
-            )
-        return exports
-
     def _vocabulary_specification(
         self, writer: Callable[[str, str, Table], list[dict[str, Any]]]
     ) -> list[dict[str, Any]]:
@@ -724,82 +729,38 @@ class Dataset:
             for o in writer(f"{table.schema.name}:{table.name}", table.name, table)
         ]
 
-    def _domain_table_paths(
-        self,
-        graph: dict[Table, list[dict[Table, Any]]],
-        spath: str = None,
-        dpath: str = None,
-        sprefix: str = "deriva-ml:Dataset/RID={Dataset_RID}",
-        dprefix: str = "Dataset",
-        nested: bool = False,
-    ) -> list[tuple[str, str, Table]]:
-        """Recursively walk over the domain schema graph and extend the current path.
-
-        Args:
-            graph: An undirected, acyclic graph of schema. Represented as a dictionary whose name is the table name.
-                and whose values are the child nodes of the table.
-            spath: Source path so far
-            dpath: Destination path so far
-            sprefix: Initial path to be included. Allows for nested datasets
-            dprefix: Initial path to be included. Allows for nested datasets
-            nested: If true, skip initial data segment.
-
-        Returns:
-            A list of all the paths through the graph. Each path is a list of tables.
+    def _table_paths(self) -> Iterator[tuple[list[str], list[str], list[Table]]]:
 
-        ""
-
-
-        paths = []
-        for node, children in graph.items():
-            if node.name == "Dataset":
-                paths.append(
-                    (
-                        f"{sprefix}/(RID)=({self._ml_schema}:Dataset_Version:Dataset)",
-                        f"{dprefix}/Dataset_Version",
-                        self._model.schemas[self._ml_schema].tables["Dataset_Version"],
-                    )
-                )
-            new_spath = sprefix
-            new_dpath = dprefix
-
-            if not nested:
-                paths.append((new_spath, new_dpath, node))
-            else:
-                new_spath = source_path + f"/{node.schema.name}:{node.name}"
-                new_dpath = dest_path + f"/{node.name}"
-                paths.append((new_spath, new_dpath, node))
-            for child in children:
-                paths.extend(
-                    self._domain_table_paths(child, new_spath, new_dpath, nested=nested)
-                )
-        return paths
+        dataset_dataset = self._model.schemas[self._ml_schema].tables["Dataset_Dataset"]
+        paths = self._model._schema_to_paths()
+        nested_paths = paths
 
-    def _table_paths(self, graph) -> list[tuple[str, str, Table]]:
-        sprefix = "deriva-ml:Dataset/RID={Dataset_RID}"
-        dprefix = "Dataset"
-        dataset_dataset_table = self._model.schemas[self._ml_schema].tables[
-            "Dataset_Dataset"
-        ]
-        table_paths = self._domain_table_paths(
-            graph=graph, sprefix=sprefix, dprefix=dprefix
-        )
-        nested_sprefix = sprefix
-        nested_dprefix = dprefix
         for i in range(self._dataset_nesting_depth()):
-
-
-
-
-
-
-
-
-
-
-
-
+            if i == 0:
+                paths.extend([[self.dataset_table, dataset_dataset]])
+            nested_paths = [
+                [self.dataset_table, dataset_dataset] + p for p in nested_paths
+            ]
+            paths.extend(nested_paths)
+
+        def source_path(path):
+            p = [f"{self._model.ml_schema}:Dataset/RID={{Dataset_RID}}"]
+            for table in path[1:]:
+                if table == dataset_dataset:
+                    p.append(f"(RID)=(deriva-ml:Dataset_Dataset:Dataset)")
+                elif table == self.dataset_table:
+                    p.append(f"(Nested_Dataset)=(deriva-ml:Dataset:RID)")
+                elif table.name == "Dataset_Version":
+                    p.append(f"(RID)=({self._model.ml_schema}:Dataset_Version:Dataset)")
+                else:
+                    p.append(f"{table.schema.name}:{table.name}")
+            return p
+
+        src_paths = ["/".join(source_path(p)) for p in paths]
+        dest_paths = ["/".join([t.name for t in p]) for p in paths]
+        target_tables = [p[-1] for p in paths]
+
+        return zip(src_paths, dest_paths, target_tables)
 
     def _dataset_nesting_depth(self):
         """Determine the maximum dataset nesting depth in the current catalog.
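The new `_table_paths` derives its paths from the model (`_schema_to_paths`, part of the deriva_model.py changes in this release) and then splices in one `Dataset` to `Dataset_Dataset` hop per level of nesting. A toy version of that expansion step, using plain strings instead of ERMRest Table objects (all names illustrative):

    # Two base paths out of Dataset, plus the Dataset -> Dataset_Dataset association hop.
    base_paths = [["Dataset", "Image"], ["Dataset", "Subject"]]
    link = ["Dataset", "Dataset_Dataset"]

    paths, nested = list(base_paths), list(base_paths)
    for depth in range(2):          # pretend the catalog nests datasets two levels deep
        if depth == 0:
            paths.append(link)
        nested = [link + p for p in nested]
        paths.extend(nested)

    for p in paths:
        print("/".join(p))          # e.g. Dataset/Dataset_Dataset/Dataset/Image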
@@ -811,6 +772,7 @@ class Dataset:
         def children_depth(
             dataset_rid: RID, nested_datasets: dict[RID, list[RID]]
         ) -> int:
+            """Return the number of nested datasets in the current catalog"""
             try:
                 children = nested_datasets[dataset_rid]
                 return (
@@ -836,50 +798,6 @@ class Dataset:
             else 0
         )
 
-    def _schema_graph(
-        self, node: Table, visited_nodes: Optional[set] = None
-    ) -> dict[Table, list[dict[Table, list]]]:
-        """Generate an undirected, acyclic graph of domain schema. We do this by traversing the schema foreign key
-        relationships. We stop when we hit the deriva-ml schema or when we reach a node that we have already seen.
-
-        Nested datasets need to be unfolded
-
-        Args:
-            node: Current (starting) node in the graph.
-            visited_nodes: param nested_dataset: Are we in a nested dataset_table, (i.e. have we seen the DataSet table)?
-
-        Returns:
-            Graph of the schema, starting from node.
-        """
-
-        visited_nodes = visited_nodes or set()
-        graph = {node: []}
-
-        def include_node(child: Table) -> bool:
-            """Indicate if the table should be included in the graph.
-
-            Include node in the graph if it's not a loopback from fk<-> referred_by, you have not already been to the
-            node.
-            """
-            return (
-                child != node
-                and child not in visited_nodes
-                and child.schema.name == self._model.domain_schema
-            )
-
-        # Get all the tables reachable from the end of the path avoiding loops from T1<->T2 via referenced_by
-        nodes = {fk.pk_table for fk in node.foreign_keys if include_node(fk.pk_table)}
-        nodes |= {fk.table for fk in node.referenced_by if include_node(fk.table)}
-        for t in nodes:
-            new_visited_nodes = visited_nodes.copy()
-            new_visited_nodes.add(t)
-            if self._model.is_vocabulary(t):
-                # If the end of the path is a vocabulary table, we are at a terminal node in the ERD, so stop
-                continue
-            # Get all the paths that extend the current path
-            graph[node].append(self._schema_graph(t, new_visited_nodes))
-        return graph
-
     def _dataset_specification(
         self, writer: Callable[[str, str, Table], list[dict[str, Any]]]
     ) -> list[dict[str, Any]]:
@@ -921,7 +839,7 @@ class Dataset:
             A dataset_table specification.
         """
         element_spec = []
-        for path in self._table_paths(
+        for path in self._table_paths():
             element_spec.extend(writer(*path))
         return self._vocabulary_specification(writer) + element_spec
 
@@ -953,7 +871,7 @@ class Dataset:
             if dataset.materialize
             else self._download_dataset_bag(minid)
         )
-        return DatabaseModel
+        return DatabaseModel(minid, bag_path).get_dataset()
 
     def _version_snapshot(self, dataset: DatasetSpec) -> str:
         version_record = [
@@ -1089,6 +1007,7 @@ class Dataset:
         """
 
         def update_status(status: Status, msg: str) -> None:
+            """Update the current status for this execution in the catalog"""
             self._model.catalog.getPathBuilder().schemas[
                 self._ml_schema
             ].Execution.update(
@@ -1196,6 +1115,44 @@ class Dataset:
             }
         ] + self._dataset_specification(writer)
 
+    @staticmethod
+    def _download_dataset_element(
+        spath: str, dpath: str, table: Table
+    ) -> list[dict[str, Any]]:
+        """Return the download specification for the data object indicated by a path through the data model.
+
+        Args:
+            spath: Source path
+            dpath: Destination path
+            table: Table referenced to by the path
+
+        Returns:
+            The download specification that will retrieve that data from the catalog and place it into a BDBag.
+        """
+        exports = [
+            {
+                "processor": "csv",
+                "processor_params": {
+                    "query_path": f"/entity/{spath}?limit=none",
+                    "output_path": dpath,
+                },
+            }
+        ]
+
+        # If this table is an asset table, then we need to output the files associated with the asset.
+        asset_columns = {"Filename", "URL", "Length", "MD5", "Description"}
+        if asset_columns.issubset({c.name for c in table.columns}):
+            exports.append(
+                {
+                    "processor": "fetch",
+                    "processor_params": {
+                        "query_path": f"/attribute/{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5?limit=none",
+                        "output_path": f"asset/{table.name}",
+                    },
+                }
+            )
+        return exports
+
     @staticmethod
     def _export_dataset_element(
         spath: str, dpath: str, table: Table
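`_download_dataset_element` itself is unchanged; it has simply moved later in the file. For a hypothetical asset table `my-schema:Image` reached directly from the dataset (the source path and table are made up for illustration), the specification it builds contains one csv processor plus one fetch processor for the asset files:

    exports = [
        {
            "processor": "csv",
            "processor_params": {
                "query_path": "/entity/deriva-ml:Dataset/RID={Dataset_RID}/my-schema:Image?limit=none",
                "output_path": "Dataset/Image",
            },
        },
        {   # emitted only because Image carries the Filename/URL/Length/MD5/Description asset columns
            "processor": "fetch",
            "processor_params": {
                "query_path": "/attribute/deriva-ml:Dataset/RID={Dataset_RID}/my-schema:Image"
                              "/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5?limit=none",
                "output_path": "asset/Image",
            },
        },
    ]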
deriva_ml/dataset_aux_classes.py
CHANGED