deriva-ml 1.6.8__tar.gz → 1.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. {deriva_ml-1.6.8/src/deriva_ml.egg-info → deriva_ml-1.7.0}/PKG-INFO +1 -1
  2. deriva_ml-1.7.0/src/deriva_ml/VERSION.py +1 -0
  3. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/src/deriva_ml/database_model.py +23 -80
  4. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/src/deriva_ml/dataset.py +128 -171
  5. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/src/deriva_ml/dataset_aux_classes.py +1 -0
  6. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/src/deriva_ml/dataset_bag.py +101 -7
  7. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/src/deriva_ml/demo_catalog.py +93 -12
  8. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/src/deriva_ml/deriva_definitions.py +39 -31
  9. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/src/deriva_ml/deriva_ml_base.py +39 -7
  10. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/src/deriva_ml/deriva_model.py +98 -2
  11. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/src/deriva_ml/execution.py +58 -3
  12. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/src/deriva_ml/execution_configuration.py +2 -1
  13. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/src/deriva_ml/execution_environment.py +2 -0
  14. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/src/deriva_ml/feature.py +0 -3
  15. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/src/deriva_ml/history.py +1 -2
  16. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/src/deriva_ml/schema_setup/create_schema.py +1 -0
  17. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/src/deriva_ml/test_functions.py +1 -17
  18. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/src/deriva_ml/upload.py +1 -1
  19. {deriva_ml-1.6.8 → deriva_ml-1.7.0/src/deriva_ml.egg-info}/PKG-INFO +1 -1
  20. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/tests/test_dataset.py +107 -17
  21. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/tests/test_download.py +9 -0
  22. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/tests/test_features.py +15 -4
  23. deriva_ml-1.6.8/src/deriva_ml/VERSION.py +0 -1
  24. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/LICENSE +0 -0
  25. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/README.md +0 -0
  26. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/pyproject.toml +0 -0
  27. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/setup.cfg +0 -0
  28. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/src/deriva_ml/__init__.py +0 -0
  29. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/src/deriva_ml/build/lib/schema_setup/__init__.py +0 -0
  30. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/src/deriva_ml/build/lib/schema_setup/alter_annotation.py +0 -0
  31. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/src/deriva_ml/build/lib/schema_setup/annotation_temp.py +0 -0
  32. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/src/deriva_ml/build/lib/schema_setup/create_schema.py +0 -0
  33. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/src/deriva_ml/build/lib/schema_setup/table_comments_utils.py +0 -0
  34. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/src/deriva_ml/schema_setup/__init__.py +0 -0
  35. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/src/deriva_ml/schema_setup/alter_annotation.py +0 -0
  36. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/src/deriva_ml/schema_setup/annotations.py +0 -0
  37. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/src/deriva_ml/schema_setup/policy.json +0 -0
  38. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/src/deriva_ml/schema_setup/table_comments_utils.py +0 -0
  39. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/src/deriva_ml.egg-info/SOURCES.txt +0 -0
  40. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/src/deriva_ml.egg-info/dependency_links.txt +0 -0
  41. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/src/deriva_ml.egg-info/entry_points.txt +0 -0
  42. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/src/deriva_ml.egg-info/requires.txt +0 -0
  43. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/src/deriva_ml.egg-info/top_level.txt +0 -0
  44. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/tests/test_basic_tables.py +0 -0
  45. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/tests/test_execution.py +0 -0
  46. {deriva_ml-1.6.8 → deriva_ml-1.7.0}/tests/test_upload.py +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: deriva-ml
- Version: 1.6.8
+ Version: 1.7.0
  Summary: Utilities to simplify use of Deriva and Pandas to create reproducible ML pipelines
  Author-email: ISRD <isrd-dev@isi.edu>
  Requires-Python: >=3.10
src/deriva_ml/VERSION.py

@@ -0,0 +1 @@
+ __version__ = "1.7.0"
src/deriva_ml/database_model.py

@@ -1,14 +1,15 @@
+ """This module contains the definition of the DatabaseModel class. The role of this class is to provide an
+ interface between the BDBag representation of a dataset and a SQLite database in which the contents of the
+ bag are stored.
+ """
  import logging
  import sqlite3

  from csv import reader
  from pathlib import Path
- from typing import Any, Generator, Optional
+ from typing import Any, Optional
  from urllib.parse import urlparse

- import pandas as pd
  from deriva.core.ermrest_model import Model
- from pydantic import validate_call

  from .deriva_definitions import ML_SCHEMA, MLVocab, RID, DerivaMLException
  from .dataset_aux_classes import DatasetVersion, DatasetMinid
@@ -16,7 +17,21 @@ from .deriva_model import DerivaModel
  from .dataset_bag import DatasetBag


- class DatabaseModel(DerivaModel):
+ class DatabaseModelMeta(type):
+     """Use a metaclass to ensure that there is only one instance per path."""
+
+     _paths_loaded: dict[Path, "DatabaseModel"] = {}
+
+     def __call__(cls, *args, **kwargs):
+         logger = logging.getLogger("deriva_ml")
+         bag_path: Path = args[1]
+         if bag_path not in cls._paths_loaded:
+             logger.info(f"Loading {bag_path}")
+             cls._paths_loaded[bag_path] = super().__call__(*args, **kwargs)
+         return cls._paths_loaded[bag_path]
+
+
+ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
      """Read in the contents of a BDBag and create a local SQLite database.

      As part of its initialization, this routine will create a sqlite database that has the contents of all the tables
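
The one-instance-per-path bookkeeping that 1.6.8 handled through an explicit `register()` classmethod (removed further down) now happens transparently in `DatabaseModelMeta.__call__`, so constructing `DatabaseModel(minid, bag_path)` twice for the same bag returns the same object. A minimal, standalone sketch of the pattern, using hypothetical `OnePerPathMeta`/`BagDatabase` names rather than the deriva-ml classes:

```python
from pathlib import Path


class OnePerPathMeta(type):
    """Metaclass that caches instances keyed by the path they were built from."""

    _instances: dict[str, object] = {}

    def __call__(cls, path: Path):
        key = path.as_posix()
        if key not in cls._instances:
            # First time we see this path: actually construct the instance.
            cls._instances[key] = super().__call__(path)
        return cls._instances[key]


class BagDatabase(metaclass=OnePerPathMeta):
    def __init__(self, path: Path):
        self.path = path


# Both calls hit the same cache slot, so only one instance is ever built.
assert BagDatabase(Path("/tmp/bag")) is BagDatabase(Path("/tmp/bag"))
```

Putting the cache in the metaclass keeps every call site honest: there is no un-memoized constructor left to call by mistake.
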
@@ -32,6 +47,9 @@ class DatabaseModel(DerivaModel):
      Because of nested datasets, it's possible that more than one dataset RID is in a bag, or that a dataset RID might
      appear in more than one database. To help manage this, a global list of all the datasets that have been loaded
      into DatabaseModels is kept in the class variable `_rid_map`.
+
+     Because you can load different versions of a dataset simultaneously, the dataset RID and version number are
+     tracked, and a new SQLite instance is created for every new dataset version present.

      Attributes:
          bag_path (Path): path to the local copy of the BDBag
@@ -42,29 +60,9 @@
          dataset_table (Table): the dataset table in the ERMRest model.
      """

-     # Keep track of what databases we have loaded.
-     _paths_loaded: dict[Path, "DatabaseModel"] = {}
-
      # Maintain a global map of RIDs to versions and databases.
      _rid_map: dict[RID, list[tuple[DatasetVersion, "DatabaseModel"]]] = {}

-     @classmethod
-     @validate_call
-     def register(cls, minid: DatasetMinid, bag_path: Path):
-         """Register a new minid in the list of local databases if it's new; otherwise, return an existing DatabaseModel.
-
-         Args:
-             minid: MINID to the databag that is to be loaded.
-             bag_path: Path to the bag on the local filesystem.
-
-         Returns:
-             A DatabaseModel instance for the loaded bag.
-         """
-         o = cls._paths_loaded.get(bag_path.as_posix())
-         if o:
-             return o
-         return cls(minid, bag_path)
-
      @staticmethod
      def rid_lookup(dataset_rid: RID) -> list[tuple[DatasetVersion, "DatabaseModel"]]:
          """Return a list of DatasetVersion/DatabaseModel instances corresponding to the given RID.
@@ -84,13 +82,12 @@
              raise DerivaMLException(f"Dataset {dataset_rid} not found")

      def __init__(self, minid: DatasetMinid, bag_path: Path):
-         """Create a new DatabaseModel. This should only be called via the static register method.
+         """Create a new DatabaseModel.

          Args:
              minid: Minid for the specified bag.
              bag_path: Path to the local copy of the BDBag.
          """
-         DatabaseModel._paths_loaded[bag_path.as_posix()] = self

          self.bag_path = bag_path
          self.minid = minid
@@ -342,60 +339,6 @@
          except KeyError:
              raise DerivaMLException(f'Table name "{table}" does not exist.')

-     def get_table(self, table: str) -> Generator[tuple, None, None]:
-         """Retrieve the contents of the specified table. If schema is not provided as part of the table name,
-         the method will attempt to locate the schema for the table.
-
-         Args:
-             table: Table to retrieve data from.
-
-         Returns:
-             A generator that yields tuples of column values.
-         """
-         table_name = self.normalize_table_name(table)
-         result = self.dbase.execute(f'SELECT * FROM "{table_name}"')
-         while row := result.fetchone():
-             yield row
-
-     def get_table_as_dataframe(self, table: str) -> pd.DataFrame:
-         """Retrieve the contents of the specified table as a dataframe.
-
-         If schema is not provided as part of the table name,
-         the method will attempt to locate the schema for the table.
-
-         Args:
-             table: Table to retrieve data from.
-
-         Returns:
-             A dataframe containing the contents of the specified table.
-         """
-         table_name = self.normalize_table_name(table)
-         return pd.read_sql(f'SELECT * FROM "{table_name}"', con=self.dbase)
-
-     def get_table_as_dict(self, table: str) -> Generator[dict[str, Any], None, None]:
-         """Retrieve the contents of the specified table as a dictionary.
-
-         Args:
-             table: Table to retrieve data from. If schema is not provided as part of the table name,
-                 the method will attempt to locate the schema for the table.
-
-         Returns:
-             A generator producing dictionaries containing the contents of the specified table as name/value pairs.
-         """
-         table_name = self.normalize_table_name(table)
-         with self.dbase:
-             col_names = [
-                 c[1]
-                 for c in self.dbase.execute(
-                     f'PRAGMA table_info("{table_name}")'
-                 ).fetchall()
-             ]
-             result = self.dbase.execute(f'SELECT * FROM "{table_name}"')
-             while row := result.fetchone():
-                 yield dict(zip(col_names, row))
-
      def delete_database(self):
          """
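
The removed `get_table`, `get_table_as_dataframe`, and `get_table_as_dict` accessors are not gone from the package: judging by the +101 lines on dataset_bag.py in the file list, they appear to have moved to `DatasetBag`, although that file's diff is not shown here. For reference, the sqlite3 idiom the dict accessor was built on, reduced to a self-contained sketch with a hypothetical in-memory table:

```python
import sqlite3
from typing import Any, Generator


def rows_as_dicts(db: sqlite3.Connection, table_name: str) -> Generator[dict[str, Any], None, None]:
    # PRAGMA table_info returns one row per column; index 1 holds the column name.
    col_names = [c[1] for c in db.execute(f'PRAGMA table_info("{table_name}")').fetchall()]
    result = db.execute(f'SELECT * FROM "{table_name}"')
    # fetchone() returns None when the cursor is exhausted, ending the loop.
    while row := result.fetchone():
        yield dict(zip(col_names, row))


db = sqlite3.connect(":memory:")
db.execute("CREATE TABLE Dataset (RID TEXT, Description TEXT)")
db.execute("INSERT INTO Dataset VALUES ('1-abc', 'demo')")
print(list(rows_as_dicts(db, "Dataset")))  # [{'RID': '1-abc', 'Description': 'demo'}]
```
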
src/deriva_ml/dataset.py

@@ -9,6 +9,7 @@ accessible via a DerivaML class instance.
  from bdbag.fetch.fetcher import fetch_single_file
  from bdbag import bdbag_api as bdb
  from collections import defaultdict
+
  from deriva.core.ermrest_model import Table
  from deriva.core.utils.core_utils import tag as deriva_tags, format_exception
  from deriva.transfer.download.deriva_export import DerivaExport
@@ -25,6 +26,7 @@ try:
  except ImportError:  # Graceful fallback if IceCream isn't installed.
      ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a)  # noqa

+ from graphlib import TopologicalSorter
  import json
  import logging
  from pathlib import Path
@@ -35,7 +37,7 @@ from pydantic import (
  import requests

  from tempfile import TemporaryDirectory, NamedTemporaryFile
- from typing import Any, Callable, Optional, Iterable
+ from typing import Any, Callable, Optional, Iterable, Iterator

  from deriva_ml import DatasetBag
  from .deriva_definitions import ML_SCHEMA, DerivaMLException, MLVocab, Status, RID
@@ -85,6 +87,7 @@ class Dataset:
          dataset_rid: RID,
          dataset_version: DatasetVersion,
          description: Optional[str] = "",
+         execution_rid: Optional[RID] = None,
      ) -> RID:
          schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
          version_path = schema_path.tables["Dataset_Version"]
@@ -94,6 +97,7 @@
                      "Dataset": dataset_rid,
                      "Version": str(dataset_version),
                      "Description": description,
+                     "Execution": execution_rid,
                  }
              ]
          )[0]["RID"]
@@ -163,6 +167,7 @@
                  dataset_rid=dataset_rid,
                  version_rid=v["RID"],
                  description=v["Description"],
+                 execution_rid=v["Execution"],
              )
              for v in version_path.filter(version_path.Dataset == dataset_rid)
              .entities()
@@ -190,11 +195,30 @@
          else:
              return max([h.dataset_version for h in self.dataset_history(dataset_rid)])

+     def _build_dataset_graph(self, dataset_rid: RID) -> Iterable[RID]:
+         ts = TopologicalSorter()
+         self._build_dataset_graph_1(dataset_rid, ts, set())
+         return ts.static_order()
+
+     def _build_dataset_graph_1(self, dataset_rid: RID, ts, visited) -> None:
+         """Use a topological sort to return a bottom-up list of nested datasets."""
+         ts.add(dataset_rid)
+         if dataset_rid not in visited:
+             visited.add(dataset_rid)
+             children = self.list_dataset_children(dataset_rid=dataset_rid)
+             parents = self.list_dataset_parents(dataset_rid=dataset_rid)
+             for parent in parents:
+                 self._build_dataset_graph_1(parent, ts, visited)
+             for child in children:
+                 self._build_dataset_graph_1(child, ts, visited)
+
+     @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
      def increment_dataset_version(
          self,
          dataset_rid: RID,
          component: VersionPart,
          description: Optional[str] = "",
+         execution_rid: Optional[RID] = None,
      ) -> DatasetVersion:
          """Increment the version of the specified dataset_table.

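
`_build_dataset_graph` delegates ordering to `graphlib.TopologicalSorter` (standard library since Python 3.9; imported above). A minimal sketch of that API with illustrative RIDs: `add(node, *predecessors)` records edges, and `static_order()` yields every node only after all of its predecessors:

```python
from graphlib import TopologicalSorter

ts = TopologicalSorter()
# Register each dataset with its children as predecessors, so that
# static_order() yields the leaves before the datasets that contain them.
ts.add("parent-rid", "child-a", "child-b")
ts.add("child-a", "grandchild-rid")

print(list(ts.static_order()))
# One valid order: ['child-b', 'grandchild-rid', 'child-a', 'parent-rid']
```
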
@@ -204,6 +228,7 @@
              dataset_rid: RID of the dataset whose version is to be incremented.
              component: Major, Minor or Patch.
              description: Description of the version update of the dataset_table.
+             execution_rid: Which execution is performing the increment.

          Returns:
              The new semantic version of the dataset_table as a 3-tuple.
@@ -211,16 +236,16 @@
          Raises:
              DerivaMLException: if the provided RID is not to a dataset_table.
          """
-         for ds in self.list_dataset_children(dataset_rid):
-             self.increment_dataset_version(
-                 ds,
-                 component,
-                 description=f"Increment version of nested dataset: {description}",
+         for dataset in self._build_dataset_graph(dataset_rid=dataset_rid):
+             version = self.dataset_version(dataset)
+             new_version = version.increment_version(component)
+             self._insert_dataset_version(
+                 dataset,
+                 new_version,
+                 description=description,
+                 execution_rid=execution_rid,
              )
-         version = self.dataset_version(dataset_rid)
-         new_version = version.increment_version(component)
-         self._insert_dataset_version(dataset_rid, new_version, description=description)
-         return new_version
+         return self.dataset_version(dataset_rid)

      @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
      def create_dataset(
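
With the graph in hand, the rewritten loop above bumps every dataset in the connected component exactly once and records each bump through `_insert_dataset_version`, rather than recursing through children and re-incrementing as 1.6.8 did. The version arithmetic itself is conventional semantic versioning; a sketch with minimal stand-ins for `DatasetVersion` and `VersionPart` (hypothetical, not the pydantic-based deriva-ml classes):

```python
from dataclasses import dataclass
from enum import Enum


class VersionPart(Enum):
    major = "major"
    minor = "minor"
    patch = "patch"


@dataclass(frozen=True)
class Version:
    major: int
    minor: int
    patch: int

    def increment_version(self, part: "VersionPart") -> "Version":
        # Bumping a part resets every lower-order part to zero.
        if part is VersionPart.major:
            return Version(self.major + 1, 0, 0)
        if part is VersionPart.minor:
            return Version(self.major, self.minor + 1, 0)
        return Version(self.major, self.minor, self.patch + 1)


# Mirrors this release's own bump: a minor increment of 1.6.8 yields 1.7.0.
assert Version(1, 6, 8).increment_version(VersionPart.minor) == Version(1, 7, 0)
```
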
@@ -297,7 +322,12 @@
          pb.schemas[self._ml_schema].Dataset_Execution.insert(
              [{"Dataset": dataset_rid, "Execution": execution_rid}]
          )
-         self._insert_dataset_version(dataset_rid, version)
+         self._insert_dataset_version(
+             dataset_rid,
+             dataset_version=version,
+             execution_rid=execution_rid,
+             description="Initial dataset creation.",
+         )
          return dataset_rid

      @validate_call
@@ -485,6 +515,7 @@
          members: list[RID],
          validate: bool = True,
          description: Optional[str] = "",
+         execution_rid: Optional[RID] = None,
      ) -> None:
          """Add additional elements to an existing dataset_table.
@@ -496,6 +527,7 @@
              members: List of RIDs of members to add to the dataset_table.
              validate: Check rid_list to make sure elements are not already in the dataset_table.
              description: Markdown description of the updated dataset.
+             execution_rid: Optional RID of the execution associated with this dataset.
          """
          members = set(members)
          description = description or "Updated dataset via add_dataset_members"
@@ -559,12 +591,19 @@
              [{"Dataset": dataset_rid, fk_column: e} for e in elements]
          )
          self.increment_dataset_version(
-             dataset_rid, VersionPart.minor, description=description
+             dataset_rid,
+             VersionPart.minor,
+             description=description,
+             execution_rid=execution_rid,
          )

      @validate_call
      def delete_dataset_members(
-         self, dataset_rid: RID, members: list[RID], description=""
+         self,
+         dataset_rid: RID,
+         members: list[RID],
+         description: str = "",
+         execution_rid: Optional[RID] = None,
      ) -> None:
          """Remove elements from an existing dataset_table.

@@ -575,6 +614,7 @@
              dataset_rid: RID of the dataset_table to modify.
              members: List of RIDs of members to remove from the dataset_table.
              description: Markdown description of the updated dataset.
+             execution_rid: Optional RID of the execution associated with this operation.
          """

          members = set(members)
@@ -616,7 +656,10 @@
              )
              entity.delete()
          self.increment_dataset_version(
-             dataset_rid, VersionPart.minor, description=description
+             dataset_rid,
+             VersionPart.minor,
+             description=description,
+             execution_rid=execution_rid,
          )

      @validate_call
@@ -663,44 +706,6 @@
              children.extend(self.list_dataset_children(child, recurse=recurse))
          return children

-     @staticmethod
-     def _download_dataset_element(
-         spath: str, dpath: str, table: Table
-     ) -> list[dict[str, Any]]:
-         """Return the download specification for the data object indicated by a path through the data model.
-
-         Args:
-             spath: Source path
-             dpath: Destination path
-             table: Table referenced by the path
-
-         Returns:
-             The download specification that will retrieve that data from the catalog and place it into a BDBag.
-         """
-         exports = [
-             {
-                 "processor": "csv",
-                 "processor_params": {
-                     "query_path": f"/entity/{spath}?limit=none",
-                     "output_path": dpath,
-                 },
-             }
-         ]
-
-         # If this table is an asset table, then we need to output the files associated with the asset.
-         asset_columns = {"Filename", "URL", "Length", "MD5", "Description"}
-         if asset_columns.issubset({c.name for c in table.columns}):
-             exports.append(
-                 {
-                     "processor": "fetch",
-                     "processor_params": {
-                         "query_path": f"/attribute/{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5?limit=none",
-                         "output_path": f"asset/{table.name}",
-                     },
-                 }
-             )
-         return exports
-
      def _vocabulary_specification(
          self, writer: Callable[[str, str, Table], list[dict[str, Any]]]
      ) -> list[dict[str, Any]]:
@@ -724,82 +729,38 @@
              for o in writer(f"{table.schema.name}:{table.name}", table.name, table)
          ]

-     def _domain_table_paths(
-         self,
-         graph: dict[Table, list[dict[Table, Any]]],
-         spath: str = None,
-         dpath: str = None,
-         sprefix: str = "deriva-ml:Dataset/RID={Dataset_RID}",
-         dprefix: str = "Dataset",
-         nested: bool = False,
-     ) -> list[tuple[str, str, Table]]:
-         """Recursively walk over the domain schema graph and extend the current path.
-
-         Args:
-             graph: An undirected, acyclic graph of the schema, represented as a dictionary whose key is the table
-                 and whose values are the child nodes of the table.
-             spath: Source path so far
-             dpath: Destination path so far
-             sprefix: Initial path to be included. Allows for nested datasets
-             dprefix: Initial path to be included. Allows for nested datasets
-             nested: If true, skip initial data segment.
-
-         Returns:
-             A list of all the paths through the graph. Each path is a list of tables.
+     def _table_paths(self) -> Iterator[tuple[list[str], list[str], list[Table]]]:

-         """
-         source_path = spath or sprefix
-         dest_path = dpath or dprefix
-         paths = []
-         for node, children in graph.items():
-             if node.name == "Dataset":
-                 paths.append(
-                     (
-                         f"{sprefix}/(RID)=({self._ml_schema}:Dataset_Version:Dataset)",
-                         f"{dprefix}/Dataset_Version",
-                         self._model.schemas[self._ml_schema].tables["Dataset_Version"],
-                     )
-                 )
-             new_spath = sprefix
-             new_dpath = dprefix
-
-             if not nested:
-                 paths.append((new_spath, new_dpath, node))
-             else:
-                 new_spath = source_path + f"/{node.schema.name}:{node.name}"
-                 new_dpath = dest_path + f"/{node.name}"
-                 paths.append((new_spath, new_dpath, node))
-             for child in children:
-                 paths.extend(
-                     self._domain_table_paths(child, new_spath, new_dpath, nested=nested)
-                 )
-         return paths
+         dataset_dataset = self._model.schemas[self._ml_schema].tables["Dataset_Dataset"]
+         paths = self._model._schema_to_paths()
+         nested_paths = paths

-     def _table_paths(self, graph) -> list[tuple[str, str, Table]]:
-         sprefix = "deriva-ml:Dataset/RID={Dataset_RID}"
-         dprefix = "Dataset"
-         dataset_dataset_table = self._model.schemas[self._ml_schema].tables[
-             "Dataset_Dataset"
-         ]
-         table_paths = self._domain_table_paths(
-             graph=graph, sprefix=sprefix, dprefix=dprefix
-         )
-         nested_sprefix = sprefix
-         nested_dprefix = dprefix
          for i in range(self._dataset_nesting_depth()):
-             nested_sprefix += f"/(RID)=(deriva-ml:Dataset_Dataset:Dataset)"
-             nested_dprefix += f"/Dataset_Dataset"
-             table_paths.append((nested_sprefix, nested_dprefix, dataset_dataset_table))
-             nested_sprefix += f"/(Nested_Dataset)=(deriva-ml:Dataset:RID)"
-             nested_dprefix += f"/Dataset"
-             table_paths.append((nested_sprefix, nested_dprefix, self.dataset_table))
-         # Get CSV for nested datasets.
-         table_paths.extend(
-             self._domain_table_paths(
-                 graph, sprefix=nested_sprefix, dprefix=nested_dprefix, nested=True
-             )
-         )
-         return table_paths
+             if i == 0:
+                 paths.extend([[self.dataset_table, dataset_dataset]])
+             nested_paths = [
+                 [self.dataset_table, dataset_dataset] + p for p in nested_paths
+             ]
+             paths.extend(nested_paths)
+
+         def source_path(path):
+             p = [f"{self._model.ml_schema}:Dataset/RID={{Dataset_RID}}"]
+             for table in path[1:]:
+                 if table == dataset_dataset:
+                     p.append(f"(RID)=(deriva-ml:Dataset_Dataset:Dataset)")
+                 elif table == self.dataset_table:
+                     p.append(f"(Nested_Dataset)=(deriva-ml:Dataset:RID)")
+                 elif table.name == "Dataset_Version":
+                     p.append(f"(RID)=({self._model.ml_schema}:Dataset_Version:Dataset)")
+                 else:
+                     p.append(f"{table.schema.name}:{table.name}")
+             return p
+
+         src_paths = ["/".join(source_path(p)) for p in paths]
+         dest_paths = ["/".join([t.name for t in p]) for p in paths]
+         target_tables = [p[-1] for p in paths]
+
+         return zip(src_paths, dest_paths, target_tables)

      def _dataset_nesting_depth(self):
          """Determine the maximum dataset nesting depth in the current catalog.
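
The inner `source_path` helper above joins a path of tables into an ERMrest query path rooted at a dataset RID. A standalone sketch of that joining logic, substituting a hypothetical `demo` domain schema with `Subject` and `Image` tables for real `ermrest_model.Table` objects:

```python
from dataclasses import dataclass


@dataclass(frozen=True)
class Tbl:
    schema: str
    name: str


def source_path(path: list[Tbl]) -> str:
    """Join a table path into an ERMrest query path rooted at a dataset RID."""
    parts = ["deriva-ml:Dataset/RID={Dataset_RID}"]
    for table in path[1:]:  # The leading Dataset table is folded into the root.
        if table.name == "Dataset_Dataset":
            parts.append("(RID)=(deriva-ml:Dataset_Dataset:Dataset)")
        elif table.name == "Dataset":
            parts.append("(Nested_Dataset)=(deriva-ml:Dataset:RID)")
        else:
            parts.append(f"{table.schema}:{table.name}")
    return "/".join(parts)


path = [Tbl("deriva-ml", "Dataset"), Tbl("demo", "Subject"), Tbl("demo", "Image")]
print(source_path(path))
# deriva-ml:Dataset/RID={Dataset_RID}/demo:Subject/demo:Image
```
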
@@ -811,6 +772,7 @@
          def children_depth(
              dataset_rid: RID, nested_datasets: dict[RID, list[RID]]
          ) -> int:
+             """Return the nesting depth of the given dataset in the current catalog."""
              try:
                  children = nested_datasets[dataset_rid]
                  return (
@@ -836,50 +798,6 @@
              else 0
          )

-     def _schema_graph(
-         self, node: Table, visited_nodes: Optional[set] = None
-     ) -> dict[Table, list[dict[Table, list]]]:
-         """Generate an undirected, acyclic graph of the domain schema. We do this by traversing the schema foreign key
-         relationships. We stop when we hit the deriva-ml schema or when we reach a node that we have already seen.
-
-         Nested datasets need to be unfolded.
-
-         Args:
-             node: Current (starting) node in the graph.
-             visited_nodes: Set of nodes that have already been visited.
-
-         Returns:
-             Graph of the schema, starting from node.
-         """
-
-         visited_nodes = visited_nodes or set()
-         graph = {node: []}
-
-         def include_node(child: Table) -> bool:
-             """Indicate if the table should be included in the graph.
-
-             Include node in the graph if it's not a loopback from fk <-> referred_by and you have not already
-             been to the node.
-             """
-             return (
-                 child != node
-                 and child not in visited_nodes
-                 and child.schema.name == self._model.domain_schema
-             )
-
-         # Get all the tables reachable from the end of the path, avoiding loops from T1<->T2 via referenced_by.
-         nodes = {fk.pk_table for fk in node.foreign_keys if include_node(fk.pk_table)}
-         nodes |= {fk.table for fk in node.referenced_by if include_node(fk.table)}
-         for t in nodes:
-             new_visited_nodes = visited_nodes.copy()
-             new_visited_nodes.add(t)
-             if self._model.is_vocabulary(t):
-                 # If the end of the path is a vocabulary table, we are at a terminal node in the ERD, so stop.
-                 continue
-             # Get all the paths that extend the current path.
-             graph[node].append(self._schema_graph(t, new_visited_nodes))
-         return graph
-
      def _dataset_specification(
          self, writer: Callable[[str, str, Table], list[dict[str, Any]]]
      ) -> list[dict[str, Any]]:
@@ -921,7 +839,7 @@
              A dataset_table specification.
          """
          element_spec = []
-         for path in self._table_paths(self._schema_graph(self.dataset_table)):
+         for path in self._table_paths():
              element_spec.extend(writer(*path))
          return self._vocabulary_specification(writer) + element_spec
@@ -953,7 +871,7 @@
              if dataset.materialize
              else self._download_dataset_bag(minid)
          )
-         return DatabaseModel.register(minid, bag_path).get_dataset()
+         return DatabaseModel(minid, bag_path).get_dataset()

      def _version_snapshot(self, dataset: DatasetSpec) -> str:
          version_record = [
1089
1007
  """
1090
1008
 
1091
1009
  def update_status(status: Status, msg: str) -> None:
1010
+ """Update the current status for this execution in the catalog"""
1092
1011
  self._model.catalog.getPathBuilder().schemas[
1093
1012
  self._ml_schema
1094
1013
  ].Execution.update(
@@ -1196,6 +1115,44 @@
              }
          ] + self._dataset_specification(writer)

+     @staticmethod
+     def _download_dataset_element(
+         spath: str, dpath: str, table: Table
+     ) -> list[dict[str, Any]]:
+         """Return the download specification for the data object indicated by a path through the data model.
+
+         Args:
+             spath: Source path
+             dpath: Destination path
+             table: Table referenced by the path
+
+         Returns:
+             The download specification that will retrieve that data from the catalog and place it into a BDBag.
+         """
+         exports = [
+             {
+                 "processor": "csv",
+                 "processor_params": {
+                     "query_path": f"/entity/{spath}?limit=none",
+                     "output_path": dpath,
+                 },
+             }
+         ]
+
+         # If this table is an asset table, then we need to output the files associated with the asset.
+         asset_columns = {"Filename", "URL", "Length", "MD5", "Description"}
+         if asset_columns.issubset({c.name for c in table.columns}):
+             exports.append(
+                 {
+                     "processor": "fetch",
+                     "processor_params": {
+                         "query_path": f"/attribute/{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5?limit=none",
+                         "output_path": f"asset/{table.name}",
+                     },
+                 }
+             )
+         return exports
+
      @staticmethod
      def _export_dataset_element(
          spath: str, dpath: str, table: Table
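
(The `_export_dataset_element` context above is cut off at this file's diff boundary.) To make the relocated `_download_dataset_element` concrete: for an asset table (one whose columns include Filename, URL, Length, MD5, and Description) it emits a csv processor plus a fetch processor. Assuming a hypothetical source path `deriva-ml:Dataset/RID={Dataset_RID}/demo:Image` and destination path `Dataset/Image`, the returned specification would be:

```python
spec = [
    {
        "processor": "csv",
        "processor_params": {
            # Entity query that dumps the table rows to a CSV inside the bag.
            "query_path": "/entity/deriva-ml:Dataset/RID={Dataset_RID}/demo:Image?limit=none",
            "output_path": "Dataset/Image",
        },
    },
    {
        "processor": "fetch",
        "processor_params": {
            # Attribute query that lists the non-null asset URLs to fetch.
            "query_path": (
                "/attribute/deriva-ml:Dataset/RID={Dataset_RID}/demo:Image"
                "/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5?limit=none"
            ),
            "output_path": "asset/Image",
        },
    },
]
```
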
src/deriva_ml/dataset_aux_classes.py

@@ -104,6 +104,7 @@ class DatasetHistory(BaseModel):
      dataset_version: DatasetVersion
      dataset_rid: RID
      version_rid: RID
+     execution_rid: Optional[RID] = None
      description: str = ""
      minid: Optional[str] = None
      timestamp: Optional[datetime] = None