deriva-ml 1.6.8__py3-none-any.whl → 1.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deriva_ml/dataset.py CHANGED
@@ -9,6 +9,7 @@ accessible via a DerivaML class instance.
  from bdbag.fetch.fetcher import fetch_single_file
  from bdbag import bdbag_api as bdb
  from collections import defaultdict
+
  from deriva.core.ermrest_model import Table
  from deriva.core.utils.core_utils import tag as deriva_tags, format_exception
  from deriva.transfer.download.deriva_export import DerivaExport
@@ -25,6 +26,7 @@ try:
  except ImportError: # Graceful fallback if IceCream isn't installed.
  ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a) # noqa

+ from graphlib import TopologicalSorter
  import json
  import logging
  from pathlib import Path
@@ -35,7 +37,7 @@ from pydantic import (
  import requests

  from tempfile import TemporaryDirectory, NamedTemporaryFile
- from typing import Any, Callable, Optional, Iterable
+ from typing import Any, Callable, Optional, Iterable, Iterator

  from deriva_ml import DatasetBag
  from .deriva_definitions import ML_SCHEMA, DerivaMLException, MLVocab, Status, RID
@@ -73,9 +75,10 @@ class Dataset:
  rid_info = self._model.catalog.resolve_rid(dataset_rid, self._model.model)
  except KeyError as _e:
  raise DerivaMLException(f"Invalid RID {dataset_rid}")
-
- # Got a dataset rid. Now check to see if its deleted or not.
- if deleted:
+ if rid_info.table != self.dataset_table:
+ return False
+ elif deleted:
+ # Got a dataset rid. Now check to see if its deleted or not.
  return True
  else:
  return not list(rid_info.datapath.entities().fetch())[0]["Deleted"]
@@ -85,6 +88,7 @@ class Dataset:
  dataset_rid: RID,
  dataset_version: DatasetVersion,
  description: Optional[str] = "",
+ execution_rid: Optional[RID] = None,
  ) -> RID:
  schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
  version_path = schema_path.tables["Dataset_Version"]
@@ -94,6 +98,7 @@ class Dataset:
  "Dataset": dataset_rid,
  "Version": str(dataset_version),
  "Description": description,
+ "Execution": execution_rid,
  }
  ]
  )[0]["RID"]
@@ -163,6 +168,7 @@ class Dataset:
  dataset_rid=dataset_rid,
  version_rid=v["RID"],
  description=v["Description"],
+ execution_rid=v["Execution"],
  )
  for v in version_path.filter(version_path.Dataset == dataset_rid)
  .entities()
@@ -190,11 +196,30 @@ class Dataset:
  else:
  return max([h.dataset_version for h in self.dataset_history(dataset_rid)])

+ def _build_dataset_graph(self, dataset_rid: RID) -> Iterable[RID]:
+ ts = TopologicalSorter()
+ self._build_dataset_graph_1(dataset_rid, ts, set())
+ return ts.static_order()
+
+ def _build_dataset_graph_1(self, dataset_rid: RID, ts, visited) -> None:
+ """Use topological sort to return bottom up list of nested datasets"""
+ ts.add(dataset_rid)
+ if dataset_rid not in visited:
+ visited.add(dataset_rid)
+ children = self.list_dataset_children(dataset_rid=dataset_rid)
+ parents = self.list_dataset_parents(dataset_rid=dataset_rid)
+ for parent in parents:
+ self._build_dataset_graph_1(parent, ts, visited)
+ for child in children:
+ self._build_dataset_graph_1(child, ts, visited)
+
+ @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
  def increment_dataset_version(
  self,
  dataset_rid: RID,
  component: VersionPart,
  description: Optional[str] = "",
+ execution_rid: Optional[RID] = None,
  ) -> DatasetVersion:
  """Increment the version of the specified dataset_table.

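The new _build_dataset_graph helper relies on graphlib.TopologicalSorter (standard library, Python 3.9+) to collect a dataset together with its parents and children and then emit them in one pass via static_order(). As a rough illustration of the standard-library API it builds on, with invented RID strings and an invented edge structure (not taken from a real catalog):

    from graphlib import TopologicalSorter

    # Hypothetical nesting: "root" contains "child-a" and "child-b",
    # and "child-a" contains "grandchild".
    ts = TopologicalSorter()
    ts.add("root")                     # node with no recorded predecessors
    ts.add("child-a", "root")          # "child-a" lists "root" as a predecessor
    ts.add("child-b", "root")
    ts.add("grandchild", "child-a")

    # static_order() yields each node only after all of its recorded
    # predecessors, so "root" appears before the datasets nested under it.
    print(list(ts.static_order()))     # one valid order: ['root', 'child-a', 'child-b', 'grandchild']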
@@ -204,6 +229,7 @@ class Dataset:
  dataset_rid: RID of the dataset whose version is to be incremented.
  component: Major, Minor or Patch
  description: Description of the version update of the dataset_table.
+ execution_rid: Which execution is performing increment.

  Returns:
  new semantic version of the dataset_table as a 3-tuple
@@ -211,16 +237,16 @@ class Dataset:
  Raises:
  DerivaMLException: if provided RID is not to a dataset_table.
  """
- for ds in self.list_dataset_children(dataset_rid):
- self.increment_dataset_version(
- ds,
- component,
- description=f"Increment version of nested dataset: {description}",
+ for dataset in self._build_dataset_graph(dataset_rid=dataset_rid):
+ version = self.dataset_version(dataset)
+ new_version = version.increment_version(component)
+ self._insert_dataset_version(
+ dataset,
+ new_version,
+ description=description,
+ execution_rid=execution_rid,
  )
- version = self.dataset_version(dataset_rid)
- new_version = version.increment_version(component)
- self._insert_dataset_version(dataset_rid, new_version, description=description)
- return new_version
+ return self.dataset_version(dataset_rid)

  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
  def create_dataset(
@@ -268,7 +294,7 @@ class Dataset:
  pb = self._model.catalog.getPathBuilder()
  for ds_type in ds_types:
  if not check_dataset_type(ds_type):
- raise DerivaMLException(f"Dataset type must be a vocabulary term.")
+ raise DerivaMLException("Dataset type must be a vocabulary term.")
  dataset_table_path = pb.schemas[self.dataset_table.schema.name].tables[
  self.dataset_table.name
  ]
@@ -297,7 +323,12 @@ class Dataset:
  pb.schemas[self._ml_schema].Dataset_Execution.insert(
  [{"Dataset": dataset_rid, "Execution": execution_rid}]
  )
- self._insert_dataset_version(dataset_rid, version)
+ self._insert_dataset_version(
+ dataset_rid,
+ dataset_version=version,
+ execution_rid=execution_rid,
+ description="Initial dataset creation.",
+ )
  return dataset_rid

  @validate_call
@@ -414,7 +445,7 @@ class Dataset:
  self._model.model.apply()
  return table

- @validate_call
+ # @validate_call
  def list_dataset_members(
  self, dataset_rid: RID, recurse: bool = False
  ) -> dict[str, list[dict[str, Any]]]:
@@ -439,34 +470,27 @@ class Dataset:
  pb = self._model.catalog.getPathBuilder()
  for assoc_table in self.dataset_table.find_associations():
  other_fkey = assoc_table.other_fkeys.pop()
- self_fkey = assoc_table.self_fkey
  target_table = other_fkey.pk_table
  member_table = assoc_table.table

+ # Look at domain tables and nested datasets.
  if (
  target_table.schema.name != self._model.domain_schema
  and target_table != self.dataset_table
  ):
- # Look at domain tables and nested datasets.
  continue
- if target_table == self.dataset_table:
- # find_assoc gives us the keys in the wrong position, so swap.
- self_fkey, other_fkey = other_fkey, self_fkey
+ member_column = (
+ "Nested_Dataset"
+ if target_table == self.dataset_table
+ else other_fkey.foreign_key_columns[0].name
+ )

  target_path = pb.schemas[target_table.schema.name].tables[target_table.name]
  member_path = pb.schemas[member_table.schema.name].tables[member_table.name]
- # Get the names of the columns that we are going to need for linking
- member_link = tuple(
- c.name for c in next(iter(other_fkey.column_map.items()))
- )
- path = pb.schemas[member_table.schema.name].tables[member_table.name].path
- path.filter(member_path.Dataset == dataset_rid)
- path.link(
+
+ path = member_path.filter(member_path.Dataset == dataset_rid).link(
  target_path,
- on=(
- member_path.columns[member_link[0]]
- == target_path.columns[member_link[1]]
- ),
+ on=(member_path.columns[member_column] == target_path.columns["RID"]),
  )
  target_entities = list(path.entities().fetch())
  members[target_table.name].extend(target_entities)
@@ -485,6 +509,7 @@ class Dataset:
  members: list[RID],
  validate: bool = True,
  description: Optional[str] = "",
+ execution_rid: Optional[RID] = None,
  ) -> None:
  """Add additional elements to an existing dataset_table.

@@ -496,6 +521,7 @@ class Dataset:
  members: List of RIDs of members to add to the dataset_table.
  validate: Check rid_list to make sure elements are not already in the dataset_table.
  description: Markdown description of the updated dataset.
+ execution_rid: Optional RID of execution associated with this dataset.
  """
  members = set(members)
  description = description or "Updated dataset via add_dataset_members"
@@ -559,12 +585,19 @@ class Dataset:
  [{"Dataset": dataset_rid, fk_column: e} for e in elements]
  )
  self.increment_dataset_version(
- dataset_rid, VersionPart.minor, description=description
+ dataset_rid,
+ VersionPart.minor,
+ description=description,
+ execution_rid=execution_rid,
  )

  @validate_call
  def delete_dataset_members(
- self, dataset_rid: RID, members: list[RID], description=""
+ self,
+ dataset_rid: RID,
+ members: list[RID],
+ description: str = "",
+ execution_rid: Optional[RID] = None,
  ) -> None:
  """Remove elements to an existing dataset_table.

@@ -575,6 +608,7 @@ class Dataset:
  dataset_rid: RID of dataset_table to extend or None if new dataset_table is to be created.
  members: List of RIDs of members to add to the dataset_table.
  description: Markdown description of the updated dataset.
+ execution_rid: Optional RID of execution associated with this operation.
  """

  members = set(members)
@@ -616,7 +650,10 @@ class Dataset:
  )
  entity.delete()
  self.increment_dataset_version(
- dataset_rid, VersionPart.minor, description=description
+ dataset_rid,
+ VersionPart.minor,
+ description=description,
+ execution_rid=execution_rid,
  )

  @validate_call
@@ -663,44 +700,6 @@ class Dataset:
  children.extend(self.list_dataset_children(child, recurse=recurse))
  return children

- @staticmethod
- def _download_dataset_element(
- spath: str, dpath: str, table: Table
- ) -> list[dict[str, Any]]:
- """Return the download specification for the data object indicated by a path through the data model.
-
- Args:
- spath: Source path
- dpath: Destination path
- table: Table referenced to by the path
-
- Returns:
- The download specification that will retrieve that data from the catalog and place it into a BDBag.
- """
- exports = [
- {
- "processor": "csv",
- "processor_params": {
- "query_path": f"/entity/{spath}?limit=none",
- "output_path": dpath,
- },
- }
- ]
-
- # If this table is an asset table, then we need to output the files associated with the asset.
- asset_columns = {"Filename", "URL", "Length", "MD5", "Description"}
- if asset_columns.issubset({c.name for c in table.columns}):
- exports.append(
- {
- "processor": "fetch",
- "processor_params": {
- "query_path": f"/attribute/{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5?limit=none",
- "output_path": f"asset/{table.name}",
- },
- }
- )
- return exports
-
  def _vocabulary_specification(
  self, writer: Callable[[str, str, Table], list[dict[str, Any]]]
  ) -> list[dict[str, Any]]:
@@ -724,82 +723,38 @@ class Dataset:
  for o in writer(f"{table.schema.name}:{table.name}", table.name, table)
  ]

- def _domain_table_paths(
- self,
- graph: dict[Table, list[dict[Table, Any]]],
- spath: str = None,
- dpath: str = None,
- sprefix: str = "deriva-ml:Dataset/RID={Dataset_RID}",
- dprefix: str = "Dataset",
- nested: bool = False,
- ) -> list[tuple[str, str, Table]]:
- """Recursively walk over the domain schema graph and extend the current path.
+ def _table_paths(self) -> Iterator[tuple[list[str], list[str], list[Table]]]:

- Args:
- graph: An undirected, acyclic graph of schema. Represented as a dictionary whose name is the table name.
- and whose values are the child nodes of the table.
- spath: Source path so far
- dpath: Destination path so far
- sprefix: Initial path to be included. Allows for nested datasets
- dprefix: Initial path to be included. Allows for nested datasets
- nested: If true, skip initial data segment.
+ dataset_dataset = self._model.schemas[self._ml_schema].tables["Dataset_Dataset"]
+ paths = self._model._schema_to_paths()
+ nested_paths = paths

- Returns:
- A list of all the paths through the graph. Each path is a list of tables.
-
- """
- source_path = spath or sprefix
- dest_path = dpath or dprefix
- paths = []
- for node, children in graph.items():
- if node.name == "Dataset":
- paths.append(
- (
- f"{sprefix}/(RID)=({self._ml_schema}:Dataset_Version:Dataset)",
- f"{dprefix}/Dataset_Version",
- self._model.schemas[self._ml_schema].tables["Dataset_Version"],
- )
- )
- new_spath = sprefix
- new_dpath = dprefix
-
- if not nested:
- paths.append((new_spath, new_dpath, node))
- else:
- new_spath = source_path + f"/{node.schema.name}:{node.name}"
- new_dpath = dest_path + f"/{node.name}"
- paths.append((new_spath, new_dpath, node))
- for child in children:
- paths.extend(
- self._domain_table_paths(child, new_spath, new_dpath, nested=nested)
- )
- return paths
-
- def _table_paths(self, graph) -> list[tuple[str, str, Table]]:
- sprefix = "deriva-ml:Dataset/RID={Dataset_RID}"
- dprefix = "Dataset"
- dataset_dataset_table = self._model.schemas[self._ml_schema].tables[
- "Dataset_Dataset"
- ]
- table_paths = self._domain_table_paths(
- graph=graph, sprefix=sprefix, dprefix=dprefix
- )
- nested_sprefix = sprefix
- nested_dprefix = dprefix
  for i in range(self._dataset_nesting_depth()):
- nested_sprefix += f"/(RID)=(deriva-ml:Dataset_Dataset:Dataset)"
- nested_dprefix += f"/Dataset_Dataset"
- table_paths.append((nested_sprefix, nested_dprefix, dataset_dataset_table))
- nested_sprefix += f"/(Nested_Dataset)=(deriva-ml:Dataset:RID)"
- nested_dprefix += f"/Dataset"
- table_paths.append((nested_sprefix, nested_dprefix, self.dataset_table))
- # Get CSV for nested datasets.
- table_paths.extend(
- self._domain_table_paths(
- graph, sprefix=nested_sprefix, dprefix=nested_dprefix, nested=True
- )
- )
- return table_paths
+ if i == 0:
+ paths.extend([[self.dataset_table, dataset_dataset]])
+ nested_paths = [
+ [self.dataset_table, dataset_dataset] + p for p in nested_paths
+ ]
+ paths.extend(nested_paths)
+
+ def source_path(path):
+ p = [f"{self._model.ml_schema}:Dataset/RID={{Dataset_RID}}"]
+ for table in path[1:]:
+ if table == dataset_dataset:
+ p.append("(RID)=(deriva-ml:Dataset_Dataset:Dataset)")
+ elif table == self.dataset_table:
+ p.append("(Nested_Dataset)=(deriva-ml:Dataset:RID)")
+ elif table.name == "Dataset_Version":
+ p.append(f"(RID)=({self._model.ml_schema}:Dataset_Version:Dataset)")
+ else:
+ p.append(f"{table.schema.name}:{table.name}")
+ return p
+
+ src_paths = ["/".join(source_path(p)) for p in paths]
+ dest_paths = ["/".join([t.name for t in p]) for p in paths]
+ target_tables = [p[-1] for p in paths]
+
+ return zip(src_paths, dest_paths, target_tables)

  def _dataset_nesting_depth(self):
  """Determine the maximum dataset nesting depth in the current catalog.
@@ -811,6 +766,7 @@ class Dataset:
  def children_depth(
  dataset_rid: RID, nested_datasets: dict[RID, list[RID]]
  ) -> int:
+ """Return the number of nested datasets in the current catalog"""
  try:
  children = nested_datasets[dataset_rid]
  return (
@@ -836,50 +792,6 @@ class Dataset:
  else 0
  )

- def _schema_graph(
- self, node: Table, visited_nodes: Optional[set] = None
- ) -> dict[Table, list[dict[Table, list]]]:
- """Generate an undirected, acyclic graph of domain schema. We do this by traversing the schema foreign key
- relationships. We stop when we hit the deriva-ml schema or when we reach a node that we have already seen.
-
- Nested datasets need to be unfolded
-
- Args:
- node: Current (starting) node in the graph.
- visited_nodes: param nested_dataset: Are we in a nested dataset_table, (i.e. have we seen the DataSet table)?
-
- Returns:
- Graph of the schema, starting from node.
- """
-
- visited_nodes = visited_nodes or set()
- graph = {node: []}
-
- def include_node(child: Table) -> bool:
- """Indicate if the table should be included in the graph.
-
- Include node in the graph if it's not a loopback from fk<-> referred_by, you have not already been to the
- node.
- """
- return (
- child != node
- and child not in visited_nodes
- and child.schema.name == self._model.domain_schema
- )
-
- # Get all the tables reachable from the end of the path avoiding loops from T1<->T2 via referenced_by
- nodes = {fk.pk_table for fk in node.foreign_keys if include_node(fk.pk_table)}
- nodes |= {fk.table for fk in node.referenced_by if include_node(fk.table)}
- for t in nodes:
- new_visited_nodes = visited_nodes.copy()
- new_visited_nodes.add(t)
- if self._model.is_vocabulary(t):
- # If the end of the path is a vocabulary table, we are at a terminal node in the ERD, so stop
- continue
- # Get all the paths that extend the current path
- graph[node].append(self._schema_graph(t, new_visited_nodes))
- return graph
-
  def _dataset_specification(
  self, writer: Callable[[str, str, Table], list[dict[str, Any]]]
  ) -> list[dict[str, Any]]:
@@ -921,7 +833,7 @@ class Dataset:
  A dataset_table specification.
  """
  element_spec = []
- for path in self._table_paths(self._schema_graph(self.dataset_table)):
+ for path in self._table_paths():
  element_spec.extend(writer(*path))
  return self._vocabulary_specification(writer) + element_spec

@@ -953,7 +865,7 @@ class Dataset:
  if dataset.materialize
  else self._download_dataset_bag(minid)
  )
- return DatabaseModel.register(minid, bag_path).get_dataset()
+ return DatabaseModel(minid, bag_path).get_dataset()

  def _version_snapshot(self, dataset: DatasetSpec) -> str:
  version_record = [
@@ -1089,6 +1001,7 @@ class Dataset:
  """

  def update_status(status: Status, msg: str) -> None:
+ """Update the current status for this execution in the catalog"""
  self._model.catalog.getPathBuilder().schemas[
  self._ml_schema
  ].Execution.update(
@@ -1192,10 +1105,48 @@ class Dataset:
  return [
  {
  "processor": "json",
- "processor_params": {"query_path": f"/schema", "output_path": "schema"},
+ "processor_params": {"query_path": "/schema", "output_path": "schema"},
  }
  ] + self._dataset_specification(writer)

+ @staticmethod
+ def _download_dataset_element(
+ spath: str, dpath: str, table: Table
+ ) -> list[dict[str, Any]]:
+ """Return the download specification for the data object indicated by a path through the data model.
+
+ Args:
+ spath: Source path
+ dpath: Destination path
+ table: Table referenced to by the path
+
+ Returns:
+ The download specification that will retrieve that data from the catalog and place it into a BDBag.
+ """
+ exports = [
+ {
+ "processor": "csv",
+ "processor_params": {
+ "query_path": f"/entity/{spath}?limit=none",
+ "output_path": dpath,
+ },
+ }
+ ]
+
+ # If this table is an asset table, then we need to output the files associated with the asset.
+ asset_columns = {"Filename", "URL", "Length", "MD5", "Description"}
+ if asset_columns.issubset({c.name for c in table.columns}):
+ exports.append(
+ {
+ "processor": "fetch",
+ "processor_params": {
+ "query_path": f"/attribute/{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5?limit=none",
+ "output_path": f"asset/{table.name}",
+ },
+ }
+ )
+ return exports
+
  @staticmethod
  def _export_dataset_element(
  spath: str, dpath: str, table: Table
@@ -104,6 +104,7 @@ class DatasetHistory(BaseModel):
  dataset_version: DatasetVersion
  dataset_rid: RID
  version_rid: RID
+ execution_rid: Optional[RID] = None
  description: str = ""
  minid: Optional[str] = None
  timestamp: Optional[datetime] = None
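Taken together, the 1.8.0 changes thread an optional execution_rid through dataset creation, membership changes, and version increments, so the responsible execution is recorded alongside each Dataset_Version row. A hedged usage sketch, assuming these Dataset methods are reachable from a connected DerivaML instance (ml below), with placeholder RIDs; VersionPart is the enum referenced in the diff:

    dataset_rid = "1-ABCD"       # placeholder dataset RID
    execution_rid = "1-WXYZ"     # placeholder execution RID

    # Adding members records the execution and bumps the minor version itself.
    ml.add_dataset_members(
        dataset_rid=dataset_rid,
        members=["1-1111", "1-2222"],
        description="Add two new members",
        execution_rid=execution_rid,
    )

    # An explicit version bump can also carry the execution RID; it now walks
    # the nested-dataset graph and returns the dataset's resulting version.
    new_version = ml.increment_dataset_version(
        dataset_rid,
        VersionPart.minor,
        description="Manual minor bump",
        execution_rid=execution_rid,
    )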