deriva-ml 1.8.1__tar.gz → 1.8.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. {deriva_ml-1.8.1/src/deriva_ml.egg-info → deriva_ml-1.8.2}/PKG-INFO +1 -1
  2. deriva_ml-1.8.2/src/deriva_ml/VERSION.py +1 -0
  3. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml/dataset.py +95 -24
  4. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml/deriva_model.py +4 -1
  5. {deriva_ml-1.8.1 → deriva_ml-1.8.2/src/deriva_ml.egg-info}/PKG-INFO +1 -1
  6. deriva_ml-1.8.1/src/deriva_ml/VERSION.py +0 -1
  7. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/LICENSE +0 -0
  8. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/README.md +0 -0
  9. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/pyproject.toml +0 -0
  10. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/setup.cfg +0 -0
  11. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml/__init__.py +0 -0
  12. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml/build/lib/schema_setup/__init__.py +0 -0
  13. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml/build/lib/schema_setup/alter_annotation.py +0 -0
  14. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml/build/lib/schema_setup/annotation_temp.py +0 -0
  15. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml/build/lib/schema_setup/create_schema.py +0 -0
  16. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml/build/lib/schema_setup/table_comments_utils.py +0 -0
  17. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml/database_model.py +0 -0
  18. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml/dataset_aux_classes.py +0 -0
  19. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml/dataset_bag.py +0 -0
  20. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml/demo_catalog.py +0 -0
  21. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml/deriva_definitions.py +0 -0
  22. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml/deriva_ml_base.py +0 -0
  23. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml/execution.py +0 -0
  24. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml/execution_configuration.py +0 -0
  25. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml/execution_environment.py +0 -0
  26. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml/feature.py +0 -0
  27. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml/history.py +0 -0
  28. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml/schema_setup/__init__.py +0 -0
  29. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml/schema_setup/alter_annotation.py +0 -0
  30. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml/schema_setup/annotations.py +0 -0
  31. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml/schema_setup/create_schema.py +0 -0
  32. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml/schema_setup/policy.json +0 -0
  33. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml/schema_setup/table_comments_utils.py +0 -0
  34. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml/test_functions.py +0 -0
  35. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml/upload.py +0 -0
  36. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml.egg-info/SOURCES.txt +0 -0
  37. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml.egg-info/dependency_links.txt +0 -0
  38. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml.egg-info/entry_points.txt +0 -0
  39. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml.egg-info/requires.txt +0 -0
  40. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml.egg-info/top_level.txt +0 -0
  41. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/tests/test_basic_tables.py +0 -0
  42. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/tests/test_dataset.py +0 -0
  43. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/tests/test_download.py +0 -0
  44. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/tests/test_execution.py +0 -0
  45. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/tests/test_features.py +0 -0
  46. {deriva_ml-1.8.1 → deriva_ml-1.8.2}/tests/test_upload.py +0 -0
{deriva_ml-1.8.1/src/deriva_ml.egg-info → deriva_ml-1.8.2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: deriva-ml
-Version: 1.8.1
+Version: 1.8.2
 Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
 Author-email: ISRD <isrd-dev@isi.edu>
 Requires-Python: >=3.10
deriva_ml-1.8.2/src/deriva_ml/VERSION.py
@@ -0,0 +1 @@
+__version__ = "1.8.2"
{deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml/dataset.py
@@ -447,7 +447,7 @@ class Dataset:
 
     # @validate_call
     def list_dataset_members(
-        self, dataset_rid: RID, recurse: bool = False
+        self, dataset_rid: RID, recurse: bool = False, limit: Optional[int] = None
     ) -> dict[str, list[dict[str, Any]]]:
         """Return a list of entities associated with a specific dataset_table.
 
@@ -455,6 +455,7 @@ class Dataset:
             dataset_rid: param recurse: If this is a nested dataset_table, list the members of the contained datasets
             dataset_rid: RID:
             recurse: (Default value = False)
+            limit: If provided, the maxiumum number of members to return for each element type.
 
         Returns:
             Dictionary of entities associated with a specific dataset_table. Key is the table from which the elements
@@ -492,7 +493,9 @@ class Dataset:
                     target_path,
                     on=(member_path.columns[member_column] == target_path.columns["RID"]),
                 )
-                target_entities = list(path.entities().fetch())
+                target_entities = list(
+                    path.entities().fetch(limit=limit) if limit else path.entities().fetch()
+                )
                 members[target_table.name].extend(target_entities)
                 if recurse and target_table == self.dataset_table:
                     # Get the members for all the nested datasets and add to the member list.
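The conditional fetch above only passes `limit` through to the datapath query when one is given, so existing callers see no behavior change. A minimal runnable sketch of the same pattern, with deriva's `path.entities().fetch()` stood in by a plain list slice (the rows here are made up):

```python
from typing import Any, Optional

def fetch(rows: list[dict[str, Any]], limit: Optional[int] = None) -> list[dict[str, Any]]:
    # Stand-in for path.entities().fetch(): cap results only when a limit is given.
    return rows[:limit] if limit else rows

rows = [{"RID": f"1-{i:03d}"} for i in range(4)]
print(len(fetch(rows, limit=1)))  # 1 -- enough to test whether a member table is populated
print(len(fetch(rows)))           # 4 -- unchanged behavior when no limit is supplied
```

The new `_collect_paths()` further down relies on exactly this: it calls `list_dataset_members(..., limit=1)` as a cheap emptiness probe.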
@@ -723,21 +726,16 @@ class Dataset:
             for o in writer(f"{table.schema.name}:{table.name}", table.name, table)
         ]
 
-    def _table_paths(self) -> Iterator[tuple[list[str], list[str], list[Table]]]:
+    def _table_paths(
+        self, dataset: DatasetSpec = None
+    ) -> Iterator[tuple[list[str], list[str], list[Table]]]:
 
         dataset_dataset = self._model.schemas[self._ml_schema].tables["Dataset_Dataset"]
-        paths = self._model._schema_to_paths()
-        nested_paths = paths
-
-        for i in range(self._dataset_nesting_depth()):
-            if i == 0:
-                paths.extend([[self.dataset_table, dataset_dataset]])
-            nested_paths = [
-                [self.dataset_table, dataset_dataset] + p for p in nested_paths
-            ]
-            paths.extend(nested_paths)
-
-        def source_path(path):
+        paths = self._collect_paths(dataset and dataset.rid)
+
+        def source_path(path: tuple[Table, ...]):
+            path = list(path)
             p = [f"{self._model.ml_schema}:Dataset/RID={{Dataset_RID}}"]
             for table in path[1:]:
                 if table == dataset_dataset:
@@ -756,7 +754,65 @@ class Dataset:
 
         return zip(src_paths, dest_paths, target_tables)
 
-    def _dataset_nesting_depth(self):
+    def _collect_paths(
+        self,
+        dataset_rid: Optional[RID] = None,
+        dataset_nesting_depth: Optional[int] = None,
+    ) -> set[tuple[Table, ...]]:
+
+        dataset_nesting_depth = (
+            self._dataset_nesting_depth()
+            if dataset_nesting_depth is None
+            else dataset_nesting_depth
+        )
+        dataset_dataset = self._model.schemas[self._ml_schema].tables["Dataset_Dataset"]
+
+        # Figure out which paths we don't need to query for this dataset. If no dataset is provided, use them all.
+        dataset_elements = (
+            [
+                self._model.name_to_table(e)
+                for e, m in self.list_dataset_members(
+                    dataset_rid=dataset_rid, limit=1
+                ).items()
+                if m
+            ]
+            if dataset_rid
+            else self.list_dataset_element_types()
+        )
+
+        dataset_associations = [a.table for a in self.dataset_table.find_associations()]
+        included_associations = [
+            a.table
+            for a in self.dataset_table.find_associations()
+            if a.other_fkeys.pop().pk_table in dataset_elements
+        ]
+        # Get the paths through the schema and filter out all of dataset paths not used by this dataset.
+        paths = {
+            tuple(p)
+            for p in self._model._schema_to_paths()
+            if (len(p) == 1)
+            or (p[1] not in dataset_associations)
+            or (p[1] in included_associations)
+        }
+        # Now get paths for nested datasets
+        nested_paths = set()
+        if dataset_rid:
+            for c in self.list_dataset_children(dataset_rid=dataset_rid):
+                nested_paths |= self._collect_paths(c)
+        else:
+            if dataset_nesting_depth:
+                nested_paths = self._collect_paths(
+                    dataset_nesting_depth=dataset_nesting_depth - 1
+                )
+        if nested_paths:
+            paths |= {
+                tuple([self.dataset_table]),
+                (self.dataset_table, dataset_dataset),
+            }
+        paths |= {(self.dataset_table, dataset_dataset) + p for p in nested_paths}
+        return paths
+
+    def _dataset_nesting_depth(self, dataset_rid: Optional[RID] = None) -> int:
         """Determine the maximum dataset nesting depth in the current catalog.
 
         Returns:
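The nesting recursion is the subtle part of `_collect_paths()`: each additional level of nesting prefixes every path with the Dataset → Dataset_Dataset hop. A standalone sketch of that prefixing logic, using strings in place of `Table` objects ("Dataset_Image" and "Image" are invented stand-ins for real element tables):

```python
# Toy version of the depth-bounded branch of _collect_paths().
def collect_paths(depth: int) -> set[tuple[str, ...]]:
    # Paths reachable from a Dataset through the (made-up) domain schema.
    paths = {("Dataset",), ("Dataset", "Dataset_Image"), ("Dataset", "Dataset_Image", "Image")}
    nested = collect_paths(depth - 1) if depth else set()
    if nested:
        # Nested datasets reach the same element tables one Dataset_Dataset hop deeper.
        paths |= {("Dataset",), ("Dataset", "Dataset_Dataset")}
        paths |= {("Dataset", "Dataset_Dataset") + p for p in nested}
    return paths

for p in sorted(collect_paths(2)):
    print("/".join(p))
```

With depth 2 this prints paths with zero, one, and two `Dataset_Dataset` prefixes, which is why bounding the depth (or recursing only into actual children when a `dataset_rid` is given) keeps the path set small.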
@@ -766,7 +822,7 @@ class Dataset:
         def children_depth(
             dataset_rid: RID, nested_datasets: dict[RID, list[RID]]
         ) -> int:
-            """Return the number of nested datasets in the current catalog"""
+            """Return the number of nested datasets for the dataset_rid if provided, otherwise in the current catalog"""
             try:
                 children = nested_datasets[dataset_rid]
                 return (
@@ -783,8 +839,19 @@ class Dataset:
             .schemas[self._ml_schema]
             .tables["Dataset_Dataset"]
         )
+        dataset_children = (
+            [
+                {
+                    "Dataset": dataset_rid,
+                    "Nested_Dataset": c,
+                }  # Make uniform with return from datapath
+                for c in self.list_dataset_children(dataset_rid)
+            ]
+            if dataset_rid
+            else pb.entities().fetch()
+        )
         nested_dataset = defaultdict(list)
-        for ds in pb.entities().fetch():
+        for ds in dataset_children:
             nested_dataset[ds["Dataset"]].append(ds["Nested_Dataset"])
         return (
             max(map(lambda d: children_depth(d, dict(nested_dataset)), nested_dataset))
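For the depth computation itself, here is a self-contained emulation of `children_depth()` over a parent-to-children map (the RIDs are made up; the real method builds the map from Dataset_Dataset rows or, per this change, from `list_dataset_children()` when a specific dataset is given):

```python
from collections import defaultdict

def children_depth(rid: str, nested: dict[str, list[str]]) -> int:
    # Depth of the nesting tree rooted at rid, counted in dataset levels.
    children = nested.get(rid, [])
    return max((children_depth(c, nested) for c in children), default=0) + 1

nested = defaultdict(list)
for row in [{"Dataset": "1-A", "Nested_Dataset": "1-B"},
            {"Dataset": "1-B", "Nested_Dataset": "1-C"}]:
    nested[row["Dataset"]].append(row["Nested_Dataset"])

print(max(children_depth(d, nested) for d in nested))  # 3: A contains B contains C
```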
@@ -793,7 +860,9 @@ class Dataset:
         )
 
     def _dataset_specification(
-        self, writer: Callable[[str, str, Table], list[dict[str, Any]]]
+        self,
+        writer: Callable[[str, str, Table], list[dict[str, Any]]],
+        dataset: DatasetSpec,
     ) -> list[dict[str, Any]]:
         """Output a download/export specification for a dataset_table. Each element of the dataset_table will be placed in its own dir
         The top level data directory of the resulting BDBag will have one subdirectory for element type. the subdirectory
@@ -833,7 +902,7 @@ class Dataset:
             A dataset_table specification.
         """
         element_spec = []
-        for path in self._table_paths():
+        for path in self._table_paths(dataset=dataset):
             element_spec.extend(writer(*path))
         return self._vocabulary_specification(writer) + element_spec
 
@@ -1042,7 +1111,9 @@ class Dataset:
         validated_check.touch()
         return Path(bag_path)
 
-    def _export_outputs(self) -> list[dict[str, Any]]:
+    def _export_outputs(
+        self, dataset: Optional[DatasetSpec] = None
+    ) -> list[dict[str, Any]]:
         """Return and output specification for the datasets in the provided model
 
         Returns:
@@ -1079,9 +1150,9 @@ class Dataset:
                 "source": {"api": "schema", "skip_root_path": True},
                 "destination": {"type": "json", "name": "schema"},
             },
-        ] + self._dataset_specification(writer)
+        ] + self._dataset_specification(writer, dataset)
 
-    def _processor_params(self) -> list[dict[str, Any]]:
+    def _processor_params(self, dataset: DatasetSpec) -> list[dict[str, Any]]:
         """
         Returns:
             a download specification for the datasets in the provided model.
@@ -1107,7 +1178,7 @@ class Dataset:
                 "processor": "json",
                 "processor_params": {"query_path": "/schema", "output_path": "schema"},
             }
-        ] + self._dataset_specification(writer)
+        ] + self._dataset_specification(writer, dataset)
 
     @staticmethod
     def _download_dataset_element(
@@ -1244,7 +1315,7 @@ class Dataset:
                     },
                 },
             ]
-            + self._processor_params(),
+            + self._processor_params(dataset),
         },
     }
 
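Taken together, the dataset.py changes thread one optional DatasetSpec from the export entry points down to path collection, so a download specification can be pruned to a single dataset instead of always covering the whole catalog. A hedged sketch of that call chain (the class and function bodies below are illustrative stand-ins, not the library's code):

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class DatasetSpec:  # stand-in for deriva_ml's DatasetSpec
    rid: str

def collect_paths(dataset_rid: Optional[str]) -> list[str]:
    # The real method prunes schema paths to element types the dataset populates.
    return [f"paths pruned for {dataset_rid}"] if dataset_rid else ["all schema paths"]

def table_paths(dataset: Optional[DatasetSpec] = None) -> list[str]:
    return collect_paths(dataset and dataset.rid)  # same `dataset and dataset.rid` idiom as the diff

def export_outputs(dataset: Optional[DatasetSpec] = None) -> list[str]:
    return ["schema export"] + table_paths(dataset)

print(export_outputs())                      # catalog-wide specification
print(export_outputs(DatasetSpec("1-abc")))  # specification pruned to one dataset
```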
{deriva_ml-1.8.1 → deriva_ml-1.8.2}/src/deriva_ml/deriva_model.py
@@ -265,7 +265,9 @@ class DerivaModel:
         return relationships[0]
 
     def _schema_to_paths(
-        self, root: Table = None, path: list[Table] = None
+        self,
+        root: Table = None,
+        path: list[Table] = None,
     ) -> list[list[Table]]:
         """Recursively walk over the domain schema graph and extend the current path.
 
@@ -278,6 +280,7 @@ class DerivaModel:
             A list of all the paths through the graph. Each path is a list of tables.
 
         """
+
         root = root or self.model.schemas[self.ml_schema].tables["Dataset"]
         path = path.copy() if path else []
         parent = path[-1] if path else None  # Table that we are coming from.
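`_schema_to_paths()` is the recursive walk that feeds `_collect_paths()`. A minimal runnable model of such a walk over a toy foreign-key graph (the edge map and table names are invented, not deriva-ml's schema):

```python
# Toy recursive walk: enumerate every path from a root table, mirroring the
# shape of _schema_to_paths(); the edge map stands in for FK relationships.
def schema_to_paths(root: str, edges: dict[str, list[str]],
                    path: list[str] | None = None) -> list[list[str]]:
    path = (path or []) + [root]
    paths = [path]
    for child in edges.get(root, []):
        if child not in path:  # don't revisit a table already on this path
            paths.extend(schema_to_paths(child, edges, path))
    return paths

edges = {"Dataset": ["Dataset_Image"], "Dataset_Image": ["Image"]}
for p in schema_to_paths("Dataset", edges):
    print(" -> ".join(p))
```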
{deriva_ml-1.8.1 → deriva_ml-1.8.2/src/deriva_ml.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: deriva-ml
-Version: 1.8.1
+Version: 1.8.2
 Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
 Author-email: ISRD <isrd-dev@isi.edu>
 Requires-Python: >=3.10
deriva_ml-1.8.1/src/deriva_ml/VERSION.py
@@ -1 +0,0 @@
-__version__ = "1.8.1"