deriva-ml 1.8.0__py3-none-any.whl → 1.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deriva_ml/VERSION.py CHANGED
@@ -1 +1 @@
- __version__ = "1.8.0"
+ __version__ = "1.8.2"
deriva_ml/dataset.py CHANGED
@@ -447,7 +447,7 @@ class Dataset:
     # @validate_call
     def list_dataset_members(
-        self, dataset_rid: RID, recurse: bool = False
+        self, dataset_rid: RID, recurse: bool = False, limit: Optional[int] = None
     ) -> dict[str, list[dict[str, Any]]]:
         """Return a list of entities associated with a specific dataset_table.

@@ -455,6 +455,7 @@ class Dataset:
         dataset_rid: param recurse: If this is a nested dataset_table, list the members of the contained datasets
         dataset_rid: RID:
         recurse: (Default value = False)
+        limit: If provided, the maximum number of members to return for each element type.

         Returns:
             Dictionary of entities associated with a specific dataset_table. Key is the table from which the elements
@@ -492,7 +493,9 @@ class Dataset:
                 target_path,
                 on=(member_path.columns[member_column] == target_path.columns["RID"]),
             )
-            target_entities = list(path.entities().fetch())
+            target_entities = list(
+                path.entities().fetch(limit=limit) if limit else path.entities().fetch()
+            )
             members[target_table.name].extend(target_entities)
             if recurse and target_table == self.dataset_table:
                 # Get the members for all the nested datasets and add to the member list.
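The new `limit` keyword turns `list_dataset_members` into a cheap probe: with `limit=1` it fetches at most one row per element type, which is exactly how `_collect_paths` (below) decides which element types a dataset actually populates. A minimal sketch, assuming an already-authenticated catalog; the hostname, catalog id, and RID are placeholders, and the constructor arguments are illustrative rather than the definitive signature:

```python
from deriva_ml import DerivaML  # import path assumed from the package layout

ml = DerivaML("demo.derivacloud.org", "1")  # placeholder host/catalog

# Fetch at most one member per element type -- enough to tell which
# element types are populated without pulling whole member tables.
members = ml.list_dataset_members(dataset_rid="1-ABC", limit=1)
populated = [table for table, rows in members.items() if rows]
```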
@@ -723,21 +726,16 @@ class Dataset:
             for o in writer(f"{table.schema.name}:{table.name}", table.name, table)
         ]

-    def _table_paths(self) -> Iterator[tuple[list[str], list[str], list[Table]]]:
+    def _table_paths(
+        self, dataset: DatasetSpec = None
+    ) -> Iterator[tuple[list[str], list[str], list[Table]]]:

         dataset_dataset = self._model.schemas[self._ml_schema].tables["Dataset_Dataset"]
-        paths = self._model._schema_to_paths()
-        nested_paths = paths
-
-        for i in range(self._dataset_nesting_depth()):
-            if i == 0:
-                paths.extend([[self.dataset_table, dataset_dataset]])
-            nested_paths = [
-                [self.dataset_table, dataset_dataset] + p for p in nested_paths
-            ]
-            paths.extend(nested_paths)

-        def source_path(path):
+        paths = self._collect_paths(dataset and dataset.rid)
+
+        def source_path(path: tuple[Table, ...]):
+            path = list(path)
             p = [f"{self._model.ml_schema}:Dataset/RID={{Dataset_RID}}"]
             for table in path[1:]:
                 if table == dataset_dataset:
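`source_path` assembles an ERMrest datapath string rooted at the selected Dataset record. A sketch of the string being built; the per-table `schema:table` segments and the `demo:Image` example are assumptions, since the rest of the loop body is elided by the diff context:

```python
# Root segment comes straight from the hunk above; joining subsequent
# tables as "schema:table" segments is an assumption for illustration.
ml_schema = "deriva-ml"
segments = [f"{ml_schema}:Dataset/RID={{Dataset_RID}}"]
for schema, table in [("deriva-ml", "Dataset_Dataset"), ("demo", "Image")]:
    segments.append(f"{schema}:{table}")
print("/".join(segments))
# deriva-ml:Dataset/RID={Dataset_RID}/deriva-ml:Dataset_Dataset/demo:Image
```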
@@ -756,7 +754,65 @@ class Dataset:
         return zip(src_paths, dest_paths, target_tables)

-    def _dataset_nesting_depth(self):
+    def _collect_paths(
+        self,
+        dataset_rid: Optional[RID] = None,
+        dataset_nesting_depth: Optional[int] = None,
+    ) -> set[tuple[Table, ...]]:
+
+        dataset_nesting_depth = (
+            self._dataset_nesting_depth()
+            if dataset_nesting_depth is None
+            else dataset_nesting_depth
+        )
+        dataset_dataset = self._model.schemas[self._ml_schema].tables["Dataset_Dataset"]
+
+        # Figure out which paths we don't need to query for this dataset. If no dataset is provided, use them all.
+        dataset_elements = (
+            [
+                self._model.name_to_table(e)
+                for e, m in self.list_dataset_members(
+                    dataset_rid=dataset_rid, limit=1
+                ).items()
+                if m
+            ]
+            if dataset_rid
+            else self.list_dataset_element_types()
+        )
+
+        dataset_associations = [a.table for a in self.dataset_table.find_associations()]
+        included_associations = [
+            a.table
+            for a in self.dataset_table.find_associations()
+            if a.other_fkeys.pop().pk_table in dataset_elements
+        ]
+        # Get the paths through the schema and filter out all of dataset paths not used by this dataset.
+        paths = {
+            tuple(p)
+            for p in self._model._schema_to_paths()
+            if (len(p) == 1)
+            or (p[1] not in dataset_associations)
+            or (p[1] in included_associations)
+        }
+        # Now get paths for nested datasets
+        nested_paths = set()
+        if dataset_rid:
+            for c in self.list_dataset_children(dataset_rid=dataset_rid):
+                nested_paths |= self._collect_paths(c)
+        else:
+            if dataset_nesting_depth:
+                nested_paths = self._collect_paths(
+                    dataset_nesting_depth=dataset_nesting_depth - 1
+                )
+        if nested_paths:
+            paths |= {
+                tuple([self.dataset_table]),
+                (self.dataset_table, dataset_dataset),
+            }
+        paths |= {(self.dataset_table, dataset_dataset) + p for p in nested_paths}
+        return paths
+
+    def _dataset_nesting_depth(self, dataset_rid: Optional[RID] = None) -> int:
         """Determine the maximum dataset nesting depth in the current catalog.

         Returns:
@@ -766,7 +822,7 @@ class Dataset:
         def children_depth(
             dataset_rid: RID, nested_datasets: dict[RID, list[RID]]
         ) -> int:
-            """Return the number of nested datasets in the current catalog"""
+            """Return the number of nested datasets for the dataset_rid if provided, otherwise in the current catalog"""
             try:
                 children = nested_datasets[dataset_rid]
                 return (
@@ -783,8 +839,19 @@ class Dataset:
             .schemas[self._ml_schema]
             .tables["Dataset_Dataset"]
         )
+        dataset_children = (
+            [
+                {
+                    "Dataset": dataset_rid,
+                    "Nested_Dataset": c,
+                }  # Make uniform with return from datapath
+                for c in self.list_dataset_children(dataset_rid)
+            ]
+            if dataset_rid
+            else pb.entities().fetch()
+        )
         nested_dataset = defaultdict(list)
-        for ds in pb.entities().fetch():
+        for ds in dataset_children:
             nested_dataset[ds["Dataset"]].append(ds["Nested_Dataset"])
         return (
             max(map(lambda d: children_depth(d, dict(nested_dataset)), nested_dataset))
@@ -793,7 +860,9 @@ class Dataset:
         )

     def _dataset_specification(
-        self, writer: Callable[[str, str, Table], list[dict[str, Any]]]
+        self,
+        writer: Callable[[str, str, Table], list[dict[str, Any]]],
+        dataset: DatasetSpec,
     ) -> list[dict[str, Any]]:
         """Output a download/export specification for a dataset_table. Each element of the dataset_table will be placed in its own dir.
         The top level data directory of the resulting BDBag will have one subdirectory for each element type. The subdirectory
@@ -833,7 +902,7 @@ class Dataset:
             A dataset_table specification.
         """
         element_spec = []
-        for path in self._table_paths():
+        for path in self._table_paths(dataset=dataset):
             element_spec.extend(writer(*path))
         return self._vocabulary_specification(writer) + element_spec

@@ -892,7 +961,7 @@ class Dataset:
             config_file=spec_file,
             output_dir=tmp_dir,
             defer_download=True,
-            timeout=(10, 300),
+            timeout=(10, 610),
             envars={"Dataset_RID": dataset.rid},
         )
         minid_page_url = exporter.export()[0]  # Get the MINID launch page
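For context, the two-element `timeout` presumably follows the requests convention of a `(connect, read)` pair in seconds; the semantics below are an assumption, since the exporter's transport layer is not shown in this diff:

```python
# Assumed requests-style semantics: abort if the TCP connect takes more
# than 10 s, or if the server goes silent for over 610 s mid-transfer --
# headroom for large BDBag exports that take minutes to materialize.
connect_timeout, read_timeout = 10, 610
timeout = (connect_timeout, read_timeout)
```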
@@ -1042,7 +1111,9 @@ class Dataset:
             validated_check.touch()
         return Path(bag_path)

-    def _export_outputs(self) -> list[dict[str, Any]]:
+    def _export_outputs(
+        self, dataset: Optional[DatasetSpec] = None
+    ) -> list[dict[str, Any]]:
         """Return an output specification for the datasets in the provided model

         Returns:
@@ -1079,9 +1150,9 @@ class Dataset:
                 "source": {"api": "schema", "skip_root_path": True},
                 "destination": {"type": "json", "name": "schema"},
             },
-        ] + self._dataset_specification(writer)
+        ] + self._dataset_specification(writer, dataset)

-    def _processor_params(self) -> list[dict[str, Any]]:
+    def _processor_params(self, dataset: DatasetSpec) -> list[dict[str, Any]]:
         """
         Returns:
             a download specification for the datasets in the provided model.
@@ -1107,7 +1178,7 @@ class Dataset:
                 "processor": "json",
                 "processor_params": {"query_path": "/schema", "output_path": "schema"},
             }
-        ] + self._dataset_specification(writer)
+        ] + self._dataset_specification(writer, dataset)

     @staticmethod
     def _download_dataset_element(
@@ -1244,7 +1315,7 @@ class Dataset:
                         },
                     },
                 ]
-                + self._processor_params(),
+                + self._processor_params(dataset),
             },
         }

@@ -187,6 +187,14 @@ class DatasetSpec(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)

+    @field_validator("version", mode="before")
+    @classmethod
+    def version_field_validator(cls, v: Any) -> Any:
+        if isinstance(v, dict):
+            return DatasetVersion(**v)
+        else:
+            return v
+
     @model_validator(mode="before")
     @classmethod
     def _check_bare_rid(cls, data: Any) -> dict[str, str | bool]:
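The `mode="before"` validator lets a plain dict stand in for a `DatasetVersion` when a `DatasetSpec` is rebuilt from JSON. A minimal sketch; the import path, the `rid` value, and the major/minor/patch field names are assumptions about `DatasetVersion`'s shape:

```python
from deriva_ml import DatasetSpec  # import path assumed

# Round-tripping a spec through JSON turns the version into a dict;
# the before-mode validator coerces it back to a DatasetVersion.
spec = DatasetSpec(
    rid="1-ABC",  # placeholder RID
    version={"major": 1, "minor": 0, "patch": 0},  # assumed field names
)
assert not isinstance(spec.version, dict)  # coerced by the validator
```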
deriva_ml/demo_catalog.py CHANGED
@@ -294,6 +294,7 @@ def create_demo_catalog(
         project_name=project_name,
         logging_level=logging.WARN,
     )
+    working_dir = deriva_ml.working_dir
     dataset_table = deriva_ml.dataset_table
     dataset_table.annotations.update(
         Dataset(
@@ -115,10 +115,12 @@ class DerivaML(Dataset):
             if working_dir
             else Path.home() / "deriva-ml"
         ) / default_workdir
+
         self.working_dir.mkdir(parents=True, exist_ok=True)
         self.cache_dir = (
             Path(cache_dir) if cache_dir else Path.home() / "deriva-ml" / "cache"
         )
+
         self.cache_dir.mkdir(parents=True, exist_ok=True)

         # Initialize dataset class.
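Both directories resolve the same way: an explicit argument wins, otherwise everything lands under `~/deriva-ml`. A standalone sketch of that fallback logic; `resolve_dirs` and the `default_workdir` value are hypothetical names for illustration:

```python
from pathlib import Path
from typing import Optional

def resolve_dirs(working_dir: Optional[str], cache_dir: Optional[str],
                 default_workdir: str = "deriva-ml-work"):
    # Mirrors the hunk above: explicit paths win, else fall back to ~/deriva-ml.
    work = (Path(working_dir) if working_dir
            else Path.home() / "deriva-ml") / default_workdir
    cache = Path(cache_dir) if cache_dir else Path.home() / "deriva-ml" / "cache"
    work.mkdir(parents=True, exist_ok=True)
    cache.mkdir(parents=True, exist_ok=True)
    return work, cache
```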
deriva_ml/deriva_model.py CHANGED
@@ -265,7 +265,9 @@ class DerivaModel:
         return relationships[0]

     def _schema_to_paths(
-        self, root: Table = None, path: list[Table] = None
+        self,
+        root: Table = None,
+        path: list[Table] = None,
     ) -> list[list[Table]]:
         """Recursively walk over the domain schema graph and extend the current path.

@@ -278,6 +280,7 @@ class DerivaModel:
             A list of all the paths through the graph. Each path is a list of tables.

         """
+
         root = root or self.model.schemas[self.ml_schema].tables["Dataset"]
         path = path.copy() if path else []
         parent = path[-1] if path else None  # Table that we are coming from.
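`_schema_to_paths` is a depth-first walk that copies the path on each descent so sibling branches never share state, and tracks the parent table to avoid stepping straight back along the edge it arrived on. A generic sketch of the pattern over a toy adjacency map; the real method walks ERMrest `Table` objects and handles association and vocabulary tables, which this version omits:

```python
def schema_paths(graph: dict[str, list[str]], root: str,
                 path: list[str] | None = None) -> list[list[str]]:
    # Copy-on-descend, as in the hunk above, so each branch owns its path.
    path = (path.copy() if path else []) + [root]
    parent = path[-2] if len(path) > 1 else None
    paths = [path]
    for child in graph.get(root, []):
        if child != parent:  # don't walk back up the edge we came in on
            paths.extend(schema_paths(graph, child, path))
    return paths

# Toy schema graph: Dataset -> Dataset_Image -> Image
print(schema_paths({"Dataset": ["Dataset_Image"], "Dataset_Image": ["Image"]}, "Dataset"))
# [['Dataset'], ['Dataset', 'Dataset_Image'], ['Dataset', 'Dataset_Image', 'Image']]
```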
@@ -1,12 +1,12 @@
 from __future__ import annotations

 import json
-from typing import Optional
+from typing import Optional, Any

 from pydantic import (
     BaseModel,
     conlist,
-    ConfigDict,
+    ConfigDict, field_validator,
 )
 from pathlib import Path

@@ -96,12 +96,12 @@ def execution_test(ml_instance):
         vc.workflow_type, "ML Demo", description="A ML Workflow that uses Deriva ML API"
     )

-    api_workflow = Workflow(
+    api_workflow = ml_instance.add_workflow(Workflow(
         name="Manual Workflow",
         url="https://github.com/informatics-isi-edu/deriva-ml/blob/main/docs/Notebooks/DerivaML%20Execution.ipynb",
         workflow_type="Manual Workflow",
         description="A manual operation",
-    )
+    ))

     manual_execution = ml_instance.create_execution(
         ExecutionConfiguration(description="Sample Execution", workflow=api_workflow)
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: deriva-ml
-Version: 1.8.0
+Version: 1.8.2
 Summary: Utilities to simplify use of Deriva and Pandas to create reproducible ML pipelines
 Author-email: ISRD <isrd-dev@isi.edu>
 Requires-Python: >=3.10
@@ -1,19 +1,19 @@
-deriva_ml/VERSION.py,sha256=Oc_xF94AMAHKZkZlB5rBt1iO0TXWFalg65MP4T2qt-A,22
+deriva_ml/VERSION.py,sha256=d6593s-XBNvVxri9lr2qLUDZQ3Zk3-VXHEwdb4pj8qA,22
 deriva_ml/__init__.py,sha256=0PHNB8gRDALLtaffRmU7wCUgWbRHVQZcjuPJxMLNEco,856
 deriva_ml/database_model.py,sha256=uhoyVyd8MQmY8J9ovCH8fjxhZDxxXNkdJyYdeyEGPXA,13898
-deriva_ml/dataset.py,sha256=b0FqUHKsaIKyyjq3O_P9lSP5I5aWAkEDp4nrQn3rzqw,55524
-deriva_ml/dataset_aux_classes.py,sha256=8cnhVTuaIS7NdxF34oKvzH3U1Rx9sc6jiTC6vOmAPF0,6313
+deriva_ml/dataset.py,sha256=5STHbjWomTCPl8isdlcDgLk_K9DLCfACajBAreUAXTQ,58272
+deriva_ml/dataset_aux_classes.py,sha256=YxjQnu2kS9kK_f8bGqhmgE6ty9GNeitCxfvReT9vaM0,6537
 deriva_ml/dataset_bag.py,sha256=e6IHv3saZUnZRfl0EjfnlV2NnmPeOagYYv3PuZqS1l0,11501
-deriva_ml/demo_catalog.py,sha256=eWUiQ1DMkMXFZEYqCGVlLJHOZd7uE5L6-Zr3kl8YUI0,11001
+deriva_ml/demo_catalog.py,sha256=xQPhFlflqwJskNQrQ-jdBSnGzBm2-aONBgcRxfsdNKM,11045
 deriva_ml/deriva_definitions.py,sha256=MGl29ogCzqrlRilMhSuR5tECo4NSHP4CLbJAXRtPH6E,8914
-deriva_ml/deriva_ml_base.py,sha256=GFz6SSjcwFt5KdJyWb7kcjr1srZjyQtanpdo81LlXCI,36441
-deriva_ml/deriva_model.py,sha256=F5zDw-MDV55POAjKB6dorpa7P4KTSN9hjDCN4E9zB9A,11986
+deriva_ml/deriva_ml_base.py,sha256=ShDZlG9F4XrGRUcUINT3bb_P_UdvV1FqSnnPsjGTCLU,36443
+deriva_ml/deriva_model.py,sha256=LV3FjIhIlz13ckZSmu0aOJhT9EVE0-M9oVMudfkxb0g,12004
 deriva_ml/execution.py,sha256=UcXWY1W5Mt_Yzuayd3Pjd-lKzLlMV5QXZFcLvE6Lt0E,28390
-deriva_ml/execution_configuration.py,sha256=PeBB4ZOcZwwMeRpg0QR1sCKD2AwSYdpzIl_PcVllYc0,3678
+deriva_ml/execution_configuration.py,sha256=nMeaG1qYdIgu4BV5atSUlcL8VZ3O6ohGY5iBhtD_LQ4,3700
 deriva_ml/execution_environment.py,sha256=bCRKrCELDbGQDo7_FKfw7e8iMzVjSRZK3baKkqH5-_0,3264
 deriva_ml/feature.py,sha256=7e8WYPCfJSrGxJh9oUTduYSnB5ekybRhXa_0HIigS_w,5459
 deriva_ml/history.py,sha256=qTDLDs8Ow_6r7mDO0gZm0Fg81SWKOAgtCU5pzZoDRgM,2828
-deriva_ml/test_functions.py,sha256=stunfznC759qCrhUBVeymDNBikrfBhqe4pLTw8p6PzA,4370
+deriva_ml/test_functions.py,sha256=-eqLHjjCQCLBNAr1ofbZekNiCOfMISSACRxT_YHER8I,4396
 deriva_ml/upload.py,sha256=HCOChW6bALW_gt0sWUs_81bNPsb72TNs4o0FQsGSLM4,22222
 deriva_ml/build/lib/schema_setup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deriva_ml/build/lib/schema_setup/alter_annotation.py,sha256=pkwk0WystN69JfAFK4iBJZAZVQKbRs-gN9IFYuS9rfg,1739
@@ -26,9 +26,9 @@ deriva_ml/schema_setup/annotations.py,sha256=Uogm9YkRtoKSdgfQlICqRywbCATppwBO-Xr
 deriva_ml/schema_setup/create_schema.py,sha256=jwziMWJPbjRgjiRBT-KtidnXI8YNEFO74A9fwfptjHY,10626
 deriva_ml/schema_setup/policy.json,sha256=77sf0Imy6CAQV0_VwwbA56_KROJ05WXsvT-Wjtkk538,1633
 deriva_ml/schema_setup/table_comments_utils.py,sha256=-2_ubEpoH7ViLVb-ZfW9wZbQ26DTKNgjkCABMzGu4i4,2140
-deriva_ml-1.8.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-deriva_ml-1.8.0.dist-info/METADATA,sha256=X_PTMVr4ZhOe7XBaEKWk2nPRpfvB3ayZIK5MRdPjQV8,556
-deriva_ml-1.8.0.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
-deriva_ml-1.8.0.dist-info/entry_points.txt,sha256=ZiOvrYj022x544TQwi018ujeHRRDahNmwJnzn5ThacM,242
-deriva_ml-1.8.0.dist-info/top_level.txt,sha256=I1Q1dkH96cRghdsFRVqwpa2M7IqJpR2QPUNNc5-Bnpw,10
-deriva_ml-1.8.0.dist-info/RECORD,,
+deriva_ml-1.8.2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+deriva_ml-1.8.2.dist-info/METADATA,sha256=DnSPqOt32ddlxTwuxGo9iL3DSbUKLIiMitRDMbxZcYQ,556
+deriva_ml-1.8.2.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
+deriva_ml-1.8.2.dist-info/entry_points.txt,sha256=ZiOvrYj022x544TQwi018ujeHRRDahNmwJnzn5ThacM,242
+deriva_ml-1.8.2.dist-info/top_level.txt,sha256=I1Q1dkH96cRghdsFRVqwpa2M7IqJpR2QPUNNc5-Bnpw,10
+deriva_ml-1.8.2.dist-info/RECORD,,
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.8.2)
+Generator: setuptools (76.0.0)
 Root-Is-Purelib: true
 Tag: py3-none-any