deriva-ml 1.8.1__py3-none-any.whl → 1.8.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/VERSION.py +1 -1
- deriva_ml/dataset.py +95 -24
- deriva_ml/deriva_model.py +4 -1
- {deriva_ml-1.8.1.dist-info → deriva_ml-1.8.2.dist-info}/METADATA +1 -1
- {deriva_ml-1.8.1.dist-info → deriva_ml-1.8.2.dist-info}/RECORD +9 -9
- {deriva_ml-1.8.1.dist-info → deriva_ml-1.8.2.dist-info}/WHEEL +1 -1
- {deriva_ml-1.8.1.dist-info → deriva_ml-1.8.2.dist-info}/LICENSE +0 -0
- {deriva_ml-1.8.1.dist-info → deriva_ml-1.8.2.dist-info}/entry_points.txt +0 -0
- {deriva_ml-1.8.1.dist-info → deriva_ml-1.8.2.dist-info}/top_level.txt +0 -0
deriva_ml/VERSION.py
CHANGED

```diff
@@ -1 +1 @@
-__version__ = "1.8.1"
+__version__ = "1.8.2"
```
deriva_ml/dataset.py
CHANGED

```diff
@@ -447,7 +447,7 @@ class Dataset:
 
     # @validate_call
     def list_dataset_members(
-        self, dataset_rid: RID, recurse: bool = False
+        self, dataset_rid: RID, recurse: bool = False, limit: Optional[int] = None
     ) -> dict[str, list[dict[str, Any]]]:
         """Return a list of entities associated with a specific dataset_table.
 
@@ -455,6 +455,7 @@ class Dataset:
             dataset_rid: param recurse: If this is a nested dataset_table, list the members of the contained datasets
             dataset_rid: RID:
             recurse: (Default value = False)
+            limit: If provided, the maxiumum number of members to return for each element type.
 
         Returns:
             Dictionary of entities associated with a specific dataset_table. Key is the table from which the elements
@@ -492,7 +493,9 @@ class Dataset:
                 target_path,
                 on=(member_path.columns[member_column] == target_path.columns["RID"]),
             )
-            target_entities = list(path.entities().fetch())
+            target_entities = list(
+                path.entities().fetch(limit=limit) if limit else path.entities().fetch()
+            )
             members[target_table.name].extend(target_entities)
             if recurse and target_table == self.dataset_table:
                 # Get the members for all the nested datasets and add to the member list.
```
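For orientation (not part of the diff itself): the new `limit` keyword bounds how many member records are fetched per element type, and 1.8.2 uses it internally with `limit=1` in `_collect_paths` to probe whether a dataset populates a given element type at all. A minimal usage sketch, where `ml` is assumed to be a connected `Dataset`/`DerivaML`-style object and the RID is hypothetical:

```python
# Sketch only: "ml" and the RID "1-ABCD" are illustrative, not from this diff.
members = ml.list_dataset_members("1-ABCD", limit=5)

# Keys are element-type table names; each value holds at most 5 member records.
for table_name, records in members.items():
    print(table_name, len(records))
```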
```diff
@@ -723,21 +726,16 @@ class Dataset:
                 for o in writer(f"{table.schema.name}:{table.name}", table.name, table)
             ]
 
-    def _table_paths(self) -> Iterator[tuple[list[str], list[str], list[Table]]]:
+    def _table_paths(
+        self, dataset: DatasetSpec = None
+    ) -> Iterator[tuple[list[str], list[str], list[Table]]]:
 
         dataset_dataset = self._model.schemas[self._ml_schema].tables["Dataset_Dataset"]
-        paths = self._model._schema_to_paths()
-        nested_paths = paths
-
-        for i in range(self._dataset_nesting_depth()):
-            if i == 0:
-                paths.extend([[self.dataset_table, dataset_dataset]])
-            nested_paths = [
-                [self.dataset_table, dataset_dataset] + p for p in nested_paths
-            ]
-            paths.extend(nested_paths)
 
-
+        paths = self._collect_paths(dataset and dataset.rid)
+
+        def source_path(path: tuple[Table, ...]):
+            path = list(path)
             p = [f"{self._model.ml_schema}:Dataset/RID={{Dataset_RID}}"]
             for table in path[1:]:
                 if table == dataset_dataset:
@@ -756,7 +754,65 @@ class Dataset:
 
         return zip(src_paths, dest_paths, target_tables)
 
-    def _dataset_nesting_depth(self) -> int:
+    def _collect_paths(
+        self,
+        dataset_rid: Optional[RID] = None,
+        dataset_nesting_depth: Optional[int] = None,
+    ) -> set[tuple[Table, ...]]:
+
+        dataset_nesting_depth = (
+            self._dataset_nesting_depth()
+            if dataset_nesting_depth is None
+            else dataset_nesting_depth
+        )
+        dataset_dataset = self._model.schemas[self._ml_schema].tables["Dataset_Dataset"]
+
+        # Figure out which paths we don't need to query for this dataset. If no dataset is provided, use them all.
+        dataset_elements = (
+            [
+                self._model.name_to_table(e)
+                for e, m in self.list_dataset_members(
+                    dataset_rid=dataset_rid, limit=1
+                ).items()
+                if m
+            ]
+            if dataset_rid
+            else self.list_dataset_element_types()
+        )
+
+        dataset_associations = [a.table for a in self.dataset_table.find_associations()]
+        included_associations = [
+            a.table
+            for a in self.dataset_table.find_associations()
+            if a.other_fkeys.pop().pk_table in dataset_elements
+        ]
+        # Get the paths through the schema and filter out all of dataset paths not used by this dataset.
+        paths = {
+            tuple(p)
+            for p in self._model._schema_to_paths()
+            if (len(p) == 1)
+            or (p[1] not in dataset_associations)
+            or (p[1] in included_associations)
+        }
+        # Now get paths for nested datasets
+        nested_paths = set()
+        if dataset_rid:
+            for c in self.list_dataset_children(dataset_rid=dataset_rid):
+                nested_paths |= self._collect_paths(c)
+        else:
+            if dataset_nesting_depth:
+                nested_paths = self._collect_paths(
+                    dataset_nesting_depth=dataset_nesting_depth - 1
+                )
+        if nested_paths:
+            paths |= {
+                tuple([self.dataset_table]),
+                (self.dataset_table, dataset_dataset),
+            }
+            paths |= {(self.dataset_table, dataset_dataset) + p for p in nested_paths}
+        return paths
+
+    def _dataset_nesting_depth(self, dataset_rid: Optional[RID] = None) -> int:
         """Determine the maximum dataset nesting depth in the current catalog.
 
         Returns:
```
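The core of this release is the new `_collect_paths`: it starts from `_schema_to_paths`, drops Dataset association paths whose element types this dataset never populates, and then prepends the `Dataset → Dataset_Dataset` hop once per nesting level (driven by children RIDs for a concrete dataset, by catalog nesting depth otherwise). A stripped-down sketch of that prepending recursion, using strings instead of `Table` objects (all names illustrative, not from deriva-ml):

```python
def collect_paths(base: set[tuple[str, ...]], depth: int) -> set[tuple[str, ...]]:
    """Prepend the Dataset -> Dataset_Dataset hop once per nesting level."""
    paths = set(base)
    if depth > 0:
        nested = collect_paths(base, depth - 1)
        # A nested dataset is reached through the Dataset_Dataset association.
        paths |= {("Dataset",), ("Dataset", "Dataset_Dataset")}
        paths |= {("Dataset", "Dataset_Dataset") + p for p in nested}
    return paths

# Two nesting levels over a single Dataset -> Image path:
for p in sorted(collect_paths({("Dataset", "Dataset_Image", "Image")}, depth=2)):
    print(p)
```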
```diff
@@ -766,7 +822,7 @@ class Dataset:
         def children_depth(
             dataset_rid: RID, nested_datasets: dict[RID, list[RID]]
         ) -> int:
-            """Return the number of nested datasets in the current catalog"""
+            """Return the number of nested datasets for the dataset_rid if provided, otherwise in the current catalog"""
             try:
                 children = nested_datasets[dataset_rid]
                 return (
@@ -783,8 +839,19 @@ class Dataset:
             .schemas[self._ml_schema]
             .tables["Dataset_Dataset"]
         )
+        dataset_children = (
+            [
+                {
+                    "Dataset": dataset_rid,
+                    "Nested_Dataset": c,
+                }  # Make uniform with return from datapath
+                for c in self.list_dataset_children(dataset_rid)
+            ]
+            if dataset_rid
+            else pb.entities().fetch()
+        )
         nested_dataset = defaultdict(list)
-        for ds in pb.entities().fetch():
+        for ds in dataset_children:
             nested_dataset[ds["Dataset"]].append(ds["Nested_Dataset"])
         return (
             max(map(lambda d: children_depth(d, dict(nested_dataset)), nested_dataset))
```
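The depth computation these hunks touch is a plain recursion over a parent-to-children map built from `Dataset_Dataset` rows. A self-contained sketch of the same idea with made-up RIDs (it mirrors, but is not, the `children_depth` helper above):

```python
from collections import defaultdict

def children_depth(rid: str, children: dict[str, list[str]]) -> int:
    """Depth of the nesting tree rooted at rid; 0 if it has no nested datasets."""
    kids = children.get(rid, [])
    return 1 + max((children_depth(k, children) for k in kids), default=0) if kids else 0

# Hypothetical Dataset/Nested_Dataset pairs: 1-A contains 1-B and 1-C; 1-B contains 1-D.
nested = defaultdict(list, {"1-A": ["1-B", "1-C"], "1-B": ["1-D"]})
print(max(children_depth(r, nested) for r in nested))  # 2 (1-A -> 1-B -> 1-D)
```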
```diff
@@ -793,7 +860,9 @@ class Dataset:
         )
 
     def _dataset_specification(
-        self, writer: Callable[[str, str, Table], list[dict[str, Any]]]
+        self,
+        writer: Callable[[str, str, Table], list[dict[str, Any]]],
+        dataset: DatasetSpec,
     ) -> list[dict[str, Any]]:
         """Output a download/export specification for a dataset_table. Each element of the dataset_table will be placed in its own dir
         The top level data directory of the resulting BDBag will have one subdirectory for element type. the subdirectory
@@ -833,7 +902,7 @@ class Dataset:
             A dataset_table specification.
         """
         element_spec = []
-        for path in self._table_paths():
+        for path in self._table_paths(dataset=dataset):
             element_spec.extend(writer(*path))
         return self._vocabulary_specification(writer) + element_spec
 
@@ -1042,7 +1111,9 @@ class Dataset:
             validated_check.touch()
         return Path(bag_path)
 
-    def _export_outputs(self) -> list[dict[str, Any]]:
+    def _export_outputs(
+        self, dataset: Optional[DatasetSpec] = None
+    ) -> list[dict[str, Any]]:
         """Return and output specification for the datasets in the provided model
 
         Returns:
@@ -1079,9 +1150,9 @@ class Dataset:
                 "source": {"api": "schema", "skip_root_path": True},
                 "destination": {"type": "json", "name": "schema"},
             },
-        ] + self._dataset_specification(writer)
+        ] + self._dataset_specification(writer, dataset)
 
-    def _processor_params(self) -> list[dict[str, Any]]:
+    def _processor_params(self, dataset: DatasetSpec) -> list[dict[str, Any]]:
         """
         Returns:
             a download specification for the datasets in the provided model.
@@ -1107,7 +1178,7 @@ class Dataset:
                 "processor": "json",
                 "processor_params": {"query_path": "/schema", "output_path": "schema"},
             }
-        ] + self._dataset_specification(writer)
+        ] + self._dataset_specification(writer, dataset)
 
     @staticmethod
     def _download_dataset_element(
@@ -1244,7 +1315,7 @@ class Dataset:
                 },
             },
         ]
-        + self._processor_params(),
+        + self._processor_params(dataset),
             },
         }
 
```
deriva_ml/deriva_model.py
CHANGED

```diff
@@ -265,7 +265,9 @@ class DerivaModel:
         return relationships[0]
 
     def _schema_to_paths(
-        self, root: Table = None, path: list[Table] = None
+        self,
+        root: Table = None,
+        path: list[Table] = None,
     ) -> list[list[Table]]:
         """Recursively walk over the domain schema graph and extend the current path.
 
@@ -278,6 +280,7 @@ class DerivaModel:
             A list of all the paths through the graph. Each path is a list of tables.
 
         """
+
         root = root or self.model.schemas[self.ml_schema].tables["Dataset"]
         path = path.copy() if path else []
         parent = path[-1] if path else None  # Table that we are coming from.
```
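Per its docstring, `_schema_to_paths` recursively walks the schema graph from the `Dataset` table and returns every path as a list of tables. The shape of that walk, sketched over a toy adjacency map of table names rather than real `Table` objects (graph and names illustrative, not deriva-ml's API):

```python
def schema_to_paths(root: str, graph: dict[str, list[str]], path: list[str] | None = None) -> list[list[str]]:
    """Enumerate every path from root, never revisiting a table already on the path."""
    path = (path or []) + [root]
    paths = [path]
    for neighbor in graph.get(root, []):
        if neighbor not in path:
            paths.extend(schema_to_paths(neighbor, graph, path))
    return paths

# Hypothetical schema: Dataset linked to Image through an association table.
graph = {"Dataset": ["Dataset_Image"], "Dataset_Image": ["Image"]}
print(schema_to_paths("Dataset", graph))
# [['Dataset'], ['Dataset', 'Dataset_Image'], ['Dataset', 'Dataset_Image', 'Image']]
```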
{deriva_ml-1.8.1.dist-info → deriva_ml-1.8.2.dist-info}/RECORD
CHANGED

```diff
@@ -1,13 +1,13 @@
-deriva_ml/VERSION.py,sha256=
+deriva_ml/VERSION.py,sha256=d6593s-XBNvVxri9lr2qLUDZQ3Zk3-VXHEwdb4pj8qA,22
 deriva_ml/__init__.py,sha256=0PHNB8gRDALLtaffRmU7wCUgWbRHVQZcjuPJxMLNEco,856
 deriva_ml/database_model.py,sha256=uhoyVyd8MQmY8J9ovCH8fjxhZDxxXNkdJyYdeyEGPXA,13898
-deriva_ml/dataset.py,sha256=
+deriva_ml/dataset.py,sha256=5STHbjWomTCPl8isdlcDgLk_K9DLCfACajBAreUAXTQ,58272
 deriva_ml/dataset_aux_classes.py,sha256=YxjQnu2kS9kK_f8bGqhmgE6ty9GNeitCxfvReT9vaM0,6537
 deriva_ml/dataset_bag.py,sha256=e6IHv3saZUnZRfl0EjfnlV2NnmPeOagYYv3PuZqS1l0,11501
 deriva_ml/demo_catalog.py,sha256=xQPhFlflqwJskNQrQ-jdBSnGzBm2-aONBgcRxfsdNKM,11045
 deriva_ml/deriva_definitions.py,sha256=MGl29ogCzqrlRilMhSuR5tECo4NSHP4CLbJAXRtPH6E,8914
 deriva_ml/deriva_ml_base.py,sha256=ShDZlG9F4XrGRUcUINT3bb_P_UdvV1FqSnnPsjGTCLU,36443
-deriva_ml/deriva_model.py,sha256=
+deriva_ml/deriva_model.py,sha256=LV3FjIhIlz13ckZSmu0aOJhT9EVE0-M9oVMudfkxb0g,12004
 deriva_ml/execution.py,sha256=UcXWY1W5Mt_Yzuayd3Pjd-lKzLlMV5QXZFcLvE6Lt0E,28390
 deriva_ml/execution_configuration.py,sha256=nMeaG1qYdIgu4BV5atSUlcL8VZ3O6ohGY5iBhtD_LQ4,3700
 deriva_ml/execution_environment.py,sha256=bCRKrCELDbGQDo7_FKfw7e8iMzVjSRZK3baKkqH5-_0,3264
@@ -26,9 +26,9 @@ deriva_ml/schema_setup/annotations.py,sha256=Uogm9YkRtoKSdgfQlICqRywbCATppwBO-Xr
 deriva_ml/schema_setup/create_schema.py,sha256=jwziMWJPbjRgjiRBT-KtidnXI8YNEFO74A9fwfptjHY,10626
 deriva_ml/schema_setup/policy.json,sha256=77sf0Imy6CAQV0_VwwbA56_KROJ05WXsvT-Wjtkk538,1633
 deriva_ml/schema_setup/table_comments_utils.py,sha256=-2_ubEpoH7ViLVb-ZfW9wZbQ26DTKNgjkCABMzGu4i4,2140
-deriva_ml-1.8.
-deriva_ml-1.8.
-deriva_ml-1.8.
-deriva_ml-1.8.
-deriva_ml-1.8.
-deriva_ml-1.8.
+deriva_ml-1.8.2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+deriva_ml-1.8.2.dist-info/METADATA,sha256=DnSPqOt32ddlxTwuxGo9iL3DSbUKLIiMitRDMbxZcYQ,556
+deriva_ml-1.8.2.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
+deriva_ml-1.8.2.dist-info/entry_points.txt,sha256=ZiOvrYj022x544TQwi018ujeHRRDahNmwJnzn5ThacM,242
+deriva_ml-1.8.2.dist-info/top_level.txt,sha256=I1Q1dkH96cRghdsFRVqwpa2M7IqJpR2QPUNNc5-Bnpw,10
+deriva_ml-1.8.2.dist-info/RECORD,,
```
{deriva_ml-1.8.1.dist-info → deriva_ml-1.8.2.dist-info}/LICENSE
File without changes

{deriva_ml-1.8.1.dist-info → deriva_ml-1.8.2.dist-info}/entry_points.txt
File without changes

{deriva_ml-1.8.1.dist-info → deriva_ml-1.8.2.dist-info}/top_level.txt
File without changes