deriva-ml 1.8.0__py3-none-any.whl → 1.8.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/VERSION.py +1 -1
- deriva_ml/dataset.py +96 -25
- deriva_ml/dataset_aux_classes.py +8 -0
- deriva_ml/demo_catalog.py +1 -0
- deriva_ml/deriva_ml_base.py +2 -0
- deriva_ml/deriva_model.py +4 -1
- deriva_ml/execution_configuration.py +2 -2
- deriva_ml/test_functions.py +2 -2
- {deriva_ml-1.8.0.dist-info → deriva_ml-1.8.2.dist-info}/METADATA +1 -1
- {deriva_ml-1.8.0.dist-info → deriva_ml-1.8.2.dist-info}/RECORD +14 -14
- {deriva_ml-1.8.0.dist-info → deriva_ml-1.8.2.dist-info}/WHEEL +1 -1
- {deriva_ml-1.8.0.dist-info → deriva_ml-1.8.2.dist-info}/LICENSE +0 -0
- {deriva_ml-1.8.0.dist-info → deriva_ml-1.8.2.dist-info}/entry_points.txt +0 -0
- {deriva_ml-1.8.0.dist-info → deriva_ml-1.8.2.dist-info}/top_level.txt +0 -0
deriva_ml/VERSION.py
CHANGED
@@ -1 +1 @@
-__version__ = "1.8.0"
+__version__ = "1.8.2"
deriva_ml/dataset.py
CHANGED
@@ -447,7 +447,7 @@ class Dataset:
 
     # @validate_call
     def list_dataset_members(
-        self, dataset_rid: RID, recurse: bool = False
+        self, dataset_rid: RID, recurse: bool = False, limit: Optional[int] = None
     ) -> dict[str, list[dict[str, Any]]]:
         """Return a list of entities associated with a specific dataset_table.
 
@@ -455,6 +455,7 @@ class Dataset:
             dataset_rid: param recurse: If this is a nested dataset_table, list the members of the contained datasets
             dataset_rid: RID:
             recurse: (Default value = False)
+            limit: If provided, the maxiumum number of members to return for each element type.
 
         Returns:
             Dictionary of entities associated with a specific dataset_table. Key is the table from which the elements
@@ -492,7 +493,9 @@ class Dataset:
                 target_path,
                 on=(member_path.columns[member_column] == target_path.columns["RID"]),
             )
-            target_entities = list(
+            target_entities = list(
+                path.entities().fetch(limit=limit) if limit else path.entities().fetch()
+            )
             members[target_table.name].extend(target_entities)
             if recurse and target_table == self.dataset_table:
                 # Get the members for all the nested datasets and add to the member list.
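The three hunks above add an optional limit argument to Dataset.list_dataset_members() and pass it through to the datapath fetch(limit=...) call. A minimal usage sketch, not part of the diff, assuming a connected DerivaML instance named ml (DerivaML subclasses Dataset) and a hypothetical dataset RID:

    # Fetch at most five members per element type instead of the full membership.
    members = ml.list_dataset_members(dataset_rid="1-ABC12", limit=5)
    for table_name, rows in members.items():
        # Each value is a list of row dictionaries from the corresponding table.
        print(table_name, len(rows))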
@@ -723,21 +726,16 @@ class Dataset:
             for o in writer(f"{table.schema.name}:{table.name}", table.name, table)
         ]
 
-    def _table_paths(
+    def _table_paths(
+        self, dataset: DatasetSpec = None
+    ) -> Iterator[tuple[list[str], list[str], list[Table]]]:
 
         dataset_dataset = self._model.schemas[self._ml_schema].tables["Dataset_Dataset"]
-        paths = self._model._schema_to_paths()
-        nested_paths = paths
-
-        for i in range(self._dataset_nesting_depth()):
-            if i == 0:
-                paths.extend([[self.dataset_table, dataset_dataset]])
-            nested_paths = [
-                [self.dataset_table, dataset_dataset] + p for p in nested_paths
-            ]
-            paths.extend(nested_paths)
 
-
+        paths = self._collect_paths(dataset and dataset.rid)
+
+        def source_path(path: tuple[Table, ...]):
+            path = list(path)
             p = [f"{self._model.ml_schema}:Dataset/RID={{Dataset_RID}}"]
             for table in path[1:]:
                 if table == dataset_dataset:
@@ -756,7 +754,65 @@ class Dataset:
 
         return zip(src_paths, dest_paths, target_tables)
 
-    def
+    def _collect_paths(
+        self,
+        dataset_rid: Optional[RID] = None,
+        dataset_nesting_depth: Optional[int] = None,
+    ) -> set[tuple[Table, ...]]:
+
+        dataset_nesting_depth = (
+            self._dataset_nesting_depth()
+            if dataset_nesting_depth is None
+            else dataset_nesting_depth
+        )
+        dataset_dataset = self._model.schemas[self._ml_schema].tables["Dataset_Dataset"]
+
+        # Figure out which paths we don't need to query for this dataset. If no dataset is provided, use them all.
+        dataset_elements = (
+            [
+                self._model.name_to_table(e)
+                for e, m in self.list_dataset_members(
+                    dataset_rid=dataset_rid, limit=1
+                ).items()
+                if m
+            ]
+            if dataset_rid
+            else self.list_dataset_element_types()
+        )
+
+        dataset_associations = [a.table for a in self.dataset_table.find_associations()]
+        included_associations = [
+            a.table
+            for a in self.dataset_table.find_associations()
+            if a.other_fkeys.pop().pk_table in dataset_elements
+        ]
+        # Get the paths through the schema and filter out all of dataset paths not used by this dataset.
+        paths = {
+            tuple(p)
+            for p in self._model._schema_to_paths()
+            if (len(p) == 1)
+            or (p[1] not in dataset_associations)
+            or (p[1] in included_associations)
+        }
+        # Now get paths for nested datasets
+        nested_paths = set()
+        if dataset_rid:
+            for c in self.list_dataset_children(dataset_rid=dataset_rid):
+                nested_paths |= self._collect_paths(c)
+        else:
+            if dataset_nesting_depth:
+                nested_paths = self._collect_paths(
+                    dataset_nesting_depth=dataset_nesting_depth - 1
+                )
+        if nested_paths:
+            paths |= {
+                tuple([self.dataset_table]),
+                (self.dataset_table, dataset_dataset),
+            }
+        paths |= {(self.dataset_table, dataset_dataset) + p for p in nested_paths}
+        return paths
+
+    def _dataset_nesting_depth(self, dataset_rid: Optional[RID] = None) -> int:
         """Determine the maximum dataset nesting depth in the current catalog.
 
         Returns:
@@ -766,7 +822,7 @@ class Dataset:
         def children_depth(
             dataset_rid: RID, nested_datasets: dict[RID, list[RID]]
         ) -> int:
-            """Return the number of nested datasets in the current catalog"""
+            """Return the number of nested datasets for the dataset_rid if provided, otherwise in the current catalog"""
             try:
                 children = nested_datasets[dataset_rid]
                 return (
@@ -783,8 +839,19 @@ class Dataset:
             .schemas[self._ml_schema]
             .tables["Dataset_Dataset"]
         )
+        dataset_children = (
+            [
+                {
+                    "Dataset": dataset_rid,
+                    "Nested_Dataset": c,
+                }  # Make uniform with return from datapath
+                for c in self.list_dataset_children(dataset_rid)
+            ]
+            if dataset_rid
+            else pb.entities().fetch()
+        )
         nested_dataset = defaultdict(list)
-        for ds in
+        for ds in dataset_children:
             nested_dataset[ds["Dataset"]].append(ds["Nested_Dataset"])
         return (
             max(map(lambda d: children_depth(d, dict(nested_dataset)), nested_dataset))
@@ -793,7 +860,9 @@ class Dataset:
         )
 
     def _dataset_specification(
-        self,
+        self,
+        writer: Callable[[str, str, Table], list[dict[str, Any]]],
+        dataset: DatasetSpec,
     ) -> list[dict[str, Any]]:
         """Output a download/export specification for a dataset_table. Each element of the dataset_table will be placed in its own dir
         The top level data directory of the resulting BDBag will have one subdirectory for element type. the subdirectory
@@ -833,7 +902,7 @@ class Dataset:
             A dataset_table specification.
         """
         element_spec = []
-        for path in self._table_paths():
+        for path in self._table_paths(dataset=dataset):
             element_spec.extend(writer(*path))
         return self._vocabulary_specification(writer) + element_spec
 
@@ -892,7 +961,7 @@ class Dataset:
             config_file=spec_file,
             output_dir=tmp_dir,
             defer_download=True,
-            timeout=(10,
+            timeout=(10, 610),
             envars={"Dataset_RID": dataset.rid},
         )
         minid_page_url = exporter.export()[0]  # Get the MINID launch page
@@ -1042,7 +1111,9 @@ class Dataset:
         validated_check.touch()
         return Path(bag_path)
 
-    def _export_outputs(
+    def _export_outputs(
+        self, dataset: Optional[DatasetSpec] = None
+    ) -> list[dict[str, Any]]:
         """Return and output specification for the datasets in the provided model
 
         Returns:
@@ -1079,9 +1150,9 @@ class Dataset:
                 "source": {"api": "schema", "skip_root_path": True},
                 "destination": {"type": "json", "name": "schema"},
             },
-        ] + self._dataset_specification(writer)
+        ] + self._dataset_specification(writer, dataset)
 
-    def _processor_params(self) -> list[dict[str, Any]]:
+    def _processor_params(self, dataset: DatasetSpec) -> list[dict[str, Any]]:
         """
         Returns:
             a download specification for the datasets in the provided model.
@@ -1107,7 +1178,7 @@ class Dataset:
                 "processor": "json",
                 "processor_params": {"query_path": "/schema", "output_path": "schema"},
             }
-        ] + self._dataset_specification(writer)
+        ] + self._dataset_specification(writer, dataset)
 
     @staticmethod
     def _download_dataset_element(
@@ -1244,7 +1315,7 @@ class Dataset:
                     },
                 },
             ]
-            + self._processor_params(),
+            + self._processor_params(dataset),
         },
     }
 
deriva_ml/dataset_aux_classes.py
CHANGED
@@ -187,6 +187,14 @@ class DatasetSpec(BaseModel):
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
+    @field_validator("version", mode="before")
+    @classmethod
+    def version_field_validator(cls, v: Any) -> Any:
+        if isinstance(v, dict):
+            return DatasetVersion(**v)
+        else:
+            return v
+
     @model_validator(mode="before")
     @classmethod
     def _check_bare_rid(cls, data: Any) -> dict[str, str | bool]:
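The new validator lets DatasetSpec accept its version field either as a DatasetVersion instance or as a plain dict that is coerced before standard validation runs. A self-contained sketch of the same Pydantic v2 pattern, using hypothetical stand-in models rather than the real DatasetSpec and DatasetVersion classes:

    from typing import Any
    from pydantic import BaseModel, field_validator

    class Version(BaseModel):  # stand-in for DatasetVersion
        major: int
        minor: int
        patch: int

    class Spec(BaseModel):  # stand-in for DatasetSpec
        rid: str
        version: Version

        @field_validator("version", mode="before")
        @classmethod
        def _coerce_version(cls, v: Any) -> Any:
            # Accept a bare dict (e.g. parsed from JSON) and build the model from it.
            return Version(**v) if isinstance(v, dict) else v

    spec = Spec(rid="1-ABC12", version={"major": 1, "minor": 0, "patch": 0})
    assert isinstance(spec.version, Version)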
deriva_ml/demo_catalog.py
CHANGED
deriva_ml/deriva_ml_base.py
CHANGED
@@ -115,10 +115,12 @@ class DerivaML(Dataset):
             if working_dir
             else Path.home() / "deriva-ml"
         ) / default_workdir
+
         self.working_dir.mkdir(parents=True, exist_ok=True)
         self.cache_dir = (
             Path(cache_dir) if cache_dir else Path.home() / "deriva-ml" / "cache"
         )
+
         self.cache_dir.mkdir(parents=True, exist_ok=True)
 
         # Initialize dataset class.
deriva_ml/deriva_model.py
CHANGED
@@ -265,7 +265,9 @@ class DerivaModel:
         return relationships[0]
 
     def _schema_to_paths(
-        self,
+        self,
+        root: Table = None,
+        path: list[Table] = None,
     ) -> list[list[Table]]:
         """Recursively walk over the domain schema graph and extend the current path.
 
@@ -278,6 +280,7 @@ class DerivaModel:
             A list of all the paths through the graph. Each path is a list of tables.
 
         """
+
         root = root or self.model.schemas[self.ml_schema].tables["Dataset"]
         path = path.copy() if path else []
         parent = path[-1] if path else None  # Table that we are coming from.
deriva_ml/test_functions.py
CHANGED
@@ -96,12 +96,12 @@ def execution_test(ml_instance):
         vc.workflow_type, "ML Demo", description="A ML Workflow that uses Deriva ML API"
     )
 
-    api_workflow = Workflow(
+    api_workflow = ml_instance.add_workflow(Workflow(
         name="Manual Workflow",
         url="https://github.com/informatics-isi-edu/deriva-ml/blob/main/docs/Notebooks/DerivaML%20Execution.ipynb",
         workflow_type="Manual Workflow",
         description="A manual operation",
-    )
+    ))
 
     manual_execution = ml_instance.create_execution(
         ExecutionConfiguration(description="Sample Execution", workflow=api_workflow)
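The test now registers the Workflow record through add_workflow before it is referenced from an ExecutionConfiguration. A condensed sketch of the updated call pattern, assuming an existing DerivaML instance ml_instance and that "Manual Workflow" is a workflow type defined in the catalog:

    workflow = ml_instance.add_workflow(
        Workflow(
            name="Manual Workflow",
            url="https://github.com/informatics-isi-edu/deriva-ml/blob/main/docs/Notebooks/DerivaML%20Execution.ipynb",
            workflow_type="Manual Workflow",
            description="A manual operation",
        )
    )
    execution = ml_instance.create_execution(
        ExecutionConfiguration(description="Sample Execution", workflow=workflow)
    )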
{deriva_ml-1.8.0.dist-info → deriva_ml-1.8.2.dist-info}/RECORD
CHANGED
@@ -1,19 +1,19 @@
-deriva_ml/VERSION.py,sha256=
+deriva_ml/VERSION.py,sha256=d6593s-XBNvVxri9lr2qLUDZQ3Zk3-VXHEwdb4pj8qA,22
 deriva_ml/__init__.py,sha256=0PHNB8gRDALLtaffRmU7wCUgWbRHVQZcjuPJxMLNEco,856
 deriva_ml/database_model.py,sha256=uhoyVyd8MQmY8J9ovCH8fjxhZDxxXNkdJyYdeyEGPXA,13898
-deriva_ml/dataset.py,sha256=
-deriva_ml/dataset_aux_classes.py,sha256=
+deriva_ml/dataset.py,sha256=5STHbjWomTCPl8isdlcDgLk_K9DLCfACajBAreUAXTQ,58272
+deriva_ml/dataset_aux_classes.py,sha256=YxjQnu2kS9kK_f8bGqhmgE6ty9GNeitCxfvReT9vaM0,6537
 deriva_ml/dataset_bag.py,sha256=e6IHv3saZUnZRfl0EjfnlV2NnmPeOagYYv3PuZqS1l0,11501
-deriva_ml/demo_catalog.py,sha256=
+deriva_ml/demo_catalog.py,sha256=xQPhFlflqwJskNQrQ-jdBSnGzBm2-aONBgcRxfsdNKM,11045
 deriva_ml/deriva_definitions.py,sha256=MGl29ogCzqrlRilMhSuR5tECo4NSHP4CLbJAXRtPH6E,8914
-deriva_ml/deriva_ml_base.py,sha256=
-deriva_ml/deriva_model.py,sha256=
+deriva_ml/deriva_ml_base.py,sha256=ShDZlG9F4XrGRUcUINT3bb_P_UdvV1FqSnnPsjGTCLU,36443
+deriva_ml/deriva_model.py,sha256=LV3FjIhIlz13ckZSmu0aOJhT9EVE0-M9oVMudfkxb0g,12004
 deriva_ml/execution.py,sha256=UcXWY1W5Mt_Yzuayd3Pjd-lKzLlMV5QXZFcLvE6Lt0E,28390
-deriva_ml/execution_configuration.py,sha256=
+deriva_ml/execution_configuration.py,sha256=nMeaG1qYdIgu4BV5atSUlcL8VZ3O6ohGY5iBhtD_LQ4,3700
 deriva_ml/execution_environment.py,sha256=bCRKrCELDbGQDo7_FKfw7e8iMzVjSRZK3baKkqH5-_0,3264
 deriva_ml/feature.py,sha256=7e8WYPCfJSrGxJh9oUTduYSnB5ekybRhXa_0HIigS_w,5459
 deriva_ml/history.py,sha256=qTDLDs8Ow_6r7mDO0gZm0Fg81SWKOAgtCU5pzZoDRgM,2828
-deriva_ml/test_functions.py,sha256
+deriva_ml/test_functions.py,sha256=-eqLHjjCQCLBNAr1ofbZekNiCOfMISSACRxT_YHER8I,4396
 deriva_ml/upload.py,sha256=HCOChW6bALW_gt0sWUs_81bNPsb72TNs4o0FQsGSLM4,22222
 deriva_ml/build/lib/schema_setup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deriva_ml/build/lib/schema_setup/alter_annotation.py,sha256=pkwk0WystN69JfAFK4iBJZAZVQKbRs-gN9IFYuS9rfg,1739
@@ -26,9 +26,9 @@ deriva_ml/schema_setup/annotations.py,sha256=Uogm9YkRtoKSdgfQlICqRywbCATppwBO-Xr
 deriva_ml/schema_setup/create_schema.py,sha256=jwziMWJPbjRgjiRBT-KtidnXI8YNEFO74A9fwfptjHY,10626
 deriva_ml/schema_setup/policy.json,sha256=77sf0Imy6CAQV0_VwwbA56_KROJ05WXsvT-Wjtkk538,1633
 deriva_ml/schema_setup/table_comments_utils.py,sha256=-2_ubEpoH7ViLVb-ZfW9wZbQ26DTKNgjkCABMzGu4i4,2140
-deriva_ml-1.8.
-deriva_ml-1.8.
-deriva_ml-1.8.
-deriva_ml-1.8.
-deriva_ml-1.8.
-deriva_ml-1.8.
+deriva_ml-1.8.2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+deriva_ml-1.8.2.dist-info/METADATA,sha256=DnSPqOt32ddlxTwuxGo9iL3DSbUKLIiMitRDMbxZcYQ,556
+deriva_ml-1.8.2.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
+deriva_ml-1.8.2.dist-info/entry_points.txt,sha256=ZiOvrYj022x544TQwi018ujeHRRDahNmwJnzn5ThacM,242
+deriva_ml-1.8.2.dist-info/top_level.txt,sha256=I1Q1dkH96cRghdsFRVqwpa2M7IqJpR2QPUNNc5-Bnpw,10
+deriva_ml-1.8.2.dist-info/RECORD,,
{deriva_ml-1.8.0.dist-info → deriva_ml-1.8.2.dist-info}/LICENSE
File without changes
{deriva_ml-1.8.0.dist-info → deriva_ml-1.8.2.dist-info}/entry_points.txt
File without changes
{deriva_ml-1.8.0.dist-info → deriva_ml-1.8.2.dist-info}/top_level.txt
File without changes