deriva-ml 1.6.8__py3-none-any.whl → 1.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/VERSION.py +1 -1
- deriva_ml/__init__.py +2 -0
- deriva_ml/database_model.py +23 -80
- deriva_ml/dataset.py +144 -193
- deriva_ml/dataset_aux_classes.py +1 -0
- deriva_ml/dataset_bag.py +101 -7
- deriva_ml/demo_catalog.py +94 -14
- deriva_ml/deriva_definitions.py +80 -31
- deriva_ml/deriva_ml_base.py +118 -11
- deriva_ml/deriva_model.py +98 -2
- deriva_ml/execution.py +64 -9
- deriva_ml/execution_configuration.py +2 -1
- deriva_ml/execution_environment.py +4 -2
- deriva_ml/feature.py +0 -3
- deriva_ml/history.py +1 -2
- deriva_ml/schema_setup/create_schema.py +34 -7
- deriva_ml/test_functions.py +4 -24
- deriva_ml/upload.py +1 -2
- {deriva_ml-1.6.8.dist-info → deriva_ml-1.8.0.dist-info}/METADATA +1 -1
- deriva_ml-1.8.0.dist-info/RECORD +34 -0
- {deriva_ml-1.6.8.dist-info → deriva_ml-1.8.0.dist-info}/WHEEL +1 -1
- deriva_ml-1.6.8.dist-info/RECORD +0 -34
- {deriva_ml-1.6.8.dist-info → deriva_ml-1.8.0.dist-info}/LICENSE +0 -0
- {deriva_ml-1.6.8.dist-info → deriva_ml-1.8.0.dist-info}/entry_points.txt +0 -0
- {deriva_ml-1.6.8.dist-info → deriva_ml-1.8.0.dist-info}/top_level.txt +0 -0
deriva_ml/dataset.py
CHANGED
@@ -9,6 +9,7 @@ accessible via a DerivaML class instance.
 from bdbag.fetch.fetcher import fetch_single_file
 from bdbag import bdbag_api as bdb
 from collections import defaultdict
+
 from deriva.core.ermrest_model import Table
 from deriva.core.utils.core_utils import tag as deriva_tags, format_exception
 from deriva.transfer.download.deriva_export import DerivaExport
@@ -25,6 +26,7 @@ try:
 except ImportError:  # Graceful fallback if IceCream isn't installed.
     ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a)  # noqa

+from graphlib import TopologicalSorter
 import json
 import logging
 from pathlib import Path
@@ -35,7 +37,7 @@ from pydantic import (
 import requests

 from tempfile import TemporaryDirectory, NamedTemporaryFile
-from typing import Any, Callable, Optional, Iterable
+from typing import Any, Callable, Optional, Iterable, Iterator

 from deriva_ml import DatasetBag
 from .deriva_definitions import ML_SCHEMA, DerivaMLException, MLVocab, Status, RID
@@ -73,9 +75,10 @@ class Dataset:
             rid_info = self._model.catalog.resolve_rid(dataset_rid, self._model.model)
         except KeyError as _e:
             raise DerivaMLException(f"Invalid RID {dataset_rid}")
-
-
-
+        if rid_info.table != self.dataset_table:
+            return False
+        elif deleted:
+            # Got a dataset rid. Now check to see if its deleted or not.
             return True
         else:
             return not list(rid_info.datapath.entities().fetch())[0]["Deleted"]
@@ -85,6 +88,7 @@ class Dataset:
         dataset_rid: RID,
         dataset_version: DatasetVersion,
         description: Optional[str] = "",
+        execution_rid: Optional[RID] = None,
     ) -> RID:
         schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
         version_path = schema_path.tables["Dataset_Version"]
@@ -94,6 +98,7 @@ class Dataset:
                     "Dataset": dataset_rid,
                     "Version": str(dataset_version),
                     "Description": description,
+                    "Execution": execution_rid,
                 }
             ]
         )[0]["RID"]
@@ -163,6 +168,7 @@ class Dataset:
                 dataset_rid=dataset_rid,
                 version_rid=v["RID"],
                 description=v["Description"],
+                execution_rid=v["Execution"],
             )
             for v in version_path.filter(version_path.Dataset == dataset_rid)
             .entities()
@@ -190,11 +196,30 @@ class Dataset:
         else:
             return max([h.dataset_version for h in self.dataset_history(dataset_rid)])

+    def _build_dataset_graph(self, dataset_rid: RID) -> Iterable[RID]:
+        ts = TopologicalSorter()
+        self._build_dataset_graph_1(dataset_rid, ts, set())
+        return ts.static_order()
+
+    def _build_dataset_graph_1(self, dataset_rid: RID, ts, visited) -> None:
+        """Use topological sort to return bottom up list of nested datasets"""
+        ts.add(dataset_rid)
+        if dataset_rid not in visited:
+            visited.add(dataset_rid)
+            children = self.list_dataset_children(dataset_rid=dataset_rid)
+            parents = self.list_dataset_parents(dataset_rid=dataset_rid)
+            for parent in parents:
+                self._build_dataset_graph_1(parent, ts, visited)
+            for child in children:
+                self._build_dataset_graph_1(child, ts, visited)
+
+    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
     def increment_dataset_version(
         self,
         dataset_rid: RID,
         component: VersionPart,
         description: Optional[str] = "",
+        execution_rid: Optional[RID] = None,
     ) -> DatasetVersion:
         """Increment the version of the specified dataset_table.

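Note: the `_build_dataset_graph` helper added above is built on the standard-library `graphlib.TopologicalSorter`, newly imported at the top of the module. A minimal, standalone sketch of that API follows; the dataset RIDs and the parent/child edge here are hypothetical, not taken from deriva-ml:

    from graphlib import TopologicalSorter

    # Hypothetical nested-dataset graph: the parent dataset lists its children
    # as predecessors, so children sort before the parent (bottom-up order).
    ts = TopologicalSorter()
    ts.add("parent-rid", "child-rid-1", "child-rid-2")
    ts.add("child-rid-1")
    ts.add("child-rid-2")

    print(list(ts.static_order()))
    # e.g. ['child-rid-1', 'child-rid-2', 'parent-rid']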
@@ -204,6 +229,7 @@ class Dataset:
             dataset_rid: RID of the dataset whose version is to be incremented.
             component: Major, Minor or Patch
             description: Description of the version update of the dataset_table.
+            execution_rid: Which execution is performing increment.

         Returns:
             new semantic version of the dataset_table as a 3-tuple
@@ -211,16 +237,16 @@ class Dataset:
         Raises:
             DerivaMLException: if provided RID is not to a dataset_table.
         """
-        for
-        self.
-
-
-
+        for dataset in self._build_dataset_graph(dataset_rid=dataset_rid):
+            version = self.dataset_version(dataset)
+            new_version = version.increment_version(component)
+            self._insert_dataset_version(
+                dataset,
+                new_version,
+                description=description,
+                execution_rid=execution_rid,
             )
-
-        new_version = version.increment_version(component)
-        self._insert_dataset_version(dataset_rid, new_version, description=description)
-        return new_version
+        return self.dataset_version(dataset_rid)

     @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
     def create_dataset(
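A hedged usage sketch of the reworked method, as the new signature suggests it would be called; the RIDs below are placeholders and `ml` stands in for an object exposing this Dataset API (this is not taken from the package's documentation):

    # Placeholders: "1-abc0" / "2-def0" are made-up RIDs; `ml` is assumed to expose
    # the Dataset methods shown in this diff; VersionPart is the enum used above.
    new_version = ml.increment_dataset_version(
        dataset_rid="1-abc0",
        component=VersionPart.minor,
        description="Minor bump after adding members.",
        execution_rid="2-def0",  # recorded in Dataset_Version.Execution
    )
    print(new_version)  # version of the dataset passed in, after the bump propagates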
@@ -268,7 +294,7 @@ class Dataset:
         pb = self._model.catalog.getPathBuilder()
         for ds_type in ds_types:
             if not check_dataset_type(ds_type):
-                raise DerivaMLException(
+                raise DerivaMLException("Dataset type must be a vocabulary term.")
         dataset_table_path = pb.schemas[self.dataset_table.schema.name].tables[
             self.dataset_table.name
         ]
@@ -297,7 +323,12 @@ class Dataset:
             pb.schemas[self._ml_schema].Dataset_Execution.insert(
                 [{"Dataset": dataset_rid, "Execution": execution_rid}]
             )
-        self._insert_dataset_version(
+        self._insert_dataset_version(
+            dataset_rid,
+            dataset_version=version,
+            execution_rid=execution_rid,
+            description="Initial dataset creation.",
+        )
         return dataset_rid

     @validate_call
@@ -414,7 +445,7 @@ class Dataset:
         self._model.model.apply()
         return table

-    @validate_call
+    # @validate_call
     def list_dataset_members(
         self, dataset_rid: RID, recurse: bool = False
     ) -> dict[str, list[dict[str, Any]]]:
@@ -439,34 +470,27 @@ class Dataset:
         pb = self._model.catalog.getPathBuilder()
         for assoc_table in self.dataset_table.find_associations():
             other_fkey = assoc_table.other_fkeys.pop()
-            self_fkey = assoc_table.self_fkey
             target_table = other_fkey.pk_table
             member_table = assoc_table.table

+            # Look at domain tables and nested datasets.
             if (
                 target_table.schema.name != self._model.domain_schema
                 and target_table != self.dataset_table
             ):
-                # Look at domain tables and nested datasets.
                 continue
-
-
-
+            member_column = (
+                "Nested_Dataset"
+                if target_table == self.dataset_table
+                else other_fkey.foreign_key_columns[0].name
+            )

             target_path = pb.schemas[target_table.schema.name].tables[target_table.name]
             member_path = pb.schemas[member_table.schema.name].tables[member_table.name]
-
-
-                c.name for c in next(iter(other_fkey.column_map.items()))
-            )
-            path = pb.schemas[member_table.schema.name].tables[member_table.name].path
-            path.filter(member_path.Dataset == dataset_rid)
-            path.link(
+
+            path = member_path.filter(member_path.Dataset == dataset_rid).link(
                 target_path,
-                on=(
-                    member_path.columns[member_link[0]]
-                    == target_path.columns[member_link[1]]
-                ),
+                on=(member_path.columns[member_column] == target_path.columns["RID"]),
             )
             target_entities = list(path.entities().fetch())
             members[target_table.name].extend(target_entities)
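The rewritten member lookup above collapses the old multi-step path construction into a single filter/link chain over the catalog path builder. Schematically, following the pattern in this hunk (the schema, table, column names, and RID below are illustrative placeholders, not values from a real catalog):

    # Illustrative only: `pb` is a catalog path builder as in the hunk above.
    member_path = pb.schemas["deriva-ml"].tables["Dataset_Image"]
    target_path = pb.schemas["domain"].tables["Image"]

    path = member_path.filter(member_path.Dataset == "1-abc0").link(
        target_path,
        on=(member_path.columns["Image"] == target_path.columns["RID"]),
    )
    rows = list(path.entities().fetch())  # member records joined to their targets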
@@ -485,6 +509,7 @@ class Dataset:
         members: list[RID],
         validate: bool = True,
         description: Optional[str] = "",
+        execution_rid: Optional[RID] = None,
     ) -> None:
         """Add additional elements to an existing dataset_table.

@@ -496,6 +521,7 @@ class Dataset:
             members: List of RIDs of members to add to the dataset_table.
             validate: Check rid_list to make sure elements are not already in the dataset_table.
             description: Markdown description of the updated dataset.
+            execution_rid: Optional RID of execution associated with this dataset.
         """
         members = set(members)
         description = description or "Updated dataset via add_dataset_members"
@@ -559,12 +585,19 @@ class Dataset:
             [{"Dataset": dataset_rid, fk_column: e} for e in elements]
         )
         self.increment_dataset_version(
-            dataset_rid,
+            dataset_rid,
+            VersionPart.minor,
+            description=description,
+            execution_rid=execution_rid,
         )

     @validate_call
     def delete_dataset_members(
-        self,
+        self,
+        dataset_rid: RID,
+        members: list[RID],
+        description: str = "",
+        execution_rid: Optional[RID] = None,
     ) -> None:
         """Remove elements to an existing dataset_table.

@@ -575,6 +608,7 @@ class Dataset:
             dataset_rid: RID of dataset_table to extend or None if new dataset_table is to be created.
             members: List of RIDs of members to add to the dataset_table.
             description: Markdown description of the updated dataset.
+            execution_rid: Optional RID of execution associated with this operation.
         """

         members = set(members)
@@ -616,7 +650,10 @@ class Dataset:
             )
             entity.delete()
         self.increment_dataset_version(
-            dataset_rid,
+            dataset_rid,
+            VersionPart.minor,
+            description=description,
+            execution_rid=execution_rid,
         )

     @validate_call
@@ -663,44 +700,6 @@ class Dataset:
             children.extend(self.list_dataset_children(child, recurse=recurse))
         return children

-    @staticmethod
-    def _download_dataset_element(
-        spath: str, dpath: str, table: Table
-    ) -> list[dict[str, Any]]:
-        """Return the download specification for the data object indicated by a path through the data model.
-
-        Args:
-            spath: Source path
-            dpath: Destination path
-            table: Table referenced to by the path
-
-        Returns:
-            The download specification that will retrieve that data from the catalog and place it into a BDBag.
-        """
-        exports = [
-            {
-                "processor": "csv",
-                "processor_params": {
-                    "query_path": f"/entity/{spath}?limit=none",
-                    "output_path": dpath,
-                },
-            }
-        ]
-
-        # If this table is an asset table, then we need to output the files associated with the asset.
-        asset_columns = {"Filename", "URL", "Length", "MD5", "Description"}
-        if asset_columns.issubset({c.name for c in table.columns}):
-            exports.append(
-                {
-                    "processor": "fetch",
-                    "processor_params": {
-                        "query_path": f"/attribute/{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5?limit=none",
-                        "output_path": f"asset/{table.name}",
-                    },
-                }
-            )
-        return exports
-
     def _vocabulary_specification(
         self, writer: Callable[[str, str, Table], list[dict[str, Any]]]
     ) -> list[dict[str, Any]]:
@@ -724,82 +723,38 @@ class Dataset:
             for o in writer(f"{table.schema.name}:{table.name}", table.name, table)
         ]

-    def
-        self,
-        graph: dict[Table, list[dict[Table, Any]]],
-        spath: str = None,
-        dpath: str = None,
-        sprefix: str = "deriva-ml:Dataset/RID={Dataset_RID}",
-        dprefix: str = "Dataset",
-        nested: bool = False,
-    ) -> list[tuple[str, str, Table]]:
-        """Recursively walk over the domain schema graph and extend the current path.
+    def _table_paths(self) -> Iterator[tuple[list[str], list[str], list[Table]]]:

-
-
-
-            spath: Source path so far
-            dpath: Destination path so far
-            sprefix: Initial path to be included. Allows for nested datasets
-            dprefix: Initial path to be included. Allows for nested datasets
-            nested: If true, skip initial data segment.
+        dataset_dataset = self._model.schemas[self._ml_schema].tables["Dataset_Dataset"]
+        paths = self._model._schema_to_paths()
+        nested_paths = paths

-        Returns:
-            A list of all the paths through the graph. Each path is a list of tables.
-
-        """
-        source_path = spath or sprefix
-        dest_path = dpath or dprefix
-        paths = []
-        for node, children in graph.items():
-            if node.name == "Dataset":
-                paths.append(
-                    (
-                        f"{sprefix}/(RID)=({self._ml_schema}:Dataset_Version:Dataset)",
-                        f"{dprefix}/Dataset_Version",
-                        self._model.schemas[self._ml_schema].tables["Dataset_Version"],
-                    )
-                )
-            new_spath = sprefix
-            new_dpath = dprefix
-
-            if not nested:
-                paths.append((new_spath, new_dpath, node))
-            else:
-                new_spath = source_path + f"/{node.schema.name}:{node.name}"
-                new_dpath = dest_path + f"/{node.name}"
-                paths.append((new_spath, new_dpath, node))
-            for child in children:
-                paths.extend(
-                    self._domain_table_paths(child, new_spath, new_dpath, nested=nested)
-                )
-        return paths
-
-    def _table_paths(self, graph) -> list[tuple[str, str, Table]]:
-        sprefix = "deriva-ml:Dataset/RID={Dataset_RID}"
-        dprefix = "Dataset"
-        dataset_dataset_table = self._model.schemas[self._ml_schema].tables[
-            "Dataset_Dataset"
-        ]
-        table_paths = self._domain_table_paths(
-            graph=graph, sprefix=sprefix, dprefix=dprefix
-        )
-        nested_sprefix = sprefix
-        nested_dprefix = dprefix
         for i in range(self._dataset_nesting_depth()):
-
-
-
-
-
-
-
-
-
-
-
-
-
+            if i == 0:
+                paths.extend([[self.dataset_table, dataset_dataset]])
+                nested_paths = [
+                    [self.dataset_table, dataset_dataset] + p for p in nested_paths
+                ]
+            paths.extend(nested_paths)
+
+        def source_path(path):
+            p = [f"{self._model.ml_schema}:Dataset/RID={{Dataset_RID}}"]
+            for table in path[1:]:
+                if table == dataset_dataset:
+                    p.append("(RID)=(deriva-ml:Dataset_Dataset:Dataset)")
+                elif table == self.dataset_table:
+                    p.append("(Nested_Dataset)=(deriva-ml:Dataset:RID)")
+                elif table.name == "Dataset_Version":
+                    p.append(f"(RID)=({self._model.ml_schema}:Dataset_Version:Dataset)")
+                else:
+                    p.append(f"{table.schema.name}:{table.name}")
+            return p
+
+        src_paths = ["/".join(source_path(p)) for p in paths]
+        dest_paths = ["/".join([t.name for t in p]) for p in paths]
+        target_tables = [p[-1] for p in paths]
+
+        return zip(src_paths, dest_paths, target_tables)

     def _dataset_nesting_depth(self):
         """Determine the maximum dataset nesting depth in the current catalog.
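To illustrate what the new `source_path` helper produces, here is a rough standalone sketch that mimics its string-building on a hypothetical table path; the real code operates on ermrest Table objects, and "my-domain"/"Image" are made-up names:

    # Stand-in strings replace the ermrest Table objects used by the real helper.
    def source_path(path, ml_schema="deriva-ml", domain_schema="my-domain"):
        p = [f"{ml_schema}:Dataset/RID={{Dataset_RID}}"]
        for table in path[1:]:
            if table == "Dataset_Dataset":
                p.append(f"(RID)=({ml_schema}:Dataset_Dataset:Dataset)")
            elif table == "Dataset":
                p.append(f"(Nested_Dataset)=({ml_schema}:Dataset:RID)")
            elif table == "Dataset_Version":
                p.append(f"(RID)=({ml_schema}:Dataset_Version:Dataset)")
            else:
                p.append(f"{domain_schema}:{table}")
        return "/".join(p)

    # Hypothetical nested path: Dataset -> Dataset_Dataset -> Dataset -> Image
    print(source_path(["Dataset", "Dataset_Dataset", "Dataset", "Image"]))
    # deriva-ml:Dataset/RID={Dataset_RID}/(RID)=(deriva-ml:Dataset_Dataset:Dataset)
    #   /(Nested_Dataset)=(deriva-ml:Dataset:RID)/my-domain:Image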
@@ -811,6 +766,7 @@ class Dataset:
         def children_depth(
             dataset_rid: RID, nested_datasets: dict[RID, list[RID]]
         ) -> int:
+            """Return the number of nested datasets in the current catalog"""
             try:
                 children = nested_datasets[dataset_rid]
                 return (
@@ -836,50 +792,6 @@ class Dataset:
             else 0
         )

-    def _schema_graph(
-        self, node: Table, visited_nodes: Optional[set] = None
-    ) -> dict[Table, list[dict[Table, list]]]:
-        """Generate an undirected, acyclic graph of domain schema. We do this by traversing the schema foreign key
-        relationships. We stop when we hit the deriva-ml schema or when we reach a node that we have already seen.
-
-        Nested datasets need to be unfolded
-
-        Args:
-            node: Current (starting) node in the graph.
-            visited_nodes: param nested_dataset: Are we in a nested dataset_table, (i.e. have we seen the DataSet table)?
-
-        Returns:
-            Graph of the schema, starting from node.
-        """
-
-        visited_nodes = visited_nodes or set()
-        graph = {node: []}
-
-        def include_node(child: Table) -> bool:
-            """Indicate if the table should be included in the graph.
-
-            Include node in the graph if it's not a loopback from fk<-> referred_by, you have not already been to the
-            node.
-            """
-            return (
-                child != node
-                and child not in visited_nodes
-                and child.schema.name == self._model.domain_schema
-            )
-
-        # Get all the tables reachable from the end of the path avoiding loops from T1<->T2 via referenced_by
-        nodes = {fk.pk_table for fk in node.foreign_keys if include_node(fk.pk_table)}
-        nodes |= {fk.table for fk in node.referenced_by if include_node(fk.table)}
-        for t in nodes:
-            new_visited_nodes = visited_nodes.copy()
-            new_visited_nodes.add(t)
-            if self._model.is_vocabulary(t):
-                # If the end of the path is a vocabulary table, we are at a terminal node in the ERD, so stop
-                continue
-            # Get all the paths that extend the current path
-            graph[node].append(self._schema_graph(t, new_visited_nodes))
-        return graph
-
     def _dataset_specification(
         self, writer: Callable[[str, str, Table], list[dict[str, Any]]]
     ) -> list[dict[str, Any]]:
@@ -921,7 +833,7 @@ class Dataset:
             A dataset_table specification.
         """
         element_spec = []
-        for path in self._table_paths(
+        for path in self._table_paths():
             element_spec.extend(writer(*path))
         return self._vocabulary_specification(writer) + element_spec

@@ -953,7 +865,7 @@ class Dataset:
             if dataset.materialize
             else self._download_dataset_bag(minid)
         )
-        return DatabaseModel
+        return DatabaseModel(minid, bag_path).get_dataset()

     def _version_snapshot(self, dataset: DatasetSpec) -> str:
         version_record = [
@@ -1089,6 +1001,7 @@ class Dataset:
         """

         def update_status(status: Status, msg: str) -> None:
+            """Update the current status for this execution in the catalog"""
            self._model.catalog.getPathBuilder().schemas[
                self._ml_schema
            ].Execution.update(
@@ -1192,10 +1105,48 @@ class Dataset:
         return [
             {
                 "processor": "json",
-                "processor_params": {"query_path":
+                "processor_params": {"query_path": "/schema", "output_path": "schema"},
             }
         ] + self._dataset_specification(writer)

+    @staticmethod
+    def _download_dataset_element(
+        spath: str, dpath: str, table: Table
+    ) -> list[dict[str, Any]]:
+        """Return the download specification for the data object indicated by a path through the data model.
+
+        Args:
+            spath: Source path
+            dpath: Destination path
+            table: Table referenced to by the path
+
+        Returns:
+            The download specification that will retrieve that data from the catalog and place it into a BDBag.
+        """
+        exports = [
+            {
+                "processor": "csv",
+                "processor_params": {
+                    "query_path": f"/entity/{spath}?limit=none",
+                    "output_path": dpath,
+                },
+            }
+        ]
+
+        # If this table is an asset table, then we need to output the files associated with the asset.
+        asset_columns = {"Filename", "URL", "Length", "MD5", "Description"}
+        if asset_columns.issubset({c.name for c in table.columns}):
+            exports.append(
+                {
+                    "processor": "fetch",
+                    "processor_params": {
+                        "query_path": f"/attribute/{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5?limit=none",
+                        "output_path": f"asset/{table.name}",
+                    },
+                }
+            )
+        return exports
+
     @staticmethod
     def _export_dataset_element(
         spath: str, dpath: str, table: Table
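For reference, a sketch of the export specification `_download_dataset_element` would emit for a hypothetical asset table; the source path, destination path, and table name ("my-domain:Image", "Dataset/Image", "Image") are placeholders, not values from a real catalog:

    # Roughly the structure returned for an asset table, with placeholder paths.
    example_spec = [
        {
            "processor": "csv",
            "processor_params": {
                "query_path": "/entity/deriva-ml:Dataset/RID={Dataset_RID}/my-domain:Image?limit=none",
                "output_path": "Dataset/Image",
            },
        },
        {
            "processor": "fetch",
            "processor_params": {
                "query_path": "/attribute/deriva-ml:Dataset/RID={Dataset_RID}/my-domain:Image"
                "/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5?limit=none",
                "output_path": "asset/Image",
            },
        },
    ]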
deriva_ml/dataset_aux_classes.py
CHANGED