deriva-ml 1.6.7__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/VERSION.py +1 -1
- deriva_ml/database_model.py +23 -80
- deriva_ml/dataset.py +128 -171
- deriva_ml/dataset_aux_classes.py +1 -0
- deriva_ml/dataset_bag.py +101 -7
- deriva_ml/demo_catalog.py +93 -12
- deriva_ml/deriva_definitions.py +43 -32
- deriva_ml/deriva_ml_base.py +133 -10
- deriva_ml/deriva_model.py +98 -2
- deriva_ml/execution.py +122 -248
- deriva_ml/execution_configuration.py +3 -2
- deriva_ml/execution_environment.py +2 -0
- deriva_ml/feature.py +0 -3
- deriva_ml/history.py +1 -2
- deriva_ml/schema_setup/create_schema.py +1 -0
- deriva_ml/test_functions.py +1 -17
- deriva_ml/upload.py +1 -1
- {deriva_ml-1.6.7.dist-info → deriva_ml-1.7.0.dist-info}/METADATA +1 -1
- deriva_ml-1.7.0.dist-info/RECORD +34 -0
- {deriva_ml-1.6.7.dist-info → deriva_ml-1.7.0.dist-info}/WHEEL +1 -1
- deriva_ml-1.6.7.dist-info/RECORD +0 -34
- {deriva_ml-1.6.7.dist-info → deriva_ml-1.7.0.dist-info}/LICENSE +0 -0
- {deriva_ml-1.6.7.dist-info → deriva_ml-1.7.0.dist-info}/entry_points.txt +0 -0
- {deriva_ml-1.6.7.dist-info → deriva_ml-1.7.0.dist-info}/top_level.txt +0 -0
deriva_ml/VERSION.py
CHANGED
@@ -1 +1 @@
-__version__ = "1.6.7"
+__version__ = "1.7.0"
deriva_ml/database_model.py
CHANGED
@@ -1,14 +1,15 @@
+"""This module contains the definition of the DatabaseModel class. The role of this class is to provide an interface between the BDBag representation
+of a dataset and a SQLite database in which the contents of the bag are stored.
+"""
 import logging
 import sqlite3
 
 from csv import reader
 from pathlib import Path
-from typing import Any,
+from typing import Any, Optional
 from urllib.parse import urlparse
 
-import pandas as pd
 from deriva.core.ermrest_model import Model
-from pydantic import validate_call
 
 from .deriva_definitions import ML_SCHEMA, MLVocab, RID, DerivaMLException
 from .dataset_aux_classes import DatasetVersion, DatasetMinid
@@ -16,7 +17,21 @@ from .deriva_model import DerivaModel
 from .dataset_bag import DatasetBag
 
 
-class DatabaseModel(DerivaModel):
+class DatabaseModelMeta(type):
+    """Use metaclass to ensure that there is only one instance per path"""
+
+    _paths_loaded: dict[Path:"DatabaseModel"] = {}
+
+    def __call__(cls, *args, **kwargs):
+        logger = logging.getLogger("deriva_ml")
+        bag_path: Path = args[1]
+        if bag_path.as_posix() not in cls._paths_loaded:
+            logger.info(f"Loading {bag_path}")
+            cls._paths_loaded[bag_path] = super().__call__(*args, **kwargs)
+        return cls._paths_loaded[bag_path]
+
+
+class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
     """Read in the contents of a BDBag and create a local SQLite database.
 
     As part of its initialization, this routine will create a sqlite database that has the contents of all the tables
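The `register()` classmethod is gone in 1.7.0; instance caching now happens in the new `DatabaseModelMeta.__call__`, so constructing a `DatabaseModel` twice for the same bag path yields the same object. A minimal, self-contained sketch of that metaclass pattern (generic names, not the real deriva-ml classes):

    from pathlib import Path

    class CachedByPath(type):
        """Metaclass that returns the cached instance when the same path is seen again."""

        _instances: dict[str, object] = {}

        def __call__(cls, path: Path):
            key = path.as_posix()
            if key not in cls._instances:
                # Only here does type.__call__ run __new__/__init__ and build a real instance.
                cls._instances[key] = super().__call__(path)
            return cls._instances[key]

    class Bag(metaclass=CachedByPath):
        def __init__(self, path: Path):
            self.path = path

    assert Bag(Path("/tmp/bag1")) is Bag(Path("/tmp/bag1"))       # cached: __init__ ran once
    assert Bag(Path("/tmp/bag1")) is not Bag(Path("/tmp/bag2"))   # different path, new instance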
@@ -32,6 +47,9 @@ class DatabaseModel(DerivaModel):
     Because of nested datasets, it's possible that more than one dataset rid is in a bag, or that a dataset rid might
     appear in more than one database. To help manage this, a global list of all the datasets that have been loaded
     into DatabaseModels, is kept in the class variable `_rid_map`.
+
+    Because you can load different versions of a dataset simultaneously, the dataset RID and version number are tracked, and a new
+    SQLite instance is created for every new dataset version present.
 
     Attributes:
         bag_path (Path): path to the local copy of the BDBag
@@ -42,29 +60,9 @@ class DatabaseModel(DerivaModel):
         dataset_table (Table): the dataset table in the ERMRest model.
     """
 
-    # Keep track of what databases we have loaded.
-    _paths_loaded: dict[Path:"DatabaseModel"] = {}
-
     # Maintain a global map of RIDS to versions and databases.
     _rid_map: dict[RID, list[tuple[DatasetVersion, "DatabaseModel"]]] = {}
 
-    @classmethod
-    @validate_call
-    def register(cls, minid: DatasetMinid, bag_path: Path):
-        """Register a new minid in the list of local databases if it's new, otherwise, return an existing DatabaseModel.
-
-        Args:
-            minid: MINID to the databag that is to be loaded.
-            bag_path: Path to the bag on the local filesystem.
-
-        Returns:
-            A DatabaseModel instance to the loaded bag.
-        """
-        o = cls._paths_loaded.get(bag_path.as_posix())
-        if o:
-            return o
-        return cls(minid, bag_path)
-
     @staticmethod
     def rid_lookup(dataset_rid: RID) -> list[tuple[DatasetVersion, "DatabaseModel"]]:
         """Return a list of DatasetVersion/DatabaseModel instances corresponding to the given RID.
@@ -84,13 +82,12 @@ class DatabaseModel(DerivaModel):
         raise DerivaMLException(f"Dataset {dataset_rid} not found")
 
     def __init__(self, minid: DatasetMinid, bag_path: Path):
-        """Create a new DatabaseModel.
+        """Create a new DatabaseModel.
 
         Args:
            minid: Minid for the specified bag.
           bag_path: Path to the local copy of the BDBag.
        """
-        DatabaseModel._paths_loaded[bag_path.as_posix()] = self
 
         self.bag_path = bag_path
         self.minid = minid
@@ -342,60 +339,6 @@
         except KeyError:
             raise DerivaMLException(f'Table name "{table}" does not exist.')
 
-    def get_table(self, table: str) -> Generator[tuple, None, None]:
-        """Retrieve the contents of the specified table. If schema is not provided as part of the table name,
-        the method will attempt to locate the schema for the table.
-
-        Args:
-            table: return: A generator that yields tuples of column values.
-
-        Returns:
-            A generator that yields tuples of column values.
-
-        """
-        table_name = self.normalize_table_name(table)
-        result = self.dbase.execute(f'SELECT * FROM "{table_name}"')
-        while row := result.fetchone():
-            yield row
-
-    def get_table_as_dataframe(self, table: str) -> pd.DataFrame:
-        """Retrieve the contents of the specified table as a dataframe.
-
-
-        If schema is not provided as part of the table name,
-        the method will attempt to locate the schema for the table.
-
-        Args:
-            table: Table to retrieve data from.
-
-        Returns:
-            A dataframe containing the contents of the specified table.
-        """
-        table_name = self.normalize_table_name(table)
-        return pd.read_sql(f'SELECT * FROM "{table_name}"', con=self.dbase)
-
-    def get_table_as_dict(self, table: str) -> Generator[dict[str, Any], None, None]:
-        """Retrieve the contents of the specified table as a dictionary.
-
-        Args:
-            table: Table to retrieve data from. f schema is not provided as part of the table name,
-                the method will attempt to locate the schema for the table.
-
-        Returns:
-            A generator producing dictionaries containing the contents of the specified table as name/value pairs.
-        """
-        table_name = self.normalize_table_name(table)
-        with self.dbase:
-            col_names = [
-                c[1]
-                for c in self.dbase.execute(
-                    f'PRAGMA table_info("{table_name}")'
-                ).fetchall()
-            ]
-            result = self.dbase.execute(f'SELECT * FROM "{table_name}"')
-            while row := result.fetchone():
-                yield dict(zip(col_names, row))
-
     def delete_database(self):
         """
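The `get_table*` accessors removed above were thin wrappers over the bag's local SQLite connection; dataset_bag.py grows by roughly a hundred lines in this release, so they appear to live on the DatasetBag side now. For reference, the same reads can be reproduced with the standard sqlite3 and pandas APIs, assuming the bag's database already exists (file and table names below are illustrative, not deriva-ml API):

    import sqlite3
    import pandas as pd

    con = sqlite3.connect("dataset.db")              # illustrative path to the bag's SQLite file
    table = "my-schema:Image"                        # illustrative schema-qualified table name

    rows = con.execute(f'SELECT * FROM "{table}"').fetchall()             # like get_table()
    cols = [c[1] for c in con.execute(f'PRAGMA table_info("{table}")')]
    dicts = [dict(zip(cols, r)) for r in rows]                            # like get_table_as_dict()
    df = pd.read_sql(f'SELECT * FROM "{table}"', con=con)                 # like get_table_as_dataframe()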
deriva_ml/dataset.py
CHANGED
@@ -9,6 +9,7 @@ accessible via a DerivaML class instance.
 from bdbag.fetch.fetcher import fetch_single_file
 from bdbag import bdbag_api as bdb
 from collections import defaultdict
+
 from deriva.core.ermrest_model import Table
 from deriva.core.utils.core_utils import tag as deriva_tags, format_exception
 from deriva.transfer.download.deriva_export import DerivaExport
@@ -25,6 +26,7 @@ try:
 except ImportError:  # Graceful fallback if IceCream isn't installed.
     ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a)  # noqa
 
+from graphlib import TopologicalSorter
 import json
 import logging
 from pathlib import Path
@@ -35,7 +37,7 @@ from pydantic import (
 import requests
 
 from tempfile import TemporaryDirectory, NamedTemporaryFile
-from typing import Any, Callable, Optional, Iterable
+from typing import Any, Callable, Optional, Iterable, Iterator
 
 from deriva_ml import DatasetBag
 from .deriva_definitions import ML_SCHEMA, DerivaMLException, MLVocab, Status, RID
@@ -85,6 +87,7 @@ class Dataset:
         dataset_rid: RID,
         dataset_version: DatasetVersion,
         description: Optional[str] = "",
+        execution_rid: Optional[RID] = None,
     ) -> RID:
         schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
         version_path = schema_path.tables["Dataset_Version"]
@@ -94,6 +97,7 @@ class Dataset:
                     "Dataset": dataset_rid,
                     "Version": str(dataset_version),
                     "Description": description,
+                    "Execution": execution_rid,
                 }
             ]
         )[0]["RID"]
@@ -163,6 +167,7 @@ class Dataset:
                 dataset_rid=dataset_rid,
                 version_rid=v["RID"],
                 description=v["Description"],
+                execution_rid=v["Execution"],
             )
             for v in version_path.filter(version_path.Dataset == dataset_rid)
             .entities()
@@ -190,11 +195,30 @@ class Dataset:
         else:
             return max([h.dataset_version for h in self.dataset_history(dataset_rid)])
 
+    def _build_dataset_graph(self, dataset_rid: RID) -> Iterable[RID]:
+        ts = TopologicalSorter()
+        self._build_dataset_graph_1(dataset_rid, ts, set())
+        return ts.static_order()
+
+    def _build_dataset_graph_1(self, dataset_rid: RID, ts, visited) -> None:
+        """Use topological sort to return bottom up list of nested datasets"""
+        ts.add(dataset_rid)
+        if dataset_rid not in visited:
+            visited.add(dataset_rid)
+            children = self.list_dataset_children(dataset_rid=dataset_rid)
+            parents = self.list_dataset_parents(dataset_rid=dataset_rid)
+            for parent in parents:
+                self._build_dataset_graph_1(parent, ts, visited)
+            for child in children:
+                self._build_dataset_graph_1(child, ts, visited)
+
     @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
     def increment_dataset_version(
         self,
         dataset_rid: RID,
         component: VersionPart,
         description: Optional[str] = "",
+        execution_rid: Optional[RID] = None,
     ) -> DatasetVersion:
         """Increment the version of the specified dataset_table.
 
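`increment_dataset_version` now walks the entire nested-dataset graph, built with `graphlib.TopologicalSorter` from the standard library (Python 3.9+). A self-contained illustration of the ordering primitive being used here, with made-up dataset names:

    from graphlib import TopologicalSorter

    # "root" contains "child-a" and "child-b"; "child-a" contains "leaf".
    # add(node, *predecessors): a node is emitted only after all of its predecessors.
    ts = TopologicalSorter()
    ts.add("root", "child-a", "child-b")
    ts.add("child-a", "leaf")
    print(list(ts.static_order()))   # one valid order: ['child-b', 'leaf', 'child-a', 'root']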
@@ -204,6 +228,7 @@ class Dataset:
             dataset_rid: RID of the dataset whose version is to be incremented.
             component: Major, Minor or Patch
             description: Description of the version update of the dataset_table.
+            execution_rid: Which execution is performing increment.
 
         Returns:
             new semantic version of the dataset_table as a 3-tuple
@@ -211,16 +236,16 @@ class Dataset:
         Raises:
             DerivaMLException: if provided RID is not to a dataset_table.
         """
-        for
-        self.
-
-
-
+        for dataset in self._build_dataset_graph(dataset_rid=dataset_rid):
+            version = self.dataset_version(dataset)
+            new_version = version.increment_version(component)
+            self._insert_dataset_version(
+                dataset,
+                new_version,
+                description=description,
+                execution_rid=execution_rid,
             )
-
-        new_version = version.increment_version(component)
-        self._insert_dataset_version(dataset_rid, new_version, description=description)
-        return new_version
+        return self.dataset_version(dataset_rid)
 
     @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
     def create_dataset(
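The rewritten body bumps the version of every dataset reachable in that graph rather than only the RID that was passed in. The version arithmetic itself is ordinary semantic versioning; a simplified, self-contained sketch of what `increment_version(component)` amounts to (these types stand in for the real DatasetVersion/VersionPart classes):

    from dataclasses import dataclass
    from enum import Enum

    class Part(Enum):
        major = "major"
        minor = "minor"
        patch = "patch"

    @dataclass(frozen=True)
    class SemVer:
        major: int
        minor: int
        patch: int

        def increment(self, part: Part) -> "SemVer":
            if part is Part.major:
                return SemVer(self.major + 1, 0, 0)
            if part is Part.minor:
                return SemVer(self.major, self.minor + 1, 0)
            return SemVer(self.major, self.minor, self.patch + 1)

    assert SemVer(1, 6, 7).increment(Part.minor) == SemVer(1, 7, 0)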
@@ -297,7 +322,12 @@ class Dataset:
         pb.schemas[self._ml_schema].Dataset_Execution.insert(
             [{"Dataset": dataset_rid, "Execution": execution_rid}]
         )
-        self._insert_dataset_version(
+        self._insert_dataset_version(
+            dataset_rid,
+            dataset_version=version,
+            execution_rid=execution_rid,
+            description="Initial dataset creation.",
+        )
         return dataset_rid
 
     @validate_call
@@ -485,6 +515,7 @@ class Dataset:
         members: list[RID],
         validate: bool = True,
         description: Optional[str] = "",
+        execution_rid: Optional[RID] = None,
     ) -> None:
         """Add additional elements to an existing dataset_table.
 
@@ -496,6 +527,7 @@ class Dataset:
             members: List of RIDs of members to add to the dataset_table.
             validate: Check rid_list to make sure elements are not already in the dataset_table.
             description: Markdown description of the updated dataset.
+            execution_rid: Optional RID of execution associated with this dataset.
         """
         members = set(members)
         description = description or "Updated dataset via add_dataset_members"
@@ -559,12 +591,19 @@ class Dataset:
             [{"Dataset": dataset_rid, fk_column: e} for e in elements]
         )
         self.increment_dataset_version(
-            dataset_rid,
+            dataset_rid,
+            VersionPart.minor,
+            description=description,
+            execution_rid=execution_rid,
         )
 
     @validate_call
     def delete_dataset_members(
-        self,
+        self,
+        dataset_rid: RID,
+        members: list[RID],
+        description: str = "",
+        execution_rid: Optional[RID] = None,
     ) -> None:
         """Remove elements to an existing dataset_table.
 
@@ -575,6 +614,7 @@ class Dataset:
             dataset_rid: RID of dataset_table to extend or None if new dataset_table is to be created.
             members: List of RIDs of members to add to the dataset_table.
             description: Markdown description of the updated dataset.
+            execution_rid: Optional RID of execution associated with this operation.
         """
 
         members = set(members)
@@ -616,7 +656,10 @@ class Dataset:
         )
         entity.delete()
         self.increment_dataset_version(
-            dataset_rid,
+            dataset_rid,
+            VersionPart.minor,
+            description=description,
+            execution_rid=execution_rid,
         )
 
     @validate_call
@@ -663,44 +706,6 @@ class Dataset:
             children.extend(self.list_dataset_children(child, recurse=recurse))
         return children
 
-    @staticmethod
-    def _download_dataset_element(
-        spath: str, dpath: str, table: Table
-    ) -> list[dict[str, Any]]:
-        """Return the download specification for the data object indicated by a path through the data model.
-
-        Args:
-            spath: Source path
-            dpath: Destination path
-            table: Table referenced to by the path
-
-        Returns:
-            The download specification that will retrieve that data from the catalog and place it into a BDBag.
-        """
-        exports = [
-            {
-                "processor": "csv",
-                "processor_params": {
-                    "query_path": f"/entity/{spath}?limit=none",
-                    "output_path": dpath,
-                },
-            }
-        ]
-
-        # If this table is an asset table, then we need to output the files associated with the asset.
-        asset_columns = {"Filename", "URL", "Length", "MD5", "Description"}
-        if asset_columns.issubset({c.name for c in table.columns}):
-            exports.append(
-                {
-                    "processor": "fetch",
-                    "processor_params": {
-                        "query_path": f"/attribute/{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5?limit=none",
-                        "output_path": f"asset/{table.name}",
-                    },
-                }
-            )
-        return exports
-
     def _vocabulary_specification(
         self, writer: Callable[[str, str, Table], list[dict[str, Any]]]
     ) -> list[dict[str, Any]]:
@@ -724,82 +729,38 @@ class Dataset:
             for o in writer(f"{table.schema.name}:{table.name}", table.name, table)
         ]
 
-    def _domain_table_paths(
-        self,
-        graph: dict[Table, list[dict[Table, Any]]],
-        spath: str = None,
-        dpath: str = None,
-        sprefix: str = "deriva-ml:Dataset/RID={Dataset_RID}",
-        dprefix: str = "Dataset",
-        nested: bool = False,
-    ) -> list[tuple[str, str, Table]]:
-        """Recursively walk over the domain schema graph and extend the current path.
-
-        Args:
-            graph: An undirected, acyclic graph of schema. Represented as a dictionary whose name is the table name.
-                and whose values are the child nodes of the table.
-            spath: Source path so far
-            dpath: Destination path so far
-            sprefix: Initial path to be included. Allows for nested datasets
-            dprefix: Initial path to be included. Allows for nested datasets
-            nested: If true, skip initial data segment.
-
-        Returns:
-            A list of all the paths through the graph. Each path is a list of tables.
+    def _table_paths(self) -> Iterator[tuple[list[str], list[str], list[Table]]]:
 
-        ""
-
-
-        paths = []
-        for node, children in graph.items():
-            if node.name == "Dataset":
-                paths.append(
-                    (
-                        f"{sprefix}/(RID)=({self._ml_schema}:Dataset_Version:Dataset)",
-                        f"{dprefix}/Dataset_Version",
-                        self._model.schemas[self._ml_schema].tables["Dataset_Version"],
-                    )
-                )
-            new_spath = sprefix
-            new_dpath = dprefix
-
-            if not nested:
-                paths.append((new_spath, new_dpath, node))
-            else:
-                new_spath = source_path + f"/{node.schema.name}:{node.name}"
-                new_dpath = dest_path + f"/{node.name}"
-                paths.append((new_spath, new_dpath, node))
-            for child in children:
-                paths.extend(
-                    self._domain_table_paths(child, new_spath, new_dpath, nested=nested)
-                )
-        return paths
+        dataset_dataset = self._model.schemas[self._ml_schema].tables["Dataset_Dataset"]
+        paths = self._model._schema_to_paths()
+        nested_paths = paths
 
-    def _table_paths(self, graph) -> list[tuple[str, str, Table]]:
-        sprefix = "deriva-ml:Dataset/RID={Dataset_RID}"
-        dprefix = "Dataset"
-        dataset_dataset_table = self._model.schemas[self._ml_schema].tables[
-            "Dataset_Dataset"
-        ]
-        table_paths = self._domain_table_paths(
-            graph=graph, sprefix=sprefix, dprefix=dprefix
-        )
-        nested_sprefix = sprefix
-        nested_dprefix = dprefix
         for i in range(self._dataset_nesting_depth()):
-
-
-
-
-
-
-
-
-
-
-
-
+            if i == 0:
+                paths.extend([[self.dataset_table, dataset_dataset]])
+            nested_paths = [
+                [self.dataset_table, dataset_dataset] + p for p in nested_paths
+            ]
+            paths.extend(nested_paths)
+
+        def source_path(path):
+            p = [f"{self._model.ml_schema}:Dataset/RID={{Dataset_RID}}"]
+            for table in path[1:]:
+                if table == dataset_dataset:
+                    p.append(f"(RID)=(deriva-ml:Dataset_Dataset:Dataset)")
+                elif table == self.dataset_table:
+                    p.append(f"(Nested_Dataset)=(deriva-ml:Dataset:RID)")
+                elif table.name == "Dataset_Version":
+                    p.append(f"(RID)=({self._model.ml_schema}:Dataset_Version:Dataset)")
+                else:
+                    p.append(f"{table.schema.name}:{table.name}")
+            return p
+
+        src_paths = ["/".join(source_path(p)) for p in paths]
+        dest_paths = ["/".join([t.name for t in p]) for p in paths]
+        target_tables = [p[-1] for p in paths]
+
+        return zip(src_paths, dest_paths, target_tables)
 
     def _dataset_nesting_depth(self):
         """Determine the maximum dataset nesting depth in the current catalog.
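The new `_table_paths` derives its paths from the model (`_schema_to_paths`, part of the deriva_model.py changes in this release) and then splices in one `Dataset` to `Dataset_Dataset` hop per level of nesting. A toy version of that expansion step, using plain strings instead of ERMRest Table objects (all names illustrative):

    # Two base paths out of Dataset, plus the Dataset -> Dataset_Dataset association hop.
    base_paths = [["Dataset", "Image"], ["Dataset", "Subject"]]
    link = ["Dataset", "Dataset_Dataset"]

    paths, nested = list(base_paths), list(base_paths)
    for depth in range(2):          # pretend the catalog nests datasets two levels deep
        if depth == 0:
            paths.append(link)
        nested = [link + p for p in nested]
        paths.extend(nested)

    for p in paths:
        print("/".join(p))          # e.g. Dataset/Dataset_Dataset/Dataset/Image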
@@ -811,6 +772,7 @@ class Dataset:
         def children_depth(
             dataset_rid: RID, nested_datasets: dict[RID, list[RID]]
         ) -> int:
+            """Return the number of nested datasets in the current catalog"""
             try:
                 children = nested_datasets[dataset_rid]
                 return (
@@ -836,50 +798,6 @@ class Dataset:
             else 0
         )
 
-    def _schema_graph(
-        self, node: Table, visited_nodes: Optional[set] = None
-    ) -> dict[Table, list[dict[Table, list]]]:
-        """Generate an undirected, acyclic graph of domain schema. We do this by traversing the schema foreign key
-        relationships. We stop when we hit the deriva-ml schema or when we reach a node that we have already seen.
-
-        Nested datasets need to be unfolded
-
-        Args:
-            node: Current (starting) node in the graph.
-            visited_nodes: param nested_dataset: Are we in a nested dataset_table, (i.e. have we seen the DataSet table)?
-
-        Returns:
-            Graph of the schema, starting from node.
-        """
-
-        visited_nodes = visited_nodes or set()
-        graph = {node: []}
-
-        def include_node(child: Table) -> bool:
-            """Indicate if the table should be included in the graph.
-
-            Include node in the graph if it's not a loopback from fk<-> referred_by, you have not already been to the
-            node.
-            """
-            return (
-                child != node
-                and child not in visited_nodes
-                and child.schema.name == self._model.domain_schema
-            )
-
-        # Get all the tables reachable from the end of the path avoiding loops from T1<->T2 via referenced_by
-        nodes = {fk.pk_table for fk in node.foreign_keys if include_node(fk.pk_table)}
-        nodes |= {fk.table for fk in node.referenced_by if include_node(fk.table)}
-        for t in nodes:
-            new_visited_nodes = visited_nodes.copy()
-            new_visited_nodes.add(t)
-            if self._model.is_vocabulary(t):
-                # If the end of the path is a vocabulary table, we are at a terminal node in the ERD, so stop
-                continue
-            # Get all the paths that extend the current path
-            graph[node].append(self._schema_graph(t, new_visited_nodes))
-        return graph
-
     def _dataset_specification(
         self, writer: Callable[[str, str, Table], list[dict[str, Any]]]
     ) -> list[dict[str, Any]]:
@@ -921,7 +839,7 @@ class Dataset:
             A dataset_table specification.
         """
         element_spec = []
-        for path in self._table_paths(
+        for path in self._table_paths():
             element_spec.extend(writer(*path))
         return self._vocabulary_specification(writer) + element_spec
 
@@ -953,7 +871,7 @@ class Dataset:
             if dataset.materialize
             else self._download_dataset_bag(minid)
         )
-        return DatabaseModel
+        return DatabaseModel(minid, bag_path).get_dataset()
 
     def _version_snapshot(self, dataset: DatasetSpec) -> str:
         version_record = [
@@ -1089,6 +1007,7 @@ class Dataset:
         """
 
         def update_status(status: Status, msg: str) -> None:
+            """Update the current status for this execution in the catalog"""
             self._model.catalog.getPathBuilder().schemas[
                 self._ml_schema
             ].Execution.update(
@@ -1196,6 +1115,44 @@ class Dataset:
             }
         ] + self._dataset_specification(writer)
 
+    @staticmethod
+    def _download_dataset_element(
+        spath: str, dpath: str, table: Table
+    ) -> list[dict[str, Any]]:
+        """Return the download specification for the data object indicated by a path through the data model.
+
+        Args:
+            spath: Source path
+            dpath: Destination path
+            table: Table referenced to by the path
+
+        Returns:
+            The download specification that will retrieve that data from the catalog and place it into a BDBag.
+        """
+        exports = [
+            {
+                "processor": "csv",
+                "processor_params": {
+                    "query_path": f"/entity/{spath}?limit=none",
+                    "output_path": dpath,
+                },
+            }
+        ]
+
+        # If this table is an asset table, then we need to output the files associated with the asset.
+        asset_columns = {"Filename", "URL", "Length", "MD5", "Description"}
+        if asset_columns.issubset({c.name for c in table.columns}):
+            exports.append(
+                {
+                    "processor": "fetch",
+                    "processor_params": {
+                        "query_path": f"/attribute/{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5?limit=none",
+                        "output_path": f"asset/{table.name}",
+                    },
+                }
+            )
+        return exports
+
     @staticmethod
     def _export_dataset_element(
         spath: str, dpath: str, table: Table
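`_download_dataset_element` itself is unchanged; it has simply moved later in the file. For a hypothetical asset table `my-schema:Image` reached directly from the dataset (the source path and table are made up for illustration), the specification it builds contains one csv processor plus one fetch processor for the asset files:

    exports = [
        {
            "processor": "csv",
            "processor_params": {
                "query_path": "/entity/deriva-ml:Dataset/RID={Dataset_RID}/my-schema:Image?limit=none",
                "output_path": "Dataset/Image",
            },
        },
        {   # emitted only because Image carries the Filename/URL/Length/MD5/Description asset columns
            "processor": "fetch",
            "processor_params": {
                "query_path": "/attribute/deriva-ml:Dataset/RID={Dataset_RID}/my-schema:Image"
                              "/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5?limit=none",
                "output_path": "asset/Image",
            },
        },
    ]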
deriva_ml/dataset_aux_classes.py
CHANGED