deriva-ml 1.17.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/.DS_Store +0 -0
- deriva_ml/__init__.py +79 -0
- deriva_ml/bump_version.py +142 -0
- deriva_ml/core/__init__.py +39 -0
- deriva_ml/core/base.py +1527 -0
- deriva_ml/core/config.py +69 -0
- deriva_ml/core/constants.py +36 -0
- deriva_ml/core/definitions.py +74 -0
- deriva_ml/core/enums.py +222 -0
- deriva_ml/core/ermrest.py +288 -0
- deriva_ml/core/exceptions.py +28 -0
- deriva_ml/core/filespec.py +116 -0
- deriva_ml/dataset/__init__.py +12 -0
- deriva_ml/dataset/aux_classes.py +225 -0
- deriva_ml/dataset/dataset.py +1519 -0
- deriva_ml/dataset/dataset_bag.py +450 -0
- deriva_ml/dataset/history.py +109 -0
- deriva_ml/dataset/upload.py +439 -0
- deriva_ml/demo_catalog.py +495 -0
- deriva_ml/execution/__init__.py +26 -0
- deriva_ml/execution/environment.py +290 -0
- deriva_ml/execution/execution.py +1180 -0
- deriva_ml/execution/execution_configuration.py +147 -0
- deriva_ml/execution/workflow.py +413 -0
- deriva_ml/feature.py +228 -0
- deriva_ml/install_kernel.py +71 -0
- deriva_ml/model/__init__.py +0 -0
- deriva_ml/model/catalog.py +485 -0
- deriva_ml/model/database.py +719 -0
- deriva_ml/protocols/dataset.py +19 -0
- deriva_ml/run_notebook.py +228 -0
- deriva_ml/schema/__init__.py +3 -0
- deriva_ml/schema/annotations.py +473 -0
- deriva_ml/schema/check_schema.py +104 -0
- deriva_ml/schema/create_schema.py +393 -0
- deriva_ml/schema/deriva-ml-reference.json +8525 -0
- deriva_ml/schema/policy.json +81 -0
- deriva_ml/schema/table_comments_utils.py +57 -0
- deriva_ml/test.py +94 -0
- deriva_ml-1.17.10.dist-info/METADATA +38 -0
- deriva_ml-1.17.10.dist-info/RECORD +45 -0
- deriva_ml-1.17.10.dist-info/WHEEL +5 -0
- deriva_ml-1.17.10.dist-info/entry_points.txt +9 -0
- deriva_ml-1.17.10.dist-info/licenses/LICENSE +201 -0
- deriva_ml-1.17.10.dist-info/top_level.txt +1 -0
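The bulk of the release is `deriva_ml/dataset/dataset.py`, reproduced below. As a quick orientation, here is a minimal usage sketch assembled from that module's docstrings; the import paths mirror the module's own imports, and the host and catalog names are placeholders:

```python
# Sketch based on the docstrings in deriva_ml/dataset/dataset.py; host/catalog values are placeholders.
from deriva_ml.core.base import DerivaML
from deriva_ml.dataset.aux_classes import VersionPart

ml = DerivaML("deriva.example.org", "my_catalog")

# Create a dataset, attach member records by RID, then bump the minor version.
dataset_rid = ml.create_dataset(dataset_types="experiment", description="Experimental data")
ml.add_dataset_members(dataset_rid=dataset_rid, members=["1-abc123", "1-def456"])
new_version = ml.increment_dataset_version(
    dataset_rid=dataset_rid,
    component=VersionPart.minor,
    description="Added new samples",
)
print(new_version)  # semantic version, e.g. 0.3.0
```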
@@ -0,0 +1,1519 @@
"""Dataset management for DerivaML.

This module provides functionality for managing datasets in DerivaML. A dataset represents a collection
of related data that can be versioned, downloaded, and tracked. The module includes:

- Dataset class: Core class for dataset operations
- Version management: Track and update dataset versions
- History tracking: Record dataset changes over time
- Download capabilities: Export datasets as BDBags
- Relationship management: Handle dataset dependencies and hierarchies

The Dataset class serves as a base class in DerivaML, making its methods accessible through
DerivaML class instances.

Typical usage example:
    >>> ml = DerivaML('deriva.example.org', 'my_catalog')
    >>> dataset_rid = ml.create_dataset('experiment', 'Experimental data')
    >>> ml.add_dataset_members(dataset_rid=dataset_rid, members=['1-abc123', '1-def456'])
    >>> ml.increment_dataset_version(dataset_rid=dataset_rid, component=VersionPart.minor,
    ...                              description='Added new samples')
"""

from __future__ import annotations

import json
import logging
from collections import defaultdict

# Standard library imports
from graphlib import TopologicalSorter
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator
from urllib.parse import urlparse

import deriva.core.utils.hash_utils as hash_utils
import requests

# Third-party imports
from bdbag import bdbag_api as bdb
from bdbag.fetch.fetcher import fetch_single_file

# Deriva imports
from deriva.core.ermrest_model import Table
from deriva.core.utils.core_utils import format_exception
from deriva.core.utils.core_utils import tag as deriva_tags
from deriva.transfer.download.deriva_download import (
    DerivaDownloadAuthenticationError,
    DerivaDownloadAuthorizationError,
    DerivaDownloadConfigurationError,
    DerivaDownloadError,
    DerivaDownloadTimeoutError,
)
from deriva.transfer.download.deriva_export import DerivaExport
from pydantic import ConfigDict, validate_call

# Local imports
try:
    from icecream import ic

    ic.configureOutput(includeContext=True)
except ImportError:  # Graceful fallback if IceCream isn't installed.
    ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a)  # noqa

from deriva_ml.core.constants import RID
from deriva_ml.core.definitions import (
    DRY_RUN_RID,
    ML_SCHEMA,
    MLVocab,
    Status,
)
from deriva_ml.core.exceptions import DerivaMLException, DerivaMLTableTypeError
from deriva_ml.dataset.aux_classes import (
    DatasetHistory,
    DatasetMinid,
    DatasetSpec,
    DatasetVersion,
    VersionPart,
)
from deriva_ml.dataset.dataset_bag import DatasetBag
from deriva_ml.model.catalog import DerivaModel
from deriva_ml.model.database import DatabaseModel

from .history import iso_to_snap

# Stop pycharm from complaining about undefined reference in docstring....
ml: DerivaML

if TYPE_CHECKING:
    from deriva_ml.core.base import DerivaML

class Dataset:
    """Manages dataset operations in a Deriva catalog.

    The Dataset class provides functionality for creating, modifying, and tracking datasets
    in a Deriva catalog. It handles versioning, relationships between datasets, and data export.

    Attributes:
        dataset_table (Table): ERMrest table storing dataset information.
        _model (DerivaModel): Catalog model instance.
        _ml_schema (str): Schema name for ML-specific tables.
        _cache_dir (Path): Directory for caching downloaded datasets.
        _working_dir (Path): Directory for working data.
        _use_minid (bool): Whether to use MINID service for dataset identification.

    Note:
        This class is typically used as a base class, with its methods accessed through
        DerivaML class instances rather than directly.
    """

    _Logger = logging.getLogger("deriva_ml")

    def __init__(
        self,
        model: DerivaModel,
        cache_dir: Path,
        working_dir: Path,
        use_minid: bool = True,
    ):
        """Initializes a Dataset instance.

        Args:
            model: DerivaModel instance representing the catalog.
            cache_dir: Directory path for caching downloaded datasets.
            working_dir: Directory path for working data.
            use_minid: Whether to use MINID service for dataset identification.
        """
        self._model = model
        self._ml_schema = ML_SCHEMA
        self._cache_dir = cache_dir
        self._working_dir = working_dir
        self._logger = logging.getLogger("deriva_ml")
        self._use_minid = use_minid

    @property
    def _dataset_table(self):
        return self._model.schemas[self._ml_schema].tables["Dataset"]

    def _is_dataset_rid(self, dataset_rid: RID, deleted: bool = False) -> bool:
        try:
            rid_info = self._model.catalog.resolve_rid(dataset_rid, self._model.model)
        except KeyError as _e:
            raise DerivaMLException(f"Invalid RID {dataset_rid}")
        if rid_info.table != self._dataset_table:
            return False
        elif deleted:
            # Got a dataset rid. Now check to see if it's deleted or not.
            return True
        else:
            return not list(rid_info.datapath.entities().fetch())[0]["Deleted"]

    def _insert_dataset_versions(
        self,
        dataset_list: list[DatasetSpec],
        description: str | None = "",
        execution_rid: RID | None = None,
    ) -> None:
        schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
        # determine snapshot after changes were made

        # Construct version records for insert
        version_records = schema_path.tables["Dataset_Version"].insert(
            [
                {
                    "Dataset": dataset.rid,
                    "Version": str(dataset.version),
                    "Description": description,
                    "Execution": execution_rid,
                }
                for dataset in dataset_list
            ]
        )
        version_records = list(version_records)
        snap = self._model.catalog.get("/").json()["snaptime"]
        schema_path.tables["Dataset_Version"].update(
            [{"RID": v["RID"], "Dataset": v["Dataset"], "Snapshot": snap} for v in version_records]
        )

        # And update the dataset records.
        schema_path.tables["Dataset"].update([{"Version": v["RID"], "RID": v["Dataset"]} for v in version_records])

    def _bootstrap_versions(self):
        datasets = [ds["RID"] for ds in self.find_datasets()]
        ds_version = [
            {
                "Dataset": d,
                "Version": "0.1.0",
                "Description": "Dataset at the time of conversion to versioned datasets",
            }
            for d in datasets
        ]
        schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
        version_path = schema_path.tables["Dataset_Version"]
        dataset_path = schema_path.tables["Dataset"]
        history = list(version_path.insert(ds_version))
        dataset_versions = [{"RID": h["Dataset"], "Version": h["Version"]} for h in history]
        dataset_path.update(dataset_versions)

    def _synchronize_dataset_versions(self):
        datasets = [ds["RID"] for ds in self.find_datasets()]
        for ds in datasets:
            self.dataset_version(ds)
        schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
        dataset_version_path = schema_path.tables["Dataset_Version"]
        # Get the maximum version number for each dataset.
        versions = {}
        for v in dataset_version_path.entities().fetch():
            if v["Version"] > versions.get("Dataset", DatasetVersion(0, 0, 0)):
                versions[v["Dataset"]] = v
        dataset_path = schema_path.tables["Dataset"]

        dataset_path.update([{"RID": dataset, "Version": version["RID"]} for dataset, version in versions.items()])

    def _set_version_snapshot(self):
        """Update the Snapshot column of the Dataset_Version table to the correct time."""
        dataset_version_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema].tables["Dataset_Version"]
        versions = dataset_version_path.entities().fetch()
        dataset_version_path.update(
            [{"RID": h["RID"], "Snapshot": iso_to_snap(h["RCT"])} for h in versions if not h["Snapshot"]]
        )

    def dataset_history(self, dataset_rid: RID) -> list[DatasetHistory]:
        """Retrieves the version history of a dataset.

        Returns a chronological list of dataset versions, including their version numbers,
        creation times, and associated metadata.

        Args:
            dataset_rid: Resource Identifier of the dataset.

        Returns:
            list[DatasetHistory]: List of history entries, each containing:
                - dataset_version: Version number (major.minor.patch)
                - minid: Minimal Viable Identifier
                - snapshot: Catalog snapshot time
                - dataset_rid: Dataset Resource Identifier
                - version_rid: Version Resource Identifier
                - description: Version description
                - execution_rid: Associated execution RID

        Raises:
            DerivaMLException: If dataset_rid is not a valid dataset RID.

        Example:
            >>> history = ml.dataset_history("1-abc123")
            >>> for entry in history:
            ...     print(f"Version {entry.dataset_version}: {entry.description}")
        """

        if not self._is_dataset_rid(dataset_rid):
            raise DerivaMLException(f"RID is not for a data set: {dataset_rid}")
        version_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema].tables["Dataset_Version"]
        return [
            DatasetHistory(
                dataset_version=DatasetVersion.parse(v["Version"]),
                minid=v["Minid"],
                snapshot=v["Snapshot"],
                dataset_rid=dataset_rid,
                version_rid=v["RID"],
                description=v["Description"],
                execution_rid=v["Execution"],
            )
            for v in version_path.filter(version_path.Dataset == dataset_rid).entities().fetch()
        ]

    @validate_call
    def dataset_version(self, dataset_rid: RID) -> DatasetVersion:
        """Retrieve the current version of the specified dataset_table.

        Given a rid, return the most recent version of the dataset. It is important to remember that this version
        captures the state of the catalog at the time the version was created, not the current state of the catalog.
        This means that it's possible that the values associated with an object in the catalog may be different
        from the values of that object in the dataset.

        Args:
            dataset_rid: The RID of the dataset to retrieve the version for.

        Returns:
            The semantic version of the dataset_table.
        """
        history = self.dataset_history(dataset_rid)
        if not history:
            return DatasetVersion(0, 1, 0)
        else:
            # Ensure we return a DatasetVersion, not a string
            versions = [h.dataset_version for h in history]
            return max(versions) if versions else DatasetVersion(0, 1, 0)

    def _build_dataset_graph(self, dataset_rid: RID) -> Iterable[RID]:
        ts: TopologicalSorter = TopologicalSorter()
        self._build_dataset_graph_1(dataset_rid, ts, set())
        return ts.static_order()

    def _build_dataset_graph_1(self, dataset_rid: RID, ts: TopologicalSorter, visited) -> None:
        """Use topological sort to return bottom up list of nested datasets"""
        ts.add(dataset_rid)
        if dataset_rid not in visited:
            visited.add(dataset_rid)
            children = self.list_dataset_children(dataset_rid=dataset_rid)
            parents = self.list_dataset_parents(dataset_rid=dataset_rid)
            for parent in parents:
                # Convert string to RID type
                self._build_dataset_graph_1(RID(parent), ts, visited)
            for child in children:
                self._build_dataset_graph_1(child, ts, visited)

    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
    def increment_dataset_version(
        self,
        dataset_rid: RID,
        component: VersionPart,
        description: str | None = "",
        execution_rid: RID | None = None,
    ) -> DatasetVersion:
        """Increments a dataset's version number.

        Creates a new version of the dataset by incrementing the specified version component
        (major, minor, or patch). The new version is recorded with an optional description
        and execution reference.

        Args:
            dataset_rid: Resource Identifier of the dataset to version.
            component: Which version component to increment ('major', 'minor', or 'patch').
            description: Optional description of the changes in this version.
            execution_rid: Optional execution RID to associate with this version.

        Returns:
            DatasetVersion: The new version number.

        Raises:
            DerivaMLException: If dataset_rid is invalid or version increment fails.

        Example:
            >>> new_version = ml.increment_dataset_version(
            ...     dataset_rid="1-abc123",
            ...     component="minor",
            ...     description="Added new samples"
            ... )
            >>> print(f"New version: {new_version}")  # e.g., "1.2.0"
        """

        # Find all the datasets that are reachable from this dataset and determine their new version numbers.
        related_datasets = list(self._build_dataset_graph(dataset_rid=dataset_rid))
        version_update_list = [
            DatasetSpec(
                rid=ds_rid,
                version=self.dataset_version(ds_rid).increment_version(component),
            )
            for ds_rid in related_datasets
        ]
        self._insert_dataset_versions(version_update_list, description=description, execution_rid=execution_rid)
        return next((d.version for d in version_update_list if d.rid == dataset_rid))

    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
    def create_dataset(
        self,
        dataset_types: str | list[str] | None = None,
        description: str = "",
        execution_rid: RID | None = None,
        version: DatasetVersion | None = None,
    ) -> RID:
        """Creates a new dataset in the catalog.

        Creates a dataset with specified types and description. The dataset can be associated
        with an execution and initialized with a specific version.

        Args:
            dataset_types: One or more dataset type terms from Dataset_Type vocabulary.
            description: Description of the dataset's purpose and contents.
            execution_rid: Optional execution RID to associate with dataset creation.
            version: Optional initial version number. Defaults to 0.1.0.

        Returns:
            RID: Resource Identifier of the newly created dataset.

        Raises:
            DerivaMLException: If dataset_types are invalid or creation fails.

        Example:
            >>> rid = ml.create_dataset(
            ...     dataset_types=["experiment", "raw_data"],
            ...     description="RNA sequencing experiment data",
            ...     version=DatasetVersion(1, 0, 0)
            ... )
        """

        version = version or DatasetVersion(0, 1, 0)
        dataset_types = dataset_types or []

        type_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema].tables[MLVocab.dataset_type.value]
        defined_types = list(type_path.entities().fetch())

        def check_dataset_type(dtype: str) -> bool:
            for term in defined_types:
                if dtype == term["Name"] or (term["Synonyms"] and dtype in term["Synonyms"]):
                    return True
            return False

        # Create the entry for the new dataset_table and get its RID.
        ds_types = [dataset_types] if isinstance(dataset_types, str) else dataset_types
        pb = self._model.catalog.getPathBuilder()
        for ds_type in ds_types:
            if not check_dataset_type(ds_type):
                raise DerivaMLException("Dataset type must be a vocabulary term.")
        dataset_table_path = pb.schemas[self._dataset_table.schema.name].tables[self._dataset_table.name]
        dataset_rid = dataset_table_path.insert(
            [
                {
                    "Description": description,
                    "Deleted": False,
                }
            ]
        )[0]["RID"]

        # Get the name of the association table between dataset_table and dataset_type.
        associations = list(self._model.schemas[self._ml_schema].tables[MLVocab.dataset_type].find_associations())
        atable = associations[0].name if associations else None
        pb.schemas[self._ml_schema].tables[atable].insert(
            [{MLVocab.dataset_type: ds_type, "Dataset": dataset_rid} for ds_type in ds_types]
        )
        if execution_rid is not None:
            pb.schemas[self._ml_schema].Dataset_Execution.insert([{"Dataset": dataset_rid, "Execution": execution_rid}])
        self._insert_dataset_versions(
            [DatasetSpec(rid=dataset_rid, version=version)],
            execution_rid=execution_rid,
            description="Initial dataset creation.",
        )
        return dataset_rid

    @validate_call
    def delete_dataset(self, dataset_rid: RID, recurse: bool = False) -> None:
        """Delete a dataset_table from the catalog.

        Args:
            dataset_rid: RID of the dataset_table to delete.
            recurse: If True, delete the dataset_table along with any nested datasets. (Default value = False)
        """
        # Get association table entries for this dataset_table
        # Delete association table entries
        if not self._is_dataset_rid(dataset_rid):
            raise DerivaMLException("Dataset_rid is not a dataset.")

        if parents := self.list_dataset_parents(dataset_rid):
            raise DerivaMLException(f'Dataset_rid "{dataset_rid}" is in a nested dataset: {parents}.')

        pb = self._model.catalog.getPathBuilder()
        dataset_path = pb.schemas[self._dataset_table.schema.name].tables[self._dataset_table.name]

        rid_list = [dataset_rid] + (self.list_dataset_children(dataset_rid=dataset_rid) if recurse else [])
        dataset_path.update([{"RID": r, "Deleted": True} for r in rid_list])

    def find_datasets(self, deleted: bool = False) -> Iterable[dict[str, Any]]:
        """Returns a list of currently available datasets.

        Arguments:
            deleted: If True, include datasets that have been deleted.

        Returns:
            list of currently available datasets.
        """
        # Get datapath to all the tables we will need: Dataset, DatasetType and the association table.
        pb = self._model.catalog.getPathBuilder()
        dataset_path = pb.schemas[self._dataset_table.schema.name].tables[self._dataset_table.name]
        associations = list(self._model.schemas[self._ml_schema].tables[MLVocab.dataset_type].find_associations())
        atable = associations[0].name if associations else None
        ml_path = pb.schemas[self._ml_schema]
        atable_path = ml_path.tables[atable]

        if deleted:
            filtered_path = dataset_path
        else:
            filtered_path = dataset_path.filter(
                (dataset_path.Deleted == False) | (dataset_path.Deleted == None)  # noqa: E711, E712
            )

        # Get a list of all the dataset_type values associated with this dataset_table.
        datasets = []
        for dataset in filtered_path.entities().fetch():
            ds_types = (
                atable_path.filter(atable_path.Dataset == dataset["RID"]).attributes(atable_path.Dataset_Type).fetch()
            )
            datasets.append(dataset | {MLVocab.dataset_type: [ds[MLVocab.dataset_type] for ds in ds_types]})
        return datasets

    def list_dataset_element_types(self) -> Iterable[Table]:
        """List the types of entities that can be added to a dataset_table.

        Returns:
            An iterable of Table objects that can be included as an element of a dataset_table.
        """

        def domain_table(table: Table) -> bool:
            return table.schema.name == self._model.domain_schema or table.name == self._dataset_table.name

        return [t for a in self._dataset_table.find_associations() if domain_table(t := a.other_fkeys.pop().pk_table)]

    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
    def add_dataset_element_type(self, element: str | Table) -> Table:
        """A dataset_table is a heterogeneous collection of objects, each of which comes from a different table. This
        routine makes it possible to add objects from the specified table to a dataset_table.

        Args:
            element: Name of the table or table object that is to be added to the dataset_table.

        Returns:
            The table object that was added to the dataset_table.
        """
        # Add table to map
        element_table = self._model.name_to_table(element)
        atable_def = Table.define_association([self._dataset_table, element_table])
        try:
            table = self._model.schemas[self._model.domain_schema].create_table(atable_def)
        except ValueError as e:
            if "already exists" in str(e):
                table = self._model.name_to_table(atable_def["table_name"])
            else:
                raise e

        # self.model = self.catalog.getCatalogModel()
        self._dataset_table.annotations.update(self._generate_dataset_download_annotations())
        self._model.model.apply()
        return table

    # @validate_call
    def list_dataset_members(
        self, dataset_rid: RID, recurse: bool = False, limit: int | None = None
    ) -> dict[str, list[dict[str, Any]]]:
        """Lists members of a dataset.

        Returns a dictionary mapping member types to lists of member records. Can optionally
        recurse through nested datasets and limit the number of results.

        Args:
            dataset_rid: Resource Identifier of the dataset.
            recurse: Whether to include members of nested datasets. Defaults to False.
            limit: Maximum number of members to return per type. None for no limit.

        Returns:
            dict[str, list[dict[str, Any]]]: Dictionary mapping member types to lists of members.
                Each member is a dictionary containing the record's attributes.

        Raises:
            DerivaMLException: If dataset_rid is invalid.

        Example:
            >>> members = ml.list_dataset_members("1-abc123", recurse=True)
            >>> for type_name, records in members.items():
            ...     print(f"{type_name}: {len(records)} records")
        """

        if not self._is_dataset_rid(dataset_rid):
            raise DerivaMLException(f"RID is not for a dataset_table: {dataset_rid}")

        # Look at each of the element types that might be in the dataset_table and get the list of rid for them from
        # the appropriate association table.
        members = defaultdict(list)
        pb = self._model.catalog.getPathBuilder()
        for assoc_table in self._dataset_table.find_associations():
            other_fkey = assoc_table.other_fkeys.pop()
            target_table = other_fkey.pk_table
            member_table = assoc_table.table

            # Look at domain tables and nested datasets.
            if target_table.schema.name != self._model.domain_schema and not (
                target_table == self._dataset_table or target_table.name == "File"
            ):
                continue
            member_column = (
                "Nested_Dataset" if target_table == self._dataset_table else other_fkey.foreign_key_columns[0].name
            )

            target_path = pb.schemas[target_table.schema.name].tables[target_table.name]
            member_path = pb.schemas[member_table.schema.name].tables[member_table.name]

            path = member_path.filter(member_path.Dataset == dataset_rid).link(
                target_path,
                on=(member_path.columns[member_column] == target_path.columns["RID"]),
            )
            target_entities = list(path.entities().fetch(limit=limit) if limit else path.entities().fetch())
            members[target_table.name].extend(target_entities)
            if recurse and target_table == self._dataset_table:
                # Get the members for all the nested datasets and add to the member list.
                nested_datasets = [d["RID"] for d in target_entities]
                for ds in nested_datasets:
                    for k, v in self.list_dataset_members(ds, recurse=recurse).items():
                        members[k].extend(v)
        return dict(members)

    @validate_call
    def add_dataset_members(
        self,
        dataset_rid: RID,
        members: list[RID] | dict[str, list[RID]],
        validate: bool = True,
        description: str | None = "",
        execution_rid: RID | None = None,
    ) -> None:
        """Adds members to a dataset.

        Associates one or more records with a dataset. Can optionally validate member types
        and create a new dataset version to track the changes.

        Args:
            dataset_rid: Resource Identifier of the dataset.
            members: List of RIDs to add as dataset members. Can be organized into a dictionary that indicates the
                table that the member rids belong to.
            validate: Whether to validate member types. Defaults to True.
            description: Optional description of the member additions.
            execution_rid: Optional execution RID to associate with changes.

        Raises:
            DerivaMLException: If:
                - dataset_rid is invalid
                - members are invalid or of wrong type
                - adding members would create a cycle
                - validation fails

        Example:
            >>> ml.add_dataset_members(
            ...     dataset_rid="1-abc123",
            ...     members=["1-def456", "1-ghi789"],
            ...     description="Added sample data"
            ... )
        """
        description = description or "Updated dataset via add_dataset_members"

        def check_dataset_cycle(member_rid, path=None):
            """

            Args:
                member_rid:
                path: (Default value = None)

            Returns:

            """
            path = path or {dataset_rid}
            return member_rid in path

        if validate:
            existing_rids = set(m["RID"] for ms in self.list_dataset_members(dataset_rid).values() for m in ms)
            if overlap := set(existing_rids).intersection(members):
                raise DerivaMLException(f"Attempting to add existing member to dataset_table {dataset_rid}: {overlap}")

        # Now go through every rid to be added to the data set and sort them based on what association table entries
        # need to be made.
        dataset_elements = {}
        association_map = {
            a.other_fkeys.pop().pk_table.name: a.table.name for a in self._dataset_table.find_associations()
        }

        # Get a list of all the object types that can be linked to a dataset_table.
        if type(members) is list:
            members = set(members)
            for m in members:
                try:
                    rid_info = self._model.catalog.resolve_rid(m)
                except KeyError:
                    raise DerivaMLException(f"Invalid RID: {m}")
                if rid_info.table.name not in association_map:
                    raise DerivaMLException(f"RID table: {rid_info.table.name} not part of dataset_table")
                if rid_info.table == self._dataset_table and check_dataset_cycle(rid_info.rid):
                    raise DerivaMLException("Creating cycle of datasets is not allowed")
                dataset_elements.setdefault(rid_info.table.name, []).append(rid_info.rid)
        else:
            dataset_elements = {t: set(ms) for t, ms in members.items()}
        # Now make the entries into the association tables.
        pb = self._model.catalog.getPathBuilder()
        for table, elements in dataset_elements.items():
            schema_path = pb.schemas[
                self._ml_schema if (table == "Dataset" or table == "File") else self._model.domain_schema
            ]
            fk_column = "Nested_Dataset" if table == "Dataset" else table
            if len(elements):
                # Find out the name of the column in the association table.
                schema_path.tables[association_map[table]].insert(
                    [{"Dataset": dataset_rid, fk_column: e} for e in elements]
                )
        self.increment_dataset_version(
            dataset_rid,
            VersionPart.minor,
            description=description,
            execution_rid=execution_rid,
        )

    @validate_call
    def delete_dataset_members(
        self,
        dataset_rid: RID,
        members: list[RID],
        description: str = "",
        execution_rid: RID | None = None,
    ) -> None:
        """Remove elements from an existing dataset_table.

        Delete elements from an existing dataset. In addition to deleting members, the minor version number of the
        dataset is incremented and the description, if provided, is applied to that new version.

        Args:
            dataset_rid: RID of the dataset_table to remove members from.
            members: List of member RIDs to remove from the dataset_table.
            description: Markdown description of the updated dataset.
            execution_rid: Optional RID of execution associated with this operation.
        """

        members = set(members)
        description = description or "Deletes dataset members"

        # Now go through every rid to be removed from the data set and sort them based on what association table
        # entries need to be deleted.
        dataset_elements = {}
        association_map = {
            a.other_fkeys.pop().pk_table.name: a.table.name for a in self._dataset_table.find_associations()
        }
        # Get a list of all the object types that can be linked to a dataset_table.
        for m in members:
            try:
                rid_info = self._model.catalog.resolve_rid(m)
            except KeyError:
                raise DerivaMLException(f"Invalid RID: {m}")
            if rid_info.table.name not in association_map:
                raise DerivaMLException(f"RID table: {rid_info.table.name} not part of dataset_table")
            dataset_elements.setdefault(rid_info.table.name, []).append(rid_info.rid)
        # Now delete the entries from the association tables.
        pb = self._model.catalog.getPathBuilder()
        for table, elements in dataset_elements.items():
            schema_path = pb.schemas[self._ml_schema if table == "Dataset" else self._model.domain_schema]
            fk_column = "Nested_Dataset" if table == "Dataset" else table

            if len(elements):
                atable_path = schema_path.tables[association_map[table]]
                # Find out the name of the column in the association table.
                for e in elements:
                    entity = atable_path.filter(
                        (atable_path.Dataset == dataset_rid) & (atable_path.columns[fk_column] == e),
                    )
                    entity.delete()
        self.increment_dataset_version(
            dataset_rid,
            VersionPart.minor,
            description=description,
            execution_rid=execution_rid,
        )

    @validate_call
    def list_dataset_parents(self, dataset_rid: RID) -> list[str]:
        """Given a dataset_table RID, return a list of RIDs of the parent datasets if this is included in a
        nested dataset.

        Args:
            dataset_rid: RID of the dataset_table whose parents are to be found.

        Returns:
            RIDs of the parent dataset_tables.
        """
        if not self._is_dataset_rid(dataset_rid):
            raise DerivaMLException(f"RID: {dataset_rid} does not belong to dataset_table {self._dataset_table.name}")
        # Get association table for nested datasets
        pb = self._model.catalog.getPathBuilder()
        atable_path = pb.schemas[self._ml_schema].Dataset_Dataset
        return [p["Dataset"] for p in atable_path.filter(atable_path.Nested_Dataset == dataset_rid).entities().fetch()]

    @validate_call
    def list_dataset_children(self, dataset_rid: RID, recurse: bool = False) -> list[RID]:
        """Given a dataset_table RID, return a list of RIDs for any nested datasets.

        Args:
            dataset_rid: A dataset_table RID.
            recurse: If True, recursively include the children of any nested datasets.

        Returns:
            list of nested dataset RIDs.

        """
        dataset_dataset_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema].tables["Dataset_Dataset"]
        nested_datasets = list(dataset_dataset_path.entities().fetch())

        def find_children(rid: RID):
            children = [child["Nested_Dataset"] for child in nested_datasets if child["Dataset"] == rid]
            if recurse:
                for child in children.copy():
                    children.extend(find_children(child))
            return children

        return find_children(dataset_rid)

    def _export_vocabulary(self, writer: Callable[[str, str, Table], list[dict[str, Any]]]) -> list[dict[str, Any]]:
        """

        Args:
            writer: Callable[[list[Table]]: list[dict[str: Any]]]:

        Returns:

        """
        vocabs = [
            table
            for s in self._model.schemas.values()
            for table in s.tables.values()
            if self._model.is_vocabulary(table)
        ]
        return [o for table in vocabs for o in writer(f"{table.schema.name}:{table.name}", table.name, table)]

    def _table_paths(
        self,
        dataset: DatasetSpec | None = None,
        snapshot_catalog: DerivaML | None = None,
    ) -> Iterator[tuple[str, str, Table]]:
        paths = self._collect_paths(dataset and dataset.rid, snapshot_catalog)

        def source_path(path: tuple[Table, ...]) -> list[str]:
            """Convert a tuple representing a path into a source path component with FK linkage"""
            path = list(path)
            p = [f"{self._model.ml_schema}:Dataset/RID={{RID}}"]
            for table in path[1:]:
                if table.name == "Dataset_Dataset":
                    p.append("(RID)=(deriva-ml:Dataset_Dataset:Dataset)")
                elif table.name == "Dataset":
                    p.append("(Nested_Dataset)=(deriva-ml:Dataset:RID)")
                elif table.name == "Dataset_Version":
                    p.append(f"(RID)=({self._model.ml_schema}:Dataset_Version:Dataset)")
                else:
                    p.append(f"{table.schema.name}:{table.name}")
            return p

        src_paths = ["/".join(source_path(p)) for p in paths]
        dest_paths = ["/".join([t.name for t in p]) for p in paths]
        target_tables = [p[-1] for p in paths]
        return zip(src_paths, dest_paths, target_tables)

    def _collect_paths(
        self,
        dataset_rid: RID | None = None,
        snapshot: Dataset | None = None,
        dataset_nesting_depth: int | None = None,
    ) -> set[tuple[Table, ...]]:
        snapshot_catalog = snapshot if snapshot else self

        dataset_table = snapshot_catalog._model.schemas[self._ml_schema].tables["Dataset"]
        dataset_dataset = snapshot_catalog._model.schemas[self._ml_schema].tables["Dataset_Dataset"]

        # Figure out what types of elements the dataset contains.
        dataset_associations = [
            a
            for a in self._dataset_table.find_associations()
            if a.table.schema.name != self._ml_schema or a.table.name == "Dataset_Dataset"
        ]
        if dataset_rid:
            # Get a list of the members of the dataset so we can figure out which tables to query.
            dataset_elements = [
                snapshot_catalog._model.name_to_table(e)
                for e, m in snapshot_catalog.list_dataset_members(
                    dataset_rid=dataset_rid,  # limit=1 Limit seems to make things run slow.
                ).items()
                if m
            ]
            included_associations = [
                a.table for a in dataset_table.find_associations() if a.other_fkeys.pop().pk_table in dataset_elements
            ]
        else:
            included_associations = dataset_associations

        # Get the paths through the schema and filter out all the dataset paths not used by this dataset.
        paths = {
            tuple(p)
            for p in snapshot_catalog._model._schema_to_paths()
            if (len(p) == 1)
            or (p[1] not in dataset_associations)  # Tables in the domain schema
            or (p[1] in included_associations)  # Tables that include members of the dataset
        }
        # Now get paths for nested datasets
        nested_paths = set()
        if dataset_rid:
            for c in snapshot_catalog.list_dataset_children(dataset_rid=dataset_rid):
                nested_paths |= self._collect_paths(c, snapshot=snapshot_catalog)
        else:
            # Initialize nesting depth if not already provided.
            dataset_nesting_depth = (
                self._dataset_nesting_depth() if dataset_nesting_depth is None else dataset_nesting_depth
            )
            if dataset_nesting_depth:
                nested_paths = self._collect_paths(dataset_nesting_depth=dataset_nesting_depth - 1)
        if nested_paths:
            paths |= {
                tuple([dataset_table]),
                (dataset_table, dataset_dataset),
            }
            paths |= {(self._dataset_table, dataset_dataset) + p for p in nested_paths}
        return paths

    def _dataset_nesting_depth(self, dataset_rid: RID | None = None) -> int:
        """Determine the maximum dataset nesting depth in the current catalog.

        Returns:

        """

        def children_depth(dataset_rid: RID, nested_datasets: dict[str, list[str]]) -> int:
            """Return the number of nested datasets for the dataset_rid if provided, otherwise in the current catalog"""
            try:
                children = nested_datasets[dataset_rid]
                return max(map(lambda x: children_depth(x, nested_datasets), children)) + 1 if children else 1
            except KeyError:
                return 0

        # Build up the dataset_table nesting graph...
        pb = self._model.catalog.getPathBuilder().schemas[self._ml_schema].tables["Dataset_Dataset"]
        dataset_children = (
            [
                {
                    "Dataset": dataset_rid,
                    "Nested_Dataset": c,
                }  # Make uniform with return from datapath
                for c in self.list_dataset_children(dataset_rid=dataset_rid)
            ]
            if dataset_rid
            else pb.entities().fetch()
        )
        nested_dataset = defaultdict(list)
        for ds in dataset_children:
            nested_dataset[ds["Dataset"]].append(ds["Nested_Dataset"])
        return max(map(lambda d: children_depth(d, dict(nested_dataset)), nested_dataset)) if nested_dataset else 0

    def _dataset_specification(
        self,
        writer: Callable[[str, str, Table], list[dict[str, Any]]],
        dataset: DatasetSpec | None = None,
        snapshot_catalog: DerivaML | None = None,
    ) -> list[dict[str, Any]]:
        """Output a download/export specification for a dataset_table. Each element of the dataset_table
        will be placed in its own directory.
        The top level data directory of the resulting BDBag will have one subdirectory for each element type.
        The subdirectory will contain the CSV indicating which elements of that type are present in the
        dataset_table, and then there will be a subdirectory for each object that is reachable from the
        dataset_table members.

        To simplify reconstructing the relationship between tables, the CSV for each element is included.
        The top level data directory will also contain a subdirectory for any controlled vocabularies used in
        the dataset_table. All assets will be placed into a directory named asset in a subdirectory with the
        asset table name.

        For example, consider a dataset_table that consists of two element types, T1 and T2. T1 has foreign
        key relationships to objects in tables T3 and T4. There are also two controlled vocabularies, CV1 and
        CV2. T2 is an asset table which has two assets in it. The layout of the resulting bdbag would be:
        data
            CV1/
                cv1.csv
            CV2/
                cv2.csv
            Dataset/
                T1/
                    t1.csv
                    T3/
                        t3.csv
                    T4/
                        t4.csv
                T2/
                    t2.csv
            asset/
                T2
                    f1
                    f2

        Args:
            writer: Callable[[list[Table]]: list[dict[str: Any]]]:

        Returns:
            A dataset_table specification.
        """
        element_spec = self._export_vocabulary(writer)
        for path in self._table_paths(dataset=dataset, snapshot_catalog=snapshot_catalog):
            element_spec.extend(writer(*path))
        return element_spec

    def _download_dataset_bag(
        self,
        dataset: DatasetSpec,
        execution_rid: RID | None = None,
        snapshot_catalog: DerivaML | None = None,
    ) -> DatasetBag:
        """Download a dataset onto the local file system. Create a MINID for the dataset if one doesn't already exist.

        Args:
            dataset: Specification of the dataset to be downloaded.
            execution_rid: Execution RID for the dataset.
            snapshot_catalog: Snapshot catalog for the dataset version if specified.

        Returns:
            A DatasetBag for the downloaded dataset.
        """
        if (
            execution_rid
            and execution_rid != DRY_RUN_RID
            and self._model.catalog.resolve_rid(execution_rid).table.name != "Execution"
        ):
            raise DerivaMLException(f"RID {execution_rid} is not an execution")
        minid = self._get_dataset_minid(dataset, snapshot_catalog=snapshot_catalog)

        bag_path = (
            self._materialize_dataset_bag(minid, execution_rid=execution_rid)
            if dataset.materialize
            else self._download_dataset_minid(minid)
        )
        return DatabaseModel(minid, bag_path, self._working_dir).get_dataset()

    def _version_snapshot(self, dataset: DatasetSpec) -> str:
        """Return a catalog with snapshot for the specified dataset version"""
        try:
            version_record = next(
                h for h in self.dataset_history(dataset_rid=dataset.rid) if h.dataset_version == dataset.version
            )
        except StopIteration:
            raise DerivaMLException(f"Dataset version {dataset.version} not found for dataset {dataset.rid}")
        return f"{self._model.catalog.catalog_id}@{version_record.snapshot}"

    def _create_dataset_minid(self, dataset: DatasetSpec, snapshot_catalog: DerivaML | None = None) -> str:
        with TemporaryDirectory() as tmp_dir:
            # Generate a download specification file for the current catalog schema. By default, this spec
            # will generate a minid and place the bag into S3 storage.
            spec_file = Path(tmp_dir) / "download_spec.json"
            with spec_file.open("w", encoding="utf-8") as ds:
                json.dump(self._generate_dataset_download_spec(dataset, snapshot_catalog), ds)
            try:
                self._logger.info(
                    "Downloading dataset %s for catalog: %s@%s"
                    % (
                        "minid" if self._use_minid else "bag",
                        dataset.rid,
                        str(dataset.version),
                    )
                )
                # Generate the bag and put into S3 storage.
                exporter = DerivaExport(
                    host=self._model.catalog.deriva_server.server,
                    config_file=spec_file,
                    output_dir=tmp_dir,
                    defer_download=True,
                    timeout=(10, 610),
                    envars={"RID": dataset.rid},
                )
                minid_page_url = exporter.export()[0]  # Get the MINID launch page
            except (
                DerivaDownloadError,
                DerivaDownloadConfigurationError,
                DerivaDownloadAuthenticationError,
                DerivaDownloadAuthorizationError,
                DerivaDownloadTimeoutError,
            ) as e:
                raise DerivaMLException(format_exception(e))
            # Update version table with MINID.
            if self._use_minid:
                version_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema].tables["Dataset_Version"]
                version_rid = [
                    h for h in self.dataset_history(dataset_rid=dataset.rid) if h.dataset_version == dataset.version
                ][0].version_rid
                version_path.update([{"RID": version_rid, "Minid": minid_page_url}])
        return minid_page_url

    def _get_dataset_minid(
        self,
        dataset: DatasetSpec,
        snapshot_catalog: DerivaML | None = None,
        create: bool = True,
    ) -> DatasetMinid | None:
        """Return a MINID for the specified dataset. If no version is specified, use the latest.

        Args:
            dataset: Specification of the dataset.
            snapshot_catalog: Snapshot catalog for the dataset version if specified.
            create: Create a new MINID if one doesn't already exist.

        Returns:
            New or existing MINID for the dataset.
        """
        rid = dataset.rid

        # Case 1: RID is already a MINID or direct URL
        if rid.startswith("minid"):
            return self._fetch_minid_metadata(f"https://identifiers.org/{rid}", dataset.version)
        if rid.startswith("http"):
            return self._fetch_minid_metadata(rid, dataset.version)

        # Case 2: RID is a dataset RID – validate existence
        if not any(rid == ds["RID"] for ds in self.find_datasets()):
            raise DerivaMLTableTypeError("Dataset", rid)

        # Find dataset version record
        version_str = str(dataset.version)
        history = self.dataset_history(rid)
        try:
            version_record = next(v for v in history if v.dataset_version == version_str)
        except StopIteration:
            raise DerivaMLException(f"Version {version_str} does not exist for RID {rid}")

        # Check or create MINID
        minid_url = version_record.minid
        # If we either don't have a MINID, or we have a MINID, but we don't want to use it, generate a new one.
        if (not minid_url) or (not self._use_minid):
            if not create:
                raise DerivaMLException(f"Minid for dataset {rid} doesn't exist")
            if self._use_minid:
                self._logger.info("Creating new MINID for dataset %s", rid)
            minid_url = self._create_dataset_minid(dataset, snapshot_catalog)

        # Return based on MINID usage
        if self._use_minid:
            return self._fetch_minid_metadata(minid_url, dataset.version)
        return DatasetMinid(
            dataset_version=dataset.version,
            RID=f"{rid}@{version_record.snapshot}",
            location=minid_url,
        )

    def _fetch_minid_metadata(self, url: str, version: DatasetVersion) -> DatasetMinid:
        r = requests.get(url, headers={"accept": "application/json"})
        r.raise_for_status()
        return DatasetMinid(dataset_version=version, **r.json())

    def _download_dataset_minid(self, minid: DatasetMinid) -> Path:
        """Given a RID to a dataset_table, or a MINID to an existing bag, download the bag file, extract it, and
        validate that all the metadata is correct.

        Args:
            minid: The RID of a dataset_table or a minid to an existing bag.
        Returns:
            The location of the unpacked and validated dataset_table bag.
        """

        # Check to see if we have an existing idempotent materialization of the desired bag. If so, then reuse
        # it. If not, then we need to extract the contents of the archive into our cache directory.
        bag_dir = self._cache_dir / f"{minid.dataset_rid}_{minid.checksum}"
        if bag_dir.exists():
            self._logger.info(f"Using cached bag for {minid.dataset_rid} Version:{minid.dataset_version}")
            return Path(bag_dir / f"Dataset_{minid.dataset_rid}")

        # Either bag hasn't been downloaded yet, or we are not using a Minid, so we don't know the checksum yet.
        with TemporaryDirectory() as tmp_dir:
            if self._use_minid:
                # Get bag from S3
                bag_path = Path(tmp_dir) / Path(urlparse(minid.bag_url).path).name
                archive_path = fetch_single_file(minid.bag_url, output_path=bag_path)
            else:
                exporter = DerivaExport(host=self._model.catalog.deriva_server.server, output_dir=tmp_dir)
                archive_path = exporter.retrieve_file(minid.bag_url)
            hashes = hash_utils.compute_file_hashes(archive_path, hashes=["md5", "sha256"])
            checksum = hashes["sha256"][0]
            bag_dir = self._cache_dir / f"{minid.dataset_rid}_{checksum}"
            if bag_dir.exists():
                self._logger.info(f"Using cached bag for {minid.dataset_rid} Version:{minid.dataset_version}")
                return Path(bag_dir / f"Dataset_{minid.dataset_rid}")
            bag_path = bdb.extract_bag(archive_path, bag_dir.as_posix())
        bdb.validate_bag_structure(bag_path)
        return Path(bag_path)

    def _materialize_dataset_bag(
        self,
        minid: DatasetMinid,
        execution_rid: RID | None = None,
    ) -> Path:
        """Materialize a dataset_table bag into a local directory.

        Args:
            minid: A MINID to an existing bag or a RID of the dataset_table that should be downloaded.
            execution_rid: Optional execution RID; when provided (and not a dry run), materialization
                progress is recorded in the execution's status in the catalog.

        Returns:
            The path to the materialized bag.
        """

        def update_status(status: Status, msg: str) -> None:
            """Update the current status for this execution in the catalog."""
            if execution_rid and execution_rid != DRY_RUN_RID:
                self._model.catalog.getPathBuilder().schemas[self._ml_schema].Execution.update(
                    [
                        {
                            "RID": execution_rid,
                            "Status": status.value,
                            "Status_Detail": msg,
                        }
                    ]
                )
            self._logger.info(msg)

        def fetch_progress_callback(current, total):
            msg = f"Materializing bag: {current} of {total} file(s) downloaded."
            if execution_rid:
                update_status(Status.running, msg)
            return True

        def validation_progress_callback(current, total):
            msg = f"Validating bag: {current} of {total} file(s) validated."
            if execution_rid:
                update_status(Status.running, msg)
            return True

        # Request metadata.
        bag_path = self._download_dataset_minid(minid)
        bag_dir = bag_path.parent
        validated_check = bag_dir / "validated_check.txt"

        # If this bag has already been validated, our work is done. Otherwise, materialize the bag.
        if not validated_check.exists():
            self._logger.info(f"Materializing bag {minid.dataset_rid} Version:{minid.dataset_version}")
            bdb.materialize(
                bag_path.as_posix(),
                fetch_callback=fetch_progress_callback,
                validation_callback=validation_progress_callback,
            )
            validated_check.touch()
        return Path(bag_path)

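    # Usage sketch (hypothetical call pattern; `ml` is a DerivaML instance and `spec` a DatasetSpec;
    # `_get_dataset_minid` is an assumed helper that yields a DatasetMinid):
    #
    #     minid = ml._get_dataset_minid(spec)
    #     bag_root = ml._materialize_dataset_bag(minid)
    #     print(f"Dataset bag materialized at {bag_root}")
    #
    # The "validated_check.txt" marker written above makes a second call a cheap no-op.
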
    def _export_annotation(
        self,
        snapshot_catalog: DerivaML | None = None,
    ) -> list[dict[str, Any]]:
        """Return an export (output) specification for the datasets in the provided model.

        Returns:
            An export specification suitable for Chaise.
        """

        # The export specification covers the datasets, plus any controlled vocabulary.
        return [
            {
                "source": {"api": False, "skip_root_path": True},
                "destination": {"type": "env", "params": {"query_keys": ["snaptime"]}},
            },
            {
                "source": {"api": "entity"},
                "destination": {
                    "type": "env",
                    "params": {"query_keys": ["RID", "Description"]},
                },
            },
            {
                "source": {"api": "schema", "skip_root_path": True},
                "destination": {"type": "json", "name": "schema"},
            },
        ] + self._dataset_specification(
            self._export_annotation_dataset_element,
            None,
            snapshot_catalog=snapshot_catalog,
        )

    def _export_specification(
        self, dataset: DatasetSpec, snapshot_catalog: DerivaML | None = None
    ) -> list[dict[str, Any]]:
        """Generate a specification for the export engine for a specific dataset.

        Returns:
            A download specification for the datasets in the provided model.
        """

        # The download spec covers any controlled vocabulary plus the dataset_table.
        return [
            {
                "processor": "json",
                "processor_params": {"query_path": "/schema", "output_path": "schema"},
            }
        ] + self._dataset_specification(self._export_specification_dataset_element, dataset, snapshot_catalog)

    @staticmethod
    def _export_specification_dataset_element(spath: str, dpath: str, table: Table) -> list[dict[str, Any]]:
        """Return the download specification for the data object indicated by a path through the data model.

        Args:
            spath: Source path.
            dpath: Destination path.
            table: Table referenced by the path.

        Returns:
            The download specification that will retrieve that data from the catalog and place it into a BDBag.
        """
        exports = [
            {
                "processor": "csv",
                "processor_params": {
                    "query_path": f"/entity/{spath}",
                    "output_path": dpath,
                },
            }
        ]

        # If this table is an asset table, then we need to output the files associated with the asset.
        asset_columns = {"Filename", "URL", "Length", "MD5", "Description"}
        if asset_columns.issubset({c.name for c in table.columns}):
            exports.append(
                {
                    "processor": "fetch",
                    "processor_params": {
                        "query_path": f"/attribute/{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5,asset_rid:=RID",
                        "output_path": "asset/{asset_rid}/" + table.name,
                    },
                }
            )
        return exports

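    # Shape of the list returned above for a hypothetical asset table reached via spath
    # "deriva-ml:Dataset/Image" with dpath "Image" (values shown only to illustrate structure):
    #
    #     [{"processor": "csv",
    #       "processor_params": {"query_path": "/entity/deriva-ml:Dataset/Image", "output_path": "Image"}},
    #      {"processor": "fetch", ...}]   # fetch entry appears only when the asset columns are present
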
    def _export_annotation_dataset_element(self, spath: str, dpath: str, table: Table) -> list[dict[str, Any]]:
        """Given a path in the data model, output an export specification for the path taken to get to the
        current table.

        Args:
            spath: Source path.
            dpath: Destination path.
            table: Table referenced by the path.

        Returns:
            The export specification that will retrieve that data from the catalog and place it into a BDBag.
        """
        # The table is the last element of the path. Generate the ERMRest query by converting the list of tables
        # into a path of the form /S:T1/S:T2/S:Table.
        # Generate the destination path in the file system using just the table names.

        skip_root_path = False
        if spath.startswith(f"{self._ml_schema}:Dataset/"):
            # Chaise will add the table name and RID filter, so strip it off.
            spath = "/".join(spath.split("/")[2:])
            if spath == "":
                # This path is to just the dataset table.
                return []
        else:
            # A vocabulary table, so we don't want the root_path.
            skip_root_path = True
        exports = [
            {
                "source": {
                    "api": "entity",
                    "path": spath,
                    "skip_root_path": skip_root_path,
                },
                "destination": {"name": dpath, "type": "csv"},
            }
        ]

        # If this table is an asset table, then we need to output the files associated with the asset.
        asset_columns = {"Filename", "URL", "Length", "MD5", "Description"}
        if asset_columns.issubset({c.name for c in table.columns}):
            exports.append(
                {
                    "source": {
                        "skip_root_path": False,
                        "api": "attribute",
                        "path": f"{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5,asset_rid:=RID",
                    },
                    "destination": {"name": "asset/{asset_rid}/" + table.name, "type": "fetch"},
                }
            )
        return exports

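    # Path handling sketch for the method above: when spath starts with "<ml_schema>:Dataset/",
    # the first two path segments are dropped because Chaise re-adds the root table and RID filter;
    # if nothing remains, the element is the Dataset table itself and [] is returned. Paths that do
    # not start at Dataset are treated as vocabulary tables and keep skip_root_path=True.
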
    def _generate_dataset_download_spec(
        self, dataset: DatasetSpec, snapshot_catalog: DerivaML | None = None
    ) -> dict[str, Any]:
        """Generate a specification for downloading a specific dataset.

        This routine creates a download specification that can be used by the Deriva export processor to download
        a specific dataset as a MINID.

        Returns:
            A download specification dictionary for the export processor.
        """
        s3_target = "s3://eye-ai-shared"
        minid_test = False

        catalog_id = self._version_snapshot(dataset)
        post_processors = (
            {
                "post_processors": [
                    {
                        "processor": "cloud_upload",
                        "processor_params": {
                            "acl": "public-read",
                            "target_url": s3_target,
                        },
                    },
                    {
                        "processor": "identifier",
                        "processor_params": {
                            "test": minid_test,
                            "env_column_map": {
                                "RID": "{RID}@{snaptime}",
                                "Description": "{Description}",
                            },
                        },
                    },
                ]
            }
            if self._use_minid
            else {}
        )
        return post_processors | {
            "env": {"RID": "{RID}"},
            "bag": {
                "bag_name": "Dataset_{RID}",
                "bag_algorithms": ["md5"],
                "bag_archiver": "zip",
                "bag_metadata": {},
                "bag_idempotent": True,
            },
            "catalog": {
                "host": f"{self._model.catalog.deriva_server.scheme}://{self._model.catalog.deriva_server.server}",
                "catalog_id": catalog_id,
                "query_processors": [
                    {
                        "processor": "env",
                        "processor_params": {
                            "output_path": "Dataset",
                            "query_keys": ["snaptime"],
                            "query_path": "/",
                        },
                    },
                    {
                        "processor": "env",
                        "processor_params": {
                            "query_path": "/entity/M:=deriva-ml:Dataset/RID={RID}",
                            "output_path": "Dataset",
                            "query_keys": ["RID", "Description"],
                        },
                    },
                ]
                + self._export_specification(dataset, snapshot_catalog),
            },
        }

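    # Shape of the specification returned above (abbreviated; values come from the code):
    #
    #     {
    #         "post_processors": [...],              # present only when self._use_minid is True
    #         "env": {"RID": "{RID}"},
    #         "bag": {"bag_name": "Dataset_{RID}", "bag_archiver": "zip", ...},
    #         "catalog": {"host": "<scheme>://<server>",
    #                     "catalog_id": <result of self._version_snapshot(dataset)>,
    #                     "query_processors": [...]},
    #     }
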
    def _generate_dataset_download_annotations(self) -> dict[str, Any]:
        """Generate catalog annotations that enable BDBag export of datasets from Chaise."""
        post_processors = (
            {
                "type": "BAG",
                "outputs": [{"fragment_key": "dataset_export_outputs"}],
                "displayname": "BDBag to Cloud",
                "bag_idempotent": True,
                "postprocessors": [
                    {
                        "processor": "cloud_upload",
                        "processor_params": {
                            "acl": "public-read",
                            "target_url": "s3://eye-ai-shared/",
                        },
                    },
                    {
                        "processor": "identifier",
                        "processor_params": {
                            "test": False,
                            "env_column_map": {
                                "RID": "{RID}@{snaptime}",
                                "Description": "{Description}",
                            },
                        },
                    },
                ],
            }
            if self._use_minid
            else {}
        )
        return {
            deriva_tags.export_fragment_definitions: {"dataset_export_outputs": self._export_annotation()},
            deriva_tags.visible_foreign_keys: self._dataset_visible_fkeys(),
            deriva_tags.export_2019: {
                "detailed": {
                    "templates": [
                        {
                            "type": "BAG",
                            "outputs": [{"fragment_key": "dataset_export_outputs"}],
                            "displayname": "BDBag Download",
                            "bag_idempotent": True,
                        }
                        | post_processors
                    ]
                }
            },
        }

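    # The mapping returned above keys three Deriva annotation tags (values abbreviated):
    #     deriva_tags.export_fragment_definitions -> {"dataset_export_outputs": [...]}
    #     deriva_tags.visible_foreign_keys        -> {"detailed": [...]}
    #     deriva_tags.export_2019                 -> {"detailed": {"templates": [...]}}
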
    def _dataset_visible_fkeys(self) -> dict[str, Any]:
        """Build the visible-foreign-keys annotation for the Dataset table."""

        def fkey_name(fk):
            return [fk.name[0].name, fk.name[1]]

        dataset_table = self._model.schemas["deriva-ml"].tables["Dataset"]

        source_list = [
            {
                "source": [
                    {"inbound": ["deriva-ml", "Dataset_Version_Dataset_fkey"]},
                    "RID",
                ],
                "markdown_name": "Previous Versions",
                "entity": True,
            },
            {
                "source": [
                    {"inbound": ["deriva-ml", "Dataset_Dataset_Nested_Dataset_fkey"]},
                    {"outbound": ["deriva-ml", "Dataset_Dataset_Dataset_fkey"]},
                    "RID",
                ],
                "markdown_name": "Parent Datasets",
            },
            {
                "source": [
                    {"inbound": ["deriva-ml", "Dataset_Dataset_Dataset_fkey"]},
                    {"outbound": ["deriva-ml", "Dataset_Dataset_Nested_Dataset_fkey"]},
                    "RID",
                ],
                "markdown_name": "Child Datasets",
            },
        ]
        source_list.extend(
            [
                {
                    "source": [
                        {"inbound": fkey_name(fkey.self_fkey)},
                        {"outbound": fkey_name(other_fkey := fkey.other_fkeys.pop())},
                        "RID",
                    ],
                    "markdown_name": other_fkey.pk_table.name,
                }
                for fkey in dataset_table.find_associations(max_arity=3, pure=False)
            ]
        )
        return {"detailed": source_list}