deriva-ml 1.17.9__py3-none-any.whl → 1.17.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/__init__.py +43 -1
- deriva_ml/asset/__init__.py +17 -0
- deriva_ml/asset/asset.py +357 -0
- deriva_ml/asset/aux_classes.py +100 -0
- deriva_ml/bump_version.py +254 -11
- deriva_ml/catalog/__init__.py +21 -0
- deriva_ml/catalog/clone.py +1199 -0
- deriva_ml/catalog/localize.py +426 -0
- deriva_ml/core/__init__.py +29 -0
- deriva_ml/core/base.py +817 -1067
- deriva_ml/core/config.py +169 -21
- deriva_ml/core/constants.py +120 -19
- deriva_ml/core/definitions.py +123 -13
- deriva_ml/core/enums.py +47 -73
- deriva_ml/core/ermrest.py +226 -193
- deriva_ml/core/exceptions.py +297 -14
- deriva_ml/core/filespec.py +99 -28
- deriva_ml/core/logging_config.py +225 -0
- deriva_ml/core/mixins/__init__.py +42 -0
- deriva_ml/core/mixins/annotation.py +915 -0
- deriva_ml/core/mixins/asset.py +384 -0
- deriva_ml/core/mixins/dataset.py +237 -0
- deriva_ml/core/mixins/execution.py +408 -0
- deriva_ml/core/mixins/feature.py +365 -0
- deriva_ml/core/mixins/file.py +263 -0
- deriva_ml/core/mixins/path_builder.py +145 -0
- deriva_ml/core/mixins/rid_resolution.py +204 -0
- deriva_ml/core/mixins/vocabulary.py +400 -0
- deriva_ml/core/mixins/workflow.py +322 -0
- deriva_ml/core/validation.py +389 -0
- deriva_ml/dataset/__init__.py +2 -1
- deriva_ml/dataset/aux_classes.py +20 -4
- deriva_ml/dataset/catalog_graph.py +575 -0
- deriva_ml/dataset/dataset.py +1242 -1008
- deriva_ml/dataset/dataset_bag.py +1311 -182
- deriva_ml/dataset/history.py +27 -14
- deriva_ml/dataset/upload.py +225 -38
- deriva_ml/demo_catalog.py +186 -105
- deriva_ml/execution/__init__.py +46 -2
- deriva_ml/execution/base_config.py +639 -0
- deriva_ml/execution/execution.py +545 -244
- deriva_ml/execution/execution_configuration.py +26 -11
- deriva_ml/execution/execution_record.py +592 -0
- deriva_ml/execution/find_caller.py +298 -0
- deriva_ml/execution/model_protocol.py +175 -0
- deriva_ml/execution/multirun_config.py +153 -0
- deriva_ml/execution/runner.py +595 -0
- deriva_ml/execution/workflow.py +224 -35
- deriva_ml/experiment/__init__.py +8 -0
- deriva_ml/experiment/experiment.py +411 -0
- deriva_ml/feature.py +6 -1
- deriva_ml/install_kernel.py +143 -6
- deriva_ml/interfaces.py +862 -0
- deriva_ml/model/__init__.py +99 -0
- deriva_ml/model/annotations.py +1278 -0
- deriva_ml/model/catalog.py +286 -60
- deriva_ml/model/database.py +144 -649
- deriva_ml/model/deriva_ml_database.py +308 -0
- deriva_ml/model/handles.py +14 -0
- deriva_ml/run_model.py +319 -0
- deriva_ml/run_notebook.py +507 -38
- deriva_ml/schema/__init__.py +18 -2
- deriva_ml/schema/annotations.py +62 -33
- deriva_ml/schema/create_schema.py +169 -69
- deriva_ml/schema/validation.py +601 -0
- {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/METADATA +4 -5
- deriva_ml-1.17.11.dist-info/RECORD +77 -0
- {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/WHEEL +1 -1
- {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/entry_points.txt +2 -0
- deriva_ml/protocols/dataset.py +0 -19
- deriva_ml/test.py +0 -94
- deriva_ml-1.17.9.dist-info/RECORD +0 -45
- {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/top_level.txt +0 -0
deriva_ml/dataset/catalog_graph.py (new file)

@@ -0,0 +1,575 @@
from __future__ import annotations

from collections import defaultdict
from pprint import pformat
from typing import Any, Callable, Iterator

from deriva.core.ermrest_model import Table
from deriva.core.utils.core_utils import tag as deriva_tags

from deriva_ml.core.constants import RID
from deriva_ml.interfaces import DatasetLike, DerivaMLCatalog

try:
    from icecream import ic

    ic.configureOutput(
        includeContext=True,
        argToStringFunction=lambda x: pformat(x.model_dump() if hasattr(x, "model_dump") else x, width=80, depth=10),
    )
except ImportError:  # Graceful fallback if IceCream isn't installed.
    ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a)  # noqa


class CatalogGraph:
    """Generates export specifications and annotations for dataset downloads.

    This class creates the configuration needed for Deriva's export processor to
    download datasets as BDBags, optionally with S3 upload and MINID registration.

    Args:
        ml_instance: The DerivaML catalog instance.
        s3_bucket: S3 bucket URL for dataset bag storage (e.g., 's3://my-bucket').
            Required for MINID functionality. If None, MINID features are disabled.
        use_minid: Whether to use MINID service for persistent identification.
            Only effective when s3_bucket is provided.
    """

    def __init__(
        self,
        ml_instance: DerivaMLCatalog,
        s3_bucket: str | None = None,
        use_minid: bool = True,
    ):
        self._ml_schema = ml_instance.ml_schema
        self._ml_instance = ml_instance
        self._s3_bucket = s3_bucket
        # MINID only works if S3 bucket is configured
        self._use_minid = use_minid and s3_bucket is not None
        self._dataset_table = ml_instance._dataset_table

    def _export_annotation(
        self,
    ) -> list[dict[str, Any]]:
        """Return and output specification for the datasets in the provided model

        Returns:
            An export specification suitable for Chaise.
        """

        # Export specification is a specification for the datasets, plus any controlled vocabulary
        return [
            {
                "source": {"api": False, "skip_root_path": True},
                "destination": {"type": "env", "params": {"query_keys": ["snaptime"]}},
            },
            {
                "source": {"api": "entity"},
                "destination": {
                    "type": "env",
                    "params": {"query_keys": ["RID", "Description"]},
                },
            },
            {
                "source": {"api": "schema", "skip_root_path": True},
                "destination": {"type": "json", "name": "schema"},
            },
        ] + self._dataset_specification(self._export_annotation_dataset_element, None)

    def _export_specification(self, dataset: DatasetLike) -> list[dict[str, Any]]:
        """
        Generate a specification for export engine for specific dataset.

        Returns:
            a download specification for the datasets in the provided model.

        """

        # Download spec is the spec for any controlled vocabulary and for the dataset_table.
        return [
            {
                "processor": "json",
                "processor_params": {"query_path": "/schema", "output_path": "schema"},
            }
        ] + self._dataset_specification(self._export_specification_dataset_element, dataset)

    @staticmethod
    def _export_specification_dataset_element(spath: str, dpath: str, table: Table) -> list[dict[str, Any]]:
        """Return the download specification for the data object indicated by a path through the data model.

        Args:
            spath: Source path
            dpath: Destination path
            table: Table referenced to by the path

        Returns:
            The download specification that will retrieve that data from the catalog and place it into a BDBag.
        """
        exports = [
            {
                "processor": "csv",
                "processor_params": {
                    "query_path": f"/entity/{spath}",
                    "output_path": dpath,
                },
            }
        ]

        # If this table is an asset table, then we need to output the files associated with the asset.
        asset_columns = {"Filename", "URL", "Length", "MD5", "Description"}
        if asset_columns.issubset({c.name for c in table.columns}):
            exports.append(
                {
                    "processor": "fetch",
                    "processor_params": {
                        "query_path": f"/attribute/{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5,asset_rid:=RID",
                        "output_path": "asset/{asset_rid}/" + table.name,
                    },
                }
            )
        return exports

    def _export_annotation_dataset_element(self, spath: str, dpath: str, table: Table) -> list[dict[str, Any]]:
        """Given a path in the data model, output an export specification for the path taken to get to the
        current table.

        Args:
            spath: Source path
            dpath: Destination path
            table: Table referenced to by the path

        Returns:
            The export specification that will retrieve that data from the catalog and place it into a BDBag.
        """
        # The table is the last element of the path. Generate the ERMRest query by converting the list of tables
        # into a path in the form of /S:T1/S:T2/S:Table
        # Generate the destination path in the file system using just the table names.

        skip_root_path = False
        if spath.startswith(f"{self._ml_schema}:Dataset/"):
            # Chaise will add table name and RID filter, so strip it off.
            spath = "/".join(spath.split("/")[2:])
            if spath == "":
                # This path is to just the dataset table.
                return []
        else:
            # A vocabulary table, so we don't want the root_path.
            skip_root_path = True
        exports = [
            {
                "source": {
                    "api": "entity",
                    "path": spath,
                    "skip_root_path": skip_root_path,
                },
                "destination": {"name": dpath, "type": "csv"},
            }
        ]

        # If this table is an asset table, then we need to output the files associated with the asset.
        asset_columns = {"Filename", "URL", "Length", "MD5", "Description"}
        if asset_columns.issubset({c.name for c in table.columns}):
            exports.append(
                {
                    "source": {
                        "skip_root_path": False,
                        "api": "attribute",
                        "path": f"{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5, asset_rid:=RID",
                    },
                    "destination": {"name": "asset/{asset_rid}/" + table.name, "type": "fetch"},
                }
            )
        return exports

    def generate_dataset_download_spec(self, dataset: DatasetLike) -> dict[str, Any]:
        """Generate a specification for downloading a specific dataset.

        This routine creates a download specification that can be used by the Deriva
        export processor to download a specific dataset as a BDBag. If s3_bucket is
        configured and use_minid is True, the bag will be uploaded to S3 and
        registered with the MINID service.

        Args:
            dataset: The dataset to generate the download spec for.

        Returns:
            A download specification dictionary for the Deriva export processor.
        """
        minid_test = False

        post_processors: dict[str, Any] = {}
        if self._use_minid and self._s3_bucket:
            post_processors = {
                "post_processors": [
                    {
                        "processor": "cloud_upload",
                        "processor_params": {
                            "acl": "public-read",
                            "target_url": self._s3_bucket,
                        },
                    },
                    {
                        "processor": "identifier",
                        "processor_params": {
                            "test": minid_test,
                            "env_column_map": {
                                "RID": "{RID}@{snaptime}",
                                "Description": "{Description}",
                            },
                        },
                    },
                ]
            }
        return post_processors | {
            "env": {"RID": "{RID}"},
            "bag": {
                "bag_name": "Dataset_{RID}",
                "bag_algorithms": ["md5"],
                "bag_archiver": "zip",
                "bag_metadata": {},
                "bag_idempotent": True,
            },
            "catalog": {
                "host": f"{self._ml_instance.catalog.deriva_server.scheme}://{self._ml_instance.catalog.deriva_server.server}",
                "catalog_id": self._ml_instance.catalog_id,
                "query_processors": [
                    {
                        "processor": "env",
                        "processor_params": {
                            "output_path": "Dataset",
                            "query_keys": ["snaptime"],
                            "query_path": "/",
                        },
                    },
                    {
                        "processor": "env",
                        "processor_params": {
                            "query_path": "/entity/M:=deriva-ml:Dataset/RID={RID}",
                            "output_path": "Dataset",
                            "query_keys": ["RID", "Description"],
                        },
                    },
                ]
                + self._export_specification(dataset),
            },
        }

    def generate_dataset_download_annotations(self) -> dict[str, Any]:
        """Generate export annotations for the Dataset table.

        These annotations configure Chaise's export functionality for datasets.
        If s3_bucket is configured and use_minid is True, includes post-processors
        for S3 upload and MINID registration.

        Returns:
            A dictionary of annotations to apply to the Dataset table.
        """
        post_processors: dict[str, Any] = {}
        if self._use_minid and self._s3_bucket:
            # Ensure the S3 bucket URL ends with a trailing slash for the annotation
            s3_url = self._s3_bucket if self._s3_bucket.endswith("/") else f"{self._s3_bucket}/"
            post_processors = {
                "type": "BAG",
                "outputs": [{"fragment_key": "dataset_export_outputs"}],
                "displayname": "BDBag to Cloud",
                "bag_idempotent": True,
                "postprocessors": [
                    {
                        "processor": "cloud_upload",
                        "processor_params": {
                            "acl": "public-read",
                            "target_url": s3_url,
                        },
                    },
                    {
                        "processor": "identifier",
                        "processor_params": {
                            "test": False,
                            "env_column_map": {
                                "RID": "{RID}@{snaptime}",
                                "Description": "{Description}",
                            },
                        },
                    },
                ],
            }
        return {
            deriva_tags.export_fragment_definitions: {"dataset_export_outputs": self._export_annotation()},
            deriva_tags.visible_foreign_keys: self._dataset_visible_fkeys(),
            deriva_tags.export_2019: {
                "detailed": {
                    "templates": [
                        {
                            "type": "BAG",
                            "outputs": [{"fragment_key": "dataset_export_outputs"}],
                            "displayname": "BDBag Download",
                            "bag_idempotent": True,
                        }
                        | post_processors
                    ]
                }
            },
        }

    def _dataset_visible_fkeys(self) -> dict[str, Any]:
        def fkey_name(fk):
            return [fk.name[0].name, fk.name[1]]

        dataset_table = self._ml_instance.model.schemas["deriva-ml"].tables["Dataset"]

        source_list = [
            {
                "source": [
                    {"inbound": ["deriva-ml", "Dataset_Version_Dataset_fkey"]},
                    "RID",
                ],
                "markdown_name": "Previous Versions",
                "entity": True,
            },
            {
                "source": [
                    {"inbound": ["deriva-ml", "Dataset_Dataset_Nested_Dataset_fkey"]},
                    {"outbound": ["deriva-ml", "Dataset_Dataset_Dataset_fkey"]},
                    "RID",
                ],
                "markdown_name": "Parent Datasets",
            },
            {
                "source": [
                    {"inbound": ["deriva-ml", "Dataset_Dataset_Dataset_fkey"]},
                    {"outbound": ["deriva-ml", "Dataset_Dataset_Nested_Dataset_fkey"]},
                    "RID",
                ],
                "markdown_name": "Child Datasets",
            },
        ]
        source_list.extend(
            [
                {
                    "source": [
                        {"inbound": fkey_name(fkey.self_fkey)},
                        {"outbound": fkey_name(other_fkey := fkey.other_fkeys.pop())},
                        "RID",
                    ],
                    "markdown_name": other_fkey.pk_table.name,
                }
                for fkey in dataset_table.find_associations(max_arity=3, pure=False)
            ]
        )
        return {"detailed": source_list}

    def _collect_paths(
        self,
        dataset_rid: RID | None = None,
        dataset_nesting_depth: int | None = None,
    ) -> set[tuple[Table, ...]]:
        """
        Collects all schema paths relevant to a specific dataset, optionally filtered by dataset membership or nesting
        depth, and returns those paths. The paths represent relationships between tables in the schema and how they can
        be traversed based on the dataset's structure and context.

        Args:
            dataset_rid:
                An optional identifier for the specific dataset to filter paths. If provided,
                only paths traversing elements of this dataset will be included.
            dataset_nesting_depth:
                Specifies the depth to which nested datasets should be included. If not provided,
                a default depth is calculated based on the current instance.

        Returns:
            set[tuple[Table, ...]]:
                A set of tuples, where each tuple represents a valid path consisting of
                Tables. Each path defines how tables are connected and can be navigated
                through the schema.
        """

        dataset_table = self._ml_instance.model.schemas[self._ml_schema].tables["Dataset"]
        dataset_dataset = self._ml_instance.model.schemas[self._ml_schema].tables["Dataset_Dataset"]

        # Figure out what types of elements the dataset contains.
        dataset_associations = [
            a
            for a in self._dataset_table.find_associations()
            if a.table.schema.name != self._ml_schema or a.table.name == "Dataset_Dataset"
        ]

        if dataset_rid:
            # Get a list of the members of the dataset so we can figure out which tables to query.
            dataset = self._ml_instance.lookup_dataset(dataset_rid)
            dataset_elements = [
                self._ml_instance.model.name_to_table(e) for e, m in dataset.list_dataset_members().items() if m
            ]
            included_associations = [
                a.table for a in dataset_table.find_associations() if a.other_fkeys.pop().pk_table in dataset_elements
            ]
        else:
            included_associations = [a.table for a in dataset_associations]

        # Get the paths through the schema and filter out all the dataset paths not used by this dataset.
        paths = {
            tuple(p)
            for p in self._ml_instance.model._schema_to_paths()
            if (len(p) == 1)
            or (p[1] not in dataset_associations)  # Tables in the domain schema
            or (p[1] in included_associations)  # Tables that include members of the dataset
        }

        # Add feature table paths for domain tables in the dataset
        # Feature tables (e.g., Execution_Image_Image_Classification) contain feature values
        # that need to be exported with the dataset
        if dataset_rid:
            for element_table in dataset_elements:
                for feature in self._ml_instance.find_features(element_table):
                    # Find the path to the element table and extend it with the feature table
                    for path in paths.copy():
                        if path[-1] == element_table:
                            # Add a path that goes through the element table to the feature table
                            paths.add(path + (feature.feature_table,))

        # Now get paths for nested datasets
        nested_paths = set()
        if dataset_rid:
            dataset = self._ml_instance.lookup_dataset(dataset_rid)
            for c in dataset.list_dataset_children():
                nested_paths |= self._collect_paths(c.dataset_rid)
        else:
            # Initialize nesting depth if not already provided.
            dataset_nesting_depth = (
                self._dataset_nesting_depth() if dataset_nesting_depth is None else dataset_nesting_depth
            )
            if dataset_nesting_depth:
                nested_paths = self._collect_paths(dataset_nesting_depth=dataset_nesting_depth - 1)
        if nested_paths:
            paths |= {
                tuple([dataset_table]),
                (dataset_table, dataset_dataset),
            }
            paths |= {(self._dataset_table, dataset_dataset) + p for p in nested_paths}
        return paths

    def _export_vocabulary(self, writer: Callable[[str, str, Table], list[dict[str, Any]]]) -> list[dict[str, Any]]:
        """

        Args:
            writer: Callable[[list[Table]]: list[dict[str: Any]]]:

        Returns:

        """
        vocabs = [
            table
            for s in self._ml_instance.model.schemas.values()
            for table in s.tables.values()
            if self._ml_instance.model.is_vocabulary(table)
        ]
        return [o for table in vocabs for o in writer(f"{table.schema.name}:{table.name}", table.name, table)]

    def _table_paths(
        self,
        dataset: DatasetLike | None = None,
    ) -> Iterator[tuple[str, str, Table]]:
        paths = self._collect_paths(dataset and dataset.dataset_rid)

        def source_path(path: tuple[Table, ...]) -> list[str]:
            """Convert a tuple representing a path into a source path component with FK linkage"""
            path = list(path)
            p = [f"{self._ml_instance.ml_schema}:Dataset/RID={{RID}}"]
            for table in path[1:]:
                if table.name == "Dataset_Dataset":
                    p.append("(RID)=(deriva-ml:Dataset_Dataset:Dataset)")
                elif table.name == "Dataset":
                    p.append("(Nested_Dataset)=(deriva-ml:Dataset:RID)")
                elif table.name == "Dataset_Version":
                    p.append(f"(RID)=({self._ml_instance.ml_schema}:Dataset_Version:Dataset)")
                else:
                    p.append(f"{table.schema.name}:{table.name}")
            return p

        src_paths = ["/".join(source_path(p)) for p in paths]
        dest_paths = ["/".join([t.name for t in p]) for p in paths]
        target_tables = [p[-1] for p in paths]
        return zip(src_paths, dest_paths, target_tables)

    def _dataset_nesting_depth(self, dataset: DatasetLike | None = None) -> int:
        """Determine the maximum dataset nesting depth in the current catalog.

        Returns:

        """

        def children_depth(dataset: RID, nested_datasets: dict[str, list[str]]) -> int:
            """Return the number of nested datasets for the dataset_rid if provided, otherwise in the current catalog"""
            try:
                children = nested_datasets[dataset]
                return max(map(lambda x: children_depth(x, nested_datasets), children)) + 1 if children else 1
            except KeyError:
                return 0

        # Build up the dataset_table nesting graph...
        pb = self._ml_instance.catalog.getPathBuilder().schemas[self._ml_schema].tables["Dataset_Dataset"]
        dataset_children = (
            [
                {
                    "Dataset": dataset.dataset_rid,
                    "Nested_Dataset": c,
                }  # Make uniform with return from datapath
                for c in dataset.list_dataset_children()
            ]
            if dataset
            else pb.entities().fetch()
        )
        nested_dataset = defaultdict(list)
        for ds in dataset_children:
            nested_dataset[ds["Dataset"]].append(ds["Nested_Dataset"])
        return max(map(lambda d: children_depth(d, dict(nested_dataset)), nested_dataset)) if nested_dataset else 0

    def _dataset_specification(
        self,
        writer: Callable[[str, str, Table], list[dict[str, Any]]],
        dataset: DatasetLike | None = None,
    ) -> list[dict[str, Any]]:
        """Output a download/export specification for a dataset_table. Each element of the dataset_table
        will be placed in its own directory.
        The top level data directory of the resulting BDBag will have one subdirectory for element type.
        The subdirectory will contain the CSV indicating which elements of that type are present in the
        dataset_table, and then there will be a subdirectory for each object that is reachable from the
        dataset_table members.

        To simplify reconstructing the relationship between tables, the CVS for each element is included.
        The top level data directory will also contain a subdirectory for any controlled vocabularies used in
        the dataset_table. All assets will be placed into a directory named asset in a subdirectory with the
        asset table name.

        For example, consider a dataset_table that consists of two element types, T1 and T2. T1 has foreign
        key relationships to objects in tables T3 and T4. There are also two controlled vocabularies, CV1 and
        CV2. T2 is an asset table which has two assets in it. The layout of the resulting bdbag would be:
        data
          CV1/
            cv1.csv
          CV2/
            cv2.csv
          Dataset/
            T1/
              t1.csv
              T3/
                t3.csv
              T4/
                t4.csv
            T2/
              t2.csv
          asset/
            T2
              f1
              f2

        Args:
            writer: Callable[[list[Table]]: list[dict[str: Any]]]:

        Returns:
            A dataset_table specification.
        """
        element_spec = self._export_vocabulary(writer)
        for path in self._table_paths(dataset=dataset):
            element_spec.extend(writer(*path))
        return element_spec