deriva_ml-1.17.10-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. deriva_ml/.DS_Store +0 -0
  2. deriva_ml/__init__.py +79 -0
  3. deriva_ml/bump_version.py +142 -0
  4. deriva_ml/core/__init__.py +39 -0
  5. deriva_ml/core/base.py +1527 -0
  6. deriva_ml/core/config.py +69 -0
  7. deriva_ml/core/constants.py +36 -0
  8. deriva_ml/core/definitions.py +74 -0
  9. deriva_ml/core/enums.py +222 -0
  10. deriva_ml/core/ermrest.py +288 -0
  11. deriva_ml/core/exceptions.py +28 -0
  12. deriva_ml/core/filespec.py +116 -0
  13. deriva_ml/dataset/__init__.py +12 -0
  14. deriva_ml/dataset/aux_classes.py +225 -0
  15. deriva_ml/dataset/dataset.py +1519 -0
  16. deriva_ml/dataset/dataset_bag.py +450 -0
  17. deriva_ml/dataset/history.py +109 -0
  18. deriva_ml/dataset/upload.py +439 -0
  19. deriva_ml/demo_catalog.py +495 -0
  20. deriva_ml/execution/__init__.py +26 -0
  21. deriva_ml/execution/environment.py +290 -0
  22. deriva_ml/execution/execution.py +1180 -0
  23. deriva_ml/execution/execution_configuration.py +147 -0
  24. deriva_ml/execution/workflow.py +413 -0
  25. deriva_ml/feature.py +228 -0
  26. deriva_ml/install_kernel.py +71 -0
  27. deriva_ml/model/__init__.py +0 -0
  28. deriva_ml/model/catalog.py +485 -0
  29. deriva_ml/model/database.py +719 -0
  30. deriva_ml/protocols/dataset.py +19 -0
  31. deriva_ml/run_notebook.py +228 -0
  32. deriva_ml/schema/__init__.py +3 -0
  33. deriva_ml/schema/annotations.py +473 -0
  34. deriva_ml/schema/check_schema.py +104 -0
  35. deriva_ml/schema/create_schema.py +393 -0
  36. deriva_ml/schema/deriva-ml-reference.json +8525 -0
  37. deriva_ml/schema/policy.json +81 -0
  38. deriva_ml/schema/table_comments_utils.py +57 -0
  39. deriva_ml/test.py +94 -0
  40. deriva_ml-1.17.10.dist-info/METADATA +38 -0
  41. deriva_ml-1.17.10.dist-info/RECORD +45 -0
  42. deriva_ml-1.17.10.dist-info/WHEEL +5 -0
  43. deriva_ml-1.17.10.dist-info/entry_points.txt +9 -0
  44. deriva_ml-1.17.10.dist-info/licenses/LICENSE +201 -0
  45. deriva_ml-1.17.10.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1519 @@
1
+ """Dataset management for DerivaML.
2
+
3
+ This module provides functionality for managing datasets in DerivaML. A dataset represents a collection
4
+ of related data that can be versioned, downloaded, and tracked. The module includes:
5
+
6
+ - Dataset class: Core class for dataset operations
7
+ - Version management: Track and update dataset versions
8
+ - History tracking: Record dataset changes over time
9
+ - Download capabilities: Export datasets as BDBags
10
+ - Relationship management: Handle dataset dependencies and hierarchies
11
+
12
+ The Dataset class serves as a base class in DerivaML, making its methods accessible through
13
+ DerivaML class instances.
14
+
15
+ Typical usage example:
16
+ >>> ml = DerivaML('deriva.example.org', 'my_catalog')
17
+ >>> dataset_rid = ml.create_dataset('experiment', 'Experimental data')
18
+ >>> ml.add_dataset_members(dataset_rid=dataset_rid, members=['1-abc123', '1-def456'])
19
+ >>> ml.increment_dataset_version(dataset_rid=dataset_rid, component=VersionPart.minor,
20
+ ... description='Added new samples')
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import json
26
+ import logging
27
+ from collections import defaultdict
28
+
29
+ # Standard library imports
30
+ from graphlib import TopologicalSorter
31
+ from pathlib import Path
32
+ from tempfile import TemporaryDirectory
33
+ from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator
34
+ from urllib.parse import urlparse
35
+
36
+ import deriva.core.utils.hash_utils as hash_utils
37
+ import requests
38
+
39
+ # Third-party imports
40
+ from bdbag import bdbag_api as bdb
41
+ from bdbag.fetch.fetcher import fetch_single_file
42
+
43
+ # Deriva imports
44
+ from deriva.core.ermrest_model import Table
45
+ from deriva.core.utils.core_utils import format_exception
46
+ from deriva.core.utils.core_utils import tag as deriva_tags
47
+ from deriva.transfer.download.deriva_download import (
48
+ DerivaDownloadAuthenticationError,
49
+ DerivaDownloadAuthorizationError,
50
+ DerivaDownloadConfigurationError,
51
+ DerivaDownloadError,
52
+ DerivaDownloadTimeoutError,
53
+ )
54
+ from deriva.transfer.download.deriva_export import DerivaExport
55
+ from pydantic import ConfigDict, validate_call
56
+
57
+ # Local imports
58
+ try:
59
+ from icecream import ic
60
+
61
+ ic.configureOutput(includeContext=True)
62
+ except ImportError: # Graceful fallback if IceCream isn't installed.
63
+ ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a) # noqa
64
+
65
+ from deriva_ml.core.constants import RID
66
+ from deriva_ml.core.definitions import (
67
+ DRY_RUN_RID,
68
+ ML_SCHEMA,
69
+ MLVocab,
70
+ Status,
71
+ )
72
+ from deriva_ml.core.exceptions import DerivaMLException, DerivaMLTableTypeError
73
+ from deriva_ml.dataset.aux_classes import (
74
+ DatasetHistory,
75
+ DatasetMinid,
76
+ DatasetSpec,
77
+ DatasetVersion,
78
+ VersionPart,
79
+ )
80
+ from deriva_ml.dataset.dataset_bag import DatasetBag
81
+ from deriva_ml.model.catalog import DerivaModel
82
+ from deriva_ml.model.database import DatabaseModel
83
+
84
+ from .history import iso_to_snap
85
+
86
+ # Stop PyCharm from complaining about an undefined reference in the docstring examples.
87
+ ml: DerivaML
88
+
89
+ if TYPE_CHECKING:
90
+ from deriva_ml.core.base import DerivaML
91
+
92
+
93
+ class Dataset:
94
+ """Manages dataset operations in a Deriva catalog.
95
+
96
+ The Dataset class provides functionality for creating, modifying, and tracking datasets
97
+ in a Deriva catalog. It handles versioning, relationships between datasets, and data export.
98
+
99
+ Attributes:
100
+ dataset_table (Table): ERMrest table storing dataset information.
101
+ _model (DerivaModel): Catalog model instance.
102
+ _ml_schema (str): Schema name for ML-specific tables.
103
+ _cache_dir (Path): Directory for caching downloaded datasets.
104
+ _working_dir (Path): Directory for working data.
105
+ _use_minid (bool): Whether to use MINID service for dataset identification.
106
+
107
+ Note:
108
+ This class is typically used as a base class, with its methods accessed through
109
+ DerivaML class instances rather than directly.
110
+ """
111
+
112
+ _Logger = logging.getLogger("deriva_ml")
113
+
114
+ def __init__(
115
+ self,
116
+ model: DerivaModel,
117
+ cache_dir: Path,
118
+ working_dir: Path,
119
+ use_minid: bool = True,
120
+ ):
121
+ """Initializes a Dataset instance.
122
+
123
+ Args:
124
+ model: DerivaModel instance representing the catalog.
125
+ cache_dir: Directory path for caching downloaded datasets.
126
+ working_dir: Directory path for working data.
127
+ use_minid: Whether to use MINID service for dataset identification.
128
+ """
129
+ self._model = model
130
+ self._ml_schema = ML_SCHEMA
131
+ self._cache_dir = cache_dir
132
+ self._working_dir = working_dir
133
+ self._logger = logging.getLogger("deriva_ml")
134
+ self._use_minid = use_minid
135
+
136
+ @property
137
+ def _dataset_table(self):
138
+ return self._model.schemas[self._ml_schema].tables["Dataset"]
139
+
140
+ def _is_dataset_rid(self, dataset_rid: RID, deleted: bool = False) -> bool:
141
+ try:
142
+ rid_info = self._model.catalog.resolve_rid(dataset_rid, self._model.model)
143
+ except KeyError as _e:
144
+ raise DerivaMLException(f"Invalid RID {dataset_rid}")
145
+ if rid_info.table != self._dataset_table:
146
+ return False
147
+ elif deleted:
148
+ # Got a dataset RID; the caller accepts deleted datasets, so it is valid regardless of the Deleted flag.
149
+ return True
150
+ else:
151
+ return not list(rid_info.datapath.entities().fetch())[0]["Deleted"]
152
+
153
+ def _insert_dataset_versions(
154
+ self,
155
+ dataset_list: list[DatasetSpec],
156
+ description: str | None = "",
157
+ execution_rid: RID | None = None,
158
+ ) -> None:
159
+ schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
160
+ # determine snapshot after changes were made
161
+
162
+ # Construct version records for insert
163
+ version_records = schema_path.tables["Dataset_Version"].insert(
164
+ [
165
+ {
166
+ "Dataset": dataset.rid,
167
+ "Version": str(dataset.version),
168
+ "Description": description,
169
+ "Execution": execution_rid,
170
+ }
171
+ for dataset in dataset_list
172
+ ]
173
+ )
174
+ version_records = list(version_records)
175
+ snap = self._model.catalog.get("/").json()["snaptime"]
176
+ schema_path.tables["Dataset_Version"].update(
177
+ [{"RID": v["RID"], "Dataset": v["Dataset"], "Snapshot": snap} for v in version_records]
178
+ )
179
+
180
+ # And update the dataset records.
181
+ schema_path.tables["Dataset"].update([{"Version": v["RID"], "RID": v["Dataset"]} for v in version_records])
182
+
183
+ def _bootstrap_versions(self):
184
+ datasets = [ds["RID"] for ds in self.find_datasets()]
185
+ ds_version = [
186
+ {
187
+ "Dataset": d,
188
+ "Version": "0.1.0",
189
+ "Description": "Dataset at the time of conversion to versioned datasets",
190
+ }
191
+ for d in datasets
192
+ ]
193
+ schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
194
+ version_path = schema_path.tables["Dataset_Version"]
195
+ dataset_path = schema_path.tables["Dataset"]
196
+ history = list(version_path.insert(ds_version))
197
+ dataset_versions = [{"RID": h["Dataset"], "Version": h["Version"]} for h in history]
198
+ dataset_path.update(dataset_versions)
199
+
200
+ def _synchronize_dataset_versions(self):
201
+ datasets = [ds["RID"] for ds in self.find_datasets()]
202
+ for ds in datasets:
203
+ self.dataset_version(ds)
204
+ schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
205
+ dataset_version_path = schema_path.tables["Dataset_Version"]
206
+ # Get the maximum version number for each dataset.
207
+ versions = {}
208
+ for v in dataset_version_path.entities().fetch():
209
+ if DatasetVersion.parse(v["Version"]) > DatasetVersion.parse(versions.get(v["Dataset"], {"Version": "0.0.0"})["Version"]):
210
+ versions[v["Dataset"]] = v
211
+ dataset_path = schema_path.tables["Dataset"]
212
+
213
+ dataset_path.update([{"RID": dataset, "Version": version["RID"]} for dataset, version in versions.items()])
214
+
215
+ def _set_version_snapshot(self):
216
+ """Update the Snapshot column of the Dataset_Version table to the correct time."""
217
+ dataset_version_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema].tables["Dataset_Version"]
218
+ versions = dataset_version_path.entities().fetch()
219
+ dataset_version_path.update(
220
+ [{"RID": h["RID"], "Snapshot": iso_to_snap(h["RCT"])} for h in versions if not h["Snapshot"]]
221
+ )
222
+
223
+ def dataset_history(self, dataset_rid: RID) -> list[DatasetHistory]:
224
+ """Retrieves the version history of a dataset.
225
+
226
+ Returns a chronological list of dataset versions, including their version numbers,
227
+ creation times, and associated metadata.
228
+
229
+ Args:
230
+ dataset_rid: Resource Identifier of the dataset.
231
+
232
+ Returns:
233
+ list[DatasetHistory]: List of history entries, each containing:
234
+ - dataset_version: Version number (major.minor.patch)
235
+ - minid: Minimal Viable Identifier
236
+ - snapshot: Catalog snapshot time
237
+ - dataset_rid: Dataset Resource Identifier
238
+ - version_rid: Version Resource Identifier
239
+ - description: Version description
240
+ - execution_rid: Associated execution RID
241
+
242
+ Raises:
243
+ DerivaMLException: If dataset_rid is not a valid dataset RID.
244
+
245
+ Example:
246
+ >>> history = ml.dataset_history("1-abc123")
247
+ >>> for entry in history:
248
+ ... print(f"Version {entry.dataset_version}: {entry.description}")
249
+ """
250
+
251
+ if not self._is_dataset_rid(dataset_rid):
252
+ raise DerivaMLException(f"RID is not for a data set: {dataset_rid}")
253
+ version_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema].tables["Dataset_Version"]
254
+ return [
255
+ DatasetHistory(
256
+ dataset_version=DatasetVersion.parse(v["Version"]),
257
+ minid=v["Minid"],
258
+ snapshot=v["Snapshot"],
259
+ dataset_rid=dataset_rid,
260
+ version_rid=v["RID"],
261
+ description=v["Description"],
262
+ execution_rid=v["Execution"],
263
+ )
264
+ for v in version_path.filter(version_path.Dataset == dataset_rid).entities().fetch()
265
+ ]
266
+
267
+ @validate_call
268
+ def dataset_version(self, dataset_rid: RID) -> DatasetVersion:
269
+ """Retrieve the current version of the specified dataset.
270
+
271
+ Given a rid, return the most recent version of the dataset. It is important to remember that this version
272
+ captures the state of the catalog at the time the version was created, not the current state of the catalog.
273
+ This means that it is possible that the values associated with an object in the catalog may be different
274
+ from the values of that object in the dataset.
275
+
276
+ Args:
277
+ dataset_rid: The RID of the dataset to retrieve the version for.
278
+
279
+ Returns:
280
+ The semantic version of the dataset as a DatasetVersion instance.
281
+ """
282
+ history = self.dataset_history(dataset_rid)
283
+ if not history:
284
+ return DatasetVersion(0, 1, 0)
285
+ else:
286
+ # Ensure we return a DatasetVersion, not a string
287
+ versions = [h.dataset_version for h in history]
288
+ return max(versions) if versions else DatasetVersion(0, 1, 0)
289
+
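+ # A minimal usage sketch (assuming `ml` is a connected DerivaML instance as in the module
+ # docstring, and "1-abc123" is an existing dataset RID; both values are placeholders):
+ #     >>> ml = DerivaML('deriva.example.org', 'my_catalog')
+ #     >>> str(ml.dataset_version("1-abc123"))   # e.g. '1.2.0'; '0.1.0' if no version history exists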
290
+ def _build_dataset_graph(self, dataset_rid: RID) -> Iterable[RID]:
291
+ ts: TopologicalSorter = TopologicalSorter()
292
+ self._build_dataset_graph_1(dataset_rid, ts, set())
293
+ return ts.static_order()
294
+
295
+ def _build_dataset_graph_1(self, dataset_rid: RID, ts: TopologicalSorter, visited) -> None:
296
+ """Use a topological sort to produce a bottom-up ordering of the nested-dataset graph."""
297
+ ts.add(dataset_rid)
298
+ if dataset_rid not in visited:
299
+ visited.add(dataset_rid)
300
+ children = self.list_dataset_children(dataset_rid=dataset_rid)
301
+ parents = self.list_dataset_parents(dataset_rid=dataset_rid)
302
+ for parent in parents:
303
+ # Convert string to RID type
304
+ self._build_dataset_graph_1(RID(parent), ts, visited)
305
+ for child in children:
306
+ self._build_dataset_graph_1(child, ts, visited)
307
+
308
+ @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
309
+ def increment_dataset_version(
310
+ self,
311
+ dataset_rid: RID,
312
+ component: VersionPart,
313
+ description: str | None = "",
314
+ execution_rid: RID | None = None,
315
+ ) -> DatasetVersion:
316
+ """Increments a dataset's version number.
317
+
318
+ Creates a new version of the dataset by incrementing the specified version component
319
+ (major, minor, or patch). The new version is recorded with an optional description
320
+ and execution reference.
321
+
322
+ Args:
323
+ dataset_rid: Resource Identifier of the dataset to version.
324
+ component: Which version component to increment ('major', 'minor', or 'patch').
325
+ description: Optional description of the changes in this version.
326
+ execution_rid: Optional execution RID to associate with this version.
327
+
328
+ Returns:
329
+ DatasetVersion: The new version number.
330
+
331
+ Raises:
332
+ DerivaMLException: If dataset_rid is invalid or version increment fails.
333
+
334
+ Example:
335
+ >>> new_version = ml.increment_dataset_version(
336
+ ... dataset_rid="1-abc123",
337
+ ... component="minor",
338
+ ... description="Added new samples"
339
+ ... )
340
+ >>> print(f"New version: {new_version}") # e.g., "1.2.0"
341
+ """
342
+
343
+ # Find all the datasets that are reachable from this dataset and determine their new version numbers.
344
+ related_datasets = list(self._build_dataset_graph(dataset_rid=dataset_rid))
345
+ version_update_list = [
346
+ DatasetSpec(
347
+ rid=ds_rid,
348
+ version=self.dataset_version(ds_rid).increment_version(component),
349
+ )
350
+ for ds_rid in related_datasets
351
+ ]
352
+ self._insert_dataset_versions(version_update_list, description=description, execution_rid=execution_rid)
353
+ return next((d.version for d in version_update_list if d.rid == dataset_rid))
354
+
355
+ @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
356
+ def create_dataset(
357
+ self,
358
+ dataset_types: str | list[str] | None = None,
359
+ description: str = "",
360
+ execution_rid: RID | None = None,
361
+ version: DatasetVersion | None = None,
362
+ ) -> RID:
363
+ """Creates a new dataset in the catalog.
364
+
365
+ Creates a dataset with specified types and description. The dataset can be associated
366
+ with an execution and initialized with a specific version.
367
+
368
+ Args:
369
+ dataset_types: One or more dataset type terms from Dataset_Type vocabulary.
370
+ description: Description of the dataset's purpose and contents.
371
+ execution_rid: Optional execution RID to associate with dataset creation.
372
+ version: Optional initial version number. Defaults to 0.1.0.
373
+
374
+ Returns:
375
+ RID: Resource Identifier of the newly created dataset.
376
+
377
+ Raises:
378
+ DerivaMLException: If dataset_types are invalid or creation fails.
379
+
380
+ Example:
381
+ >>> rid = ml.create_dataset(
382
+ ... dataset_types=["experiment", "raw_data"],
383
+ ... description="RNA sequencing experiment data",
384
+ ... version=DatasetVersion(1, 0, 0)
385
+ ... )
386
+ """
387
+
388
+ version = version or DatasetVersion(0, 1, 0)
389
+ dataset_types = dataset_types or []
390
+
391
+ type_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema].tables[MLVocab.dataset_type.value]
392
+ defined_types = list(type_path.entities().fetch())
393
+
394
+ def check_dataset_type(dtype: str) -> bool:
395
+ for term in defined_types:
396
+ if dtype == term["Name"] or (term["Synonyms"] and dtype in term["Synonyms"]):
397
+ return True
398
+ return False
399
+
400
+ # Create the entry for the new dataset_table and get its RID.
401
+ ds_types = [dataset_types] if isinstance(dataset_types, str) else dataset_types
402
+ pb = self._model.catalog.getPathBuilder()
403
+ for ds_type in ds_types:
404
+ if not check_dataset_type(ds_type):
405
+ raise DerivaMLException("Dataset type must be a vocabulary term.")
406
+ dataset_table_path = pb.schemas[self._dataset_table.schema.name].tables[self._dataset_table.name]
407
+ dataset_rid = dataset_table_path.insert(
408
+ [
409
+ {
410
+ "Description": description,
411
+ "Deleted": False,
412
+ }
413
+ ]
414
+ )[0]["RID"]
415
+
416
+ # Get the name of the association table between dataset_table and dataset_type.
417
+ associations = list(self._model.schemas[self._ml_schema].tables[MLVocab.dataset_type].find_associations())
418
+ atable = associations[0].name if associations else None
419
+ pb.schemas[self._ml_schema].tables[atable].insert(
420
+ [{MLVocab.dataset_type: ds_type, "Dataset": dataset_rid} for ds_type in ds_types]
421
+ )
422
+ if execution_rid is not None:
423
+ pb.schemas[self._ml_schema].Dataset_Execution.insert([{"Dataset": dataset_rid, "Execution": execution_rid}])
424
+ self._insert_dataset_versions(
425
+ [DatasetSpec(rid=dataset_rid, version=version)],
426
+ execution_rid=execution_rid,
427
+ description="Initial dataset creation.",
428
+ )
429
+ return dataset_rid
430
+
431
+ @validate_call
432
+ def delete_dataset(self, dataset_rid: RID, recurse: bool = False) -> None:
433
+ """Delete a dataset from the catalog.
434
+
435
+ Args:
436
+ dataset_rid: RID of the dataset to delete.
437
+ recurse: If True, also delete any nested datasets. (Default value = False)
438
+ """
439
+ # Get association table entries for this dataset_table
440
+ # Delete association table entries
441
+ if not self._is_dataset_rid(dataset_rid):
442
+ raise DerivaMLException("Dataset_rid is not a dataset.")
443
+
444
+ if parents := self.list_dataset_parents(dataset_rid):
445
+ raise DerivaMLException(f'Dataset_rid "{dataset_rid}" is in a nested dataset: {parents}.')
446
+
447
+ pb = self._model.catalog.getPathBuilder()
448
+ dataset_path = pb.schemas[self._dataset_table.schema.name].tables[self._dataset_table.name]
449
+
450
+ rid_list = [dataset_rid] + (self.list_dataset_children(dataset_rid=dataset_rid) if recurse else [])
451
+ dataset_path.update([{"RID": r, "Deleted": True} for r in rid_list])
452
+
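+ # Usage sketch (hypothetical RID; assumes `ml` is a connected DerivaML instance). Note that
+ # this is a soft delete: the Dataset record is marked Deleted=True rather than removed.
+ #     >>> ml.delete_dataset("1-abc123", recurse=True)
+ #     >>> ml.find_datasets(deleted=True)   # deleted datasets remain visible when deleted=True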
453
+ def find_datasets(self, deleted: bool = False) -> Iterable[dict[str, Any]]:
454
+ """Returns a list of currently available datasets.
455
+
456
+ Arguments:
457
+ deleted: If True, include datasets that have been deleted.
458
+
459
+ Returns:
460
+ list of currently available datasets.
461
+ """
462
+ # Get datapath to all the tables we will need: Dataset, DatasetType and the association table.
463
+ pb = self._model.catalog.getPathBuilder()
464
+ dataset_path = pb.schemas[self._dataset_table.schema.name].tables[self._dataset_table.name]
465
+ associations = list(self._model.schemas[self._ml_schema].tables[MLVocab.dataset_type].find_associations())
466
+ atable = associations[0].name if associations else None
467
+ ml_path = pb.schemas[self._ml_schema]
468
+ atable_path = ml_path.tables[atable]
469
+
470
+ if deleted:
471
+ filtered_path = dataset_path
472
+ else:
473
+ filtered_path = dataset_path.filter(
474
+ (dataset_path.Deleted == False) | (dataset_path.Deleted == None) # noqa: E711, E712
475
+ )
476
+
477
+ # Get a list of all the dataset_type values associated with this dataset_table.
478
+ datasets = []
479
+ for dataset in filtered_path.entities().fetch():
480
+ ds_types = (
481
+ atable_path.filter(atable_path.Dataset == dataset["RID"]).attributes(atable_path.Dataset_Type).fetch()
482
+ )
483
+ datasets.append(dataset | {MLVocab.dataset_type: [ds[MLVocab.dataset_type] for ds in ds_types]})
484
+ return datasets
485
+
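+ # Usage sketch (assumes `ml` is a connected DerivaML instance); each entry carries the Dataset
+ # columns plus the list of its Dataset_Type terms:
+ #     >>> for ds in ml.find_datasets():
+ #     ...     print(ds["RID"], ds["Description"], ds[MLVocab.dataset_type])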
486
+ def list_dataset_element_types(self) -> Iterable[Table]:
487
+ """List the types of entities that can be added to a dataset.
488
+
489
+ Returns:
490
+ An iterable of Table objects that can be included as elements of a dataset.
491
+ """
492
+
493
+ def domain_table(table: Table) -> bool:
494
+ return table.schema.name == self._model.domain_schema or table.name == self._dataset_table.name
495
+
496
+ return [t for a in self._dataset_table.find_associations() if domain_table(t := a.other_fkeys.pop().pk_table)]
497
+
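+ # Usage sketch (assumes `ml` is a connected DerivaML instance); the table names shown in the
+ # result are hypothetical domain tables:
+ #     >>> [t.name for t in ml.list_dataset_element_types()]
+ #     ['Dataset', 'Image', 'Subject']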
498
+ @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
499
+ def add_dataset_element_type(self, element: str | Table) -> Table:
500
+ """A dataset is a heterogeneous collection of objects, each of which comes from a different table. This
501
+ routine makes it possible to add objects from the specified table to a dataset.
502
+
503
+ Args:
504
+ element: Name of the table (or the Table object) whose rows may be added to datasets.
505
+
506
+ Returns:
507
+ The association table that links the Dataset table to the new element table.
508
+ """
509
+ # Add table to map
510
+ element_table = self._model.name_to_table(element)
511
+ atable_def = Table.define_association([self._dataset_table, element_table])
512
+ try:
513
+ table = self._model.schemas[self._model.domain_schema].create_table(atable_def)
514
+ except ValueError as e:
515
+ if "already exists" in str(e):
516
+ table = self._model.name_to_table(atable_def["table_name"])
517
+ else:
518
+ raise e
519
+
520
+ # self.model = self.catalog.getCatalogModel()
521
+ self._dataset_table.annotations.update(self._generate_dataset_download_annotations())
522
+ self._model.model.apply()
523
+ return table
524
+
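+ # Usage sketch (assumes `ml` is a connected DerivaML instance and "Image" is an existing table in
+ # the domain schema; both are illustrative). This creates the Dataset<->Image association table if
+ # it does not already exist, after which Image rows can be added as dataset members:
+ #     >>> ml.add_dataset_element_type("Image")
+ #     >>> ml.add_dataset_members(dataset_rid=dataset_rid, members=image_rids)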
525
+ # @validate_call
526
+ def list_dataset_members(
527
+ self, dataset_rid: RID, recurse: bool = False, limit: int | None = None
528
+ ) -> dict[str, list[dict[str, Any]]]:
529
+ """Lists members of a dataset.
530
+
531
+ Returns a dictionary mapping member types to lists of member records. Can optionally
532
+ recurse through nested datasets and limit the number of results.
533
+
534
+ Args:
535
+ dataset_rid: Resource Identifier of the dataset.
536
+ recurse: Whether to include members of nested datasets. Defaults to False.
537
+ limit: Maximum number of members to return per type. None for no limit.
538
+
539
+ Returns:
540
+ dict[str, list[dict[str, Any]]]: Dictionary mapping member types to lists of members.
541
+ Each member is a dictionary containing the record's attributes.
542
+
543
+ Raises:
544
+ DerivaMLException: If dataset_rid is invalid.
545
+
546
+ Example:
547
+ >>> members = ml.list_dataset_members("1-abc123", recurse=True)
548
+ >>> for type_name, records in members.items():
549
+ ... print(f"{type_name}: {len(records)} records")
550
+ """
551
+
552
+ if not self._is_dataset_rid(dataset_rid):
553
+ raise DerivaMLException(f"RID is not for a dataset_table: {dataset_rid}")
554
+
555
+ # Look at each of the element types that might be in the dataset_table and get the list of rid for them from
556
+ # the appropriate association table.
557
+ members = defaultdict(list)
558
+ pb = self._model.catalog.getPathBuilder()
559
+ for assoc_table in self._dataset_table.find_associations():
560
+ other_fkey = assoc_table.other_fkeys.pop()
561
+ target_table = other_fkey.pk_table
562
+ member_table = assoc_table.table
563
+
564
+ # Look at domain tables and nested datasets.
565
+ if target_table.schema.name != self._model.domain_schema and not (
566
+ target_table == self._dataset_table or target_table.name == "File"
567
+ ):
568
+ continue
569
+ member_column = (
570
+ "Nested_Dataset" if target_table == self._dataset_table else other_fkey.foreign_key_columns[0].name
571
+ )
572
+
573
+ target_path = pb.schemas[target_table.schema.name].tables[target_table.name]
574
+ member_path = pb.schemas[member_table.schema.name].tables[member_table.name]
575
+
576
+ path = member_path.filter(member_path.Dataset == dataset_rid).link(
577
+ target_path,
578
+ on=(member_path.columns[member_column] == target_path.columns["RID"]),
579
+ )
580
+ target_entities = list(path.entities().fetch(limit=limit) if limit else path.entities().fetch())
581
+ members[target_table.name].extend(target_entities)
582
+ if recurse and target_table == self._dataset_table:
583
+ # Get the members for all the nested datasets and add to the member list.
584
+ nested_datasets = [d["RID"] for d in target_entities]
585
+ for ds in nested_datasets:
586
+ for k, v in self.list_dataset_members(ds, recurse=recurse).items():
587
+ members[k].extend(v)
588
+ return dict(members)
589
+
590
+ @validate_call
591
+ def add_dataset_members(
592
+ self,
593
+ dataset_rid: RID,
594
+ members: list[RID] | dict[str, list[RID]],
595
+ validate: bool = True,
596
+ description: str | None = "",
597
+ execution_rid: RID | None = None,
598
+ ) -> None:
599
+ """Adds members to a dataset.
600
+
601
+ Associates one or more records with a dataset. Can optionally validate member types
602
+ and create a new dataset version to track the changes.
603
+
604
+ Args:
605
+ dataset_rid: Resource Identifier of the dataset.
606
+ members: List of RIDs to add as dataset members. Can also be organized into a dictionary that maps
607
+ table names to the member RIDs that belong to each table.
608
+ validate: Whether to validate member types. Defaults to True.
609
+ description: Optional description of the member additions.
610
+ execution_rid: Optional execution RID to associate with changes.
611
+
612
+ Raises:
613
+ DerivaMLException: If:
614
+ - dataset_rid is invalid
615
+ - members are invalid or of wrong type
616
+ - adding members would create a cycle
617
+ - validation fails
618
+
619
+ Example:
620
+ >>> ml.add_dataset_members(
621
+ ... dataset_rid="1-abc123",
622
+ ... members=["1-def456", "1-ghi789"],
623
+ ... description="Added sample data"
624
+ ... )
625
+ """
626
+ description = description or "Updated dataset via add_dataset_members"
627
+
628
+ def check_dataset_cycle(member_rid, path=None):
629
+ """Check whether adding member_rid to the dataset would create a cycle of nested datasets.
630
+
631
+ Args:
632
+ member_rid: RID of the candidate dataset member.
633
+ path: Set of dataset RIDs already on the nesting path. (Default value = None)
634
+
635
+ Returns:
636
+ True if member_rid is already on the path (i.e., a cycle would be created), False otherwise.
637
+ """
638
+ path = path or {dataset_rid}
639
+ return member_rid in path
640
+
641
+ if validate:
642
+ existing_rids = set(m["RID"] for ms in self.list_dataset_members(dataset_rid).values() for m in ms)
643
+ if overlap := set(existing_rids).intersection(members):
644
+ raise DerivaMLException(f"Attempting to add existing member to dataset_table {dataset_rid}: {overlap}")
645
+
646
+ # Now go through every rid to be added to the data set and sort them based on what association table entries
647
+ # need to be made.
648
+ dataset_elements = {}
649
+ association_map = {
650
+ a.other_fkeys.pop().pk_table.name: a.table.name for a in self._dataset_table.find_associations()
651
+ }
652
+
653
+ # Get a list of all the object types that can be linked to a dataset_table.
654
+ if type(members) is list:
655
+ members = set(members)
656
+ for m in members:
657
+ try:
658
+ rid_info = self._model.catalog.resolve_rid(m)
659
+ except KeyError:
660
+ raise DerivaMLException(f"Invalid RID: {m}")
661
+ if rid_info.table.name not in association_map:
662
+ raise DerivaMLException(f"RID table: {rid_info.table.name} not part of dataset_table")
663
+ if rid_info.table == self._dataset_table and check_dataset_cycle(rid_info.rid):
664
+ raise DerivaMLException("Creating cycle of datasets is not allowed")
665
+ dataset_elements.setdefault(rid_info.table.name, []).append(rid_info.rid)
666
+ else:
667
+ dataset_elements = {t: set(ms) for t, ms in members.items()}
668
+ # Now make the entries into the association tables.
669
+ pb = self._model.catalog.getPathBuilder()
670
+ for table, elements in dataset_elements.items():
671
+ schema_path = pb.schemas[
672
+ self._ml_schema if (table == "Dataset" or table == "File") else self._model.domain_schema
673
+ ]
674
+ fk_column = "Nested_Dataset" if table == "Dataset" else table
675
+ if len(elements):
676
+ # Find out the name of the column in the association table.
677
+ schema_path.tables[association_map[table]].insert(
678
+ [{"Dataset": dataset_rid, fk_column: e} for e in elements]
679
+ )
680
+ self.increment_dataset_version(
681
+ dataset_rid,
682
+ VersionPart.minor,
683
+ description=description,
684
+ execution_rid=execution_rid,
685
+ )
686
+
687
+ @validate_call
688
+ def delete_dataset_members(
689
+ self,
690
+ dataset_rid: RID,
691
+ members: list[RID],
692
+ description: str = "",
693
+ execution_rid: RID | None = None,
694
+ ) -> None:
695
+ """Remove elements from an existing dataset.
696
+
697
+ Delete elements from an existing dataset. In addition to deleting members, the minor version number of the
698
+ dataset is incremented and the description, if provided, is applied to that new version.
699
+
700
+ Args:
701
+ dataset_rid: RID of the dataset from which members are to be removed.
702
+ members: List of member RIDs to remove from the dataset.
703
+ description: Markdown description of the updated dataset.
704
+ execution_rid: Optional RID of execution associated with this operation.
705
+ """
706
+
707
+ members = set(members)
708
+ description = description or "Deletes dataset members"
709
+
710
+ # Now go through every rid to be added to the data set and sort them based on what association table entries
711
+ # need to be made.
712
+ dataset_elements = {}
713
+ association_map = {
714
+ a.other_fkeys.pop().pk_table.name: a.table.name for a in self._dataset_table.find_associations()
715
+ }
716
+ # Get a list of all the object types that can be linked to a dataset_table.
717
+ for m in members:
718
+ try:
719
+ rid_info = self._model.catalog.resolve_rid(m)
720
+ except KeyError:
721
+ raise DerivaMLException(f"Invalid RID: {m}")
722
+ if rid_info.table.name not in association_map:
723
+ raise DerivaMLException(f"RID table: {rid_info.table.name} not part of dataset_table")
724
+ dataset_elements.setdefault(rid_info.table.name, []).append(rid_info.rid)
725
+ # Now make the entries into the association tables.
726
+ pb = self._model.catalog.getPathBuilder()
727
+ for table, elements in dataset_elements.items():
728
+ schema_path = pb.schemas[self._ml_schema if table == "Dataset" else self._model.domain_schema]
729
+ fk_column = "Nested_Dataset" if table == "Dataset" else table
730
+
731
+ if len(elements):
732
+ atable_path = schema_path.tables[association_map[table]]
733
+ # Find out the name of the column in the association table.
734
+ for e in elements:
735
+ entity = atable_path.filter(
736
+ (atable_path.Dataset == dataset_rid) & (atable_path.columns[fk_column] == e),
737
+ )
738
+ entity.delete()
739
+ self.increment_dataset_version(
740
+ dataset_rid,
741
+ VersionPart.minor,
742
+ description=description,
743
+ execution_rid=execution_rid,
744
+ )
745
+
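+ # Usage sketch (hypothetical RIDs; assumes `ml` is a connected DerivaML instance). Removing
+ # members also bumps the minor version, so the previous membership remains reachable by version:
+ #     >>> ml.delete_dataset_members(
+ #     ...     dataset_rid="1-abc123",
+ #     ...     members=["1-def456"],
+ #     ...     description="Removed a mislabeled sample",
+ #     ... )
+ #     >>> str(ml.dataset_version("1-abc123"))   # minor component has been incremented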
746
+ @validate_call
747
+ def list_dataset_parents(self, dataset_rid: RID) -> list[str]:
748
+ """Given a dataset RID, return a list of RIDs of the parent datasets if this dataset is included in a
749
+ nested dataset.
750
+
751
+ Args:
752
+ dataset_rid: RID of the dataset to look up.
753
+
754
+ Returns:
755
+ A list of RIDs of the parent datasets.
756
+ """
757
+ if not self._is_dataset_rid(dataset_rid):
758
+ raise DerivaMLException(f"RID: {dataset_rid} does not belong to dataset_table {self._dataset_table.name}")
759
+ # Get association table for nested datasets
760
+ pb = self._model.catalog.getPathBuilder()
761
+ atable_path = pb.schemas[self._ml_schema].Dataset_Dataset
762
+ return [p["Dataset"] for p in atable_path.filter(atable_path.Nested_Dataset == dataset_rid).entities().fetch()]
763
+
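+ # Usage sketch (hypothetical RIDs; assumes `ml` is a connected DerivaML instance):
+ #     >>> ml.list_dataset_parents("1-def456")
+ #     ['1-abc123']   # datasets that contain "1-def456" as a nested dataset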
764
+ @validate_call
765
+ def list_dataset_children(self, dataset_rid: RID, recurse: bool = False) -> list[RID]:
766
+ """Given a dataset RID, return a list of RIDs for any nested datasets.
767
+
768
+ Args:
769
+ dataset_rid: A dataset RID.
770
+ recurse: If True, recursively include the children of nested datasets as well.
771
+
772
+ Returns:
773
+ list of nested dataset RIDs.
774
+
775
+ """
776
+ dataset_dataset_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema].tables["Dataset_Dataset"]
777
+ nested_datasets = list(dataset_dataset_path.entities().fetch())
778
+
779
+ def find_children(rid: RID):
780
+ children = [child["Nested_Dataset"] for child in nested_datasets if child["Dataset"] == rid]
781
+ if recurse:
782
+ for child in children.copy():
783
+ children.extend(find_children(child))
784
+ return children
785
+
786
+ return find_children(dataset_rid)
787
+
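+ # Usage sketch (hypothetical RID; assumes `ml` is a connected DerivaML instance):
+ #     >>> ml.list_dataset_children("1-abc123")                  # direct children only
+ #     >>> ml.list_dataset_children("1-abc123", recurse=True)    # children, grandchildren, ...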
788
+ def _export_vocabulary(self, writer: Callable[[str, str, Table], list[dict[str, Any]]]) -> list[dict[str, Any]]:
789
+ """
790
+
791
+ Args:
792
+ writer: Callable[[str, str, Table], list[dict[str, Any]]] that emits the specification entries for one path.
793
+
794
+ Returns:
795
+
796
+ """
797
+ vocabs = [
798
+ table
799
+ for s in self._model.schemas.values()
800
+ for table in s.tables.values()
801
+ if self._model.is_vocabulary(table)
802
+ ]
803
+ return [o for table in vocabs for o in writer(f"{table.schema.name}:{table.name}", table.name, table)]
804
+
805
+ def _table_paths(
806
+ self,
807
+ dataset: DatasetSpec | None = None,
808
+ snapshot_catalog: DerivaML | None = None,
809
+ ) -> Iterator[tuple[str, str, Table]]:
810
+ paths = self._collect_paths(dataset and dataset.rid, snapshot_catalog)
811
+
812
+ def source_path(path: tuple[Table, ...]) -> list[str]:
813
+ """Convert a tuple representing a path into a source path component with FK linkage"""
814
+ path = list(path)
815
+ p = [f"{self._model.ml_schema}:Dataset/RID={{RID}}"]
816
+ for table in path[1:]:
817
+ if table.name == "Dataset_Dataset":
818
+ p.append("(RID)=(deriva-ml:Dataset_Dataset:Dataset)")
819
+ elif table.name == "Dataset":
820
+ p.append("(Nested_Dataset)=(deriva-ml:Dataset:RID)")
821
+ elif table.name == "Dataset_Version":
822
+ p.append(f"(RID)=({self._model.ml_schema}:Dataset_Version:Dataset)")
823
+ else:
824
+ p.append(f"{table.schema.name}:{table.name}")
825
+ return p
826
+
827
+ src_paths = ["/".join(source_path(p)) for p in paths]
828
+ dest_paths = ["/".join([t.name for t in p]) for p in paths]
829
+ target_tables = [p[-1] for p in paths]
830
+ return zip(src_paths, dest_paths, target_tables)
831
+
832
+ def _collect_paths(
833
+ self,
834
+ dataset_rid: RID | None = None,
835
+ snapshot: Dataset | None = None,
836
+ dataset_nesting_depth: int | None = None,
837
+ ) -> set[tuple[Table, ...]]:
838
+ snapshot_catalog = snapshot if snapshot else self
839
+
840
+ dataset_table = snapshot_catalog._model.schemas[self._ml_schema].tables["Dataset"]
841
+ dataset_dataset = snapshot_catalog._model.schemas[self._ml_schema].tables["Dataset_Dataset"]
842
+
843
+ # Figure out what types of elements the dataset contains.
844
+ dataset_associations = [
845
+ a
846
+ for a in self._dataset_table.find_associations()
847
+ if a.table.schema.name != self._ml_schema or a.table.name == "Dataset_Dataset"
848
+ ]
849
+ if dataset_rid:
850
+ # Get a list of the members of the dataset so we can figure out which tables to query.
851
+ dataset_elements = [
852
+ snapshot_catalog._model.name_to_table(e)
853
+ for e, m in snapshot_catalog.list_dataset_members(
854
+ dataset_rid=dataset_rid, # limit=1 Limit seems to make things run slow.
855
+ ).items()
856
+ if m
857
+ ]
858
+ included_associations = [
859
+ a.table for a in dataset_table.find_associations() if a.other_fkeys.pop().pk_table in dataset_elements
860
+ ]
861
+ else:
862
+ included_associations = dataset_associations
863
+
864
+ # Get the paths through the schema and filter out all the dataset paths not used by this dataset.
865
+ paths = {
866
+ tuple(p)
867
+ for p in snapshot_catalog._model._schema_to_paths()
868
+ if (len(p) == 1)
869
+ or (p[1] not in dataset_associations) # Tables in the domain schema
870
+ or (p[1] in included_associations) # Tables that include members of the dataset
871
+ }
872
+ # Now get paths for nested datasets
873
+ nested_paths = set()
874
+ if dataset_rid:
875
+ for c in snapshot_catalog.list_dataset_children(dataset_rid=dataset_rid):
876
+ nested_paths |= self._collect_paths(c, snapshot=snapshot_catalog)
877
+ else:
878
+ # Initialize nesting depth if not already provided.
879
+ dataset_nesting_depth = (
880
+ self._dataset_nesting_depth() if dataset_nesting_depth is None else dataset_nesting_depth
881
+ )
882
+ if dataset_nesting_depth:
883
+ nested_paths = self._collect_paths(dataset_nesting_depth=dataset_nesting_depth - 1)
884
+ if nested_paths:
885
+ paths |= {
886
+ tuple([dataset_table]),
887
+ (dataset_table, dataset_dataset),
888
+ }
889
+ paths |= {(self._dataset_table, dataset_dataset) + p for p in nested_paths}
890
+ return paths
891
+
892
+ def _dataset_nesting_depth(self, dataset_rid: RID | None = None) -> int:
893
+ """Determine the maximum dataset nesting depth in the current catalog.
894
+
895
+ Returns:
896
+
897
+ """
898
+
899
+ def children_depth(dataset_rid: RID, nested_datasets: dict[str, list[str]]) -> int:
900
+ """Return the nesting depth of dataset_rid within the nested_datasets graph (0 if it has no entry)."""
901
+ try:
902
+ children = nested_datasets[dataset_rid]
903
+ return max(map(lambda x: children_depth(x, nested_datasets), children)) + 1 if children else 1
904
+ except KeyError:
905
+ return 0
906
+
907
+ # Build up the dataset_table nesting graph...
908
+ pb = self._model.catalog.getPathBuilder().schemas[self._ml_schema].tables["Dataset_Dataset"]
909
+ dataset_children = (
910
+ [
911
+ {
912
+ "Dataset": dataset_rid,
913
+ "Nested_Dataset": c,
914
+ } # Make uniform with return from datapath
915
+ for c in self.list_dataset_children(dataset_rid=dataset_rid)
916
+ ]
917
+ if dataset_rid
918
+ else pb.entities().fetch()
919
+ )
920
+ nested_dataset = defaultdict(list)
921
+ for ds in dataset_children:
922
+ nested_dataset[ds["Dataset"]].append(ds["Nested_Dataset"])
923
+ return max(map(lambda d: children_depth(d, dict(nested_dataset)), nested_dataset)) if nested_dataset else 0
924
+
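+ # Worked toy example of the depth computation above (purely illustrative values):
+ # with nested = {"A": ["B", "C"], "B": ["D"]}, children_depth("B", nested) == 1,
+ # children_depth("A", nested) == 2, and RIDs with no entry (e.g. "C" or "D") return 0,
+ # so the overall nesting depth reported for this graph would be 2.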
925
+ def _dataset_specification(
926
+ self,
927
+ writer: Callable[[str, str, Table], list[dict[str, Any]]],
928
+ dataset: DatasetSpec | None = None,
929
+ snapshot_catalog: DerivaML | None = None,
930
+ ) -> list[dict[str, Any]]:
931
+ """Output a download/export specification for a dataset. Each element of the dataset
932
+ will be placed in its own directory.
933
+ The top level data directory of the resulting BDBag will have one subdirectory for each element type.
934
+ The subdirectory will contain the CSV indicating which elements of that type are present in the
935
+ dataset_table, and then there will be a subdirectory for each object that is reachable from the
936
+ dataset_table members.
937
+
938
+ To simplify reconstructing the relationship between tables, the CSV for each element is included.
939
+ The top level data directory will also contain a subdirectory for any controlled vocabularies used in
940
+ the dataset_table. All assets will be placed into a directory named asset in a subdirectory with the
941
+ asset table name.
942
+
943
+ For example, consider a dataset_table that consists of two element types, T1 and T2. T1 has foreign
944
+ key relationships to objects in tables T3 and T4. There are also two controlled vocabularies, CV1 and
945
+ CV2. T2 is an asset table which has two assets in it. The layout of the resulting bdbag would be:
946
+ data
947
+     CV1/
948
+         cv1.csv
949
+     CV2/
950
+         cv2.csv
951
+     Dataset/
952
+         T1/
953
+             t1.csv
954
+             T3/
955
+                 t3.csv
956
+             T4/
957
+                 t4.csv
958
+         T2/
959
+             t2.csv
960
+     asset/
961
+         T2
962
+             f1
963
+             f2
964
+
965
+ Args:
966
+ writer: Callable[[str, str, Table], list[dict[str, Any]]] that emits the specification entries for one path.
967
+
968
+ Returns:
969
+ A dataset_table specification.
970
+ """
971
+ element_spec = self._export_vocabulary(writer)
972
+ for path in self._table_paths(dataset=dataset, snapshot_catalog=snapshot_catalog):
973
+ element_spec.extend(writer(*path))
974
+ return element_spec
975
+
976
+ def _download_dataset_bag(
977
+ self,
978
+ dataset: DatasetSpec,
979
+ execution_rid: RID | None = None,
980
+ snapshot_catalog: DerivaML | None = None,
981
+ ) -> DatasetBag:
982
+ """Download a dataset onto the local file system. Create a MINID for the dataset if one doesn't already exist.
983
+
984
+ Args:
985
+ dataset: Specification of the dataset to be downloaded.
986
+ execution_rid: Execution RID for the dataset.
987
+ snapshot_catalog: Snapshot catalog for the dataset version if specified.
988
+
989
+ Returns:
990
+ A DatasetBag object for the downloaded (and, if requested, materialized) dataset.
991
+
992
+ """
993
+ if (
994
+ execution_rid
995
+ and execution_rid != DRY_RUN_RID
996
+ and self._model.catalog.resolve_rid(execution_rid).table.name != "Execution"
997
+ ):
998
+ raise DerivaMLException(f"RID {execution_rid} is not an execution")
999
+ minid = self._get_dataset_minid(dataset, snapshot_catalog=snapshot_catalog)
1000
+
1001
+ bag_path = (
1002
+ self._materialize_dataset_bag(minid, execution_rid=execution_rid)
1003
+ if dataset.materialize
1004
+ else self._download_dataset_minid(minid)
1005
+ )
1006
+ return DatabaseModel(minid, bag_path, self._working_dir).get_dataset()
1007
+
1008
+ def _version_snapshot(self, dataset: DatasetSpec) -> str:
1009
+ """Return a catalog with snapshot for the specified dataset version"""
1010
+ try:
1011
+ version_record = next(
1012
+ h for h in self.dataset_history(dataset_rid=dataset.rid) if h.dataset_version == dataset.version
1013
+ )
1014
+ except StopIteration:
1015
+ raise DerivaMLException(f"Dataset version {dataset.version} not found for dataset {dataset.rid}")
1016
+ return f"{self._model.catalog.catalog_id}@{version_record.snapshot}"
1017
+
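+ # The returned value is a "<catalog_id>@<snapshot>" string (concrete snapshot ids vary by catalog),
+ # which _generate_dataset_download_spec below passes as the catalog_id so that the exported bag
+ # reflects the catalog state recorded for that dataset version.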
1018
+ def _create_dataset_minid(self, dataset: DatasetSpec, snapshot_catalog: DerivaML | None = None) -> str:
1019
+ with TemporaryDirectory() as tmp_dir:
1020
+ # Generate a download specification file for the current catalog schema. By default, this spec
1021
+ # will generate a minid and place the bag into S3 storage.
1022
+ spec_file = Path(tmp_dir) / "download_spec.json"
1023
+ with spec_file.open("w", encoding="utf-8") as ds:
1024
+ json.dump(self._generate_dataset_download_spec(dataset, snapshot_catalog), ds)
1025
+ try:
1026
+ self._logger.info(
1027
+ "Downloading dataset %s for catalog: %s@%s"
1028
+ % (
1029
+ "minid" if self._use_minid else "bag",
1030
+ dataset.rid,
1031
+ str(dataset.version),
1032
+ )
1033
+ )
1034
+ # Generate the bag and put into S3 storage.
1035
+ exporter = DerivaExport(
1036
+ host=self._model.catalog.deriva_server.server,
1037
+ config_file=spec_file,
1038
+ output_dir=tmp_dir,
1039
+ defer_download=True,
1040
+ timeout=(10, 610),
1041
+ envars={"RID": dataset.rid},
1042
+ )
1043
+ minid_page_url = exporter.export()[0] # Get the MINID launch page
1044
+ except (
1045
+ DerivaDownloadError,
1046
+ DerivaDownloadConfigurationError,
1047
+ DerivaDownloadAuthenticationError,
1048
+ DerivaDownloadAuthorizationError,
1049
+ DerivaDownloadTimeoutError,
1050
+ ) as e:
1051
+ raise DerivaMLException(format_exception(e))
1052
+ # Update version table with MINID.
1053
+ if self._use_minid:
1054
+ version_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema].tables["Dataset_Version"]
1055
+ version_rid = [
1056
+ h for h in self.dataset_history(dataset_rid=dataset.rid) if h.dataset_version == dataset.version
1057
+ ][0].version_rid
1058
+ version_path.update([{"RID": version_rid, "Minid": minid_page_url}])
1059
+ return minid_page_url
1060
+
1061
+ def _get_dataset_minid(
1062
+ self,
1063
+ dataset: DatasetSpec,
1064
+ snapshot_catalog: DerivaML | None = None,
1065
+ create: bool = True,
1066
+ ) -> DatasetMinid | None:
1067
+ """Return a MINID for the specified dataset. If no version is specified, use the latest.
1068
+
1069
+ Args:
1070
+ dataset: Specification of the dataset.
1071
+ snapshot_catalog: Snapshot catalog for the dataset version if specified.
1072
+ create: Create a new MINID if one doesn't already exist.
1073
+
1074
+ Returns:
1075
+ New or existing MINID for the dataset.
1076
+ """
1077
+ rid = dataset.rid
1078
+
1079
+ # Case 1: RID is already a MINID or direct URL
1080
+ if rid.startswith("minid"):
1081
+ return self._fetch_minid_metadata(f"https://identifiers.org/{rid}", dataset.version)
1082
+ if rid.startswith("http"):
1083
+ return self._fetch_minid_metadata(rid, dataset.version)
1084
+
1085
+ # Case 2: RID is a dataset RID – validate existence
1086
+ if not any(rid == ds["RID"] for ds in self.find_datasets()):
1087
+ raise DerivaMLTableTypeError("Dataset", rid)
1088
+
1089
+ # Find dataset version record
1090
+ version_str = str(dataset.version)
1091
+ history = self.dataset_history(rid)
1092
+ try:
1093
+ version_record = next(v for v in history if v.dataset_version == version_str)
1094
+ except StopIteration:
1095
+ raise DerivaMLException(f"Version {version_str} does not exist for RID {rid}")
1096
+
1097
+ # Check or create MINID
1098
+ minid_url = version_record.minid
1099
+ # If we either don't have a MINID, or we have a MINID, but we don't want to use it, generate a new one.
1100
+ if (not minid_url) or (not self._use_minid):
1101
+ if not create:
1102
+ raise DerivaMLException(f"Minid for dataset {rid} doesn't exist")
1103
+ if self._use_minid:
1104
+ self._logger.info("Creating new MINID for dataset %s", rid)
1105
+ minid_url = self._create_dataset_minid(dataset, snapshot_catalog)
1106
+
1107
+ # Return based on MINID usage
1108
+ if self._use_minid:
1109
+ return self._fetch_minid_metadata(minid_url, dataset.version)
1110
+ return DatasetMinid(
1111
+ dataset_version=dataset.version,
1112
+ RID=f"{rid}@{version_record.snapshot}",
1113
+ location=minid_url,
1114
+ )
1115
+
1116
+ def _fetch_minid_metadata(self, url: str, version: DatasetVersion) -> DatasetMinid:
1117
+ r = requests.get(url, headers={"accept": "application/json"})
1118
+ r.raise_for_status()
1119
+ return DatasetMinid(dataset_version=version, **r.json())
1120
+
1121
+ def _download_dataset_minid(self, minid: DatasetMinid) -> Path:
1122
+ """Given a RID to a dataset_table, or a MINID to an existing bag, download the bag file, extract it, and
1123
+ validate that all the metadata is correct
1124
+
1125
+ Args:
1126
+ minid: The RID of a dataset_table or a minid to an existing bag.
1127
+ Returns:
1128
+ The local path of the unpacked and validated dataset bag.
1129
+ """
1130
+
1131
+ # Check to see if we have an existing idempotent materialization of the desired bag. If so, then reuse
1132
+ # it. If not, then we need to extract the contents of the archive into our cache directory.
1133
+ bag_dir = self._cache_dir / f"{minid.dataset_rid}_{minid.checksum}"
1134
+ if bag_dir.exists():
1135
+ self._logger.info(f"Using cached bag for {minid.dataset_rid} Version:{minid.dataset_version}")
1136
+ return Path(bag_dir / f"Dataset_{minid.dataset_rid}")
1137
+
1138
+ # Either bag hasn't been downloaded yet, or we are not using a Minid, so we don't know the checksum yet.
1139
+ with TemporaryDirectory() as tmp_dir:
1140
+ if self._use_minid:
1141
+ # Get bag from S3
1142
+ bag_path = Path(tmp_dir) / Path(urlparse(minid.bag_url).path).name
1143
+ archive_path = fetch_single_file(minid.bag_url, output_path=bag_path)
1144
+ else:
1145
+ exporter = DerivaExport(host=self._model.catalog.deriva_server.server, output_dir=tmp_dir)
1146
+ archive_path = exporter.retrieve_file(minid.bag_url)
1147
+ hashes = hash_utils.compute_file_hashes(archive_path, hashes=["md5", "sha256"])
1148
+ checksum = hashes["sha256"][0]
1149
+ bag_dir = self._cache_dir / f"{minid.dataset_rid}_{checksum}"
1150
+ if bag_dir.exists():
1151
+ self._logger.info(f"Using cached bag for {minid.dataset_rid} Version:{minid.dataset_version}")
1152
+ return Path(bag_dir / f"Dataset_{minid.dataset_rid}")
1153
+ bag_path = bdb.extract_bag(archive_path, bag_dir.as_posix())
1154
+ bdb.validate_bag_structure(bag_path)
1155
+ return Path(bag_path)
1156
+
1157
+ def _materialize_dataset_bag(
1158
+ self,
1159
+ minid: DatasetMinid,
1160
+ execution_rid: RID | None = None,
1161
+ ) -> Path:
1162
+ """Materialize a dataset_table bag into a local directory
1163
+
1164
+ Args:
1165
+ minid: A MINID to an existing bag or a RID of the dataset_table that should be downloaded.
1166
+
1167
+ Returns:
1168
+ The local path of the materialized bag.
1169
+ """
1170
+
1171
+ def update_status(status: Status, msg: str) -> None:
1172
+ """Update the current status for this execution in the catalog"""
1173
+ if execution_rid and execution_rid != DRY_RUN_RID:
1174
+ self._model.catalog.getPathBuilder().schemas[self._ml_schema].Execution.update(
1175
+ [
1176
+ {
1177
+ "RID": execution_rid,
1178
+ "Status": status.value,
1179
+ "Status_Detail": msg,
1180
+ }
1181
+ ]
1182
+ )
1183
+ self._logger.info(msg)
1184
+
1185
+ def fetch_progress_callback(current, total):
1186
+ msg = f"Materializing bag: {current} of {total} file(s) downloaded."
1187
+ if execution_rid:
1188
+ update_status(Status.running, msg)
1189
+ return True
1190
+
1191
+ def validation_progress_callback(current, total):
1192
+ msg = f"Validating bag: {current} of {total} file(s) validated."
1193
+ if execution_rid:
1194
+ update_status(Status.running, msg)
1195
+ return True
1196
+
1197
+ # request metadata
1198
+ bag_path = self._download_dataset_minid(minid)
1199
+ bag_dir = bag_path.parent
1200
+ validated_check = bag_dir / "validated_check.txt"
1201
+
1202
+ # If this bag has already been validated, our work is done. Otherwise, materialize the bag.
1203
+ if not validated_check.exists():
1204
+ self._logger.info(f"Materializing bag {minid.dataset_rid} Version:{minid.dataset_version}")
1205
+ bdb.materialize(
1206
+ bag_path.as_posix(),
1207
+ fetch_callback=fetch_progress_callback,
1208
+ validation_callback=validation_progress_callback,
1209
+ )
1210
+ validated_check.touch()
1211
+ return Path(bag_path)
1212
+
1213
+ def _export_annotation(
1214
+ self,
1215
+ snapshot_catalog: DerivaML | None = None,
1216
+ ) -> list[dict[str, Any]]:
1217
+ """Return an export specification for the datasets in the provided model.
1218
+
1219
+ Returns:
1220
+ An export specification suitable for Chaise.
1221
+ """
1222
+
1223
+ # Export specification is a specification for the datasets, plus any controlled vocabulary
1224
+ return [
1225
+ {
1226
+ "source": {"api": False, "skip_root_path": True},
1227
+ "destination": {"type": "env", "params": {"query_keys": ["snaptime"]}},
1228
+ },
1229
+ {
1230
+ "source": {"api": "entity"},
1231
+ "destination": {
1232
+ "type": "env",
1233
+ "params": {"query_keys": ["RID", "Description"]},
1234
+ },
1235
+ },
1236
+ {
1237
+ "source": {"api": "schema", "skip_root_path": True},
1238
+ "destination": {"type": "json", "name": "schema"},
1239
+ },
1240
+ ] + self._dataset_specification(
1241
+ self._export_annotation_dataset_element,
1242
+ None,
1243
+ snapshot_catalog=snapshot_catalog,
1244
+ )
1245
+
1246
+ def _export_specification(
1247
+ self, dataset: DatasetSpec, snapshot_catalog: DerivaML | None = None
1248
+ ) -> list[dict[str, Any]]:
1249
+ """
1250
+ Generate a specification for the export engine for a specific dataset.
1251
+
1252
+ Returns:
1253
+ a download specification for the datasets in the provided model.
1254
+
1255
+ """
1256
+
1257
+ # Download spec is the spec for any controlled vocabulary and for the dataset_table.
1258
+ return [
1259
+ {
1260
+ "processor": "json",
1261
+ "processor_params": {"query_path": "/schema", "output_path": "schema"},
1262
+ }
1263
+ ] + self._dataset_specification(self._export_specification_dataset_element, dataset, snapshot_catalog)
1264
+
1265
+ @staticmethod
1266
+ def _export_specification_dataset_element(spath: str, dpath: str, table: Table) -> list[dict[str, Any]]:
1267
+ """Return the download specification for the data object indicated by a path through the data model.
1268
+
1269
+ Args:
1270
+ spath: Source path
1271
+ dpath: Destination path
1272
+ table: Table referenced by the path
1273
+
1274
+ Returns:
1275
+ The download specification that will retrieve that data from the catalog and place it into a BDBag.
1276
+ """
1277
+ exports = [
1278
+ {
1279
+ "processor": "csv",
1280
+ "processor_params": {
1281
+ "query_path": f"/entity/{spath}",
1282
+ "output_path": dpath,
1283
+ },
1284
+ }
1285
+ ]
1286
+
1287
+ # If this table is an asset table, then we need to output the files associated with the asset.
1288
+ asset_columns = {"Filename", "URL", "Length", "MD5", "Description"}
1289
+ if asset_columns.issubset({c.name for c in table.columns}):
1290
+ exports.append(
1291
+ {
1292
+ "processor": "fetch",
1293
+ "processor_params": {
1294
+ "query_path": f"/attribute/{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5,asset_rid:=RID",
1295
+ "output_path": "asset/{asset_rid}/" + table.name,
1296
+ },
1297
+ }
1298
+ )
1299
+ return exports
1300
+
1301
+ def _export_annotation_dataset_element(self, spath: str, dpath: str, table: Table) -> list[dict[str, Any]]:
1302
+ """Given a path in the data model, output an export specification for the path taken to get to the
1303
+ current table.
1304
+
1305
+ Args:
1306
+ spath: Source path
1307
+ dpath: Destination path
1308
+ table: Table referenced by the path
1309
+
1310
+ Returns:
1311
+ The export specification that will retrieve that data from the catalog and place it into a BDBag.
1312
+ """
1313
+ # The table is the last element of the path. Generate the ERMRest query by converting the list of tables
1314
+ # into a path in the form of /S:T1/S:T2/S:Table
1315
+ # Generate the destination path in the file system using just the table names.
1316
+
1317
+ skip_root_path = False
1318
+ if spath.startswith(f"{self._ml_schema}:Dataset/"):
1319
+ # Chaise will add table name and RID filter, so strip it off.
1320
+ spath = "/".join(spath.split("/")[2:])
1321
+ if spath == "":
1322
+ # This path is to just the dataset table.
1323
+ return []
1324
+ else:
1325
+ # A vocabulary table, so we don't want the root_path.
1326
+ skip_root_path = True
1327
+ exports = [
1328
+ {
1329
+ "source": {
1330
+ "api": "entity",
1331
+ "path": spath,
1332
+ "skip_root_path": skip_root_path,
1333
+ },
1334
+ "destination": {"name": dpath, "type": "csv"},
1335
+ }
1336
+ ]
1337
+
1338
+ # If this table is an asset table, then we need to output the files associated with the asset.
1339
+ asset_columns = {"Filename", "URL", "Length", "MD5", "Description"}
1340
+ if asset_columns.issubset({c.name for c in table.columns}):
1341
+ exports.append(
1342
+ {
1343
+ "source": {
1344
+ "skip_root_path": False,
1345
+ "api": "attribute",
1346
+ "path": f"{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5, asset_rid:=RID",
1347
+ },
1348
+ "destination": {"name": "asset/{asset_rid}/" + table.name, "type": "fetch"},
1349
+ }
1350
+ )
1351
+ return exports
1352
+
1353
+ def _generate_dataset_download_spec(
1354
+ self, dataset: DatasetSpec, snapshot_catalog: DerivaML | None = None
1355
+ ) -> dict[str, Any]:
1356
+ """
1357
+ Generate a specification for downloading a specific dataset.
1358
+
1359
+ This routine creates a download specification that can be used by the Deriva export processor to download
1360
+ a specific dataset as a MINID.
1361
+ Returns:
1362
+ """
1363
+ s3_target = "s3://eye-ai-shared"
1364
+ minid_test = False
1365
+
1366
+ catalog_id = self._version_snapshot(dataset)
1367
+ post_processors = (
1368
+ {
1369
+ "post_processors": [
1370
+ {
1371
+ "processor": "cloud_upload",
1372
+ "processor_params": {
1373
+ "acl": "public-read",
1374
+ "target_url": s3_target,
1375
+ },
1376
+ },
1377
+ {
1378
+ "processor": "identifier",
1379
+ "processor_params": {
1380
+ "test": minid_test,
1381
+ "env_column_map": {
1382
+ "RID": "{RID}@{snaptime}",
1383
+ "Description": "{Description}",
1384
+ },
1385
+ },
1386
+ },
1387
+ ]
1388
+ }
1389
+ if self._use_minid
1390
+ else {}
1391
+ )
1392
+ return post_processors | {
1393
+ "env": {"RID": "{RID}"},
1394
+ "bag": {
1395
+ "bag_name": "Dataset_{RID}",
1396
+ "bag_algorithms": ["md5"],
1397
+ "bag_archiver": "zip",
1398
+ "bag_metadata": {},
1399
+ "bag_idempotent": True,
1400
+ },
1401
+ "catalog": {
1402
+ "host": f"{self._model.catalog.deriva_server.scheme}://{self._model.catalog.deriva_server.server}",
1403
+ "catalog_id": catalog_id,
1404
+ "query_processors": [
1405
+ {
1406
+ "processor": "env",
1407
+ "processor_params": {
1408
+ "output_path": "Dataset",
1409
+ "query_keys": ["snaptime"],
1410
+ "query_path": "/",
1411
+ },
1412
+ },
1413
+ {
1414
+ "processor": "env",
1415
+ "processor_params": {
1416
+ "query_path": "/entity/M:=deriva-ml:Dataset/RID={RID}",
1417
+ "output_path": "Dataset",
1418
+ "query_keys": ["RID", "Description"],
1419
+ },
1420
+ },
1421
+ ]
1422
+ + self._export_specification(dataset, snapshot_catalog),
1423
+ },
1424
+ }
1425
+
1426
+ def _generate_dataset_download_annotations(self) -> dict[str, Any]:
1427
+ post_processors = (
1428
+ {
1429
+ "type": "BAG",
1430
+ "outputs": [{"fragment_key": "dataset_export_outputs"}],
1431
+ "displayname": "BDBag to Cloud",
1432
+ "bag_idempotent": True,
1433
+ "postprocessors": [
1434
+ {
1435
+ "processor": "cloud_upload",
1436
+ "processor_params": {
1437
+ "acl": "public-read",
1438
+ "target_url": "s3://eye-ai-shared/",
1439
+ },
1440
+ },
1441
+ {
1442
+ "processor": "identifier",
1443
+ "processor_params": {
1444
+ "test": False,
1445
+ "env_column_map": {
1446
+ "RID": "{RID}@{snaptime}",
1447
+ "Description": "{Description}",
1448
+ },
1449
+ },
1450
+ },
1451
+ ],
1452
+ }
1453
+ if self._use_minid
1454
+ else {}
1455
+ )
1456
+ return {
1457
+ deriva_tags.export_fragment_definitions: {"dataset_export_outputs": self._export_annotation()},
1458
+ deriva_tags.visible_foreign_keys: self._dataset_visible_fkeys(),
1459
+ deriva_tags.export_2019: {
1460
+ "detailed": {
1461
+ "templates": [
1462
+ {
1463
+ "type": "BAG",
1464
+ "outputs": [{"fragment_key": "dataset_export_outputs"}],
1465
+ "displayname": "BDBag Download",
1466
+ "bag_idempotent": True,
1467
+ }
1468
+ | post_processors
1469
+ ]
1470
+ }
1471
+ },
1472
+ }
1473
+
1474
+ def _dataset_visible_fkeys(self) -> dict[str, Any]:
1475
+ def fkey_name(fk):
1476
+ return [fk.name[0].name, fk.name[1]]
1477
+
1478
+ dataset_table = self._model.schemas["deriva-ml"].tables["Dataset"]
1479
+
1480
+ source_list = [
1481
+ {
1482
+ "source": [
1483
+ {"inbound": ["deriva-ml", "Dataset_Version_Dataset_fkey"]},
1484
+ "RID",
1485
+ ],
1486
+ "markdown_name": "Previous Versions",
1487
+ "entity": True,
1488
+ },
1489
+ {
1490
+ "source": [
1491
+ {"inbound": ["deriva-ml", "Dataset_Dataset_Nested_Dataset_fkey"]},
1492
+ {"outbound": ["deriva-ml", "Dataset_Dataset_Dataset_fkey"]},
1493
+ "RID",
1494
+ ],
1495
+ "markdown_name": "Parent Datasets",
1496
+ },
1497
+ {
1498
+ "source": [
1499
+ {"inbound": ["deriva-ml", "Dataset_Dataset_Dataset_fkey"]},
1500
+ {"outbound": ["deriva-ml", "Dataset_Dataset_Nested_Dataset_fkey"]},
1501
+ "RID",
1502
+ ],
1503
+ "markdown_name": "Child Datasets",
1504
+ },
1505
+ ]
1506
+ source_list.extend(
1507
+ [
1508
+ {
1509
+ "source": [
1510
+ {"inbound": fkey_name(fkey.self_fkey)},
1511
+ {"outbound": fkey_name(other_fkey := fkey.other_fkeys.pop())},
1512
+ "RID",
1513
+ ],
1514
+ "markdown_name": other_fkey.pk_table.name,
1515
+ }
1516
+ for fkey in dataset_table.find_associations(max_arity=3, pure=False)
1517
+ ]
1518
+ )
1519
+ return {"detailed": source_list}