deriva-ml 1.17.10__py3-none-any.whl → 1.17.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. deriva_ml/__init__.py +43 -1
  2. deriva_ml/asset/__init__.py +17 -0
  3. deriva_ml/asset/asset.py +357 -0
  4. deriva_ml/asset/aux_classes.py +100 -0
  5. deriva_ml/bump_version.py +254 -11
  6. deriva_ml/catalog/__init__.py +21 -0
  7. deriva_ml/catalog/clone.py +1199 -0
  8. deriva_ml/catalog/localize.py +426 -0
  9. deriva_ml/core/__init__.py +29 -0
  10. deriva_ml/core/base.py +817 -1067
  11. deriva_ml/core/config.py +169 -21
  12. deriva_ml/core/constants.py +120 -19
  13. deriva_ml/core/definitions.py +123 -13
  14. deriva_ml/core/enums.py +47 -73
  15. deriva_ml/core/ermrest.py +226 -193
  16. deriva_ml/core/exceptions.py +297 -14
  17. deriva_ml/core/filespec.py +99 -28
  18. deriva_ml/core/logging_config.py +225 -0
  19. deriva_ml/core/mixins/__init__.py +42 -0
  20. deriva_ml/core/mixins/annotation.py +915 -0
  21. deriva_ml/core/mixins/asset.py +384 -0
  22. deriva_ml/core/mixins/dataset.py +237 -0
  23. deriva_ml/core/mixins/execution.py +408 -0
  24. deriva_ml/core/mixins/feature.py +365 -0
  25. deriva_ml/core/mixins/file.py +263 -0
  26. deriva_ml/core/mixins/path_builder.py +145 -0
  27. deriva_ml/core/mixins/rid_resolution.py +204 -0
  28. deriva_ml/core/mixins/vocabulary.py +400 -0
  29. deriva_ml/core/mixins/workflow.py +322 -0
  30. deriva_ml/core/validation.py +389 -0
  31. deriva_ml/dataset/__init__.py +2 -1
  32. deriva_ml/dataset/aux_classes.py +20 -4
  33. deriva_ml/dataset/catalog_graph.py +575 -0
  34. deriva_ml/dataset/dataset.py +1242 -1008
  35. deriva_ml/dataset/dataset_bag.py +1311 -182
  36. deriva_ml/dataset/history.py +27 -14
  37. deriva_ml/dataset/upload.py +225 -38
  38. deriva_ml/demo_catalog.py +126 -110
  39. deriva_ml/execution/__init__.py +46 -2
  40. deriva_ml/execution/base_config.py +639 -0
  41. deriva_ml/execution/execution.py +543 -242
  42. deriva_ml/execution/execution_configuration.py +26 -11
  43. deriva_ml/execution/execution_record.py +592 -0
  44. deriva_ml/execution/find_caller.py +298 -0
  45. deriva_ml/execution/model_protocol.py +175 -0
  46. deriva_ml/execution/multirun_config.py +153 -0
  47. deriva_ml/execution/runner.py +595 -0
  48. deriva_ml/execution/workflow.py +223 -34
  49. deriva_ml/experiment/__init__.py +8 -0
  50. deriva_ml/experiment/experiment.py +411 -0
  51. deriva_ml/feature.py +6 -1
  52. deriva_ml/install_kernel.py +143 -6
  53. deriva_ml/interfaces.py +862 -0
  54. deriva_ml/model/__init__.py +99 -0
  55. deriva_ml/model/annotations.py +1278 -0
  56. deriva_ml/model/catalog.py +286 -60
  57. deriva_ml/model/database.py +144 -649
  58. deriva_ml/model/deriva_ml_database.py +308 -0
  59. deriva_ml/model/handles.py +14 -0
  60. deriva_ml/run_model.py +319 -0
  61. deriva_ml/run_notebook.py +507 -38
  62. deriva_ml/schema/__init__.py +18 -2
  63. deriva_ml/schema/annotations.py +62 -33
  64. deriva_ml/schema/create_schema.py +169 -69
  65. deriva_ml/schema/validation.py +601 -0
  66. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/METADATA +4 -4
  67. deriva_ml-1.17.11.dist-info/RECORD +77 -0
  68. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/WHEEL +1 -1
  69. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/entry_points.txt +1 -0
  70. deriva_ml/protocols/dataset.py +0 -19
  71. deriva_ml/test.py +0 -94
  72. deriva_ml-1.17.10.dist-info/RECORD +0 -45
  73. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/licenses/LICENSE +0 -0
  74. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,384 @@
1
+ """Asset management mixin for DerivaML.
2
+
3
+ This module provides the AssetMixin class which handles
4
+ asset table operations including creating, listing, and looking up assets.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import TYPE_CHECKING, Any, Callable, Iterable
10
+
11
+ # Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
12
+ import importlib
13
+ _ermrest_model = importlib.import_module("deriva.core.ermrest_model")
14
+ Table = _ermrest_model.Table
15
+
16
+ from deriva_ml.core.definitions import AssetTableDef, ColumnDefinition, MLVocab, RID, VocabularyTerm
17
+ from deriva_ml.core.exceptions import DerivaMLException
18
+ from deriva_ml.schema.annotations import asset_annotation
19
+
20
+ if TYPE_CHECKING:
21
+ from deriva_ml.asset.asset import Asset
22
+ from deriva_ml.execution.execution_record import ExecutionRecord
23
+ from deriva_ml.model.catalog import DerivaModel
24
+
25
+
26
class AssetMixin:
    """Mixin providing asset management operations.

    This mixin requires the host class to have:
        - model: DerivaModel instance
        - ml_schema: str - name of the ML schema
        - domain_schemas: frozenset[str] - names of the domain schemas
        - default_schema: str | None - declared for the host class; not read
          directly by this mixin
        - pathBuilder(): method returning catalog path builder
        - add_term(): method for adding vocabulary terms (from VocabularyMixin)
        - apply_catalog_annotations(): method to update navbar (from DerivaML base class)
        - resolve_rid() and lookup_execution(): called by lookup_asset and
          list_asset_executions

    Methods:
        create_asset: Create a new asset table
        list_assets: List contents of an asset table
        list_asset_executions: List executions associated with an asset
        lookup_asset: Look up a single asset by RID
        list_asset_tables: List all asset tables in the catalog
        find_assets: Iterate over assets, optionally filtered by table and type
    """

    # Type hints for IDE support - actual attributes/methods from host class
    model: "DerivaModel"
    ml_schema: str
    domain_schemas: frozenset[str]
    default_schema: str | None
    pathBuilder: Callable[[], Any]
    add_term: Callable[..., VocabularyTerm]
    apply_catalog_annotations: Callable[[], None]

    # Note: @validate_call removed because ColumnDefinition is now a dataclass from
    # deriva.core.typed and Pydantic validation doesn't work well with dataclass fields
    def create_asset(
        self,
        asset_name: str,
        column_defs: Iterable[ColumnDefinition] | None = None,
        fkey_defs: Iterable[ColumnDefinition] | None = None,
        referenced_tables: Iterable[Table] | None = None,
        comment: str = "",
        schema: str | None = None,
        update_navbar: bool = True,
    ) -> Table:
        """Creates an asset table.

        Besides the asset table itself, this registers ``asset_name`` as a term
        in the asset-type vocabulary and creates two association tables in the
        same schema: one linking the asset table to ``Asset_Type`` and one
        linking it to ``Execution`` (the latter also references ``Asset_Role``).

        Args:
            asset_name: Name of the asset table.
            column_defs: Iterable of ColumnDefinition objects to provide additional metadata for asset.
            fkey_defs: Iterable of foreign key definitions for the asset table; they are passed
                as ``foreign_keys`` of the table definition.
                NOTE(review): the annotation says ColumnDefinition - confirm the intended type.
            referenced_tables: Iterable of tables to which asset should provide foreign-key references.
                Each item is resolved with ``model.name_to_table``.
            comment: Description of the asset table. (Default value = '')
            schema: Schema in which to create the asset table. When omitted, the value of
                ``model._require_default_schema()`` is used.
            update_navbar: If True (default), automatically updates the navigation bar to include
                the new asset table. Set to False during batch asset creation to avoid redundant
                updates, then call apply_catalog_annotations() once at the end.

        Returns:
            Table object for the asset table.
        """
        # Initialize empty collections if None provided
        column_defs = column_defs or []
        fkey_defs = fkey_defs or []
        referenced_tables = referenced_tables or []
        schema = schema or self.model._require_default_schema()

        # Add an asset type to vocabulary
        self.add_term(MLVocab.asset_type, asset_name, description=f"A {asset_name} asset")

        # Create the main asset table
        asset_table = self.model.schemas[schema].create_table(
            AssetTableDef(
                schema_name=schema,
                name=asset_name,
                columns=list(column_defs),
                foreign_keys=list(fkey_defs),
                comment=comment,
            )
        )

        # Create an association table between asset and asset type
        self.model.create_table(
            Table.define_association(
                [
                    (asset_table.name, asset_table),
                    ("Asset_Type", self.model.name_to_table("Asset_Type")),
                ]
            ),
            schema=schema,
        )

        # Create references to other tables if specified
        for referenced in referenced_tables:
            asset_table.create_reference(self.model.name_to_table(referenced))

        # Create an association table for tracking execution
        execution_table = self.model.schemas[self.ml_schema].tables["Execution"]
        asset_execution = self.model.create_table(
            Table.define_association(
                [
                    (asset_name, asset_table),
                    ("Execution", execution_table),
                ]
            ),
            schema=schema,
        )
        asset_execution.create_reference(self.model.name_to_table("Asset_Role"))

        # Add asset annotations
        asset_annotation(asset_table)

        # Update navbar to include the new asset table
        if update_navbar:
            self.apply_catalog_annotations()

        return asset_table

    def _build_asset(
        self,
        table_name: str,
        asset_rid: RID,
        record: Any,
        asset_types: list[str],
    ) -> "Asset":
        """Construct an Asset object from a fetched asset-table row.

        Args:
            table_name: Name of the asset table the row came from.
            asset_rid: RID to record on the Asset object.
            record: Mapping-like row; missing metadata keys fall back to
                empty-string / zero defaults.
            asset_types: Asset type names associated with the row.

        Returns:
            Asset object bound to this catalog instance.
        """
        # Imported lazily, as the module only imports Asset under TYPE_CHECKING.
        from deriva_ml.asset.asset import Asset

        return Asset(
            catalog=self,  # type: ignore[arg-type]
            asset_rid=asset_rid,
            asset_table=table_name,
            filename=record.get("Filename", ""),
            url=record.get("URL", ""),
            length=record.get("Length", 0),
            md5=record.get("MD5", ""),
            description=record.get("Description", ""),
            asset_types=asset_types,
        )

    def list_assets(self, asset_table: Table | str) -> list["Asset"]:
        """Lists contents of an asset table.

        Returns a list of Asset objects for the specified asset table. The
        asset-type associations are fetched once for the whole table and
        grouped by asset RID, rather than queried once per asset.

        Args:
            asset_table: Table or name of the asset table to list assets for.

        Returns:
            list[Asset]: List of Asset objects for the assets in the table.

        Raises:
            DerivaMLException: If the table is not an asset table or doesn't exist.

        Example:
            >>> assets = ml.list_assets("Image")
            >>> for asset in assets:
            ...     print(f"{asset.asset_rid}: {asset.filename}")
        """
        # Validate and get asset table reference
        asset_table_obj = self.model.name_to_table(asset_table)
        if not self.model.is_asset(asset_table_obj):
            raise DerivaMLException(f"Table {asset_table_obj.name} is not an asset")

        # Get path builders for asset and type tables
        pb = self.pathBuilder()
        asset_path = pb.schemas[asset_table_obj.schema.name].tables[asset_table_obj.name]
        # Use the fk column reported by find_association, as the sibling lookups do,
        # instead of assuming it is named after the asset table.
        asset_type_table, asset_fk, _ = self.model.find_association(asset_table_obj, MLVocab.asset_type)
        type_path = pb.schemas[asset_type_table.schema.name].tables[asset_type_table.name]

        # One query for every (asset, type) pair, grouped locally by asset RID.
        types_by_asset: dict[Any, list[str]] = {}
        type_rows = type_path.attributes(type_path.columns[asset_fk], type_path.Asset_Type).fetch()
        for type_row in type_rows:
            types_by_asset.setdefault(type_row[asset_fk], []).append(type_row[MLVocab.asset_type.value])

        # Build a list of Asset objects
        return [
            self._build_asset(
                asset_table_obj.name,
                asset_record["RID"],
                asset_record,
                types_by_asset.get(asset_record["RID"], []),
            )
            for asset_record in asset_path.entities().fetch()
        ]

    def list_asset_executions(
        self, asset_rid: str, asset_role: str | None = None
    ) -> list["ExecutionRecord"]:
        """List all executions associated with an asset.

        Given an asset RID, returns the executions linked to the asset through
        the asset/Execution association table, optionally restricted to one
        asset role.

        Args:
            asset_rid: The RID of the asset to look up.
            asset_role: Optional filter for asset role ('Input' or 'Output').
                If None (or empty), returns all associations.

        Returns:
            list[ExecutionRecord]: One entry per matching association row, as
                returned by the host's ``lookup_execution``.

        Raises:
            DerivaMLException: If the RID does not belong to an asset table.

        Example:
            >>> # Find all executions that created this asset
            >>> executions = ml.list_asset_executions("1-abc123", asset_role="Output")
            >>> for exe in executions:
            ...     print(f"Created by execution {exe.execution_rid}")

            >>> # Find all executions that used this asset as input
            >>> executions = ml.list_asset_executions("1-abc123", asset_role="Input")
        """
        # Resolve the RID to find which asset table it belongs to
        rid_info = self.resolve_rid(asset_rid)  # type: ignore[attr-defined]
        asset_table = rid_info.table

        if not self.model.is_asset(asset_table):
            raise DerivaMLException(f"RID {asset_rid} is not an asset (table: {asset_table.name})")

        # Find the association table between this asset table and Execution
        asset_exe_table, asset_fk, _ = self.model.find_association(asset_table, "Execution")

        pb = self.pathBuilder()
        asset_exe_path = pb.schemas[asset_exe_table.schema.name].tables[asset_exe_table.name]

        # Filter by asset RID, then optionally by asset role
        query = asset_exe_path.filter(asset_exe_path.columns[asset_fk] == asset_rid)
        if asset_role:
            query = query.filter(asset_exe_path.Asset_Role == asset_role)

        # Convert to ExecutionRecord objects
        return [
            self.lookup_execution(record["Execution"])  # type: ignore[attr-defined]
            for record in query.entities().fetch()
        ]

    def lookup_asset(self, asset_rid: RID) -> "Asset":
        """Look up an asset by its RID.

        Returns an Asset object for the specified RID. The asset can be from
        any asset table in the catalog.

        Args:
            asset_rid: The RID of the asset to look up.

        Returns:
            Asset object for the specified RID. ``asset_types`` is empty when
            the asset table has no Asset_Type association.

        Raises:
            DerivaMLException: If the RID is not an asset or no row is found.

        Example:
            >>> asset = ml.lookup_asset("3JSE")
            >>> print(f"File: {asset.filename}, Table: {asset.asset_table}")
        """
        # Resolve the RID to find which table it belongs to
        rid_info = self.resolve_rid(asset_rid)  # type: ignore[attr-defined]
        asset_table = rid_info.table

        if not self.model.is_asset(asset_table):
            raise DerivaMLException(f"RID {asset_rid} is not an asset (table: {asset_table.name})")

        # Query the asset table for this record
        pb = self.pathBuilder()
        asset_path = pb.schemas[asset_table.schema.name].tables[asset_table.name]

        records = list(asset_path.filter(asset_path.RID == asset_rid).entities().fetch())
        if not records:
            raise DerivaMLException(f"Asset {asset_rid} not found in table {asset_table.name}")

        # Only a missing association is tolerated; failures of the type query
        # itself propagate instead of silently yielding an asset with no types.
        # NOTE(review): assumes find_association signals "no association" with
        # DerivaMLException (or a subclass) - confirm against DerivaModel.
        asset_types: list[str] = []
        try:
            type_assoc_table, asset_fk, _ = self.model.find_association(asset_table, "Asset_Type")
        except DerivaMLException:
            type_assoc_table = None  # No type association for this asset table

        if type_assoc_table is not None:
            type_path = pb.schemas[type_assoc_table.schema.name].tables[type_assoc_table.name]
            type_rows = (
                type_path.filter(type_path.columns[asset_fk] == asset_rid)
                .attributes(type_path.Asset_Type)
                .fetch()
            )
            asset_types = [type_row["Asset_Type"] for type_row in type_rows]

        return self._build_asset(asset_table.name, asset_rid, records[0], asset_types)

    def list_asset_tables(self) -> list[Table]:
        """List all asset tables in the catalog.

        Domain schemas that are not present in the model are skipped; asset
        tables of the ML schema are appended after the domain ones.

        Returns:
            List of Table objects that are asset tables.

        Example:
            >>> for table in ml.list_asset_tables():
            ...     print(f"Asset table: {table.name}")
        """
        is_asset = self.model.is_asset
        schemas = self.model.schemas

        tables: list[Table] = []
        # Include asset tables from all domain schemas
        for domain_schema in self.domain_schemas:
            if domain_schema in schemas:
                tables.extend(t for t in schemas[domain_schema].tables.values() if is_asset(t))
        # Also include ML schema asset tables (like Execution_Asset)
        tables.extend(t for t in schemas[self.ml_schema].tables.values() if is_asset(t))
        return tables

    def find_assets(
        self,
        asset_table: Table | str | None = None,
        asset_type: str | None = None,
    ) -> Iterable["Asset"]:
        """Find assets in the catalog.

        Lazily yields Asset objects matching the specified criteria.
        If no criteria are specified, yields all assets from all asset tables.

        Args:
            asset_table: Optional table or table name to search. If None, searches
                all asset tables.
            asset_type: Optional asset type to filter by. Only yields assets
                with this type.

        Returns:
            Iterable of Asset objects matching the criteria.

        Example:
            >>> # Find all assets in the Model table
            >>> models = list(ml.find_assets(asset_table="Model"))

            >>> # Find all assets with type "Training_Data"
            >>> training = list(ml.find_assets(asset_type="Training_Data"))

            >>> # Find all assets across all tables
            >>> all_assets = list(ml.find_assets())
        """
        # Determine which tables to search
        if asset_table is not None:
            tables = [self.model.name_to_table(asset_table)]
        else:
            tables = self.list_asset_tables()

        for table in tables:
            for asset in self.list_assets(table):
                # Skip assets that do not carry the requested type
                if asset_type is not None and asset_type not in asset.asset_types:
                    continue
                yield asset
@@ -0,0 +1,237 @@
1
+ """Dataset management mixin for DerivaML.
2
+
3
+ This module provides the DatasetMixin class which handles
4
+ dataset operations including finding, creating, looking up,
5
+ deleting, and managing dataset elements.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import TYPE_CHECKING, Any, Callable, Iterable
11
+
12
+ # Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
13
+ import importlib
14
+ _ermrest_model = importlib.import_module("deriva.core.ermrest_model")
15
+ Table = _ermrest_model.Table
16
+
17
+ from pydantic import ConfigDict, validate_call
18
+
19
+ from deriva_ml.core.definitions import RID, MLVocab
20
+ from deriva_ml.core.exceptions import DerivaMLException, DerivaMLTableTypeError
21
+ from deriva_ml.dataset.aux_classes import DatasetSpec
22
+
23
+ if TYPE_CHECKING:
24
+ from deriva_ml.dataset.dataset import Dataset
25
+ from deriva_ml.dataset.dataset_bag import DatasetBag
26
+ from deriva_ml.model.catalog import DerivaModel
27
+
28
+
29
class DatasetMixin:
    """Mixin providing dataset management operations.

    This mixin requires the host class to have:
        - model: DerivaModel instance
        - ml_schema: str - name of the ML schema
        - domain_schemas: frozenset[str] - names of the domain schemas
        - default_schema: str | None - declared for the host class; not read
          directly by this mixin
        - s3_bucket: str | None - S3 bucket URL for dataset storage
        - use_minid: bool - whether to use MINIDs
        - pathBuilder(): method returning catalog path builder
        - _dataset_table: property returning the Dataset table

    Methods:
        find_datasets: List all datasets in the catalog
        lookup_dataset: Look up a dataset by RID or spec
        delete_dataset: Mark a dataset as deleted
        list_dataset_element_types: List types that can be added to datasets
        add_dataset_element_type: Add a new element type to datasets
        download_dataset_bag: Download a dataset as a bag
    """

    # Type hints for IDE support - actual attributes/methods from host class
    model: "DerivaModel"
    ml_schema: str
    domain_schemas: frozenset[str]
    default_schema: str | None
    s3_bucket: str | None
    use_minid: bool
    pathBuilder: Callable[[], Any]

    @property
    def _dataset_table(self) -> Table:
        """Get the Dataset table. Must be provided by host class."""
        raise NotImplementedError

    def _dataset_query(self, deleted: bool) -> tuple[Any, Any]:
        """Build the datapath query over the Dataset table.

        Args:
            deleted: If False, rows whose ``Deleted`` column is true are
                filtered out (rows with false or null are kept).

        Returns:
            A ``(dataset_path, query)`` pair: the Dataset table datapath (for
            building further column expressions) and the query to fetch from.
        """
        pb = self.pathBuilder()
        dataset_path = pb.schemas[self._dataset_table.schema.name].tables[self._dataset_table.name]
        if deleted:
            return dataset_path, dataset_path
        # "== False" / "== None" build datapath filter expressions; they are not
        # Python comparisons, hence the lint suppressions.
        query = dataset_path.filter(
            (dataset_path.Deleted == False) | (dataset_path.Deleted == None)  # noqa: E711, E712
        )
        return dataset_path, query

    def find_datasets(self, deleted: bool = False) -> Iterable["Dataset"]:
        """List all datasets in the catalog.

        Args:
            deleted: If True, include datasets that have been marked as deleted.

        Returns:
            Iterable of Dataset objects.

        Example:
            >>> datasets = list(ml.find_datasets())
            >>> for ds in datasets:
            ...     print(f"{ds.dataset_rid}: {ds.description}")
        """
        # Import here to avoid circular imports
        from deriva_ml.dataset.dataset import Dataset

        _, query = self._dataset_query(deleted)
        return [
            Dataset(
                self,  # type: ignore[arg-type]
                dataset_rid=record["RID"],
                description=record["Description"],
            )
            for record in query.entities().fetch()
        ]

    def lookup_dataset(self, dataset: RID | DatasetSpec, deleted: bool = False) -> "Dataset":
        """Look up a dataset by RID or DatasetSpec.

        The RID is matched in the catalog query itself, so only the requested
        row is fetched instead of every dataset in the catalog.

        Args:
            dataset: Dataset RID or DatasetSpec to look up.
            deleted: If True, include datasets that have been marked as deleted.

        Returns:
            Dataset: The dataset object for the specified RID.

        Raises:
            DerivaMLException: If the dataset is not found.

        Example:
            >>> dataset = ml.lookup_dataset("4HM")
            >>> print(f"Version: {dataset.current_version}")
        """
        # Import here to avoid circular imports
        from deriva_ml.dataset.dataset import Dataset

        dataset_rid = dataset.rid if isinstance(dataset, DatasetSpec) else dataset

        dataset_path, query = self._dataset_query(deleted)
        records = list(query.filter(dataset_path.RID == dataset_rid).entities().fetch())
        if not records:
            raise DerivaMLException(f"Dataset {dataset_rid} not found.")

        record = records[0]
        return Dataset(
            self,  # type: ignore[arg-type]
            dataset_rid=record["RID"],
            description=record["Description"],
        )

    def delete_dataset(self, dataset: "Dataset", recurse: bool = False) -> None:
        """Delete a dataset from the catalog.

        The dataset rows are not removed; their ``Deleted`` column is set to True.

        Args:
            dataset: The dataset to delete.
            recurse: If True, also mark the datasets returned by
                ``dataset.list_dataset_children()`` as deleted. (Default value = False)

        Raises:
            DerivaMLException: If the RID is not a dataset RID, or if the dataset
                is itself a member of another (parent) dataset.
        """
        dataset_rid = dataset.dataset_rid
        if not self.model.is_dataset_rid(dataset_rid):
            raise DerivaMLException("Dataset_rid is not a dataset.")

        if parents := dataset.list_dataset_parents():
            raise DerivaMLException(f'Dataset "{dataset}" is in a nested dataset: {parents}.')

        pb = self.pathBuilder()
        dataset_path = pb.schemas[self._dataset_table.schema.name].tables[self._dataset_table.name]

        # list_dataset_children returns Dataset objects, so extract their RIDs
        child_rids = [child.dataset_rid for child in dataset.list_dataset_children()] if recurse else []
        dataset_path.update([{"RID": rid, "Deleted": True} for rid in [dataset_rid, *child_rids]])

    def list_dataset_element_types(self) -> Iterable[Table]:
        """List the types of entities that can be added to a dataset.

        A table qualifies when it is the far side of an association with the
        Dataset table and either lives in a domain schema or is the Dataset
        table itself.

        Returns:
            An iterable of Table objects that can be included as an element of a dataset.
        """
        dataset_table = self._dataset_table
        element_tables = []
        for association in dataset_table.find_associations():
            # Peek at the other foreign key instead of pop()-ing it, so the
            # association result is left unmodified.
            other_table = next(iter(association.other_fkeys)).pk_table
            if self.model.is_domain_schema(other_table.schema.name) or other_table.name == dataset_table.name:
                element_tables.append(other_table)
        return element_tables

    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
    def add_dataset_element_type(self, element: str | Table) -> Table:
        """Makes it possible to add objects from the specified table to a dataset.

        A dataset is a heterogeneous collection of objects, each of which comes from a different table.
        This routine adds the specified table as a valid element type for datasets. After the
        association table is created (or found to exist already), the dataset download annotations
        are regenerated and applied to the catalog model.

        Args:
            element: Name of the table or table object that is to be added to the dataset.

        Returns:
            The association table linking the Dataset table to the element table.

        Raises:
            ValueError: Re-raised from table creation unless the message indicates
                that the association table already exists.
        """
        # Import here to avoid circular imports
        from deriva_ml.dataset.catalog_graph import CatalogGraph

        element_table = self.model.name_to_table(element)
        atable_def = Table.define_association([self._dataset_table, element_table])
        try:
            table = self.model.create_table(atable_def)
        except ValueError as e:
            if "already exists" not in str(e):
                raise
            # Reuse the existing association table.
            table = self.model.name_to_table(atable_def["table_name"])

        annotations = CatalogGraph(
            self,  # type: ignore[arg-type]
            s3_bucket=self.s3_bucket,
            use_minid=self.use_minid,
        ).generate_dataset_download_annotations()
        self._dataset_table.annotations.update(annotations)
        self.model.model.apply()
        return table

    def download_dataset_bag(
        self,
        dataset: DatasetSpec,
    ) -> "DatasetBag":
        """Downloads a dataset to the local filesystem.

        Downloads a dataset specified by DatasetSpec to the local filesystem. If the catalog
        has s3_bucket configured and use_minid is enabled, the bag will be uploaded to S3
        and registered with the MINID service.

        Args:
            dataset: Specification of the dataset to download, including version and materialization options.

        Returns:
            DatasetBag: Object containing:
                - path: Local filesystem path to downloaded dataset
                - rid: Dataset's Resource Identifier
                - minid: Dataset's Minimal Viable Identifier (if MINID enabled)

        Raises:
            DerivaMLTableTypeError: If ``dataset.rid`` is not a dataset RID.
            DerivaMLException: If the dataset cannot be found.

        Note:
            MINID support requires s3_bucket to be configured when creating the DerivaML instance.
            The catalog's use_minid setting controls whether MINIDs are created.

        Examples:
            Download with default options:
                >>> spec = DatasetSpec(rid="1-abc123")
                >>> bag = ml.download_dataset_bag(dataset=spec)
                >>> print(f"Downloaded to {bag.path}")
        """
        if not self.model.is_dataset_rid(dataset.rid):
            raise DerivaMLTableTypeError("Dataset", dataset.rid)
        ds = self.lookup_dataset(dataset)
        return ds.download_dataset_bag(
            version=dataset.version,
            materialize=dataset.materialize,
            use_minid=self.use_minid,
        )