deriva-ml 1.17.10__py3-none-any.whl → 1.17.11__py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
- deriva_ml/__init__.py +43 -1
- deriva_ml/asset/__init__.py +17 -0
- deriva_ml/asset/asset.py +357 -0
- deriva_ml/asset/aux_classes.py +100 -0
- deriva_ml/bump_version.py +254 -11
- deriva_ml/catalog/__init__.py +21 -0
- deriva_ml/catalog/clone.py +1199 -0
- deriva_ml/catalog/localize.py +426 -0
- deriva_ml/core/__init__.py +29 -0
- deriva_ml/core/base.py +817 -1067
- deriva_ml/core/config.py +169 -21
- deriva_ml/core/constants.py +120 -19
- deriva_ml/core/definitions.py +123 -13
- deriva_ml/core/enums.py +47 -73
- deriva_ml/core/ermrest.py +226 -193
- deriva_ml/core/exceptions.py +297 -14
- deriva_ml/core/filespec.py +99 -28
- deriva_ml/core/logging_config.py +225 -0
- deriva_ml/core/mixins/__init__.py +42 -0
- deriva_ml/core/mixins/annotation.py +915 -0
- deriva_ml/core/mixins/asset.py +384 -0
- deriva_ml/core/mixins/dataset.py +237 -0
- deriva_ml/core/mixins/execution.py +408 -0
- deriva_ml/core/mixins/feature.py +365 -0
- deriva_ml/core/mixins/file.py +263 -0
- deriva_ml/core/mixins/path_builder.py +145 -0
- deriva_ml/core/mixins/rid_resolution.py +204 -0
- deriva_ml/core/mixins/vocabulary.py +400 -0
- deriva_ml/core/mixins/workflow.py +322 -0
- deriva_ml/core/validation.py +389 -0
- deriva_ml/dataset/__init__.py +2 -1
- deriva_ml/dataset/aux_classes.py +20 -4
- deriva_ml/dataset/catalog_graph.py +575 -0
- deriva_ml/dataset/dataset.py +1242 -1008
- deriva_ml/dataset/dataset_bag.py +1311 -182
- deriva_ml/dataset/history.py +27 -14
- deriva_ml/dataset/upload.py +225 -38
- deriva_ml/demo_catalog.py +126 -110
- deriva_ml/execution/__init__.py +46 -2
- deriva_ml/execution/base_config.py +639 -0
- deriva_ml/execution/execution.py +543 -242
- deriva_ml/execution/execution_configuration.py +26 -11
- deriva_ml/execution/execution_record.py +592 -0
- deriva_ml/execution/find_caller.py +298 -0
- deriva_ml/execution/model_protocol.py +175 -0
- deriva_ml/execution/multirun_config.py +153 -0
- deriva_ml/execution/runner.py +595 -0
- deriva_ml/execution/workflow.py +223 -34
- deriva_ml/experiment/__init__.py +8 -0
- deriva_ml/experiment/experiment.py +411 -0
- deriva_ml/feature.py +6 -1
- deriva_ml/install_kernel.py +143 -6
- deriva_ml/interfaces.py +862 -0
- deriva_ml/model/__init__.py +99 -0
- deriva_ml/model/annotations.py +1278 -0
- deriva_ml/model/catalog.py +286 -60
- deriva_ml/model/database.py +144 -649
- deriva_ml/model/deriva_ml_database.py +308 -0
- deriva_ml/model/handles.py +14 -0
- deriva_ml/run_model.py +319 -0
- deriva_ml/run_notebook.py +507 -38
- deriva_ml/schema/__init__.py +18 -2
- deriva_ml/schema/annotations.py +62 -33
- deriva_ml/schema/create_schema.py +169 -69
- deriva_ml/schema/validation.py +601 -0
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/METADATA +4 -4
- deriva_ml-1.17.11.dist-info/RECORD +77 -0
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/WHEEL +1 -1
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/entry_points.txt +1 -0
- deriva_ml/protocols/dataset.py +0 -19
- deriva_ml/test.py +0 -94
- deriva_ml-1.17.10.dist-info/RECORD +0 -45
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/top_level.txt +0 -0
deriva_ml/core/mixins/asset.py (new file)
@@ -0,0 +1,384 @@
"""Asset management mixin for DerivaML.

This module provides the AssetMixin class which handles
asset table operations including creating, listing, and looking up assets.
"""

from __future__ import annotations

from typing import TYPE_CHECKING, Any, Callable, Iterable

# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
import importlib
_ermrest_model = importlib.import_module("deriva.core.ermrest_model")
Table = _ermrest_model.Table

from deriva_ml.core.definitions import AssetTableDef, ColumnDefinition, MLVocab, RID, VocabularyTerm
from deriva_ml.core.exceptions import DerivaMLException
from deriva_ml.schema.annotations import asset_annotation

if TYPE_CHECKING:
    from deriva_ml.asset.asset import Asset
    from deriva_ml.execution.execution_record import ExecutionRecord
    from deriva_ml.model.catalog import DerivaModel


class AssetMixin:
    """Mixin providing asset management operations.

    This mixin requires the host class to have:
    - model: DerivaModel instance
    - ml_schema: str - name of the ML schema
    - domain_schema: str - name of the domain schema
    - pathBuilder(): method returning catalog path builder
    - add_term(): method for adding vocabulary terms (from VocabularyMixin)
    - apply_catalog_annotations(): method to update navbar (from DerivaML base class)

    Methods:
        create_asset: Create a new asset table
        list_assets: List contents of an asset table
    """

    # Type hints for IDE support - actual attributes/methods from host class
    model: "DerivaModel"
    ml_schema: str
    domain_schemas: frozenset[str]
    default_schema: str | None
    pathBuilder: Callable[[], Any]
    add_term: Callable[..., VocabularyTerm]
    apply_catalog_annotations: Callable[[], None]

    # Note: @validate_call removed because ColumnDefinition is now a dataclass from
    # deriva.core.typed and Pydantic validation doesn't work well with dataclass fields
    def create_asset(
        self,
        asset_name: str,
        column_defs: Iterable[ColumnDefinition] | None = None,
        fkey_defs: Iterable[ColumnDefinition] | None = None,
        referenced_tables: Iterable[Table] | None = None,
        comment: str = "",
        schema: str | None = None,
        update_navbar: bool = True,
    ) -> Table:
        """Creates an asset table.

        Args:
            asset_name: Name of the asset table.
            column_defs: Iterable of ColumnDefinition objects to provide additional metadata for asset.
            fkey_defs: Iterable of ForeignKeyDefinition objects to provide additional metadata for asset.
            referenced_tables: Iterable of Table objects to which asset should provide foreign-key references to.
            comment: Description of the asset table. (Default value = '')
            schema: Schema in which to create the asset table. Defaults to domain_schema.
            update_navbar: If True (default), automatically updates the navigation bar to include
                the new asset table. Set to False during batch asset creation to avoid redundant
                updates, then call apply_catalog_annotations() once at the end.

        Returns:
            Table object for the asset table.
        """
        # Initialize empty collections if None provided
        column_defs = column_defs or []
        fkey_defs = fkey_defs or []
        referenced_tables = referenced_tables or []
        schema = schema or self.model._require_default_schema()

        # Add an asset type to vocabulary
        self.add_term(MLVocab.asset_type, asset_name, description=f"A {asset_name} asset")

        # Create the main asset table
        # Note: column_defs and fkey_defs should be ColumnDef/ForeignKeyDef objects
        asset_table = self.model.schemas[schema].create_table(
            AssetTableDef(
                schema_name=schema,
                name=asset_name,
                columns=list(column_defs),
                foreign_keys=list(fkey_defs),
                comment=comment,
            )
        )

        # Create an association table between asset and asset type
        self.model.create_table(
            Table.define_association(
                [
                    (asset_table.name, asset_table),
                    ("Asset_Type", self.model.name_to_table("Asset_Type")),
                ]
            ),
            schema=schema,
        )

        # Create references to other tables if specified
        for t in referenced_tables:
            asset_table.create_reference(self.model.name_to_table(t))

        # Create an association table for tracking execution
        atable = self.model.create_table(
            Table.define_association(
                [
                    (asset_name, asset_table),
                    (
                        "Execution",
                        self.model.schemas[self.ml_schema].tables["Execution"],
                    ),
                ]
            ),
            schema=schema,
        )
        atable.create_reference(self.model.name_to_table("Asset_Role"))

        # Add asset annotations
        asset_annotation(asset_table)

        # Update navbar to include the new asset table
        if update_navbar:
            self.apply_catalog_annotations()

        return asset_table

    def list_assets(self, asset_table: Table | str) -> list["Asset"]:
        """Lists contents of an asset table.

        Returns a list of Asset objects for the specified asset table.

        Args:
            asset_table: Table or name of the asset table to list assets for.

        Returns:
            list[Asset]: List of Asset objects for the assets in the table.

        Raises:
            DerivaMLException: If the table is not an asset table or doesn't exist.

        Example:
            >>> assets = ml.list_assets("Image")
            >>> for asset in assets:
            ...     print(f"{asset.asset_rid}: {asset.filename}")
        """
        from deriva_ml.asset.asset import Asset

        # Validate and get asset table reference
        asset_table_obj = self.model.name_to_table(asset_table)
        if not self.model.is_asset(asset_table_obj):
            raise DerivaMLException(f"Table {asset_table_obj.name} is not an asset")

        # Get path builders for asset and type tables
        pb = self.pathBuilder()
        asset_path = pb.schemas[asset_table_obj.schema.name].tables[asset_table_obj.name]
        (
            asset_type_table,
            _,
            _,
        ) = self.model.find_association(asset_table_obj, MLVocab.asset_type)
        type_path = pb.schemas[asset_type_table.schema.name].tables[asset_type_table.name]

        # Build a list of Asset objects
        assets = []
        for asset_record in asset_path.entities().fetch():
            # Get associated asset types for each asset
            asset_types = (
                type_path.filter(type_path.columns[asset_table_obj.name] == asset_record["RID"])
                .attributes(type_path.Asset_Type)
                .fetch()
            )
            asset_type_list = [asset_type[MLVocab.asset_type.value] for asset_type in asset_types]

            assets.append(Asset(
                catalog=self,  # type: ignore[arg-type]
                asset_rid=asset_record["RID"],
                asset_table=asset_table_obj.name,
                filename=asset_record.get("Filename", ""),
                url=asset_record.get("URL", ""),
                length=asset_record.get("Length", 0),
                md5=asset_record.get("MD5", ""),
                description=asset_record.get("Description", ""),
                asset_types=asset_type_list,
            ))
        return assets

    def list_asset_executions(
        self, asset_rid: str, asset_role: str | None = None
    ) -> list["ExecutionRecord"]:
        """List all executions associated with an asset.

        Given an asset RID, returns a list of executions that created or used
        the asset, along with the role (Input/Output) in each execution.

        Args:
            asset_rid: The RID of the asset to look up.
            asset_role: Optional filter for asset role ('Input' or 'Output').
                If None, returns all associations.

        Returns:
            list[ExecutionRecord]: List of ExecutionRecord objects for the
                executions associated with this asset.

        Raises:
            DerivaMLException: If the asset RID is not found or not an asset.

        Example:
            >>> # Find all executions that created this asset
            >>> executions = ml.list_asset_executions("1-abc123", asset_role="Output")
            >>> for exe in executions:
            ...     print(f"Created by execution {exe.execution_rid}")

            >>> # Find all executions that used this asset as input
            >>> executions = ml.list_asset_executions("1-abc123", asset_role="Input")
        """
        # Resolve the RID to find which asset table it belongs to
        rid_info = self.resolve_rid(asset_rid)  # type: ignore[attr-defined]
        asset_table = rid_info.table

        if not self.model.is_asset(asset_table):
            raise DerivaMLException(f"RID {asset_rid} is not an asset (table: {asset_table.name})")

        # Find the association table between this asset table and Execution
        asset_exe_table, asset_fk, execution_fk = self.model.find_association(asset_table, "Execution")

        # Build the query
        pb = self.pathBuilder()
        asset_exe_path = pb.schemas[asset_exe_table.schema.name].tables[asset_exe_table.name]

        # Filter by asset RID
        query = asset_exe_path.filter(asset_exe_path.columns[asset_fk] == asset_rid)

        # Optionally filter by asset role
        if asset_role:
            query = query.filter(asset_exe_path.Asset_Role == asset_role)

        # Convert to ExecutionRecord objects
        records = list(query.entities().fetch())
        return [self.lookup_execution(record["Execution"]) for record in records]  # type: ignore[attr-defined]

    def lookup_asset(self, asset_rid: RID) -> "Asset":
        """Look up an asset by its RID.

        Returns an Asset object for the specified RID. The asset can be from
        any asset table in the catalog.

        Args:
            asset_rid: The RID of the asset to look up.

        Returns:
            Asset object for the specified RID.

        Raises:
            DerivaMLException: If the RID is not found or is not an asset.

        Example:
            >>> asset = ml.lookup_asset("3JSE")
            >>> print(f"File: {asset.filename}, Table: {asset.asset_table}")
        """
        from deriva_ml.asset.asset import Asset

        # Resolve the RID to find which table it belongs to
        rid_info = self.resolve_rid(asset_rid)  # type: ignore[attr-defined]
        asset_table = rid_info.table

        if not self.model.is_asset(asset_table):
            raise DerivaMLException(f"RID {asset_rid} is not an asset (table: {asset_table.name})")

        # Query the asset table for this record
        pb = self.pathBuilder()
        asset_path = pb.schemas[asset_table.schema.name].tables[asset_table.name]

        records = list(asset_path.filter(asset_path.RID == asset_rid).entities().fetch())
        if not records:
            raise DerivaMLException(f"Asset {asset_rid} not found in table {asset_table.name}")

        record = records[0]

        # Get asset types
        asset_types = []
        try:
            type_assoc_table, asset_fk, _ = self.model.find_association(asset_table, "Asset_Type")
            type_path = pb.schemas[type_assoc_table.schema.name].tables[type_assoc_table.name]
            types = list(
                type_path.filter(type_path.columns[asset_fk] == asset_rid)
                .attributes(type_path.Asset_Type)
                .fetch()
            )
            asset_types = [t["Asset_Type"] for t in types]
        except Exception:
            pass  # No type association for this asset table

        return Asset(
            catalog=self,  # type: ignore[arg-type]
            asset_rid=asset_rid,
            asset_table=asset_table.name,
            filename=record.get("Filename", ""),
            url=record.get("URL", ""),
            length=record.get("Length", 0),
            md5=record.get("MD5", ""),
            description=record.get("Description", ""),
            asset_types=asset_types,
        )

    def list_asset_tables(self) -> list[Table]:
        """List all asset tables in the catalog.

        Returns:
            List of Table objects that are asset tables.

        Example:
            >>> for table in ml.list_asset_tables():
            ...     print(f"Asset table: {table.name}")
        """
        tables = []
        # Include asset tables from all domain schemas
        for domain_schema in self.domain_schemas:
            if domain_schema in self.model.schemas:
                tables.extend([
                    t for t in self.model.schemas[domain_schema].tables.values()
                    if self.model.is_asset(t)
                ])
        # Also include ML schema asset tables (like Execution_Asset)
        tables.extend([
            t for t in self.model.schemas[self.ml_schema].tables.values()
            if self.model.is_asset(t)
        ])
        return tables

    def find_assets(
        self,
        asset_table: Table | str | None = None,
        asset_type: str | None = None,
    ) -> Iterable["Asset"]:
        """Find assets in the catalog.

        Returns an iterable of Asset objects matching the specified criteria.
        If no criteria are specified, returns all assets from all asset tables.

        Args:
            asset_table: Optional table or table name to search. If None, searches
                all asset tables.
            asset_type: Optional asset type to filter by. Only returns assets
                with this type.

        Returns:
            Iterable of Asset objects matching the criteria.

        Example:
            >>> # Find all assets in the Model table
            >>> models = list(ml.find_assets(asset_table="Model"))

            >>> # Find all assets with type "Training_Data"
            >>> training = list(ml.find_assets(asset_type="Training_Data"))

            >>> # Find all assets across all tables
            >>> all_assets = list(ml.find_assets())
        """
        # Determine which tables to search
        if asset_table is not None:
            tables = [self.model.name_to_table(asset_table)]
        else:
            tables = self.list_asset_tables()

        for table in tables:
            # Get all assets from this table (now returns Asset objects)
            for asset in self.list_assets(table):
                # Filter by asset type if specified
                if asset_type is not None:
                    if asset_type not in asset.asset_types:
                        continue
                yield asset
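For orientation, the snippet below sketches how the AssetMixin methods added in this hunk might be called from a DerivaML client. It is based only on the docstrings shown above; the hostname, catalog id, table names, and the `DerivaML(...)` constructor arguments are illustrative assumptions, not part of this diff.

# Hypothetical usage sketch for the AssetMixin API above (placeholders throughout).
from deriva_ml import DerivaML

ml = DerivaML("example-host.org", "1")  # constructor arguments assumed

# Create a new asset table; an "Image" asset type term is added automatically.
asset_table = ml.create_asset("Image", comment="Raw microscopy images")

# Enumerate assets in a table and inspect their catalog metadata.
for asset in ml.list_assets("Image"):
    print(asset.asset_rid, asset.filename, asset.asset_types)

# Search across all asset tables for a particular asset type.
training_assets = list(ml.find_assets(asset_type="Training_Data"))

# Resolve a single asset by RID and trace which executions produced it.
asset = ml.lookup_asset("3JSE")
for execution in ml.list_asset_executions(asset.asset_rid, asset_role="Output"):
    print("produced by", execution.execution_rid)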
deriva_ml/core/mixins/dataset.py (new file)
@@ -0,0 +1,237 @@
"""Dataset management mixin for DerivaML.

This module provides the DatasetMixin class which handles
dataset operations including finding, creating, looking up,
deleting, and managing dataset elements.
"""

from __future__ import annotations

from typing import TYPE_CHECKING, Any, Callable, Iterable

# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
import importlib
_ermrest_model = importlib.import_module("deriva.core.ermrest_model")
Table = _ermrest_model.Table

from pydantic import ConfigDict, validate_call

from deriva_ml.core.definitions import RID, MLVocab
from deriva_ml.core.exceptions import DerivaMLException, DerivaMLTableTypeError
from deriva_ml.dataset.aux_classes import DatasetSpec

if TYPE_CHECKING:
    from deriva_ml.dataset.dataset import Dataset
    from deriva_ml.dataset.dataset_bag import DatasetBag
    from deriva_ml.model.catalog import DerivaModel


class DatasetMixin:
    """Mixin providing dataset management operations.

    This mixin requires the host class to have:
    - model: DerivaModel instance
    - ml_schema: str - name of the ML schema
    - domain_schema: str - name of the domain schema
    - s3_bucket: str | None - S3 bucket URL for dataset storage
    - use_minid: bool - whether to use MINIDs
    - pathBuilder(): method returning catalog path builder
    - _dataset_table: property returning the Dataset table

    Methods:
        find_datasets: List all datasets in the catalog
        create_dataset: Create a new dataset
        lookup_dataset: Look up a dataset by RID or spec
        delete_dataset: Delete a dataset
        list_dataset_element_types: List types that can be added to datasets
        add_dataset_element_type: Add a new element type to datasets
        download_dataset_bag: Download a dataset as a bag
    """

    # Type hints for IDE support - actual attributes/methods from host class
    model: "DerivaModel"
    ml_schema: str
    domain_schemas: frozenset[str]
    default_schema: str | None
    s3_bucket: str | None
    use_minid: bool
    pathBuilder: Callable[[], Any]

    @property
    def _dataset_table(self) -> Table:
        """Get the Dataset table. Must be provided by host class."""
        raise NotImplementedError

    def find_datasets(self, deleted: bool = False) -> Iterable["Dataset"]:
        """List all datasets in the catalog.

        Args:
            deleted: If True, include datasets that have been marked as deleted.

        Returns:
            Iterable of Dataset objects.

        Example:
            >>> datasets = list(ml.find_datasets())
            >>> for ds in datasets:
            ...     print(f"{ds.dataset_rid}: {ds.description}")
        """
        # Import here to avoid circular imports
        from deriva_ml.dataset.dataset import Dataset

        # Get datapath to the Dataset table
        pb = self.pathBuilder()
        dataset_path = pb.schemas[self._dataset_table.schema.name].tables[self._dataset_table.name]

        if deleted:
            filtered_path = dataset_path
        else:
            filtered_path = dataset_path.filter(
                (dataset_path.Deleted == False) | (dataset_path.Deleted == None)  # noqa: E711, E712
            )

        # Create Dataset objects - dataset_types is now a property that fetches from catalog
        datasets = []
        for dataset in filtered_path.entities().fetch():
            datasets.append(
                Dataset(
                    self,  # type: ignore[arg-type]
                    dataset_rid=dataset["RID"],
                    description=dataset["Description"],
                )
            )
        return datasets

    def lookup_dataset(self, dataset: RID | DatasetSpec, deleted: bool = False) -> "Dataset":
        """Look up a dataset by RID or DatasetSpec.

        Args:
            dataset: Dataset RID or DatasetSpec to look up.
            deleted: If True, include datasets that have been marked as deleted.

        Returns:
            Dataset: The dataset object for the specified RID.

        Raises:
            DerivaMLException: If the dataset is not found.

        Example:
            >>> dataset = ml.lookup_dataset("4HM")
            >>> print(f"Version: {dataset.current_version}")
        """
        if isinstance(dataset, DatasetSpec):
            dataset_rid = dataset.rid
        else:
            dataset_rid = dataset

        try:
            return [ds for ds in self.find_datasets(deleted=deleted) if ds.dataset_rid == dataset_rid][0]
        except IndexError:
            raise DerivaMLException(f"Dataset {dataset_rid} not found.")

    def delete_dataset(self, dataset: "Dataset", recurse: bool = False) -> None:
        """Delete a dataset from the catalog.

        Args:
            dataset: The dataset to delete.
            recurse: If True, delete the dataset along with any nested datasets. (Default value = False)
        """
        # Get association table entries for this dataset_table
        # Delete association table entries
        dataset_rid = dataset.dataset_rid
        if not self.model.is_dataset_rid(dataset.dataset_rid):
            raise DerivaMLException("Dataset_rid is not a dataset.")

        if parents := dataset.list_dataset_parents():
            raise DerivaMLException(f'Dataset "{dataset}" is in a nested dataset: {parents}.')

        pb = self.pathBuilder()
        dataset_path = pb.schemas[self._dataset_table.schema.name].tables[self._dataset_table.name]

        # list_dataset_children returns Dataset objects, so extract their RIDs
        child_rids = [ds.dataset_rid for ds in dataset.list_dataset_children()] if recurse else []
        rid_list = [dataset_rid] + child_rids
        dataset_path.update([{"RID": r, "Deleted": True} for r in rid_list])

    def list_dataset_element_types(self) -> Iterable[Table]:
        """List the types of entities that can be added to a dataset.

        Returns:
            An iterable of Table objects that can be included as an element of a dataset.
        """

        def is_domain_or_dataset_table(table: Table) -> bool:
            return self.model.is_domain_schema(table.schema.name) or table.name == self._dataset_table.name

        return [t for a in self._dataset_table.find_associations() if is_domain_or_dataset_table(t := a.other_fkeys.pop().pk_table)]

    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
    def add_dataset_element_type(self, element: str | Table) -> Table:
        """Makes it possible to add objects from the specified table to a dataset.

        A dataset is a heterogeneous collection of objects, each of which comes from a different table.
        This routine adds the specified table as a valid element type for datasets.

        Args:
            element: Name of the table or table object that is to be added to the dataset.

        Returns:
            The table object that was added to the dataset.
        """
        # Import here to avoid circular imports
        from deriva_ml.dataset.catalog_graph import CatalogGraph

        # Add table to map
        element_table = self.model.name_to_table(element)
        atable_def = Table.define_association([self._dataset_table, element_table])
        try:
            table = self.model.create_table(atable_def)
        except ValueError as e:
            if "already exists" in str(e):
                table = self.model.name_to_table(atable_def["table_name"])
            else:
                raise e

        # self.model = self.catalog.getCatalogModel()
        annotations = CatalogGraph(self, s3_bucket=self.s3_bucket, use_minid=self.use_minid).generate_dataset_download_annotations()  # type: ignore[arg-type]
        self._dataset_table.annotations.update(annotations)
        self.model.model.apply()
        return table

    def download_dataset_bag(
        self,
        dataset: DatasetSpec,
    ) -> "DatasetBag":
        """Downloads a dataset to the local filesystem.

        Downloads a dataset specified by DatasetSpec to the local filesystem. If the catalog
        has s3_bucket configured and use_minid is enabled, the bag will be uploaded to S3
        and registered with the MINID service.

        Args:
            dataset: Specification of the dataset to download, including version and materialization options.

        Returns:
            DatasetBag: Object containing:
                - path: Local filesystem path to downloaded dataset
                - rid: Dataset's Resource Identifier
                - minid: Dataset's Minimal Viable Identifier (if MINID enabled)

        Note:
            MINID support requires s3_bucket to be configured when creating the DerivaML instance.
            The catalog's use_minid setting controls whether MINIDs are created.

        Examples:
            Download with default options:
            >>> spec = DatasetSpec(rid="1-abc123")
            >>> bag = ml.download_dataset_bag(dataset=spec)
            >>> print(f"Downloaded to {bag.path}")
        """
        if not self.model.is_dataset_rid(dataset.rid):
            raise DerivaMLTableTypeError("Dataset", dataset.rid)
        ds = self.lookup_dataset(dataset)
        return ds.download_dataset_bag(
            version=dataset.version,
            materialize=dataset.materialize,
            use_minid=self.use_minid,
        )
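As with the asset mixin, here is a minimal sketch of how the DatasetMixin surface added in this hunk might be exercised, derived from the docstrings above. The hostname, catalog id, RIDs, and the `DerivaML(...)` constructor arguments are placeholders assumed for illustration.

# Hypothetical usage sketch for the DatasetMixin API above (placeholders throughout).
from deriva_ml import DerivaML
from deriva_ml.dataset.aux_classes import DatasetSpec

ml = DerivaML("example-host.org", "1")  # constructor arguments assumed

# Allow records from the Image table to be collected into datasets.
ml.add_dataset_element_type("Image")

# Enumerate non-deleted datasets and look one up by RID.
for ds in ml.find_datasets():
    print(ds.dataset_rid, ds.description)
dataset = ml.lookup_dataset("4HM")

# Download a dataset as a bag; version and materialization come from the spec.
bag = ml.download_dataset_bag(DatasetSpec(rid=dataset.dataset_rid))
print("downloaded to", bag.path)

# Soft-delete a dataset (and, with recurse=True, its nested children) by setting the Deleted flag.
ml.delete_dataset(dataset, recurse=True)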