deriva-ml 1.17.10__py3-none-any.whl → 1.17.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/__init__.py +43 -1
- deriva_ml/asset/__init__.py +17 -0
- deriva_ml/asset/asset.py +357 -0
- deriva_ml/asset/aux_classes.py +100 -0
- deriva_ml/bump_version.py +254 -11
- deriva_ml/catalog/__init__.py +21 -0
- deriva_ml/catalog/clone.py +1199 -0
- deriva_ml/catalog/localize.py +426 -0
- deriva_ml/core/__init__.py +29 -0
- deriva_ml/core/base.py +817 -1067
- deriva_ml/core/config.py +169 -21
- deriva_ml/core/constants.py +120 -19
- deriva_ml/core/definitions.py +123 -13
- deriva_ml/core/enums.py +47 -73
- deriva_ml/core/ermrest.py +226 -193
- deriva_ml/core/exceptions.py +297 -14
- deriva_ml/core/filespec.py +99 -28
- deriva_ml/core/logging_config.py +225 -0
- deriva_ml/core/mixins/__init__.py +42 -0
- deriva_ml/core/mixins/annotation.py +915 -0
- deriva_ml/core/mixins/asset.py +384 -0
- deriva_ml/core/mixins/dataset.py +237 -0
- deriva_ml/core/mixins/execution.py +408 -0
- deriva_ml/core/mixins/feature.py +365 -0
- deriva_ml/core/mixins/file.py +263 -0
- deriva_ml/core/mixins/path_builder.py +145 -0
- deriva_ml/core/mixins/rid_resolution.py +204 -0
- deriva_ml/core/mixins/vocabulary.py +400 -0
- deriva_ml/core/mixins/workflow.py +322 -0
- deriva_ml/core/validation.py +389 -0
- deriva_ml/dataset/__init__.py +2 -1
- deriva_ml/dataset/aux_classes.py +20 -4
- deriva_ml/dataset/catalog_graph.py +575 -0
- deriva_ml/dataset/dataset.py +1242 -1008
- deriva_ml/dataset/dataset_bag.py +1311 -182
- deriva_ml/dataset/history.py +27 -14
- deriva_ml/dataset/upload.py +225 -38
- deriva_ml/demo_catalog.py +126 -110
- deriva_ml/execution/__init__.py +46 -2
- deriva_ml/execution/base_config.py +639 -0
- deriva_ml/execution/execution.py +543 -242
- deriva_ml/execution/execution_configuration.py +26 -11
- deriva_ml/execution/execution_record.py +592 -0
- deriva_ml/execution/find_caller.py +298 -0
- deriva_ml/execution/model_protocol.py +175 -0
- deriva_ml/execution/multirun_config.py +153 -0
- deriva_ml/execution/runner.py +595 -0
- deriva_ml/execution/workflow.py +223 -34
- deriva_ml/experiment/__init__.py +8 -0
- deriva_ml/experiment/experiment.py +411 -0
- deriva_ml/feature.py +6 -1
- deriva_ml/install_kernel.py +143 -6
- deriva_ml/interfaces.py +862 -0
- deriva_ml/model/__init__.py +99 -0
- deriva_ml/model/annotations.py +1278 -0
- deriva_ml/model/catalog.py +286 -60
- deriva_ml/model/database.py +144 -649
- deriva_ml/model/deriva_ml_database.py +308 -0
- deriva_ml/model/handles.py +14 -0
- deriva_ml/run_model.py +319 -0
- deriva_ml/run_notebook.py +507 -38
- deriva_ml/schema/__init__.py +18 -2
- deriva_ml/schema/annotations.py +62 -33
- deriva_ml/schema/create_schema.py +169 -69
- deriva_ml/schema/validation.py +601 -0
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/METADATA +4 -4
- deriva_ml-1.17.11.dist-info/RECORD +77 -0
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/WHEEL +1 -1
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/entry_points.txt +1 -0
- deriva_ml/protocols/dataset.py +0 -19
- deriva_ml/test.py +0 -94
- deriva_ml-1.17.10.dist-info/RECORD +0 -45
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,365 @@
+"""Feature management mixin for DerivaML.
+
+This module provides the FeatureMixin class, which handles
+feature operations including creating, looking up, deleting,
+and listing feature values.
+"""
+
+from __future__ import annotations
+
+from itertools import chain
+from typing import TYPE_CHECKING, Any, Callable, Iterable
+
+# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
+import importlib
+datapath = importlib.import_module("deriva.core.datapath")
+_ermrest_model = importlib.import_module("deriva.core.ermrest_model")
+Key = _ermrest_model.Key
+Table = _ermrest_model.Table
+
+from pydantic import ConfigDict, validate_call
+
+from deriva_ml.core.definitions import ColumnDefinition, VocabularyTerm
+from deriva_ml.core.exceptions import DerivaMLException
+from deriva_ml.feature import Feature, FeatureRecord
+
+if TYPE_CHECKING:
+    from deriva_ml.model.catalog import DerivaModel
+
+
+class FeatureMixin:
+    """Mixin providing feature management operations.
+
+    This mixin requires the host class to have:
+    - model: DerivaModel instance
+    - ml_schema: str - name of the ML schema
+    - domain_schemas: frozenset[str] - names of the domain schemas
+    - pathBuilder(): method returning catalog path builder
+    - add_term(): method for adding vocabulary terms (from VocabularyMixin)
+    - apply_catalog_annotations(): method to update navbar (from DerivaML base class)
+
+    Methods:
+        create_feature: Create a new feature definition
+        feature_record_class: Get pydantic model class for feature records
+        delete_feature: Remove a feature definition
+        lookup_feature: Retrieve a Feature object
+        find_features: Find all features in the catalog, optionally filtered by table
+        list_feature_values: Get all values for a feature
+    """
+
+    # Type hints for IDE support - actual attributes/methods from host class
+    model: "DerivaModel"
+    ml_schema: str
+    domain_schemas: frozenset[str]
+    default_schema: str | None
+    pathBuilder: Callable[[], Any]
+    add_term: Callable[..., VocabularyTerm]
+    apply_catalog_annotations: Callable[[], None]
+
+    def create_feature(
+        self,
+        target_table: Table | str,
+        feature_name: str,
+        terms: list[Table | str] | None = None,
+        assets: list[Table | str] | None = None,
+        metadata: list[ColumnDefinition | Table | Key | str] | None = None,
+        optional: list[str] | None = None,
+        comment: str = "",
+        update_navbar: bool = True,
+    ) -> type[FeatureRecord]:
+        """Creates a new feature definition.
+
+        A feature represents a measurable property or characteristic that can be associated with records in the target
+        table. Features can include vocabulary terms, asset references, and additional metadata.
+
+        **Side Effects**:
+        This method dynamically creates:
+        1. A new association table in the domain schema to store feature values
+        2. A Pydantic model class (subclass of FeatureRecord) for creating validated feature instances
+
+        The returned Pydantic model class provides type-safe construction of feature records with
+        automatic validation of values against the feature's definition (vocabulary terms, asset
+        references, etc.). Use this class to create feature instances that can be inserted into
+        the catalog.
+
+        Args:
+            target_table: Table to associate the feature with (name or Table object).
+            feature_name: Unique name for the feature within the target table.
+            terms: Optional vocabulary tables/names whose terms can be used as feature values.
+            assets: Optional asset tables/names that can be referenced by this feature.
+            metadata: Optional columns, tables, or keys to include in the feature definition.
+            optional: Column names that are not required when creating feature instances.
+            comment: Description of the feature's purpose and usage.
+            update_navbar: If True (default), automatically updates the navigation bar to include
+                the new feature table. Set to False during batch feature creation to avoid
+                redundant updates, then call apply_catalog_annotations() once at the end.
+
+        Returns:
+            type[FeatureRecord]: A dynamically generated Pydantic model class for creating
+                validated feature instances. The class has fields corresponding to the feature's
+                terms, assets, and metadata columns.
+
+        Raises:
+            DerivaMLException: If the feature definition is invalid or conflicts with existing features.
+
+        Examples:
+            Create a feature with a confidence score:
+            >>> DiagnosisFeature = ml.create_feature(
+            ...     target_table="Image",
+            ...     feature_name="Diagnosis",
+            ...     terms=["Diagnosis_Type"],
+            ...     metadata=[ColumnDefinition(name="confidence", type=BuiltinTypes.float4)],
+            ...     comment="Clinical diagnosis label"
+            ... )
+            >>> # Use the returned class to create validated feature instances
+            >>> record = DiagnosisFeature(
+            ...     Image="1-ABC",  # Target record RID
+            ...     Diagnosis_Type="Normal",  # Vocabulary term
+            ...     confidence=0.95,
+            ...     Execution="2-XYZ"  # Execution that produced this value
+            ... )
+        """
+        # Initialize empty collections if None provided
+        terms = terms or []
+        assets = assets or []
+        metadata = metadata or []
+        optional = optional or []
+
+        def normalize_metadata(m: Key | Table | ColumnDefinition | str | dict) -> Key | Table | dict:
+            """Helper function to normalize metadata references.
+
+            Handles:
+            - str: Table name, converted to Table object
+            - ColumnDefinition: Dataclass with to_dict() method
+            - dict: Already in dict format (from Column.define())
+            - Key/Table: Passed through unchanged
+            """
+            if isinstance(m, str):
+                return self.model.name_to_table(m)
+            elif isinstance(m, dict):
+                # Already a dict (e.g., from Column.define())
+                return m
+            elif hasattr(m, "to_dict"):
+                # ColumnDefinition or similar dataclass
+                return m.to_dict()
+            else:
+                return m
+
+        # Validate asset and term tables
+        if not all(map(self.model.is_asset, assets)):
+            raise DerivaMLException("Invalid create_feature asset table.")
+        if not all(map(self.model.is_vocabulary, terms)):
+            raise DerivaMLException("Invalid create_feature term table.")
+
+        # Get references to required tables
+        target_table = self.model.name_to_table(target_table)
+        execution = self.model.schemas[self.ml_schema].tables["Execution"]
+        feature_name_table = self.model.schemas[self.ml_schema].tables["Feature_Name"]
+
+        # Add feature name to vocabulary
+        feature_name_term = self.add_term("Feature_Name", feature_name, description=comment)
+        atable_name = f"Execution_{target_table.name}_{feature_name_term.name}"
+        # Create an association table implementing the feature
+        atable = self.model.create_table(
+            target_table.define_association(
+                table_name=atable_name,
+                associates=[execution, target_table, feature_name_table],
+                metadata=[normalize_metadata(m) for m in chain(assets, terms, metadata)],
+                comment=comment,
+            )
+        )
+        # Configure optional columns and default feature name
+        for c in optional:
+            atable.columns[c].alter(nullok=True)
+        atable.columns["Feature_Name"].alter(default=feature_name_term.name)
+
+        # Update navbar to include the new feature table
+        if update_navbar:
+            self.apply_catalog_annotations()
+
+        # Return feature record class for creating instances
+        return self.feature_record_class(target_table, feature_name)
+
+    def feature_record_class(self, table: str | Table, feature_name: str) -> type[FeatureRecord]:
+        """Returns a dynamically generated Pydantic model class for creating feature records.
+
+        Each feature has a unique set of columns based on its definition (terms, assets, metadata).
+        This method returns a Pydantic class with fields corresponding to those columns, providing:
+
+        - **Type validation**: Values are validated against expected types (str, int, float, Path)
+        - **Required field checking**: Non-nullable columns must be provided
+        - **Default values**: Feature_Name is pre-filled with the feature's name
+
+        **Field types in the generated class:**
+        - `{TargetTable}` (str): Required. RID of the target record (e.g., Image RID)
+        - `Execution` (str, optional): RID of the execution for provenance tracking
+        - `Feature_Name` (str): Pre-filled with the feature name
+        - Term columns (str): Accept vocabulary term names
+        - Asset columns (str | Path): Accept asset RIDs or file paths
+        - Value columns: Accept values matching the column type (int, float, str)
+
+        Use `lookup_feature()` to inspect the feature's structure and see what columns
+        are available.
+
+        Args:
+            table: The table containing the feature, either as name or Table object.
+            feature_name: Name of the feature to create a record class for.
+
+        Returns:
+            type[FeatureRecord]: A Pydantic model class for creating validated feature records.
+                The class name follows the pattern `{TargetTable}Feature{FeatureName}`.
+
+        Raises:
+            DerivaMLException: If the feature doesn't exist or the table is invalid.
+
+        Example:
+            >>> # Get the dynamically generated class
+            >>> DiagnosisFeature = ml.feature_record_class("Image", "Diagnosis")
+            >>>
+            >>> # Create a validated feature record
+            >>> record = DiagnosisFeature(
+            ...     Image="1-ABC",  # Target record RID
+            ...     Diagnosis_Type="Normal",  # Vocabulary term
+            ...     confidence=0.95,  # Metadata column
+            ...     Execution="2-XYZ"  # Provenance
+            ... )
+            >>>
+            >>> # Convert to dict for insertion
+            >>> record.model_dump()
+            {'Image': '1-ABC', 'Diagnosis_Type': 'Normal', 'confidence': 0.95, ...}
+        """
+        # Look up the feature and return its record class
+        return self.lookup_feature(table, feature_name).feature_record_class()
+
+    def delete_feature(self, table: Table | str, feature_name: str) -> bool:
+        """Removes a feature definition and its data.
+
+        Deletes the feature and its implementation table from the catalog. This operation cannot be undone and
+        will remove all feature values associated with this feature.
+
+        Args:
+            table: The table containing the feature, either as name or Table object.
+            feature_name: Name of the feature to delete.
+
+        Returns:
+            bool: True if the feature was successfully deleted, False if it didn't exist.
+
+        Raises:
+            DerivaMLException: If deletion fails due to constraints or permissions.
+
+        Example:
+            >>> success = ml.delete_feature("samples", "obsolete_feature")
+            >>> print("Deleted" if success else "Not found")
+        """
+        # Get table reference and find feature
+        table = self.model.name_to_table(table)
+        try:
+            # Find and delete the feature's implementation table
+            feature = next(f for f in self.model.find_features(table) if f.feature_name == feature_name)
+            feature.feature_table.drop()
+            return True
+        except StopIteration:
+            return False
+
+    def lookup_feature(self, table: str | Table, feature_name: str) -> Feature:
+        """Retrieves a Feature object.
+
+        Looks up and returns a Feature object that provides an interface to work with an existing feature
+        definition in the catalog.
+
+        Args:
+            table: The table containing the feature, either as name or Table object.
+            feature_name: Name of the feature to look up.
+
+        Returns:
+            Feature: An object representing the feature and its implementation.
+
+        Raises:
+            DerivaMLException: If the feature doesn't exist in the specified table.
+
+        Example:
+            >>> feature = ml.lookup_feature("samples", "expression_level")
+            >>> print(feature.feature_name)
+            'expression_level'
+        """
+        return self.model.lookup_feature(table, feature_name)
+
+    def find_features(self, table: str | Table | None = None) -> list[Feature]:
+        """Find features in the catalog.
+
+        Catalog-level operation to find feature definitions. If a table is specified,
+        returns only features for that table. If no table is specified, returns all
+        features across all tables in the catalog.
+
+        Args:
+            table: Optional table to find features for. If None, returns all features
+                in the catalog.
+
+        Returns:
+            A list of Feature instances describing the features.
+
+        Examples:
+            Find all features in the catalog:
+            >>> all_features = ml.find_features()
+            >>> for f in all_features:
+            ...     print(f"{f.target_table.name}.{f.feature_name}")
+
+            Find features for a specific table:
+            >>> image_features = ml.find_features("Image")
+            >>> print([f.feature_name for f in image_features])
+        """
+        return list(self.model.find_features(table))
+
+    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
+    def list_feature_values(
+        self, table: Table | str, feature_name: str
+    ) -> Iterable[FeatureRecord]:
+        """Retrieves all values for a feature as typed FeatureRecord instances.
+
+        Returns an iterator of dynamically generated FeatureRecord objects for each
+        feature value. Each record is an instance of a Pydantic model specific to
+        this feature, with typed attributes for all columns, including the Execution
+        that created the feature value.
+
+        Args:
+            table: The table containing the feature, either as name or Table object.
+            feature_name: Name of the feature to retrieve values for.
+
+        Returns:
+            Iterable[FeatureRecord]: An iterator of FeatureRecord instances.
+                Each instance has:
+                - Execution: RID of the execution that created this feature value
+                - Feature_Name: Name of the feature
+                - All feature-specific columns as typed attributes
+                - model_dump() method to convert back to a dictionary
+
+        Raises:
+            DerivaMLException: If the feature doesn't exist or cannot be accessed.
+
+        Example:
+            >>> # Get typed feature records
+            >>> for record in ml.list_feature_values("Image", "Quality"):
+            ...     print(f"Image {record.Image}: {record.ImageQuality}")
+            ...     print(f"Created by execution: {record.Execution}")
+
+            >>> # Convert records to dictionaries
+            >>> records = list(ml.list_feature_values("Image", "Quality"))
+            >>> dicts = [r.model_dump() for r in records]
+        """
+        # Get table and feature
+        table = self.model.name_to_table(table)
+        feature = self.lookup_feature(table, feature_name)
+
+        # Get the dynamically generated FeatureRecord subclass for this feature
+        record_class = feature.feature_record_class()
+
+        # Build and execute a query for the feature values
+        pb = self.pathBuilder()
+        raw_values = pb.schemas[feature.feature_table.schema.name].tables[feature.feature_table.name].entities().fetch()
+
+        for raw_value in raw_values:
+            # Create a record instance from the raw dictionary,
+            # filtered to the fields that the record class expects.
+            field_names = set(record_class.model_fields.keys())
+            filtered_data = {k: v for k, v in raw_value.items() if k in field_names}
+            yield record_class(**filtered_data)
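
The docstrings above describe a two-step workflow: define a feature once with create_feature(), then use the returned Pydantic class to build validated records and read them back with list_feature_values(). A minimal sketch of that workflow, stitched together from the docstring examples; it assumes an already-connected DerivaML instance `ml` that composes FeatureMixin, and that BuiltinTypes is importable alongside ColumnDefinition (an assumption, since this diff does not show where BuiltinTypes lives):

    # Sketch only: `ml` is a hypothetical, connected DerivaML instance.
    from deriva_ml.core.definitions import ColumnDefinition, BuiltinTypes  # BuiltinTypes location assumed

    # Define the feature; this creates the association table and returns a record class.
    DiagnosisFeature = ml.create_feature(
        target_table="Image",
        feature_name="Diagnosis",
        terms=["Diagnosis_Type"],
        metadata=[ColumnDefinition(name="confidence", type=BuiltinTypes.float4)],
        comment="Clinical diagnosis label",
    )

    # Build a validated record; Pydantic rejects missing or mistyped fields.
    record = DiagnosisFeature(Image="1-ABC", Diagnosis_Type="Normal", confidence=0.95, Execution="2-XYZ")

    # Read the stored values back as typed records.
    for rec in ml.list_feature_values("Image", "Diagnosis"):
        print(rec.model_dump())
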
@@ -0,0 +1,263 @@
+"""File management mixin for DerivaML.
+
+This module provides the FileMixin class, which handles
+file operations including adding and listing files.
+"""
+
+from __future__ import annotations
+
+from collections import defaultdict
+from itertools import chain
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Callable, Iterable
+from urllib.parse import urlsplit
+
+# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
+import importlib
+datapath = importlib.import_module("deriva.core.datapath")
+
+from deriva_ml.core.definitions import RID, FileSpec, MLTable, MLVocab, VocabularyTerm
+from deriva_ml.core.exceptions import DerivaMLInvalidTerm, DerivaMLTableTypeError
+from deriva_ml.dataset.aux_classes import DatasetVersion
+from deriva_ml.dataset.history import iso_to_snap
+
+if TYPE_CHECKING:
+    from deriva.core.ermrest_catalog import ResolveRidResult
+
+    from deriva_ml.dataset.dataset import Dataset
+    from deriva_ml.model.catalog import DerivaModel
+
+
+class FileMixin:
+    """Mixin providing file management operations.
+
+    This mixin requires the host class to have:
+    - model: DerivaModel instance
+    - ml_schema: str - name of the ML schema
+    - pathBuilder(): method returning catalog path builder
+    - resolve_rid(): method for RID resolution (from RidResolutionMixin)
+    - lookup_term(): method for vocabulary lookup (from VocabularyMixin)
+    - list_vocabulary_terms(): method for listing vocab terms (from VocabularyMixin)
+    - find_datasets(): method for finding datasets (from DatasetMixin)
+
+    Methods:
+        add_files: Add files to the catalog with metadata
+        list_files: List files in the catalog
+        _bootstrap_versions: Initialize dataset versions
+        _synchronize_dataset_versions: Sync dataset versions
+        _set_version_snapshot: Update version snapshots
+    """
+
+    # Type hints for IDE support - actual attributes/methods from host class
+    model: "DerivaModel"
+    ml_schema: str
+    pathBuilder: Callable[[], Any]
+    resolve_rid: Callable[[RID], "ResolveRidResult"]
+    lookup_term: Callable[[str, str], VocabularyTerm]
+    list_vocabulary_terms: Callable[[str], list[VocabularyTerm]]
+    find_datasets: Callable[..., Iterable["Dataset"]]
+
+    def add_files(
+        self,
+        files: Iterable[FileSpec],
+        execution_rid: RID,
+        dataset_types: str | list[str] | None = None,
+        description: str = "",
+    ) -> "Dataset":
+        """Adds files to the catalog with their metadata.
+
+        Registers files in the catalog along with their metadata (MD5, length, URL) and associates them with
+        specified file types. Links files to the specified execution record for provenance tracking.
+
+        Args:
+            files: File specifications containing MD5 checksum, length, and URL.
+            execution_rid: Execution RID to associate files with (required for provenance).
+            dataset_types: One or more dataset type terms from the Dataset_Type vocabulary.
+            description: Description of the files.
+
+        Returns:
+            Dataset: Dataset that represents the newly added files.
+
+        Raises:
+            DerivaMLException: If file_types are invalid or execution_rid is not an execution record.
+
+        Examples:
+            Add files via an execution:
+            >>> with ml.create_execution(config) as exe:
+            ...     files = [FileSpec(url="path/to/file.txt", md5="abc123", length=1000)]
+            ...     dataset = exe.add_files(files, dataset_types="text")
+        """
+        # Import here to avoid circular imports
+        from deriva_ml.dataset.dataset import Dataset
+
+        if self.resolve_rid(execution_rid).table.name != "Execution":
+            raise DerivaMLTableTypeError("Execution", execution_rid)
+
+        filespec_list = list(files)
+
+        # Get a list of all defined file types and their synonyms.
+        defined_types = set(
+            chain.from_iterable([[t.name] + list(t.synonyms or []) for t in self.list_vocabulary_terms(MLVocab.asset_type)])
+        )
+
+        # Get a list of all of the file types used in the filespec_list.
+        spec_types = set(chain.from_iterable(filespec.file_types for filespec in filespec_list))
+
+        # Make sure that all of the file types in the spec list are defined.
+        if spec_types - defined_types:
+            raise DerivaMLInvalidTerm(MLVocab.asset_type.name, f"{spec_types - defined_types}")
+
+        # Normalize dataset_types and make sure the File type is included.
+        if isinstance(dataset_types, list):
+            dataset_types = ["File"] + dataset_types if "File" not in dataset_types else dataset_types
+        else:
+            dataset_types = ["File", dataset_types] if dataset_types else ["File"]
+        for ds_type in dataset_types:
+            self.lookup_term(MLVocab.dataset_type, ds_type)
+
+        # Add files to the file table, and collect the resulting entries by directory name.
+        pb = self.pathBuilder()
+        file_records = list(
+            pb.schemas[self.ml_schema].tables["File"].insert([f.model_dump(by_alias=True) for f in filespec_list])
+        )
+
+        # Get the name of the association table between the file table and file type, and add file_type records.
+        atable = self.model.find_association(MLTable.file, MLVocab.asset_type)[0].name
+        # Map each file (by MD5) to the file types it should be linked to.
+        type_map = {
+            file_spec.md5: file_spec.file_types
+            for file_spec in filespec_list
+        }
+        file_type_records = [
+            {MLVocab.asset_type.value: file_type, "File": file_record["RID"]}
+            for file_record in file_records
+            for file_type in type_map[file_record["MD5"]]
+        ]
+        pb.schemas[self.ml_schema].tables[atable].insert(file_type_records)
+
+        # Link files to the execution for provenance tracking.
+        pb.schemas[self.ml_schema].File_Execution.insert(
+            [
+                {"File": file_record["RID"], "Execution": execution_rid, "Asset_Role": "Output"}
+                for file_record in file_records
+            ]
+        )
+
+        # Now create datasets to capture the original directory structure of the files.
+        dir_rid_map = defaultdict(list)
+        for e in file_records:
+            dir_rid_map[Path(urlsplit(e["URL"]).path).parent].append(e["RID"])
+
+        nested_datasets = []
+        path_length = 0
+        dataset = None
+        # Start with the longest path so we get subdirectories first.
+        for p, rids in sorted(dir_rid_map.items(), key=lambda kv: len(kv[0].parts), reverse=True):
+            dataset = Dataset.create_dataset(
+                self,  # type: ignore[arg-type]
+                dataset_types=dataset_types,
+                execution_rid=execution_rid,
+                description=description,
+            )
+            members = rids
+            if len(p.parts) < path_length:
+                # Going up one level in the directory tree, so create a nested dataset.
+                members = [m.dataset_rid for m in nested_datasets] + rids
+                nested_datasets = []
+            dataset.add_dataset_members(members=members, execution_rid=execution_rid)
+            nested_datasets.append(dataset)
+            path_length = len(p.parts)
+
+        return dataset
+
+    def _bootstrap_versions(self) -> None:
+        """Initialize dataset versions for datasets that don't have versions."""
+        datasets = [ds.dataset_rid for ds in self.find_datasets()]
+        ds_version = [
+            {
+                "Dataset": d,
+                "Version": "0.1.0",
+                "Description": "Dataset at the time of conversion to versioned datasets",
+            }
+            for d in datasets
+        ]
+        schema_path = self.pathBuilder().schemas[self.ml_schema]
+        version_path = schema_path.tables["Dataset_Version"]
+        dataset_path = schema_path.tables["Dataset"]
+        history = list(version_path.insert(ds_version))
+        dataset_versions = [{"RID": h["Dataset"], "Version": h["Version"]} for h in history]
+        dataset_path.update(dataset_versions)
+
+    def _synchronize_dataset_versions(self) -> None:
+        """Synchronize dataset versions with the latest version in the Dataset_Version table."""
+        schema_path = self.pathBuilder().schemas[self.ml_schema]
+        dataset_version_path = schema_path.tables["Dataset_Version"]
+        # Get the maximum version number for each dataset.
+        versions = {}
+        for v in dataset_version_path.entities().fetch():
+            if v["Version"] > versions.get(v["Dataset"], {"Version": DatasetVersion(0, 0, 0)})["Version"]:
+                versions[v["Dataset"]] = v
+        dataset_path = schema_path.tables["Dataset"]
+        dataset_path.update([{"RID": dataset, "Version": version["RID"]} for dataset, version in versions.items()])
+
+    def _set_version_snapshot(self) -> None:
+        """Update the Snapshot column of the Dataset_Version table to the correct time."""
+        dataset_version_path = self.pathBuilder().schemas[self.model.ml_schema].tables["Dataset_Version"]
+        versions = dataset_version_path.entities().fetch()
+        dataset_version_path.update(
+            [{"RID": h["RID"], "Snapshot": iso_to_snap(h["RCT"])} for h in versions if not h["Snapshot"]]
+        )
+
+    def list_files(self, file_types: list[str] | None = None) -> list[dict[str, Any]]:
+        """Lists files in the catalog with their metadata.
+
+        Returns a list of files with their metadata, including URL, MD5 hash, length, description,
+        and associated file types. Files can optionally be filtered by type.
+
+        Args:
+            file_types: Filter results to only include these file types.
+
+        Returns:
+            list[dict[str, Any]]: List of file records, each containing:
+                - RID: Resource identifier
+                - URL: File location
+                - MD5: File hash
+                - Length: File size
+                - Description: File description
+                - File_Types: List of associated file types
+
+        Examples:
+            List all files:
+            >>> files = ml.list_files()
+            >>> for f in files:
+            ...     print(f"{f['RID']}: {f['URL']}")
+
+            Filter by file type:
+            >>> image_files = ml.list_files(["image", "png"])
+        """
+        asset_type_atable, file_fk, asset_type_fk = self.model.find_association("File", "Asset_Type")
+        ml_path = self.pathBuilder().schemas[self.ml_schema]
+        file = ml_path.File
+        asset_type = ml_path.tables[asset_type_atable.name]
+
+        path = file.path
+        path = path.link(asset_type.alias("AT"), on=file.RID == asset_type.columns[file_fk], join_type="left")
+        if file_types:
+            path = path.filter(asset_type.columns[asset_type_fk] == datapath.Any(*file_types))
+        path = path.attributes(
+            path.File.RID,
+            path.File.URL,
+            path.File.MD5,
+            path.File.Length,
+            path.File.Description,
+            path.AT.columns[asset_type_fk],
+        )
+
+        file_map = {}
+        for f in path.fetch():
+            entry = file_map.setdefault(f["RID"], {**f, "File_Types": []})
+            if ft := f.get("Asset_Type"):  # assign-and-test in one go
+                entry["File_Types"].append(ft)
+
+        # Drop the redundant Asset_Type key and return the result.
+        return [(f, f.pop("Asset_Type"))[0] for f in file_map.values()]
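
As the add_files docstring shows, files are registered from inside an execution so provenance is captured, and add_files then builds one dataset per source directory, nesting them to mirror the directory tree. A minimal sketch based on the docstring examples; `ml` and `config` are assumed to be a connected DerivaML instance and an execution configuration, and the FileSpec field names follow the docstring:

    from deriva_ml.core.definitions import FileSpec

    files = [FileSpec(url="path/to/file.txt", md5="abc123", length=1000)]
    with ml.create_execution(config) as exe:
        # Registers the files, links them to this execution, and returns the top-level dataset.
        dataset = exe.add_files(files, dataset_types="text")

    # Later, list registered files, optionally filtered by asset type.
    for f in ml.list_files(["text"]):
        print(f["RID"], f["URL"], f["File_Types"])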