deriva-ml 1.17.9__py3-none-any.whl → 1.17.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. deriva_ml/__init__.py +43 -1
  2. deriva_ml/asset/__init__.py +17 -0
  3. deriva_ml/asset/asset.py +357 -0
  4. deriva_ml/asset/aux_classes.py +100 -0
  5. deriva_ml/bump_version.py +254 -11
  6. deriva_ml/catalog/__init__.py +21 -0
  7. deriva_ml/catalog/clone.py +1199 -0
  8. deriva_ml/catalog/localize.py +426 -0
  9. deriva_ml/core/__init__.py +29 -0
  10. deriva_ml/core/base.py +817 -1067
  11. deriva_ml/core/config.py +169 -21
  12. deriva_ml/core/constants.py +120 -19
  13. deriva_ml/core/definitions.py +123 -13
  14. deriva_ml/core/enums.py +47 -73
  15. deriva_ml/core/ermrest.py +226 -193
  16. deriva_ml/core/exceptions.py +297 -14
  17. deriva_ml/core/filespec.py +99 -28
  18. deriva_ml/core/logging_config.py +225 -0
  19. deriva_ml/core/mixins/__init__.py +42 -0
  20. deriva_ml/core/mixins/annotation.py +915 -0
  21. deriva_ml/core/mixins/asset.py +384 -0
  22. deriva_ml/core/mixins/dataset.py +237 -0
  23. deriva_ml/core/mixins/execution.py +408 -0
  24. deriva_ml/core/mixins/feature.py +365 -0
  25. deriva_ml/core/mixins/file.py +263 -0
  26. deriva_ml/core/mixins/path_builder.py +145 -0
  27. deriva_ml/core/mixins/rid_resolution.py +204 -0
  28. deriva_ml/core/mixins/vocabulary.py +400 -0
  29. deriva_ml/core/mixins/workflow.py +322 -0
  30. deriva_ml/core/validation.py +389 -0
  31. deriva_ml/dataset/__init__.py +2 -1
  32. deriva_ml/dataset/aux_classes.py +20 -4
  33. deriva_ml/dataset/catalog_graph.py +575 -0
  34. deriva_ml/dataset/dataset.py +1242 -1008
  35. deriva_ml/dataset/dataset_bag.py +1311 -182
  36. deriva_ml/dataset/history.py +27 -14
  37. deriva_ml/dataset/upload.py +225 -38
  38. deriva_ml/demo_catalog.py +186 -105
  39. deriva_ml/execution/__init__.py +46 -2
  40. deriva_ml/execution/base_config.py +639 -0
  41. deriva_ml/execution/execution.py +545 -244
  42. deriva_ml/execution/execution_configuration.py +26 -11
  43. deriva_ml/execution/execution_record.py +592 -0
  44. deriva_ml/execution/find_caller.py +298 -0
  45. deriva_ml/execution/model_protocol.py +175 -0
  46. deriva_ml/execution/multirun_config.py +153 -0
  47. deriva_ml/execution/runner.py +595 -0
  48. deriva_ml/execution/workflow.py +224 -35
  49. deriva_ml/experiment/__init__.py +8 -0
  50. deriva_ml/experiment/experiment.py +411 -0
  51. deriva_ml/feature.py +6 -1
  52. deriva_ml/install_kernel.py +143 -6
  53. deriva_ml/interfaces.py +862 -0
  54. deriva_ml/model/__init__.py +99 -0
  55. deriva_ml/model/annotations.py +1278 -0
  56. deriva_ml/model/catalog.py +286 -60
  57. deriva_ml/model/database.py +144 -649
  58. deriva_ml/model/deriva_ml_database.py +308 -0
  59. deriva_ml/model/handles.py +14 -0
  60. deriva_ml/run_model.py +319 -0
  61. deriva_ml/run_notebook.py +507 -38
  62. deriva_ml/schema/__init__.py +18 -2
  63. deriva_ml/schema/annotations.py +62 -33
  64. deriva_ml/schema/create_schema.py +169 -69
  65. deriva_ml/schema/validation.py +601 -0
  66. {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/METADATA +4 -5
  67. deriva_ml-1.17.11.dist-info/RECORD +77 -0
  68. {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/WHEEL +1 -1
  69. {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/entry_points.txt +2 -0
  70. deriva_ml/protocols/dataset.py +0 -19
  71. deriva_ml/test.py +0 -94
  72. deriva_ml-1.17.9.dist-info/RECORD +0 -45
  73. {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/licenses/LICENSE +0 -0
  74. {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,365 @@
+ """Feature management mixin for DerivaML.
+
+ This module provides the FeatureMixin class which handles
+ feature operations including creating, looking up, deleting,
+ and listing feature values.
+ """
+
+ from __future__ import annotations
+
+ from itertools import chain
+ from typing import TYPE_CHECKING, Any, Callable, Iterable
+
+ # Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
+ import importlib
+ datapath = importlib.import_module("deriva.core.datapath")
+ _ermrest_model = importlib.import_module("deriva.core.ermrest_model")
+ Key = _ermrest_model.Key
+ Table = _ermrest_model.Table
+
+ from pydantic import ConfigDict, validate_call
+
+ from deriva_ml.core.definitions import ColumnDefinition, VocabularyTerm
+ from deriva_ml.core.exceptions import DerivaMLException
+ from deriva_ml.feature import Feature, FeatureRecord
+
+ if TYPE_CHECKING:
+     from deriva_ml.model.catalog import DerivaModel
+
+
+ class FeatureMixin:
+     """Mixin providing feature management operations.
+
+     This mixin requires the host class to have:
+     - model: DerivaModel instance
+     - ml_schema: str - name of the ML schema
+     - domain_schemas: frozenset[str] - names of the domain schemas
+     - pathBuilder(): method returning catalog path builder
+     - add_term(): method for adding vocabulary terms (from VocabularyMixin)
+     - apply_catalog_annotations(): method to update navbar (from DerivaML base class)
+
+     (An illustrative composition sketch follows the attribute declarations below.)
+
+     Methods:
+         create_feature: Create a new feature definition
+         feature_record_class: Get pydantic model class for feature records
+         delete_feature: Remove a feature definition
+         lookup_feature: Retrieve a Feature object
+         find_features: Find all features in the catalog, optionally filtered by table
+         list_feature_values: Get all values for a feature
+     """
+
+     # Type hints for IDE support - actual attributes/methods from host class
+     model: "DerivaModel"
+     ml_schema: str
+     domain_schemas: frozenset[str]
+     default_schema: str | None
+     pathBuilder: Callable[[], Any]
+     add_term: Callable[..., VocabularyTerm]
+     apply_catalog_annotations: Callable[[], None]
+
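+     # Composition sketch (editor's note, illustrative only): the contract above
+     # is satisfied by mixing this class into a host alongside the mixins that
+     # provide the remaining methods, roughly:
+     #
+     #     class DerivaML(FeatureMixin, VocabularyMixin, ...):  # names illustrative
+     #         ...
+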
+     def create_feature(
+         self,
+         target_table: Table | str,
+         feature_name: str,
+         terms: list[Table | str] | None = None,
+         assets: list[Table | str] | None = None,
+         metadata: list[ColumnDefinition | Table | Key | str] | None = None,
+         optional: list[str] | None = None,
+         comment: str = "",
+         update_navbar: bool = True,
+     ) -> type[FeatureRecord]:
+         """Creates a new feature definition.
+
+         A feature represents a measurable property or characteristic that can be associated with records in the
+         target table. Features can include vocabulary terms, asset references, and additional metadata.
+
+         **Side Effects**:
+         This method dynamically creates:
+         1. A new association table in the domain schema to store feature values
+         2. A Pydantic model class (subclass of FeatureRecord) for creating validated feature instances
+
+         The returned Pydantic model class provides type-safe construction of feature records with
+         automatic validation of values against the feature's definition (vocabulary terms, asset
+         references, etc.). Use this class to create feature instances that can be inserted into
+         the catalog.
+
+         Args:
+             target_table: Table to associate the feature with (name or Table object).
+             feature_name: Unique name for the feature within the target table.
+             terms: Optional vocabulary tables/names whose terms can be used as feature values.
+             assets: Optional asset tables/names that can be referenced by this feature.
+             metadata: Optional columns, tables, or keys to include in the feature definition.
+             optional: Column names that are not required when creating feature instances.
+             comment: Description of the feature's purpose and usage.
+             update_navbar: If True (default), automatically updates the navigation bar to include
+                 the new feature table. Set to False during batch feature creation to avoid
+                 redundant updates, then call apply_catalog_annotations() once at the end
+                 (see the sketch following this method).
+
+         Returns:
+             type[FeatureRecord]: A dynamically generated Pydantic model class for creating
+                 validated feature instances. The class has fields corresponding to the feature's
+                 terms, assets, and metadata columns.
+
+         Raises:
+             DerivaMLException: If the feature definition is invalid or conflicts with existing features.
+
+         Examples:
+             Create a feature with a confidence score:
+             >>> DiagnosisFeature = ml.create_feature(
+             ...     target_table="Image",
+             ...     feature_name="Diagnosis",
+             ...     terms=["Diagnosis_Type"],
+             ...     metadata=[ColumnDefinition(name="confidence", type=BuiltinTypes.float4)],
+             ...     comment="Clinical diagnosis label",
+             ... )
+             >>> # Use the returned class to create validated feature instances
+             >>> record = DiagnosisFeature(
+             ...     Image="1-ABC",  # Target record RID
+             ...     Diagnosis_Type="Normal",  # Vocabulary term
+             ...     confidence=0.95,
+             ...     Execution="2-XYZ",  # Execution that produced this value
+             ... )
+         """
+         # Initialize empty collections if None provided
+         terms = terms or []
+         assets = assets or []
+         metadata = metadata or []
+         optional = optional or []
+
+         def normalize_metadata(m: Key | Table | ColumnDefinition | str | dict) -> Key | Table | dict:
+             """Normalize metadata references.
+
+             Handles:
+             - str: Table name, converted to a Table object
+             - ColumnDefinition: Dataclass with a to_dict() method
+             - dict: Already in dict format (e.g., from Column.define())
+             - Key/Table: Passed through unchanged
+             """
+             if isinstance(m, str):
+                 return self.model.name_to_table(m)
+             elif isinstance(m, dict):
+                 # Already a dict (e.g., from Column.define())
+                 return m
+             elif hasattr(m, "to_dict"):
+                 # ColumnDefinition or similar dataclass
+                 return m.to_dict()
+             else:
+                 return m
+
+         # Validate asset and term tables
+         if not all(map(self.model.is_asset, assets)):
+             raise DerivaMLException("Invalid create_feature asset table.")
+         if not all(map(self.model.is_vocabulary, terms)):
+             raise DerivaMLException("Invalid create_feature term table.")
+
+         # Get references to required tables
+         target_table = self.model.name_to_table(target_table)
+         execution = self.model.schemas[self.ml_schema].tables["Execution"]
+         feature_name_table = self.model.schemas[self.ml_schema].tables["Feature_Name"]
+
+         # Add feature name to vocabulary
+         feature_name_term = self.add_term("Feature_Name", feature_name, description=comment)
+         atable_name = f"Execution_{target_table.name}_{feature_name_term.name}"
+         # Create an association table implementing the feature
+         atable = self.model.create_table(
+             target_table.define_association(
+                 table_name=atable_name,
+                 associates=[execution, target_table, feature_name_table],
+                 metadata=[normalize_metadata(m) for m in chain(assets, terms, metadata)],
+                 comment=comment,
+             )
+         )
+         # Configure optional columns and default feature name
+         for c in optional:
+             atable.columns[c].alter(nullok=True)
+         atable.columns["Feature_Name"].alter(default=feature_name_term.name)
+
+         # Update navbar to include the new feature table
+         if update_navbar:
+             self.apply_catalog_annotations()
+
+         # Return feature record class for creating instances
+         return self.feature_record_class(target_table, feature_name)
+
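+     # Batch-creation sketch (editor's note, illustrative only; assumes a host
+     # instance `ml` and vocabularies named "<feature>_Type"): defer the navbar
+     # refresh until all features exist.
+     #
+     #     for name in ("Quality", "Diagnosis"):
+     #         ml.create_feature("Image", name, terms=[f"{name}_Type"], update_navbar=False)
+     #     ml.apply_catalog_annotations()  # single navbar update at the end
+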
+     def feature_record_class(self, table: str | Table, feature_name: str) -> type[FeatureRecord]:
+         """Returns a dynamically generated Pydantic model class for creating feature records.
+
+         Each feature has a unique set of columns based on its definition (terms, assets, metadata).
+         This method returns a Pydantic class with fields corresponding to those columns, providing:
+
+         - **Type validation**: Values are validated against expected types (str, int, float, Path)
+         - **Required field checking**: Non-nullable columns must be provided
+         - **Default values**: Feature_Name is pre-filled with the feature's name
+
+         **Field types in the generated class:**
+         - `{TargetTable}` (str): Required. RID of the target record (e.g., Image RID)
+         - `Execution` (str, optional): RID of the execution for provenance tracking
+         - `Feature_Name` (str): Pre-filled with the feature name
+         - Term columns (str): Accept vocabulary term names
+         - Asset columns (str | Path): Accept asset RIDs or file paths
+         - Value columns: Accept values matching the column type (int, float, str)
+
+         Use `lookup_feature()` to inspect the feature's structure and see what columns
+         are available.
+
+         Args:
+             table: The table containing the feature, either as name or Table object.
+             feature_name: Name of the feature to create a record class for.
+
+         Returns:
+             type[FeatureRecord]: A Pydantic model class for creating validated feature records.
+                 The class name follows the pattern `{TargetTable}Feature{FeatureName}`.
+
+         Raises:
+             DerivaMLException: If the feature doesn't exist or the table is invalid.
+
+         Example:
+             >>> # Get the dynamically generated class
+             >>> DiagnosisFeature = ml.feature_record_class("Image", "Diagnosis")
+             >>>
+             >>> # Create a validated feature record
+             >>> record = DiagnosisFeature(
+             ...     Image="1-ABC",  # Target record RID
+             ...     Diagnosis_Type="Normal",  # Vocabulary term
+             ...     confidence=0.95,  # Metadata column
+             ...     Execution="2-XYZ",  # Provenance
+             ... )
+             >>>
+             >>> # Convert to a dict for insertion
+             >>> record.model_dump()
+             {'Image': '1-ABC', 'Diagnosis_Type': 'Normal', 'confidence': 0.95, ...}
+         """
+         # Look up the feature and return its record class
+         return self.lookup_feature(table, feature_name).feature_record_class()
+
+     def delete_feature(self, table: Table | str, feature_name: str) -> bool:
+         """Removes a feature definition and its data.
+
+         Deletes the feature and its implementation table from the catalog. This operation cannot be undone and
+         will remove all feature values associated with this feature.
+
+         Args:
+             table: The table containing the feature, either as name or Table object.
+             feature_name: Name of the feature to delete.
+
+         Returns:
+             bool: True if the feature was successfully deleted, False if it didn't exist.
+
+         Raises:
+             DerivaMLException: If deletion fails due to constraints or permissions.
+
+         Example:
+             >>> success = ml.delete_feature("samples", "obsolete_feature")
+             >>> print("Deleted" if success else "Not found")
+         """
+         # Get table reference and find the feature
+         table = self.model.name_to_table(table)
+         try:
+             # Find and delete the feature's implementation table
+             feature = next(f for f in self.model.find_features(table) if f.feature_name == feature_name)
+             feature.feature_table.drop()
+             return True
+         except StopIteration:
+             return False
+
+     def lookup_feature(self, table: str | Table, feature_name: str) -> Feature:
+         """Retrieves a Feature object.
+
+         Looks up and returns a Feature object that provides an interface to work with an existing feature
+         definition in the catalog.
+
+         Args:
+             table: The table containing the feature, either as name or Table object.
+             feature_name: Name of the feature to look up.
+
+         Returns:
+             Feature: An object representing the feature and its implementation.
+
+         Raises:
+             DerivaMLException: If the feature doesn't exist in the specified table.
+
+         Example:
+             >>> feature = ml.lookup_feature("samples", "expression_level")
+             >>> feature.feature_name
+             'expression_level'
+         """
+         return self.model.lookup_feature(table, feature_name)
+
+     def find_features(self, table: str | Table | None = None) -> list[Feature]:
+         """Find features in the catalog.
+
+         Catalog-level operation to find feature definitions. If a table is specified,
+         returns only features for that table. If no table is specified, returns all
+         features across all tables in the catalog.
+
+         Args:
+             table: Optional table to find features for. If None, returns all features
+                 in the catalog.
+
+         Returns:
+             A list of Feature instances describing the features.
+
+         Examples:
+             Find all features in the catalog:
+             >>> all_features = ml.find_features()
+             >>> for f in all_features:
+             ...     print(f"{f.target_table.name}.{f.feature_name}")
+
+             Find features for a specific table:
+             >>> image_features = ml.find_features("Image")
+             >>> print([f.feature_name for f in image_features])
+         """
+         return list(self.model.find_features(table))
+
+     @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
+     def list_feature_values(self, table: Table | str, feature_name: str) -> Iterable[FeatureRecord]:
+         """Retrieves all values for a feature as typed FeatureRecord instances.
+
+         Returns an iterator of dynamically generated FeatureRecord objects, one for each
+         feature value. Each record is an instance of a Pydantic model specific to
+         this feature, with typed attributes for all columns, including the Execution
+         that created the feature value.
+
+         Args:
+             table: The table containing the feature, either as name or Table object.
+             feature_name: Name of the feature to retrieve values for.
+
+         Returns:
+             Iterable[FeatureRecord]: An iterator of FeatureRecord instances.
+             Each instance has:
+             - Execution: RID of the execution that created this feature value
+             - Feature_Name: Name of the feature
+             - All feature-specific columns as typed attributes
+             - model_dump() method to convert back to a dictionary
+
+         Raises:
+             DerivaMLException: If the feature doesn't exist or cannot be accessed.
+
+         Example:
+             >>> # Get typed feature records
+             >>> for record in ml.list_feature_values("Image", "Quality"):
+             ...     print(f"Image {record.Image}: {record.ImageQuality}")
+             ...     print(f"Created by execution: {record.Execution}")
+
+             >>> # Convert records to dictionaries
+             >>> records = list(ml.list_feature_values("Image", "Quality"))
+             >>> dicts = [r.model_dump() for r in records]
+         """
+         # Get table and feature
+         table = self.model.name_to_table(table)
+         feature = self.lookup_feature(table, feature_name)
+
+         # Get the dynamically generated FeatureRecord subclass for this feature
+         record_class = feature.feature_record_class()
+
+         # Build and execute a query for the feature values
+         pb = self.pathBuilder()
+         feature_path = pb.schemas[feature.feature_table.schema.name].tables[feature.feature_table.name]
+         raw_values = feature_path.entities().fetch()
+
+         # Filter each row to the fields the record class expects and yield typed records
+         field_names = set(record_class.model_fields.keys())
+         for raw_value in raw_values:
+             filtered_data = {k: v for k, v in raw_value.items() if k in field_names}
+             yield record_class(**filtered_data)
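+     # Insertion sketch (editor's note, illustrative only; assumes a host instance
+     # `ml`, an Execution RID, and a "Quality_Type" vocabulary): feature values live
+     # in the feature's association table, reachable via the path builder.
+     #
+     #     QualityFeature = ml.create_feature("Image", "Quality", terms=["Quality_Type"])
+     #     rec = QualityFeature(Image="1-ABC", Quality_Type="Good", Execution="2-XYZ")
+     #     feature = ml.lookup_feature("Image", "Quality")
+     #     pb = ml.pathBuilder()
+     #     pb.schemas[feature.feature_table.schema.name].tables[
+     #         feature.feature_table.name
+     #     ].insert([rec.model_dump()])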
@@ -0,0 +1,263 @@
+ """File management mixin for DerivaML.
+
+ This module provides the FileMixin class which handles
+ file operations including adding and listing files.
+ """
+
+ from __future__ import annotations
+
+ from collections import defaultdict
+ from itertools import chain
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any, Callable, Iterable
+ from urllib.parse import urlsplit
+
+ # Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
+ import importlib
+ datapath = importlib.import_module("deriva.core.datapath")
+
+ from deriva_ml.core.definitions import RID, FileSpec, MLTable, MLVocab, VocabularyTerm
+ from deriva_ml.core.exceptions import DerivaMLInvalidTerm, DerivaMLTableTypeError
+ from deriva_ml.dataset.aux_classes import DatasetVersion
+ from deriva_ml.dataset.history import iso_to_snap
+
+ if TYPE_CHECKING:
+     from deriva.core.ermrest_catalog import ResolveRidResult
+
+     from deriva_ml.dataset.dataset import Dataset
+     from deriva_ml.model.catalog import DerivaModel
+
+
+ class FileMixin:
+     """Mixin providing file management operations.
+
+     This mixin requires the host class to have:
+     - model: DerivaModel instance
+     - ml_schema: str - name of the ML schema
+     - pathBuilder(): method returning catalog path builder
+     - resolve_rid(): method for RID resolution (from RidResolutionMixin)
+     - lookup_term(): method for vocabulary lookup (from VocabularyMixin)
+     - list_vocabulary_terms(): method for listing vocab terms (from VocabularyMixin)
+     - find_datasets(): method for finding datasets (from DatasetMixin)
+
+     Methods:
+         add_files: Add files to the catalog with metadata
+         list_files: List files in the catalog
+         _bootstrap_versions: Initialize dataset versions
+         _synchronize_dataset_versions: Sync dataset versions
+         _set_version_snapshot: Update version snapshots
+     """
+
+     # Type hints for IDE support - actual attributes/methods from host class
+     model: "DerivaModel"
+     ml_schema: str
+     pathBuilder: Callable[[], Any]
+     resolve_rid: Callable[[RID], "ResolveRidResult"]
+     lookup_term: Callable[[str, str], VocabularyTerm]
+     list_vocabulary_terms: Callable[[str], list[VocabularyTerm]]
+     find_datasets: Callable[..., Iterable["Dataset"]]
+
+     def add_files(
+         self,
+         files: Iterable[FileSpec],
+         execution_rid: RID,
+         dataset_types: str | list[str] | None = None,
+         description: str = "",
+     ) -> "Dataset":
+         """Adds files to the catalog with their metadata.
+
+         Registers files in the catalog along with their metadata (MD5, length, URL) and associates them with
+         specified file types. Links files to the specified execution record for provenance tracking.
+
+         Args:
+             files: File specifications containing MD5 checksum, length, and URL.
+             execution_rid: Execution RID to associate files with (required for provenance).
+             dataset_types: One or more dataset type terms from the Dataset_Type vocabulary.
+             description: Description of the files.
+
+         Returns:
+             Dataset: Dataset that represents the newly added files.
+
+         Raises:
+             DerivaMLException: If the file types are invalid or execution_rid is not an execution record.
+
+         Examples:
+             Add files via an execution:
+             >>> with ml.create_execution(config) as exe:
+             ...     files = [FileSpec(url="path/to/file.txt", md5="abc123", length=1000)]
+             ...     dataset = exe.add_files(files, dataset_types="text")
+         """
+         # Import here to avoid circular imports
+         from deriva_ml.dataset.dataset import Dataset
+
+         if self.resolve_rid(execution_rid).table.name != "Execution":
+             raise DerivaMLTableTypeError("Execution", execution_rid)
+
+         filespec_list = list(files)
+
+         # Get the set of all defined file types and their synonyms.
+         defined_types = set(
+             chain.from_iterable(
+                 [t.name] + list(t.synonyms or []) for t in self.list_vocabulary_terms(MLVocab.asset_type)
+             )
+         )
+
+         # Get the set of all file types used in filespec_list.
+         spec_types = set(chain.from_iterable(filespec.file_types for filespec in filespec_list))
+
+         # Make sure that every file type in the spec list is defined.
+         if spec_types - defined_types:
+             raise DerivaMLInvalidTerm(MLVocab.asset_type.name, f"{spec_types - defined_types}")
+
+         # Normalize dataset_types and make sure the File type is included.
+         if isinstance(dataset_types, list):
+             dataset_types = dataset_types if "File" in dataset_types else ["File"] + dataset_types
+         else:
+             dataset_types = ["File", dataset_types] if dataset_types else ["File"]
+         for ds_type in dataset_types:
+             self.lookup_term(MLVocab.dataset_type, ds_type)
+
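+         # Normalization examples (editor's note): dataset_types=None -> ["File"];
+         # dataset_types="text" -> ["File", "text"]; dataset_types=["text"] -> ["File", "text"].
+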
+         # Add files to the File table, collecting the resulting entries by directory name.
+         pb = self.pathBuilder()
+         file_records = list(
+             pb.schemas[self.ml_schema].tables["File"].insert([f.model_dump(by_alias=True) for f in filespec_list])
+         )
+
+         # Get the name of the association table between the file table and file type, and add file_type records.
+         atable = self.model.find_association(MLTable.file, MLVocab.asset_type)[0].name
+         # Map each file (by MD5) to its file types so the new records can be linked to their types.
+         type_map = {file_spec.md5: list(file_spec.file_types) for file_spec in filespec_list}
+         file_type_records = [
+             {MLVocab.asset_type.value: file_type, "File": file_record["RID"]}
+             for file_record in file_records
+             for file_type in type_map[file_record["MD5"]]
+         ]
+         pb.schemas[self.ml_schema].tables[atable].insert(file_type_records)
+
+         # Link files to the execution for provenance tracking.
+         pb.schemas[self.ml_schema].File_Execution.insert(
+             [
+                 {"File": file_record["RID"], "Execution": execution_rid, "Asset_Role": "Output"}
+                 for file_record in file_records
+             ]
+         )
+
+         # Now create datasets to capture the original directory structure of the files
+         # (see the sketch following this method).
+         dir_rid_map = defaultdict(list)
+         for e in file_records:
+             dir_rid_map[Path(urlsplit(e["URL"]).path).parent].append(e["RID"])
+
+         nested_datasets = []
+         path_length = 0
+         dataset = None
+         # Start with the longest path so subdirectories are processed first.
+         for p, rids in sorted(dir_rid_map.items(), key=lambda kv: len(kv[0].parts), reverse=True):
+             dataset = Dataset.create_dataset(
+                 self,  # type: ignore[arg-type]
+                 dataset_types=dataset_types,
+                 execution_rid=execution_rid,
+                 description=description,
+             )
+             members = rids
+             if len(p.parts) < path_length:
+                 # Moving up one directory level, so create a nested dataset.
+                 members = [m.dataset_rid for m in nested_datasets] + rids
+                 nested_datasets = []
+             dataset.add_dataset_members(members=members, execution_rid=execution_rid)
+             nested_datasets.append(dataset)
+             path_length = len(p.parts)
+
+         return dataset
+
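+     # Directory-nesting sketch (editor's note, illustrative only): for files at
+     # a/b/x.txt, a/b/y.txt, and a/z.txt, the loop above first builds a dataset
+     # for a/b containing x and y, then a dataset for a containing z plus the
+     # a/b dataset, and returns that outermost dataset.
+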
+     def _bootstrap_versions(self) -> None:
+         """Initialize dataset versions for datasets that don't have versions."""
+         datasets = [ds.dataset_rid for ds in self.find_datasets()]
+         ds_version = [
+             {
+                 "Dataset": d,
+                 "Version": "0.1.0",
+                 "Description": "Dataset at the time of conversion to versioned datasets",
+             }
+             for d in datasets
+         ]
+         schema_path = self.pathBuilder().schemas[self.ml_schema]
+         version_path = schema_path.tables["Dataset_Version"]
+         dataset_path = schema_path.tables["Dataset"]
+         history = list(version_path.insert(ds_version))
+         dataset_versions = [{"RID": h["Dataset"], "Version": h["Version"]} for h in history]
+         dataset_path.update(dataset_versions)
+
+     def _synchronize_dataset_versions(self) -> None:
+         """Synchronize dataset versions with the latest version in the Dataset_Version table."""
+         schema_path = self.pathBuilder().schemas[self.ml_schema]
+         dataset_version_path = schema_path.tables["Dataset_Version"]
+         # Keep the Dataset_Version record with the highest version for each dataset.
+         versions = {}
+         for v in dataset_version_path.entities().fetch():
+             latest = versions.get(v["Dataset"])
+             if latest is None or v["Version"] > latest["Version"]:
+                 versions[v["Dataset"]] = v
+         dataset_path = schema_path.tables["Dataset"]
+         dataset_path.update([{"RID": dataset, "Version": version["RID"]} for dataset, version in versions.items()])
+
+     def _set_version_snapshot(self) -> None:
+         """Update the Snapshot column of the Dataset_Version table to the correct time."""
+         dataset_version_path = self.pathBuilder().schemas[self.model.ml_schema].tables["Dataset_Version"]
+         versions = dataset_version_path.entities().fetch()
+         dataset_version_path.update(
+             [{"RID": h["RID"], "Snapshot": iso_to_snap(h["RCT"])} for h in versions if not h["Snapshot"]]
+         )
+
+     def list_files(self, file_types: list[str] | None = None) -> list[dict[str, Any]]:
+         """Lists files in the catalog with their metadata.
+
+         Returns a list of files with their metadata including URL, MD5 hash, length, description,
+         and associated file types. Files can optionally be filtered by type.
+
+         Args:
+             file_types: Filter results to only include these file types.
+
+         Returns:
+             list[dict[str, Any]]: List of file records, each containing:
+             - RID: Resource identifier
+             - URL: File location
+             - MD5: File hash
+             - Length: File size
+             - Description: File description
+             - File_Types: List of associated file types
+
+         Examples:
+             List all files:
+             >>> files = ml.list_files()
+             >>> for f in files:
+             ...     print(f"{f['RID']}: {f['URL']}")
+
+             Filter by file type:
+             >>> image_files = ml.list_files(["image", "png"])
+         """
+         asset_type_atable, file_fk, asset_type_fk = self.model.find_association("File", "Asset_Type")
+         ml_path = self.pathBuilder().schemas[self.ml_schema]
+         file = ml_path.File
+         asset_type = ml_path.tables[asset_type_atable.name]
+
+         path = file.path
+         path = path.link(asset_type.alias("AT"), on=file.RID == asset_type.columns[file_fk], join_type="left")
+         if file_types:
+             path = path.filter(asset_type.columns[asset_type_fk] == datapath.Any(*file_types))
+         path = path.attributes(
+             path.File.RID,
+             path.File.URL,
+             path.File.MD5,
+             path.File.Length,
+             path.File.Description,
+             path.AT.columns[asset_type_fk],
+         )
+
+         # Group the (file, type) rows by RID, collecting the types into File_Types.
+         file_map = {}
+         for f in path.fetch():
+             entry = file_map.setdefault(f["RID"], {**f, "File_Types": []})
+             if ft := f.get("Asset_Type"):  # assign-and-test in one go
+                 entry["File_Types"].append(ft)
+
+         # Drop the per-row Asset_Type key and return the grouped records.
+         for f in file_map.values():
+             f.pop("Asset_Type", None)
+         return list(file_map.values())
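+     # End-to-end sketch (editor's note, illustrative only; assumes a host
+     # instance `ml`, a valid Execution RID, and a registered "text" term;
+     # the FileSpec fields shown are hypothetical):
+     #
+     #     specs = [FileSpec(url="data/a/x.txt", md5="...", length=123,
+     #                       file_types=["text"])]
+     #     ds = ml.add_files(specs, execution_rid="2-XYZ")
+     #     text_files = ml.list_files(["text"])  # filter on Asset_Type terms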