deriva-ml 1.17.10__py3-none-any.whl → 1.17.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. deriva_ml/__init__.py +43 -1
  2. deriva_ml/asset/__init__.py +17 -0
  3. deriva_ml/asset/asset.py +357 -0
  4. deriva_ml/asset/aux_classes.py +100 -0
  5. deriva_ml/bump_version.py +254 -11
  6. deriva_ml/catalog/__init__.py +21 -0
  7. deriva_ml/catalog/clone.py +1199 -0
  8. deriva_ml/catalog/localize.py +426 -0
  9. deriva_ml/core/__init__.py +29 -0
  10. deriva_ml/core/base.py +817 -1067
  11. deriva_ml/core/config.py +169 -21
  12. deriva_ml/core/constants.py +120 -19
  13. deriva_ml/core/definitions.py +123 -13
  14. deriva_ml/core/enums.py +47 -73
  15. deriva_ml/core/ermrest.py +226 -193
  16. deriva_ml/core/exceptions.py +297 -14
  17. deriva_ml/core/filespec.py +99 -28
  18. deriva_ml/core/logging_config.py +225 -0
  19. deriva_ml/core/mixins/__init__.py +42 -0
  20. deriva_ml/core/mixins/annotation.py +915 -0
  21. deriva_ml/core/mixins/asset.py +384 -0
  22. deriva_ml/core/mixins/dataset.py +237 -0
  23. deriva_ml/core/mixins/execution.py +408 -0
  24. deriva_ml/core/mixins/feature.py +365 -0
  25. deriva_ml/core/mixins/file.py +263 -0
  26. deriva_ml/core/mixins/path_builder.py +145 -0
  27. deriva_ml/core/mixins/rid_resolution.py +204 -0
  28. deriva_ml/core/mixins/vocabulary.py +400 -0
  29. deriva_ml/core/mixins/workflow.py +322 -0
  30. deriva_ml/core/validation.py +389 -0
  31. deriva_ml/dataset/__init__.py +2 -1
  32. deriva_ml/dataset/aux_classes.py +20 -4
  33. deriva_ml/dataset/catalog_graph.py +575 -0
  34. deriva_ml/dataset/dataset.py +1242 -1008
  35. deriva_ml/dataset/dataset_bag.py +1311 -182
  36. deriva_ml/dataset/history.py +27 -14
  37. deriva_ml/dataset/upload.py +225 -38
  38. deriva_ml/demo_catalog.py +126 -110
  39. deriva_ml/execution/__init__.py +46 -2
  40. deriva_ml/execution/base_config.py +639 -0
  41. deriva_ml/execution/execution.py +543 -242
  42. deriva_ml/execution/execution_configuration.py +26 -11
  43. deriva_ml/execution/execution_record.py +592 -0
  44. deriva_ml/execution/find_caller.py +298 -0
  45. deriva_ml/execution/model_protocol.py +175 -0
  46. deriva_ml/execution/multirun_config.py +153 -0
  47. deriva_ml/execution/runner.py +595 -0
  48. deriva_ml/execution/workflow.py +223 -34
  49. deriva_ml/experiment/__init__.py +8 -0
  50. deriva_ml/experiment/experiment.py +411 -0
  51. deriva_ml/feature.py +6 -1
  52. deriva_ml/install_kernel.py +143 -6
  53. deriva_ml/interfaces.py +862 -0
  54. deriva_ml/model/__init__.py +99 -0
  55. deriva_ml/model/annotations.py +1278 -0
  56. deriva_ml/model/catalog.py +286 -60
  57. deriva_ml/model/database.py +144 -649
  58. deriva_ml/model/deriva_ml_database.py +308 -0
  59. deriva_ml/model/handles.py +14 -0
  60. deriva_ml/run_model.py +319 -0
  61. deriva_ml/run_notebook.py +507 -38
  62. deriva_ml/schema/__init__.py +18 -2
  63. deriva_ml/schema/annotations.py +62 -33
  64. deriva_ml/schema/create_schema.py +169 -69
  65. deriva_ml/schema/validation.py +601 -0
  66. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/METADATA +4 -4
  67. deriva_ml-1.17.11.dist-info/RECORD +77 -0
  68. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/WHEEL +1 -1
  69. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/entry_points.txt +1 -0
  70. deriva_ml/protocols/dataset.py +0 -19
  71. deriva_ml/test.py +0 -94
  72. deriva_ml-1.17.10.dist-info/RECORD +0 -45
  73. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/licenses/LICENSE +0 -0
  74. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,408 @@
1
+ """Execution management mixin for DerivaML.
2
+
3
+ This module provides the ExecutionMixin class which handles
4
+ execution operations including creating, restoring, and updating
5
+ execution status.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from pathlib import Path
11
+ from typing import TYPE_CHECKING, Any, Callable, Iterable
12
+
13
+ from deriva_ml.core.definitions import RID
14
+ from deriva_ml.core.enums import Status
15
+ from deriva_ml.core.exceptions import DerivaMLException
16
+ from deriva_ml.dataset.upload import asset_file_path, execution_rids
17
+ from deriva_ml.execution.execution_configuration import ExecutionConfiguration
18
+
19
+ if TYPE_CHECKING:
20
+ from deriva_ml.execution.execution import Execution
21
+ from deriva_ml.execution.execution_record import ExecutionRecord
22
+ from deriva_ml.execution.workflow import Workflow
23
+ from deriva_ml.experiment.experiment import Experiment
24
+ from deriva_ml.model.catalog import DerivaModel
25
+
26
+
27
class ExecutionMixin:
    """Mixin providing execution management operations.

    This mixin requires the host class to have:
        - model: DerivaModel instance
        - ml_schema: str - name of the ML schema
        - working_dir: Path - working directory path
        - pathBuilder(): method returning catalog path builder
        - retrieve_rid(): method for retrieving RID data (from RidResolutionMixin)
        - resolve_rid(): method resolving a RID to an object exposing ``table.name``
          (called by lookup_execution)
        - lookup_workflow(): method returning the workflow object for a workflow RID
          (called by lookup_execution and restore_execution)

    Methods:
        create_execution: Create a new execution environment
        restore_execution: Restore a previous execution
        lookup_execution: Build an ExecutionRecord for an execution RID
        find_executions: Iterate over ExecutionRecords, optionally filtered
        lookup_experiment: Build an Experiment for an execution RID
        find_experiments: Iterate over executions that carry Hydra config metadata
        _update_status: Update execution status in catalog
    """

    # Type hints for IDE support - actual attributes/methods from host class
    model: "DerivaModel"
    ml_schema: str
    working_dir: Path
    status: str
    pathBuilder: Callable[[], Any]
    retrieve_rid: Callable[[RID], dict[str, Any]]
    _execution: "Execution"

    @staticmethod
    def _parse_timestamp(value: Any) -> "datetime | None":
        """Parse an ISO-8601 timestamp string from a catalog record.

        Args:
            value: Raw column value; expected to be an ISO-8601 string,
                possibly ending in ``Z``. Falsy values mean "not set".

        Returns:
            The parsed datetime, or None when the value is missing or
            cannot be parsed.
        """
        from datetime import datetime

        if not value:
            return None
        try:
            # A trailing "Z" is rewritten as an explicit UTC offset because
            # datetime.fromisoformat() only accepts "Z" on newer Pythons.
            return datetime.fromisoformat(value.replace("Z", "+00:00"))
        except (ValueError, AttributeError):
            # Best effort: an unparsable or non-string timestamp is treated
            # as absent rather than failing the whole lookup.
            return None

    def _update_status(self, new_status: Status, status_detail: str, execution_rid: RID) -> None:
        """Update the status of an execution in the catalog.

        Stores ``new_status.value`` in ``self.status`` and writes it, together
        with the detail text, to the Execution table row for ``execution_rid``.

        Args:
            new_status: New status.
            status_detail: Details of the status.
            execution_rid: Resource Identifier (RID) of the execution.
        """
        self.status = new_status.value
        self.pathBuilder().schemas[self.ml_schema].Execution.update(
            [
                {
                    "RID": execution_rid,
                    "Status": self.status,
                    "Status_Detail": status_detail,
                }
            ]
        )

    def create_execution(
        self, configuration: ExecutionConfiguration, workflow: "Workflow | RID | None" = None, dry_run: bool = False
    ) -> "Execution":
        """Create an execution environment.

        Initializes a local compute environment for executing an ML or analytic routine.
        This has several side effects:

        1. Downloads datasets specified in the configuration to the cache directory.
           If no version is specified, creates a new minor version for the dataset.
        2. Downloads any execution assets to the working directory.
        3. Creates an execution record in the catalog (unless dry_run=True).

        The created Execution is also stored on ``self._execution``.

        Args:
            configuration: ExecutionConfiguration specifying execution parameters.
            workflow: Optional Workflow object or RID if not present in configuration.
            dry_run: If True, skip creating catalog records and uploading results.

        Returns:
            Execution: An execution object for managing the execution lifecycle.

        Example:
            >>> config = ExecutionConfiguration(
            ...     workflow=workflow,
            ...     description="Process samples",
            ...     datasets=[DatasetSpec(rid="4HM")],
            ... )
            >>> with ml.create_execution(config) as execution:
            ...     # Run analysis
            ...     pass
            >>> execution.upload_execution_outputs()
        """
        # Import here to avoid circular dependency
        from deriva_ml.execution.execution import Execution

        # Create and store an execution instance
        self._execution = Execution(configuration, self, workflow=workflow, dry_run=dry_run)  # type: ignore[arg-type]
        return self._execution

    def lookup_execution(self, execution_rid: RID) -> "ExecutionRecord":
        """Look up an execution by RID and return an ExecutionRecord.

        Creates an ExecutionRecord object for querying and modifying execution
        metadata. The ExecutionRecord provides access to the catalog record
        state and allows updating mutable properties like status and description.

        For running computations with datasets and assets, use ``restore_execution()``
        or ``create_execution()`` which return full Execution objects.

        Args:
            execution_rid: Resource Identifier (RID) of the execution.

        Returns:
            ExecutionRecord: An execution record object bound to the catalog.

        Raises:
            DerivaMLException: If execution_rid resolves to a table other than
                Execution. Errors raised by ``resolve_rid()`` for an unknown
                RID propagate unchanged.

        Example:
            Look up an execution and query its state::

                >>> record = ml.lookup_execution("1-abc123")
                >>> print(f"Status: {record.status}")
                >>> print(f"Description: {record.description}")

            Update mutable properties::

                >>> record.status = Status.completed
                >>> record.description = "Analysis finished"

            Query relationships::

                >>> children = list(record.list_nested_executions())
                >>> parents = list(record.list_parent_executions())
        """
        # Import here to avoid circular dependency
        from deriva_ml.execution.execution_record import ExecutionRecord

        # Verify the RID refers to a row of the Execution table
        resolved = self.resolve_rid(execution_rid)
        if resolved.table.name != "Execution":
            raise DerivaMLException(
                f"RID '{execution_rid}' refers to a {resolved.table.name}, not an Execution"
            )

        execution_data = self.retrieve_rid(execution_rid)

        # Look up the workflow if present
        workflow_rid = execution_data.get("Workflow")
        workflow = self.lookup_workflow(workflow_rid) if workflow_rid else None

        # Create ExecutionRecord bound to this catalog
        return ExecutionRecord(
            execution_rid=execution_rid,
            workflow=workflow,
            # "or" (not a .get default) so a null Status column also falls
            # back to "Created" instead of raising ValueError from Status(None).
            status=Status(execution_data.get("Status") or "Created"),
            description=execution_data.get("Description"),
            start_time=self._parse_timestamp(execution_data.get("Start")),
            stop_time=self._parse_timestamp(execution_data.get("Stop")),
            duration=execution_data.get("Duration"),
            _ml_instance=self,
            _logger=getattr(self, "_logger", None),
        )

    def restore_execution(self, execution_rid: RID | None = None) -> "Execution":
        """Restores a previous execution.

        Given an execution RID, retrieves the execution configuration and restores the local compute environment.
        This routine has a number of side effects.

        1. The datasets specified in the configuration are downloaded and placed in the cache-dir. If a version is
           not specified in the configuration, then a new minor version number is created for the dataset and
           downloaded.

        2. If any execution assets are provided in the configuration, they are downloaded and placed
           in the working directory.

        The configuration is read from the execution's ``configuration.json`` under the
        working directory when that file exists; otherwise it is rebuilt from the
        workflow and description of the catalog's execution record.

        Args:
            execution_rid: Resource Identifier (RID) of the execution to restore.
                If omitted, the working directory must contain exactly one
                execution, which is then restored.

        Returns:
            Execution: An execution object representing the restored execution environment.

        Raises:
            DerivaMLException: If no execution_rid is given and the working directory
                contains zero or more than one execution.

        Example:
            >>> execution = ml.restore_execution("1-abc123")
        """
        # Import here to avoid circular dependency
        from deriva_ml.execution.execution import Execution

        # If no RID provided, try to find single execution in working directory
        if not execution_rid:
            e_rids = execution_rids(self.working_dir)
            if not e_rids:
                raise DerivaMLException(f"No execution RIDs were found in {self.working_dir}.")
            if len(e_rids) > 1:
                raise DerivaMLException(f"Multiple execution RIDs were found {e_rids}.")
            execution_rid = e_rids[0]

        # Location where a saved configuration for this execution would live
        cfile = asset_file_path(
            prefix=self.working_dir,
            exec_rid=execution_rid,
            file_name="configuration.json",
            asset_table=self.model.name_to_table("Execution_Metadata"),
            metadata={},
        )

        # Load configuration from a file or create from an execution record
        if cfile.exists():
            configuration = ExecutionConfiguration.load_configuration(cfile)
        else:
            execution = self.retrieve_rid(execution_rid)
            # Look up the workflow object from the RID
            workflow_rid = execution.get("Workflow")
            workflow = self.lookup_workflow(workflow_rid) if workflow_rid else None
            configuration = ExecutionConfiguration(
                workflow=workflow,
                description=execution["Description"],
            )

        # Create and return an execution instance
        return Execution(configuration, self, reload=execution_rid)  # type: ignore[arg-type]

    def find_executions(
        self,
        workflow: "Workflow | RID | None" = None,
        workflow_type: str | None = None,
        status: Status | None = None,
    ) -> Iterable["ExecutionRecord"]:
        """List all executions in the catalog.

        Returns ExecutionRecord objects for each execution. These provide access
        to execution metadata and allow updating mutable properties. This is a
        generator: the catalog is queried on first iteration, and each yielded
        record is produced by a separate ``lookup_execution()`` call.

        Args:
            workflow: Optional Workflow object or RID to filter by.
            workflow_type: Optional workflow type name to filter by (e.g., "python_script").
                This filters by the Workflow_Type vocabulary term.
            status: Optional status to filter by (e.g., Status.completed).

        Returns:
            Iterable of ExecutionRecord objects.

        Example:
            List all executions::

                >>> for record in ml.find_executions():
                ...     print(f"{record.execution_rid}: {record.status}")

            Filter by status::

                >>> completed = list(ml.find_executions(status=Status.completed))

            Filter by specific workflow::

                >>> workflow = ml.lookup_workflow("2-ABC1")
                >>> for record in ml.find_executions(workflow=workflow):
                ...     print(f"{record.execution_rid}: {record.description}")

            Filter by workflow type (all notebooks)::

                >>> notebooks = list(ml.find_executions(workflow_type="python_notebook"))
        """
        # Needed at runtime for the isinstance() check below
        from deriva_ml.execution.workflow import Workflow as WorkflowClass

        # Get datapath to the Execution table
        pb = self.pathBuilder()
        execution_path = pb.schemas[self.ml_schema].Execution

        # Apply filters
        filtered_path = execution_path

        # Filter by specific workflow
        if workflow:
            workflow_rid = workflow.rid if isinstance(workflow, WorkflowClass) else workflow
            filtered_path = filtered_path.filter(execution_path.Workflow == workflow_rid)

        # Filter by workflow type - need to join with Workflow table
        if workflow_type:
            workflow_path = pb.schemas[self.ml_schema].Workflow
            # NOTE(review): in deriva-py datapaths, link() appears to move the
            # path context to the linked table, so entities() below may return
            # Workflow rows (and the Status filter may apply to Workflow)
            # when workflow_type is given - confirm against the datapath API.
            filtered_path = (
                filtered_path
                .link(workflow_path, on=(execution_path.Workflow == workflow_path.RID))
                .filter(workflow_path.Workflow_Type == workflow_type)
            )

        if status:
            filtered_path = filtered_path.filter(execution_path.Status == status.value)

        # Create ExecutionRecord objects
        for exec_record in filtered_path.entities().fetch():
            yield self.lookup_execution(exec_record["RID"])

    def lookup_experiment(self, execution_rid: RID) -> "Experiment":
        """Look up an experiment by execution RID.

        Creates an Experiment object for analyzing completed executions.
        Provides convenient access to execution metadata, configuration choices,
        model parameters, inputs, and outputs.

        Args:
            execution_rid: Resource Identifier (RID) of the execution.

        Returns:
            Experiment: An experiment object for the given execution RID.

        Example:
            >>> exp = ml.lookup_experiment("47BE")
            >>> print(exp.name)  # e.g., "cifar10_quick"
            >>> print(exp.config_choices)  # Hydra config names used
            >>> print(exp.model_config)  # Model hyperparameters
        """
        from deriva_ml.experiment import Experiment

        return Experiment(self, execution_rid)  # type: ignore[arg-type]

    def find_experiments(
        self,
        workflow_rid: RID | None = None,
        status: Status | None = None,
    ) -> Iterable["Experiment"]:
        """List all experiments (executions with Hydra configuration) in the catalog.

        Creates Experiment objects for analyzing completed ML model runs.
        Only returns executions that have Hydra configuration metadata, i.e.
        an Execution_Metadata asset whose Filename ends in ``-config.yaml``.

        The Execution_Metadata table and its association table with Execution
        are fetched in full and matched in Python.

        Args:
            workflow_rid: Optional workflow RID to filter by.
            status: Optional status to filter by (e.g., Status.completed).

        Returns:
            Iterable of Experiment objects for executions with Hydra config.

        Example:
            >>> experiments = list(ml.find_experiments(status=Status.completed))
            >>> for exp in experiments:
            ...     print(f"{exp.name}: {exp.config_choices}")
        """
        import re
        from deriva_ml.experiment import Experiment

        # Get datapath to tables
        pb = self.pathBuilder()
        execution_path = pb.schemas[self.ml_schema].Execution
        metadata_path = pb.schemas[self.ml_schema].Execution_Metadata
        meta_exec_path = pb.schemas[self.ml_schema].Execution_Metadata_Execution

        # Get all metadata records and filter for config.yaml files in Python
        # (ERMrest regex support varies by deployment)
        config_pattern = re.compile(r".*-config\.yaml$")
        config_metadata_rids = set()
        for meta in metadata_path.entities().fetch():
            filename = meta.get("Filename", "")
            if filename and config_pattern.match(filename):
                config_metadata_rids.add(meta["RID"])

        # Query the association table to find which executions have these metadata
        exec_rids_with_config = set()
        if config_metadata_rids:
            for assoc_record in meta_exec_path.entities().fetch():
                if assoc_record.get("Execution_Metadata") in config_metadata_rids:
                    exec_rids_with_config.add(assoc_record["Execution"])

        # Nothing can match, so skip the Execution query entirely
        if not exec_rids_with_config:
            return

        # Apply additional filters and yield Experiment objects
        filtered_path = execution_path
        if workflow_rid:
            filtered_path = filtered_path.filter(execution_path.Workflow == workflow_rid)
        if status:
            filtered_path = filtered_path.filter(execution_path.Status == status.value)

        for exec_record in filtered_path.entities().fetch():
            if exec_record["RID"] in exec_rids_with_config:
                yield Experiment(self, exec_record["RID"])  # type: ignore[arg-type]