deriva-ml 1.17.10__py3-none-any.whl → 1.17.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. deriva_ml/__init__.py +43 -1
  2. deriva_ml/asset/__init__.py +17 -0
  3. deriva_ml/asset/asset.py +357 -0
  4. deriva_ml/asset/aux_classes.py +100 -0
  5. deriva_ml/bump_version.py +254 -11
  6. deriva_ml/catalog/__init__.py +21 -0
  7. deriva_ml/catalog/clone.py +1199 -0
  8. deriva_ml/catalog/localize.py +426 -0
  9. deriva_ml/core/__init__.py +29 -0
  10. deriva_ml/core/base.py +817 -1067
  11. deriva_ml/core/config.py +169 -21
  12. deriva_ml/core/constants.py +120 -19
  13. deriva_ml/core/definitions.py +123 -13
  14. deriva_ml/core/enums.py +47 -73
  15. deriva_ml/core/ermrest.py +226 -193
  16. deriva_ml/core/exceptions.py +297 -14
  17. deriva_ml/core/filespec.py +99 -28
  18. deriva_ml/core/logging_config.py +225 -0
  19. deriva_ml/core/mixins/__init__.py +42 -0
  20. deriva_ml/core/mixins/annotation.py +915 -0
  21. deriva_ml/core/mixins/asset.py +384 -0
  22. deriva_ml/core/mixins/dataset.py +237 -0
  23. deriva_ml/core/mixins/execution.py +408 -0
  24. deriva_ml/core/mixins/feature.py +365 -0
  25. deriva_ml/core/mixins/file.py +263 -0
  26. deriva_ml/core/mixins/path_builder.py +145 -0
  27. deriva_ml/core/mixins/rid_resolution.py +204 -0
  28. deriva_ml/core/mixins/vocabulary.py +400 -0
  29. deriva_ml/core/mixins/workflow.py +322 -0
  30. deriva_ml/core/validation.py +389 -0
  31. deriva_ml/dataset/__init__.py +2 -1
  32. deriva_ml/dataset/aux_classes.py +20 -4
  33. deriva_ml/dataset/catalog_graph.py +575 -0
  34. deriva_ml/dataset/dataset.py +1242 -1008
  35. deriva_ml/dataset/dataset_bag.py +1311 -182
  36. deriva_ml/dataset/history.py +27 -14
  37. deriva_ml/dataset/upload.py +225 -38
  38. deriva_ml/demo_catalog.py +126 -110
  39. deriva_ml/execution/__init__.py +46 -2
  40. deriva_ml/execution/base_config.py +639 -0
  41. deriva_ml/execution/execution.py +543 -242
  42. deriva_ml/execution/execution_configuration.py +26 -11
  43. deriva_ml/execution/execution_record.py +592 -0
  44. deriva_ml/execution/find_caller.py +298 -0
  45. deriva_ml/execution/model_protocol.py +175 -0
  46. deriva_ml/execution/multirun_config.py +153 -0
  47. deriva_ml/execution/runner.py +595 -0
  48. deriva_ml/execution/workflow.py +223 -34
  49. deriva_ml/experiment/__init__.py +8 -0
  50. deriva_ml/experiment/experiment.py +411 -0
  51. deriva_ml/feature.py +6 -1
  52. deriva_ml/install_kernel.py +143 -6
  53. deriva_ml/interfaces.py +862 -0
  54. deriva_ml/model/__init__.py +99 -0
  55. deriva_ml/model/annotations.py +1278 -0
  56. deriva_ml/model/catalog.py +286 -60
  57. deriva_ml/model/database.py +144 -649
  58. deriva_ml/model/deriva_ml_database.py +308 -0
  59. deriva_ml/model/handles.py +14 -0
  60. deriva_ml/run_model.py +319 -0
  61. deriva_ml/run_notebook.py +507 -38
  62. deriva_ml/schema/__init__.py +18 -2
  63. deriva_ml/schema/annotations.py +62 -33
  64. deriva_ml/schema/create_schema.py +169 -69
  65. deriva_ml/schema/validation.py +601 -0
  66. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/METADATA +4 -4
  67. deriva_ml-1.17.11.dist-info/RECORD +77 -0
  68. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/WHEEL +1 -1
  69. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/entry_points.txt +1 -0
  70. deriva_ml/protocols/dataset.py +0 -19
  71. deriva_ml/test.py +0 -94
  72. deriva_ml-1.17.10.dist-info/RECORD +0 -45
  73. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/licenses/LICENSE +0 -0
  74. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,322 @@
1
+ """Workflow management mixin for DerivaML.
2
+
3
+ This module provides the WorkflowMixin class which handles
4
+ workflow operations including adding, looking up, listing,
5
+ and creating workflows.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import TYPE_CHECKING, Any, Callable
11
+
12
+ # Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
13
+ import importlib
14
+ _deriva_core = importlib.import_module("deriva.core")
15
+ format_exception = _deriva_core.format_exception
16
+
17
+ from deriva_ml.core.definitions import RID, MLVocab, VocabularyTerm
18
+ from deriva_ml.core.exceptions import DerivaMLException
19
+ from deriva_ml.execution.workflow import Workflow
20
+
21
+ if TYPE_CHECKING:
22
+ from deriva_ml.interfaces import DerivaMLCatalog
23
+
24
+
25
class WorkflowMixin:
    """Mixin providing workflow management operations.

    This mixin requires the host class to have:
        - ml_schema: str - name of the ML schema
        - pathBuilder(): method returning catalog path builder
        - lookup_term(): method for vocabulary term lookup (from VocabularyMixin)

    Methods:
        find_workflows: Find all workflows in the catalog
        add_workflow: Add a workflow to the catalog
        lookup_workflow: Look up a workflow by RID
        lookup_workflow_by_url: Look up a workflow by URL or checksum
        create_workflow: Create a new workflow definition
    """

    # Type hints for IDE support - actual attributes/methods from host class
    ml_schema: str
    pathBuilder: Callable[[], Any]
    lookup_term: Callable[[str, str], VocabularyTerm]

    def _workflow_from_record(self, record: dict[str, Any]) -> Workflow:
        """Build a catalog-bound Workflow from a Workflow table record.

        Args:
            record: A row fetched from the Workflow table. It must provide the
                ``Name``, ``URL``, ``Workflow_Type``, ``Version``,
                ``Description``, ``RID`` and ``Checksum`` keys.

        Returns:
            Workflow: The workflow object with ``_ml_instance`` set to this
            catalog instance.
        """
        workflow = Workflow(
            name=record["Name"],
            url=record["URL"],
            workflow_type=record["Workflow_Type"],
            version=record["Version"],
            description=record["Description"],
            rid=record["RID"],
            checksum=record["Checksum"],
        )
        # Bind the workflow to this catalog instance for write-back support
        workflow._ml_instance = self  # type: ignore[assignment]
        return workflow

    def find_workflows(self) -> list[Workflow]:
        """Find all workflows in the catalog.

        Catalog-level operation to find all workflow definitions, including their
        names, URLs, types, versions, and descriptions. Each returned Workflow
        is bound to the catalog, allowing its description to be updated.

        Returns:
            list[Workflow]: List of workflow objects, each containing:
                - name: Workflow name
                - url: Source code URL
                - workflow_type: Type of workflow
                - version: Version identifier
                - description: Workflow description
                - rid: Resource identifier
                - checksum: Source code checksum

        Examples:
            List all workflows and their descriptions::

                >>> workflows = ml.find_workflows()
                >>> for w in workflows:
                ...     print(f"{w.name} (v{w.version}): {w.description}")
                ...     print(f"  Source: {w.url}")

            Update a workflow's description (workflows are catalog-bound)::

                >>> workflows = ml.find_workflows()
                >>> workflows[0].description = "Updated description"
        """
        # Get a workflow table path and fetch all workflows
        workflow_path = self.pathBuilder().schemas[self.ml_schema].Workflow
        return [self._workflow_from_record(w) for w in workflow_path.entities().fetch()]

    def add_workflow(self, workflow: Workflow) -> RID:
        """Adds a workflow to the catalog.

        Registers a new workflow in the catalog or returns the RID of an existing
        workflow. The existing-workflow check uses the workflow's checksum when it
        is set and falls back to its URL otherwise.

        Each workflow represents a specific computational process or analysis pipeline.

        Args:
            workflow: Workflow object containing name, URL, type, version, and description.

        Returns:
            RID: Resource Identifier of the added or existing workflow.

        Raises:
            DerivaMLException: If workflow insertion fails or required fields are missing.
                The original error is chained as ``__cause__``.

        Examples:
            >>> workflow = Workflow(
            ...     name="Gene Analysis",
            ...     url="https://github.com/org/repo/workflows/gene_analysis.py",
            ...     workflow_type="python_script",
            ...     version="1.0.0",
            ...     description="Analyzes gene expression patterns"
            ... )
            >>> workflow_rid = ml.add_workflow(workflow)
        """
        # Check if a workflow already exists by checksum (preferred) or URL
        if workflow_rid := self._find_workflow_rid_by_url(workflow.checksum or workflow.url):
            return workflow_rid

        # Get an ML schema path for the workflow table
        ml_schema_path = self.pathBuilder().schemas[self.ml_schema]

        try:
            # Create a workflow record
            workflow_record = {
                "URL": workflow.url,
                "Name": workflow.name,
                "Description": workflow.description,
                "Checksum": workflow.checksum,
                "Version": workflow.version,
                MLVocab.workflow_type: self.lookup_term(MLVocab.workflow_type, workflow.workflow_type).name,
            }
            # Insert a workflow and get its RID
            workflow_rid = ml_schema_path.Workflow.insert([workflow_record])[0]["RID"]
        except Exception as e:
            error = format_exception(e)
            # Chain the original exception so the root cause stays in the traceback
            raise DerivaMLException(f"Failed to insert workflow. Error: {error}") from e
        return workflow_rid

    def lookup_workflow(self, rid: RID) -> Workflow:
        """Look up a workflow by its Resource Identifier (RID).

        Retrieves a workflow from the catalog by its RID and returns a Workflow
        object bound to the catalog. The returned Workflow can be modified (e.g.,
        updating its description) and changes will be reflected in the catalog.

        Args:
            rid: Resource Identifier of the workflow to look up.

        Returns:
            Workflow: The workflow object bound to this catalog, allowing
            properties like ``description`` to be updated.

        Raises:
            DerivaMLException: If the RID does not correspond to a workflow
                in the catalog.

        Examples:
            Look up a workflow and read its properties::

                >>> workflow = ml.lookup_workflow("2-ABC1")
                >>> print(f"Name: {workflow.name}")
                >>> print(f"Description: {workflow.description}")
                >>> print(f"Type: {workflow.workflow_type}")

            Update a workflow's description (persisted to catalog)::

                >>> workflow = ml.lookup_workflow("2-ABC1")
                >>> workflow.description = "Updated analysis pipeline for RNA sequences"
                >>> # The change is immediately written to the catalog

            Attempting to update on a read-only catalog raises an error::

                >>> snapshot = ml.catalog_snapshot("2023-01-15T10:30:00")
                >>> workflow = snapshot.lookup_workflow("2-ABC1")
                >>> workflow.description = "New description"
                DerivaMLException: Cannot update workflow description on a read-only
                    catalog snapshot. Use a writable catalog connection instead.
        """
        # Get the workflow table path
        workflow_path = self.pathBuilder().schemas[self.ml_schema].Workflow

        # Filter by RID
        records = list(workflow_path.filter(workflow_path.RID == rid).entities().fetch())

        if not records:
            raise DerivaMLException(f"Workflow with RID '{rid}' not found in the catalog")

        return self._workflow_from_record(records[0])

    def _find_workflow_rid_by_url(self, url_or_checksum: str) -> RID | None:
        """Internal method to find a workflow RID by URL or checksum.

        Every record of the Workflow table is fetched and compared client-side.
        When several records match, the RID of the last one fetched is returned.

        Args:
            url_or_checksum: URL or checksum of the workflow to find.

        Returns:
            RID: Resource Identifier of the workflow if found, None otherwise.
            An empty or ``None`` lookup key never matches.
        """
        # An empty/None key must not match: it would otherwise compare equal to
        # any record whose URL or Checksum value is itself missing.
        if not url_or_checksum:
            return None

        # Get a workflow table path
        workflow_path = self.pathBuilder().schemas[self.ml_schema].Workflow
        workflow_rid = None
        for w in workflow_path.entities().fetch():
            if url_or_checksum in (w["URL"], w["Checksum"]):
                # No early exit: the last matching record wins
                workflow_rid = w["RID"]

        return workflow_rid

    def lookup_workflow_by_url(self, url_or_checksum: str) -> Workflow:
        """Look up a workflow by URL or checksum and return the full Workflow object.

        Searches for a workflow in the catalog that matches the given URL or
        checksum and returns a Workflow object bound to the catalog. This allows
        you to both identify a workflow by its source code location and modify
        its properties (e.g., description).

        The URL should be a GitHub URL pointing to the specific version of the
        workflow source code. The format typically includes the commit hash::

            https://github.com/org/repo/blob/<commit_hash>/path/to/workflow.py

        Alternatively, you can search by the Git object hash (checksum) of the
        workflow file.

        Args:
            url_or_checksum: GitHub URL with commit hash, or Git object hash
                (checksum) of the workflow file.

        Returns:
            Workflow: The workflow object bound to this catalog, allowing
            properties like ``description`` to be updated.

        Raises:
            DerivaMLException: If no workflow with the given URL or checksum
                is found in the catalog.

        Examples:
            Look up a workflow by its GitHub URL::

                >>> url = "https://github.com/org/repo/blob/abc123/analysis.py"
                >>> workflow = ml.lookup_workflow_by_url(url)
                >>> print(f"Found: {workflow.name}")
                >>> print(f"Version: {workflow.version}")

            Look up by Git object hash (checksum)::

                >>> workflow = ml.lookup_workflow_by_url("abc123def456789...")
                >>> print(f"Name: {workflow.name}")
                >>> print(f"URL: {workflow.url}")

            Update the workflow's description after lookup::

                >>> workflow = ml.lookup_workflow_by_url(url)
                >>> workflow.description = "Updated analysis pipeline"
                >>> # The change is persisted to the catalog

            Typical GitHub URL formats supported::

                # Full blob URL with commit hash
                https://github.com/org/repo/blob/abc123def/src/workflow.py

                # The URL is matched exactly, so ensure it matches what was
                # recorded when the workflow was registered
        """
        # Find the RID first
        rid = self._find_workflow_rid_by_url(url_or_checksum)
        if rid is None:
            raise DerivaMLException(
                f"Workflow with URL or checksum '{url_or_checksum}' not found in the catalog"
            )

        # Use lookup_workflow to get the full object with catalog binding
        return self.lookup_workflow(rid)

    def create_workflow(self, name: str, workflow_type: str, description: str = "") -> Workflow:
        """Creates a new workflow definition.

        Creates a Workflow object that represents a computational process or analysis pipeline. The workflow type
        must be a term from the controlled vocabulary. This method is typically used to define new analysis
        workflows before execution.

        Args:
            name: Name of the workflow.
            workflow_type: Type of workflow (must exist in workflow_type vocabulary).
            description: Description of what the workflow does.

        Returns:
            Workflow: New workflow object ready for registration.

        Raises:
            DerivaMLException: If workflow_type is not in the vocabulary.

        Examples:
            >>> workflow = ml.create_workflow(
            ...     name="RNA Analysis",
            ...     workflow_type="python_notebook",
            ...     description="RNA sequence analysis pipeline"
            ... )
            >>> rid = ml.add_workflow(workflow)
        """
        # Validate workflow type exists in vocabulary; the returned term itself is not needed
        self.lookup_term(MLVocab.workflow_type, workflow_type)

        # Create and return a new workflow object
        return Workflow(name=name, workflow_type=workflow_type, description=description)