deriva-ml 1.17.10__py3-none-any.whl → 1.17.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/__init__.py +69 -1
- deriva_ml/asset/__init__.py +17 -0
- deriva_ml/asset/asset.py +357 -0
- deriva_ml/asset/aux_classes.py +100 -0
- deriva_ml/bump_version.py +254 -11
- deriva_ml/catalog/__init__.py +31 -0
- deriva_ml/catalog/clone.py +1939 -0
- deriva_ml/catalog/localize.py +426 -0
- deriva_ml/core/__init__.py +29 -0
- deriva_ml/core/base.py +845 -1067
- deriva_ml/core/config.py +169 -21
- deriva_ml/core/constants.py +120 -19
- deriva_ml/core/definitions.py +123 -13
- deriva_ml/core/enums.py +47 -73
- deriva_ml/core/ermrest.py +226 -193
- deriva_ml/core/exceptions.py +297 -14
- deriva_ml/core/filespec.py +99 -28
- deriva_ml/core/logging_config.py +225 -0
- deriva_ml/core/mixins/__init__.py +42 -0
- deriva_ml/core/mixins/annotation.py +915 -0
- deriva_ml/core/mixins/asset.py +384 -0
- deriva_ml/core/mixins/dataset.py +237 -0
- deriva_ml/core/mixins/execution.py +408 -0
- deriva_ml/core/mixins/feature.py +365 -0
- deriva_ml/core/mixins/file.py +263 -0
- deriva_ml/core/mixins/path_builder.py +145 -0
- deriva_ml/core/mixins/rid_resolution.py +204 -0
- deriva_ml/core/mixins/vocabulary.py +400 -0
- deriva_ml/core/mixins/workflow.py +322 -0
- deriva_ml/core/validation.py +389 -0
- deriva_ml/dataset/__init__.py +2 -1
- deriva_ml/dataset/aux_classes.py +20 -4
- deriva_ml/dataset/catalog_graph.py +575 -0
- deriva_ml/dataset/dataset.py +1242 -1008
- deriva_ml/dataset/dataset_bag.py +1311 -182
- deriva_ml/dataset/history.py +27 -14
- deriva_ml/dataset/upload.py +225 -38
- deriva_ml/demo_catalog.py +126 -110
- deriva_ml/execution/__init__.py +46 -2
- deriva_ml/execution/base_config.py +639 -0
- deriva_ml/execution/execution.py +543 -242
- deriva_ml/execution/execution_configuration.py +26 -11
- deriva_ml/execution/execution_record.py +592 -0
- deriva_ml/execution/find_caller.py +298 -0
- deriva_ml/execution/model_protocol.py +175 -0
- deriva_ml/execution/multirun_config.py +153 -0
- deriva_ml/execution/runner.py +595 -0
- deriva_ml/execution/workflow.py +223 -34
- deriva_ml/experiment/__init__.py +8 -0
- deriva_ml/experiment/experiment.py +411 -0
- deriva_ml/feature.py +6 -1
- deriva_ml/install_kernel.py +143 -6
- deriva_ml/interfaces.py +862 -0
- deriva_ml/model/__init__.py +99 -0
- deriva_ml/model/annotations.py +1278 -0
- deriva_ml/model/catalog.py +286 -60
- deriva_ml/model/database.py +144 -649
- deriva_ml/model/deriva_ml_database.py +308 -0
- deriva_ml/model/handles.py +14 -0
- deriva_ml/run_model.py +319 -0
- deriva_ml/run_notebook.py +507 -38
- deriva_ml/schema/__init__.py +18 -2
- deriva_ml/schema/annotations.py +62 -33
- deriva_ml/schema/create_schema.py +169 -69
- deriva_ml/schema/validation.py +601 -0
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/METADATA +4 -4
- deriva_ml-1.17.12.dist-info/RECORD +77 -0
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/WHEEL +1 -1
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/entry_points.txt +1 -0
- deriva_ml/protocols/dataset.py +0 -19
- deriva_ml/test.py +0 -94
- deriva_ml-1.17.10.dist-info/RECORD +0 -45
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
"""Workflow management mixin for DerivaML.
|
|
2
|
+
|
|
3
|
+
This module provides the WorkflowMixin class which handles
|
|
4
|
+
workflow operations including adding, looking up, listing,
|
|
5
|
+
and creating workflows.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import TYPE_CHECKING, Any, Callable
|
|
11
|
+
|
|
12
|
+
# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
|
|
13
|
+
import importlib
|
|
14
|
+
_deriva_core = importlib.import_module("deriva.core")
|
|
15
|
+
format_exception = _deriva_core.format_exception
|
|
16
|
+
|
|
17
|
+
from deriva_ml.core.definitions import RID, MLVocab, VocabularyTerm
|
|
18
|
+
from deriva_ml.core.exceptions import DerivaMLException
|
|
19
|
+
from deriva_ml.execution.workflow import Workflow
|
|
20
|
+
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
from deriva_ml.interfaces import DerivaMLCatalog
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class WorkflowMixin:
    """Mixin providing workflow management operations.

    This mixin requires the host class to have:
        - ml_schema: str - name of the ML schema
        - pathBuilder(): method returning catalog path builder
        - lookup_term(): method for vocabulary term lookup (from VocabularyMixin)

    Methods:
        find_workflows: Find all workflows in the catalog
        add_workflow: Add a workflow to the catalog
        lookup_workflow: Look up a workflow by RID
        lookup_workflow_by_url: Look up a workflow by URL or checksum
        create_workflow: Create a new workflow definition
    """

    # Type hints for IDE support - actual attributes/methods from host class
    ml_schema: str
    pathBuilder: Callable[[], Any]
    lookup_term: Callable[[str, str], VocabularyTerm]

    def _workflow_from_record(self, record: dict[str, Any]) -> Workflow:
        """Build a catalog-bound Workflow from a Workflow table row.

        Args:
            record: A row fetched from the Workflow table. The keys ``Name``,
                ``URL``, ``Workflow_Type``, ``Version``, ``Description``,
                ``RID`` and ``Checksum`` are read.

        Returns:
            Workflow: Workflow populated from the row, with ``_ml_instance``
            set to this object.
        """
        workflow = Workflow(
            name=record["Name"],
            url=record["URL"],
            workflow_type=record["Workflow_Type"],
            version=record["Version"],
            description=record["Description"],
            rid=record["RID"],
            checksum=record["Checksum"],
        )
        # Bind the workflow to this catalog instance for write-back support
        workflow._ml_instance = self  # type: ignore[assignment]
        return workflow

    def find_workflows(self) -> list[Workflow]:
        """Find all workflows in the catalog.

        Catalog-level operation to find all workflow definitions, including their
        names, URLs, types, versions, and descriptions. Each returned Workflow
        is bound to the catalog, allowing its description to be updated.

        Returns:
            list[Workflow]: List of workflow objects, each containing:
                - name: Workflow name
                - url: Source code URL
                - workflow_type: Type of workflow
                - version: Version identifier
                - description: Workflow description
                - rid: Resource identifier
                - checksum: Source code checksum

        Examples:
            List all workflows and their descriptions::

                >>> workflows = ml.find_workflows()
                >>> for w in workflows:
                ...     print(f"{w.name} (v{w.version}): {w.description}")
                ...     print(f"  Source: {w.url}")

            Update a workflow's description (workflows are catalog-bound)::

                >>> workflows = ml.find_workflows()
                >>> workflows[0].description = "Updated description"
        """
        # Get a workflow table path and fetch all workflows
        workflow_path = self.pathBuilder().schemas[self.ml_schema].Workflow
        return [self._workflow_from_record(w) for w in workflow_path.entities().fetch()]

    def add_workflow(self, workflow: Workflow) -> RID:
        """Adds a workflow to the catalog.

        Registers a new workflow in the catalog or returns the RID of an existing workflow with the same
        URL or checksum. The existing-workflow check uses the checksum when the workflow has one and
        falls back to the URL otherwise.

        Each workflow represents a specific computational process or analysis pipeline.

        Args:
            workflow: Workflow object containing name, URL, type, version, and description.

        Returns:
            RID: Resource Identifier of the added or existing workflow.

        Raises:
            DerivaMLException: If workflow insertion fails or required fields are missing.
                The original error is attached as the exception's cause.

        Examples:
            >>> workflow = Workflow(
            ...     name="Gene Analysis",
            ...     url="https://github.com/org/repo/workflows/gene_analysis.py",
            ...     workflow_type="python_script",
            ...     version="1.0.0",
            ...     description="Analyzes gene expression patterns"
            ... )
            >>> workflow_rid = ml.add_workflow(workflow)
        """
        # Check if a workflow already exists by URL or checksum
        if workflow_rid := self._find_workflow_rid_by_url(workflow.checksum or workflow.url):
            return workflow_rid

        # Get an ML schema path for the workflow table
        ml_schema_path = self.pathBuilder().schemas[self.ml_schema]

        try:
            # Create a workflow record; the type is resolved through the
            # vocabulary so the name returned by lookup_term is what is stored.
            workflow_record = {
                "URL": workflow.url,
                "Name": workflow.name,
                "Description": workflow.description,
                "Checksum": workflow.checksum,
                "Version": workflow.version,
                MLVocab.workflow_type: self.lookup_term(MLVocab.workflow_type, workflow.workflow_type).name,
            }
            # Insert a workflow and get its RID
            workflow_rid = ml_schema_path.Workflow.insert([workflow_record])[0]["RID"]
        except Exception as e:
            error = format_exception(e)
            # Chain the original exception so the root cause stays in the traceback
            raise DerivaMLException(f"Failed to insert workflow. Error: {error}") from e
        return workflow_rid

    def lookup_workflow(self, rid: RID) -> Workflow:
        """Look up a workflow by its Resource Identifier (RID).

        Retrieves a workflow from the catalog by its RID and returns a Workflow
        object bound to the catalog. The returned Workflow can be modified (e.g.,
        updating its description) and changes will be reflected in the catalog.

        Args:
            rid: Resource Identifier of the workflow to look up.

        Returns:
            Workflow: The workflow object bound to this catalog, allowing
                properties like ``description`` to be updated.

        Raises:
            DerivaMLException: If the RID does not correspond to a workflow
                in the catalog.

        Examples:
            Look up a workflow and read its properties::

                >>> workflow = ml.lookup_workflow("2-ABC1")
                >>> print(f"Name: {workflow.name}")
                >>> print(f"Description: {workflow.description}")
                >>> print(f"Type: {workflow.workflow_type}")

            Update a workflow's description (persisted to catalog)::

                >>> workflow = ml.lookup_workflow("2-ABC1")
                >>> workflow.description = "Updated analysis pipeline for RNA sequences"
                >>> # The change is immediately written to the catalog

            Attempting to update on a read-only catalog raises an error::

                >>> snapshot = ml.catalog_snapshot("2023-01-15T10:30:00")
                >>> workflow = snapshot.lookup_workflow("2-ABC1")
                >>> workflow.description = "New description"
                DerivaMLException: Cannot update workflow description on a read-only
                catalog snapshot. Use a writable catalog connection instead.
        """
        # Get the workflow table path
        workflow_path = self.pathBuilder().schemas[self.ml_schema].Workflow

        # Filter by RID
        records = list(workflow_path.filter(workflow_path.RID == rid).entities().fetch())

        if not records:
            raise DerivaMLException(f"Workflow with RID '{rid}' not found in the catalog")

        return self._workflow_from_record(records[0])

    def _find_workflow_rid_by_url(self, url_or_checksum: str) -> RID | None:
        """Internal method to find a workflow RID by URL or checksum.

        Every row of the Workflow table is fetched and compared; when several
        rows match, the RID of the last matching row fetched is returned.

        Args:
            url_or_checksum: URL or checksum of the workflow to find. An empty
                or ``None`` value never matches.

        Returns:
            RID: Resource Identifier of the workflow if found, None otherwise.
        """
        # Without this guard a None key would compare equal to the null
        # Checksum of an unrelated row and return that row's RID.
        if not url_or_checksum:
            return None

        # Get a workflow table path
        workflow_path = self.pathBuilder().schemas[self.ml_schema].Workflow
        workflow_rid = None
        for w in workflow_path.path.entities().fetch():
            if w["URL"] == url_or_checksum or w["Checksum"] == url_or_checksum:
                workflow_rid = w["RID"]

        return workflow_rid

    def lookup_workflow_by_url(self, url_or_checksum: str) -> Workflow:
        """Look up a workflow by URL or checksum and return the full Workflow object.

        Searches for a workflow in the catalog that matches the given URL or
        checksum and returns a Workflow object bound to the catalog. This allows
        you to both identify a workflow by its source code location and modify
        its properties (e.g., description).

        The URL should be a GitHub URL pointing to the specific version of the
        workflow source code. The format typically includes the commit hash::

            https://github.com/org/repo/blob/<commit_hash>/path/to/workflow.py

        Alternatively, you can search by the Git object hash (checksum) of the
        workflow file.

        Args:
            url_or_checksum: GitHub URL with commit hash, or Git object hash
                (checksum) of the workflow file.

        Returns:
            Workflow: The workflow object bound to this catalog, allowing
                properties like ``description`` to be updated.

        Raises:
            DerivaMLException: If no workflow with the given URL or checksum
                is found in the catalog.

        Examples:
            Look up a workflow by its GitHub URL::

                >>> url = "https://github.com/org/repo/blob/abc123/analysis.py"
                >>> workflow = ml.lookup_workflow_by_url(url)
                >>> print(f"Found: {workflow.name}")
                >>> print(f"Version: {workflow.version}")

            Look up by Git object hash (checksum)::

                >>> workflow = ml.lookup_workflow_by_url("abc123def456789...")
                >>> print(f"Name: {workflow.name}")
                >>> print(f"URL: {workflow.url}")

            Update the workflow's description after lookup::

                >>> workflow = ml.lookup_workflow_by_url(url)
                >>> workflow.description = "Updated analysis pipeline"
                >>> # The change is persisted to the catalog

            Typical GitHub URL formats supported::

                # Full blob URL with commit hash
                https://github.com/org/repo/blob/abc123def/src/workflow.py

                # The URL is matched exactly, so ensure it matches what was
                # recorded when the workflow was registered
        """
        # Find the RID first
        rid = self._find_workflow_rid_by_url(url_or_checksum)
        if rid is None:
            raise DerivaMLException(
                f"Workflow with URL or checksum '{url_or_checksum}' not found in the catalog"
            )

        # Use lookup_workflow to get the full object with catalog binding
        return self.lookup_workflow(rid)

    def create_workflow(self, name: str, workflow_type: str, description: str = "") -> Workflow:
        """Creates a new workflow definition.

        Creates a Workflow object that represents a computational process or analysis pipeline. The workflow type
        must be a term from the controlled vocabulary. This method is typically used to define new analysis
        workflows before execution.

        Args:
            name: Name of the workflow.
            workflow_type: Type of workflow (must exist in workflow_type vocabulary).
            description: Description of what the workflow does.

        Returns:
            Workflow: New workflow object ready for registration.

        Raises:
            DerivaMLException: If workflow_type is not in the vocabulary.

        Examples:
            >>> workflow = ml.create_workflow(
            ...     name="RNA Analysis",
            ...     workflow_type="python_notebook",
            ...     description="RNA sequence analysis pipeline"
            ... )
            >>> rid = ml.add_workflow(workflow)
        """
        # Validate workflow type exists in vocabulary; the result is discarded,
        # the call is made only so an unknown type raises before construction.
        self.lookup_term(MLVocab.workflow_type, workflow_type)

        # Create and return a new workflow object
        return Workflow(name=name, workflow_type=workflow_type, description=description)
|