deriva-ml 1.14.0__py3-none-any.whl → 1.14.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/__init__.py +25 -30
- deriva_ml/core/__init__.py +39 -0
- deriva_ml/core/base.py +1489 -0
- deriva_ml/core/constants.py +36 -0
- deriva_ml/core/definitions.py +74 -0
- deriva_ml/core/enums.py +222 -0
- deriva_ml/core/ermrest.py +288 -0
- deriva_ml/core/exceptions.py +28 -0
- deriva_ml/core/filespec.py +116 -0
- deriva_ml/dataset/__init__.py +4 -0
- deriva_ml/{dataset_aux_classes.py → dataset/aux_classes.py} +16 -12
- deriva_ml/{dataset.py → dataset/dataset.py} +406 -428
- deriva_ml/{dataset_bag.py → dataset/dataset_bag.py} +137 -97
- deriva_ml/{history.py → dataset/history.py} +51 -33
- deriva_ml/{upload.py → dataset/upload.py} +48 -70
- deriva_ml/demo_catalog.py +233 -183
- deriva_ml/execution/environment.py +290 -0
- deriva_ml/{execution.py → execution/execution.py} +365 -252
- deriva_ml/execution/execution_configuration.py +163 -0
- deriva_ml/{execution_configuration.py → execution/workflow.py} +212 -224
- deriva_ml/feature.py +83 -46
- deriva_ml/model/__init__.py +0 -0
- deriva_ml/{deriva_model.py → model/catalog.py} +113 -132
- deriva_ml/{database_model.py → model/database.py} +52 -74
- deriva_ml/model/sql_mapper.py +44 -0
- deriva_ml/run_notebook.py +19 -11
- deriva_ml/schema/__init__.py +3 -0
- deriva_ml/{schema_setup → schema}/annotations.py +31 -22
- deriva_ml/schema/check_schema.py +104 -0
- deriva_ml/{schema_setup → schema}/create_schema.py +151 -104
- deriva_ml/schema/deriva-ml-reference.json +8525 -0
- deriva_ml/schema/table_comments_utils.py +57 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/METADATA +5 -4
- deriva_ml-1.14.27.dist-info/RECORD +40 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/entry_points.txt +1 -0
- deriva_ml/deriva_definitions.py +0 -391
- deriva_ml/deriva_ml_base.py +0 -1046
- deriva_ml/execution_environment.py +0 -139
- deriva_ml/schema_setup/table_comments_utils.py +0 -56
- deriva_ml/test-files/execution-parameters.json +0 -1
- deriva_ml/test-files/notebook-parameters.json +0 -5
- deriva_ml/test_functions.py +0 -141
- deriva_ml/test_notebook.ipynb +0 -197
- deriva_ml-1.14.0.dist-info/RECORD +0 -31
- /deriva_ml/{schema_setup → execution}/__init__.py +0 -0
- /deriva_ml/{schema_setup → schema}/policy.json +0 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/WHEEL +0 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/top_level.txt +0 -0
|
@@ -1,91 +1,239 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Classes that are used to define an execution configuration.
|
|
3
|
-
"""
|
|
4
|
-
|
|
5
|
-
from __future__ import annotations
|
|
6
|
-
|
|
7
1
|
import inspect
|
|
8
|
-
import json
|
|
9
2
|
import logging
|
|
10
3
|
import os
|
|
11
|
-
|
|
12
|
-
from requests import RequestException
|
|
13
|
-
import requests
|
|
14
4
|
import subprocess
|
|
15
|
-
from
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
16
7
|
|
|
8
|
+
import requests
|
|
17
9
|
from pydantic import (
|
|
18
10
|
BaseModel,
|
|
19
|
-
conlist,
|
|
20
|
-
ConfigDict,
|
|
21
|
-
field_validator,
|
|
22
|
-
Field,
|
|
23
11
|
PrivateAttr,
|
|
24
12
|
)
|
|
25
|
-
from
|
|
26
|
-
import sys
|
|
27
|
-
|
|
13
|
+
from requests import RequestException
|
|
28
14
|
|
|
29
|
-
from .
|
|
30
|
-
from .
|
|
15
|
+
from deriva_ml.core.definitions import RID
|
|
16
|
+
from deriva_ml.core.exceptions import DerivaMLException
|
|
31
17
|
|
|
32
18
|
try:
|
|
33
|
-
from IPython import get_ipython
|
|
19
|
+
from IPython.core.getipython import get_ipython
|
|
34
20
|
except ImportError: # Graceful fallback if IPython isn't installed.
|
|
35
21
|
|
|
36
|
-
def get_ipython():
|
|
37
|
-
"""Dummy routine in case you are not running in IPython."""
|
|
22
|
+
def get_ipython() -> None:
|
|
38
23
|
return None
|
|
39
24
|
|
|
40
25
|
|
|
41
26
|
try:
|
|
42
27
|
from jupyter_server.serverapp import list_running_servers
|
|
28
|
+
|
|
29
|
+
def get_servers() -> list[Any]:
|
|
30
|
+
return list(list_running_servers())
|
|
43
31
|
except ImportError:
|
|
44
32
|
|
|
45
33
|
def list_running_servers():
|
|
46
|
-
"""Dummy routine in case you are not running in Jupyter."""
|
|
47
34
|
return []
|
|
48
35
|
|
|
36
|
+
def get_servers() -> list[Any]:
|
|
37
|
+
return list_running_servers()
|
|
38
|
+
|
|
49
39
|
|
|
50
40
|
try:
|
|
51
|
-
from ipykernel import get_connection_file
|
|
41
|
+
from ipykernel.connect import get_connection_file
|
|
42
|
+
|
|
43
|
+
def get_kernel_connection() -> str:
|
|
44
|
+
return get_connection_file()
|
|
52
45
|
except ImportError:
|
|
53
46
|
|
|
54
47
|
def get_connection_file():
|
|
55
|
-
"""Dummy routine in case you are not running in Jupyter."""
|
|
56
48
|
return ""
|
|
57
49
|
|
|
50
|
+
def get_kernel_connection() -> str:
|
|
51
|
+
return get_connection_file()
|
|
52
|
+
|
|
58
53
|
|
|
59
54
|
class Workflow(BaseModel):
|
|
60
|
-
"""
|
|
61
|
-
|
|
55
|
+
"""Represents a computational workflow in DerivaML.
|
|
56
|
+
|
|
57
|
+
A workflow defines a computational process or analysis pipeline. Each workflow has
|
|
58
|
+
a unique identifier, source code location, and type. Workflows are typically
|
|
59
|
+
associated with Git repositories for version control.
|
|
62
60
|
|
|
63
61
|
Attributes:
|
|
64
|
-
name:
|
|
65
|
-
url:
|
|
66
|
-
workflow_type:
|
|
67
|
-
version
|
|
68
|
-
description:
|
|
69
|
-
|
|
62
|
+
name (str): Human-readable name of the workflow.
|
|
63
|
+
url (str): URI to the workflow source code (typically a GitHub URL).
|
|
64
|
+
workflow_type (str): Type of workflow (must be a controlled vocabulary term).
|
|
65
|
+
version (str | None): Version identifier (semantic versioning).
|
|
66
|
+
description (str | None): Description of workflow purpose and behavior.
|
|
67
|
+
rid (RID | None): Resource Identifier if registered in catalog.
|
|
68
|
+
checksum (str | None): Git hash of workflow source code.
|
|
69
|
+
is_notebook (bool): Whether workflow is a Jupyter notebook.
|
|
70
|
+
|
|
71
|
+
Example:
|
|
72
|
+
>>> workflow = Workflow(
|
|
73
|
+
... name="RNA Analysis",
|
|
74
|
+
... url="https://github.com/org/repo/analysis.ipynb",
|
|
75
|
+
... workflow_type="python_notebook",
|
|
76
|
+
... version="1.0.0",
|
|
77
|
+
... description="RNA sequence analysis"
|
|
78
|
+
... )
|
|
70
79
|
"""
|
|
71
80
|
|
|
72
81
|
name: str
|
|
73
82
|
url: str
|
|
74
83
|
workflow_type: str
|
|
75
|
-
version:
|
|
76
|
-
description: str = None
|
|
77
|
-
rid:
|
|
78
|
-
checksum:
|
|
84
|
+
version: str | None = None
|
|
85
|
+
description: str | None = None
|
|
86
|
+
rid: RID | None = None
|
|
87
|
+
checksum: str | None = None
|
|
79
88
|
is_notebook: bool = False
|
|
80
89
|
|
|
81
90
|
_logger: Any = PrivateAttr()
|
|
82
91
|
|
|
83
92
|
def __post_init__(self):
|
|
93
|
+
"""Initializes logging for the workflow."""
|
|
84
94
|
self._logger = logging.getLogger("deriva_ml")
|
|
85
95
|
|
|
96
|
+
@staticmethod
|
|
97
|
+
def create_workflow(
|
|
98
|
+
name: str,
|
|
99
|
+
workflow_type: str,
|
|
100
|
+
description: str = "",
|
|
101
|
+
) -> "Workflow":
|
|
102
|
+
"""Creates a workflow from the current execution context.
|
|
103
|
+
|
|
104
|
+
Identifies the currently executing program (script or notebook) and creates
|
|
105
|
+
a workflow definition. Automatically determines the Git repository information
|
|
106
|
+
and source code checksum.
|
|
107
|
+
|
|
108
|
+
The behavior can be configured using environment variables:
|
|
109
|
+
- DERIVA_ML_WORKFLOW_URL: Override the detected workflow URL
|
|
110
|
+
- DERIVA_ML_WORKFLOW_CHECKSUM: Override the computed checksum
|
|
111
|
+
|
|
112
|
+
Args:
|
|
113
|
+
name: Human-readable name for the workflow.
|
|
114
|
+
workflow_type: Type of workflow (must be a vocabulary term).
|
|
115
|
+
description: Optional description of workflow purpose.
|
|
116
|
+
|
|
117
|
+
Returns:
|
|
118
|
+
Workflow: New workflow instance with detected Git information.
|
|
119
|
+
|
|
120
|
+
Raises:
|
|
121
|
+
DerivaMLException: If not in a Git repository or detection fails.
|
|
122
|
+
|
|
123
|
+
Example:
|
|
124
|
+
>>> workflow = Workflow.create_workflow(
|
|
125
|
+
... name="Sample Analysis",
|
|
126
|
+
... workflow_type="python_script",
|
|
127
|
+
... description="Process sample data"
|
|
128
|
+
... )
|
|
129
|
+
"""
|
|
130
|
+
|
|
131
|
+
# Check to see if execution file info is being passed in by calling program.
|
|
132
|
+
if "DERIVA_ML_WORKFLOW_URL" in os.environ:
|
|
133
|
+
github_url = os.environ["DERIVA_ML_WORKFLOW_URL"]
|
|
134
|
+
checksum = os.environ["DERIVA_ML_WORKFLOW_CHECKSUM"]
|
|
135
|
+
is_notebook = True
|
|
136
|
+
else:
|
|
137
|
+
path, is_notebook = Workflow._get_python_script()
|
|
138
|
+
github_url, checksum = Workflow.get_url_and_checksum(path)
|
|
139
|
+
|
|
140
|
+
return Workflow(
|
|
141
|
+
name=name,
|
|
142
|
+
url=github_url,
|
|
143
|
+
checksum=checksum,
|
|
144
|
+
description=description,
|
|
145
|
+
workflow_type=workflow_type,
|
|
146
|
+
is_notebook=is_notebook,
|
|
147
|
+
)
|
|
148
|
+
|
|
149
|
+
@staticmethod
|
|
150
|
+
def get_url_and_checksum(executable_path: Path) -> tuple[str, str]:
|
|
151
|
+
"""Determines the Git URL and checksum for a file.
|
|
152
|
+
|
|
153
|
+
Computes the Git repository URL and file checksum for the specified path.
|
|
154
|
+
For notebooks, strips cell outputs before computing the checksum.
|
|
155
|
+
|
|
156
|
+
Args:
|
|
157
|
+
executable_path: Path to the workflow file.
|
|
158
|
+
|
|
159
|
+
Returns:
|
|
160
|
+
tuple[str, str]: (GitHub URL, Git object hash)
|
|
161
|
+
|
|
162
|
+
Raises:
|
|
163
|
+
DerivaMLException: If not in a Git repository.
|
|
164
|
+
|
|
165
|
+
Example:
|
|
166
|
+
>>> url, checksum = Workflow.get_url_and_checksum(Path("analysis.ipynb"))
|
|
167
|
+
>>> print(f"URL: {url}")
|
|
168
|
+
>>> print(f"Checksum: {checksum}")
|
|
169
|
+
"""
|
|
170
|
+
try:
|
|
171
|
+
subprocess.run(
|
|
172
|
+
"git rev-parse --is-inside-work-tree",
|
|
173
|
+
capture_output=True,
|
|
174
|
+
text=True,
|
|
175
|
+
shell=True,
|
|
176
|
+
check=True,
|
|
177
|
+
)
|
|
178
|
+
except subprocess.CalledProcessError:
|
|
179
|
+
raise DerivaMLException("Not executing in a Git repository.")
|
|
180
|
+
|
|
181
|
+
github_url, is_dirty = Workflow._github_url(executable_path)
|
|
182
|
+
|
|
183
|
+
if is_dirty:
|
|
184
|
+
logging.getLogger("deriva_ml").warning(
|
|
185
|
+
f"File {executable_path} has been modified since last commit. Consider commiting before executing"
|
|
186
|
+
)
|
|
187
|
+
|
|
188
|
+
# If you are in a notebook, strip out the outputs before computing the checksum.
|
|
189
|
+
cmd = (
|
|
190
|
+
f"nbstripout -t {executable_path} | git hash-object --stdin"
|
|
191
|
+
if "ipynb" == executable_path.suffix
|
|
192
|
+
else f"git hash-object {executable_path}"
|
|
193
|
+
)
|
|
194
|
+
checksum = (
|
|
195
|
+
subprocess.run(
|
|
196
|
+
cmd,
|
|
197
|
+
capture_output=True,
|
|
198
|
+
text=True,
|
|
199
|
+
check=False,
|
|
200
|
+
shell=True,
|
|
201
|
+
).stdout.strip()
|
|
202
|
+
if executable_path != "REPL"
|
|
203
|
+
else "1"
|
|
204
|
+
)
|
|
205
|
+
return github_url, checksum
|
|
206
|
+
|
|
207
|
+
@staticmethod
|
|
208
|
+
def _get_git_root(executable_path: Path) -> str | None:
|
|
209
|
+
"""Gets the root directory of the Git repository.
|
|
210
|
+
|
|
211
|
+
Args:
|
|
212
|
+
executable_path: Path to check for Git repository.
|
|
213
|
+
|
|
214
|
+
Returns:
|
|
215
|
+
str | None: Absolute path to repository root, or None if not in repository.
|
|
216
|
+
"""
|
|
217
|
+
try:
|
|
218
|
+
result = subprocess.run(
|
|
219
|
+
["git", "rev-parse", "--show-toplevel"],
|
|
220
|
+
cwd=executable_path.parent,
|
|
221
|
+
stdout=subprocess.PIPE,
|
|
222
|
+
stderr=subprocess.DEVNULL,
|
|
223
|
+
text=True,
|
|
224
|
+
check=True,
|
|
225
|
+
)
|
|
226
|
+
return result.stdout.strip()
|
|
227
|
+
except subprocess.CalledProcessError:
|
|
228
|
+
return None # Not in a git repository
|
|
229
|
+
|
|
86
230
|
@staticmethod
|
|
87
231
|
def _check_nbstrip_status() -> None:
|
|
88
|
-
"""
|
|
232
|
+
"""Checks if nbstripout is installed and configured.
|
|
233
|
+
|
|
234
|
+
Verifies that the nbstripout tool is available and properly installed in the
|
|
235
|
+
Git repository. Issues warnings if setup is incomplete.
|
|
236
|
+
"""
|
|
89
237
|
logger = logging.getLogger("deriva_ml")
|
|
90
238
|
try:
|
|
91
239
|
if subprocess.run(
|
|
@@ -93,15 +241,17 @@ class Workflow(BaseModel):
|
|
|
93
241
|
check=False,
|
|
94
242
|
capture_output=True,
|
|
95
243
|
).returncode:
|
|
96
|
-
logger.warning(
|
|
97
|
-
"nbstripout is not installed in repository. Please run nbstripout --install"
|
|
98
|
-
)
|
|
244
|
+
logger.warning("nbstripout is not installed in repository. Please run nbstripout --install")
|
|
99
245
|
except subprocess.CalledProcessError:
|
|
100
246
|
logger.error("nbstripout is not found.")
|
|
101
247
|
|
|
102
248
|
@staticmethod
|
|
103
249
|
def _get_notebook_path() -> Path | None:
|
|
104
|
-
"""
|
|
250
|
+
"""Gets the path of the currently executing notebook.
|
|
251
|
+
|
|
252
|
+
Returns:
|
|
253
|
+
Path | None: Absolute path to current notebook, or None if not in notebook.
|
|
254
|
+
"""
|
|
105
255
|
|
|
106
256
|
server, session = Workflow._get_notebook_session()
|
|
107
257
|
if server and session:
|
|
@@ -116,7 +266,7 @@ class Workflow(BaseModel):
|
|
|
116
266
|
"""Return the absolute path of the current notebook."""
|
|
117
267
|
# Get the kernel's connection file and extract the kernel ID
|
|
118
268
|
try:
|
|
119
|
-
if not (connection_file := Path(
|
|
269
|
+
if not (connection_file := Path(get_kernel_connection()).name):
|
|
120
270
|
return None, None
|
|
121
271
|
except RuntimeError:
|
|
122
272
|
return None, None
|
|
@@ -124,7 +274,7 @@ class Workflow(BaseModel):
|
|
|
124
274
|
kernel_id = connection_file.split("-", 1)[1].split(".")[0]
|
|
125
275
|
|
|
126
276
|
# Look through the running server sessions to find the matching kernel ID
|
|
127
|
-
for server in
|
|
277
|
+
for server in get_servers():
|
|
128
278
|
try:
|
|
129
279
|
# If a token is required for authentication, include it in headers
|
|
130
280
|
token = server.get("token", "")
|
|
@@ -155,32 +305,30 @@ class Workflow(BaseModel):
|
|
|
155
305
|
is_notebook = False
|
|
156
306
|
stack = inspect.stack()
|
|
157
307
|
# Get the caller's filename, which is two up the stack from here.
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
"Looking for caller failed"
|
|
167
|
-
) # Stack is too shallow
|
|
308
|
+
|
|
309
|
+
filename = Path(stack[-1].filename)
|
|
310
|
+
if not filename.exists():
|
|
311
|
+
# Being called from the command line interpreter.
|
|
312
|
+
filename = Path("REPL")
|
|
313
|
+
# Get the caller's filename, which is two up the stack from here.
|
|
314
|
+
else:
|
|
315
|
+
raise DerivaMLException("Looking for caller failed") # Stack is too shallow
|
|
168
316
|
return filename, is_notebook
|
|
169
317
|
|
|
170
318
|
@staticmethod
|
|
171
319
|
def _github_url(executable_path: Path) -> tuple[str, bool]:
|
|
172
|
-
"""Return a
|
|
320
|
+
"""Return a GitHub URL for the latest commit of the script from which this routine is called.
|
|
173
321
|
|
|
174
|
-
This routine is used to be called from a script or notebook (e.g
|
|
175
|
-
the file is in a
|
|
176
|
-
file in
|
|
322
|
+
This routine is used to be called from a script or notebook (e.g., python -m file). It assumes that
|
|
323
|
+
the file is in a GitHub repository and committed. It returns a URL to the last commited version of this
|
|
324
|
+
file in GitHub.
|
|
177
325
|
|
|
178
|
-
Returns: A tuple with the gethub_url and a boolean to
|
|
326
|
+
Returns: A tuple with the gethub_url and a boolean to indicate if uncommited changes
|
|
179
327
|
have been made to the file.
|
|
180
328
|
|
|
181
329
|
"""
|
|
182
330
|
|
|
183
|
-
# Get repo URL from local
|
|
331
|
+
# Get repo URL from local GitHub repo.
|
|
184
332
|
if executable_path == "REPL":
|
|
185
333
|
return "REPL", True
|
|
186
334
|
try:
|
|
@@ -197,7 +345,7 @@ class Workflow(BaseModel):
|
|
|
197
345
|
# Find the root directory for the repository
|
|
198
346
|
repo_root = Workflow._get_git_root(executable_path)
|
|
199
347
|
|
|
200
|
-
# Now check to see if file has been modified since the last commit.
|
|
348
|
+
# Now check to see if a file has been modified since the last commit.
|
|
201
349
|
try:
|
|
202
350
|
result = subprocess.run(
|
|
203
351
|
["git", "status", "--porcelain"],
|
|
@@ -206,11 +354,9 @@ class Workflow(BaseModel):
|
|
|
206
354
|
text=True,
|
|
207
355
|
check=True,
|
|
208
356
|
)
|
|
209
|
-
is_dirty = bool(
|
|
210
|
-
"M " in result.stdout.strip()
|
|
211
|
-
) # Returns True if output indicates a modified file
|
|
357
|
+
is_dirty = bool("M " in result.stdout.strip()) # Returns True if the output indicates a modified file
|
|
212
358
|
except subprocess.CalledProcessError:
|
|
213
|
-
is_dirty = False # If Git command fails, assume no changes
|
|
359
|
+
is_dirty = False # If the Git command fails, assume no changes
|
|
214
360
|
|
|
215
361
|
"""Get SHA-1 hash of latest commit of the file in the repository"""
|
|
216
362
|
result = subprocess.run(
|
|
@@ -223,161 +369,3 @@ class Workflow(BaseModel):
|
|
|
223
369
|
sha = result.stdout.strip()
|
|
224
370
|
url = f"{github_url}/blob/{sha}/{executable_path.relative_to(repo_root)}"
|
|
225
371
|
return url, is_dirty
|
|
226
|
-
|
|
227
|
-
@staticmethod
|
|
228
|
-
def _get_git_root(executable_path: Path):
|
|
229
|
-
try:
|
|
230
|
-
result = subprocess.run(
|
|
231
|
-
["git", "rev-parse", "--show-toplevel"],
|
|
232
|
-
cwd=executable_path.parent,
|
|
233
|
-
stdout=subprocess.PIPE,
|
|
234
|
-
stderr=subprocess.DEVNULL,
|
|
235
|
-
text=True,
|
|
236
|
-
check=True,
|
|
237
|
-
)
|
|
238
|
-
return result.stdout.strip()
|
|
239
|
-
except subprocess.CalledProcessError:
|
|
240
|
-
return None # Not in a git repository
|
|
241
|
-
|
|
242
|
-
@staticmethod
|
|
243
|
-
def create_workflow(
|
|
244
|
-
name: str,
|
|
245
|
-
workflow_type: str,
|
|
246
|
-
description: str = "",
|
|
247
|
-
) -> Workflow:
|
|
248
|
-
"""Identify current executing program and return a workflow RID for it
|
|
249
|
-
|
|
250
|
-
Determine the notebook or script that is currently being executed. Assume that this is
|
|
251
|
-
being executed from a cloned GitHub repository. Determine the remote repository name for
|
|
252
|
-
this object. Then either retrieve an existing workflow for this executable or create
|
|
253
|
-
a new one.
|
|
254
|
-
|
|
255
|
-
Args:
|
|
256
|
-
name: The name of the workflow.
|
|
257
|
-
workflow_type: The type of the workflow.
|
|
258
|
-
description: The description of the workflow.
|
|
259
|
-
"""
|
|
260
|
-
|
|
261
|
-
# Check to see if execution file info is being passed in by calling program.
|
|
262
|
-
if "DERIVA_ML_WORKFLOW_URL" in os.environ:
|
|
263
|
-
github_url = os.environ["DERIVA_ML_WORKFLOW_URL"]
|
|
264
|
-
checksum = os.environ["DERIVA_ML_WORKFLOW_CHECKSUM"]
|
|
265
|
-
is_notebook = True
|
|
266
|
-
else:
|
|
267
|
-
path, is_notebook = Workflow._get_python_script()
|
|
268
|
-
github_url, checksum = Workflow.get_url_and_checksum(path)
|
|
269
|
-
|
|
270
|
-
return Workflow(
|
|
271
|
-
name=name,
|
|
272
|
-
url=github_url,
|
|
273
|
-
checksum=checksum,
|
|
274
|
-
description=description,
|
|
275
|
-
workflow_type=workflow_type,
|
|
276
|
-
is_notebook=is_notebook,
|
|
277
|
-
)
|
|
278
|
-
|
|
279
|
-
@staticmethod
|
|
280
|
-
def get_url_and_checksum(executable_path: Path) -> tuple[str, str]:
|
|
281
|
-
"""Determine the checksum for a specified executable"""
|
|
282
|
-
try:
|
|
283
|
-
subprocess.run(
|
|
284
|
-
"git rev-parse --is-inside-work-tree",
|
|
285
|
-
capture_output=True,
|
|
286
|
-
text=True,
|
|
287
|
-
shell=True,
|
|
288
|
-
check=True,
|
|
289
|
-
)
|
|
290
|
-
except subprocess.CalledProcessError:
|
|
291
|
-
raise DerivaMLException("Not executing in a Git repository.")
|
|
292
|
-
|
|
293
|
-
github_url, is_dirty = Workflow._github_url(executable_path)
|
|
294
|
-
|
|
295
|
-
if is_dirty:
|
|
296
|
-
logging.getLogger("deriva_ml").warning(
|
|
297
|
-
f"File {executable_path} has been modified since last commit. Consider commiting before executing"
|
|
298
|
-
)
|
|
299
|
-
|
|
300
|
-
# If you are in a notebook, strip out the outputs before computing the checksum.
|
|
301
|
-
cmd = (
|
|
302
|
-
f"nbstripout -t {executable_path} | git hash-object --stdin"
|
|
303
|
-
if "ipynb" == executable_path.suffix
|
|
304
|
-
else f"git hash-object {executable_path}"
|
|
305
|
-
)
|
|
306
|
-
checksum = (
|
|
307
|
-
subprocess.run(
|
|
308
|
-
cmd,
|
|
309
|
-
capture_output=True,
|
|
310
|
-
text=True,
|
|
311
|
-
check=False,
|
|
312
|
-
shell=True,
|
|
313
|
-
).stdout.strip()
|
|
314
|
-
if executable_path != "REPL"
|
|
315
|
-
else "1"
|
|
316
|
-
)
|
|
317
|
-
return github_url, checksum
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
class ExecutionConfiguration(BaseModel):
|
|
321
|
-
"""Define the parameters that are used to configure a specific execution.
|
|
322
|
-
|
|
323
|
-
Attributes:
|
|
324
|
-
datasets: List of dataset specifications which specify the dataset RID, version and if the dataset
|
|
325
|
-
should be materialized.
|
|
326
|
-
assets: List of assets to be downloaded prior to execution. The values must be RIDs in an asset table
|
|
327
|
-
parameters: Either a dictionary or a path to a JSON file that contains configuration parameters for the execution.
|
|
328
|
-
workflow: Either a Workflow object, or a RID for a workflow instance.
|
|
329
|
-
parameters: Either a dictionary or a path to a JSON file that contains configuration parameters for the execution.
|
|
330
|
-
description: A description of the execution. Can use Markdown format.
|
|
331
|
-
"""
|
|
332
|
-
|
|
333
|
-
datasets: conlist(DatasetSpec) = []
|
|
334
|
-
assets: list[RID | str] = [] # List of RIDs to model files.
|
|
335
|
-
workflow: RID | Workflow
|
|
336
|
-
parameters: dict[str, Any] | Path = {}
|
|
337
|
-
description: str = ""
|
|
338
|
-
argv: conlist(str) = Field(default_factory=lambda: sys.argv)
|
|
339
|
-
|
|
340
|
-
model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
341
|
-
|
|
342
|
-
@field_validator("parameters", mode="before")
|
|
343
|
-
@classmethod
|
|
344
|
-
def validate_parameters(cls, value: Any) -> Any:
|
|
345
|
-
"""If a parameter is a file, assume that it has JSON contents for configuration parameters"""
|
|
346
|
-
if isinstance(value, str) or isinstance(value, Path):
|
|
347
|
-
with open(value, "r") as f:
|
|
348
|
-
return json.load(f)
|
|
349
|
-
else:
|
|
350
|
-
return value
|
|
351
|
-
|
|
352
|
-
@staticmethod
|
|
353
|
-
def load_configuration(path: Path) -> ExecutionConfiguration:
|
|
354
|
-
"""Create a ExecutionConfiguration from a JSON configuration file.
|
|
355
|
-
|
|
356
|
-
Args:
|
|
357
|
-
path: File containing JSON version of execution configuration.
|
|
358
|
-
|
|
359
|
-
Returns:
|
|
360
|
-
An execution configuration whose values are loaded from the given file.
|
|
361
|
-
"""
|
|
362
|
-
with open(path) as fd:
|
|
363
|
-
config = json.load(fd)
|
|
364
|
-
return ExecutionConfiguration.model_validate(config)
|
|
365
|
-
|
|
366
|
-
# def download_execution_configuration(
|
|
367
|
-
# self, configuration_rid: RID
|
|
368
|
-
# ) -> ExecutionConfiguration:
|
|
369
|
-
# """Create an ExecutionConfiguration object from a catalog RID that points to a JSON representation of that
|
|
370
|
-
# configuration in hatrac
|
|
371
|
-
#
|
|
372
|
-
# Args:
|
|
373
|
-
# configuration_rid: RID that should be to an asset table that refers to an execution configuration
|
|
374
|
-
#
|
|
375
|
-
# Returns:
|
|
376
|
-
# A ExecutionConfiguration object for configured by the parameters in the configuration file.
|
|
377
|
-
# """
|
|
378
|
-
# AssertionError("Not Implemented")
|
|
379
|
-
# configuration = self.retrieve_rid(configuration_rid)
|
|
380
|
-
# with NamedTemporaryFile("w+", delete=False, suffix=".json") as dest_file:
|
|
381
|
-
# hs = HatracStore("https", self.host_name, self.credential)
|
|
382
|
-
# hs.get_obj(path=configuration["URL"], destfilename=dest_file.name)
|
|
383
|
-
# return ExecutionConfiguration.load_configuration(Path(dest_file.name))
|