deriva-ml 1.12.3__py3-none-any.whl → 1.13.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deriva_ml/__init__.py CHANGED
@@ -14,7 +14,8 @@ __all__ = [
14
14
  "BuiltinTypes",
15
15
  "UploadState",
16
16
  "MLVocab",
17
- "ExecMetadataVocab",
17
+ "MLAsset",
18
+ "ExecAssetType",
18
19
  "RID",
19
20
  "DerivaSystemColumns",
20
21
  "VersionPart",
@@ -32,7 +33,8 @@ from .deriva_definitions import (
32
33
  RID,
33
34
  DerivaMLException,
34
35
  MLVocab,
35
- ExecMetadataVocab,
36
+ MLAsset,
37
+ ExecAssetType,
36
38
  DerivaSystemColumns,
37
39
  )
38
40
  from .deriva_ml_base import DerivaML
@@ -49,4 +51,3 @@ try:
49
51
  except PackageNotFoundError:
50
52
  # package is not installed
51
53
  pass
52
-
@@ -192,7 +192,12 @@ class MLVocab(StrEnum):
192
192
  asset_role = "Asset_Role"
193
193
 
194
194
 
195
- class ExecMetadataVocab(StrEnum):
195
+ class MLAsset(StrEnum):
196
+ execution_metadata = "Execution_Metadata"
197
+ execution_asset = "Execution_Asset"
198
+
199
+
200
+ class ExecMetadataType(StrEnum):
196
201
  """
197
202
  Predefined execution metadata types.
198
203
  """
@@ -201,6 +206,16 @@ class ExecMetadataVocab(StrEnum):
201
206
  runtime_env = "Runtime_Env"
202
207
 
203
208
 
209
+ class ExecAssetType(StrEnum):
210
+ """
211
+ Predefined execution metadata types.
212
+ """
213
+
214
+ input_file = "Input_File"
215
+ output_file = "Output_File"
216
+ notebook_output = "Notebook_Output"
217
+
218
+
204
219
  class ColumnDefinition(BaseModel):
205
220
  """Pydantic model for deriva_py Column.define"""
206
221
 
@@ -14,17 +14,16 @@ import getpass
14
14
  import logging
15
15
  from datetime import datetime
16
16
  from itertools import chain
17
- import inspect
18
- import setuptools_scm
19
17
  from pathlib import Path
20
18
  import requests
21
- import subprocess
19
+
22
20
  from typing import Optional, Any, Iterable, TYPE_CHECKING
21
+
23
22
  from deriva.core import (
24
23
  get_credential,
25
24
  urlquote,
26
- DEFAULT_SESSION_CONFIG,
27
25
  format_exception,
26
+ DEFAULT_SESSION_CONFIG,
28
27
  )
29
28
  import deriva.core.datapath as datapath
30
29
  from deriva.core.datapath import DataPathException
@@ -33,7 +32,6 @@ from deriva.core.ermrest_catalog import ResolveRidResult
33
32
  from deriva.core.ermrest_model import Key, Table
34
33
  from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
35
34
  from pydantic import validate_call, ConfigDict
36
- from requests import RequestException
37
35
 
38
36
  from .execution_configuration import ExecutionConfiguration, Workflow
39
37
  from .feature import Feature, FeatureRecord
@@ -60,33 +58,6 @@ except ImportError: # Graceful fallback if IceCream isn't installed.
60
58
  ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a) # noqa
61
59
 
62
60
 
63
- try:
64
- from IPython import get_ipython
65
- except ImportError: # Graceful fallback if IPython isn't installed.
66
-
67
- def get_ipython():
68
- """Dummy routine in case you are not running in IPython."""
69
- return None
70
-
71
-
72
- try:
73
- from jupyter_server.serverapp import list_running_servers
74
- except ImportError:
75
-
76
- def list_running_servers():
77
- """Dummy routine in case you are not running in Jupyter."""
78
- return []
79
-
80
-
81
- try:
82
- from ipykernel import get_connection_file
83
- except ImportError:
84
-
85
- def get_connection_file():
86
- """Dummy routine in case you are not running in Jupyter."""
87
- return ""
88
-
89
-
90
61
  if TYPE_CHECKING:
91
62
  from .execution import Execution
92
63
 
@@ -165,7 +136,6 @@ class DerivaML(Dataset):
165
136
  self.version = model_version
166
137
  self.configuration = None
167
138
  self._execution: Optional[Execution] = None
168
- self.executable_path, self._is_notebook = self._get_python_script()
169
139
  self.domain_schema = self.model.domain_schema
170
140
  self.project_name = project_name or self.domain_schema
171
141
  self.start_time = datetime.now()
@@ -192,102 +162,6 @@ class DerivaML(Dataset):
192
162
  except (AttributeError, requests.HTTPError):
193
163
  pass
194
164
 
195
- def _check_nbstrip_status(self) -> None:
196
- """Check to see if nbstrip is installed"""
197
- try:
198
- if subprocess.run(
199
- ["nbstripout", "--is-installed"],
200
- check=False,
201
- capture_output=True,
202
- ).returncode:
203
- self._logger.warning(
204
- "nbstripout is not installed in repository. Please run nbstripout --install"
205
- )
206
- except subprocess.CalledProcessError:
207
- self._logger.error("nbstripout is not found.")
208
-
209
- @staticmethod
210
- def _get_notebook_session() -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
211
- """Return the absolute path of the current notebook."""
212
- # Get the kernel's connection file and extract the kernel ID
213
- try:
214
- if not (connection_file := Path(get_connection_file()).name):
215
- return None, None
216
- except RuntimeError:
217
- return None, None
218
-
219
- kernel_id = connection_file.split("-", 1)[1].split(".")[0]
220
-
221
- # Look through the running server sessions to find the matching kernel ID
222
- for server in list_running_servers():
223
- try:
224
- # If a token is required for authentication, include it in headers
225
- token = server.get("token", "")
226
- headers = {}
227
- if token:
228
- headers["Authorization"] = f"token {token}"
229
-
230
- try:
231
- sessions_url = server["url"] + "api/sessions"
232
- response = requests.get(sessions_url, headers=headers)
233
- response.raise_for_status()
234
- sessions = response.json()
235
- except RequestException as e:
236
- raise e
237
- for sess in sessions:
238
- if sess["kernel"]["id"] == kernel_id:
239
- return server, sess
240
- except Exception as _e:
241
- # Ignore servers we can't connect to.
242
- pass
243
- return None, None
244
-
245
- def _get_notebook_path(self) -> Path | None:
246
- """Return the absolute path of the current notebook."""
247
-
248
- server, session = DerivaML._get_notebook_session()
249
- if server and session:
250
- self._check_nbstrip_status()
251
- relative_path = session["notebook"]["path"]
252
- # Join the notebook directory with the relative path
253
- return Path(server["root_dir"]) / relative_path
254
- else:
255
- return None
256
-
257
- def _get_python_script(self) -> tuple[Path, bool]:
258
- """Return the path to the currently executing script"""
259
- is_notebook = False
260
- if filename := self._get_notebook_path():
261
- is_notebook = True
262
- else:
263
- stack = inspect.stack()
264
- # Get the caller's filename, which is two up the stack from here.
265
- if len(stack) > 1:
266
- filename = Path(stack[2].filename)
267
- if not filename.exists():
268
- # Begin called from command line interpreter.
269
- filename = "REPL"
270
- # Get the caller's filename, which is two up the stack from here.
271
- else:
272
- raise DerivaMLException(
273
- "Looking for caller failed"
274
- ) # Stack is too shallow
275
- return filename, is_notebook
276
-
277
- def _get_git_root(self):
278
- try:
279
- result = subprocess.run(
280
- ["git", "rev-parse", "--show-toplevel"],
281
- cwd=self.executable_path.parent,
282
- stdout=subprocess.PIPE,
283
- stderr=subprocess.DEVNULL,
284
- text=True,
285
- check=True,
286
- )
287
- return result.stdout.strip()
288
- except subprocess.CalledProcessError:
289
- return None # Not in a git repository
290
-
291
165
  @staticmethod
292
166
  def _get_session_config():
293
167
  """ """
@@ -311,10 +185,6 @@ class DerivaML(Dataset):
311
185
  """Get a new instance of a pathBuilder object."""
312
186
  return self.catalog.getPathBuilder()
313
187
 
314
- def get_version(self) -> str:
315
- """Return the version number of the executable"""
316
- return setuptools_scm.get_version(root=self._get_git_root())
317
-
318
188
  @property
319
189
  def domain_path(self):
320
190
  """Get a new instance of a pathBuilder object to the domain schema"""
@@ -1117,105 +987,7 @@ class DerivaML(Dataset):
1117
987
  # Make sure type is correct.
1118
988
  self.lookup_term(MLVocab.workflow_type, workflow_type)
1119
989
 
1120
- try:
1121
- subprocess.run(
1122
- "git rev-parse --is-inside-work-tree",
1123
- capture_output=True,
1124
- text=True,
1125
- shell=True,
1126
- check=True,
1127
- )
1128
- except subprocess.CalledProcessError:
1129
- raise DerivaMLException("Not executing in a Git repository.")
1130
-
1131
- github_url, is_dirty = self._github_url()
1132
-
1133
- if is_dirty:
1134
- self._logger.warning(
1135
- f"File {self.executable_path} has been modified since last commit. Consider commiting before executing"
1136
- )
1137
-
1138
- # If you are in a notebook, strip out the outputs before computing the checksum.
1139
- cmd = (
1140
- f"nbstripout {self.executable_path} | git hash-object --stdin"
1141
- if self._is_notebook
1142
- else f"git hash-object {self.executable_path}"
1143
- )
1144
- checksum = (
1145
- subprocess.run(
1146
- cmd,
1147
- capture_output=True,
1148
- text=True,
1149
- check=False,
1150
- shell=True,
1151
- ).stdout.strip()
1152
- if self.executable_path != "REPL"
1153
- else "1"
1154
- )
1155
-
1156
- return Workflow(
1157
- name=name,
1158
- url=github_url,
1159
- checksum=checksum,
1160
- description=description,
1161
- workflow_type=workflow_type,
1162
- )
1163
-
1164
- def _github_url(self) -> tuple[str, bool]:
1165
- """Return a GitHUB URL for the latest commit of the script from which this routine is called.
1166
-
1167
- This routine is used to be called from a script or notebook (e.g. python -m file). It assumes that
1168
- the file is in a gitHUB repository and commited. It returns a URL to the last commited version of this
1169
- file in GitHUB.
1170
-
1171
- Returns: A tuple with the gethub_url and a boolean to indicated if uncommited changes
1172
- have been made to the file.
1173
-
1174
- """
1175
-
1176
- # Get repo URL from local gitHub repo.
1177
- if self.executable_path == "REPL":
1178
- return "REPL", True
1179
- try:
1180
- result = subprocess.run(
1181
- ["git", "remote", "get-url", "origin"],
1182
- capture_output=True,
1183
- text=True,
1184
- cwd=self.executable_path.parent,
1185
- )
1186
- github_url = result.stdout.strip().removesuffix(".git")
1187
- except subprocess.CalledProcessError:
1188
- raise DerivaMLException("No GIT remote found")
1189
-
1190
- # Find the root directory for the repository
1191
- repo_root = self._get_git_root()
1192
-
1193
- # Now check to see if file has been modified since the last commit.
1194
- try:
1195
- result = subprocess.run(
1196
- ["git", "status", "--porcelain"],
1197
- cwd=self.executable_path.parent,
1198
- capture_output=True,
1199
- text=True,
1200
- check=True,
1201
- )
1202
- is_dirty = bool(
1203
- "M " in result.stdout.strip()
1204
- ) # Returns True if output indicates a modified file
1205
- except subprocess.CalledProcessError:
1206
- is_dirty = False # If Git command fails, assume no changes
1207
-
1208
- """Get SHA-1 hash of latest commit of the file in the repository"""
1209
- result = subprocess.run(
1210
- ["git", "log", "-n", "1", "--pretty=format:%H--", self.executable_path],
1211
- cwd=self.executable_path.parent,
1212
- capture_output=True,
1213
- text=True,
1214
- check=True,
1215
- )
1216
- sha = result.stdout.strip()
1217
- url = f"{github_url}/blob/{sha}/{self.executable_path.relative_to(repo_root)}"
1218
- return url, is_dirty
990
+ return Workflow.create_workflow(name, workflow_type, description)
1219
991
 
1220
992
  # @validate_call
1221
993
  def create_execution(
@@ -1259,6 +1031,15 @@ class DerivaML(Dataset):
1259
1031
  exec_rid=execution_rid,
1260
1032
  file_name="configuration.json",
1261
1033
  asset_table=self.model.name_to_table("Execution_Metadata"),
1034
+ metadata={},
1262
1035
  )
1263
- configuration = ExecutionConfiguration.load_configuration(cfile)
1036
+
1037
+ if cfile.exists():
1038
+ configuration = ExecutionConfiguration.load_configuration(cfile)
1039
+ else:
1040
+ execution = self.retrieve_rid(execution_rid)
1041
+ configuration = ExecutionConfiguration(
1042
+ workflow=execution["Workflow"],
1043
+ description=execution["Description"],
1044
+ )
1264
1045
  return Execution(configuration, self, reload=execution_rid)
deriva_ml/execution.py CHANGED
@@ -20,13 +20,15 @@ from typing import Iterable, Any, Optional
20
20
  from deriva.core import format_exception
21
21
  from deriva.core.datapath import DataPathException
22
22
  from deriva.core.hatrac_store import HatracStore
23
- from .deriva_definitions import ExecMetadataVocab
24
23
  from .deriva_definitions import (
25
24
  RID,
26
25
  Status,
27
26
  FileUploadState,
28
27
  DerivaMLException,
29
28
  MLVocab,
29
+ MLAsset,
30
+ ExecMetadataType,
31
+ ExecAssetType,
30
32
  DRY_RUN_RID,
31
33
  )
32
34
  from .deriva_ml_base import DerivaML, FeatureRecord
@@ -54,11 +56,14 @@ except ImportError: # Graceful fallback if IceCream isn't installed.
54
56
 
55
57
 
56
58
  try:
57
- from jupyter_server.serverapp import list_running_servers
59
+ from IPython.display import display, Markdown
58
60
  except ImportError:
59
61
 
60
- def list_running_servers():
61
- return []
62
+ def display(s):
63
+ print(s)
64
+
65
+ def Markdown(s):
66
+ return s
62
67
 
63
68
 
64
69
  class AssetFilePath(type(Path())):
@@ -218,15 +223,24 @@ class Execution:
218
223
  ]
219
224
  )[0]["RID"]
220
225
 
226
+ if (
227
+ isinstance(self.configuration.workflow, Workflow)
228
+ and self.configuration.workflow.is_notebook
229
+ ):
230
+ # Put execution_rid into cell output so we can find it later.
231
+ display(
232
+ Markdown(f"Execution RID: {self._ml_object.cite(self.execution_rid)}")
233
+ )
234
+
221
235
  # Create a directory for execution rid so we can recover state in case of a crash.
222
236
  execution_root(prefix=self._ml_object.working_dir, exec_rid=self.execution_rid)
223
237
  self._initialize_execution(reload)
224
238
 
225
239
  def _save_runtime_environment(self):
226
240
  runtime_env_path = self.asset_file_path(
227
- asset_name="Execution_Metadata",
228
- file_name=f"environment_snapshot_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
229
- asset_types=ExecMetadataVocab.runtime_env.value,
241
+ "Execution_Metadata",
242
+ f"environment_snapshot_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
243
+ ExecMetadataType.runtime_env.value,
230
244
  )
231
245
  with open(runtime_env_path, "w") as fp:
232
246
  json.dump(get_execution_environment(), fp)
@@ -281,16 +295,24 @@ class Execution:
281
295
  )
282
296
 
283
297
  # Save configuration details for later upload
284
- cfile = self.asset_file_path(
285
- asset_name="Execution_Metadata",
286
- file_name="configuration.json",
287
- asset_types=ExecMetadataVocab.execution_config.value,
288
- )
289
- with open(cfile.as_posix(), "w", encoding="utf-8") as config_file:
290
- json.dump(self.configuration.model_dump(), config_file)
298
+ if not reload:
299
+ cfile = self.asset_file_path(
300
+ MLAsset.execution_metadata,
301
+ "configuration.json",
302
+ ExecMetadataType.execution_config.value,
303
+ )
304
+ with open(cfile.as_posix(), "w", encoding="utf-8") as config_file:
305
+ json.dump(self.configuration.model_dump(), config_file)
306
+
307
+ for parameter_file in self.configuration.parameters:
308
+ self.asset_file_path(
309
+ MLAsset.execution_asset,
310
+ parameter_file,
311
+ ExecAssetType.input_file.value,
312
+ )
291
313
 
292
- # save runtime env
293
- self._save_runtime_environment()
314
+ # save runtime env
315
+ self._save_runtime_environment()
294
316
 
295
317
  self.start_time = datetime.now()
296
318
  self.update_status(Status.pending, "Initialize status finished.")
@@ -741,7 +763,7 @@ class Execution:
741
763
  def asset_file_path(
742
764
  self,
743
765
  asset_name: str,
744
- file_name: str,
766
+ file_name: str | Path,
745
767
  asset_types: Optional[list[str] | str] = None,
746
768
  copy_file=False,
747
769
  **kwargs,
@@ -4,16 +4,56 @@ Classes that are used to define an execution configuration.
4
4
 
5
5
  from __future__ import annotations
6
6
 
7
+ import inspect
7
8
  import json
9
+ import logging
10
+ import os
11
+
12
+ from requests import RequestException
13
+ import requests
14
+ import subprocess
8
15
  from typing import Optional, Any
9
16
 
10
- from pydantic import BaseModel, conlist, ConfigDict, field_validator, Field
17
+ from pydantic import (
18
+ BaseModel,
19
+ conlist,
20
+ ConfigDict,
21
+ field_validator,
22
+ Field,
23
+ PrivateAttr,
24
+ )
11
25
  from pathlib import Path
12
26
  import sys
13
27
 
14
28
 
15
29
  from .dataset_aux_classes import DatasetSpec
16
- from .deriva_definitions import RID
30
+ from .deriva_definitions import RID, DerivaMLException
31
+
32
+ try:
33
+ from IPython import get_ipython
34
+ except ImportError: # Graceful fallback if IPython isn't installed.
35
+
36
+ def get_ipython():
37
+ """Dummy routine in case you are not running in IPython."""
38
+ return None
39
+
40
+
41
+ try:
42
+ from jupyter_server.serverapp import list_running_servers
43
+ except ImportError:
44
+
45
+ def list_running_servers():
46
+ """Dummy routine in case you are not running in Jupyter."""
47
+ return []
48
+
49
+
50
+ try:
51
+ from ipykernel import get_connection_file
52
+ except ImportError:
53
+
54
+ def get_connection_file():
55
+ """Dummy routine in case you are not running in Jupyter."""
56
+ return ""
17
57
 
18
58
 
19
59
  class Workflow(BaseModel):
@@ -26,15 +66,255 @@ class Workflow(BaseModel):
26
66
  workflow_type: The type of the workflow. Must be an existing controlled vocabulary term.
27
67
  version: The version of the workflow instance. Should follow semantic versioning.
28
68
  description: A description of the workflow instance. Can be in Markdown format.
69
+ is_notebook: A boolean indicating whether this workflow instance is a notebook or not.
29
70
  """
30
71
 
31
72
  name: str
32
73
  url: str
33
74
  workflow_type: str
34
75
  version: Optional[str] = None
35
- description: Optional[str] = ""
76
+ description: str = None
36
77
  rid: Optional[RID] = None
37
- checksum: Optional[str]
78
+ checksum: Optional[str] = None
79
+ is_notebook: bool = False
80
+
81
+ _logger: Any = PrivateAttr()
82
+
83
def model_post_init(self, _context: Any) -> None:
    """Attach the package logger once pydantic has finished building the model.

    ``__post_init__`` is a dataclass hook; pydantic's ``BaseModel`` does not
    invoke it, which left the ``_logger`` private attribute unset.
    ``model_post_init`` is the hook pydantic calls after validation.

    Args:
        _context: Validation context supplied by pydantic (unused).
    """
    self._logger = logging.getLogger("deriva_ml")

def __post_init__(self) -> None:
    """Backward-compatible entry point; delegates to ``model_post_init``."""
    self.model_post_init(None)
85
+
86
+ @staticmethod
87
+ def _check_nbstrip_status() -> None:
88
+ """Check to see if nbstrip is installed"""
89
+ logger = logging.getLogger("deriva_ml")
90
+ try:
91
+ if subprocess.run(
92
+ ["nbstripout", "--is-installed"],
93
+ check=False,
94
+ capture_output=True,
95
+ ).returncode:
96
+ logger.warning(
97
+ "nbstripout is not installed in repository. Please run nbstripout --install"
98
+ )
99
+ except subprocess.CalledProcessError:
100
+ logger.error("nbstripout is not found.")
101
+
102
@staticmethod
def _get_notebook_path() -> Path | None:
    """Locate the notebook file behind the current Jupyter kernel, if there is one.

    Returns:
        The server's ``root_dir`` joined with the session's notebook path, or
        None when no matching server/session pair was found.
    """
    server, session = Workflow._get_notebook_session()
    if not (server and session):
        return None
    # The session's notebook path is joined onto the server's root directory.
    notebook_relative = session["notebook"]["path"]
    return Path(server["root_dir"]) / notebook_relative
113
+
114
@staticmethod
def _get_notebook_session() -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
    """Find the Jupyter server and session that own the current kernel.

    The kernel id is taken from the kernel connection file name, then each
    running server's ``api/sessions`` endpoint is queried for a session whose
    kernel id matches.

    Returns:
        ``(server, session)`` for the first match, or ``(None, None)`` when
        there is no connection file, its name has an unexpected shape, or no
        reachable server reports the kernel.
    """
    # Get the kernel's connection file and extract the kernel ID
    try:
        connection_file = Path(get_connection_file()).name
    except RuntimeError:
        return None, None
    if not connection_file:
        return None, None

    # The id is whatever sits between the first "-" and the first following ".".
    # A name without "-" used to raise IndexError; treat it as "no kernel".
    _, dash, remainder = connection_file.partition("-")
    if not dash:
        return None, None
    kernel_id = remainder.split(".")[0]

    # Look through the running server sessions to find the matching kernel ID
    for server in list_running_servers():
        try:
            # If a token is required for authentication, include it in headers
            token = server.get("token", "")
            headers = {"Authorization": f"token {token}"} if token else {}

            # Bound the request so an unresponsive server cannot hang the caller.
            response = requests.get(
                server["url"] + "api/sessions", headers=headers, timeout=10
            )
            response.raise_for_status()
            for sess in response.json():
                if sess["kernel"]["id"] == kernel_id:
                    return server, sess
        except Exception as exc:  # best effort: ignore servers we can't use
            logging.getLogger("deriva_ml").debug(
                "Skipping Jupyter server while looking for kernel: %s", exc
            )
    return None, None
149
+
150
@staticmethod
def _get_python_script() -> tuple[Path, bool]:
    """Return the path of the currently executing notebook or script.

    Returns:
        ``(path, is_notebook)``. When a notebook session is found its path is
        returned with ``True``. Otherwise the file name of the frame two levels
        up the call stack is returned with ``False``; if that file does not
        exist the sentinel ``Path("REPL")`` is returned instead.

    Raises:
        DerivaMLException: If the call stack is too shallow to have a frame
            two levels up.
    """
    if notebook_path := Workflow._get_notebook_path():
        return notebook_path, True

    stack = inspect.stack()
    # stack[2] is read below, so three frames are needed. The previous
    # ``len(stack) > 1`` check let a two-frame stack through to an IndexError.
    if len(stack) <= 2:
        raise DerivaMLException("Looking for caller failed")  # Stack is too shallow

    # Get the caller's filename, which is two up the stack from here.
    # NOTE(review): which caller index 2 denotes depends on how this helper is
    # reached — confirm it still points at the user's script.
    filename = Path(stack[2].filename)
    if not filename.exists():
        # Being called from command line interpreter.
        filename = Path("REPL")
    return filename, False
169
+
170
+ @staticmethod
171
+ def _github_url(executable_path: Path) -> tuple[str, bool]:
172
+ """Return a GitHUB URL for the latest commit of the script from which this routine is called.
173
+
174
+ This routine is used to be called from a script or notebook (e.g. python -m file). It assumes that
175
+ the file is in a gitHUB repository and commited. It returns a URL to the last commited version of this
176
+ file in GitHUB.
177
+
178
+ Returns: A tuple with the gethub_url and a boolean to indicated if uncommited changes
179
+ have been made to the file.
180
+
181
+ """
182
+
183
+ # Get repo URL from local gitHub repo.
184
+ if executable_path == "REPL":
185
+ return "REPL", True
186
+ try:
187
+ result = subprocess.run(
188
+ ["git", "remote", "get-url", "origin"],
189
+ capture_output=True,
190
+ text=True,
191
+ cwd=executable_path.parent,
192
+ )
193
+ github_url = result.stdout.strip().removesuffix(".git")
194
+ except subprocess.CalledProcessError:
195
+ raise DerivaMLException("No GIT remote found")
196
+
197
+ # Find the root directory for the repository
198
+ repo_root = Workflow._get_git_root(executable_path)
199
+
200
+ # Now check to see if file has been modified since the last commit.
201
+ try:
202
+ result = subprocess.run(
203
+ ["git", "status", "--porcelain"],
204
+ cwd=executable_path.parent,
205
+ capture_output=True,
206
+ text=True,
207
+ check=True,
208
+ )
209
+ is_dirty = bool(
210
+ "M " in result.stdout.strip()
211
+ ) # Returns True if output indicates a modified file
212
+ except subprocess.CalledProcessError:
213
+ is_dirty = False # If Git command fails, assume no changes
214
+
215
+ """Get SHA-1 hash of latest commit of the file in the repository"""
216
+ result = subprocess.run(
217
+ ["git", "log", "-n", "1", "--pretty=format:%H--", executable_path],
218
+ cwd=executable_path.parent,
219
+ capture_output=True,
220
+ text=True,
221
+ check=True,
222
+ )
223
+ sha = result.stdout.strip()
224
+ url = f"{github_url}/blob/{sha}/{executable_path.relative_to(repo_root)}"
225
+ return url, is_dirty
226
+
227
+ @staticmethod
228
+ def _get_git_root(executable_path: Path):
229
+ try:
230
+ result = subprocess.run(
231
+ ["git", "rev-parse", "--show-toplevel"],
232
+ cwd=executable_path.parent,
233
+ stdout=subprocess.PIPE,
234
+ stderr=subprocess.DEVNULL,
235
+ text=True,
236
+ check=True,
237
+ )
238
+ return result.stdout.strip()
239
+ except subprocess.CalledProcessError:
240
+ return None # Not in a git repository
241
+
242
@staticmethod
def create_workflow(
    name: str,
    workflow_type: str,
    description: str = "",
) -> "Workflow":
    """Identify the currently executing program and return a Workflow describing it.

    When the ``DERIVA_ML_WORKFLOW_URL`` environment variable is set, the URL
    and checksum are taken from the environment and the workflow is marked as
    a notebook. Otherwise the running notebook or script is located and its
    URL and checksum are computed from the local git repository.

    Args:
        name: The name of the workflow.
        workflow_type: The type of the workflow.
        description: The description of the workflow.

    Returns:
        A new ``Workflow`` instance (not a catalog RID).
    """
    # Check to see if execution file info is being passed in by calling program.
    if "DERIVA_ML_WORKFLOW_URL" in os.environ:
        github_url = os.environ["DERIVA_ML_WORKFLOW_URL"]
        # The checksum field is optional on the model, so a missing variable
        # yields None instead of a KeyError.
        checksum = os.environ.get("DERIVA_ML_WORKFLOW_CHECKSUM")
        is_notebook = True
    else:
        path, is_notebook = Workflow._get_python_script()
        github_url, checksum = Workflow.get_url_and_checksum(path)

    return Workflow(
        name=name,
        url=github_url,
        checksum=checksum,
        description=description,
        workflow_type=workflow_type,
        is_notebook=is_notebook,
    )
278
+
279
@staticmethod
def get_url_and_checksum(executable_path: Path) -> tuple[str, str]:
    """Determine the GitHub URL and checksum for a specified executable.

    Args:
        executable_path: Path of the script or notebook, or the ``"REPL"``
            sentinel.

    Returns:
        ``(github_url, checksum)``. The checksum is the ``git hash-object``
        value of the file. For ``.ipynb`` files the notebook is first passed
        through ``nbstripout -t`` so cell outputs do not affect the hash. For
        the REPL sentinel the checksum is ``"1"``.

    Raises:
        DerivaMLException: If the current directory is not inside a git work
            tree (or git cannot be run).
    """
    executable_path = Path(executable_path)
    logger = logging.getLogger("deriva_ml")

    try:
        subprocess.run(
            ["git", "rev-parse", "--is-inside-work-tree"],
            capture_output=True,
            text=True,
            check=True,
        )
    except (subprocess.CalledProcessError, FileNotFoundError) as e:
        raise DerivaMLException("Not executing in a Git repository.") from e

    github_url, is_dirty = Workflow._github_url(executable_path)

    if is_dirty:
        logger.warning(
            f"File {executable_path} has been modified since last commit. Consider commiting before executing"
        )

    # Compare Path to Path: a Path never equals the plain string "REPL".
    if executable_path == Path("REPL"):
        return github_url, "1"

    # Argument lists (no shell) keep paths with spaces or shell metacharacters
    # intact. Bytes are piped unmodified between the two commands.
    stripped = None
    # Path.suffix includes the leading dot, so compare against ".ipynb".
    if executable_path.suffix == ".ipynb":
        # If you are in a notebook, strip out the outputs before computing the checksum.
        try:
            stripped = subprocess.run(
                ["nbstripout", "-t", str(executable_path)],
                capture_output=True,
                check=False,
            ).stdout
        except FileNotFoundError:
            # Best effort: fall back to hashing the notebook as stored.
            logger.warning("nbstripout is not found.")

    if stripped is not None:
        hashed = subprocess.run(
            ["git", "hash-object", "--stdin"],
            input=stripped,
            capture_output=True,
            check=False,
        )
    else:
        hashed = subprocess.run(
            ["git", "hash-object", str(executable_path)],
            capture_output=True,
            check=False,
        )
    return github_url, hashed.stdout.decode().strip()
38
318
 
39
319
 
40
320
  class ExecutionConfiguration(BaseModel):
@@ -52,7 +332,7 @@ class ExecutionConfiguration(BaseModel):
52
332
  datasets: conlist(DatasetSpec) = []
53
333
  assets: list[RID | str] = [] # List of RIDs to model files.
54
334
  workflow: RID | Workflow
55
- parameters: dict[str, Any] = {}
335
+ parameters: dict[str, Any] | Path = {}
56
336
  description: str = ""
57
337
  argv: conlist(str) = Field(default_factory=lambda: sys.argv)
58
338
 
@@ -61,7 +341,7 @@ class ExecutionConfiguration(BaseModel):
61
341
  @field_validator("parameters", mode="before")
62
342
  @classmethod
63
343
  def validate_parameters(cls, value: Any) -> Any:
64
- """If parameter is a file, assume that it has JSON contents for configuration parameters"""
344
+ """If a parameter is a file, assume that it has JSON contents for configuration parameters"""
65
345
  if isinstance(value, str) or isinstance(value, Path):
66
346
  with open(value, "r") as f:
67
347
  return json.load(f)
@@ -0,0 +1,179 @@
1
+ """Module to run a notebook using papermill"""
2
+
3
+ from datetime import datetime
4
+ import json
5
+ import os
6
+ import papermill as pm
7
+ from pathlib import Path
8
+ import regex as re
9
+ import tempfile
10
+
11
+ from deriva_ml import Workflow, DerivaML
12
+ from deriva.core import BaseCLI
13
+ from deriva_ml import MLAsset, ExecAssetType
14
+
15
+
16
class DerivaMLRunNotebookCLI(BaseCLI):
    """Command-line front end that parses arguments and runs a notebook with papermill."""

    def __init__(self, description, epilog, **kwargs):
        """Register the notebook-runner arguments on top of the BaseCLI parser.

        Args:
            description: Program description forwarded to ``BaseCLI``.
            epilog: Help epilog forwarded to ``BaseCLI``.
            **kwargs: Extra keyword arguments forwarded to ``BaseCLI``.
        """
        BaseCLI.__init__(self, description, epilog, **kwargs)
        Workflow._check_nbstrip_status()
        self.parser.add_argument(
            "notebook_file", type=Path, help="Path to the notebook file"
        )

        self.parser.add_argument(
            "--file",
            "-f",
            type=Path,
            default=None,
            help="JSON file with parameter values to inject into the notebook.",
        )

        self.parser.add_argument(
            "--inspect",
            action="store_true",
            help="Display parameters information for the given notebook path.",
        )

        # NOTE(review): store_false makes this flag default to True and the
        # value is not read anywhere in this class — confirm intent.
        self.parser.add_argument(
            "--log-output",
            action="store_false",
            help="Display logging output from notebook.",
        )

        self.parser.add_argument(
            "--catalog",
            metavar="<1>",
            default=1,
            help="Catalog number. Default 1",
        )

        self.parser.add_argument(
            "--parameter",
            "-p",
            nargs=2,
            action="append",
            metavar=("KEY", "VALUE"),
            default=[],
            help="Provide a parameter name and value to inject into the notebook.",
        )

        self.parser.add_argument(
            "--kernel", "-k", nargs=1, help="Name of kernel to run..", default=None
        )

    @staticmethod
    def _coerce_number(val: str):
        """
        Try to convert a string to int, then float; otherwise return str.
        """
        try:
            return int(val)
        except ValueError:
            try:
                return float(val)
            except ValueError:
                return val

    def main(self):
        """Parse arguments, validate the input files, and run (or inspect) the notebook."""
        args = self.parse_cli()
        notebook_file = args.notebook_file
        parameter_file = args.file

        # args.parameter is a list of [KEY, VALUE] lists
        # e.g. [['timeout', '30'], ['name', 'Alice'], ...]
        parameters = {key: self._coerce_number(val) for key, val in args.parameter}

        if parameter_file:
            if not (parameter_file.is_file() and parameter_file.suffix == ".json"):
                print("Parameter file must be an json file.")
                # SystemExit is raised directly; the ``exit`` helper is only
                # injected by the site module and is not always available.
                raise SystemExit(1)
            with open(parameter_file, "r") as f:
                parameters |= json.load(f)

        if not (notebook_file.is_file() and notebook_file.suffix == ".ipynb"):
            print("Notebook file must be an ipynb file.")
            raise SystemExit(1)

        os.environ["DERIVA_HOST"] = args.host
        # Environment values must be strings; the argparse default is the int 1.
        os.environ["DERIVA_CATALOG_ID"] = str(args.catalog)

        # nargs=1 makes argparse deliver a one-element list; unwrap it so the
        # kernel name is handed on as a plain string.
        kernel = args.kernel[0] if args.kernel else None

        notebook_parameters = pm.inspect_notebook(notebook_file)
        if args.inspect:
            for param, value in notebook_parameters.items():
                print(
                    f"{param}:{value['inferred_type_name']} (default {value['default']})"
                )
            return

        notebook_parameters = (
            {"host": args.host, "catalog": args.catalog}
            | {k: v["default"] for k, v in notebook_parameters.items()}
            | parameters
        )
        print(f"Running notebook {notebook_file.name} with parameters:")
        for param, value in notebook_parameters.items():
            print(f"  {param}:{value}")
        # NOTE(review): only the user-supplied parameters are injected; the
        # merged dict printed above (with host/catalog) is display-only.
        self.run_notebook(notebook_file.resolve(), parameters, kernel)

    def run_notebook(self, notebook_file, parameters, kernel=None):
        """Execute the notebook and upload its output under the execution it created.

        The workflow URL and checksum are exported through environment
        variables, the notebook is executed into a temporary directory, and
        the executed notebook is scanned for an ``Execution RID: <url>`` line.
        The executed notebook and a JSON dump of the injected parameters are
        then registered as execution assets and uploaded.

        Args:
            notebook_file: Path of the notebook to execute.
            parameters: Mapping of parameter names to values to inject.
            kernel: Name of the kernel to run the notebook with, or None.
        """
        url, checksum = Workflow.get_url_and_checksum(Path(notebook_file))
        os.environ["DERIVA_ML_WORKFLOW_URL"] = url
        os.environ["DERIVA_ML_WORKFLOW_CHECKSUM"] = checksum

        with tempfile.TemporaryDirectory() as tmpdirname:
            notebook_output = Path(tmpdirname) / Path(notebook_file).name
            pm.execute_notebook(
                input_path=notebook_file,
                output_path=notebook_output,
                parameters=parameters,
                kernel_name=kernel,
            )
            # Initialise the names that are actually assigned by the scan below.
            hostname = catalog_id = execution_rid = None
            with open(notebook_output, "r") as f:
                for line in f:
                    if m := re.search(
                        r"Execution RID: https://(?P<host>.*)/id/(?P<catalog_id>.*)/(?P<execution_rid>[\w-]+)",
                        line,
                    ):
                        hostname = m["host"]
                        catalog_id = m["catalog_id"]
                        execution_rid = m["execution_rid"]
            if not execution_rid:
                print("Execution RID not found.")
                raise SystemExit(1)
            print("Uploaded notebook output for Execution RID:", execution_rid)

            ml_instance = DerivaML(hostname=hostname, catalog_id=catalog_id)

            execution = ml_instance.restore_execution(execution_rid)
            execution.asset_file_path(
                asset_name=MLAsset.execution_asset,
                file_name=notebook_output,
                asset_types=ExecAssetType.notebook_output.value,
            )
            parameter_file = execution.asset_file_path(
                asset_name=MLAsset.execution_asset,
                file_name=f"notebook-parameters-{datetime.now().strftime('%Y%m%d-%H%M%S')}.json",
                asset_types=ExecAssetType.input_file.value,
            )
            with open(parameter_file, "w") as f:
                json.dump(parameters, f)

            execution.upload_execution_outputs()
            print(ml_instance.cite(execution_rid))
169
+
170
+
171
def main():
    """Console entry point: build the notebook-runner CLI and hand control to it."""
    runner = DerivaMLRunNotebookCLI(
        description="Deriva ML Execution Script Demo",
        epilog="",
    )
    runner.main()
176
+
177
+
178
+ if __name__ == "__main__":
179
+ main()
@@ -0,0 +1 @@
1
+ {"local-file": "My local file.txt"}
@@ -0,0 +1,5 @@
1
+ {
2
+ "assets": ["2-7J8M"],
3
+ "datasets": ["2-7K8W"],
4
+ "parameters": "test-files/execution-parameters.json"
5
+ }
@@ -0,0 +1,197 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "id": "0",
6
+ "metadata": {
7
+ "ExecuteTime": {
8
+ "end_time": "2025-04-18T22:52:49.930351Z",
9
+ "start_time": "2025-04-18T22:52:48.926842Z"
10
+ }
11
+ },
12
+ "source": [
13
+ "import builtins\n",
14
+ "import os\n",
15
+ "\n",
16
+ "from deriva.core.utils.globus_auth_utils import GlobusNativeLogin\n",
17
+ "from deriva_ml import ExecutionConfiguration, MLVocab, DerivaML, DatasetSpec"
18
+ ],
19
+ "outputs": [],
20
+ "execution_count": 1
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "id": "1",
25
+ "metadata": {
26
+ "tags": [
27
+ "parameters"
28
+ ],
29
+ "ExecuteTime": {
30
+ "end_time": "2025-04-18T22:52:49.988873Z",
31
+ "start_time": "2025-04-18T22:52:49.986713Z"
32
+ }
33
+ },
34
+ "source": [
35
+ "foo: int = 1\n",
36
+ "assets = []\n",
37
+ "datasets = []\n",
38
+ "parameters = None"
39
+ ],
40
+ "outputs": [],
41
+ "execution_count": 2
42
+ },
43
+ {
44
+ "metadata": {
45
+ "ExecuteTime": {
46
+ "end_time": "2025-04-18T22:52:50.002808Z",
47
+ "start_time": "2025-04-18T22:52:49.999450Z"
48
+ }
49
+ },
50
+ "cell_type": "code",
51
+ "source": [
52
+ "print(\"foo\", foo)\n",
53
+ "print(\"assets\", assets)\n",
54
+ "print(\"datasets\", datasets)\n",
55
+ "print(\"parameters\", parameters)"
56
+ ],
57
+ "id": "70b23cdd933ce669",
58
+ "outputs": [
59
+ {
60
+ "name": "stdout",
61
+ "output_type": "stream",
62
+ "text": [
63
+ "foo 1\n",
64
+ "assets []\n",
65
+ "datasets []\n",
66
+ "parameters None\n"
67
+ ]
68
+ }
69
+ ],
70
+ "execution_count": 3
71
+ },
72
+ {
73
+ "metadata": {
74
+ "ExecuteTime": {
75
+ "end_time": "2025-04-18T22:52:50.344660Z",
76
+ "start_time": "2025-04-18T22:52:50.013816Z"
77
+ }
78
+ },
79
+ "cell_type": "code",
80
+ "source": [
81
+ "hostname = os.environ.get(\"DERIVA_HOST\") #or \"dev.eye-ai.org\"\n",
82
+ "catalog_id = os.environ.get(\"DERIVA_CATALOG_ID\") #or 'eye-ai'\n",
83
+ "\n",
84
+ "gnl = GlobusNativeLogin(host=hostname)\n",
85
+ "if gnl.is_logged_in([hostname]):\n",
86
+ " print(\"You are already logged in.\")\n",
87
+ "else:\n",
88
+ " gnl.login([hostname], no_local_server=True, no_browser=True, refresh_tokens=True, update_bdbag_keychain=True)\n",
89
+ " print(\"Login Successful\")\n"
90
+ ],
91
+ "id": "2",
92
+ "outputs": [
93
+ {
94
+ "ename": "AttributeError",
95
+ "evalue": "'NoneType' object has no attribute 'lower'",
96
+ "output_type": "error",
97
+ "traceback": [
98
+ "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
99
+ "\u001B[0;31mAttributeError\u001B[0m Traceback (most recent call last)",
100
+ "Cell \u001B[0;32mIn[4], line 5\u001B[0m\n\u001B[1;32m 2\u001B[0m catalog_id \u001B[38;5;241m=\u001B[39m os\u001B[38;5;241m.\u001B[39menviron\u001B[38;5;241m.\u001B[39mget(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mDERIVA_CATALOG_ID\u001B[39m\u001B[38;5;124m\"\u001B[39m) \u001B[38;5;66;03m#or 'eye-ai'\u001B[39;00m\n\u001B[1;32m 4\u001B[0m gnl \u001B[38;5;241m=\u001B[39m GlobusNativeLogin(host\u001B[38;5;241m=\u001B[39mhostname)\n\u001B[0;32m----> 5\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[43mgnl\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mis_logged_in\u001B[49m\u001B[43m(\u001B[49m\u001B[43m[\u001B[49m\u001B[43mhostname\u001B[49m\u001B[43m]\u001B[49m\u001B[43m)\u001B[49m:\n\u001B[1;32m 6\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mYou are already logged in.\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m 7\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n",
101
+ "File \u001B[0;32m~/opt/anaconda3/envs/deriva-test/lib/python3.10/site-packages/deriva/core/utils/globus_auth_utils.py:582\u001B[0m, in \u001B[0;36mGlobusNativeLogin.is_logged_in\u001B[0;34m(self, hosts, requested_scopes, hosts_to_scope_map, exclude_defaults)\u001B[0m\n\u001B[1;32m 576\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[38;5;21mis_logged_in\u001B[39m(\u001B[38;5;28mself\u001B[39m,\n\u001B[1;32m 577\u001B[0m hosts\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mNone\u001B[39;00m,\n\u001B[1;32m 578\u001B[0m requested_scopes\u001B[38;5;241m=\u001B[39m(),\n\u001B[1;32m 579\u001B[0m hosts_to_scope_map\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mNone\u001B[39;00m,\n\u001B[1;32m 580\u001B[0m exclude_defaults\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mFalse\u001B[39;00m):\n\u001B[1;32m 581\u001B[0m scopes \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mset\u001B[39m(requested_scopes)\n\u001B[0;32m--> 582\u001B[0m scope_map \u001B[38;5;241m=\u001B[39m hosts_to_scope_map \u001B[38;5;28;01mif\u001B[39;00m hosts_to_scope_map \u001B[38;5;28;01melse\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mhosts_to_scope_map\u001B[49m\u001B[43m(\u001B[49m\u001B[43mhosts\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;129;43;01mor\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mhosts\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 583\u001B[0m scopes\u001B[38;5;241m.\u001B[39mupdate(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mscope_set_from_scope_map(scope_map))\n\u001B[1;32m 584\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m exclude_defaults:\n",
102
+ "File \u001B[0;32m~/opt/anaconda3/envs/deriva-test/lib/python3.10/site-packages/deriva/core/utils/globus_auth_utils.py:607\u001B[0m, in \u001B[0;36mGlobusNativeLogin.hosts_to_scope_map\u001B[0;34m(self, hosts, match_scope_tag, all_tagged_scopes, force_refresh, warn_on_discovery_failure)\u001B[0m\n\u001B[1;32m 605\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m host \u001B[38;5;129;01min\u001B[39;00m hosts:\n\u001B[1;32m 606\u001B[0m scope_map\u001B[38;5;241m.\u001B[39mupdate({host: []})\n\u001B[0;32m--> 607\u001B[0m scopes \u001B[38;5;241m=\u001B[39m \u001B[43mget_oauth_scopes_for_host\u001B[49m\u001B[43m(\u001B[49m\u001B[43mhost\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 608\u001B[0m \u001B[43m \u001B[49m\u001B[43mconfig_file\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mconfig_file\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 609\u001B[0m \u001B[43m \u001B[49m\u001B[43mforce_refresh\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mforce_refresh\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 610\u001B[0m \u001B[43m \u001B[49m\u001B[43mwarn_on_discovery_failure\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mwarn_on_discovery_failure\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 611\u001B[0m scope_list \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mlist\u001B[39m()\n\u001B[1;32m 612\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m scopes:\n",
103
+ "File \u001B[0;32m~/opt/anaconda3/envs/deriva-test/lib/python3.10/site-packages/deriva/core/utils/core_utils.py:300\u001B[0m, in \u001B[0;36mget_oauth_scopes_for_host\u001B[0;34m(host, config_file, force_refresh, warn_on_discovery_failure)\u001B[0m\n\u001B[1;32m 298\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m required_scopes:\n\u001B[1;32m 299\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m hostname, scopes \u001B[38;5;129;01min\u001B[39;00m required_scopes\u001B[38;5;241m.\u001B[39mitems():\n\u001B[0;32m--> 300\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[43mhost\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mlower\u001B[49m() \u001B[38;5;241m==\u001B[39m hostname\u001B[38;5;241m.\u001B[39mlower():\n\u001B[1;32m 301\u001B[0m result \u001B[38;5;241m=\u001B[39m scopes\n\u001B[1;32m 302\u001B[0m \u001B[38;5;28;01mbreak\u001B[39;00m\n",
104
+ "\u001B[0;31mAttributeError\u001B[0m: 'NoneType' object has no attribute 'lower'"
105
+ ]
106
+ }
107
+ ],
108
+ "execution_count": 4
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "id": "3",
113
+ "metadata": {},
114
+ "source": [
115
+ "ml_instance = DerivaML(hostname, catalog_id)\n",
116
+ "\n",
117
+ "ml_instance.add_term(MLVocab.workflow_type, \"Manual Workflow\", description=\"Initial setup of Model File\")\n",
118
+ "ml_instance.add_term(MLVocab.asset_type, \"API_Model\", description=\"Model for our API workflow\")"
119
+ ],
120
+ "outputs": [],
121
+ "execution_count": null
122
+ },
123
+ {
124
+ "metadata": {},
125
+ "cell_type": "code",
126
+ "source": [
127
+ "api_workflow = ml_instance.create_workflow(\n",
128
+ " name=\"Manual Workflow\",\n",
129
+ " workflow_type=\"Manual Workflow\",\n",
130
+ " description=\"A manual operation\"\n",
131
+ ")"
132
+ ],
133
+ "id": "5",
134
+ "outputs": [],
135
+ "execution_count": null
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "id": "6",
140
+ "metadata": {},
141
+ "source": [
142
+ "manual_execution = ml_instance.create_execution(\n",
143
+ " ExecutionConfiguration(\n",
144
+ " description=\"Sample Execution\",\n",
145
+ " workflow=api_workflow,\n",
146
+ " datasets=[DatasetSpec(rid=ds, version=ml_instance.dataset_version(ds)) for ds in datasets],\n",
147
+ " assets=assets,\n",
148
+ " parameters=parameters\n",
149
+ " )\n",
150
+ ")"
151
+ ],
152
+ "outputs": [],
153
+ "execution_count": null
154
+ },
155
+ {
156
+ "metadata": {},
157
+ "cell_type": "code",
158
+ "source": [
159
+ "print(f'parameters: {manual_execution.parameters}')\n",
160
+ "print(f'datasets: {manual_execution.datasets}')\n",
161
+ "print(f'assets: {manual_execution.asset_paths}')"
162
+ ],
163
+ "id": "4b2a3b8c16333645",
164
+ "outputs": [],
165
+ "execution_count": null
166
+ },
167
+ {
168
+ "metadata": {},
169
+ "cell_type": "code",
170
+ "source": "manual_execution.upload_execution_outputs()",
171
+ "id": "efa8cb1b0ed438bb",
172
+ "outputs": [],
173
+ "execution_count": null
174
+ }
175
+ ],
176
+ "metadata": {
177
+ "kernelspec": {
178
+ "display_name": "deriva-test",
179
+ "language": "python",
180
+ "name": "deriva-test"
181
+ },
182
+ "language_info": {
183
+ "codemirror_mode": {
184
+ "name": "ipython",
185
+ "version": 2
186
+ },
187
+ "file_extension": ".py",
188
+ "mimetype": "text/x-python",
189
+ "name": "python",
190
+ "nbconvert_exporter": "python",
191
+ "pygments_lexer": "ipython2",
192
+ "version": "2.7.6"
193
+ }
194
+ },
195
+ "nbformat": 4,
196
+ "nbformat_minor": 5
197
+ }
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deriva-ml
3
- Version: 1.12.3
3
+ Version: 1.13.1
4
4
  Summary: Utilities to simplify use of Deriva and Pandas to create reproducible ML pipelines
5
5
  Author-email: ISRD <isrd-dev@isi.edu>
6
6
  Requires-Python: >=3.10
@@ -15,6 +15,7 @@ Requires-Dist: semver>3.0.0
15
15
  Requires-Dist: setuptools>=64
16
16
  Requires-Dist: setuptools-scm<=6.0
17
17
  Requires-Dist: nbstripout
18
+ Requires-Dist: papermill
18
19
  Dynamic: license-file
19
20
 
20
21
  # DerivaML
@@ -1,27 +1,31 @@
1
- deriva_ml/__init__.py,sha256=r1Z9N5vtZkAET7emqhpAx2bf_xJUp5wHOc4_DIplsG8,1082
1
+ deriva_ml/__init__.py,sha256=GfneBq7xDphMqUQY96sW9ixRj74M3UTUCmD4KMIRSaM,1101
2
2
  deriva_ml/database_model.py,sha256=lMbAEqn4n0m7h_JstMX_LX9gbvBIEydG3sRilPn3eLU,14885
3
3
  deriva_ml/dataset.py,sha256=OyWUKWnYeP0ctimSBQ4em-uJrzCNOohx4GPT2uIl6R4,60649
4
4
  deriva_ml/dataset_aux_classes.py,sha256=YxjQnu2kS9kK_f8bGqhmgE6ty9GNeitCxfvReT9vaM0,6537
5
5
  deriva_ml/dataset_bag.py,sha256=yS8oYVshfFtRDyhGPRqtbvxjyd3ZFF29lrB783OP4vM,11849
6
6
  deriva_ml/demo_catalog.py,sha256=9Qo3JD4bUIwnL3ngPctc2QBeWApvMR_5UyaK9ockTrY,11536
7
- deriva_ml/deriva_definitions.py,sha256=HLaQ0zWO-Yd17Yp8hvqFSGkvjANJ52Ws5yHCVYMhfGA,8918
8
- deriva_ml/deriva_ml_base.py,sha256=rrImShp1RXvMuXVLft5GfTnxf_PfF1LONHgV1Ee_E9I,46517
7
+ deriva_ml/deriva_definitions.py,sha256=avdOgxtB60yb8XsWm-AYtCdvg2QkQbyfkZuA9xx9t2U,9221
8
+ deriva_ml/deriva_ml_base.py,sha256=JYTG_a8SURhrPQBTz6OaGMk0D0sSPWpXqCnoVnSNViI,38501
9
9
  deriva_ml/deriva_model.py,sha256=wytGCAHutiUaRfnRKr80Ks_P6ci0_wXRU3vq3lthfYU,13260
10
- deriva_ml/execution.py,sha256=xYS4wYRYcksNjUZ-Rwys_H4jZchW3YVu-uWg7ySJMjk,37510
11
- deriva_ml/execution_configuration.py,sha256=XQeXzPz9Gh_AGa_iYW8zF95niwHed3ojv4gnibB0thA,4082
10
+ deriva_ml/execution.py,sha256=Oyja5wonSBUDUIVSC01w3AojGEkWyw_8_kBMv3MTZBM,38126
11
+ deriva_ml/execution_configuration.py,sha256=KKg2HhvOiOmYc3jJ9iJeeHYyRu05Bb8JpojmPn1gYW0,14072
12
12
  deriva_ml/execution_environment.py,sha256=bCRKrCELDbGQDo7_FKfw7e8iMzVjSRZK3baKkqH5-_0,3264
13
13
  deriva_ml/feature.py,sha256=07g0uSrhumdopJluWuWSRMrzagaikAOihqB09bzXBP4,5475
14
14
  deriva_ml/history.py,sha256=qTDLDs8Ow_6r7mDO0gZm0Fg81SWKOAgtCU5pzZoDRgM,2828
15
+ deriva_ml/run_notebook.py,sha256=vhmij4P1Va52MIj8hOc-WmjLRp3sTmK6p7LXCWrzejc,6308
15
16
  deriva_ml/test_functions.py,sha256=-eqLHjjCQCLBNAr1ofbZekNiCOfMISSACRxT_YHER8I,4396
17
+ deriva_ml/test_notebook.ipynb,sha256=_5D6rkSGbmENPJZbDgfZ6-yt94BNEwxytVUDmG3RE3w,10166
16
18
  deriva_ml/upload.py,sha256=gHTGXAVlf56EwNzmw5zY0gbBf8h08eU2q2GBbb2FdVc,16087
17
19
  deriva_ml/schema_setup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
20
  deriva_ml/schema_setup/annotations.py,sha256=v0gTpmWYxRqsQ-bcnQzsr8WowGv2pi9pZUsO3WWnu1U,9528
19
21
  deriva_ml/schema_setup/create_schema.py,sha256=hNMc-v5tferd0UjfdB6nBw7Rc_o-Mg6NkPqQGie9YOw,11700
20
22
  deriva_ml/schema_setup/policy.json,sha256=77sf0Imy6CAQV0_VwwbA56_KROJ05WXsvT-Wjtkk538,1633
21
23
  deriva_ml/schema_setup/table_comments_utils.py,sha256=-2_ubEpoH7ViLVb-ZfW9wZbQ26DTKNgjkCABMzGu4i4,2140
22
- deriva_ml-1.12.3.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
23
- deriva_ml-1.12.3.dist-info/METADATA,sha256=CNoKyLpxijU8MrLj4VQzrOQLAU3oIT232DF9RI-eFbw,974
24
- deriva_ml-1.12.3.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
25
- deriva_ml-1.12.3.dist-info/entry_points.txt,sha256=ZiOvrYj022x544TQwi018ujeHRRDahNmwJnzn5ThacM,242
26
- deriva_ml-1.12.3.dist-info/top_level.txt,sha256=I1Q1dkH96cRghdsFRVqwpa2M7IqJpR2QPUNNc5-Bnpw,10
27
- deriva_ml-1.12.3.dist-info/RECORD,,
24
+ deriva_ml/test-files/execution-parameters.json,sha256=1vBqXlaMa0cysonE20TweVDfTGRdSi9CUuAkW1xiYNo,36
25
+ deriva_ml/test-files/notebook-parameters.json,sha256=7uEE2sLQSrSc9cEGQ_RKE7t5dwkEYv0qLo5mRbzo8Og,108
26
+ deriva_ml-1.13.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
27
+ deriva_ml-1.13.1.dist-info/METADATA,sha256=OKuCDvSR63ii7fO1W6tw-7-6RtYaKMHR59AbiURo_tI,999
28
+ deriva_ml-1.13.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
29
+ deriva_ml-1.13.1.dist-info/entry_points.txt,sha256=cJnALMa6pjdk6RQCt4HFbKHqALpVa0k6wPeQDPedLJI,295
30
+ deriva_ml-1.13.1.dist-info/top_level.txt,sha256=I1Q1dkH96cRghdsFRVqwpa2M7IqJpR2QPUNNc5-Bnpw,10
31
+ deriva_ml-1.13.1.dist-info/RECORD,,
@@ -1,4 +1,5 @@
1
1
  [console_scripts]
2
2
  deriva-ml-alter-annotation = deriva_ml.schema_setup.alter_annotation:main
3
3
  deriva-ml-create-schema = deriva_ml.schema_setup.create_schema:main
4
+ deriva-ml-run-notebook = deriva_ml.run_notebook:main
4
5
  deriva-ml-table-comments-utils = deriva_ml.schema_setup.table_comments_utils:main