deriva-ml 1.12.2__tar.gz → 1.13.0__tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (78)
  1. {deriva_ml-1.12.2/src/deriva_ml.egg-info → deriva_ml-1.13.0}/PKG-INFO +2 -1
  2. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/pyproject.toml +3 -0
  3. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/src/deriva_ml/__init__.py +2 -1
  4. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/src/deriva_ml/dataset.py +9 -2
  5. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/src/deriva_ml/deriva_definitions.py +6 -0
  6. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/src/deriva_ml/deriva_ml_base.py +14 -233
  7. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/src/deriva_ml/execution.py +101 -42
  8. deriva_ml-1.13.0/src/deriva_ml/execution_configuration.py +382 -0
  9. deriva_ml-1.13.0/src/deriva_ml/run_notebook.py +155 -0
  10. deriva_ml-1.13.0/src/deriva_ml/test_notebook.ipynb +124 -0
  11. {deriva_ml-1.12.2 → deriva_ml-1.13.0/src/deriva_ml.egg-info}/PKG-INFO +2 -1
  12. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/src/deriva_ml.egg-info/SOURCES.txt +2 -0
  13. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/src/deriva_ml.egg-info/entry_points.txt +1 -0
  14. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/src/deriva_ml.egg-info/requires.txt +1 -0
  15. deriva_ml-1.12.2/src/deriva_ml/execution_configuration.py +0 -102
  16. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/.github/workflows/publish-docs.yml +0 -0
  17. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/.gitignore +0 -0
  18. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/LICENSE +0 -0
  19. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/README.md +0 -0
  20. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/docs/.DS_Store +0 -0
  21. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/docs/Notebooks/DerivaML Create Notes.ipynb +0 -0
  22. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/docs/Notebooks/DerivaML Dataset.ipynb +0 -0
  23. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/docs/Notebooks/DerivaML Execution.ipynb +0 -0
  24. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/docs/Notebooks/DerivaML Features.ipynb +0 -0
  25. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/docs/Notebooks/DerivaML Vocabulary.ipynb +0 -0
  26. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/docs/assets/ERD.png +0 -0
  27. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/docs/assets/Launcher.png +0 -0
  28. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/docs/assets/copy_minid.png +0 -0
  29. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/docs/assets/deriva-logo.png +0 -0
  30. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/docs/assets/deriva-ml.pdf +0 -0
  31. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/docs/assets/sharing-at-home.pdf +0 -0
  32. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/docs/code-docs/dataset.md +0 -0
  33. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/docs/code-docs/dataset_aux_classes.md +0 -0
  34. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/docs/code-docs/dataset_bag.md +0 -0
  35. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/docs/code-docs/deriva_ml_base.md +0 -0
  36. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/docs/code-docs/deriva_model.md +0 -0
  37. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/docs/code-docs/execution.md +0 -0
  38. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/docs/code-docs/execution_configuration.md +0 -0
  39. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/docs/code-docs/feature.md +0 -0
  40. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/docs/code-docs/upload.md +0 -0
  41. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/docs/deriva_ml_structure.md +0 -0
  42. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/docs/index.md +0 -0
  43. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/docs/release-notes.md +0 -0
  44. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/docs/user-guide/datasets.md +0 -0
  45. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/docs/user-guide/execution-configuration.md +0 -0
  46. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/docs/user-guide/file-assets.md +0 -0
  47. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/docs/user-guide/identifiers.md +0 -0
  48. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/docs/user-guide/install.md +0 -0
  49. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/docs/user-guide/ml_workflow_instruction.md +0 -0
  50. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/mkdocs.yml +0 -0
  51. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/release.sh +0 -0
  52. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/setup.cfg +0 -0
  53. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/src/deriva_ml/database_model.py +0 -0
  54. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/src/deriva_ml/dataset_aux_classes.py +0 -0
  55. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/src/deriva_ml/dataset_bag.py +0 -0
  56. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/src/deriva_ml/demo_catalog.py +0 -0
  57. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/src/deriva_ml/deriva_model.py +0 -0
  58. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/src/deriva_ml/execution_environment.py +0 -0
  59. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/src/deriva_ml/feature.py +0 -0
  60. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/src/deriva_ml/history.py +0 -0
  61. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/src/deriva_ml/schema_setup/__init__.py +0 -0
  62. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/src/deriva_ml/schema_setup/annotations.py +0 -0
  63. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/src/deriva_ml/schema_setup/create_schema.py +0 -0
  64. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/src/deriva_ml/schema_setup/policy.json +0 -0
  65. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/src/deriva_ml/schema_setup/table_comments_utils.py +0 -0
  66. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/src/deriva_ml/test_functions.py +0 -0
  67. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/src/deriva_ml/upload.py +0 -0
  68. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/src/deriva_ml.egg-info/dependency_links.txt +0 -0
  69. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/src/deriva_ml.egg-info/top_level.txt +0 -0
  70. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/tests/__init__.py +0 -0
  71. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/tests/derivaml_test.py +0 -0
  72. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/tests/runner.py +0 -0
  73. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/tests/test_basic_tables.py +0 -0
  74. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/tests/test_dataset.py +0 -0
  75. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/tests/test_download.py +0 -0
  76. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/tests/test_execution.py +0 -0
  77. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/tests/test_features.py +0 -0
  78. {deriva_ml-1.12.2 → deriva_ml-1.13.0}/tests/test_upload.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deriva-ml
3
- Version: 1.12.2
3
+ Version: 1.13.0
4
4
  Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
5
5
  Author-email: ISRD <isrd-dev@isi.edu>
6
6
  Requires-Python: >=3.10
@@ -15,6 +15,7 @@ Requires-Dist: semver>3.0.0
15
15
  Requires-Dist: setuptools>=64
16
16
  Requires-Dist: setuptools-scm<=6.0
17
17
  Requires-Dist: nbstripout
18
+ Requires-Dist: papermill
18
19
  Dynamic: license-file
19
20
 
20
21
  # DerivaML
@@ -22,6 +22,7 @@ dependencies = [
22
22
  "setuptools>=64",
23
23
  "setuptools-scm<=6.0",
24
24
  "nbstripout",
25
+ "papermill"
25
26
  ]
26
27
 
27
28
  [tool.setuptools.package-data]
@@ -54,6 +55,8 @@ post_commit_hooks = []
54
55
  deriva-ml-table-comments-utils = "deriva_ml.schema_setup.table_comments_utils:main"
55
56
  deriva-ml-create-schema = "deriva_ml.schema_setup.create_schema:main"
56
57
  deriva-ml-alter-annotation = "deriva_ml.schema_setup.alter_annotation:main"
58
+ deriva-ml-run-notebook = "deriva_ml.run_notebook:main"
59
+
57
60
 
58
61
  [metadata]
59
62
  license = "Apache 2.0"
@@ -14,6 +14,7 @@ __all__ = [
14
14
  "BuiltinTypes",
15
15
  "UploadState",
16
16
  "MLVocab",
17
+ "MLAsset",
17
18
  "ExecMetadataVocab",
18
19
  "RID",
19
20
  "DerivaSystemColumns",
@@ -32,6 +33,7 @@ from .deriva_definitions import (
32
33
  RID,
33
34
  DerivaMLException,
34
35
  MLVocab,
36
+ MLAsset,
35
37
  ExecMetadataVocab,
36
38
  DerivaSystemColumns,
37
39
  )
@@ -49,4 +51,3 @@ try:
49
51
  except PackageNotFoundError:
50
52
  # package is not installed
51
53
  pass
52
-
@@ -41,7 +41,14 @@ from tempfile import TemporaryDirectory, NamedTemporaryFile
41
41
  from typing import Any, Callable, Optional, Iterable, Iterator, TYPE_CHECKING
42
42
 
43
43
  from deriva_ml import DatasetBag
44
- from .deriva_definitions import ML_SCHEMA, DerivaMLException, MLVocab, Status, RID
44
+ from .deriva_definitions import (
45
+ ML_SCHEMA,
46
+ DerivaMLException,
47
+ MLVocab,
48
+ Status,
49
+ RID,
50
+ DRY_RUN_RID,
51
+ )
45
52
  from .history import iso_to_snap
46
53
  from .deriva_model import DerivaModel
47
54
  from .database_model import DatabaseModel
@@ -957,7 +964,7 @@ class Dataset:
957
964
  for the dataset.
958
965
  """
959
966
  if (
960
- execution_rid
967
+ execution_rid != DRY_RUN_RID
961
968
  and self._model.catalog.resolve_rid(execution_rid).table.name != "Execution"
962
969
  ):
963
970
  raise DerivaMLException(f"RID {execution_rid} is not an execution")
@@ -21,6 +21,7 @@ from pydantic import (
21
21
  from socket import gethostname
22
22
 
23
23
  ML_SCHEMA = "deriva-ml"
24
+ DRY_RUN_RID = "0000"
24
25
 
25
26
  # We are going to use schema as a field name and this collides with method in pydantic base class
26
27
  warnings.filterwarnings(
@@ -191,6 +192,11 @@ class MLVocab(StrEnum):
191
192
  asset_role = "Asset_Role"
192
193
 
193
194
 
195
+ class MLAsset(StrEnum):
196
+ execution_metadata = "Execution_Metadata"
197
+ execution_asset = "Execution_Asset"
198
+
199
+
194
200
  class ExecMetadataVocab(StrEnum):
195
201
  """
196
202
  Predefined execution metadata types.
@@ -14,17 +14,16 @@ import getpass
14
14
  import logging
15
15
  from datetime import datetime
16
16
  from itertools import chain
17
- import inspect
18
- import setuptools_scm
19
17
  from pathlib import Path
20
18
  import requests
21
- import subprocess
19
+
22
20
  from typing import Optional, Any, Iterable, TYPE_CHECKING
21
+
23
22
  from deriva.core import (
24
23
  get_credential,
25
24
  urlquote,
26
- DEFAULT_SESSION_CONFIG,
27
25
  format_exception,
26
+ DEFAULT_SESSION_CONFIG,
28
27
  )
29
28
  import deriva.core.datapath as datapath
30
29
  from deriva.core.datapath import DataPathException
@@ -33,7 +32,6 @@ from deriva.core.ermrest_catalog import ResolveRidResult
33
32
  from deriva.core.ermrest_model import Key, Table
34
33
  from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
35
34
  from pydantic import validate_call, ConfigDict
36
- from requests import RequestException
37
35
 
38
36
  from .execution_configuration import ExecutionConfiguration, Workflow
39
37
  from .feature import Feature, FeatureRecord
@@ -60,33 +58,6 @@ except ImportError: # Graceful fallback if IceCream isn't installed.
60
58
  ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a) # noqa
61
59
 
62
60
 
63
- try:
64
- from IPython import get_ipython
65
- except ImportError: # Graceful fallback if IPython isn't installed.
66
-
67
- def get_ipython():
68
- """Dummy routine in case you are not running in IPython."""
69
- return None
70
-
71
-
72
- try:
73
- from jupyter_server.serverapp import list_running_servers
74
- except ImportError:
75
-
76
- def list_running_servers():
77
- """Dummy routine in case you are not running in Jupyter."""
78
- return []
79
-
80
-
81
- try:
82
- from ipykernel import get_connection_file
83
- except ImportError:
84
-
85
- def get_connection_file():
86
- """Dummy routine in case you are not running in Jupyter."""
87
- return ""
88
-
89
-
90
61
  if TYPE_CHECKING:
91
62
  from .execution import Execution
92
63
 
@@ -165,7 +136,6 @@ class DerivaML(Dataset):
165
136
  self.version = model_version
166
137
  self.configuration = None
167
138
  self._execution: Optional[Execution] = None
168
- self.executable_path, self._is_notebook = self._get_python_script()
169
139
  self.domain_schema = self.model.domain_schema
170
140
  self.project_name = project_name or self.domain_schema
171
141
  self.start_time = datetime.now()
@@ -192,102 +162,6 @@ class DerivaML(Dataset):
192
162
  except (AttributeError, requests.HTTPError):
193
163
  pass
194
164
 
195
- def _check_nbstrip_status(self) -> None:
196
- """Check to see if nbstrip is installed"""
197
- try:
198
- if subprocess.run(
199
- ["nbstripout", "--is-installed"],
200
- check=False,
201
- capture_output=True,
202
- ).returncode:
203
- self._logger.warning(
204
- "nbstripout is not installed in repository. Please run nbstripout --install"
205
- )
206
- except subprocess.CalledProcessError:
207
- self._logger.error("nbstripout is not found.")
208
-
209
- @staticmethod
210
- def _get_notebook_session() -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
211
- """Return the absolute path of the current notebook."""
212
- # Get the kernel's connection file and extract the kernel ID
213
- try:
214
- if not (connection_file := Path(get_connection_file()).name):
215
- return None, None
216
- except RuntimeError:
217
- return None, None
218
-
219
- kernel_id = connection_file.split("-", 1)[1].split(".")[0]
220
-
221
- # Look through the running server sessions to find the matching kernel ID
222
- for server in list_running_servers():
223
- try:
224
- # If a token is required for authentication, include it in headers
225
- token = server.get("token", "")
226
- headers = {}
227
- if token:
228
- headers["Authorization"] = f"token {token}"
229
-
230
- try:
231
- sessions_url = server["url"] + "api/sessions"
232
- response = requests.get(sessions_url, headers=headers)
233
- response.raise_for_status()
234
- sessions = response.json()
235
- except RequestException as e:
236
- raise e
237
- for sess in sessions:
238
- if sess["kernel"]["id"] == kernel_id:
239
- return server, sess
240
- except Exception as _e:
241
- # Ignore servers we can't connect to.
242
- pass
243
- return None, None
244
-
245
- def _get_notebook_path(self) -> Path | None:
246
- """Return the absolute path of the current notebook."""
247
-
248
- server, session = DerivaML._get_notebook_session()
249
- if server and session:
250
- self._check_nbstrip_status()
251
- relative_path = session["notebook"]["path"]
252
- # Join the notebook directory with the relative path
253
- return Path(server["root_dir"]) / relative_path
254
- else:
255
- return None
256
-
257
- def _get_python_script(self) -> tuple[Path, bool]:
258
- """Return the path to the currently executing script"""
259
- is_notebook = False
260
- if filename := self._get_notebook_path():
261
- is_notebook = True
262
- else:
263
- stack = inspect.stack()
264
- # Get the caller's filename, which is two up the stack from here.
265
- if len(stack) > 1:
266
- filename = Path(stack[2].filename)
267
- if not filename.exists():
268
- # Begin called from command line interpreter.
269
- filename = "REPL"
270
- # Get the caller's filename, which is two up the stack from here.
271
- else:
272
- raise DerivaMLException(
273
- "Looking for caller failed"
274
- ) # Stack is too shallow
275
- return filename, is_notebook
276
-
277
- def _get_git_root(self):
278
- try:
279
- result = subprocess.run(
280
- ["git", "rev-parse", "--show-toplevel"],
281
- cwd=self.executable_path.parent,
282
- stdout=subprocess.PIPE,
283
- stderr=subprocess.DEVNULL,
284
- text=True,
285
- check=True,
286
- )
287
- return result.stdout.strip()
288
- except subprocess.CalledProcessError:
289
- return None # Not in a git repository
290
-
291
165
  @staticmethod
292
166
  def _get_session_config():
293
167
  """ """
@@ -311,10 +185,6 @@ class DerivaML(Dataset):
311
185
  """Get a new instance of a pathBuilder object."""
312
186
  return self.catalog.getPathBuilder()
313
187
 
314
- def get_version(self) -> str:
315
- """Return the version number of the executable"""
316
- return setuptools_scm.get_version(root=self._get_git_root())
317
-
318
188
  @property
319
189
  def domain_path(self):
320
190
  """Get a new instance of a pathBuilder object to the domain schema"""
@@ -1117,105 +987,7 @@ class DerivaML(Dataset):
1117
987
  # Make sure type is correct.
1118
988
  self.lookup_term(MLVocab.workflow_type, workflow_type)
1119
989
 
1120
- try:
1121
- subprocess.run(
1122
- "git rev-parse --is-inside-work-tree",
1123
- capture_output=True,
1124
- text=True,
1125
- shell=True,
1126
- check=True,
1127
- )
1128
- except subprocess.CalledProcessError:
1129
- raise DerivaMLException("Not executing in a Git repository.")
1130
-
1131
- github_url, is_dirty = self._github_url()
1132
-
1133
- if is_dirty:
1134
- self._logger.warning(
1135
- f"File {self.executable_path} has been modified since last commit. Consider commiting before executing"
1136
- )
1137
-
1138
- # If you are in a notebook, strip out the outputs before computing the checksum.
1139
- cmd = (
1140
- f"nbstripout {self.executable_path} | git hash-object --stdin"
1141
- if self._is_notebook
1142
- else f"git hash-object {self.executable_path}"
1143
- )
1144
- checksum = (
1145
- subprocess.run(
1146
- cmd,
1147
- capture_output=True,
1148
- text=True,
1149
- check=False,
1150
- shell=True,
1151
- ).stdout.strip()
1152
- if self.executable_path != "REPL"
1153
- else "1"
1154
- )
1155
-
1156
- return Workflow(
1157
- name=name,
1158
- url=github_url,
1159
- checksum=checksum,
1160
- description=description,
1161
- workflow_type=workflow_type,
1162
- )
1163
-
1164
- def _github_url(self) -> tuple[str, bool]:
1165
- """Return a GitHUB URL for the latest commit of the script from which this routine is called.
1166
-
1167
- This routine is used to be called from a script or notebook (e.g. python -m file). It assumes that
1168
- the file is in a gitHUB repository and commited. It returns a URL to the last commited version of this
1169
- file in GitHUB.
1170
-
1171
- Returns: A tuple with the gethub_url and a boolean to indicated if uncommited changes
1172
- have been made to the file.
1173
-
1174
- """
1175
-
1176
- # Get repo URL from local gitHub repo.
1177
- if self.executable_path == "REPL":
1178
- return "REPL", True
1179
- try:
1180
- result = subprocess.run(
1181
- ["git", "remote", "get-url", "origin"],
1182
- capture_output=True,
1183
- text=True,
1184
- cwd=self.executable_path.parent,
1185
- )
1186
- github_url = result.stdout.strip().removesuffix(".git")
1187
- except subprocess.CalledProcessError:
1188
- raise DerivaMLException("No GIT remote found")
1189
-
1190
- # Find the root directory for the repository
1191
- repo_root = self._get_git_root()
1192
-
1193
- # Now check to see if file has been modified since the last commit.
1194
- try:
1195
- result = subprocess.run(
1196
- ["git", "status", "--porcelain"],
1197
- cwd=self.executable_path.parent,
1198
- capture_output=True,
1199
- text=True,
1200
- check=True,
1201
- )
1202
- is_dirty = bool(
1203
- "M " in result.stdout.strip()
1204
- ) # Returns True if output indicates a modified file
1205
- except subprocess.CalledProcessError:
1206
- is_dirty = False # If Git command fails, assume no changes
1207
-
1208
- """Get SHA-1 hash of latest commit of the file in the repository"""
1209
- result = subprocess.run(
1210
- ["git", "log", "-n", "1", "--pretty=format:%H--", self.executable_path],
1211
- cwd=self.executable_path.parent,
1212
- capture_output=True,
1213
- text=True,
1214
- check=True,
1215
- )
1216
- sha = result.stdout.strip()
1217
- url = f"{github_url}/blob/{sha}/{self.executable_path.relative_to(repo_root)}"
1218
- return url, is_dirty
990
+ return Workflow.create_workflow(name, workflow_type, description)
1219
991
 
1220
992
  # @validate_call
1221
993
  def create_execution(
@@ -1259,6 +1031,15 @@ class DerivaML(Dataset):
1259
1031
  exec_rid=execution_rid,
1260
1032
  file_name="configuration.json",
1261
1033
  asset_table=self.model.name_to_table("Execution_Metadata"),
1034
+ metadata={},
1262
1035
  )
1263
- configuration = ExecutionConfiguration.load_configuration(cfile)
1036
+
1037
+ if cfile.exists():
1038
+ configuration = ExecutionConfiguration.load_configuration(cfile)
1039
+ else:
1040
+ execution = self.retrieve_rid(execution_rid)
1041
+ configuration = ExecutionConfiguration(
1042
+ workflow=execution["Workflow"],
1043
+ description=execution["Description"],
1044
+ )
1264
1045
  return Execution(configuration, self, reload=execution_rid)
@@ -5,21 +5,31 @@ This module defined the Execution class which is used to interact with the state
5
5
  from __future__ import annotations
6
6
 
7
7
  from collections import defaultdict
8
+ from datetime import datetime
8
9
  import json
9
10
  import logging
10
11
  import os
11
- import shutil
12
- from datetime import datetime
13
12
  from pathlib import Path
14
- from typing import Iterable, Any, Optional
15
13
 
16
- from deriva.core import format_exception
17
14
  from pydantic import validate_call, ConfigDict
15
+ import regex as re
18
16
  import sys
19
- from deriva.core.hatrac_store import HatracStore
17
+ import shutil
18
+ from typing import Iterable, Any, Optional
20
19
 
20
+ from deriva.core import format_exception
21
+ from deriva.core.datapath import DataPathException
22
+ from deriva.core.hatrac_store import HatracStore
21
23
  from .deriva_definitions import ExecMetadataVocab
22
- from .deriva_definitions import RID, Status, FileUploadState, DerivaMLException, MLVocab
24
+ from .deriva_definitions import (
25
+ RID,
26
+ Status,
27
+ FileUploadState,
28
+ DerivaMLException,
29
+ MLVocab,
30
+ MLAsset,
31
+ DRY_RUN_RID,
32
+ )
23
33
  from .deriva_ml_base import DerivaML, FeatureRecord
24
34
  from .dataset_aux_classes import DatasetSpec, DatasetVersion, VersionPart
25
35
  from .dataset_bag import DatasetBag
@@ -45,11 +55,14 @@ except ImportError: # Graceful fallback if IceCream isn't installed.
45
55
 
46
56
 
47
57
  try:
48
- from jupyter_server.serverapp import list_running_servers
58
+ from IPython.display import display, Markdown
49
59
  except ImportError:
50
60
 
51
- def list_running_servers():
52
- return []
61
+ def display(s):
62
+ print(s)
63
+
64
+ def Markdown(s):
65
+ return s
53
66
 
54
67
 
55
68
  class AssetFilePath(type(Path())):
@@ -167,7 +180,7 @@ class Execution:
167
180
  self.workflow_rid = (
168
181
  self._ml_object.add_workflow(self.configuration.workflow)
169
182
  if not self._dry_run
170
- else "0000"
183
+ else DRY_RUN_RID
171
184
  )
172
185
  else:
173
186
  self.workflow_rid = self.configuration.workflow
@@ -195,10 +208,10 @@ class Execution:
195
208
  schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
196
209
  if reload:
197
210
  self.execution_rid = reload
198
- if self.execution_rid == "0000":
211
+ if self.execution_rid == DRY_RUN_RID:
199
212
  self._dry_run = True
200
213
  elif self._dry_run:
201
- self.execution_rid = "0000"
214
+ self.execution_rid = DRY_RUN_RID
202
215
  else:
203
216
  self.execution_rid = schema_path.Execution.insert(
204
217
  [
@@ -209,6 +222,15 @@ class Execution:
209
222
  ]
210
223
  )[0]["RID"]
211
224
 
225
+ if (
226
+ isinstance(self.configuration.workflow, Workflow)
227
+ and self.configuration.workflow.is_notebook
228
+ ):
229
+ # Put execution_rid into cell output so we can find it later.
230
+ display(
231
+ Markdown(f"Execution RID: {self._ml_object.cite(self.execution_rid)}")
232
+ )
233
+
212
234
  # Create a directory for execution rid so we can recover state in case of a crash.
213
235
  execution_root(prefix=self._ml_object.working_dir, exec_rid=self.execution_rid)
214
236
  self._initialize_execution(reload)
@@ -272,16 +294,20 @@ class Execution:
272
294
  )
273
295
 
274
296
  # Save configuration details for later upload
275
- cfile = self.asset_file_path(
276
- asset_name="Execution_Metadata",
277
- file_name="configuration.json",
278
- asset_types=ExecMetadataVocab.execution_config.value,
279
- )
280
- with open(cfile.as_posix(), "w", encoding="utf-8") as config_file:
281
- json.dump(self.configuration.model_dump(), config_file)
297
+ if not reload:
298
+ cfile = self.asset_file_path(
299
+ asset_name=MLAsset.execution_metadata,
300
+ file_name="configuration.json",
301
+ asset_types=ExecMetadataVocab.execution_config.value,
302
+ )
303
+ with open(cfile.as_posix(), "w", encoding="utf-8") as config_file:
304
+ json.dump(self.configuration.model_dump(), config_file)
305
+
306
+ for parameter_file in self.configuration.parameters:
307
+ self.asset_file_path(MLAsset.execution_assets, parameter_file)
282
308
 
283
- # save runtime env
284
- self._save_runtime_environment()
309
+ # save runtime env
310
+ self._save_runtime_environment()
285
311
 
286
312
  self.start_time = datetime.now()
287
313
  self.update_status(Status.pending, "Initialize status finished.")
@@ -625,9 +651,20 @@ class Execution:
625
651
  with open(feature_file, "r") as feature_values:
626
652
  entities = [json.loads(line.strip()) for line in feature_values]
627
653
  # Update the asset columns in the feature and add to the catalog.
628
- self._ml_object.domain_path.tables[feature_table].insert(
629
- [map_path(e) for e in entities]
630
- )
654
+ try:
655
+ self._ml_object.domain_path.tables[feature_table].insert(
656
+ [map_path(e) for e in entities]
657
+ )
658
+ except DataPathException as e:
659
+ if re.match(
660
+ rf'DETAIL: +Key +\("Execution", +"{target_table}", +"Feature_Name"\)=\(.*\) already exists',
661
+ e.message,
662
+ ):
663
+ self._logger.info(
664
+ f"Skipping reload of feature values for {feature_table}"
665
+ )
666
+ else:
667
+ raise e
631
668
 
632
669
  def _update_asset_execution_table(
633
670
  self,
@@ -652,16 +689,27 @@ class Execution:
652
689
  asset_exe = self._model.find_association(asset_table_name, "Execution")
653
690
  asset_exe_path = pb.schemas[asset_exe.schema.name].tables[asset_exe.name]
654
691
 
655
- asset_exe_path.insert(
656
- [
657
- {
658
- asset_table_name: asset_path.asset_rid,
659
- "Execution": self.execution_rid,
660
- "Asset_Role": asset_role,
661
- }
662
- for asset_path in asset_list
663
- ]
664
- )
692
+ try:
693
+ asset_exe_path.insert(
694
+ [
695
+ {
696
+ asset_table_name: asset_path.asset_rid,
697
+ "Execution": self.execution_rid,
698
+ "Asset_Role": asset_role,
699
+ }
700
+ for asset_path in asset_list
701
+ ]
702
+ )
703
+ except DataPathException as e:
704
+ if re.match(
705
+ rf'DETAIL: +Key +\("{asset_table_name}", +"Execution"\)=\(.*\) already exists',
706
+ e.message,
707
+ ):
708
+ self._logger.info(
709
+ f"Skipping reload of execution assocations for {asset_table_name}"
710
+ )
711
+ else:
712
+ raise e
665
713
 
666
714
  # Now add in the type names via the asset_asset_type association table.
667
715
  # Get the list of types for each file in the asset.
@@ -687,19 +735,30 @@ class Execution:
687
735
  type_path = pb.schemas[asset_asset_type.schema.name].tables[
688
736
  asset_asset_type.name
689
737
  ]
690
- type_path.insert(
691
- [
692
- {asset_table_name: asset.asset_rid, "Asset_Type": t}
693
- for asset in asset_list
694
- for t in asset_type_map[asset.file_name]
695
- ]
696
- )
738
+ try:
739
+ type_path.insert(
740
+ [
741
+ {asset_table_name: asset.asset_rid, "Asset_Type": t}
742
+ for asset in asset_list
743
+ for t in asset_type_map[asset.file_name]
744
+ ]
745
+ )
746
+ except DataPathException as e:
747
+ if re.match(
748
+ rf'DETAIL: +Key +\("{asset_table_name}", +"Asset_Type"\)=\(.*\) already exists',
749
+ e.message,
750
+ ):
751
+ self._logger.info(
752
+ f"Skipping reload of execution asset types for {asset_table_name}"
753
+ )
754
+ else:
755
+ raise e
697
756
 
698
757
  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
699
758
  def asset_file_path(
700
759
  self,
701
760
  asset_name: str,
702
- file_name: str,
761
+ file_name: str | Path,
703
762
  asset_types: Optional[list[str] | str] = None,
704
763
  copy_file=False,
705
764
  **kwargs,