deriva-ml 1.14.0__py3-none-any.whl → 1.14.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. deriva_ml/__init__.py +25 -30
  2. deriva_ml/core/__init__.py +39 -0
  3. deriva_ml/core/base.py +1489 -0
  4. deriva_ml/core/constants.py +36 -0
  5. deriva_ml/core/definitions.py +74 -0
  6. deriva_ml/core/enums.py +222 -0
  7. deriva_ml/core/ermrest.py +288 -0
  8. deriva_ml/core/exceptions.py +28 -0
  9. deriva_ml/core/filespec.py +116 -0
  10. deriva_ml/dataset/__init__.py +4 -0
  11. deriva_ml/{dataset_aux_classes.py → dataset/aux_classes.py} +16 -12
  12. deriva_ml/{dataset.py → dataset/dataset.py} +405 -428
  13. deriva_ml/{dataset_bag.py → dataset/dataset_bag.py} +137 -97
  14. deriva_ml/{history.py → dataset/history.py} +51 -33
  15. deriva_ml/{upload.py → dataset/upload.py} +48 -70
  16. deriva_ml/demo_catalog.py +233 -183
  17. deriva_ml/execution/environment.py +290 -0
  18. deriva_ml/{execution.py → execution/execution.py} +365 -252
  19. deriva_ml/execution/execution_configuration.py +163 -0
  20. deriva_ml/{execution_configuration.py → execution/workflow.py} +206 -218
  21. deriva_ml/feature.py +83 -46
  22. deriva_ml/model/__init__.py +0 -0
  23. deriva_ml/{deriva_model.py → model/catalog.py} +113 -132
  24. deriva_ml/{database_model.py → model/database.py} +52 -74
  25. deriva_ml/model/sql_mapper.py +44 -0
  26. deriva_ml/run_notebook.py +19 -11
  27. deriva_ml/schema/__init__.py +3 -0
  28. deriva_ml/{schema_setup → schema}/annotations.py +31 -22
  29. deriva_ml/schema/check_schema.py +104 -0
  30. deriva_ml/{schema_setup → schema}/create_schema.py +151 -104
  31. deriva_ml/schema/deriva-ml-reference.json +8525 -0
  32. deriva_ml/schema/table_comments_utils.py +57 -0
  33. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/METADATA +5 -4
  34. deriva_ml-1.14.26.dist-info/RECORD +40 -0
  35. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/entry_points.txt +1 -0
  36. deriva_ml/deriva_definitions.py +0 -391
  37. deriva_ml/deriva_ml_base.py +0 -1046
  38. deriva_ml/execution_environment.py +0 -139
  39. deriva_ml/schema_setup/table_comments_utils.py +0 -56
  40. deriva_ml/test-files/execution-parameters.json +0 -1
  41. deriva_ml/test-files/notebook-parameters.json +0 -5
  42. deriva_ml/test_functions.py +0 -141
  43. deriva_ml/test_notebook.ipynb +0 -197
  44. deriva_ml-1.14.0.dist-info/RECORD +0 -31
  45. /deriva_ml/{schema_setup → execution}/__init__.py +0 -0
  46. /deriva_ml/{schema_setup → schema}/policy.json +0 -0
  47. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/WHEEL +0 -0
  48. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/licenses/LICENSE +0 -0
  49. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/top_level.txt +0 -0
@@ -1,91 +1,239 @@
1
- """
2
- Classes that are used to define an execution configuration.
3
- """
4
-
5
- from __future__ import annotations
6
-
7
1
  import inspect
8
- import json
9
2
  import logging
10
3
  import os
11
-
12
- from requests import RequestException
13
- import requests
14
4
  import subprocess
15
- from typing import Optional, Any
5
+ from pathlib import Path
6
+ from typing import Any
16
7
 
8
+ import requests
17
9
  from pydantic import (
18
10
  BaseModel,
19
- conlist,
20
- ConfigDict,
21
- field_validator,
22
- Field,
23
11
  PrivateAttr,
24
12
  )
25
- from pathlib import Path
26
- import sys
27
-
13
+ from requests import RequestException
28
14
 
29
- from .dataset_aux_classes import DatasetSpec
30
- from .deriva_definitions import RID, DerivaMLException
15
+ from deriva_ml.core.definitions import RID
16
+ from deriva_ml.core.exceptions import DerivaMLException
31
17
 
32
18
  try:
33
- from IPython import get_ipython
19
+ from IPython.core.getipython import get_ipython
34
20
  except ImportError: # Graceful fallback if IPython isn't installed.
35
21
 
36
- def get_ipython():
37
- """Dummy routine in case you are not running in IPython."""
22
+ def get_ipython() -> None:
38
23
  return None
39
24
 
40
25
 
41
26
  try:
42
27
  from jupyter_server.serverapp import list_running_servers
28
+
29
+ def get_servers() -> list[Any]:
30
+ return list(list_running_servers())
43
31
  except ImportError:
44
32
 
45
33
  def list_running_servers():
46
- """Dummy routine in case you are not running in Jupyter."""
47
34
  return []
48
35
 
36
+ def get_servers() -> list[Any]:
37
+ return list_running_servers()
38
+
49
39
 
50
40
  try:
51
- from ipykernel import get_connection_file
41
+ from ipykernel.connect import get_connection_file
42
+
43
+ def get_kernel_connection() -> str:
44
+ return get_connection_file()
52
45
  except ImportError:
53
46
 
54
47
  def get_connection_file():
55
- """Dummy routine in case you are not running in Jupyter."""
56
48
  return ""
57
49
 
50
+ def get_kernel_connection() -> str:
51
+ return get_connection_file()
52
+
58
53
 
59
54
  class Workflow(BaseModel):
60
- """A specification of a workflow. Must have a name, URI to the workflow instance, and a type. The workflow type
61
- needs to be an existing-controlled vocabulary term.
55
+ """Represents a computational workflow in DerivaML.
56
+
57
+ A workflow defines a computational process or analysis pipeline. Each workflow has
58
+ a unique identifier, source code location, and type. Workflows are typically
59
+ associated with Git repositories for version control.
62
60
 
63
61
  Attributes:
64
- name: The name of the workflow
65
- url: The URI to the workflow instance. In most cases should be a GitHub URI to the code being executed.
66
- workflow_type: The type of the workflow. Must be an existing controlled vocabulary term.
67
- version: The version of the workflow instance. Should follow semantic versioning.
68
- description: A description of the workflow instance. Can be in Markdown format.
69
- is_notebook: A boolean indicating whether this workflow instance is a notebook or not.
62
+ name (str): Human-readable name of the workflow.
63
+ url (str): URI to the workflow source code (typically a GitHub URL).
64
+ workflow_type (str): Type of workflow (must be a controlled vocabulary term).
65
+ version (str | None): Version identifier (semantic versioning).
66
+ description (str | None): Description of workflow purpose and behavior.
67
+ rid (RID | None): Resource Identifier if registered in catalog.
68
+ checksum (str | None): Git hash of workflow source code.
69
+ is_notebook (bool): Whether workflow is a Jupyter notebook.
70
+
71
+ Example:
72
+ >>> workflow = Workflow(
73
+ ... name="RNA Analysis",
74
+ ... url="https://github.com/org/repo/analysis.ipynb",
75
+ ... workflow_type="python_notebook",
76
+ ... version="1.0.0",
77
+ ... description="RNA sequence analysis"
78
+ ... )
70
79
  """
71
80
 
72
81
  name: str
73
82
  url: str
74
83
  workflow_type: str
75
- version: Optional[str] = None
76
- description: str = None
77
- rid: Optional[RID] = None
78
- checksum: Optional[str] = None
84
+ version: str | None = None
85
+ description: str | None = None
86
+ rid: RID | None = None
87
+ checksum: str | None = None
79
88
  is_notebook: bool = False
80
89
 
81
90
  _logger: Any = PrivateAttr()
82
91
 
83
92
  def __post_init__(self):
93
+ """Initializes logging for the workflow."""
84
94
  self._logger = logging.getLogger("deriva_ml")
85
95
 
96
+ @staticmethod
97
+ def create_workflow(
98
+ name: str,
99
+ workflow_type: str,
100
+ description: str = "",
101
+ ) -> "Workflow":
102
+ """Creates a workflow from the current execution context.
103
+
104
+ Identifies the currently executing program (script or notebook) and creates
105
+ a workflow definition. Automatically determines the Git repository information
106
+ and source code checksum.
107
+
108
+ The behavior can be configured using environment variables:
109
+ - DERIVA_ML_WORKFLOW_URL: Override the detected workflow URL
110
+ - DERIVA_ML_WORKFLOW_CHECKSUM: Override the computed checksum
111
+
112
+ Args:
113
+ name: Human-readable name for the workflow.
114
+ workflow_type: Type of workflow (must be a vocabulary term).
115
+ description: Optional description of workflow purpose.
116
+
117
+ Returns:
118
+ Workflow: New workflow instance with detected Git information.
119
+
120
+ Raises:
121
+ DerivaMLException: If not in a Git repository or detection fails.
122
+
123
+ Example:
124
+ >>> workflow = Workflow.create_workflow(
125
+ ... name="Sample Analysis",
126
+ ... workflow_type="python_script",
127
+ ... description="Process sample data"
128
+ ... )
129
+ """
130
+
131
+ # Check to see if execution file info is being passed in by calling program.
132
+ if "DERIVA_ML_WORKFLOW_URL" in os.environ:
133
+ github_url = os.environ["DERIVA_ML_WORKFLOW_URL"]
134
+ checksum = os.environ["DERIVA_ML_WORKFLOW_CHECKSUM"]
135
+ is_notebook = True
136
+ else:
137
+ path, is_notebook = Workflow._get_python_script()
138
+ github_url, checksum = Workflow.get_url_and_checksum(path)
139
+
140
+ return Workflow(
141
+ name=name,
142
+ url=github_url,
143
+ checksum=checksum,
144
+ description=description,
145
+ workflow_type=workflow_type,
146
+ is_notebook=is_notebook,
147
+ )
148
+
149
+ @staticmethod
150
+ def get_url_and_checksum(executable_path: Path) -> tuple[str, str]:
151
+ """Determines the Git URL and checksum for a file.
152
+
153
+ Computes the Git repository URL and file checksum for the specified path.
154
+ For notebooks, strips cell outputs before computing the checksum.
155
+
156
+ Args:
157
+ executable_path: Path to the workflow file.
158
+
159
+ Returns:
160
+ tuple[str, str]: (GitHub URL, Git object hash)
161
+
162
+ Raises:
163
+ DerivaMLException: If not in a Git repository.
164
+
165
+ Example:
166
+ >>> url, checksum = Workflow.get_url_and_checksum(Path("analysis.ipynb"))
167
+ >>> print(f"URL: {url}")
168
+ >>> print(f"Checksum: {checksum}")
169
+ """
170
+ try:
171
+ subprocess.run(
172
+ "git rev-parse --is-inside-work-tree",
173
+ capture_output=True,
174
+ text=True,
175
+ shell=True,
176
+ check=True,
177
+ )
178
+ except subprocess.CalledProcessError:
179
+ raise DerivaMLException("Not executing in a Git repository.")
180
+
181
+ github_url, is_dirty = Workflow._github_url(executable_path)
182
+
183
+ if is_dirty:
184
+ logging.getLogger("deriva_ml").warning(
185
+ f"File {executable_path} has been modified since last commit. Consider commiting before executing"
186
+ )
187
+
188
+ # If you are in a notebook, strip out the outputs before computing the checksum.
189
+ cmd = (
190
+ f"nbstripout -t {executable_path} | git hash-object --stdin"
191
+ if "ipynb" == executable_path.suffix
192
+ else f"git hash-object {executable_path}"
193
+ )
194
+ checksum = (
195
+ subprocess.run(
196
+ cmd,
197
+ capture_output=True,
198
+ text=True,
199
+ check=False,
200
+ shell=True,
201
+ ).stdout.strip()
202
+ if executable_path != "REPL"
203
+ else "1"
204
+ )
205
+ return github_url, checksum
206
+
207
+ @staticmethod
208
+ def _get_git_root(executable_path: Path) -> str | None:
209
+ """Gets the root directory of the Git repository.
210
+
211
+ Args:
212
+ executable_path: Path to check for Git repository.
213
+
214
+ Returns:
215
+ str | None: Absolute path to repository root, or None if not in repository.
216
+ """
217
+ try:
218
+ result = subprocess.run(
219
+ ["git", "rev-parse", "--show-toplevel"],
220
+ cwd=executable_path.parent,
221
+ stdout=subprocess.PIPE,
222
+ stderr=subprocess.DEVNULL,
223
+ text=True,
224
+ check=True,
225
+ )
226
+ return result.stdout.strip()
227
+ except subprocess.CalledProcessError:
228
+ return None # Not in a git repository
229
+
86
230
  @staticmethod
87
231
  def _check_nbstrip_status() -> None:
88
- """Check to see if nbstrip is installed"""
232
+ """Checks if nbstripout is installed and configured.
233
+
234
+ Verifies that the nbstripout tool is available and properly installed in the
235
+ Git repository. Issues warnings if setup is incomplete.
236
+ """
89
237
  logger = logging.getLogger("deriva_ml")
90
238
  try:
91
239
  if subprocess.run(
@@ -93,15 +241,17 @@ class Workflow(BaseModel):
93
241
  check=False,
94
242
  capture_output=True,
95
243
  ).returncode:
96
- logger.warning(
97
- "nbstripout is not installed in repository. Please run nbstripout --install"
98
- )
244
+ logger.warning("nbstripout is not installed in repository. Please run nbstripout --install")
99
245
  except subprocess.CalledProcessError:
100
246
  logger.error("nbstripout is not found.")
101
247
 
102
248
  @staticmethod
103
249
  def _get_notebook_path() -> Path | None:
104
- """Return the absolute path of the current notebook."""
250
+ """Gets the path of the currently executing notebook.
251
+
252
+ Returns:
253
+ Path | None: Absolute path to current notebook, or None if not in notebook.
254
+ """
105
255
 
106
256
  server, session = Workflow._get_notebook_session()
107
257
  if server and session:
@@ -116,7 +266,7 @@ class Workflow(BaseModel):
116
266
  """Return the absolute path of the current notebook."""
117
267
  # Get the kernel's connection file and extract the kernel ID
118
268
  try:
119
- if not (connection_file := Path(get_connection_file()).name):
269
+ if not (connection_file := Path(get_kernel_connection()).name):
120
270
  return None, None
121
271
  except RuntimeError:
122
272
  return None, None
@@ -124,7 +274,7 @@ class Workflow(BaseModel):
124
274
  kernel_id = connection_file.split("-", 1)[1].split(".")[0]
125
275
 
126
276
  # Look through the running server sessions to find the matching kernel ID
127
- for server in list_running_servers():
277
+ for server in get_servers():
128
278
  try:
129
279
  # If a token is required for authentication, include it in headers
130
280
  token = server.get("token", "")
@@ -158,29 +308,27 @@ class Workflow(BaseModel):
158
308
  if len(stack) > 1:
159
309
  filename = Path(stack[2].filename)
160
310
  if not filename.exists():
161
- # Begin called from command line interpreter.
311
+ # Being called from the command line interpreter.
162
312
  filename = Path("REPL")
163
313
  # Get the caller's filename, which is two up the stack from here.
164
314
  else:
165
- raise DerivaMLException(
166
- "Looking for caller failed"
167
- ) # Stack is too shallow
315
+ raise DerivaMLException("Looking for caller failed") # Stack is too shallow
168
316
  return filename, is_notebook
169
317
 
170
318
  @staticmethod
171
319
  def _github_url(executable_path: Path) -> tuple[str, bool]:
172
- """Return a GitHUB URL for the latest commit of the script from which this routine is called.
320
+ """Return a GitHub URL for the latest commit of the script from which this routine is called.
173
321
 
174
- This routine is used to be called from a script or notebook (e.g. python -m file). It assumes that
175
- the file is in a gitHUB repository and commited. It returns a URL to the last commited version of this
176
- file in GitHUB.
322
+ This routine is meant to be called from a script or notebook (e.g., python -m file). It assumes that
323
+ the file is in a GitHub repository and committed. It returns a URL to the last committed version of this
324
+ file in GitHub.
177
325
 
178
- Returns: A tuple with the gethub_url and a boolean to indicated if uncommited changes
326
+ Returns: A tuple with the github_url and a boolean to indicate if uncommitted changes
179
327
  have been made to the file.
180
328
 
181
329
  """
182
330
 
183
- # Get repo URL from local gitHub repo.
331
+ # Get repo URL from local GitHub repo.
184
332
  if executable_path == "REPL":
185
333
  return "REPL", True
186
334
  try:
@@ -197,7 +345,7 @@ class Workflow(BaseModel):
197
345
  # Find the root directory for the repository
198
346
  repo_root = Workflow._get_git_root(executable_path)
199
347
 
200
- # Now check to see if file has been modified since the last commit.
348
+ # Now check to see if a file has been modified since the last commit.
201
349
  try:
202
350
  result = subprocess.run(
203
351
  ["git", "status", "--porcelain"],
@@ -206,11 +354,9 @@ class Workflow(BaseModel):
206
354
  text=True,
207
355
  check=True,
208
356
  )
209
- is_dirty = bool(
210
- "M " in result.stdout.strip()
211
- ) # Returns True if output indicates a modified file
357
+ is_dirty = bool("M " in result.stdout.strip()) # Returns True if the output indicates a modified file
212
358
  except subprocess.CalledProcessError:
213
- is_dirty = False # If Git command fails, assume no changes
359
+ is_dirty = False # If the Git command fails, assume no changes
214
360
 
215
361
  """Get SHA-1 hash of latest commit of the file in the repository"""
216
362
  result = subprocess.run(
@@ -223,161 +369,3 @@ class Workflow(BaseModel):
223
369
  sha = result.stdout.strip()
224
370
  url = f"{github_url}/blob/{sha}/{executable_path.relative_to(repo_root)}"
225
371
  return url, is_dirty
226
-
227
- @staticmethod
228
- def _get_git_root(executable_path: Path):
229
- try:
230
- result = subprocess.run(
231
- ["git", "rev-parse", "--show-toplevel"],
232
- cwd=executable_path.parent,
233
- stdout=subprocess.PIPE,
234
- stderr=subprocess.DEVNULL,
235
- text=True,
236
- check=True,
237
- )
238
- return result.stdout.strip()
239
- except subprocess.CalledProcessError:
240
- return None # Not in a git repository
241
-
242
- @staticmethod
243
- def create_workflow(
244
- name: str,
245
- workflow_type: str,
246
- description: str = "",
247
- ) -> Workflow:
248
- """Identify current executing program and return a workflow RID for it
249
-
250
- Determine the notebook or script that is currently being executed. Assume that this is
251
- being executed from a cloned GitHub repository. Determine the remote repository name for
252
- this object. Then either retrieve an existing workflow for this executable or create
253
- a new one.
254
-
255
- Args:
256
- name: The name of the workflow.
257
- workflow_type: The type of the workflow.
258
- description: The description of the workflow.
259
- """
260
-
261
- # Check to see if execution file info is being passed in by calling program.
262
- if "DERIVA_ML_WORKFLOW_URL" in os.environ:
263
- github_url = os.environ["DERIVA_ML_WORKFLOW_URL"]
264
- checksum = os.environ["DERIVA_ML_WORKFLOW_CHECKSUM"]
265
- is_notebook = True
266
- else:
267
- path, is_notebook = Workflow._get_python_script()
268
- github_url, checksum = Workflow.get_url_and_checksum(path)
269
-
270
- return Workflow(
271
- name=name,
272
- url=github_url,
273
- checksum=checksum,
274
- description=description,
275
- workflow_type=workflow_type,
276
- is_notebook=is_notebook,
277
- )
278
-
279
- @staticmethod
280
- def get_url_and_checksum(executable_path: Path) -> tuple[str, str]:
281
- """Determine the checksum for a specified executable"""
282
- try:
283
- subprocess.run(
284
- "git rev-parse --is-inside-work-tree",
285
- capture_output=True,
286
- text=True,
287
- shell=True,
288
- check=True,
289
- )
290
- except subprocess.CalledProcessError:
291
- raise DerivaMLException("Not executing in a Git repository.")
292
-
293
- github_url, is_dirty = Workflow._github_url(executable_path)
294
-
295
- if is_dirty:
296
- logging.getLogger("deriva_ml").warning(
297
- f"File {executable_path} has been modified since last commit. Consider commiting before executing"
298
- )
299
-
300
- # If you are in a notebook, strip out the outputs before computing the checksum.
301
- cmd = (
302
- f"nbstripout -t {executable_path} | git hash-object --stdin"
303
- if "ipynb" == executable_path.suffix
304
- else f"git hash-object {executable_path}"
305
- )
306
- checksum = (
307
- subprocess.run(
308
- cmd,
309
- capture_output=True,
310
- text=True,
311
- check=False,
312
- shell=True,
313
- ).stdout.strip()
314
- if executable_path != "REPL"
315
- else "1"
316
- )
317
- return github_url, checksum
318
-
319
-
320
- class ExecutionConfiguration(BaseModel):
321
- """Define the parameters that are used to configure a specific execution.
322
-
323
- Attributes:
324
- datasets: List of dataset specifications which specify the dataset RID, version and if the dataset
325
- should be materialized.
326
- assets: List of assets to be downloaded prior to execution. The values must be RIDs in an asset table
327
- parameters: Either a dictionary or a path to a JSON file that contains configuration parameters for the execution.
328
- workflow: Either a Workflow object, or a RID for a workflow instance.
329
- parameters: Either a dictionary or a path to a JSON file that contains configuration parameters for the execution.
330
- description: A description of the execution. Can use Markdown format.
331
- """
332
-
333
- datasets: conlist(DatasetSpec) = []
334
- assets: list[RID | str] = [] # List of RIDs to model files.
335
- workflow: RID | Workflow
336
- parameters: dict[str, Any] | Path = {}
337
- description: str = ""
338
- argv: conlist(str) = Field(default_factory=lambda: sys.argv)
339
-
340
- model_config = ConfigDict(arbitrary_types_allowed=True)
341
-
342
- @field_validator("parameters", mode="before")
343
- @classmethod
344
- def validate_parameters(cls, value: Any) -> Any:
345
- """If a parameter is a file, assume that it has JSON contents for configuration parameters"""
346
- if isinstance(value, str) or isinstance(value, Path):
347
- with open(value, "r") as f:
348
- return json.load(f)
349
- else:
350
- return value
351
-
352
- @staticmethod
353
- def load_configuration(path: Path) -> ExecutionConfiguration:
354
- """Create a ExecutionConfiguration from a JSON configuration file.
355
-
356
- Args:
357
- path: File containing JSON version of execution configuration.
358
-
359
- Returns:
360
- An execution configuration whose values are loaded from the given file.
361
- """
362
- with open(path) as fd:
363
- config = json.load(fd)
364
- return ExecutionConfiguration.model_validate(config)
365
-
366
- # def download_execution_configuration(
367
- # self, configuration_rid: RID
368
- # ) -> ExecutionConfiguration:
369
- # """Create an ExecutionConfiguration object from a catalog RID that points to a JSON representation of that
370
- # configuration in hatrac
371
- #
372
- # Args:
373
- # configuration_rid: RID that should be to an asset table that refers to an execution configuration
374
- #
375
- # Returns:
376
- # A ExecutionConfiguration object for configured by the parameters in the configuration file.
377
- # """
378
- # AssertionError("Not Implemented")
379
- # configuration = self.retrieve_rid(configuration_rid)
380
- # with NamedTemporaryFile("w+", delete=False, suffix=".json") as dest_file:
381
- # hs = HatracStore("https", self.host_name, self.credential)
382
- # hs.get_obj(path=configuration["URL"], destfilename=dest_file.name)
383
- # return ExecutionConfiguration.load_configuration(Path(dest_file.name))