deriva-ml 1.17.10__py3-none-any.whl → 1.17.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. deriva_ml/__init__.py +43 -1
  2. deriva_ml/asset/__init__.py +17 -0
  3. deriva_ml/asset/asset.py +357 -0
  4. deriva_ml/asset/aux_classes.py +100 -0
  5. deriva_ml/bump_version.py +254 -11
  6. deriva_ml/catalog/__init__.py +21 -0
  7. deriva_ml/catalog/clone.py +1199 -0
  8. deriva_ml/catalog/localize.py +426 -0
  9. deriva_ml/core/__init__.py +29 -0
  10. deriva_ml/core/base.py +817 -1067
  11. deriva_ml/core/config.py +169 -21
  12. deriva_ml/core/constants.py +120 -19
  13. deriva_ml/core/definitions.py +123 -13
  14. deriva_ml/core/enums.py +47 -73
  15. deriva_ml/core/ermrest.py +226 -193
  16. deriva_ml/core/exceptions.py +297 -14
  17. deriva_ml/core/filespec.py +99 -28
  18. deriva_ml/core/logging_config.py +225 -0
  19. deriva_ml/core/mixins/__init__.py +42 -0
  20. deriva_ml/core/mixins/annotation.py +915 -0
  21. deriva_ml/core/mixins/asset.py +384 -0
  22. deriva_ml/core/mixins/dataset.py +237 -0
  23. deriva_ml/core/mixins/execution.py +408 -0
  24. deriva_ml/core/mixins/feature.py +365 -0
  25. deriva_ml/core/mixins/file.py +263 -0
  26. deriva_ml/core/mixins/path_builder.py +145 -0
  27. deriva_ml/core/mixins/rid_resolution.py +204 -0
  28. deriva_ml/core/mixins/vocabulary.py +400 -0
  29. deriva_ml/core/mixins/workflow.py +322 -0
  30. deriva_ml/core/validation.py +389 -0
  31. deriva_ml/dataset/__init__.py +2 -1
  32. deriva_ml/dataset/aux_classes.py +20 -4
  33. deriva_ml/dataset/catalog_graph.py +575 -0
  34. deriva_ml/dataset/dataset.py +1242 -1008
  35. deriva_ml/dataset/dataset_bag.py +1311 -182
  36. deriva_ml/dataset/history.py +27 -14
  37. deriva_ml/dataset/upload.py +225 -38
  38. deriva_ml/demo_catalog.py +126 -110
  39. deriva_ml/execution/__init__.py +46 -2
  40. deriva_ml/execution/base_config.py +639 -0
  41. deriva_ml/execution/execution.py +543 -242
  42. deriva_ml/execution/execution_configuration.py +26 -11
  43. deriva_ml/execution/execution_record.py +592 -0
  44. deriva_ml/execution/find_caller.py +298 -0
  45. deriva_ml/execution/model_protocol.py +175 -0
  46. deriva_ml/execution/multirun_config.py +153 -0
  47. deriva_ml/execution/runner.py +595 -0
  48. deriva_ml/execution/workflow.py +223 -34
  49. deriva_ml/experiment/__init__.py +8 -0
  50. deriva_ml/experiment/experiment.py +411 -0
  51. deriva_ml/feature.py +6 -1
  52. deriva_ml/install_kernel.py +143 -6
  53. deriva_ml/interfaces.py +862 -0
  54. deriva_ml/model/__init__.py +99 -0
  55. deriva_ml/model/annotations.py +1278 -0
  56. deriva_ml/model/catalog.py +286 -60
  57. deriva_ml/model/database.py +144 -649
  58. deriva_ml/model/deriva_ml_database.py +308 -0
  59. deriva_ml/model/handles.py +14 -0
  60. deriva_ml/run_model.py +319 -0
  61. deriva_ml/run_notebook.py +507 -38
  62. deriva_ml/schema/__init__.py +18 -2
  63. deriva_ml/schema/annotations.py +62 -33
  64. deriva_ml/schema/create_schema.py +169 -69
  65. deriva_ml/schema/validation.py +601 -0
  66. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/METADATA +4 -4
  67. deriva_ml-1.17.11.dist-info/RECORD +77 -0
  68. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/WHEEL +1 -1
  69. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/entry_points.txt +1 -0
  70. deriva_ml/protocols/dataset.py +0 -19
  71. deriva_ml/test.py +0 -94
  72. deriva_ml-1.17.10.dist-info/RECORD +0 -45
  73. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/licenses/LICENSE +0 -0
  74. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/top_level.txt +0 -0
@@ -1,18 +1,23 @@
1
- import inspect
1
+ from __future__ import annotations
2
+
2
3
  import logging
3
4
  import os
4
5
  import subprocess
5
6
  import sys
6
7
  import warnings
7
8
  from pathlib import Path
8
- from typing import Any
9
+ from typing import TYPE_CHECKING, Any
9
10
 
10
11
  import requests
11
- from pydantic import BaseModel, PrivateAttr, model_validator
12
+ from pydantic import BaseModel, ConfigDict, PrivateAttr, model_validator
12
13
  from requests import RequestException
13
14
 
14
15
  from deriva_ml.core.definitions import RID
15
16
  from deriva_ml.core.exceptions import DerivaMLException
17
+ from deriva_ml.execution.find_caller import _get_calling_module
18
+
19
+ if TYPE_CHECKING:
20
+ from deriva_ml.interfaces import DerivaMLCatalog
16
21
 
17
22
  try:
18
23
  from IPython.core.getipython import get_ipython
@@ -57,26 +62,60 @@ class Workflow(BaseModel):
57
62
  a unique identifier, source code location, and type. Workflows are typically
58
63
  associated with Git repositories for version control.
59
64
 
65
+ When a Workflow is retrieved via ``lookup_workflow(rid)`` or ``lookup_workflow_by_url()``,
66
+ it is bound to a catalog and its ``description`` and ``workflow_type`` properties become
67
+ writable. Setting these properties will update the catalog record. If the catalog is
68
+ read-only (a snapshot), attempting to set them will raise a ``DerivaMLException``.
69
+
60
70
  Attributes:
61
71
  name (str): Human-readable name of the workflow.
62
72
  url (str): URI to the workflow source code (typically a GitHub URL).
63
73
  workflow_type (str): Type of workflow (must be a controlled vocabulary term).
74
+ When the workflow is bound to a writable catalog, setting this property
75
+ will update the catalog record. The new value must be a valid term from
76
+ the Workflow_Type vocabulary.
64
77
  version (str | None): Version identifier (semantic versioning).
65
78
  description (str | None): Description of workflow purpose and behavior.
79
+ When the workflow is bound to a writable catalog, setting this property
80
+ will update the catalog record.
66
81
  rid (RID | None): Resource Identifier if registered in catalog.
67
82
  checksum (str | None): Git hash of workflow source code.
68
83
  is_notebook (bool): Whether workflow is a Jupyter notebook.
69
84
 
70
85
  Example:
71
- >>> workflow = Workflow(
72
- ... name="RNA Analysis",
73
- ... url="https://github.com/org/repo/analysis.ipynb",
74
- ... workflow_type="python_notebook",
75
- ... version="1.0.0",
76
- ... description="RNA sequence analysis"
77
- ... )
86
+ Create a workflow programmatically::
87
+
88
+ >>> workflow = Workflow(
89
+ ... name="RNA Analysis",
90
+ ... url="https://github.com/org/repo/analysis.ipynb",
91
+ ... workflow_type="python_notebook",
92
+ ... version="1.0.0",
93
+ ... description="RNA sequence analysis"
94
+ ... )
95
+
96
+ Look up an existing workflow by RID and update its properties::
97
+
98
+ >>> workflow = ml.lookup_workflow("2-ABC1")
99
+ >>> workflow.description = "Updated description for RNA analysis"
100
+ >>> workflow.workflow_type = "python_script"
101
+ >>> print(workflow.description)
102
+ Updated description for RNA analysis
103
+
104
+ Look up by URL and update::
105
+
106
+ >>> url = "https://github.com/org/repo/blob/abc123/analysis.py"
107
+ >>> workflow = ml.lookup_workflow_by_url(url)
108
+ >>> workflow.description = "New description"
109
+
110
+ Attempting to update on a read-only catalog raises an error::
111
+
112
+ >>> snapshot_ml = ml.catalog_snapshot("2023-01-15T10:30:00")
113
+ >>> workflow = snapshot_ml.lookup_workflow("2-ABC1")
114
+ >>> workflow.description = "New description" # Raises DerivaMLException
78
115
  """
79
116
 
117
+ model_config = ConfigDict(arbitrary_types_allowed=True)
118
+
80
119
  name: str
81
120
  workflow_type: str
82
121
  description: str | None = None
@@ -87,8 +126,119 @@ class Workflow(BaseModel):
87
126
  is_notebook: bool = False
88
127
  git_root: Path | None = None
89
128
 
129
+ _ml_instance: "DerivaMLCatalog | None" = PrivateAttr(default=None)
90
130
  _logger: logging.Logger = PrivateAttr(default=10)
91
131
 
132
+ def __setattr__(self, name: str, value: Any) -> None:
133
+ """Override setattr to intercept description and workflow_type updates.
134
+
135
+ When the workflow is bound to a catalog (via lookup_workflow), setting
136
+ the ``description`` or ``workflow_type`` properties will update the catalog
137
+ record. If the catalog is read-only (a snapshot), a DerivaMLException is raised.
138
+
139
+ Args:
140
+ name: The attribute name being set.
141
+ value: The value to set.
142
+
143
+ Raises:
144
+ DerivaMLException: If attempting to set properties on a read-only
145
+ catalog (snapshot), or if workflow_type is not a valid vocabulary term.
146
+
147
+ Examples:
148
+ Update description::
149
+
150
+ >>> workflow = ml.lookup_workflow("2-ABC1")
151
+ >>> workflow.description = "Updated description"
152
+
153
+ Update workflow type::
154
+
155
+ >>> workflow = ml.lookup_workflow("2-ABC1")
156
+ >>> workflow.workflow_type = "python_notebook"
157
+ """
158
+ # Only intercept updates after full initialization
159
+ # Use __dict__ check to avoid recursion during Pydantic model construction
160
+ if (
161
+ "__pydantic_private__" in self.__dict__
162
+ and self.__dict__.get("__pydantic_private__", {}).get("_ml_instance") is not None
163
+ ):
164
+ if name == "description":
165
+ self._update_description_in_catalog(value)
166
+ elif name == "workflow_type":
167
+ self._update_workflow_type_in_catalog(value)
168
+ super().__setattr__(name, value)
169
+
170
+ def _check_writable_catalog(self, operation: str) -> None:
171
+ """Check that the catalog is writable and workflow is registered.
172
+
173
+ Args:
174
+ operation: Description of the operation being attempted.
175
+
176
+ Raises:
177
+ DerivaMLException: If the workflow is not registered (no RID),
178
+ or if the catalog is read-only (a snapshot).
179
+ """
180
+ # Import here to avoid circular dependency at module load
181
+ import importlib
182
+ _deriva_core = importlib.import_module("deriva.core")
183
+ ErmrestSnapshot = _deriva_core.ErmrestSnapshot
184
+
185
+ if self.rid is None:
186
+ raise DerivaMLException(
187
+ f"Cannot {operation}: Workflow is not registered in the catalog (no RID)"
188
+ )
189
+
190
+ if isinstance(self._ml_instance.catalog, ErmrestSnapshot):
191
+ raise DerivaMLException(
192
+ f"Cannot {operation} on a read-only catalog snapshot. "
193
+ "Use a writable catalog connection instead."
194
+ )
195
+
196
+ def _update_description_in_catalog(self, new_description: str | None) -> None:
197
+ """Update the description field in the catalog.
198
+
199
+ This internal method is called when the description property is set
200
+ on a catalog-bound Workflow object.
201
+
202
+ Args:
203
+ new_description: The new description value.
204
+
205
+ Raises:
206
+ DerivaMLException: If the workflow is not registered (no RID),
207
+ or if the catalog is read-only (a snapshot).
208
+ """
209
+ self._check_writable_catalog("update description")
210
+
211
+ # Update the catalog record
212
+ pb = self._ml_instance.pathBuilder()
213
+ workflow_path = pb.schemas[self._ml_instance.ml_schema].Workflow
214
+ workflow_path.update([{"RID": self.rid, "Description": new_description}])
215
+
216
+ def _update_workflow_type_in_catalog(self, new_workflow_type: str) -> None:
217
+ """Update the workflow_type field in the catalog.
218
+
219
+ This internal method is called when the workflow_type property is set
220
+ on a catalog-bound Workflow object. The new workflow type must be a valid
221
+ term from the Workflow_Type vocabulary.
222
+
223
+ Args:
224
+ new_workflow_type: The new workflow type (must be a valid vocabulary term).
225
+
226
+ Raises:
227
+ DerivaMLException: If the workflow is not registered (no RID),
228
+ the catalog is read-only (a snapshot), or the workflow_type
229
+ is not a valid vocabulary term.
230
+ """
231
+ self._check_writable_catalog("update workflow_type")
232
+
233
+ # Validate that the new workflow type exists in vocabulary
234
+ from deriva_ml.core.definitions import MLVocab
235
+ self._ml_instance.lookup_term(MLVocab.workflow_type, new_workflow_type)
236
+
237
+ # Update the catalog record
238
+ pb = self._ml_instance.pathBuilder()
239
+ workflow_path = pb.schemas[self._ml_instance.ml_schema].Workflow
240
+ workflow_path.update([{"RID": self.rid, "Workflow_Type": new_workflow_type}])
241
+
92
242
  @model_validator(mode="after")
93
243
  def setup_url_checksum(self) -> "Workflow":
94
244
  """Creates a workflow from the current execution context.
@@ -100,6 +250,13 @@ class Workflow(BaseModel):
100
250
  The behavior can be configured using environment variables:
101
251
  - DERIVA_ML_WORKFLOW_URL: Override the detected workflow URL
102
252
  - DERIVA_ML_WORKFLOW_CHECKSUM: Override the computed checksum
253
+ - DERIVAML_MCP_IN_DOCKER: Set to "true" to use Docker metadata instead of git
254
+
255
+ Docker environment variables (used when DERIVAML_MCP_IN_DOCKER=true):
256
+ - DERIVAML_MCP_VERSION: Semantic version of the Docker image
257
+ - DERIVAML_MCP_GIT_COMMIT: Git commit hash at build time
258
+ - DERIVAML_MCP_IMAGE_DIGEST: Docker image digest (unique identifier)
259
+ - DERIVAML_MCP_IMAGE_NAME: Docker image name (e.g., ghcr.io/org/repo)
103
260
 
104
261
  Args:
105
262
 
@@ -107,7 +264,7 @@ class Workflow(BaseModel):
107
264
  Workflow: New workflow instance with detected Git information.
108
265
 
109
266
  Raises:
110
- DerivaMLException: If not in a Git repository or detection fails.
267
+ DerivaMLException: If not in a Git repository or detection fails (non-Docker).
111
268
 
112
269
  Example:
113
270
  >>> workflow = Workflow.create_workflow(
@@ -116,22 +273,55 @@ class Workflow(BaseModel):
116
273
  ... description="Process sample data"
117
274
  ... )
118
275
  """
119
- """Initializes logging for the workflow."""
276
+ self._logger = logging.getLogger("deriva_ml")
120
277
 
121
- # Check to see if execution file info is being passed in by calling program.
278
+ # Check if running in Docker container (no git repo available)
279
+ if os.environ.get("DERIVAML_MCP_IN_DOCKER", "").lower() == "true":
280
+ # Use Docker image metadata for provenance
281
+ self.version = self.version or os.environ.get("DERIVAML_MCP_VERSION", "")
282
+
283
+ # Use image digest as checksum (unique identifier for the container)
284
+ # Fall back to git commit if digest not available
285
+ self.checksum = self.checksum or (
286
+ os.environ.get("DERIVAML_MCP_IMAGE_DIGEST", "")
287
+ or os.environ.get("DERIVAML_MCP_GIT_COMMIT", "")
288
+ )
289
+
290
+ # Build URL pointing to the Docker image or source repo
291
+ if not self.url:
292
+ image_name = os.environ.get(
293
+ "DERIVAML_MCP_IMAGE_NAME",
294
+ "ghcr.io/informatics-isi-edu/deriva-ml-mcp",
295
+ )
296
+ image_digest = os.environ.get("DERIVAML_MCP_IMAGE_DIGEST", "")
297
+ if image_digest:
298
+ # URL format: image@sha256:digest
299
+ self.url = f"{image_name}@{image_digest}"
300
+ else:
301
+ # Fall back to source repo with git commit
302
+ source_url = "https://github.com/informatics-isi-edu/deriva-ml-mcp"
303
+ git_commit = os.environ.get("DERIVAML_MCP_GIT_COMMIT", "")
304
+ self.url = f"{source_url}/commit/{git_commit}" if git_commit else source_url
305
+
306
+ return self
307
+
308
+ # Check to see if execution file info is being passed in by calling program (notebook runner)
122
309
  if "DERIVA_ML_WORKFLOW_URL" in os.environ:
123
310
  self.url = os.environ["DERIVA_ML_WORKFLOW_URL"]
124
- self.checksum = os.environ["DERIVA_ML_WORKFLOW_CHECKSUM"]
125
- self.git_root = Workflow._get_git_root(Path(os.environ["DERIVA_ML_NOTEBOOK_PATH"]))
311
+ self.checksum = os.environ.get("DERIVA_ML_WORKFLOW_CHECKSUM", "")
312
+ notebook_path = os.environ.get("DERIVA_ML_NOTEBOOK_PATH")
313
+ if notebook_path:
314
+ self.git_root = Workflow._get_git_root(Path(notebook_path))
126
315
  self.is_notebook = True
316
+ return self
127
317
 
318
+ # Standard git detection for local development
128
319
  if not self.url:
129
320
  path, self.is_notebook = Workflow._get_python_script()
130
321
  self.url, self.checksum = Workflow.get_url_and_checksum(path)
131
322
  self.git_root = Workflow._get_git_root(path)
132
323
 
133
324
  self.version = self.version or Workflow.get_dynamic_version(root=str(self.git_root or Path.cwd()))
134
- self._logger = logging.getLogger("deriva_ml")
135
325
  return self
136
326
 
137
327
  @staticmethod
@@ -260,7 +450,21 @@ class Workflow(BaseModel):
260
450
  except RuntimeError:
261
451
  return None, None
262
452
 
263
- kernel_id = connection_file.split("-", 1)[1].split(".")[0]
453
+ # Extract kernel ID from connection filename.
454
+ # Standard Jupyter format: "kernel-<kernel_id>.json"
455
+ # PyCharm/other formats may vary: "<kernel_id>.json" or other patterns
456
+ kernel_id = None
457
+ if connection_file.startswith("kernel-") and "-" in connection_file:
458
+ # Standard format: kernel-<uuid>.json
459
+ parts = connection_file.split("-", 1)
460
+ if len(parts) > 1:
461
+ kernel_id = parts[1].rsplit(".", 1)[0]
462
+ else:
463
+ # Fallback: assume filename (without extension) is the kernel ID
464
+ kernel_id = connection_file.rsplit(".", 1)[0]
465
+
466
+ if not kernel_id:
467
+ return None, None
264
468
 
265
469
  # Look through the running server sessions to find the matching kernel ID
266
470
  for server in get_servers():
@@ -310,23 +514,8 @@ class Workflow(BaseModel):
310
514
  @staticmethod
311
515
  def _get_python_script() -> tuple[Path, bool]:
312
516
  """Return the path to the currently executing script"""
313
- is_notebook = True
314
- if not (filename := Workflow._get_notebook_path()):
315
- is_notebook = False
316
- stack = [
317
- s.filename
318
- for s in inspect.stack()
319
- if ("pycharm" not in s.filename) and ("site-packages" not in s.filename)
320
- ]
321
- # Get the caller's filename, which is two up the stack from here.
322
- filename = Path(stack[-1])
323
- if not (filename.exists()) or Workflow._in_repl():
324
- # Being called from the command line interpreter.
325
- filename = Path.cwd() / Path("REPL")
326
- # Get the caller's filename, which is two up the stack from here.
327
- elif (not filename.exists()) and "PYTEST_CURRENT_TEST" in os.environ:
328
- filename = Path.cwd() / Path("pytest")
329
- return filename, is_notebook
517
+ is_notebook = Workflow._get_notebook_path() is not None
518
+ return Path(_get_calling_module()), is_notebook
330
519
 
331
520
  @staticmethod
332
521
  def _github_url(executable_path: Path) -> tuple[str, bool]:
@@ -0,0 +1,8 @@
1
+ """Experiment analysis for DerivaML.
2
+
3
+ This module provides the Experiment class for analyzing completed executions.
4
+ """
5
+
6
+ from deriva_ml.experiment.experiment import Experiment
7
+
8
+ __all__ = ["Experiment"]