deriva-ml 1.17.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. deriva_ml/.DS_Store +0 -0
  2. deriva_ml/__init__.py +79 -0
  3. deriva_ml/bump_version.py +142 -0
  4. deriva_ml/core/__init__.py +39 -0
  5. deriva_ml/core/base.py +1527 -0
  6. deriva_ml/core/config.py +69 -0
  7. deriva_ml/core/constants.py +36 -0
  8. deriva_ml/core/definitions.py +74 -0
  9. deriva_ml/core/enums.py +222 -0
  10. deriva_ml/core/ermrest.py +288 -0
  11. deriva_ml/core/exceptions.py +28 -0
  12. deriva_ml/core/filespec.py +116 -0
  13. deriva_ml/dataset/__init__.py +12 -0
  14. deriva_ml/dataset/aux_classes.py +225 -0
  15. deriva_ml/dataset/dataset.py +1519 -0
  16. deriva_ml/dataset/dataset_bag.py +450 -0
  17. deriva_ml/dataset/history.py +109 -0
  18. deriva_ml/dataset/upload.py +439 -0
  19. deriva_ml/demo_catalog.py +495 -0
  20. deriva_ml/execution/__init__.py +26 -0
  21. deriva_ml/execution/environment.py +290 -0
  22. deriva_ml/execution/execution.py +1180 -0
  23. deriva_ml/execution/execution_configuration.py +147 -0
  24. deriva_ml/execution/workflow.py +413 -0
  25. deriva_ml/feature.py +228 -0
  26. deriva_ml/install_kernel.py +71 -0
  27. deriva_ml/model/__init__.py +0 -0
  28. deriva_ml/model/catalog.py +485 -0
  29. deriva_ml/model/database.py +719 -0
  30. deriva_ml/protocols/dataset.py +19 -0
  31. deriva_ml/run_notebook.py +228 -0
  32. deriva_ml/schema/__init__.py +3 -0
  33. deriva_ml/schema/annotations.py +473 -0
  34. deriva_ml/schema/check_schema.py +104 -0
  35. deriva_ml/schema/create_schema.py +393 -0
  36. deriva_ml/schema/deriva-ml-reference.json +8525 -0
  37. deriva_ml/schema/policy.json +81 -0
  38. deriva_ml/schema/table_comments_utils.py +57 -0
  39. deriva_ml/test.py +94 -0
  40. deriva_ml-1.17.10.dist-info/METADATA +38 -0
  41. deriva_ml-1.17.10.dist-info/RECORD +45 -0
  42. deriva_ml-1.17.10.dist-info/WHEEL +5 -0
  43. deriva_ml-1.17.10.dist-info/entry_points.txt +9 -0
  44. deriva_ml-1.17.10.dist-info/licenses/LICENSE +201 -0
  45. deriva_ml-1.17.10.dist-info/top_level.txt +1 -0
@@ -0,0 +1,147 @@
1
+ """Configuration management for DerivaML executions.
2
+
3
+ This module provides functionality for configuring and managing execution parameters in DerivaML.
4
+ It includes:
5
+
6
+ - ExecutionConfiguration class: Core class for execution settings
7
+ - Parameter validation: Handles JSON and file-based parameters
8
+ - Dataset specifications: Manages dataset versions and materialization
9
+ - Asset management: Tracks required input files
10
+
11
+ The module supports both direct parameter specification and JSON-based configuration files.
12
+
13
+ Typical usage example:
14
+ >>> config = ExecutionConfiguration(
15
+ ... workflow="analysis_workflow",
16
+ ... datasets=[DatasetSpec(rid="1-abc123", version="1.0.0")],
17
+ ... parameters={"threshold": 0.5},
18
+ ... description="Process sample data"
19
+ ... )
20
+ >>> execution = ml.create_execution(config)
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import json
26
+ import sys
27
+ from dataclasses import dataclass
28
+ from pathlib import Path
29
+ from typing import Any
30
+
31
+ from hydra_zen import builds
32
+ from omegaconf import DictConfig
33
+ from pydantic import BaseModel, ConfigDict, Field, field_validator
34
+
35
+ from deriva_ml.core.definitions import RID
36
+ from deriva_ml.dataset.aux_classes import DatasetSpec
37
+ from deriva_ml.execution.workflow import Workflow
38
+
39
+
40
class ExecutionConfiguration(BaseModel):
    """Configuration for a DerivaML execution.

    Defines the complete configuration for a computational or manual process in
    DerivaML, including required datasets, input assets, and the workflow
    definition.

    Attributes:
        datasets (list[DatasetSpec]): Dataset specifications, each containing:
            - rid: Dataset Resource Identifier
            - version: Version to use
            - materialize: Whether to extract dataset contents
        assets (list[RID]): Resource Identifiers of required input assets.
        workflow (RID | Workflow | None): Workflow definition or its Resource
            Identifier.
        description (str): Description of execution purpose (supports Markdown).
        argv (list[str]): Command line arguments used to start the execution.

    Example:
        >>> config = ExecutionConfiguration(
        ...     workflow=Workflow.create_workflow("analysis", "python_script"),
        ...     datasets=[
        ...         DatasetSpec(rid="1-abc123", version="1.0.0", materialize=True)
        ...     ],
        ...     description="Process RNA sequence data"
        ... )
    """

    datasets: list[DatasetSpec] = []
    assets: list[RID] = []
    workflow: RID | Workflow | None = None
    description: str = ""
    # Capture the command line of the invoking process by default.
    argv: list[str] = Field(default_factory=lambda: sys.argv)

    model_config = ConfigDict(arbitrary_types_allowed=True)

    @field_validator("assets", mode="before")
    @classmethod
    def validate_assets(cls, value: Any) -> Any:
        """Normalize asset entries to plain RID strings.

        Entries may arrive as omegaconf DictConfig nodes or AssetRID instances;
        both carry the identifier in a ``rid`` attribute.
        """
        return [v.rid if isinstance(v, (DictConfig, AssetRID)) else v for v in value]

    @staticmethod
    def load_configuration(path: Path) -> ExecutionConfiguration:
        """Creates an ExecutionConfiguration from a JSON file.

        Loads and parses a JSON configuration file into an ExecutionConfiguration
        instance. The file should contain a valid configuration specification.

        Args:
            path: Path to JSON configuration file.

        Returns:
            ExecutionConfiguration: Loaded configuration instance.

        Raises:
            ValueError: If JSON file is invalid or missing required fields.
            FileNotFoundError: If configuration file doesn't exist.

        Example:
            >>> config = ExecutionConfiguration.load_configuration(Path("config.json"))
            >>> print(f"Workflow: {config.workflow}")
            >>> print(f"Datasets: {len(config.datasets)}")
        """
        with Path(path).open() as fd:
            config = json.load(fd)
        return ExecutionConfiguration.model_validate(config)
135
+
136
@dataclass(eq=False)
class AssetRID(str):
    """A RID string that carries an optional human-readable description.

    Subclasses ``str`` so an AssetRID can be used anywhere a plain RID string is
    expected, while exposing ``rid`` and ``description`` attributes.

    ``eq=False`` is required: the default dataclass-generated ``__eq__`` would
    compare ``(rid, description)`` and set ``__hash__ = None``, making instances
    of this ``str`` subclass unhashable (unusable as dict keys / set members)
    and breaking value equality between two AssetRIDs with differing
    descriptions. With ``eq=False`` the ``str`` equality and hash are inherited.
    """

    rid: str  # redundant copy of the string value, kept for attribute access
    description: str = ""

    def __new__(cls, rid: str, description: str = ""):
        # str is immutable, so the string value must be established in __new__;
        # the dataclass-generated __init__ then (re)assigns the attributes.
        obj = super().__new__(cls, rid)
        obj.description = description
        return obj
145
+
146
+
147
# Hydra-zen structured config for AssetRID; populate_full_signature exposes
# both the `rid` and `description` constructor parameters as config fields.
AssetRIDConfig = builds(AssetRID, populate_full_signature=True)
@@ -0,0 +1,413 @@
1
+ import inspect
2
+ import logging
3
+ import os
4
+ import subprocess
5
+ import sys
6
+ import warnings
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ import requests
11
+ from pydantic import BaseModel, PrivateAttr, model_validator
12
+ from requests import RequestException
13
+
14
+ from deriva_ml.core.definitions import RID
15
+ from deriva_ml.core.exceptions import DerivaMLException
16
+
17
# Optional IPython dependency: used to detect whether we are running inside an
# interactive IPython/Jupyter session.
try:
    from IPython.core.getipython import get_ipython
except ImportError:  # Graceful fallback if IPython isn't installed.

    def get_ipython() -> None:
        return None


# Optional jupyter_server dependency: used to enumerate running notebook
# servers when resolving the path of the currently executing notebook.
try:
    from jupyter_server.serverapp import list_running_servers

    def get_servers() -> list[Any]:
        return list(list_running_servers())
except ImportError:

    def list_running_servers():
        return []

    def get_servers() -> list[Any]:
        return list_running_servers()


# Optional ipykernel dependency: the kernel connection file name encodes the
# kernel ID, which identifies the notebook session.
try:
    from ipykernel.connect import get_connection_file

    def get_kernel_connection() -> str:
        return get_connection_file()
except ImportError:

    def get_connection_file():
        return ""

    def get_kernel_connection() -> str:
        return get_connection_file()
51
+
52
+
53
class Workflow(BaseModel):
    """Represents a computational workflow in DerivaML.

    A workflow defines a computational process or analysis pipeline. Each workflow has
    a unique identifier, source code location, and type. Workflows are typically
    associated with Git repositories for version control.

    Attributes:
        name (str): Human-readable name of the workflow.
        workflow_type (str): Type of workflow (must be a controlled vocabulary term).
        description (str | None): Description of workflow purpose and behavior.
        url (str | None): URI to the workflow source code (typically a GitHub URL).
        version (str | None): Version identifier (semantic versioning).
        rid (RID | None): Resource Identifier if registered in catalog.
        checksum (str | None): Git hash of workflow source code.
        is_notebook (bool): Whether workflow is a Jupyter notebook.
        git_root (Path | None): Root directory of the enclosing Git repository.

    Example:
        >>> workflow = Workflow(
        ...     name="RNA Analysis",
        ...     url="https://github.com/org/repo/analysis.ipynb",
        ...     workflow_type="python_notebook",
        ...     version="1.0.0",
        ...     description="RNA sequence analysis"
        ... )
    """

    name: str
    workflow_type: str
    description: str | None = None
    url: str | None = None
    version: str | None = None
    rid: RID | None = None
    checksum: str | None = None
    is_notebook: bool = False
    git_root: Path | None = None

    # BUG FIX: the previous default was PrivateAttr(default=10) — the int 10 is
    # not a Logger, so any logging call made before the model validator ran
    # would fail. Default to the package logger instead.
    _logger: logging.Logger = PrivateAttr(default_factory=lambda: logging.getLogger("deriva_ml"))
91
+
92
+ @model_validator(mode="after")
93
+ def setup_url_checksum(self) -> "Workflow":
94
+ """Creates a workflow from the current execution context.
95
+
96
+ Identifies the currently executing program (script or notebook) and creates
97
+ a workflow definition. Automatically determines the Git repository information
98
+ and source code checksum.
99
+
100
+ The behavior can be configured using environment variables:
101
+ - DERIVA_ML_WORKFLOW_URL: Override the detected workflow URL
102
+ - DERIVA_ML_WORKFLOW_CHECKSUM: Override the computed checksum
103
+
104
+ Args:
105
+
106
+ Returns:
107
+ Workflow: New workflow instance with detected Git information.
108
+
109
+ Raises:
110
+ DerivaMLException: If not in a Git repository or detection fails.
111
+
112
+ Example:
113
+ >>> workflow = Workflow.create_workflow(
114
+ ... name="Sample Analysis",
115
+ ... workflow_type="python_script",
116
+ ... description="Process sample data"
117
+ ... )
118
+ """
119
+ """Initializes logging for the workflow."""
120
+
121
+ # Check to see if execution file info is being passed in by calling program.
122
+ if "DERIVA_ML_WORKFLOW_URL" in os.environ:
123
+ self.url = os.environ["DERIVA_ML_WORKFLOW_URL"]
124
+ self.checksum = os.environ["DERIVA_ML_WORKFLOW_CHECKSUM"]
125
+ self.git_root = Workflow._get_git_root(Path(os.environ["DERIVA_ML_NOTEBOOK_PATH"]))
126
+ self.is_notebook = True
127
+
128
+ if not self.url:
129
+ path, self.is_notebook = Workflow._get_python_script()
130
+ self.url, self.checksum = Workflow.get_url_and_checksum(path)
131
+ self.git_root = Workflow._get_git_root(path)
132
+
133
+ self.version = self.version or Workflow.get_dynamic_version(root=str(self.git_root or Path.cwd()))
134
+ self._logger = logging.getLogger("deriva_ml")
135
+ return self
136
+
137
+ @staticmethod
138
+ def get_url_and_checksum(executable_path: Path) -> tuple[str, str]:
139
+ """Determines the Git URL and checksum for a file.
140
+
141
+ Computes the Git repository URL and file checksum for the specified path.
142
+ For notebooks, strips cell outputs before computing the checksum.
143
+
144
+ Args:
145
+ executable_path: Path to the workflow file.
146
+
147
+ Returns:
148
+ tuple[str, str]: (GitHub URL, Git object hash)
149
+
150
+ Raises:
151
+ DerivaMLException: If not in a Git repository.
152
+
153
+ Example:
154
+ >>> url, checksum = Workflow.get_url_and_checksum(Path("analysis.ipynb"))
155
+ >>> print(f"URL: {url}")
156
+ >>> print(f"Checksum: {checksum}")
157
+ """
158
+ try:
159
+ subprocess.run(
160
+ "git rev-parse --is-inside-work-tree",
161
+ capture_output=True,
162
+ text=True,
163
+ shell=True,
164
+ check=True,
165
+ )
166
+ except subprocess.CalledProcessError:
167
+ raise DerivaMLException("Not executing in a Git repository.")
168
+
169
+ github_url, is_dirty = Workflow._github_url(executable_path)
170
+
171
+ if is_dirty:
172
+ logging.getLogger("deriva_ml").warning(
173
+ f"File {executable_path} has been modified since last commit. Consider commiting before executing"
174
+ )
175
+
176
+ # If you are in a notebook, strip out the outputs before computing the checksum.
177
+ cmd = (
178
+ f"nbstripout -t {executable_path} | git hash-object --stdin"
179
+ if "ipynb" == executable_path.suffix
180
+ else f"git hash-object {executable_path}"
181
+ )
182
+ checksum = (
183
+ subprocess.run(
184
+ cmd,
185
+ capture_output=True,
186
+ text=True,
187
+ check=False,
188
+ shell=True,
189
+ ).stdout.strip()
190
+ if executable_path != "REPL"
191
+ else "1"
192
+ )
193
+ return github_url, checksum
194
+
195
+ @staticmethod
196
+ def _get_git_root(executable_path: Path) -> str | None:
197
+ """Gets the root directory of the Git repository.
198
+
199
+ Args:
200
+ executable_path: Path to check for Git repository.
201
+
202
+ Returns:
203
+ str | None: Absolute path to repository root, or None if not in repository.
204
+ """
205
+ try:
206
+ result = subprocess.run(
207
+ ["git", "rev-parse", "--show-toplevel"],
208
+ cwd=executable_path.parent,
209
+ stdout=subprocess.PIPE,
210
+ stderr=subprocess.DEVNULL,
211
+ text=True,
212
+ check=True,
213
+ )
214
+ return result.stdout.strip()
215
+ except subprocess.CalledProcessError:
216
+ return None # Not in a git repository
217
+
218
+ @staticmethod
219
+ def _check_nbstrip_status() -> None:
220
+ """Checks if nbstripout is installed and configured.
221
+
222
+ Verifies that the nbstripout tool is available and properly installed in the
223
+ Git repository. Issues warnings if setup is incomplete.
224
+ """
225
+ logger = logging.getLogger("deriva_ml")
226
+ try:
227
+ if subprocess.run(
228
+ ["nbstripout", "--is-installed"],
229
+ check=False,
230
+ capture_output=True,
231
+ ).returncode:
232
+ logger.warning("nbstripout is not installed in repository. Please run nbstripout --install")
233
+ except subprocess.CalledProcessError:
234
+ logger.error("nbstripout is not found.")
235
+
236
+ @staticmethod
237
+ def _get_notebook_path() -> Path | None:
238
+ """Gets the path of the currently executing notebook.
239
+
240
+ Returns:
241
+ Path | None: Absolute path to current notebook, or None if not in notebook.
242
+ """
243
+
244
+ server, session = Workflow._get_notebook_session()
245
+
246
+ if server and session:
247
+ relative_path = session["notebook"]["path"]
248
+ # Join the notebook directory with the relative path
249
+ return Path(server["root_dir"]) / relative_path
250
+ else:
251
+ return None
252
+
253
+ @staticmethod
254
+ def _get_notebook_session() -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
255
+ """Return the absolute path of the current notebook."""
256
+ # Get the kernel's connection file and extract the kernel ID
257
+ try:
258
+ if not (connection_file := Path(get_kernel_connection()).name):
259
+ return None, None
260
+ except RuntimeError:
261
+ return None, None
262
+
263
+ kernel_id = connection_file.split("-", 1)[1].split(".")[0]
264
+
265
+ # Look through the running server sessions to find the matching kernel ID
266
+ for server in get_servers():
267
+ try:
268
+ # If a token is required for authentication, include it in headers
269
+ token = server.get("token", "")
270
+ headers = {}
271
+ if token:
272
+ headers["Authorization"] = f"token {token}"
273
+
274
+ try:
275
+ sessions_url = server["url"] + "api/sessions"
276
+ response = requests.get(sessions_url, headers=headers)
277
+ response.raise_for_status()
278
+ sessions = response.json()
279
+ except RequestException as e:
280
+ raise e
281
+ for sess in sessions:
282
+ if sess["kernel"]["id"] == kernel_id:
283
+ return server, sess
284
+ except Exception as _e:
285
+ # Ignore servers we can't connect to.
286
+ pass
287
+ return None, None
288
+
289
+ @staticmethod
290
+ def _in_repl():
291
+ # Standard Python interactive mode
292
+ if hasattr(sys, "ps1"):
293
+ return True
294
+
295
+ # Interactive mode forced by -i
296
+ if sys.flags.interactive:
297
+ return True
298
+
299
+ # IPython / Jupyter detection
300
+ try:
301
+ from IPython import get_ipython
302
+
303
+ if get_ipython() is not None:
304
+ return True
305
+ except ImportError:
306
+ pass
307
+
308
+ return False
309
+
310
+ @staticmethod
311
+ def _get_python_script() -> tuple[Path, bool]:
312
+ """Return the path to the currently executing script"""
313
+ is_notebook = True
314
+ if not (filename := Workflow._get_notebook_path()):
315
+ is_notebook = False
316
+ stack = [
317
+ s.filename
318
+ for s in inspect.stack()
319
+ if ("pycharm" not in s.filename) and ("site-packages" not in s.filename)
320
+ ]
321
+ # Get the caller's filename, which is two up the stack from here.
322
+ filename = Path(stack[-1])
323
+ if not (filename.exists()) or Workflow._in_repl():
324
+ # Being called from the command line interpreter.
325
+ filename = Path.cwd() / Path("REPL")
326
+ # Get the caller's filename, which is two up the stack from here.
327
+ elif (not filename.exists()) and "PYTEST_CURRENT_TEST" in os.environ:
328
+ filename = Path.cwd() / Path("pytest")
329
+ return filename, is_notebook
330
+
331
+ @staticmethod
332
+ def _github_url(executable_path: Path) -> tuple[str, bool]:
333
+ """Return a GitHub URL for the latest commit of the script from which this routine is called.
334
+
335
+ This routine is used to be called from a script or notebook (e.g., python -m file). It assumes that
336
+ the file is in a GitHub repository and committed. It returns a URL to the last commited version of this
337
+ file in GitHub.
338
+
339
+ Returns: A tuple with the gethub_url and a boolean to indicate if uncommited changes
340
+ have been made to the file.
341
+
342
+ """
343
+
344
+ # Get repo URL from local GitHub repo.
345
+ if executable_path == "REPL":
346
+ return "REPL", True
347
+ try:
348
+ result = subprocess.run(
349
+ ["git", "remote", "get-url", "origin"],
350
+ capture_output=True,
351
+ text=True,
352
+ cwd=executable_path.parent,
353
+ )
354
+ github_url = result.stdout.strip().removesuffix(".git")
355
+ except subprocess.CalledProcessError:
356
+ raise DerivaMLException("No GIT remote found")
357
+
358
+ # Find the root directory for the repository
359
+ repo_root = Workflow._get_git_root(executable_path)
360
+
361
+ # Now check to see if a file has been modified since the last commit.
362
+ try:
363
+ result = subprocess.run(
364
+ ["git", "status", "--porcelain"],
365
+ cwd=executable_path.parent,
366
+ capture_output=True,
367
+ text=True,
368
+ check=False,
369
+ )
370
+ is_dirty = bool("M " in result.stdout.strip()) # Returns True if the output indicates a modified file
371
+ except subprocess.CalledProcessError:
372
+ is_dirty = False # If the Git command fails, assume no changes
373
+
374
+ """Get SHA-1 hash of latest commit of the file in the repository"""
375
+
376
+ result = subprocess.run(
377
+ ["git", "log", "-n", "1", "--pretty=format:%H", executable_path],
378
+ cwd=repo_root,
379
+ capture_output=True,
380
+ text=True,
381
+ check=False,
382
+ )
383
+ sha = result.stdout.strip()
384
+ url = f"{github_url}/blob/{sha}/{executable_path.relative_to(repo_root)}"
385
+ return url, is_dirty
386
+
387
+ @staticmethod
388
+ def get_dynamic_version(root: str | os.PathLike | None = None) -> str:
389
+ """
390
+ Return a dynamic version string based on VCS state (setuptools_scm),
391
+ including dirty/uncommitted changes if configured.
392
+
393
+ Works under uv / Python 3.10+ by forcing setuptools to use stdlib distutils.
394
+ """
395
+ # 1) Tell setuptools to use stdlib distutils (or no override) to avoid
396
+ # the '_distutils_hack' assertion you hit.
397
+ os.environ.setdefault("SETUPTOOLS_USE_DISTUTILS", "stdlib")
398
+
399
+ warnings.filterwarnings(
400
+ "ignore",
401
+ category=UserWarning,
402
+ module="_distutils_hack",
403
+ )
404
+ try:
405
+ from setuptools_scm import get_version
406
+ except Exception as e: # ImportError or anything environment-specific
407
+ raise RuntimeError(f"setuptools_scm is not available: {e}") from e
408
+
409
+ if root is None:
410
+ # Adjust this to point at your repo root if needed
411
+ root = Path(__file__).resolve().parents[1]
412
+
413
+ return get_version(root=root)