deriva-ml 1.17.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/.DS_Store +0 -0
- deriva_ml/__init__.py +79 -0
- deriva_ml/bump_version.py +142 -0
- deriva_ml/core/__init__.py +39 -0
- deriva_ml/core/base.py +1527 -0
- deriva_ml/core/config.py +69 -0
- deriva_ml/core/constants.py +36 -0
- deriva_ml/core/definitions.py +74 -0
- deriva_ml/core/enums.py +222 -0
- deriva_ml/core/ermrest.py +288 -0
- deriva_ml/core/exceptions.py +28 -0
- deriva_ml/core/filespec.py +116 -0
- deriva_ml/dataset/__init__.py +12 -0
- deriva_ml/dataset/aux_classes.py +225 -0
- deriva_ml/dataset/dataset.py +1519 -0
- deriva_ml/dataset/dataset_bag.py +450 -0
- deriva_ml/dataset/history.py +109 -0
- deriva_ml/dataset/upload.py +439 -0
- deriva_ml/demo_catalog.py +495 -0
- deriva_ml/execution/__init__.py +26 -0
- deriva_ml/execution/environment.py +290 -0
- deriva_ml/execution/execution.py +1180 -0
- deriva_ml/execution/execution_configuration.py +147 -0
- deriva_ml/execution/workflow.py +413 -0
- deriva_ml/feature.py +228 -0
- deriva_ml/install_kernel.py +71 -0
- deriva_ml/model/__init__.py +0 -0
- deriva_ml/model/catalog.py +485 -0
- deriva_ml/model/database.py +719 -0
- deriva_ml/protocols/dataset.py +19 -0
- deriva_ml/run_notebook.py +228 -0
- deriva_ml/schema/__init__.py +3 -0
- deriva_ml/schema/annotations.py +473 -0
- deriva_ml/schema/check_schema.py +104 -0
- deriva_ml/schema/create_schema.py +393 -0
- deriva_ml/schema/deriva-ml-reference.json +8525 -0
- deriva_ml/schema/policy.json +81 -0
- deriva_ml/schema/table_comments_utils.py +57 -0
- deriva_ml/test.py +94 -0
- deriva_ml-1.17.10.dist-info/METADATA +38 -0
- deriva_ml-1.17.10.dist-info/RECORD +45 -0
- deriva_ml-1.17.10.dist-info/WHEEL +5 -0
- deriva_ml-1.17.10.dist-info/entry_points.txt +9 -0
- deriva_ml-1.17.10.dist-info/licenses/LICENSE +201 -0
- deriva_ml-1.17.10.dist-info/top_level.txt +1 -0

deriva_ml/protocols/dataset.py
@@ -0,0 +1,19 @@
+"""A module defining the DatasetLike protocol for dataset operations.
+
+This module contains the definition of the DatasetLike protocol, which
+provides an interface for datasets to implement specific functionality related
+to listing dataset children. It is particularly useful for ensuring type
+compatibility for objects that mimic datasets in their behavior.
+
+Classes:
+    DatasetLike: A protocol that specifies methods required for dataset-like
+        objects.
+"""
+from typing import Protocol, runtime_checkable
+
+from deriva_ml.core.definitions import RID
+
+
+@runtime_checkable
+class DatasetLike(Protocol):
+    def list_dataset_children(self, dataset_rid: RID, recurse: bool = False) -> list[RID]: ...
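
Usage note (not part of the package): because DatasetLike is declared @runtime_checkable, any object with a matching list_dataset_children method satisfies the protocol structurally. The sketch below illustrates this with a hypothetical InMemoryDataset class, assuming RID values behave like plain string identifiers.

from deriva_ml.core.definitions import RID
from deriva_ml.protocols.dataset import DatasetLike


class InMemoryDataset:
    """Hypothetical stand-in that mimics a dataset for type-checking purposes."""

    def __init__(self, children: dict[RID, list[RID]]) -> None:
        self._children = children

    def list_dataset_children(self, dataset_rid: RID, recurse: bool = False) -> list[RID]:
        # Direct children, optionally expanded depth-first.
        result = list(self._children.get(dataset_rid, []))
        if recurse:
            for child in list(result):
                result.extend(self.list_dataset_children(child, recurse=True))
        return result


ds = InMemoryDataset({"1-ABC": ["1-DEF"], "1-DEF": ["1-GHI"]})
print(isinstance(ds, DatasetLike))                      # True (structural check)
print(ds.list_dataset_children("1-ABC", recurse=True))  # ['1-DEF', '1-GHI']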

deriva_ml/run_notebook.py
@@ -0,0 +1,228 @@
+"""Module to run a notebook using papermill"""
+
+import json
+import os
+import tempfile
+from pathlib import Path
+
+import nbformat
+import papermill as pm
+import yaml
+from deriva.core import BaseCLI
+from jupyter_client.kernelspec import KernelSpecManager
+from nbconvert import MarkdownExporter
+
+from deriva_ml import DerivaML, ExecAssetType, Execution, ExecutionConfiguration, MLAsset, Workflow
+
+
+class DerivaMLRunNotebookCLI(BaseCLI):
+    """Main class to parse command line arguments and run the notebook."""
+
+    def __init__(self, description, epilog, **kwargs):
+        BaseCLI.__init__(self, description, epilog, **kwargs)
+        Workflow._check_nbstrip_status()
+        self.parser.add_argument("notebook_file", type=Path, help="Path to the notebook file")
+
+        self.parser.add_argument(
+            "--file",
+            "-f",
+            type=Path,
+            default=None,
+            help="JSON or YAML file with parameter values to inject into the notebook.",
+        )
+
+        self.parser.add_argument(
+            "--inspect",
+            action="store_true",
+            help="Display parameter information for the given notebook path.",
+        )
+
+        self.parser.add_argument(
+            "--log-output",
+            action="store_true",
+            help="Display logging output from notebook.",
+        )
+
+        self.parser.add_argument(
+            "--parameter",
+            "-p",
+            nargs=2,
+            action="append",
+            metavar=("KEY", "VALUE"),
+            default=[],
+            help="Provide a parameter name and value to inject into the notebook.",
+        )
+
+        self.parser.add_argument(
+            "--kernel",
+            "-k",
+            type=str,
+            help="Name of the kernel to run.",
+            default=self._find_kernel_for_venv(),
+        )
+
+    @staticmethod
+    def _coerce_number(val: str):
+        """
+        Try to convert a string to int, then float; otherwise return str.
+        """
+        try:
+            return int(val)
+        except ValueError:
+            try:
+                return float(val)
+            except ValueError:
+                return val
+
+    def main(self):
+        """Parse arguments and set up execution environment."""
+        args = self.parse_cli()
+        notebook_file: Path = args.notebook_file
+        parameter_file = args.file
+
+        # args.parameter is a list of [KEY, VALUE] lists,
+        # e.g. [['timeout', '30'], ['name', 'Alice'], ...]
+        parameters = {key: self._coerce_number(val) for key, val in args.parameter}
+
+        if parameter_file:
+            with parameter_file.open("r") as f:
+                if parameter_file.suffix == ".json":
+                    parameters |= json.load(f)
+                elif parameter_file.suffix == ".yaml":
+                    parameters |= yaml.safe_load(f)
+                else:
+                    print("Parameter file must be a JSON or YAML file.")
+                    exit(1)
+
+        if not (notebook_file.is_file() and notebook_file.suffix == ".ipynb"):
+            print(f"Notebook file must be an .ipynb file: {notebook_file.name}.")
+            exit(1)
+
+        # Inspect the notebook to discover the parameters declared in its
+        # "parameters" cell, along with their default values.
+        notebook_parameters = pm.inspect_notebook(notebook_file)
+
+        if args.inspect:
+            for param, value in notebook_parameters.items():
+                print(f"{param}:{value['inferred_type_name']} (default {value['default']})")
+            return
+        else:
+            notebook_parameters = {k: v["default"] for k, v in notebook_parameters.items()} | parameters
+            self.run_notebook(notebook_file.resolve(), parameters, kernel=args.kernel, log=args.log_output)
+
+    @staticmethod
+    def _find_kernel_for_venv() -> str | None:
+        """
+        Return the name of an existing Jupyter kernel whose Python executable
+        belongs to the currently active virtual environment.
+
+        The virtual environment is taken from the VIRTUAL_ENV environment
+        variable rather than from an argument.
+
+        Returns
+        -------
+        str | None
+            The name of the matching kernel spec if one is found, or None if
+            no registered kernel points at the active virtual environment's
+            python executable.
+        """
+        venv = os.environ.get("VIRTUAL_ENV")
+        if not venv:
+            return None
+        venv_path = Path(venv).resolve()
+        ksm = KernelSpecManager()
+        for name, spec in ksm.get_all_specs().items():
+            kernel_json = spec.get("spec", {})
+            argv = kernel_json.get("argv", [])
+            # Check for the python executable path inside argv.
+            for arg in argv:
+                try:
+                    if Path(arg).resolve() == venv_path.joinpath("bin", "python").resolve():
+                        return name
+                except Exception:
+                    continue
+        return None
+
+    def run_notebook(self, notebook_file: Path, parameters, kernel=None, log=False):
+        url, checksum = Workflow.get_url_and_checksum(Path(notebook_file))
+        os.environ["DERIVA_ML_WORKFLOW_URL"] = url
+        os.environ["DERIVA_ML_WORKFLOW_CHECKSUM"] = checksum
+        os.environ["DERIVA_ML_NOTEBOOK_PATH"] = notebook_file.as_posix()
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            notebook_output = Path(tmpdirname) / Path(notebook_file).name
+            execution_rid_path = Path(tmpdirname) / "execution_rid.json"
+            os.environ["DERIVA_ML_SAVE_EXECUTION_RID"] = execution_rid_path.as_posix()
+            pm.execute_notebook(
+                input_path=notebook_file,
+                output_path=notebook_output,
+                parameters=parameters,
+                kernel_name=kernel,
+                log_output=log,
+            )
+            print(f"Notebook output saved to {notebook_output}")
+            with execution_rid_path.open("r") as f:
+                execution_config = json.load(f)
+
+            if not execution_config:
+                print("Execution RID not found.")
+                exit(1)
+
+            execution_rid = execution_config["execution_rid"]
+            hostname = execution_config["hostname"]
+            catalog_id = execution_config["catalog_id"]
+            workflow_rid = execution_config["workflow_rid"]
+            ml_instance = DerivaML(hostname=hostname, catalog_id=catalog_id, working_dir=tmpdirname)
+            workflow_rid = ml_instance.retrieve_rid(execution_config["execution_rid"])["Workflow"]
+
+            execution = Execution(
+                configuration=ExecutionConfiguration(workflow=workflow_rid),
+                ml_object=ml_instance,
+                reload=execution_rid,
+            )
+
+            # Generate a Markdown version of the output notebook.
+            notebook_output_md = notebook_output.with_suffix(".md")
+            with notebook_output.open() as f:
+                nb = nbformat.read(f, as_version=4)
+            # Convert to Markdown
+            exporter = MarkdownExporter()
+            (body, resources) = exporter.from_notebook_node(nb)
+
+            with notebook_output_md.open("w") as f:
+                f.write(body)
+            nb = nbformat.read(notebook_output, as_version=4)
+
+            execution.asset_file_path(
+                asset_name=MLAsset.execution_asset,
+                file_name=notebook_output,
+                asset_types=ExecAssetType.notebook_output,
+            )
+
+            execution.asset_file_path(
+                asset_name=MLAsset.execution_asset,
+                file_name=notebook_output_md,
+                asset_types=ExecAssetType.notebook_output,
+            )
+            execution.upload_execution_outputs()
+
+            print(ml_instance.cite(execution_rid))
+
+
+def main():
+    """Main entry point for the notebook runner CLI.
+
+    Creates and runs the DerivaMLRunNotebookCLI instance.
+
+    Returns:
+        None. Executes the CLI.
+    """
+    cli = DerivaMLRunNotebookCLI(description="Deriva ML Execution Script Demo", epilog="")
+    cli.main()
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except Exception as e:
+        print(e)
+        exit(1)
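
Usage note (not part of the package): the runner merges notebook parameters from repeated -p KEY VALUE flags with those loaded from an optional JSON or YAML --file, coercing numeric strings along the way. The sketch below mirrors that merge logic outside the CLI; the file name params.yaml and the parameter names are hypothetical.

import yaml
from pathlib import Path


def coerce_number(val: str):
    # Same int -> float -> str fallback as DerivaMLRunNotebookCLI._coerce_number.
    try:
        return int(val)
    except ValueError:
        try:
            return float(val)
        except ValueError:
            return val


# Values as they would arrive from repeated "-p KEY VALUE" flags.
cli_pairs = [["epochs", "25"], ["learning_rate", "0.001"], ["model_name", "resnet50"]]
parameters = {key: coerce_number(val) for key, val in cli_pairs}

# Values from "--file params.yaml"; on a key collision the file value wins,
# matching the "parameters |= yaml.safe_load(f)" merge order above.
params_file = Path("params.yaml")
if params_file.is_file():
    with params_file.open() as f:
        parameters |= yaml.safe_load(f)

print(parameters)  # e.g. {'epochs': 25, 'learning_rate': 0.001, 'model_name': 'resnet50'}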