deriva-ml 1.17.9__py3-none-any.whl → 1.17.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/__init__.py +43 -1
- deriva_ml/asset/__init__.py +17 -0
- deriva_ml/asset/asset.py +357 -0
- deriva_ml/asset/aux_classes.py +100 -0
- deriva_ml/bump_version.py +254 -11
- deriva_ml/catalog/__init__.py +21 -0
- deriva_ml/catalog/clone.py +1199 -0
- deriva_ml/catalog/localize.py +426 -0
- deriva_ml/core/__init__.py +29 -0
- deriva_ml/core/base.py +817 -1067
- deriva_ml/core/config.py +169 -21
- deriva_ml/core/constants.py +120 -19
- deriva_ml/core/definitions.py +123 -13
- deriva_ml/core/enums.py +47 -73
- deriva_ml/core/ermrest.py +226 -193
- deriva_ml/core/exceptions.py +297 -14
- deriva_ml/core/filespec.py +99 -28
- deriva_ml/core/logging_config.py +225 -0
- deriva_ml/core/mixins/__init__.py +42 -0
- deriva_ml/core/mixins/annotation.py +915 -0
- deriva_ml/core/mixins/asset.py +384 -0
- deriva_ml/core/mixins/dataset.py +237 -0
- deriva_ml/core/mixins/execution.py +408 -0
- deriva_ml/core/mixins/feature.py +365 -0
- deriva_ml/core/mixins/file.py +263 -0
- deriva_ml/core/mixins/path_builder.py +145 -0
- deriva_ml/core/mixins/rid_resolution.py +204 -0
- deriva_ml/core/mixins/vocabulary.py +400 -0
- deriva_ml/core/mixins/workflow.py +322 -0
- deriva_ml/core/validation.py +389 -0
- deriva_ml/dataset/__init__.py +2 -1
- deriva_ml/dataset/aux_classes.py +20 -4
- deriva_ml/dataset/catalog_graph.py +575 -0
- deriva_ml/dataset/dataset.py +1242 -1008
- deriva_ml/dataset/dataset_bag.py +1311 -182
- deriva_ml/dataset/history.py +27 -14
- deriva_ml/dataset/upload.py +225 -38
- deriva_ml/demo_catalog.py +186 -105
- deriva_ml/execution/__init__.py +46 -2
- deriva_ml/execution/base_config.py +639 -0
- deriva_ml/execution/execution.py +545 -244
- deriva_ml/execution/execution_configuration.py +26 -11
- deriva_ml/execution/execution_record.py +592 -0
- deriva_ml/execution/find_caller.py +298 -0
- deriva_ml/execution/model_protocol.py +175 -0
- deriva_ml/execution/multirun_config.py +153 -0
- deriva_ml/execution/runner.py +595 -0
- deriva_ml/execution/workflow.py +224 -35
- deriva_ml/experiment/__init__.py +8 -0
- deriva_ml/experiment/experiment.py +411 -0
- deriva_ml/feature.py +6 -1
- deriva_ml/install_kernel.py +143 -6
- deriva_ml/interfaces.py +862 -0
- deriva_ml/model/__init__.py +99 -0
- deriva_ml/model/annotations.py +1278 -0
- deriva_ml/model/catalog.py +286 -60
- deriva_ml/model/database.py +144 -649
- deriva_ml/model/deriva_ml_database.py +308 -0
- deriva_ml/model/handles.py +14 -0
- deriva_ml/run_model.py +319 -0
- deriva_ml/run_notebook.py +507 -38
- deriva_ml/schema/__init__.py +18 -2
- deriva_ml/schema/annotations.py +62 -33
- deriva_ml/schema/create_schema.py +169 -69
- deriva_ml/schema/validation.py +601 -0
- {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/METADATA +4 -5
- deriva_ml-1.17.11.dist-info/RECORD +77 -0
- {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/WHEEL +1 -1
- {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/entry_points.txt +2 -0
- deriva_ml/protocols/dataset.py +0 -19
- deriva_ml/test.py +0 -94
- deriva_ml-1.17.9.dist-info/RECORD +0 -45
- {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/top_level.txt +0 -0
deriva_ml/run_notebook.py
CHANGED
|
@@ -1,7 +1,51 @@
|
|
|
1
|
-
"""
|
|
2
|
-
|
|
1
|
+
"""Command-line interface for executing Jupyter notebooks with DerivaML tracking.
|
|
2
|
+
|
|
3
|
+
This module provides a CLI tool for running Jupyter notebooks using papermill while
|
|
4
|
+
automatically tracking the execution in a Deriva catalog. It handles:
|
|
5
|
+
|
|
6
|
+
- Parameter injection into notebooks from command-line arguments or config files
|
|
7
|
+
- Automatic kernel detection for the current virtual environment
|
|
8
|
+
- Execution tracking with workflow provenance
|
|
9
|
+
- Conversion of executed notebooks to Markdown format
|
|
10
|
+
- Upload of notebook outputs as execution assets
|
|
11
|
+
|
|
12
|
+
The notebook being executed should use DerivaML's execution context to record
|
|
13
|
+
its workflow. When run through this CLI, environment variables are set to
|
|
14
|
+
communicate workflow metadata (URL, checksum, notebook path) to the notebook.
|
|
15
|
+
|
|
16
|
+
Environment Variables Set:
|
|
17
|
+
DERIVA_ML_WORKFLOW_URL: URL to the notebook source (e.g., GitHub URL)
|
|
18
|
+
DERIVA_ML_WORKFLOW_CHECKSUM: MD5 checksum of the notebook file
|
|
19
|
+
DERIVA_ML_NOTEBOOK_PATH: Local filesystem path to the notebook
|
|
20
|
+
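DERIVA_ML_SAVE_EXECUTION_RID: Path where notebook should save execution info
DERIVA_ML_HYDRA_OVERRIDES: JSON-encoded list of Hydra overrides (set only when overrides are given; otherwise removed from the environment)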
|
|
21
|
+
|
|
22
|
+
Usage:
|
|
23
|
+
deriva-ml-run-notebook notebook.ipynb --host example.org --catalog 1
|
|
24
|
+
deriva-ml-run-notebook notebook.ipynb -p param1 value1 -p param2 value2
|
|
25
|
+
deriva-ml-run-notebook notebook.ipynb --file parameters.yaml
|
|
26
|
+
deriva-ml-run-notebook notebook.ipynb --inspect # Show available parameters
|
|
27
|
+
deriva-ml-run-notebook notebook.ipynb assets=my_assets # Hydra overrides only
|
|
28
|
+
|
|
29
|
+
Example:
|
|
30
|
+
# Run a training notebook with explicit host/catalog
|
|
31
|
+
deriva-ml-run-notebook train_model.ipynb \\
|
|
32
|
+
--host deriva.example.org \\
|
|
33
|
+
--catalog 42 \\
|
|
34
|
+
-p learning_rate 0.001 \\
|
|
35
|
+
--kernel my_ml_env
|
|
36
|
+
|
|
37
|
+
# Run using Hydra config defaults (no --host/--catalog needed)
|
|
38
|
+
deriva-ml-run-notebook analysis.ipynb assets=roc_comparison_probabilities
|
|
39
|
+
|
|
40
|
+
See Also:
|
|
41
|
+
- install_kernel: Module for installing Jupyter kernels for virtual environments
|
|
42
|
+
- Workflow: Class that handles workflow registration and Git integration
|
|
43
|
+
"""
|
|
44
|
+
|
|
45
|
+
import base64
|
|
3
46
|
import json
|
|
4
47
|
import os
|
|
48
|
+
import re
|
|
5
49
|
import tempfile
|
|
6
50
|
from pathlib import Path
|
|
7
51
|
|
|
@@ -12,14 +56,191 @@ from deriva.core import BaseCLI
|
|
|
12
56
|
from jupyter_client.kernelspec import KernelSpecManager
|
|
13
57
|
from nbconvert import MarkdownExporter
|
|
14
58
|
|
|
15
|
-
from deriva_ml import DerivaML, ExecAssetType,
|
|
59
|
+
from deriva_ml import DerivaML, ExecAssetType, MLAsset
|
|
60
|
+
from deriva_ml.execution import Execution, ExecutionConfiguration, Workflow
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _html_table_to_markdown(html: str) -> str | None:
|
|
64
|
+
"""Convert an HTML DataFrame table to markdown format.
|
|
65
|
+
|
|
66
|
+
Parses HTML table elements and converts them to a properly formatted
|
|
67
|
+
markdown table with headers and alignment.
|
|
68
|
+
|
|
69
|
+
Args:
|
|
70
|
+
html: HTML string potentially containing a DataFrame table.
|
|
71
|
+
|
|
72
|
+
Returns:
|
|
73
|
+
Markdown table string if an HTML table was found, None otherwise.
|
|
74
|
+
"""
|
|
75
|
+
# Check if this looks like a pandas DataFrame HTML output
|
|
76
|
+
if '<table' not in html or 'dataframe' not in html:
|
|
77
|
+
return None
|
|
78
|
+
|
|
79
|
+
try:
|
|
80
|
+
# Extract table content using regex (avoid heavy dependency on BeautifulSoup)
|
|
81
|
+
thead_match = re.search(r'<thead>(.*?)</thead>', html, re.DOTALL)
|
|
82
|
+
tbody_match = re.search(r'<tbody>(.*?)</tbody>', html, re.DOTALL)
|
|
83
|
+
|
|
84
|
+
if not thead_match or not tbody_match:
|
|
85
|
+
return None
|
|
86
|
+
|
|
87
|
+
thead = thead_match.group(1)
|
|
88
|
+
tbody = tbody_match.group(1)
|
|
89
|
+
|
|
90
|
+
# Extract header row(s)
|
|
91
|
+
header_rows = re.findall(r'<tr[^>]*>(.*?)</tr>', thead, re.DOTALL)
|
|
92
|
+
if not header_rows:
|
|
93
|
+
return None
|
|
94
|
+
|
|
95
|
+
# For pandas DataFrames with named index:
|
|
96
|
+
# - First row contains: empty <th> + column names
|
|
97
|
+
# - Second row (if exists) contains: index name + empty <th>s
|
|
98
|
+
# We need to use the first row for column names and second row for index name
|
|
99
|
+
|
|
100
|
+
first_row = header_rows[0]
|
|
101
|
+
first_headers = re.findall(r'<th[^>]*>(.*?)</th>', first_row, re.DOTALL)
|
|
102
|
+
first_headers = [re.sub(r'<[^>]+>', '', h).strip() for h in first_headers]
|
|
103
|
+
|
|
104
|
+
# Check if there's a second header row with an index name
|
|
105
|
+
index_name = ""
|
|
106
|
+
if len(header_rows) > 1:
|
|
107
|
+
second_row = header_rows[1]
|
|
108
|
+
second_headers = re.findall(r'<th[^>]*>(.*?)</th>', second_row, re.DOTALL)
|
|
109
|
+
second_headers = [re.sub(r'<[^>]+>', '', h).strip() for h in second_headers]
|
|
110
|
+
# The index name is typically in the first cell of the second row
|
|
111
|
+
if second_headers and second_headers[0]:
|
|
112
|
+
index_name = second_headers[0]
|
|
113
|
+
|
|
114
|
+
# Build final headers: use index name for first column if available
|
|
115
|
+
headers = first_headers.copy()
|
|
116
|
+
if headers and not headers[0] and index_name:
|
|
117
|
+
headers[0] = index_name
|
|
118
|
+
|
|
119
|
+
# Extract body rows
|
|
120
|
+
body_rows = re.findall(r'<tr[^>]*>(.*?)</tr>', tbody, re.DOTALL)
|
|
121
|
+
|
|
122
|
+
rows = []
|
|
123
|
+
for row_html in body_rows:
|
|
124
|
+
# Get both th (index) and td (data) cells
|
|
125
|
+
cells = re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', row_html, re.DOTALL)
|
|
126
|
+
cells = [re.sub(r'<[^>]+>', '', c).strip() for c in cells]
|
|
127
|
+
rows.append(cells)
|
|
128
|
+
|
|
129
|
+
if not headers or not rows:
|
|
130
|
+
return None
|
|
131
|
+
|
|
132
|
+
# Build markdown table
|
|
133
|
+
# Determine column widths for alignment
|
|
134
|
+
col_widths = [len(h) for h in headers]
|
|
135
|
+
for row in rows:
|
|
136
|
+
for i, cell in enumerate(row):
|
|
137
|
+
if i < len(col_widths):
|
|
138
|
+
col_widths[i] = max(col_widths[i], len(cell))
|
|
139
|
+
|
|
140
|
+
# Format header
|
|
141
|
+
header_line = '| ' + ' | '.join(h.ljust(col_widths[i]) for i, h in enumerate(headers)) + ' |'
|
|
142
|
+
separator = '|' + '|'.join('-' * (w + 2) for w in col_widths) + '|'
|
|
143
|
+
|
|
144
|
+
# Format rows
|
|
145
|
+
formatted_rows = []
|
|
146
|
+
for row in rows:
|
|
147
|
+
# Pad row if needed
|
|
148
|
+
padded = row + [''] * (len(headers) - len(row))
|
|
149
|
+
formatted = '| ' + ' | '.join(
|
|
150
|
+
padded[i].ljust(col_widths[i]) if i < len(col_widths) else padded[i]
|
|
151
|
+
for i in range(len(headers))
|
|
152
|
+
) + ' |'
|
|
153
|
+
formatted_rows.append(formatted)
|
|
154
|
+
|
|
155
|
+
return '\n'.join([header_line, separator] + formatted_rows)
|
|
156
|
+
|
|
157
|
+
except Exception:
|
|
158
|
+
# If parsing fails, return None to use default behavior
|
|
159
|
+
return None
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _convert_dataframe_outputs(nb: nbformat.NotebookNode) -> nbformat.NotebookNode:
    """Replace DataFrame HTML outputs of code cells with markdown tables.

    For every code cell, each ``display_data`` or ``execute_result`` output
    whose ``text/html`` payload looks like a DataFrame table is swapped for a
    copy whose ``data`` holds only a ``text/plain`` markdown table. Outputs
    that cannot be converted are kept as they are. The notebook is modified
    in place.

    Args:
        nb: The notebook node to process.

    Returns:
        The same notebook node, with converted outputs.
    """
    convertible_types = ('display_data', 'execute_result')

    def _as_markdown(output):
        # Anything that is not rich display output passes through untouched.
        if output.get('output_type') not in convertible_types:
            return output

        html = output.get('data', {}).get('text/html', '')
        if not (html and '<table' in html and 'dataframe' in html):
            return output

        md_table = _html_table_to_markdown(html)
        if not md_table:
            return output

        # Keep the original output type and metadata; only the data changes.
        replacement = output.copy()
        replacement['data'] = {'text/plain': md_table}
        return replacement

    for cell in nb.cells:
        if cell.cell_type == 'code':
            cell['outputs'] = [_as_markdown(out) for out in cell.get('outputs', [])]

    return nb
|
|
16
199
|
|
|
17
200
|
|
|
18
201
|
class DerivaMLRunNotebookCLI(BaseCLI):
|
|
19
|
-
"""
|
|
202
|
+
"""Command-line interface for running Jupyter notebooks with DerivaML execution tracking.
|
|
203
|
+
|
|
204
|
+
This CLI extends Deriva's BaseCLI to provide notebook execution capabilities using
|
|
205
|
+
papermill. It automatically detects the appropriate Jupyter kernel for the current
|
|
206
|
+
virtual environment and handles parameter injection from multiple sources.
|
|
207
|
+
|
|
208
|
+
The CLI supports:
|
|
209
|
+
- Positional notebook file argument
|
|
210
|
+
- Parameter injection via -p/--parameter flags (multiple allowed)
|
|
211
|
+
- Parameter injection via JSON or YAML configuration files
|
|
212
|
+
- Automatic kernel detection for the active virtual environment
|
|
213
|
+
- Inspection mode to display available notebook parameters
|
|
214
|
+
- Logging output from notebook execution
|
|
215
|
+
|
|
216
|
+
Attributes:
|
|
217
|
+
parser: ArgumentParser instance with configured arguments.
|
|
218
|
+
|
|
219
|
+
Example:
|
|
220
|
+
>>> cli = DerivaMLRunNotebookCLI(
|
|
221
|
+
... description="Run ML notebook",
|
|
222
|
+
... epilog="See documentation for more details"
|
|
223
|
+
... )
|
|
224
|
+
>>> cli.main() # Parses args and runs notebook
|
|
225
|
+
"""
|
|
20
226
|
|
|
21
|
-
def __init__(self, description, epilog, **kwargs):
|
|
227
|
+
def __init__(self, description: str, epilog: str, **kwargs) -> None:
|
|
228
|
+
"""Initialize the notebook runner CLI with command-line arguments.
|
|
229
|
+
|
|
230
|
+
Sets up argument parsing for notebook execution, including the notebook file
|
|
231
|
+
path, parameter injection options, kernel selection, and inspection mode.
|
|
232
|
+
|
|
233
|
+
Args:
|
|
234
|
+
description: Description text shown in --help output.
|
|
235
|
+
epilog: Additional text shown after argument help.
|
|
236
|
+
**kwargs: Additional keyword arguments passed to BaseCLI.
|
|
237
|
+
|
|
238
|
+
Note:
|
|
239
|
+
Calls Workflow._check_nbstrip_status() to verify nbstripout is configured,
|
|
240
|
+
which helps ensure notebooks are properly cleaned before Git commits.
|
|
241
|
+
"""
|
|
22
242
|
BaseCLI.__init__(self, description, epilog, **kwargs)
|
|
243
|
+
# Verify nbstripout is configured for clean notebook version control
|
|
23
244
|
Workflow._check_nbstrip_status()
|
|
24
245
|
self.parser.add_argument("notebook_file", type=Path, help="Path to the notebook file")
|
|
25
246
|
|
|
@@ -31,12 +252,25 @@ class DerivaMLRunNotebookCLI(BaseCLI):
|
|
|
31
252
|
help="JSON or YAML file with parameter values to inject into the notebook.",
|
|
32
253
|
)
|
|
33
254
|
|
|
255
|
+
self.parser.add_argument(
|
|
256
|
+
"--catalog",
|
|
257
|
+
type=str,
|
|
258
|
+
default=None,
|
|
259
|
+
help="Catalog number or identifier (optional if defined in Hydra config)"
|
|
260
|
+
)
|
|
261
|
+
|
|
34
262
|
self.parser.add_argument(
|
|
35
263
|
"--inspect",
|
|
36
264
|
action="store_true",
|
|
37
265
|
help="Display parameters information for the given notebook path.",
|
|
38
266
|
)
|
|
39
267
|
|
|
268
|
+
self.parser.add_argument(
|
|
269
|
+
"--info",
|
|
270
|
+
action="store_true",
|
|
271
|
+
help="Display available Hydra configuration groups and options.",
|
|
272
|
+
)
|
|
273
|
+
|
|
40
274
|
self.parser.add_argument(
|
|
41
275
|
"--log-output",
|
|
42
276
|
action="store_true",
|
|
@@ -61,10 +295,37 @@ class DerivaMLRunNotebookCLI(BaseCLI):
|
|
|
61
295
|
default=self._find_kernel_for_venv(),
|
|
62
296
|
)
|
|
63
297
|
|
|
298
|
+
self.parser.add_argument(
|
|
299
|
+
"hydra_overrides",
|
|
300
|
+
nargs="*",
|
|
301
|
+
help="Hydra-zen configuration overrides (e.g., assets=roc_quick_probabilities)",
|
|
302
|
+
)
|
|
303
|
+
|
|
64
304
|
@staticmethod
|
|
65
|
-
def _coerce_number(val: str):
|
|
66
|
-
"""
|
|
67
|
-
|
|
305
|
+
def _coerce_number(val: str) -> int | float | str:
|
|
306
|
+
"""Convert a string value to the most appropriate numeric type.
|
|
307
|
+
|
|
308
|
+
Attempts to parse the string as an integer first, then as a float.
|
|
309
|
+
If neither succeeds, returns the original string unchanged.
|
|
310
|
+
|
|
311
|
+
This is used to convert command-line parameter values (which are always
|
|
312
|
+
strings) to appropriate Python types for notebook parameter injection.
|
|
313
|
+
|
|
314
|
+
Args:
|
|
315
|
+
val: String value to convert.
|
|
316
|
+
|
|
317
|
+
Returns:
|
|
318
|
+
The value as int if it's a valid integer string,
|
|
319
|
+
as float if it's a valid float string,
|
|
320
|
+
or the original string if neither conversion succeeds.
|
|
321
|
+
|
|
322
|
+
Examples:
|
|
323
|
+
>>> DerivaMLRunNotebookCLI._coerce_number("42")
|
|
324
|
+
42
|
|
325
|
+
>>> DerivaMLRunNotebookCLI._coerce_number("3.14")
|
|
326
|
+
3.14
|
|
327
|
+
>>> DerivaMLRunNotebookCLI._coerce_number("hello")
|
|
328
|
+
'hello'
|
|
68
329
|
"""
|
|
69
330
|
try:
|
|
70
331
|
return int(val)
|
|
@@ -74,16 +335,41 @@ class DerivaMLRunNotebookCLI(BaseCLI):
|
|
|
74
335
|
except ValueError:
|
|
75
336
|
return val
|
|
76
337
|
|
|
77
|
-
def main(self):
|
|
78
|
-
"""Parse arguments and
|
|
338
|
+
def main(self) -> None:
|
|
339
|
+
"""Parse command-line arguments and execute the notebook.
|
|
340
|
+
|
|
341
|
+
This is the main entry point that orchestrates:
|
|
342
|
+
1. Parsing command-line arguments
|
|
343
|
+
2. Loading parameters from file if specified
|
|
344
|
+
3. Validating the notebook file
|
|
345
|
+
4. Either inspecting notebook parameters or executing the notebook
|
|
346
|
+
|
|
347
|
+
The method merges parameters from multiple sources with the following
|
|
348
|
+
precedence (later sources override earlier):
|
|
349
|
+
1. Notebook default values
|
|
350
|
+
2. Parameters from --file (JSON/YAML)
|
|
351
|
+
3. Parameters from -p/--parameter flags
|
|
352
|
+
4. Host and catalog from CLI arguments
|
|
353
|
+
|
|
354
|
+
Raises:
|
|
355
|
+
SystemExit: If parameter file has invalid extension or notebook file
|
|
356
|
+
is invalid.
|
|
357
|
+
"""
|
|
79
358
|
args = self.parse_cli()
|
|
80
359
|
notebook_file: Path = args.notebook_file
|
|
81
360
|
parameter_file = args.file
|
|
82
361
|
|
|
83
|
-
#
|
|
84
|
-
# e.g. [['timeout', '30'],
|
|
362
|
+
# Build parameters dict from command-line -p/--parameter flags
|
|
363
|
+
# args.parameter is a list of [KEY, VALUE] lists, e.g. [['timeout', '30'], ...]
|
|
85
364
|
parameters = {key: self._coerce_number(val) for key, val in args.parameter}
|
|
86
|
-
|
|
365
|
+
# Inject host and catalog if provided on command line
|
|
366
|
+
# If not provided, the notebook will use values from Hydra config
|
|
367
|
+
if args.host:
|
|
368
|
+
parameters['host'] = args.host
|
|
369
|
+
if args.catalog:
|
|
370
|
+
parameters['catalog'] = args.catalog
|
|
371
|
+
|
|
372
|
+
# Merge parameters from configuration file if provided
|
|
87
373
|
if parameter_file:
|
|
88
374
|
with parameter_file.open("r") as f:
|
|
89
375
|
if parameter_file.suffix == ".json":
|
|
@@ -94,37 +380,135 @@ class DerivaMLRunNotebookCLI(BaseCLI):
|
|
|
94
380
|
print("Parameter file must be an json or YAML file.")
|
|
95
381
|
exit(1)
|
|
96
382
|
|
|
383
|
+
# Validate notebook file exists and has correct extension
|
|
97
384
|
if not (notebook_file.is_file() and notebook_file.suffix == ".ipynb"):
|
|
98
385
|
print(f"Notebook file must be an ipynb file: {notebook_file.name}.")
|
|
99
386
|
exit(1)
|
|
100
387
|
|
|
101
|
-
#
|
|
102
|
-
# Return an existing workflow if one is found.
|
|
388
|
+
# Use papermill to inspect notebook for parameter cell metadata
|
|
103
389
|
notebook_parameters = pm.inspect_notebook(notebook_file)
|
|
104
390
|
|
|
105
391
|
if args.inspect:
|
|
392
|
+
# Display parameter info and exit without executing
|
|
106
393
|
for param, value in notebook_parameters.items():
|
|
107
394
|
print(f"{param}:{value['inferred_type_name']} (default {value['default']})")
|
|
108
395
|
return
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
396
|
+
|
|
397
|
+
if args.info:
|
|
398
|
+
# Display available Hydra configuration options
|
|
399
|
+
self._show_hydra_info(notebook_file)
|
|
400
|
+
return
|
|
401
|
+
|
|
402
|
+
# Merge notebook defaults with provided parameters and execute
|
|
403
|
+
notebook_parameters = {k: v["default"] for k, v in notebook_parameters.items()} | parameters
|
|
404
|
+
self.run_notebook(
|
|
405
|
+
notebook_file.resolve(),
|
|
406
|
+
parameters,
|
|
407
|
+
kernel=args.kernel,
|
|
408
|
+
log=args.log_output,
|
|
409
|
+
hydra_overrides=args.hydra_overrides,
|
|
410
|
+
)
|
|
112
411
|
|
|
113
412
|
@staticmethod
def _show_hydra_info(notebook_file: Path) -> None:
    """Print the Hydra configuration groups registered by the project.

    Puts ``<notebook dir>/../src`` on ``sys.path`` when that directory exists,
    loads the project's ``configs`` module, then lists every group found in
    the hydra-zen store together with its option names.

    Args:
        notebook_file: Path to the notebook file (used to find the project root).
    """
    import sys

    from hydra_zen import store

    # Notebooks are assumed to live one level below the project root.
    src_dir = notebook_file.parent.resolve().parent / "src"
    if src_dir.exists():
        sys.path.insert(0, str(src_dir))

    # Prefer the load_configs API; fall back to the project's own loader.
    try:
        from deriva_ml.execution import load_configs
        if not load_configs("configs"):
            from configs import load_all_configs
            load_all_configs()
    except ImportError:
        print("Could not import configs module. Make sure src/configs/__init__.py exists.")
        print("Available Hydra groups cannot be determined without loading the config module.")
        return

    print("Available Hydra Configuration Groups:")
    print("=" * 50)

    try:
        # store._queue yields (group, name) pairs; entries without a group
        # are collected under the "__root__" key.
        options_by_group: dict[str, set[str]] = {}
        for group, name in store._queue:
            options_by_group.setdefault(group or "__root__", set()).add(name)

        for group in sorted(options_by_group):
            heading = "\nTop-level configs:" if group == "__root__" else f"\n{group}:"
            print(heading)
            for name in sorted(options_by_group[group]):
                print(f" - {name}")

        print("\n" + "=" * 50)
        print("Usage: deriva-ml-run-notebook notebook.ipynb [options] <group>=<option>")
        print("Example: deriva-ml-run-notebook notebook.ipynb --host localhost assets=roc_quick_probabilities")

    except Exception as e:
        print(f"Error inspecting Hydra store: {e}")
        print("Try running with --help for basic usage information.")
|
|
485
|
+
|
|
486
|
+
@staticmethod
|
|
487
|
+
def _find_kernel_for_venv() -> str | None:
|
|
488
|
+
"""Find a Jupyter kernel that matches the current virtual environment.
|
|
489
|
+
|
|
490
|
+
Searches through all installed Jupyter kernels to find one whose Python
|
|
491
|
+
executable path matches the VIRTUAL_ENV environment variable. This allows
|
|
492
|
+
automatic kernel selection when running notebooks from within an activated
|
|
493
|
+
virtual environment.
|
|
494
|
+
|
|
495
|
+
The method examines each kernel's argv configuration to find the Python
|
|
496
|
+
executable path and compares it to the expected location within the
|
|
497
|
+
virtual environment (venv_path/bin/python).
|
|
498
|
+
|
|
499
|
+
Returns:
|
|
500
|
+
The kernel name (str) if a matching kernel is found, or None if
|
|
501
|
+
no virtual environment is active or no matching kernel exists.
|
|
502
|
+
|
|
503
|
+
Note:
|
|
504
|
+
This method only works on Unix-like systems where Python executables
|
|
505
|
+
are located at bin/python within the virtual environment. For Windows,
|
|
506
|
+
the path would be Scripts/python.exe.
|
|
507
|
+
|
|
508
|
+
Example:
|
|
509
|
+
>>> # With VIRTUAL_ENV=/path/to/myenv and kernel 'myenv' installed
|
|
510
|
+
>>> DerivaMLRunNotebookCLI._find_kernel_for_venv()
|
|
511
|
+
'myenv'
|
|
128
512
|
"""
|
|
129
513
|
venv = os.environ.get("VIRTUAL_ENV")
|
|
130
514
|
if not venv:
|
|
@@ -134,7 +518,7 @@ class DerivaMLRunNotebookCLI(BaseCLI):
|
|
|
134
518
|
for name, spec in ksm.get_all_specs().items():
|
|
135
519
|
kernel_json = spec.get("spec", {})
|
|
136
520
|
argv = kernel_json.get("argv", [])
|
|
137
|
-
#
|
|
521
|
+
# Check each argument for the Python executable path
|
|
138
522
|
for arg in argv:
|
|
139
523
|
try:
|
|
140
524
|
if Path(arg).resolve() == venv_path.joinpath("bin", "python").resolve():
|
|
@@ -143,15 +527,66 @@ class DerivaMLRunNotebookCLI(BaseCLI):
|
|
|
143
527
|
continue
|
|
144
528
|
return None
|
|
145
529
|
|
|
146
|
-
def run_notebook(
|
|
530
|
+
def run_notebook(
|
|
531
|
+
self,
|
|
532
|
+
notebook_file: Path,
|
|
533
|
+
parameters: dict,
|
|
534
|
+
kernel: str | None = None,
|
|
535
|
+
log: bool = False,
|
|
536
|
+
hydra_overrides: list[str] | None = None,
|
|
537
|
+
) -> None:
|
|
538
|
+
"""Execute a notebook with papermill and upload results to the catalog.
|
|
539
|
+
|
|
540
|
+
This method handles the complete notebook execution lifecycle:
|
|
541
|
+
1. Sets environment variables for workflow provenance (URL, checksum, path)
|
|
542
|
+
2. Executes the notebook using papermill with injected parameters
|
|
543
|
+
3. Reads execution metadata saved by the notebook
|
|
544
|
+
4. Converts executed notebook to Markdown format
|
|
545
|
+
5. Uploads both notebook outputs as execution assets
|
|
546
|
+
6. Prints a citation for the execution record
|
|
547
|
+
|
|
548
|
+
The notebook is expected to create an execution record during its run
|
|
549
|
+
and save the execution metadata to the path specified in the
|
|
550
|
+
DERIVA_ML_SAVE_EXECUTION_RID environment variable.
|
|
551
|
+
|
|
552
|
+
Args:
|
|
553
|
+
notebook_file: Absolute path to the notebook file to execute.
|
|
554
|
+
parameters: Dictionary of parameters to inject into the notebook's
|
|
555
|
+
parameter cell.
|
|
556
|
+
kernel: Name of the Jupyter kernel to use. If None, papermill will
|
|
557
|
+
use the notebook's default kernel.
|
|
558
|
+
log: If True, stream notebook cell outputs to stdout during execution.
|
|
559
|
+
hydra_overrides: Optional list of Hydra-zen configuration overrides
|
|
560
|
+
(e.g., ["assets=roc_quick_probabilities", "deriva_ml=eye_ai"]).
|
|
561
|
+
These are passed to the notebook via DERIVA_ML_HYDRA_OVERRIDES
|
|
562
|
+
environment variable as a JSON-encoded list.
|
|
563
|
+
|
|
564
|
+
Raises:
|
|
565
|
+
SystemExit: If the notebook doesn't save execution metadata.
|
|
566
|
+
|
|
567
|
+
Note:
|
|
568
|
+
The executed notebook and its Markdown conversion are uploaded to
|
|
569
|
+
the catalog as Execution_Asset records with type 'notebook_output'.
|
|
570
|
+
"""
|
|
571
|
+
# Get workflow provenance info (URL for Git-tracked files, checksum for integrity)
|
|
147
572
|
url, checksum = Workflow.get_url_and_checksum(Path(notebook_file))
|
|
148
573
|
os.environ["DERIVA_ML_WORKFLOW_URL"] = url
|
|
149
574
|
os.environ["DERIVA_ML_WORKFLOW_CHECKSUM"] = checksum
|
|
150
575
|
os.environ["DERIVA_ML_NOTEBOOK_PATH"] = notebook_file.as_posix()
|
|
576
|
+
|
|
577
|
+
# Pass Hydra overrides to notebook via environment variable
|
|
578
|
+
if hydra_overrides:
|
|
579
|
+
os.environ["DERIVA_ML_HYDRA_OVERRIDES"] = json.dumps(hydra_overrides)
|
|
580
|
+
elif "DERIVA_ML_HYDRA_OVERRIDES" in os.environ:
|
|
581
|
+
del os.environ["DERIVA_ML_HYDRA_OVERRIDES"]
|
|
582
|
+
|
|
151
583
|
with tempfile.TemporaryDirectory() as tmpdirname:
|
|
152
584
|
notebook_output = Path(tmpdirname) / Path(notebook_file).name
|
|
153
585
|
execution_rid_path = Path(tmpdirname) / "execution_rid.json"
|
|
586
|
+
# Tell the notebook where to save its execution metadata
|
|
154
587
|
os.environ["DERIVA_ML_SAVE_EXECUTION_RID"] = execution_rid_path.as_posix()
|
|
588
|
+
|
|
589
|
+
# Execute the notebook with papermill, injecting parameters
|
|
155
590
|
pm.execute_notebook(
|
|
156
591
|
input_path=notebook_file,
|
|
157
592
|
output_path=notebook_output,
|
|
@@ -160,6 +595,8 @@ class DerivaMLRunNotebookCLI(BaseCLI):
|
|
|
160
595
|
log_output=log,
|
|
161
596
|
)
|
|
162
597
|
print(f"Notebook output saved to {notebook_output}")
|
|
598
|
+
|
|
599
|
+
# Read execution metadata that the notebook should have saved
|
|
163
600
|
with execution_rid_path.open("r") as f:
|
|
164
601
|
execution_config = json.load(f)
|
|
165
602
|
|
|
@@ -167,31 +604,60 @@ class DerivaMLRunNotebookCLI(BaseCLI):
|
|
|
167
604
|
print("Execution RID not found.")
|
|
168
605
|
exit(1)
|
|
169
606
|
|
|
607
|
+
# Extract execution info to reconnect to the catalog
|
|
170
608
|
execution_rid = execution_config["execution_rid"]
|
|
171
609
|
hostname = execution_config["hostname"]
|
|
172
610
|
catalog_id = execution_config["catalog_id"]
|
|
173
|
-
|
|
611
|
+
|
|
612
|
+
# Create DerivaML instance to upload results
|
|
174
613
|
ml_instance = DerivaML(hostname=hostname, catalog_id=catalog_id, working_dir=tmpdirname)
|
|
175
614
|
workflow_rid = ml_instance.retrieve_rid(execution_config["execution_rid"])["Workflow"]
|
|
176
615
|
|
|
616
|
+
# Look up the workflow object from the RID
|
|
617
|
+
workflow = ml_instance.lookup_workflow(workflow_rid)
|
|
618
|
+
|
|
619
|
+
# Restore the execution context to upload outputs
|
|
177
620
|
execution = Execution(
|
|
178
|
-
configuration=ExecutionConfiguration(workflow=
|
|
621
|
+
configuration=ExecutionConfiguration(workflow=workflow),
|
|
179
622
|
ml_object=ml_instance,
|
|
180
623
|
reload=execution_rid,
|
|
181
624
|
)
|
|
182
625
|
|
|
183
|
-
#
|
|
626
|
+
# Convert executed notebook to Markdown for easier viewing
|
|
627
|
+
# We embed images as base64 data URIs so the markdown is self-contained
|
|
184
628
|
notebook_output_md = notebook_output.with_suffix(".md")
|
|
185
629
|
with notebook_output.open() as f:
|
|
186
630
|
nb = nbformat.read(f, as_version=4)
|
|
187
|
-
|
|
631
|
+
|
|
632
|
+
# Convert DataFrame HTML outputs to markdown tables for better rendering
|
|
633
|
+
nb = _convert_dataframe_outputs(nb)
|
|
634
|
+
|
|
188
635
|
exporter = MarkdownExporter()
|
|
189
636
|
(body, resources) = exporter.from_notebook_node(nb)
|
|
190
637
|
|
|
638
|
+
# Replace file references with inline base64 data URIs
|
|
639
|
+
if resources.get("outputs"):
|
|
640
|
+
for filename, data in resources["outputs"].items():
|
|
641
|
+
# Determine mime type from extension
|
|
642
|
+
if filename.endswith(".png"):
|
|
643
|
+
mime_type = "image/png"
|
|
644
|
+
elif filename.endswith(".jpg") or filename.endswith(".jpeg"):
|
|
645
|
+
mime_type = "image/jpeg"
|
|
646
|
+
elif filename.endswith(".svg"):
|
|
647
|
+
mime_type = "image/svg+xml"
|
|
648
|
+
else:
|
|
649
|
+
mime_type = "application/octet-stream"
|
|
650
|
+
|
|
651
|
+
# Create data URI and replace in markdown
|
|
652
|
+
b64_data = base64.b64encode(data).decode("utf-8")
|
|
653
|
+
data_uri = f"data:{mime_type};base64,{b64_data}"
|
|
654
|
+
body = body.replace(filename, data_uri)
|
|
655
|
+
|
|
191
656
|
with notebook_output_md.open("w") as f:
|
|
192
657
|
f.write(body)
|
|
193
658
|
nb = nbformat.read(notebook_output, as_version=4)
|
|
194
659
|
|
|
660
|
+
# Register both notebook outputs as execution assets
|
|
195
661
|
execution.asset_file_path(
|
|
196
662
|
asset_name=MLAsset.execution_asset,
|
|
197
663
|
file_name=notebook_output,
|
|
@@ -203,9 +669,12 @@ class DerivaMLRunNotebookCLI(BaseCLI):
|
|
|
203
669
|
file_name=notebook_output_md,
|
|
204
670
|
asset_types=ExecAssetType.notebook_output,
|
|
205
671
|
)
|
|
672
|
+
|
|
673
|
+
# Upload all registered assets to the catalog
|
|
206
674
|
execution.upload_execution_outputs()
|
|
207
675
|
|
|
208
|
-
|
|
676
|
+
# Print execution URL (without snapshot ID for readability)
|
|
677
|
+
print(f"https://{hostname}/id/{catalog_id}/{execution_rid}")
|
|
209
678
|
|
|
210
679
|
|
|
211
680
|
def main():
|