deriva-ml 1.17.10__py3-none-any.whl → 1.17.12__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (74)
  1. deriva_ml/__init__.py +69 -1
  2. deriva_ml/asset/__init__.py +17 -0
  3. deriva_ml/asset/asset.py +357 -0
  4. deriva_ml/asset/aux_classes.py +100 -0
  5. deriva_ml/bump_version.py +254 -11
  6. deriva_ml/catalog/__init__.py +31 -0
  7. deriva_ml/catalog/clone.py +1939 -0
  8. deriva_ml/catalog/localize.py +426 -0
  9. deriva_ml/core/__init__.py +29 -0
  10. deriva_ml/core/base.py +845 -1067
  11. deriva_ml/core/config.py +169 -21
  12. deriva_ml/core/constants.py +120 -19
  13. deriva_ml/core/definitions.py +123 -13
  14. deriva_ml/core/enums.py +47 -73
  15. deriva_ml/core/ermrest.py +226 -193
  16. deriva_ml/core/exceptions.py +297 -14
  17. deriva_ml/core/filespec.py +99 -28
  18. deriva_ml/core/logging_config.py +225 -0
  19. deriva_ml/core/mixins/__init__.py +42 -0
  20. deriva_ml/core/mixins/annotation.py +915 -0
  21. deriva_ml/core/mixins/asset.py +384 -0
  22. deriva_ml/core/mixins/dataset.py +237 -0
  23. deriva_ml/core/mixins/execution.py +408 -0
  24. deriva_ml/core/mixins/feature.py +365 -0
  25. deriva_ml/core/mixins/file.py +263 -0
  26. deriva_ml/core/mixins/path_builder.py +145 -0
  27. deriva_ml/core/mixins/rid_resolution.py +204 -0
  28. deriva_ml/core/mixins/vocabulary.py +400 -0
  29. deriva_ml/core/mixins/workflow.py +322 -0
  30. deriva_ml/core/validation.py +389 -0
  31. deriva_ml/dataset/__init__.py +2 -1
  32. deriva_ml/dataset/aux_classes.py +20 -4
  33. deriva_ml/dataset/catalog_graph.py +575 -0
  34. deriva_ml/dataset/dataset.py +1242 -1008
  35. deriva_ml/dataset/dataset_bag.py +1311 -182
  36. deriva_ml/dataset/history.py +27 -14
  37. deriva_ml/dataset/upload.py +225 -38
  38. deriva_ml/demo_catalog.py +126 -110
  39. deriva_ml/execution/__init__.py +46 -2
  40. deriva_ml/execution/base_config.py +639 -0
  41. deriva_ml/execution/execution.py +543 -242
  42. deriva_ml/execution/execution_configuration.py +26 -11
  43. deriva_ml/execution/execution_record.py +592 -0
  44. deriva_ml/execution/find_caller.py +298 -0
  45. deriva_ml/execution/model_protocol.py +175 -0
  46. deriva_ml/execution/multirun_config.py +153 -0
  47. deriva_ml/execution/runner.py +595 -0
  48. deriva_ml/execution/workflow.py +223 -34
  49. deriva_ml/experiment/__init__.py +8 -0
  50. deriva_ml/experiment/experiment.py +411 -0
  51. deriva_ml/feature.py +6 -1
  52. deriva_ml/install_kernel.py +143 -6
  53. deriva_ml/interfaces.py +862 -0
  54. deriva_ml/model/__init__.py +99 -0
  55. deriva_ml/model/annotations.py +1278 -0
  56. deriva_ml/model/catalog.py +286 -60
  57. deriva_ml/model/database.py +144 -649
  58. deriva_ml/model/deriva_ml_database.py +308 -0
  59. deriva_ml/model/handles.py +14 -0
  60. deriva_ml/run_model.py +319 -0
  61. deriva_ml/run_notebook.py +507 -38
  62. deriva_ml/schema/__init__.py +18 -2
  63. deriva_ml/schema/annotations.py +62 -33
  64. deriva_ml/schema/create_schema.py +169 -69
  65. deriva_ml/schema/validation.py +601 -0
  66. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/METADATA +4 -4
  67. deriva_ml-1.17.12.dist-info/RECORD +77 -0
  68. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/WHEEL +1 -1
  69. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/entry_points.txt +1 -0
  70. deriva_ml/protocols/dataset.py +0 -19
  71. deriva_ml/test.py +0 -94
  72. deriva_ml-1.17.10.dist-info/RECORD +0 -45
  73. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/licenses/LICENSE +0 -0
  74. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/top_level.txt +0 -0
deriva_ml/run_notebook.py CHANGED
@@ -1,7 +1,51 @@
-"""Module to run a notebook using papermill"""
-
+"""Command-line interface for executing Jupyter notebooks with DerivaML tracking.
+
+This module provides a CLI tool for running Jupyter notebooks using papermill while
+automatically tracking the execution in a Deriva catalog. It handles:
+
+- Parameter injection into notebooks from command-line arguments or config files
+- Automatic kernel detection for the current virtual environment
+- Execution tracking with workflow provenance
+- Conversion of executed notebooks to Markdown format
+- Upload of notebook outputs as execution assets
+
+The notebook being executed should use DerivaML's execution context to record
+its workflow. When run through this CLI, environment variables are set to
+communicate workflow metadata (URL, checksum, notebook path) to the notebook.
+
+Environment Variables Set:
+    DERIVA_ML_WORKFLOW_URL: URL to the notebook source (e.g., GitHub URL)
+    DERIVA_ML_WORKFLOW_CHECKSUM: MD5 checksum of the notebook file
+    DERIVA_ML_NOTEBOOK_PATH: Local filesystem path to the notebook
+    DERIVA_ML_SAVE_EXECUTION_RID: Path where notebook should save execution info
+
+Usage:
+    deriva-ml-run-notebook notebook.ipynb --host example.org --catalog 1
+    deriva-ml-run-notebook notebook.ipynb -p param1 value1 -p param2 value2
+    deriva-ml-run-notebook notebook.ipynb --file parameters.yaml
+    deriva-ml-run-notebook notebook.ipynb --inspect  # Show available parameters
+    deriva-ml-run-notebook notebook.ipynb assets=my_assets  # Hydra overrides only
+
+Example:
+    # Run a training notebook with explicit host/catalog
+    deriva-ml-run-notebook train_model.ipynb \\
+        --host deriva.example.org \\
+        --catalog 42 \\
+        -p learning_rate 0.001 \\
+        --kernel my_ml_env
+
+    # Run using Hydra config defaults (no --host/--catalog needed)
+    deriva-ml-run-notebook analysis.ipynb assets=roc_comparison_probabilities
+
+See Also:
+    - install_kernel: Module for installing Jupyter kernels for virtual environments
+    - Workflow: Class that handles workflow registration and Git integration
+"""
+
+import base64
 import json
 import os
+import re
 import tempfile
 from pathlib import Path
 
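Note: the new module docstring describes an environment-variable handshake between this CLI and the notebook it runs. A minimal, stdlib-only sketch of the notebook side is shown below; the payload keys match what run_notebook reads back later in this diff (execution_rid, hostname, catalog_id), but the values here are placeholders, and a real notebook would obtain them from its DerivaML execution context rather than hard-coding them.

```python
# Hypothetical notebook-side cell: persist execution metadata to the path the
# CLI supplies, so run_notebook can reconnect to the catalog afterwards.
import json
import os
from pathlib import Path

save_path = os.environ.get("DERIVA_ML_SAVE_EXECUTION_RID")
if save_path:
    payload = {
        "execution_rid": "1-ABC4",         # placeholder RID
        "hostname": "deriva.example.org",  # placeholder host
        "catalog_id": "42",                # placeholder catalog
    }
    Path(save_path).write_text(json.dumps(payload))
```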
@@ -12,14 +56,191 @@ from deriva.core import BaseCLI
 from jupyter_client.kernelspec import KernelSpecManager
 from nbconvert import MarkdownExporter
 
-from deriva_ml import DerivaML, ExecAssetType, Execution, ExecutionConfiguration, MLAsset, Workflow
+from deriva_ml import DerivaML, ExecAssetType, MLAsset
+from deriva_ml.execution import Execution, ExecutionConfiguration, Workflow
+
+
+def _html_table_to_markdown(html: str) -> str | None:
+    """Convert an HTML DataFrame table to markdown format.
+
+    Parses HTML table elements and converts them to a properly formatted
+    markdown table with headers and alignment.
+
+    Args:
+        html: HTML string potentially containing a DataFrame table.
+
+    Returns:
+        Markdown table string if an HTML table was found, None otherwise.
+    """
+    # Check if this looks like a pandas DataFrame HTML output
+    if '<table' not in html or 'dataframe' not in html:
+        return None
+
+    try:
+        # Extract table content using regex (avoid heavy dependency on BeautifulSoup)
+        thead_match = re.search(r'<thead>(.*?)</thead>', html, re.DOTALL)
+        tbody_match = re.search(r'<tbody>(.*?)</tbody>', html, re.DOTALL)
+
+        if not thead_match or not tbody_match:
+            return None
+
+        thead = thead_match.group(1)
+        tbody = tbody_match.group(1)
+
+        # Extract header row(s)
+        header_rows = re.findall(r'<tr[^>]*>(.*?)</tr>', thead, re.DOTALL)
+        if not header_rows:
+            return None
+
+        # For pandas DataFrames with named index:
+        # - First row contains: empty <th> + column names
+        # - Second row (if exists) contains: index name + empty <th>s
+        # We need to use the first row for column names and second row for index name
+
+        first_row = header_rows[0]
+        first_headers = re.findall(r'<th[^>]*>(.*?)</th>', first_row, re.DOTALL)
+        first_headers = [re.sub(r'<[^>]+>', '', h).strip() for h in first_headers]
+
+        # Check if there's a second header row with an index name
+        index_name = ""
+        if len(header_rows) > 1:
+            second_row = header_rows[1]
+            second_headers = re.findall(r'<th[^>]*>(.*?)</th>', second_row, re.DOTALL)
+            second_headers = [re.sub(r'<[^>]+>', '', h).strip() for h in second_headers]
+            # The index name is typically in the first cell of the second row
+            if second_headers and second_headers[0]:
+                index_name = second_headers[0]
+
+        # Build final headers: use index name for first column if available
+        headers = first_headers.copy()
+        if headers and not headers[0] and index_name:
+            headers[0] = index_name
+
+        # Extract body rows
+        body_rows = re.findall(r'<tr[^>]*>(.*?)</tr>', tbody, re.DOTALL)
+
+        rows = []
+        for row_html in body_rows:
+            # Get both th (index) and td (data) cells
+            cells = re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', row_html, re.DOTALL)
+            cells = [re.sub(r'<[^>]+>', '', c).strip() for c in cells]
+            rows.append(cells)
+
+        if not headers or not rows:
+            return None
+
+        # Build markdown table
+        # Determine column widths for alignment
+        col_widths = [len(h) for h in headers]
+        for row in rows:
+            for i, cell in enumerate(row):
+                if i < len(col_widths):
+                    col_widths[i] = max(col_widths[i], len(cell))
+
+        # Format header
+        header_line = '| ' + ' | '.join(h.ljust(col_widths[i]) for i, h in enumerate(headers)) + ' |'
+        separator = '|' + '|'.join('-' * (w + 2) for w in col_widths) + '|'
+
+        # Format rows
+        formatted_rows = []
+        for row in rows:
+            # Pad row if needed
+            padded = row + [''] * (len(headers) - len(row))
+            formatted = '| ' + ' | '.join(
+                padded[i].ljust(col_widths[i]) if i < len(col_widths) else padded[i]
+                for i in range(len(headers))
+            ) + ' |'
+            formatted_rows.append(formatted)
+
+        return '\n'.join([header_line, separator] + formatted_rows)
+
+    except Exception:
+        # If parsing fails, return None to use default behavior
+        return None
+
+
+def _convert_dataframe_outputs(nb: nbformat.NotebookNode) -> nbformat.NotebookNode:
+    """Convert DataFrame HTML outputs in notebook cells to markdown tables.
+
+    Iterates through all code cells and converts any display_data outputs
+    containing DataFrame HTML tables to markdown format for better rendering.
+
+    Args:
+        nb: The notebook node to process.
+
+    Returns:
+        The modified notebook node with converted outputs.
+    """
+    for cell in nb.cells:
+        if cell.cell_type != 'code':
+            continue
+
+        new_outputs = []
+        for output in cell.get('outputs', []):
+            if output.get('output_type') in ('display_data', 'execute_result'):
+                data = output.get('data', {})
+                html = data.get('text/html', '')
+
+                if html and '<table' in html and 'dataframe' in html:
+                    md_table = _html_table_to_markdown(html)
+                    if md_table:
+                        # Replace the output with markdown text
+                        # Keep the original output type but change the data
+                        new_output = output.copy()
+                        new_output['data'] = {'text/plain': md_table}
+                        new_outputs.append(new_output)
+                        continue
+
+            new_outputs.append(output)
+
+        cell['outputs'] = new_outputs
+
+    return nb
 
 
 class DerivaMLRunNotebookCLI(BaseCLI):
-    """Main class to part command line arguments and call model"""
+    """Command-line interface for running Jupyter notebooks with DerivaML execution tracking.
+
+    This CLI extends Deriva's BaseCLI to provide notebook execution capabilities using
+    papermill. It automatically detects the appropriate Jupyter kernel for the current
+    virtual environment and handles parameter injection from multiple sources.
+
+    The CLI supports:
+    - Positional notebook file argument
+    - Parameter injection via -p/--parameter flags (multiple allowed)
+    - Parameter injection via JSON or YAML configuration files
+    - Automatic kernel detection for the active virtual environment
+    - Inspection mode to display available notebook parameters
+    - Logging output from notebook execution
+
+    Attributes:
+        parser: ArgumentParser instance with configured arguments.
+
+    Example:
+        >>> cli = DerivaMLRunNotebookCLI(
+        ...     description="Run ML notebook",
+        ...     epilog="See documentation for more details"
+        ... )
+        >>> cli.main()  # Parses args and runs notebook
+    """
 
-    def __init__(self, description, epilog, **kwargs):
+    def __init__(self, description: str, epilog: str, **kwargs) -> None:
+        """Initialize the notebook runner CLI with command-line arguments.
+
+        Sets up argument parsing for notebook execution, including the notebook file
+        path, parameter injection options, kernel selection, and inspection mode.
+
+        Args:
+            description: Description text shown in --help output.
+            epilog: Additional text shown after argument help.
+            **kwargs: Additional keyword arguments passed to BaseCLI.
+
+        Note:
+            Calls Workflow._check_nbstrip_status() to verify nbstripout is configured,
+            which helps ensure notebooks are properly cleaned before Git commits.
+        """
         BaseCLI.__init__(self, description, epilog, **kwargs)
+        # Verify nbstripout is configured for clean notebook version control
         Workflow._check_nbstrip_status()
         self.parser.add_argument("notebook_file", type=Path, help="Path to the notebook file")
 
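Note: _html_table_to_markdown deliberately uses regular expressions rather than an HTML parser to avoid a BeautifulSoup dependency. The self-contained sketch below demonstrates the same extraction idea on a pared-down pandas-style table; it omits the helper's width padding and named-index handling, so it illustrates the technique rather than copying the shipped code.

```python
import re

# A simplified pandas-style table of the kind the helper expects.
html = """
<table border="1" class="dataframe">
  <thead>
    <tr><th></th><th>precision</th><th>recall</th></tr>
  </thead>
  <tbody>
    <tr><th>class_0</th><td>0.91</td><td>0.88</td></tr>
    <tr><th>class_1</th><td>0.87</td><td>0.90</td></tr>
  </tbody>
</table>
"""

def table_to_markdown(html: str) -> str:
    # Pull the header and body sections, then strip residual tags from cells.
    thead = re.search(r"<thead>(.*?)</thead>", html, re.DOTALL).group(1)
    tbody = re.search(r"<tbody>(.*?)</tbody>", html, re.DOTALL).group(1)
    headers = [re.sub(r"<[^>]+>", "", h).strip()
               for h in re.findall(r"<th[^>]*>(.*?)</th>", thead, re.DOTALL)]
    rows = [[re.sub(r"<[^>]+>", "", c).strip()
             for c in re.findall(r"<t[hd][^>]*>(.*?)</t[hd]>", r, re.DOTALL)]
            for r in re.findall(r"<tr[^>]*>(.*?)</tr>", tbody, re.DOTALL)]
    lines = ["| " + " | ".join(headers) + " |",
             "|" + "|".join("---" for _ in headers) + "|"]
    lines += ["| " + " | ".join(r) + " |" for r in rows]
    return "\n".join(lines)

print(table_to_markdown(html))
# |  | precision | recall |
# |---|---|---|
# | class_0 | 0.91 | 0.88 |
# | class_1 | 0.87 | 0.90 |
```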
@@ -31,12 +252,25 @@ class DerivaMLRunNotebookCLI(BaseCLI):
             help="JSON or YAML file with parameter values to inject into the notebook.",
         )
 
+        self.parser.add_argument(
+            "--catalog",
+            type=str,
+            default=None,
+            help="Catalog number or identifier (optional if defined in Hydra config)"
+        )
+
         self.parser.add_argument(
             "--inspect",
             action="store_true",
             help="Display parameters information for the given notebook path.",
         )
 
+        self.parser.add_argument(
+            "--info",
+            action="store_true",
+            help="Display available Hydra configuration groups and options.",
+        )
+
         self.parser.add_argument(
             "--log-output",
             action="store_true",
@@ -61,10 +295,37 @@ class DerivaMLRunNotebookCLI(BaseCLI):
             default=self._find_kernel_for_venv(),
         )
 
+        self.parser.add_argument(
+            "hydra_overrides",
+            nargs="*",
+            help="Hydra-zen configuration overrides (e.g., assets=roc_quick_probabilities)",
+        )
+
     @staticmethod
-    def _coerce_number(val: str):
-        """
-        Try to convert a string to int, then float; otherwise return str.
+    def _coerce_number(val: str) -> int | float | str:
+        """Convert a string value to the most appropriate numeric type.
+
+        Attempts to parse the string as an integer first, then as a float.
+        If neither succeeds, returns the original string unchanged.
+
+        This is used to convert command-line parameter values (which are always
+        strings) to appropriate Python types for notebook parameter injection.
+
+        Args:
+            val: String value to convert.
+
+        Returns:
+            The value as int if it's a valid integer string,
+            as float if it's a valid float string,
+            or the original string if neither conversion succeeds.
+
+        Examples:
+            >>> DerivaMLRunNotebookCLI._coerce_number("42")
+            42
+            >>> DerivaMLRunNotebookCLI._coerce_number("3.14")
+            3.14
+            >>> DerivaMLRunNotebookCLI._coerce_number("hello")
+            'hello'
         """
         try:
             return int(val)
@@ -74,16 +335,41 @@ class DerivaMLRunNotebookCLI(BaseCLI):
         except ValueError:
             return val
 
-    def main(self):
-        """Parse arguments and set up execution environment."""
+    def main(self) -> None:
+        """Parse command-line arguments and execute the notebook.
+
+        This is the main entry point that orchestrates:
+        1. Parsing command-line arguments
+        2. Loading parameters from file if specified
+        3. Validating the notebook file
+        4. Either inspecting notebook parameters or executing the notebook
+
+        The method merges parameters from multiple sources with the following
+        precedence (later sources override earlier):
+        1. Notebook default values
+        2. Parameters from --file (JSON/YAML)
+        3. Parameters from -p/--parameter flags
+        4. Host and catalog from CLI arguments
+
+        Raises:
+            SystemExit: If parameter file has invalid extension or notebook file
+                is invalid.
+        """
         args = self.parse_cli()
         notebook_file: Path = args.notebook_file
         parameter_file = args.file
 
-        # args.parameter is now a list of [KEY, VALUE] lists
-        # e.g. [['timeout', '30'], ['name', 'Alice'], ...]
+        # Build parameters dict from command-line -p/--parameter flags
+        # args.parameter is a list of [KEY, VALUE] lists, e.g. [['timeout', '30'], ...]
         parameters = {key: self._coerce_number(val) for key, val in args.parameter}
-
+        # Inject host and catalog if provided on command line
+        # If not provided, the notebook will use values from Hydra config
+        if args.host:
+            parameters['host'] = args.host
+        if args.catalog:
+            parameters['catalog'] = args.catalog
+
+        # Merge parameters from configuration file if provided
        if parameter_file:
             with parameter_file.open("r") as f:
                 if parameter_file.suffix == ".json":
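Note: _coerce_number (completed at the top of the hunk above) tries int before float so that "42" stays an integer rather than becoming 42.0. A standalone restatement of the same fallback chain, with hypothetical inputs:

```python
def coerce_number(val: str) -> int | float | str:
    # Try int first so "42" becomes 42, not 42.0.
    try:
        return int(val)
    except ValueError:
        pass
    try:
        return float(val)
    except ValueError:
        return val  # not numeric; keep the original string

print([coerce_number(v) for v in ("42", "3.14", "1e-3", "hello")])
# [42, 3.14, 0.001, 'hello']
```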
@@ -94,37 +380,135 @@ class DerivaMLRunNotebookCLI(BaseCLI):
                     print("Parameter file must be an json or YAML file.")
                     exit(1)
 
+        # Validate notebook file exists and has correct extension
         if not (notebook_file.is_file() and notebook_file.suffix == ".ipynb"):
             print(f"Notebook file must be an ipynb file: {notebook_file.name}.")
             exit(1)
 
-        # Create a workflow instance for this specific version of the script.
-        # Return an existing workflow if one is found.
+        # Use papermill to inspect notebook for parameter cell metadata
         notebook_parameters = pm.inspect_notebook(notebook_file)
 
         if args.inspect:
+            # Display parameter info and exit without executing
             for param, value in notebook_parameters.items():
                 print(f"{param}:{value['inferred_type_name']} (default {value['default']})")
             return
-        else:
-            notebook_parameters = {k: v["default"] for k, v in notebook_parameters.items()} | parameters
-            self.run_notebook(notebook_file.resolve(), parameters, kernel=args.kernel, log=args.log_output)
+
+        if args.info:
+            # Display available Hydra configuration options
+            self._show_hydra_info(notebook_file)
+            return
+
+        # Merge notebook defaults with provided parameters and execute
+        notebook_parameters = {k: v["default"] for k, v in notebook_parameters.items()} | parameters
+        self.run_notebook(
+            notebook_file.resolve(),
+            parameters,
+            kernel=args.kernel,
+            log=args.log_output,
+            hydra_overrides=args.hydra_overrides,
+        )
 
     @staticmethod
-    def _find_kernel_for_venv() -> str | None:
+    def _show_hydra_info(notebook_file: Path) -> None:
+        """Display available Hydra configuration groups and options.
+
+        Attempts to load the project's config module and display the available
+        configuration groups (e.g., assets, datasets, deriva_ml) and their
+        registered options.
+
+        Args:
+            notebook_file: Path to the notebook file (used to find the project root).
         """
-        Return the name and spec of an existing Jupyter kernel corresponding
-        to a given Python virtual environment path.
-
-        Parameters
-        ----------
-        venv_path : str
-            Absolute or relative path to the virtual environment.
-
-        Returns
-        -------
-        dict | None
-            The kernel spec (as a dict) if found, or None if not found.
+        import sys
+
+        from hydra_zen import store
+
+        # Add src directory to path so we can import configs
+        notebook_dir = notebook_file.parent.resolve()
+        project_root = notebook_dir.parent  # Assume notebooks/ is one level down
+        src_dir = project_root / "src"
+
+        if src_dir.exists():
+            sys.path.insert(0, str(src_dir))
+
+        # Try to load configs using the new API, fall back to old method
+        try:
+            from deriva_ml.execution import load_configs
+            loaded = load_configs("configs")
+            if not loaded:
+                # Try the old way
+                from configs import load_all_configs
+                load_all_configs()
+        except ImportError:
+            print("Could not import configs module. Make sure src/configs/__init__.py exists.")
+            print("Available Hydra groups cannot be determined without loading the config module.")
+            return
+
+        # Access the internal store to list groups and entries
+        print("Available Hydra Configuration Groups:")
+        print("=" * 50)
+
+        # The hydra_zen store._queue contains (group, name) tuples
+        try:
+            groups: dict[str, list[str]] = {}
+
+            for group, name in store._queue:
+                if group:
+                    if group not in groups:
+                        groups[group] = []
+                    if name not in groups[group]:
+                        groups[group].append(name)
+                else:
+                    # Top-level configs (group is None)
+                    if "__root__" not in groups:
+                        groups["__root__"] = []
+                    if name not in groups["__root__"]:
+                        groups["__root__"].append(name)
+
+            # Print groups and their options
+            for group in sorted(groups.keys()):
+                if group == "__root__":
+                    print("\nTop-level configs:")
+                else:
+                    print(f"\n{group}:")
+                for name in sorted(groups[group]):
+                    print(f"  - {name}")
+
+            print("\n" + "=" * 50)
+            print("Usage: deriva-ml-run-notebook notebook.ipynb [options] <group>=<option>")
+            print("Example: deriva-ml-run-notebook notebook.ipynb --host localhost assets=roc_quick_probabilities")
+
+        except Exception as e:
+            print(f"Error inspecting Hydra store: {e}")
+            print("Try running with --help for basic usage information.")
+
+    @staticmethod
+    def _find_kernel_for_venv() -> str | None:
+        """Find a Jupyter kernel that matches the current virtual environment.
+
+        Searches through all installed Jupyter kernels to find one whose Python
+        executable path matches the VIRTUAL_ENV environment variable. This allows
+        automatic kernel selection when running notebooks from within an activated
+        virtual environment.
+
+        The method examines each kernel's argv configuration to find the Python
+        executable path and compares it to the expected location within the
+        virtual environment (venv_path/bin/python).
+
+        Returns:
+            The kernel name (str) if a matching kernel is found, or None if
+            no virtual environment is active or no matching kernel exists.
+
+        Note:
+            This method only works on Unix-like systems where Python executables
+            are located at bin/python within the virtual environment. For Windows,
+            the path would be Scripts/python.exe.
+
+        Example:
+            >>> # With VIRTUAL_ENV=/path/to/myenv and kernel 'myenv' installed
+            >>> DerivaMLRunNotebookCLI._find_kernel_for_venv()
+            'myenv'
         """
         venv = os.environ.get("VIRTUAL_ENV")
         if not venv:
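Note: the merge in main() above ({k: v["default"] ...} | parameters) relies on the dict-union operator, where keys from the right-hand operand win. A tiny demonstration with hypothetical parameter names:

```python
# Notebook defaults lose to anything supplied on the command line.
notebook_defaults = {"learning_rate": 0.01, "epochs": 5, "host": None}
cli_parameters = {"learning_rate": 0.001, "host": "deriva.example.org"}

merged = notebook_defaults | cli_parameters  # right-hand side wins on conflicts
print(merged)
# {'learning_rate': 0.001, 'epochs': 5, 'host': 'deriva.example.org'}
```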
@@ -134,7 +518,7 @@ class DerivaMLRunNotebookCLI(BaseCLI):
        for name, spec in ksm.get_all_specs().items():
             kernel_json = spec.get("spec", {})
             argv = kernel_json.get("argv", [])
-            # check for python executable path inside argv
+            # Check each argument for the Python executable path
             for arg in argv:
                 try:
                     if Path(arg).resolve() == venv_path.joinpath("bin", "python").resolve():
@@ -143,15 +527,66 @@ class DerivaMLRunNotebookCLI(BaseCLI):
                     continue
         return None
 
-    def run_notebook(self, notebook_file: Path, parameters, kernel=None, log=False):
+    def run_notebook(
+        self,
+        notebook_file: Path,
+        parameters: dict,
+        kernel: str | None = None,
+        log: bool = False,
+        hydra_overrides: list[str] | None = None,
+    ) -> None:
+        """Execute a notebook with papermill and upload results to the catalog.
+
+        This method handles the complete notebook execution lifecycle:
+        1. Sets environment variables for workflow provenance (URL, checksum, path)
+        2. Executes the notebook using papermill with injected parameters
+        3. Reads execution metadata saved by the notebook
+        4. Converts executed notebook to Markdown format
+        5. Uploads both notebook outputs as execution assets
+        6. Prints a citation for the execution record
+
+        The notebook is expected to create an execution record during its run
+        and save the execution metadata to the path specified in the
+        DERIVA_ML_SAVE_EXECUTION_RID environment variable.
+
+        Args:
+            notebook_file: Absolute path to the notebook file to execute.
+            parameters: Dictionary of parameters to inject into the notebook's
+                parameter cell.
+            kernel: Name of the Jupyter kernel to use. If None, papermill will
+                use the notebook's default kernel.
+            log: If True, stream notebook cell outputs to stdout during execution.
+            hydra_overrides: Optional list of Hydra-zen configuration overrides
+                (e.g., ["assets=roc_quick_probabilities", "deriva_ml=eye_ai"]).
+                These are passed to the notebook via DERIVA_ML_HYDRA_OVERRIDES
+                environment variable as a JSON-encoded list.
+
+        Raises:
+            SystemExit: If the notebook doesn't save execution metadata.
+
+        Note:
+            The executed notebook and its Markdown conversion are uploaded to
+            the catalog as Execution_Asset records with type 'notebook_output'.
+        """
+        # Get workflow provenance info (URL for Git-tracked files, checksum for integrity)
         url, checksum = Workflow.get_url_and_checksum(Path(notebook_file))
         os.environ["DERIVA_ML_WORKFLOW_URL"] = url
         os.environ["DERIVA_ML_WORKFLOW_CHECKSUM"] = checksum
         os.environ["DERIVA_ML_NOTEBOOK_PATH"] = notebook_file.as_posix()
+
+        # Pass Hydra overrides to notebook via environment variable
+        if hydra_overrides:
+            os.environ["DERIVA_ML_HYDRA_OVERRIDES"] = json.dumps(hydra_overrides)
+        elif "DERIVA_ML_HYDRA_OVERRIDES" in os.environ:
+            del os.environ["DERIVA_ML_HYDRA_OVERRIDES"]
+
         with tempfile.TemporaryDirectory() as tmpdirname:
             notebook_output = Path(tmpdirname) / Path(notebook_file).name
             execution_rid_path = Path(tmpdirname) / "execution_rid.json"
+            # Tell the notebook where to save its execution metadata
             os.environ["DERIVA_ML_SAVE_EXECUTION_RID"] = execution_rid_path.as_posix()
+
+            # Execute the notebook with papermill, injecting parameters
             pm.execute_notebook(
                 input_path=notebook_file,
                 output_path=notebook_output,
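Note: Hydra overrides cross the process boundary as a JSON-encoded list in DERIVA_ML_HYDRA_OVERRIDES, set above before papermill runs. The producer side below mirrors the diff; the consumer side is an assumption about how a notebook might read the list back.

```python
import json
import os

# Producer (this CLI): encode the override list before launching the notebook.
os.environ["DERIVA_ML_HYDRA_OVERRIDES"] = json.dumps(
    ["assets=roc_quick_probabilities", "deriva_ml=eye_ai"]
)

# Consumer (assumed notebook-side): decode it, defaulting to an empty list.
overrides = json.loads(os.environ.get("DERIVA_ML_HYDRA_OVERRIDES", "[]"))
print(overrides)  # ['assets=roc_quick_probabilities', 'deriva_ml=eye_ai']
```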
@@ -160,6 +595,8 @@ class DerivaMLRunNotebookCLI(BaseCLI):
                 log_output=log,
             )
             print(f"Notebook output saved to {notebook_output}")
+
+            # Read execution metadata that the notebook should have saved
             with execution_rid_path.open("r") as f:
                 execution_config = json.load(f)
 
@@ -167,31 +604,60 @@ class DerivaMLRunNotebookCLI(BaseCLI):
                 print("Execution RID not found.")
                 exit(1)
 
+            # Extract execution info to reconnect to the catalog
             execution_rid = execution_config["execution_rid"]
             hostname = execution_config["hostname"]
             catalog_id = execution_config["catalog_id"]
-            workflow_rid = execution_config["workflow_rid"]
+
+            # Create DerivaML instance to upload results
             ml_instance = DerivaML(hostname=hostname, catalog_id=catalog_id, working_dir=tmpdirname)
             workflow_rid = ml_instance.retrieve_rid(execution_config["execution_rid"])["Workflow"]
 
+            # Look up the workflow object from the RID
+            workflow = ml_instance.lookup_workflow(workflow_rid)
+
+            # Restore the execution context to upload outputs
             execution = Execution(
-                configuration=ExecutionConfiguration(workflow=workflow_rid),
+                configuration=ExecutionConfiguration(workflow=workflow),
                 ml_object=ml_instance,
                 reload=execution_rid,
             )
 
-            # Generate an HTML version of the output notebook.
+            # Convert executed notebook to Markdown for easier viewing
+            # We embed images as base64 data URIs so the markdown is self-contained
             notebook_output_md = notebook_output.with_suffix(".md")
             with notebook_output.open() as f:
                 nb = nbformat.read(f, as_version=4)
-            # Convert to Markdown
+
+            # Convert DataFrame HTML outputs to markdown tables for better rendering
+            nb = _convert_dataframe_outputs(nb)
+
             exporter = MarkdownExporter()
             (body, resources) = exporter.from_notebook_node(nb)
 
+            # Replace file references with inline base64 data URIs
+            if resources.get("outputs"):
+                for filename, data in resources["outputs"].items():
+                    # Determine mime type from extension
+                    if filename.endswith(".png"):
+                        mime_type = "image/png"
+                    elif filename.endswith(".jpg") or filename.endswith(".jpeg"):
+                        mime_type = "image/jpeg"
+                    elif filename.endswith(".svg"):
+                        mime_type = "image/svg+xml"
+                    else:
+                        mime_type = "application/octet-stream"
+
+                    # Create data URI and replace in markdown
+                    b64_data = base64.b64encode(data).decode("utf-8")
+                    data_uri = f"data:{mime_type};base64,{b64_data}"
+                    body = body.replace(filename, data_uri)
+
             with notebook_output_md.open("w") as f:
                 f.write(body)
             nb = nbformat.read(notebook_output, as_version=4)
 
+            # Register both notebook outputs as execution assets
             execution.asset_file_path(
                 asset_name=MLAsset.execution_asset,
                 file_name=notebook_output,
@@ -203,9 +669,12 @@ class DerivaMLRunNotebookCLI(BaseCLI):
                 file_name=notebook_output_md,
                 asset_types=ExecAssetType.notebook_output,
             )
+
+            # Upload all registered assets to the catalog
             execution.upload_execution_outputs()
 
-            print(ml_instance.cite(execution_rid))
+            # Print execution URL (without snapshot ID for readability)
+            print(f"https://{hostname}/id/{catalog_id}/{execution_rid}")
 
 
 def main():