rapidfireai 0.10.2rc5__py3-none-any.whl → 0.10.3rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rapidfireai might be problematic. Click here for more details.

Files changed (31)
  1. rapidfireai/backend/controller.py +29 -16
  2. rapidfireai/backend/worker.py +14 -7
  3. rapidfireai/cli.py +28 -1
  4. rapidfireai/db/rf_db.py +1 -1
  5. rapidfireai/db/tables.sql +1 -1
  6. rapidfireai/dispatcher/dispatcher.py +3 -1
  7. rapidfireai/dispatcher/gunicorn.conf.py +1 -1
  8. rapidfireai/experiment.py +75 -7
  9. rapidfireai/frontend/build/asset-manifest.json +3 -3
  10. rapidfireai/frontend/build/index.html +1 -1
  11. rapidfireai/frontend/build/static/js/{main.1bf27639.js → main.e7d3b759.js} +3 -3
  12. rapidfireai/frontend/build/static/js/{main.1bf27639.js.map → main.e7d3b759.js.map} +1 -1
  13. rapidfireai/frontend/proxy_middleware.py +1 -1
  14. rapidfireai/ml/callbacks.py +78 -38
  15. rapidfireai/ml/trainer.py +6 -6
  16. rapidfireai/start.sh +117 -34
  17. rapidfireai/utils/constants.py +20 -1
  18. rapidfireai/utils/experiment_utils.py +87 -43
  19. rapidfireai/utils/interactive_controller.py +494 -0
  20. rapidfireai/utils/metric_logger.py +346 -0
  21. rapidfireai/utils/mlflow_manager.py +0 -2
  22. rapidfireai/utils/worker_manager.py +16 -6
  23. rapidfireai/version.py +2 -2
  24. {rapidfireai-0.10.2rc5.dist-info → rapidfireai-0.10.3rc1.dist-info}/METADATA +7 -4
  25. {rapidfireai-0.10.2rc5.dist-info → rapidfireai-0.10.3rc1.dist-info}/RECORD +31 -28
  26. tutorial_notebooks/rf-colab-tensorboard-tutorial.ipynb +314 -0
  27. /rapidfireai/frontend/build/static/js/{main.1bf27639.js.LICENSE.txt → main.e7d3b759.js.LICENSE.txt} +0 -0
  28. {rapidfireai-0.10.2rc5.dist-info → rapidfireai-0.10.3rc1.dist-info}/WHEEL +0 -0
  29. {rapidfireai-0.10.2rc5.dist-info → rapidfireai-0.10.3rc1.dist-info}/entry_points.txt +0 -0
  30. {rapidfireai-0.10.2rc5.dist-info → rapidfireai-0.10.3rc1.dist-info}/licenses/LICENSE +0 -0
  31. {rapidfireai-0.10.2rc5.dist-info → rapidfireai-0.10.3rc1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,494 @@
1
+ """
2
+ Interactive Controller for Jupyter/Colab notebooks.
3
+ Provides UI controls for managing training runs similar to the frontend.
4
+ """
5
+
6
+ import json
7
+ import threading
8
+ import time
9
+ from typing import Any, Dict, Optional
10
+
11
+ import requests
12
+ from IPython.display import clear_output, display
13
+
14
+ try:
15
+ import ipywidgets as widgets
16
+ except ImportError:
17
+ raise ImportError(
18
+ "ipywidgets is required for InteractiveController. "
19
+ "Install with: pip install ipywidgets"
20
+ )
21
+
22
+
23
class InteractiveController:
    """Interactive run controller for Jupyter/Colab notebooks.

    Builds an ipywidgets panel that talks to the dispatcher HTTP API
    (``<dispatcher_url>/dispatcher/...``) to list, load, resume, stop,
    delete and clone-modify runs.

    Attributes set in ``__init__``:
        dispatcher_url: Base URL of the dispatcher, without a trailing slash.
        run_id: ID of the currently loaded run, or ``None`` if none is loaded.
        config: Config dict of the loaded run as returned by ``get-run``.
        status: Status string of the loaded run ("Unknown" until loaded).
        chunk_number: ``num_chunks_visited`` value reported for the loaded run.
    """

    def __init__(self, dispatcher_url: str = "http://127.0.0.1:8081"):
        """Store connection settings and build the widgets.

        Args:
            dispatcher_url: Base URL of the dispatcher API. A trailing slash
                is stripped so endpoint paths can be appended directly.
        """
        self.dispatcher_url = dispatcher_url.rstrip("/")
        self.run_id: Optional[int] = None
        self.config: Optional[Dict] = None
        self.status: str = "Unknown"
        self.chunk_number: int = 0

        # Stop flag of the active auto-refresh loop (None when not running).
        self._refresh_stop: Optional[threading.Event] = None

        # Create UI widgets
        self._create_widgets()

    def _create_widgets(self):
        """Create the ipywidgets components and bind their callbacks."""
        # Run selector
        self.run_selector = widgets.Dropdown(
            options=[],
            description='',
            disabled=False,
            layout=widgets.Layout(width='300px'),
        )
        self.load_btn = widgets.Button(
            description="Load Run",
            button_style="primary",
            tooltip="Load the selected run",
            icon="download",
        )
        self.refresh_selector_btn = widgets.Button(
            description="Refresh List",
            button_style="info",
            tooltip="Refresh the list of available runs",
            icon="refresh",
        )

        # Status display
        self.status_label = widgets.HTML(value="<b>Status:</b> Not loaded")
        self.chunk_label = widgets.HTML(value="<b>Chunk:</b> N/A")
        self.run_id_label = widgets.HTML(value="<b>Run ID:</b> N/A")

        # Action buttons
        self.resume_btn = widgets.Button(
            description="Resume",
            button_style="success",
            tooltip="Resume this run",
            icon="play",
        )
        self.stop_btn = widgets.Button(
            description="Stop",
            button_style="danger",
            tooltip="Stop this run",
            icon="stop",
        )
        self.delete_btn = widgets.Button(
            description="Delete",
            button_style="danger",
            tooltip="Delete this run",
            icon="trash",
        )
        self.refresh_btn = widgets.Button(
            description="Refresh Status",
            button_style="info",
            tooltip="Refresh current run status and metrics",
            icon="sync",
        )

        # Config editor (for clone/modify); read-only until clone mode is on.
        self.config_text = widgets.Textarea(
            value="{}",
            placeholder="Run configuration (JSON)",
            disabled=True,
            layout=widgets.Layout(width="100%", height="200px"),
        )
        self.warm_start_checkbox = widgets.Checkbox(
            value=False,
            description="Warm Start (continue from previous checkpoint)",
            disabled=True,
            style={'description_width': 'initial'},
            layout=widgets.Layout(margin='10px 0px'),
        )
        self.clone_btn = widgets.Button(
            description="Clone",
            button_style="primary",
            tooltip="Clone this run with modifications",
        )
        self.submit_clone_btn = widgets.Button(
            description="✓ Submit Clone", button_style="success", disabled=True
        )
        self.cancel_clone_btn = widgets.Button(
            description="✗ Cancel", button_style="", disabled=True
        )

        # Status message box
        self.status_message = widgets.HTML(
            value='',
            layout=widgets.Layout(
                width='100%',
                min_height='40px',
                padding='10px',
                margin='10px 0px',
                border='2px solid #ddd',
                border_radius='5px',
            ),
        )

        # NOTE: the live "experiment status" panel is currently disabled, so
        # no ``self.experiment_status`` widget is created here.
        # ``_update_experiment_status`` is a no-op until one is assigned.

        # Bind button callbacks (on_click passes the button; handlers ignore it)
        self.refresh_selector_btn.on_click(lambda b: self.fetch_all_runs())
        self.load_btn.on_click(lambda b: self._handle_load())
        self.resume_btn.on_click(lambda b: self._handle_resume())
        self.stop_btn.on_click(lambda b: self._handle_stop())
        self.delete_btn.on_click(lambda b: self._handle_delete())
        self.refresh_btn.on_click(
            lambda b: self.load_run(self.run_id) if self.run_id is not None else None
        )
        self.clone_btn.on_click(lambda b: self._enable_clone_mode())
        self.submit_clone_btn.on_click(lambda b: self._handle_clone())
        self.cancel_clone_btn.on_click(lambda b: self._handle_cancel_clone())

        # Auto-load run when dropdown selection changes
        self.run_selector.observe(self._on_run_selected, names='value')

    def _show_message(self, message: str, message_type: str = "info"):
        """Display a styled status message in the message box.

        Args:
            message: Plain text to show. It is HTML-escaped before rendering
                because callers pass exception text and dispatcher-supplied
                error strings, which may contain ``<``, ``>`` or ``&``.
            message_type: One of "success", "error", "info", "warning";
                anything else falls back to the "info" palette.
        """
        import html  # local import: keeps this fix independent of file-level imports

        colors = {
            "success": {"bg": "#d4edda", "border": "#28a745", "text": "#155724"},
            "error": {"bg": "#f8d7da", "border": "#dc3545", "text": "#721c24"},
            "info": {"bg": "#d1ecf1", "border": "#17a2b8", "text": "#0c5460"},
            "warning": {"bg": "#fff3cd", "border": "#ffc107", "text": "#856404"},
        }
        style = colors.get(message_type, colors["info"])
        safe_message = html.escape(str(message))

        self.status_message.value = f'''
        <div style="
            background-color: {style['bg']};
            border: 2px solid {style['border']};
            color: {style['text']};
            padding: 10px;
            border-radius: 5px;
            font-weight: 600;
        ">
            {safe_message}
        </div>
        '''

    def _update_experiment_status(self):
        """Update the experiment status panel with overall run progress.

        Does nothing when no ``experiment_status`` widget exists (the panel is
        currently not created by ``_create_widgets``). Request or decoding
        failures are ignored so the previous panel content stays in place.
        """
        panel = getattr(self, "experiment_status", None)
        if panel is None:
            return

        try:
            response = requests.get(
                f"{self.dispatcher_url}/dispatcher/get-all-runs",
                timeout=5,
            )
            response.raise_for_status()
            runs = response.json()
        except (requests.RequestException, ValueError):
            # Best effort: leave the panel unchanged if the request fails.
            return

        if not runs:
            panel.value = (
                '<div style="padding: 10px; background-color: #f8f9fa; '
                'border: 2px solid #dee2e6; border-radius: 5px;">'
                '<b>Experiment Status:</b> No runs found'
                '</div>'
            )
            return

        total_runs = len(runs)
        completed_runs = sum(1 for r in runs if r.get('status') == 'COMPLETED')
        ongoing_runs = sum(1 for r in runs if r.get('status') == 'ONGOING')

        # Determine status color and icon
        if completed_runs == total_runs:
            bg_color, border_color, text_color = "#d4edda", "#28a745", "#155724"
            icon = "✓"
            status_text = "All runs completed"
        elif ongoing_runs > 0:
            bg_color, border_color, text_color = "#d1ecf1", "#17a2b8", "#0c5460"
            icon = "🔄"
            status_text = "Training in progress"
        else:
            bg_color, border_color, text_color = "#fff3cd", "#ffc107", "#856404"
            icon = "⏸"
            status_text = "Training paused or stopped"

        panel.value = (
            f'<div style="padding: 10px; background-color: {bg_color}; '
            f'border: 2px solid {border_color}; border-radius: 5px; color: {text_color};">'
            f'<b>{icon} Experiment Status:</b> {status_text}<br>'
            f'<b>Progress:</b> {completed_runs}/{total_runs} runs completed'
            '</div>'
        )

    def fetch_all_runs(self):
        """Fetch all runs from the dispatcher and populate the dropdown."""
        try:
            response = requests.get(
                f"{self.dispatcher_url}/dispatcher/get-all-runs",
                timeout=5,
            )
            response.raise_for_status()
            runs = response.json()
        except requests.RequestException as e:
            self._show_message(f"Error fetching runs: {e}", "error")
            return

        if runs:
            # Dropdown options are (label, value) tuples
            self.run_selector.options = [
                (f"Run {run['run_id']} - {run.get('status', 'Unknown')}", run['run_id'])
                for run in runs
            ]
            self._show_message(f"Found {len(runs)} runs", "success")
        else:
            self.run_selector.options = []
            self._show_message("No runs found", "info")

    def _on_run_selected(self, change):
        """Handle a dropdown selection change by loading the selected run.

        Args:
            change: Change dict delivered by ``observe``; only its ``'new'``
                entry is read.
        """
        if change['new'] is not None:
            self.load_run(change['new'])

    def _handle_load(self):
        """Handle the Load button: load the run selected in the dropdown."""
        if self.run_selector.value is not None:
            self.load_run(self.run_selector.value)
        else:
            self._show_message("Please select a run first", "warning")

    def load_run(self, run_id: int):
        """Load run details from the dispatcher API and refresh the UI.

        Args:
            run_id: ID of the run to load; it becomes the current run even if
                the request fails, so a later refresh retries the same run.
        """
        self.run_id = run_id
        try:
            response = requests.post(
                f"{self.dispatcher_url}/dispatcher/get-run",
                json={"run_id": run_id},
                timeout=5,
            )
            response.raise_for_status()
            data = response.json()
        except requests.RequestException as e:
            self._show_message(f"Error loading run: {e}", "error")
            return

        # Update state
        self.config = data.get("config", {})
        self.status = data.get("status", "Unknown")
        self.chunk_number = data.get("num_chunks_visited", 0)

        # Update UI
        self._update_display()
        self._show_message(f"Loaded run {run_id}", "success")

    def _update_display(self):
        """Push the current run state into the labels, editor and buttons."""
        self.run_id_label.value = f"<b>Run ID:</b> {self.run_id}"
        self.status_label.value = f"<b>Status:</b> {self.status}"
        self.chunk_label.value = f"<b>Chunk:</b> {self.chunk_number}"

        # The editor is enabled only while clone mode is active; do not
        # overwrite what the user is typing when a refresh arrives.
        editing = not self.config_text.disabled
        if not editing:
            self.config_text.value = json.dumps(self.config, indent=2)

        # Disable buttons if completed (status may be None in the response).
        is_completed = str(self.status or "").lower() == "completed"
        self.resume_btn.disabled = is_completed
        self.stop_btn.disabled = is_completed
        # Clone stays disabled during clone mode, as _enable_clone_mode set it.
        self.clone_btn.disabled = is_completed or editing
        self.delete_btn.disabled = is_completed

    def _post_run_action(self, endpoint: str, done_verb: str, failing_verb: str,
                         reload_after: bool):
        """POST a run-level action for the current run and report the outcome.

        Args:
            endpoint: Last path segment under ``/dispatcher/``.
            done_verb: Past-tense verb for the success message ("Resumed").
            failing_verb: Gerund for the failure message ("resuming").
            reload_after: Whether to reload the run after a successful action.
        """
        if self.run_id is None:
            self._show_message("Please select a run first", "warning")
            return

        try:
            response = requests.post(
                f"{self.dispatcher_url}/dispatcher/{endpoint}",
                json={"run_id": self.run_id},
                timeout=5,
            )
            response.raise_for_status()
            result = response.json()
        except requests.RequestException as e:
            self._show_message(f"Error {failing_verb} run: {e}", "error")
            return

        if result.get("error"):
            self._show_message(f"Error: {result['error']}", "error")
            return

        self._show_message(f"{done_verb} run {self.run_id}", "success")
        if reload_after:
            self.load_run(self.run_id)

    def _handle_resume(self):
        """Resume the current run, then reload its details."""
        self._post_run_action("resume-run", "Resumed", "resuming", reload_after=True)

    def _handle_stop(self):
        """Stop the current run, then reload its details."""
        self._post_run_action("stop-run", "Stopped", "stopping", reload_after=True)

    def _handle_delete(self):
        """Delete the current run (the run is not reloaded afterwards)."""
        self._post_run_action("delete-run", "Deleted", "deleting", reload_after=False)

    def _enable_clone_mode(self):
        """Enable config editing for clone/modify."""
        self.config_text.disabled = False
        self.warm_start_checkbox.disabled = False
        self.submit_clone_btn.disabled = False
        self.cancel_clone_btn.disabled = False
        self.clone_btn.disabled = True
        self._show_message("Edit config and click Submit to clone", "info")

    def _disable_clone_mode(self):
        """Leave clone mode and reset the editor to the loaded run's config."""
        self.config_text.disabled = True
        self.config_text.value = json.dumps(self.config, indent=2)
        self.warm_start_checkbox.disabled = True
        self.warm_start_checkbox.value = False
        self.submit_clone_btn.disabled = True
        self.cancel_clone_btn.disabled = True
        self.clone_btn.disabled = False

    def _handle_cancel_clone(self):
        """Handle the Cancel button: discard edits and leave clone mode."""
        self._disable_clone_mode()
        self._show_message("Cancelled clone", "info")

    def _enable_colab_widgets(self):
        """Enable the custom widget manager when running in Google Colab."""
        try:
            # Importing from google.colab fails outside Colab, which doubles
            # as the environment check.
            from google.colab import output
        except ImportError:
            # Not in Colab, no action needed
            return
        output.enable_custom_widget_manager()

    def _handle_clone(self):
        """Submit the edited config as a clone/modify request for the run."""
        if self.run_id is None:
            self._show_message("Please select a run first", "warning")
            return

        # Parse config
        try:
            new_config = json.loads(self.config_text.value)
        except json.JSONDecodeError as e:
            self._show_message(f"Invalid JSON: {e}", "error")
            return

        try:
            response = requests.post(
                f"{self.dispatcher_url}/dispatcher/clone-modify-run",
                json={
                    "run_id": self.run_id,
                    "config": new_config,
                    "warm_start": self.warm_start_checkbox.value,
                },
                timeout=5,
            )
            response.raise_for_status()
            result = response.json()
        except requests.RequestException as e:
            self._show_message(f"Error cloning run: {e}", "error")
            return

        if result.get("error") or (result.get("result") is False):
            error_msg = result.get("err_msg") or result.get("error")
            self._show_message(f"Error: {error_msg}", "error")
        else:
            self._show_message(f"Cloned run {self.run_id}", "success")
            self._disable_clone_mode()

    def display(self):
        """Render the controller UI, then fetch the list of runs."""
        # Enable custom widget manager for Google Colab
        self._enable_colab_widgets()

        # Layout
        header = widgets.VBox(
            [
                widgets.HTML("<h3>Interactive Run Controller</h3>"),
                widgets.HBox([self.run_id_label, self.status_label, self.chunk_label]),
            ]
        )

        # Run selector section
        selector_section = widgets.VBox(
            [
                widgets.HTML("<b>Select a Run:</b>"),
                widgets.HBox([self.run_selector, self.load_btn, self.refresh_selector_btn]),
            ]
        )

        actions = widgets.HBox(
            [self.resume_btn, self.stop_btn, self.delete_btn, self.refresh_btn]
        )

        config_section = widgets.VBox(
            [
                widgets.HTML("<b>Configuration:</b>"),
                self.config_text,
                self.warm_start_checkbox,
                widgets.HBox([self.clone_btn, self.submit_clone_btn, self.cancel_clone_btn]),
            ]
        )

        # The experiment status panel is intentionally left out of the layout.
        ui = widgets.VBox(
            [header, self.status_message, selector_section, actions, config_section]
        )

        # Module-level IPython ``display``; this method only shares its name.
        display(ui)

        # Automatically fetch available runs
        self.fetch_all_runs()

        # Load initial data if run_id set
        if self.run_id is not None:
            self.load_run(self.run_id)

    def auto_refresh(self, interval: int = 5):
        """Reload the current run every ``interval`` seconds in a daemon thread.

        Calling this again replaces the previous refresh loop instead of
        stacking another thread on top of it. Use ``stop_auto_refresh`` to
        end the loop.

        Args:
            interval: Seconds to wait between reloads.
        """
        self.stop_auto_refresh()
        stop_event = threading.Event()

        def refresh_loop():
            while not stop_event.is_set():
                if self.run_id is not None:
                    self.load_run(self.run_id)
                # Sleeps for ``interval`` but wakes immediately when stopped.
                stop_event.wait(interval)

        self._refresh_stop = stop_event
        thread = threading.Thread(target=refresh_loop, daemon=True)
        thread.start()

    def stop_auto_refresh(self):
        """Signal the background refresh loop (if any) to exit."""
        stop_event = getattr(self, "_refresh_stop", None)
        if stop_event is not None:
            stop_event.set()
        self._refresh_stop = None