experimaestro 2.0.0b8__py3-none-any.whl → 2.0.0b17__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of experimaestro might be problematic. Click here for more details.

Files changed (152)
  1. experimaestro/__init__.py +12 -5
  2. experimaestro/cli/__init__.py +239 -126
  3. experimaestro/cli/filter.py +48 -23
  4. experimaestro/cli/jobs.py +253 -71
  5. experimaestro/cli/refactor.py +1 -2
  6. experimaestro/commandline.py +7 -4
  7. experimaestro/connectors/__init__.py +9 -1
  8. experimaestro/connectors/local.py +43 -3
  9. experimaestro/core/arguments.py +18 -18
  10. experimaestro/core/identifier.py +11 -11
  11. experimaestro/core/objects/config.py +96 -39
  12. experimaestro/core/objects/config_walk.py +3 -3
  13. experimaestro/core/{subparameters.py → partial.py} +16 -16
  14. experimaestro/core/partial_lock.py +394 -0
  15. experimaestro/core/types.py +12 -15
  16. experimaestro/dynamic.py +290 -0
  17. experimaestro/experiments/__init__.py +6 -2
  18. experimaestro/experiments/cli.py +217 -50
  19. experimaestro/experiments/configuration.py +24 -0
  20. experimaestro/generators.py +5 -5
  21. experimaestro/ipc.py +118 -1
  22. experimaestro/launcherfinder/__init__.py +2 -2
  23. experimaestro/launcherfinder/registry.py +6 -7
  24. experimaestro/launcherfinder/specs.py +2 -9
  25. experimaestro/launchers/slurm/__init__.py +2 -2
  26. experimaestro/launchers/slurm/base.py +62 -0
  27. experimaestro/locking.py +957 -1
  28. experimaestro/notifications.py +89 -201
  29. experimaestro/progress.py +63 -366
  30. experimaestro/rpyc.py +0 -2
  31. experimaestro/run.py +29 -2
  32. experimaestro/scheduler/__init__.py +8 -1
  33. experimaestro/scheduler/base.py +629 -53
  34. experimaestro/scheduler/dependencies.py +20 -16
  35. experimaestro/scheduler/experiment.py +732 -167
  36. experimaestro/scheduler/interfaces.py +316 -101
  37. experimaestro/scheduler/jobs.py +58 -20
  38. experimaestro/scheduler/remote/adaptive_sync.py +265 -0
  39. experimaestro/scheduler/remote/client.py +171 -117
  40. experimaestro/scheduler/remote/protocol.py +8 -193
  41. experimaestro/scheduler/remote/server.py +95 -71
  42. experimaestro/scheduler/services.py +53 -28
  43. experimaestro/scheduler/state_provider.py +663 -2430
  44. experimaestro/scheduler/state_status.py +1247 -0
  45. experimaestro/scheduler/transient.py +31 -0
  46. experimaestro/scheduler/workspace.py +1 -1
  47. experimaestro/scheduler/workspace_state_provider.py +1273 -0
  48. experimaestro/scriptbuilder.py +4 -4
  49. experimaestro/settings.py +36 -0
  50. experimaestro/tests/conftest.py +33 -5
  51. experimaestro/tests/connectors/bin/executable.py +1 -1
  52. experimaestro/tests/fixtures/pre_experiment/experiment_check_env.py +16 -0
  53. experimaestro/tests/fixtures/pre_experiment/experiment_check_mock.py +14 -0
  54. experimaestro/tests/fixtures/pre_experiment/experiment_simple.py +12 -0
  55. experimaestro/tests/fixtures/pre_experiment/pre_setup_env.py +5 -0
  56. experimaestro/tests/fixtures/pre_experiment/pre_setup_error.py +3 -0
  57. experimaestro/tests/fixtures/pre_experiment/pre_setup_mock.py +8 -0
  58. experimaestro/tests/launchers/bin/test.py +1 -0
  59. experimaestro/tests/launchers/test_slurm.py +9 -9
  60. experimaestro/tests/partial_reschedule.py +46 -0
  61. experimaestro/tests/restart.py +3 -3
  62. experimaestro/tests/restart_main.py +1 -0
  63. experimaestro/tests/scripts/notifyandwait.py +1 -0
  64. experimaestro/tests/task_partial.py +38 -0
  65. experimaestro/tests/task_tokens.py +2 -2
  66. experimaestro/tests/tasks/test_dynamic.py +6 -6
  67. experimaestro/tests/test_dependencies.py +3 -3
  68. experimaestro/tests/test_deprecated.py +15 -15
  69. experimaestro/tests/test_dynamic_locking.py +317 -0
  70. experimaestro/tests/test_environment.py +24 -14
  71. experimaestro/tests/test_experiment.py +171 -36
  72. experimaestro/tests/test_identifier.py +25 -25
  73. experimaestro/tests/test_identifier_stability.py +3 -5
  74. experimaestro/tests/test_multitoken.py +2 -4
  75. experimaestro/tests/{test_subparameters.py → test_partial.py} +25 -25
  76. experimaestro/tests/test_partial_paths.py +81 -138
  77. experimaestro/tests/test_pre_experiment.py +219 -0
  78. experimaestro/tests/test_progress.py +2 -8
  79. experimaestro/tests/test_remote_state.py +560 -99
  80. experimaestro/tests/test_stray_jobs.py +261 -0
  81. experimaestro/tests/test_tasks.py +1 -2
  82. experimaestro/tests/test_token_locking.py +52 -67
  83. experimaestro/tests/test_tokens.py +5 -6
  84. experimaestro/tests/test_transient.py +225 -0
  85. experimaestro/tests/test_workspace_state_provider.py +768 -0
  86. experimaestro/tests/token_reschedule.py +1 -3
  87. experimaestro/tests/utils.py +2 -7
  88. experimaestro/tokens.py +227 -372
  89. experimaestro/tools/diff.py +1 -0
  90. experimaestro/tools/documentation.py +4 -5
  91. experimaestro/tools/jobs.py +1 -2
  92. experimaestro/tui/app.py +438 -1966
  93. experimaestro/tui/app.tcss +162 -0
  94. experimaestro/tui/dialogs.py +172 -0
  95. experimaestro/tui/log_viewer.py +253 -3
  96. experimaestro/tui/messages.py +137 -0
  97. experimaestro/tui/utils.py +54 -0
  98. experimaestro/tui/widgets/__init__.py +23 -0
  99. experimaestro/tui/widgets/experiments.py +468 -0
  100. experimaestro/tui/widgets/global_services.py +238 -0
  101. experimaestro/tui/widgets/jobs.py +972 -0
  102. experimaestro/tui/widgets/log.py +156 -0
  103. experimaestro/tui/widgets/orphans.py +363 -0
  104. experimaestro/tui/widgets/runs.py +185 -0
  105. experimaestro/tui/widgets/services.py +314 -0
  106. experimaestro/tui/widgets/stray_jobs.py +528 -0
  107. experimaestro/utils/__init__.py +1 -1
  108. experimaestro/utils/environment.py +105 -22
  109. experimaestro/utils/fswatcher.py +124 -0
  110. experimaestro/utils/jobs.py +1 -2
  111. experimaestro/utils/jupyter.py +1 -2
  112. experimaestro/utils/logging.py +72 -0
  113. experimaestro/version.py +2 -2
  114. experimaestro/webui/__init__.py +9 -0
  115. experimaestro/webui/app.py +117 -0
  116. experimaestro/{server → webui}/data/index.css +66 -11
  117. experimaestro/webui/data/index.css.map +1 -0
  118. experimaestro/{server → webui}/data/index.js +82763 -87217
  119. experimaestro/webui/data/index.js.map +1 -0
  120. experimaestro/webui/routes/__init__.py +5 -0
  121. experimaestro/webui/routes/auth.py +53 -0
  122. experimaestro/webui/routes/proxy.py +117 -0
  123. experimaestro/webui/server.py +200 -0
  124. experimaestro/webui/state_bridge.py +152 -0
  125. experimaestro/webui/websocket.py +413 -0
  126. {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/METADATA +5 -6
  127. experimaestro-2.0.0b17.dist-info/RECORD +219 -0
  128. experimaestro/cli/progress.py +0 -269
  129. experimaestro/scheduler/state.py +0 -75
  130. experimaestro/scheduler/state_db.py +0 -437
  131. experimaestro/scheduler/state_sync.py +0 -891
  132. experimaestro/server/__init__.py +0 -467
  133. experimaestro/server/data/index.css.map +0 -1
  134. experimaestro/server/data/index.js.map +0 -1
  135. experimaestro/tests/test_cli_jobs.py +0 -615
  136. experimaestro/tests/test_file_progress.py +0 -425
  137. experimaestro/tests/test_file_progress_integration.py +0 -477
  138. experimaestro/tests/test_state_db.py +0 -434
  139. experimaestro-2.0.0b8.dist-info/RECORD +0 -187
  140. /experimaestro/{server → webui}/data/1815e00441357e01619e.ttf +0 -0
  141. /experimaestro/{server → webui}/data/2463b90d9a316e4e5294.woff2 +0 -0
  142. /experimaestro/{server → webui}/data/2582b0e4bcf85eceead0.ttf +0 -0
  143. /experimaestro/{server → webui}/data/89999bdf5d835c012025.woff2 +0 -0
  144. /experimaestro/{server → webui}/data/914997e1bdfc990d0897.ttf +0 -0
  145. /experimaestro/{server → webui}/data/c210719e60948b211a12.woff2 +0 -0
  146. /experimaestro/{server → webui}/data/favicon.ico +0 -0
  147. /experimaestro/{server → webui}/data/index.html +0 -0
  148. /experimaestro/{server → webui}/data/login.html +0 -0
  149. /experimaestro/{server → webui}/data/manifest.json +0 -0
  150. {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/WHEEL +0 -0
  151. {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/entry_points.txt +0 -0
  152. {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/licenses/LICENSE +0 -0
experimaestro/tui/app.py CHANGED
@@ -4,1855 +4,62 @@ import logging
4
4
  from pathlib import Path
5
5
  from typing import Optional
6
6
  from textual.app import App, ComposeResult
7
- from textual import work
8
- from textual.containers import Container, Horizontal, Vertical
7
+ from textual.containers import Vertical
9
8
  from textual.widgets import (
10
9
  Header,
11
10
  Footer,
12
11
  DataTable,
13
- Label,
14
12
  TabbedContent,
15
13
  TabPane,
16
- RichLog,
17
- Button,
18
- Static,
19
- Input,
20
14
  )
21
- from textual.widget import Widget
22
- from textual.reactive import reactive
23
15
  from textual.binding import Binding
24
- from textual.message import Message
25
- from textual.screen import ModalScreen, Screen
26
- from textual import events
27
- from rich.text import Text
28
- from experimaestro.scheduler.state_provider import (
29
- WorkspaceStateProvider,
30
- StateEvent,
31
- StateEventType,
16
+
17
+ from experimaestro.scheduler.state_provider import StateProvider
18
+ from experimaestro.scheduler.state_status import (
19
+ EventBase,
20
+ ExperimentUpdatedEvent,
21
+ RunUpdatedEvent,
22
+ JobStateChangedEvent,
23
+ JobProgressEvent,
24
+ JobSubmittedEvent,
25
+ ServiceAddedEvent,
26
+ ServiceStateChangedEvent,
32
27
  )
33
28
  from experimaestro.tui.log_viewer import LogViewerScreen
34
-
35
-
36
- def format_duration(seconds: float) -> str:
37
- """Format duration in seconds to human-readable string"""
38
- if seconds < 0:
39
- return "-"
40
- seconds = int(seconds)
41
- if seconds < 60:
42
- return f"{seconds}s"
43
- elif seconds < 3600:
44
- return f"{seconds // 60}m {seconds % 60}s"
45
- elif seconds < 86400:
46
- return f"{seconds // 3600}h {(seconds % 3600) // 60}m"
47
- else:
48
- return f"{seconds // 86400}d {(seconds % 86400) // 3600}h"
49
-
50
-
51
- class QuitConfirmScreen(ModalScreen[bool]):
52
- """Modal screen for quit confirmation"""
53
-
54
- def __init__(self, has_active_experiment: bool = False):
55
- super().__init__()
56
- self.has_active_experiment = has_active_experiment
57
-
58
- def compose(self) -> ComposeResult:
59
- with Vertical(id="quit-dialog"):
60
- yield Static("Quit Experimaestro?", id="quit-title")
61
-
62
- if self.has_active_experiment:
63
- yield Static(
64
- "⚠️ The experiment is still in progress.\n"
65
- "Quitting will prevent new jobs from being launched.",
66
- id="quit-warning",
67
- )
68
- else:
69
- yield Static("Are you sure you want to quit?", id="quit-message")
70
-
71
- with Horizontal(id="quit-buttons"):
72
- yield Button("Quit", variant="error", id="quit-yes")
73
- yield Button("Cancel", variant="primary", id="quit-no")
74
-
75
- def on_button_pressed(self, event: Button.Pressed) -> None:
76
- if event.button.id == "quit-yes":
77
- self.dismiss(True)
78
- else:
79
- self.dismiss(False)
80
-
81
-
82
- class DeleteConfirmScreen(ModalScreen[bool]):
83
- """Modal screen for delete confirmation"""
84
-
85
- def __init__(
86
- self, item_type: str, item_name: str, warning: Optional[str] = None
87
- ) -> None:
88
- super().__init__()
89
- self.item_type = item_type
90
- self.item_name = item_name
91
- self.warning = warning
92
-
93
- def compose(self) -> ComposeResult:
94
- with Vertical(id="delete-dialog"):
95
- yield Static(f"Delete {self.item_type}?", id="delete-title")
96
- yield Static(
97
- f"This will permanently delete: {self.item_name}", id="delete-message"
98
- )
99
-
100
- if self.warning:
101
- yield Static(f"Warning: {self.warning}", id="delete-warning")
102
-
103
- with Horizontal(id="delete-buttons"):
104
- yield Button("Delete", variant="error", id="delete-yes")
105
- yield Button("Cancel", variant="primary", id="delete-no")
106
-
107
- def on_mount(self) -> None:
108
- """Focus cancel button by default"""
109
- self.query_one("#delete-no", Button).focus()
110
-
111
- def on_button_pressed(self, event: Button.Pressed) -> None:
112
- if event.button.id == "delete-yes":
113
- self.dismiss(True)
114
- else:
115
- self.dismiss(False)
116
-
117
-
118
- class KillConfirmScreen(ModalScreen[bool]):
119
- """Modal screen for kill confirmation"""
120
-
121
- def __init__(self, item_type: str, item_name: str) -> None:
122
- super().__init__()
123
- self.item_type = item_type
124
- self.item_name = item_name
125
-
126
- def compose(self) -> ComposeResult:
127
- with Vertical(id="kill-dialog"):
128
- yield Static(f"Kill {self.item_type}?", id="kill-title")
129
- yield Static(f"This will terminate: {self.item_name}", id="kill-message")
130
-
131
- with Horizontal(id="kill-buttons"):
132
- yield Button("Kill", variant="warning", id="kill-yes")
133
- yield Button("Cancel", variant="primary", id="kill-no")
134
-
135
- def on_mount(self) -> None:
136
- """Focus cancel button by default"""
137
- self.query_one("#kill-no", Button).focus()
138
-
139
- def on_button_pressed(self, event: Button.Pressed) -> None:
140
- if event.button.id == "kill-yes":
141
- self.dismiss(True)
142
- else:
143
- self.dismiss(False)
144
-
145
-
146
- def get_status_icon(status: str, failure_reason=None):
147
- """Get status icon for a job state.
148
-
149
- Args:
150
- status: Job state name (e.g., "done", "error", "running")
151
- failure_reason: Optional JobFailureStatus enum for error states
152
-
153
- Returns:
154
- Status icon string
155
- """
156
- if status == "done":
157
- return "✓"
158
- elif status == "error":
159
- # Show different icons for different failure types
160
- if failure_reason is not None:
161
- from experimaestro.scheduler.interfaces import JobFailureStatus
162
-
163
- if failure_reason == JobFailureStatus.DEPENDENCY:
164
- return "🔗" # Dependency failed
165
- elif failure_reason == JobFailureStatus.TIMEOUT:
166
- return "⏱" # Timeout
167
- elif failure_reason == JobFailureStatus.MEMORY:
168
- return "💾" # Memory issue
169
- # FAILED or unknown - use default error icon
170
- return "❌"
171
- elif status == "running":
172
- return "▶"
173
- elif status == "waiting":
174
- return "⌛" # Waiting for dependencies
175
- else:
176
- # phantom, unscheduled or unknown
177
- return "👻"
178
-
179
-
180
- class CaptureLog(RichLog):
181
- """Custom RichLog widget that captures print statements with log highlighting"""
182
-
183
- def on_mount(self) -> None:
184
- """Enable print capturing when widget is mounted"""
185
- self.begin_capture_print()
186
-
187
- def on_unmount(self) -> None:
188
- """Stop print capturing when widget is unmounted"""
189
- self.end_capture_print()
190
-
191
- def _format_log_line(self, text: str) -> Text:
192
- """Format a log line with appropriate styling based on log level"""
193
- result = Text()
194
-
195
- # Check for common log level patterns
196
- if text.startswith("ERROR:") or ":ERROR:" in text:
197
- result.append(text, style="bold red")
198
- elif text.startswith("WARNING:") or ":WARNING:" in text:
199
- result.append(text, style="yellow")
200
- elif text.startswith("INFO:") or ":INFO:" in text:
201
- result.append(text, style="green")
202
- elif text.startswith("DEBUG:") or ":DEBUG:" in text:
203
- result.append(text, style="dim")
204
- elif text.startswith("CRITICAL:") or ":CRITICAL:" in text:
205
- result.append(text, style="bold white on red")
206
- else:
207
- result.append(text)
208
-
209
- return result
210
-
211
- def on_print(self, event: events.Print) -> None:
212
- """Handle print events from captured stdout/stderr"""
213
- if text := event.text.strip():
214
- self.write(self._format_log_line(text))
215
-
216
-
217
- class ExperimentsList(Widget):
218
- """Widget displaying list of experiments"""
219
-
220
- BINDINGS = [
221
- Binding("d", "delete_experiment", "Delete", show=False),
222
- Binding("k", "kill_experiment", "Kill", show=False),
223
- ]
224
-
225
- current_experiment: reactive[Optional[str]] = reactive(None)
226
- collapsed: reactive[bool] = reactive(False)
227
-
228
- def __init__(self, state_provider: WorkspaceStateProvider) -> None:
229
- super().__init__()
230
- self.state_provider = state_provider
231
- self.experiments = []
232
-
233
- def _get_selected_experiment_id(self) -> Optional[str]:
234
- """Get the experiment ID from the currently selected row"""
235
- table = self.query_one("#experiments-table", DataTable)
236
- if table.cursor_row is None:
237
- return None
238
- row_key = list(table.rows.keys())[table.cursor_row]
239
- if row_key:
240
- return str(row_key.value)
241
- return None
242
-
243
- def action_delete_experiment(self) -> None:
244
- """Request to delete the selected experiment"""
245
- exp_id = self._get_selected_experiment_id()
246
- if exp_id:
247
- self.post_message(DeleteExperimentRequest(exp_id))
248
-
249
- def action_kill_experiment(self) -> None:
250
- """Request to kill all running jobs in the selected experiment"""
251
- exp_id = self._get_selected_experiment_id()
252
- if exp_id:
253
- self.post_message(KillExperimentRequest(exp_id))
254
-
255
- def compose(self) -> ComposeResult:
256
- # Collapsed header (hidden initially)
257
- with Horizontal(id="collapsed-header", classes="hidden"):
258
- yield Label("", id="collapsed-experiment-info")
259
-
260
- # Full experiments table
261
- with Container(id="experiments-table-container"):
262
- yield Label("Experiments", classes="section-title")
263
- yield DataTable(id="experiments-table", cursor_type="row")
264
-
265
- def on_mount(self) -> None:
266
- """Initialize the experiments table"""
267
- table = self.query_one("#experiments-table", DataTable)
268
- table.add_column("ID", key="id")
269
- table.add_column("Host", key="host")
270
- table.add_column("Jobs", key="jobs")
271
- table.add_column("Status", key="status")
272
- table.add_column("Started", key="started")
273
- table.add_column("Duration", key="duration")
274
- self.refresh_experiments()
275
-
276
- # If there's only one experiment, automatically select it
277
- if len(self.experiments) == 1:
278
- exp_id = self.experiments[0].experiment_id
279
- self.current_experiment = exp_id
280
- self.collapse_to_experiment(exp_id)
281
- self.post_message(ExperimentSelected(exp_id))
282
-
283
- def refresh_experiments(self) -> None:
284
- """Refresh the experiments list from state provider"""
285
- table = self.query_one("#experiments-table", DataTable)
286
-
287
- try:
288
- self.experiments = self.state_provider.get_experiments()
289
- self.log.debug(
290
- f"Refreshing experiments: found {len(self.experiments)} experiments"
291
- )
292
- except Exception as e:
293
- self.log.error(f"ERROR refreshing experiments: {e}")
294
- import traceback
295
-
296
- self.log.error(traceback.format_exc())
297
- self.experiments = []
298
- return
299
-
300
- # Get existing row keys
301
- existing_keys = set(table.rows.keys())
302
- current_exp_ids = set()
303
-
304
- from datetime import datetime
305
- import time as time_module
306
-
307
- for exp in self.experiments:
308
- exp_id = exp.experiment_id
309
- current_exp_ids.add(exp_id)
310
- total = exp.total_jobs
311
- finished = exp.finished_jobs
312
- failed = exp.failed_jobs
313
-
314
- # Determine status
315
- if failed > 0:
316
- status = f"❌ {failed} failed"
317
- elif finished == total and total > 0:
318
- status = "✓ Done"
319
- elif finished < total:
320
- status = f"▶ {finished}/{total}"
321
- else:
322
- status = "Empty"
323
-
324
- jobs_text = f"{finished}/{total}"
325
-
326
- # Format started time
327
- if exp.started_at:
328
- started = datetime.fromtimestamp(exp.started_at).strftime(
329
- "%Y-%m-%d %H:%M"
330
- )
331
- else:
332
- started = "-"
333
-
334
- # Calculate duration
335
- duration = "-"
336
- if exp.started_at:
337
- if exp.ended_at:
338
- elapsed = exp.ended_at - exp.started_at
339
- else:
340
- # Still running - show elapsed time
341
- elapsed = time_module.time() - exp.started_at
342
- # Format duration
343
- duration = format_duration(elapsed)
344
-
345
- # Get hostname (may be None for older experiments)
346
- hostname = getattr(exp, "hostname", None) or "-"
347
-
348
- # Update existing row or add new one
349
- if exp_id in existing_keys:
350
- table.update_cell(exp_id, "id", exp_id, update_width=True)
351
- table.update_cell(exp_id, "host", hostname, update_width=True)
352
- table.update_cell(exp_id, "jobs", jobs_text, update_width=True)
353
- table.update_cell(exp_id, "status", status, update_width=True)
354
- table.update_cell(exp_id, "started", started, update_width=True)
355
- table.update_cell(exp_id, "duration", duration, update_width=True)
356
- else:
357
- table.add_row(
358
- exp_id, hostname, jobs_text, status, started, duration, key=exp_id
359
- )
360
-
361
- # Remove rows for experiments that no longer exist
362
- for old_exp_id in existing_keys - current_exp_ids:
363
- table.remove_row(old_exp_id)
364
-
365
- # Update collapsed header if viewing an experiment
366
- if self.collapsed and self.current_experiment:
367
- self._update_collapsed_header(self.current_experiment)
368
-
369
- def on_data_table_row_selected(self, event: DataTable.RowSelected) -> None:
370
- """Handle experiment selection"""
371
- if event.row_key:
372
- self.current_experiment = str(event.row_key.value)
373
- self.collapse_to_experiment(self.current_experiment)
374
- self.post_message(ExperimentSelected(str(event.row_key.value)))
375
-
376
- def _update_collapsed_header(self, experiment_id: str) -> None:
377
- """Update the collapsed experiment header with current stats"""
378
- exp_info = next(
379
- (exp for exp in self.experiments if exp.experiment_id == experiment_id),
380
- None,
381
- )
382
- if not exp_info:
383
- return
384
-
385
- total = exp_info.total_jobs
386
- finished = exp_info.finished_jobs
387
- failed = exp_info.failed_jobs
388
-
389
- if failed > 0:
390
- status = f"❌ {failed} failed"
391
- elif finished == total and total > 0:
392
- status = "✓ Done"
393
- elif finished < total:
394
- status = f"▶ {finished}/{total}"
395
- else:
396
- status = "Empty"
397
-
398
- collapsed_label = self.query_one("#collapsed-experiment-info", Label)
399
- collapsed_label.update(f"📊 {experiment_id} - {status} (click to go back)")
400
-
401
- def collapse_to_experiment(self, experiment_id: str) -> None:
402
- """Collapse the experiments list to show only the selected experiment"""
403
- self._update_collapsed_header(experiment_id)
404
-
405
- # Hide table, show collapsed header
406
- self.query_one("#experiments-table-container").add_class("hidden")
407
- self.query_one("#collapsed-header").remove_class("hidden")
408
- self.collapsed = True
409
-
410
- def expand_experiments(self) -> None:
411
- """Expand back to full experiments list"""
412
- # Show table, hide collapsed header
413
- self.query_one("#collapsed-header").add_class("hidden")
414
- self.query_one("#experiments-table-container").remove_class("hidden")
415
- self.collapsed = False
416
- self.current_experiment = None
417
-
418
- # Focus the experiments table
419
- table = self.query_one("#experiments-table", DataTable)
420
- table.focus()
421
-
422
- def on_click(self) -> None:
423
- """Handle clicks on the widget"""
424
- if self.collapsed:
425
- self.expand_experiments()
426
- self.post_message(ExperimentDeselected())
427
-
428
-
429
- class ExperimentSelected(Message):
430
- """Message sent when an experiment is selected"""
431
-
432
- def __init__(self, experiment_id: str) -> None:
433
- super().__init__()
434
- self.experiment_id = experiment_id
435
-
436
-
437
- class ExperimentDeselected(Message):
438
- """Message sent when an experiment is deselected"""
439
-
440
- pass
441
-
442
-
443
- class JobSelected(Message):
444
- """Message sent when a job is selected"""
445
-
446
- def __init__(self, job_id: str, experiment_id: str) -> None:
447
- super().__init__()
448
- self.job_id = job_id
449
- self.experiment_id = experiment_id
450
-
451
-
452
- class JobDeselected(Message):
453
- """Message sent when returning from job detail view"""
454
-
455
- pass
456
-
457
-
458
- class ViewJobLogs(Message):
459
- """Message sent when user wants to view job logs"""
460
-
461
- def __init__(self, job_path: str, task_id: str) -> None:
462
- super().__init__()
463
- self.job_path = job_path
464
- self.task_id = task_id
465
-
466
-
467
- class ViewJobLogsRequest(Message):
468
- """Message sent when user requests to view logs from jobs table"""
469
-
470
- def __init__(self, job_id: str, experiment_id: str) -> None:
471
- super().__init__()
472
- self.job_id = job_id
473
- self.experiment_id = experiment_id
474
-
475
-
476
- class LogsSyncComplete(Message):
477
- """Message sent when remote log sync is complete"""
478
-
479
- def __init__(self, log_files: list, job_id: str) -> None:
480
- super().__init__()
481
- self.log_files = log_files
482
- self.job_id = job_id
483
-
484
-
485
- class LogsSyncFailed(Message):
486
- """Message sent when remote log sync fails"""
487
-
488
- def __init__(self, error: str) -> None:
489
- super().__init__()
490
- self.error = error
491
-
492
-
493
- class DeleteJobRequest(Message):
494
- """Message sent when user requests to delete a job"""
495
-
496
- def __init__(self, job_id: str, experiment_id: str) -> None:
497
- super().__init__()
498
- self.job_id = job_id
499
- self.experiment_id = experiment_id
500
-
501
-
502
- class DeleteExperimentRequest(Message):
503
- """Message sent when user requests to delete an experiment"""
504
-
505
- def __init__(self, experiment_id: str) -> None:
506
- super().__init__()
507
- self.experiment_id = experiment_id
508
-
509
-
510
- class KillJobRequest(Message):
511
- """Message sent when user requests to kill a running job"""
512
-
513
- def __init__(self, job_id: str, experiment_id: str) -> None:
514
- super().__init__()
515
- self.job_id = job_id
516
- self.experiment_id = experiment_id
517
-
518
-
519
- class KillExperimentRequest(Message):
520
- """Message sent when user requests to kill all running jobs in an experiment"""
521
-
522
- def __init__(self, experiment_id: str) -> None:
523
- super().__init__()
524
- self.experiment_id = experiment_id
525
-
526
-
527
- class FilterChanged(Message):
528
- """Message sent when search filter changes"""
529
-
530
- def __init__(self, filter_fn) -> None:
531
- super().__init__()
532
- self.filter_fn = filter_fn
533
-
534
-
535
- class ServicesList(Vertical):
536
- """Widget displaying services for selected experiment
537
-
538
- Services are retrieved from WorkspaceStateProvider.get_services() which
539
- abstracts away whether services are live (from scheduler) or recreated
540
- from database state_dict. The UI treats all services uniformly.
541
- """
542
-
543
- BINDINGS = [
544
- Binding("s", "start_service", "Start"),
545
- Binding("x", "stop_service", "Stop"),
546
- Binding("u", "copy_url", "Copy URL", show=False),
547
- ]
548
-
549
- # State icons for display
550
- STATE_ICONS = {
551
- "STOPPED": "⏹",
552
- "STARTING": "⏳",
553
- "RUNNING": "▶",
554
- "STOPPING": "⏳",
555
- }
556
-
557
- def __init__(self, state_provider: WorkspaceStateProvider) -> None:
558
- super().__init__()
559
- self.state_provider = state_provider
560
- self.current_experiment: Optional[str] = None
561
- self._services: dict = {} # service_id -> Service object
562
-
563
- def compose(self) -> ComposeResult:
564
- yield DataTable(id="services-table", cursor_type="row")
565
-
566
- def on_mount(self) -> None:
567
- """Set up the services table"""
568
- table = self.query_one("#services-table", DataTable)
569
- table.add_columns("ID", "Description", "State", "URL")
570
- table.cursor_type = "row"
571
-
572
- def set_experiment(self, experiment_id: Optional[str]) -> None:
573
- """Set the current experiment and refresh services"""
574
- self.current_experiment = experiment_id
575
- self.refresh_services()
576
-
577
- def refresh_services(self) -> None:
578
- """Refresh the services list from state provider"""
579
- table = self.query_one("#services-table", DataTable)
580
- table.clear()
581
- self._services = {}
582
-
583
- if not self.current_experiment:
584
- return
585
-
586
- # Get services from state provider (handles live vs DB automatically)
587
- services = self.state_provider.get_services(self.current_experiment)
588
- self.log.info(
589
- f"refresh_services got {len(services)} services: "
590
- f"{[(s.id, id(s), getattr(s, 'url', None)) for s in services]}"
591
- )
592
-
593
- for service in services:
594
- service_id = service.id
595
- self._services[service_id] = service
596
-
597
- state_name = service.state.name if hasattr(service, "state") else "UNKNOWN"
598
- state_icon = self.STATE_ICONS.get(state_name, "?")
599
- url = getattr(service, "url", None) or "-"
600
- description = (
601
- service.description() if hasattr(service, "description") else ""
602
- )
603
-
604
- table.add_row(
605
- service_id,
606
- description,
607
- f"{state_icon} {state_name}",
608
- url,
609
- key=service_id,
610
- )
611
-
612
- def _get_selected_service(self):
613
- """Get the currently selected Service object"""
614
- table = self.query_one("#services-table", DataTable)
615
- if table.cursor_row is not None and table.row_count > 0:
616
- row_key = list(table.rows.keys())[table.cursor_row]
617
- if row_key:
618
- service_id = str(row_key.value)
619
- return self._services.get(service_id)
620
- return None
621
-
622
- def action_start_service(self) -> None:
623
- """Start the selected service"""
624
- service = self._get_selected_service()
625
- if not service:
626
- return
627
-
628
- self.log.info(f"Starting service {service.id} (id={id(service)})")
629
-
630
- try:
631
- if hasattr(service, "get_url"):
632
- url = service.get_url()
633
- self.log.info(f"Service started, url={url}, service.url={service.url}")
634
- self.notify(f"Service started: {url}", severity="information")
635
- else:
636
- self.notify("Service does not support starting", severity="warning")
637
- self.refresh_services()
638
- except Exception as e:
639
- self.notify(f"Failed to start service: {e}", severity="error")
640
-
641
- def action_stop_service(self) -> None:
642
- """Stop the selected service"""
643
- service = self._get_selected_service()
644
- if not service:
645
- return
646
-
647
- from experimaestro.scheduler.services import ServiceState
648
-
649
- if service.state == ServiceState.STOPPED:
650
- self.notify("Service is not running", severity="warning")
651
- return
652
-
653
- try:
654
- if hasattr(service, "stop"):
655
- service.stop()
656
- self.notify(f"Service stopped: {service.id}", severity="information")
657
- else:
658
- self.notify("Service does not support stopping", severity="warning")
659
- self.refresh_services()
660
- except Exception as e:
661
- self.notify(f"Failed to stop service: {e}", severity="error")
662
-
663
- def action_copy_url(self) -> None:
664
- """Copy the service URL to clipboard"""
665
- service = self._get_selected_service()
666
- if not service:
667
- return
668
-
669
- url = getattr(service, "url", None)
670
- if url:
671
- try:
672
- import pyperclip
673
-
674
- pyperclip.copy(url)
675
- self.notify(f"URL copied: {url}", severity="information")
676
- except Exception as e:
677
- self.notify(f"Failed to copy: {e}", severity="error")
678
- else:
679
- self.notify("Start the service first to get URL", severity="warning")
680
-
681
-
682
- class JobDetailView(Widget):
683
- """Widget displaying detailed job information"""
684
-
685
- BINDINGS = [
686
- Binding("l", "view_logs", "View Logs", priority=True),
687
- ]
688
-
689
- def __init__(self, state_provider: WorkspaceStateProvider) -> None:
690
- super().__init__()
691
- self.state_provider = state_provider
692
- self.current_job_id: Optional[str] = None
693
- self.current_experiment_id: Optional[str] = None
694
- self.job_data: Optional[dict] = None
695
-
696
- def compose(self) -> ComposeResult:
697
- yield Label("Job Details", classes="section-title")
698
- with Vertical(id="job-detail-content"):
699
- yield Label("", id="job-id-label")
700
- yield Label("", id="job-task-label")
701
- yield Label("", id="job-status-label")
702
- yield Label("", id="job-path-label")
703
- yield Label("", id="job-times-label")
704
- yield Label("Tags:", classes="subsection-title")
705
- yield Label("", id="job-tags-label")
706
- yield Label("Progress:", classes="subsection-title")
707
- yield Label("", id="job-progress-label")
708
- yield Label("", id="job-logs-hint")
709
-
710
- def action_view_logs(self) -> None:
711
- """View job logs with toolong"""
712
- if self.job_data and self.job_data.path and self.job_data.task_id:
713
- self.post_message(
714
- ViewJobLogs(str(self.job_data.path), self.job_data.task_id)
715
- )
716
-
717
- def set_job(self, job_id: str, experiment_id: str) -> None:
718
- """Set the job to display"""
719
- self.current_job_id = job_id
720
- self.current_experiment_id = experiment_id
721
- self.refresh_job_detail()
722
-
723
- def refresh_job_detail(self) -> None:
724
- """Refresh job details from state provider"""
725
- if not self.current_job_id or not self.current_experiment_id:
726
- return
727
-
728
- job = self.state_provider.get_job(
729
- self.current_job_id, self.current_experiment_id
730
- )
731
- if not job:
732
- self.log(f"Job not found: {self.current_job_id}")
733
- return
734
-
735
- self.job_data = job
736
-
737
- # Update labels
738
- self.query_one("#job-id-label", Label).update(f"Job ID: {job.identifier}")
739
- self.query_one("#job-task-label", Label).update(f"Task: {job.task_id}")
740
-
741
- # Format status with icon and name
742
- status_name = job.state.name if job.state else "unknown"
743
- failure_reason = getattr(job, "failure_reason", None)
744
- status_icon = get_status_icon(status_name, failure_reason)
745
- status_text = f"{status_icon} {status_name}"
746
- if failure_reason:
747
- status_text += f" ({failure_reason.name})"
748
-
749
- self.query_one("#job-status-label", Label).update(f"Status: {status_text}")
750
-
751
- # Path (from locator)
752
- locator = job.locator or "-"
753
- self.query_one("#job-path-label", Label).update(f"Locator: {locator}")
754
-
755
- # Times - format timestamps
756
- from datetime import datetime
757
- import time as time_module
758
-
759
- def format_time(ts):
760
- if ts:
761
- return datetime.fromtimestamp(ts).strftime("%Y-%m-%d %H:%M:%S")
762
- return "-"
763
-
764
- submitted = format_time(job.submittime)
765
- start = format_time(job.starttime)
766
- end = format_time(job.endtime)
767
-
768
- # Calculate duration
769
- duration = "-"
770
- if job.starttime:
771
- if job.endtime:
772
- duration = format_duration(job.endtime - job.starttime)
773
- else:
774
- duration = (
775
- format_duration(time_module.time() - job.starttime) + " (running)"
776
- )
777
-
778
- times_text = f"Submitted: {submitted} | Start: {start} | End: {end} | Duration: {duration}"
779
- self.query_one("#job-times-label", Label).update(times_text)
780
-
781
- # Tags - job.tags is now a dict
782
- tags = job.tags
783
- if tags:
784
- tags_text = ", ".join(f"{k}={v}" for k, v in tags.items())
785
- else:
786
- tags_text = "(no tags)"
787
- self.query_one("#job-tags-label", Label).update(tags_text)
788
-
789
- # Progress
790
- progress_list = job.progress or []
791
- if progress_list:
792
- progress_lines = []
793
- for p in progress_list:
794
- level = p.get("level", 0)
795
- pct = p.get("progress", 0) * 100
796
- desc = p.get("desc", "")
797
- indent = " " * level
798
- progress_lines.append(f"{indent}{pct:.1f}% {desc}")
799
- progress_text = "\n".join(progress_lines) if progress_lines else "-"
800
- else:
801
- progress_text = "-"
802
- self.query_one("#job-progress-label", Label).update(progress_text)
803
-
804
- # Log files hint - log files are named after the last part of the task ID
805
- job_path = job.path
806
- task_id = job.task_id
807
- if job_path and task_id:
808
- # Extract the last component of the task ID (e.g., "evaluate" from "mnist_xp.learn.evaluate")
809
- task_name = task_id.split(".")[-1]
810
- stdout_path = job_path / f"{task_name}.out"
811
- stderr_path = job_path / f"{task_name}.err"
812
- logs_exist = stdout_path.exists() or stderr_path.exists()
813
- if logs_exist:
814
- self.query_one("#job-logs-hint", Label).update(
815
- "[bold cyan]Press 'l' to view logs[/bold cyan]"
816
- )
817
- else:
818
- self.query_one("#job-logs-hint", Label).update("(no log files found)")
819
- else:
820
- self.query_one("#job-logs-hint", Label).update("")
821
-
822
-
823
- class SearchBar(Widget):
824
- """Search bar widget with filter hints for filtering jobs"""
825
-
826
- visible: reactive[bool] = reactive(False)
827
- _keep_filter: bool = False # Flag to keep filter when hiding
828
- _query_valid: bool = False # Track if current query is valid
829
-
830
- def __init__(self) -> None:
831
- super().__init__()
832
- self.filter_fn = None
833
- self.active_query = "" # Store the active query text
834
-
835
- def compose(self) -> ComposeResult:
836
- # Active filter indicator (shown when filter active but bar hidden)
837
- yield Static("", id="active-filter")
838
- # Search input container
839
- with Vertical(id="search-container"):
840
- yield Input(
841
- placeholder="Filter: @state = 'done', @name ~ 'pattern', tag = 'value'",
842
- id="search-input",
843
- )
844
- yield Static(
845
- "Syntax: @state = 'done' | @name ~ 'regex' | tag = 'value' | and/or",
846
- id="search-hints",
847
- )
848
- yield Static("", id="search-error")
849
-
850
- def on_mount(self) -> None:
851
- """Initialize visibility state"""
852
- # Start with everything hidden
853
- self.display = False
854
- self.query_one("#search-container").display = False
855
- self.query_one("#active-filter").display = False
856
- self.query_one("#search-error").display = False
857
-
858
- def watch_visible(self, visible: bool) -> None:
859
- """Show/hide search bar"""
860
- search_container = self.query_one("#search-container")
861
- active_filter = self.query_one("#active-filter")
862
- error_widget = self.query_one("#search-error")
863
-
864
- if visible:
865
- self.display = True
866
- search_container.display = True
867
- active_filter.display = False
868
- self.query_one("#search-input", Input).focus()
869
- else:
870
- if not self._keep_filter:
871
- self.query_one("#search-input", Input).value = ""
872
- self.filter_fn = None
873
- self.active_query = ""
874
- self._query_valid = False
875
- self._keep_filter = False
876
-
877
- # Show/hide based on whether filter is active
878
- if self.filter_fn is not None:
879
- # Filter active - show indicator, hide input
880
- self.display = True
881
- search_container.display = False
882
- error_widget.display = False
883
- active_filter.update(
884
- f"Filter: {self.active_query} (/ to edit, c to clear)"
885
- )
886
- active_filter.display = True
887
- else:
888
- # No filter - hide everything including this widget
889
- self.display = False
890
- search_container.display = False
891
- active_filter.display = False
892
- error_widget.display = False
893
-
894
- def on_input_changed(self, event: Input.Changed) -> None:
895
- """Parse filter expression when input changes"""
896
- query = event.value.strip()
897
- input_widget = self.query_one("#search-input", Input)
898
- error_widget = self.query_one("#search-error", Static)
899
-
900
- if not query:
901
- self.filter_fn = None
902
- self._query_valid = False
903
- self.post_message(FilterChanged(None))
904
- input_widget.remove_class("error")
905
- input_widget.remove_class("valid")
906
- error_widget.display = False
907
- return
908
-
909
- try:
910
- from experimaestro.cli.filter import createFilter
911
-
912
- self.filter_fn = createFilter(query)
913
- self._query_valid = True
914
- self.active_query = query
915
- self.post_message(FilterChanged(self.filter_fn))
916
- input_widget.remove_class("error")
917
- input_widget.add_class("valid")
918
- error_widget.display = False
919
- except Exception as e:
920
- self.filter_fn = None
921
- self._query_valid = False
922
- self.post_message(FilterChanged(None))
923
- input_widget.remove_class("valid")
924
- input_widget.add_class("error")
925
- error_widget.update(f"Invalid query: {str(e)[:50]}")
926
- error_widget.display = True
927
-
928
- def on_input_submitted(self, event: Input.Submitted) -> None:
929
- """Apply filter and hide search bar (only if query is valid)"""
930
- if self._query_valid and self.filter_fn is not None:
931
- # Set flag to keep filter when hiding
932
- self._keep_filter = True
933
- self.visible = False
934
- # Post message to focus jobs table
935
- self.post_message(SearchApplied())
936
- # If invalid, do nothing (keep input focused for correction)
937
-
938
-
939
- class SearchApplied(Message):
940
- """Message sent when search filter is applied via Enter"""
941
-
942
- pass
943
-
944
-
945
- class JobsTable(Vertical):
946
- """Widget displaying jobs for selected experiment"""
947
-
948
- BINDINGS = [
949
- Binding("d", "delete_job", "Delete", show=False),
950
- Binding("k", "kill_job", "Kill", show=False),
951
- Binding("l", "view_logs", "Logs"),
952
- Binding("f", "copy_path", "Copy Path", show=False),
953
- Binding("/", "toggle_search", "Search"),
954
- Binding("c", "clear_filter", "Clear", show=False),
955
- Binding("r", "refresh_live", "Refresh"),
956
- Binding("S", "sort_by_status", "Sort ⚑", show=False),
957
- Binding("T", "sort_by_task", "Sort Task", show=False),
958
- Binding("D", "sort_by_submitted", "Sort Date", show=False),
959
- Binding("escape", "clear_search", show=False, priority=True),
960
- ]
961
-
962
- # Track current sort state
963
- _sort_column: Optional[str] = None
964
- _sort_reverse: bool = False
965
- _needs_rebuild: bool = True # Start with rebuild needed
966
-
967
- def __init__(self, state_provider: WorkspaceStateProvider) -> None:
968
- super().__init__()
969
- self.state_provider = state_provider
970
- self.filter_fn = None
971
- self.current_experiment: Optional[str] = None
972
-
973
- def compose(self) -> ComposeResult:
974
- yield SearchBar()
975
- yield DataTable(id="jobs-table", cursor_type="row")
976
-
977
- def action_toggle_search(self) -> None:
978
- """Toggle search bar visibility"""
979
- search_bar = self.query_one(SearchBar)
980
- search_bar.visible = not search_bar.visible
981
-
982
- def action_clear_filter(self) -> None:
983
- """Clear the active filter"""
984
- if self.filter_fn is not None:
985
- search_bar = self.query_one(SearchBar)
986
- search_bar.query_one("#search-input", Input).value = ""
987
- search_bar.filter_fn = None
988
- search_bar.active_query = ""
989
- search_bar._query_valid = False
990
- # Hide the SearchBar completely
991
- search_bar.display = False
992
- search_bar.query_one("#search-container").display = False
993
- search_bar.query_one("#active-filter").display = False
994
- search_bar.query_one("#search-error").display = False
995
- self.filter_fn = None
996
- self.refresh_jobs()
997
- self.notify("Filter cleared", severity="information")
998
-
999
- def action_sort_by_status(self) -> None:
1000
- """Sort jobs by status"""
1001
- if self._sort_column == "status":
1002
- self._sort_reverse = not self._sort_reverse
1003
- else:
1004
- self._sort_column = "status"
1005
- self._sort_reverse = False
1006
- self._needs_rebuild = True
1007
- self._update_column_headers()
1008
- self.refresh_jobs()
1009
- order = "desc" if self._sort_reverse else "asc"
1010
- self.notify(f"Sorted by status ({order})", severity="information")
1011
-
1012
- def action_sort_by_task(self) -> None:
1013
- """Sort jobs by task"""
1014
- if self._sort_column == "task":
1015
- self._sort_reverse = not self._sort_reverse
1016
- else:
1017
- self._sort_column = "task"
1018
- self._sort_reverse = False
1019
- self._needs_rebuild = True
1020
- self._update_column_headers()
1021
- self.refresh_jobs()
1022
- order = "desc" if self._sort_reverse else "asc"
1023
- self.notify(f"Sorted by task ({order})", severity="information")
1024
-
1025
- def action_sort_by_submitted(self) -> None:
1026
- """Sort jobs by submission time"""
1027
- if self._sort_column == "submitted":
1028
- self._sort_reverse = not self._sort_reverse
1029
- else:
1030
- self._sort_column = "submitted"
1031
- self._sort_reverse = False
1032
- self._needs_rebuild = True
1033
- self._update_column_headers()
1034
- self.refresh_jobs()
1035
- order = "newest first" if self._sort_reverse else "oldest first"
1036
- self.notify(f"Sorted by date ({order})", severity="information")
1037
-
1038
- def action_clear_search(self) -> None:
1039
- """Handle escape: hide search bar if visible, or go back"""
1040
- search_bar = self.query_one(SearchBar)
1041
- if search_bar.visible:
1042
- # Search bar visible - hide it and clear filter
1043
- search_bar.visible = False
1044
- self.filter_fn = None
1045
- self.refresh_jobs()
1046
- # Focus the jobs table
1047
- self.query_one("#jobs-table", DataTable).focus()
1048
- else:
1049
- # Search bar hidden - go back (keep filter)
1050
- self.app.action_go_back()
1051
-
1052
- def action_refresh_live(self) -> None:
1053
- """Refresh the jobs table"""
1054
- self.refresh_jobs()
1055
- self.notify("Jobs refreshed", severity="information")
1056
-
1057
- def on_filter_changed(self, message: FilterChanged) -> None:
1058
- """Apply new filter"""
1059
- self.filter_fn = message.filter_fn
1060
- self.refresh_jobs()
1061
-
1062
- def on_search_applied(self, message: SearchApplied) -> None:
1063
- """Focus jobs table when search is applied"""
1064
- self.query_one("#jobs-table", DataTable).focus()
1065
-
1066
- def _get_selected_job_id(self) -> Optional[str]:
1067
- """Get the job ID from the currently selected row"""
1068
- table = self.query_one("#jobs-table", DataTable)
1069
- if table.cursor_row is None:
1070
- return None
1071
- row_key = table.get_row_at(table.cursor_row)
1072
- if row_key:
1073
- # The first column is job_id
1074
- return str(table.get_row_at(table.cursor_row)[0])
1075
- return None
1076
-
1077
- def action_delete_job(self) -> None:
1078
- """Request to delete the selected job"""
1079
- table = self.query_one("#jobs-table", DataTable)
1080
- if table.cursor_row is None or not self.current_experiment:
1081
- return
1082
-
1083
- # Get job ID from the row key
1084
- row_key = list(table.rows.keys())[table.cursor_row]
1085
- if row_key:
1086
- job_id = str(row_key.value)
1087
- self.post_message(DeleteJobRequest(job_id, self.current_experiment))
1088
-
1089
- def action_kill_job(self) -> None:
1090
- """Request to kill the selected job"""
1091
- table = self.query_one("#jobs-table", DataTable)
1092
- if table.cursor_row is None or not self.current_experiment:
1093
- return
1094
-
1095
- row_key = list(table.rows.keys())[table.cursor_row]
1096
- if row_key:
1097
- job_id = str(row_key.value)
1098
- self.post_message(KillJobRequest(job_id, self.current_experiment))
1099
-
1100
- def action_view_logs(self) -> None:
1101
- """Request to view logs for the selected job"""
1102
- table = self.query_one("#jobs-table", DataTable)
1103
- if table.cursor_row is None or not self.current_experiment:
1104
- return
1105
-
1106
- row_key = list(table.rows.keys())[table.cursor_row]
1107
- if row_key:
1108
- job_id = str(row_key.value)
1109
- self.post_message(ViewJobLogsRequest(job_id, self.current_experiment))
1110
-
1111
- def action_copy_path(self) -> None:
1112
- """Copy the job folder path to clipboard"""
1113
- import pyperclip
1114
-
1115
- table = self.query_one("#jobs-table", DataTable)
1116
- if table.cursor_row is None or not self.current_experiment:
1117
- return
1118
-
1119
- row_key = list(table.rows.keys())[table.cursor_row]
1120
- if row_key:
1121
- job_id = str(row_key.value)
1122
- job = self.state_provider.get_job(job_id, self.current_experiment)
1123
- if job and job.path:
1124
- try:
1125
- pyperclip.copy(str(job.path))
1126
- self.notify(f"Path copied: {job.path}", severity="information")
1127
- except Exception as e:
1128
- self.notify(f"Failed to copy: {e}", severity="error")
1129
- else:
1130
- self.notify("No path available for this job", severity="warning")
1131
-
1132
- # Status sort order (for sorting by status)
1133
- STATUS_ORDER = {
1134
- "running": 0,
1135
- "waiting": 1,
1136
- "error": 2,
1137
- "done": 3,
1138
- "unscheduled": 4,
1139
- "phantom": 5,
1140
- }
1141
-
1142
- # Failure reason sort order (within error status)
1143
- # More actionable failures first
1144
- FAILURE_ORDER = {
1145
- "TIMEOUT": 0, # Might just need retry
1146
- "MEMORY": 1, # Might need resource adjustment
1147
- "DEPENDENCY": 2, # Need to fix upstream job first
1148
- "FAILED": 3, # Generic failure
1149
- }
1150
-
1151
- @classmethod
1152
- def _get_status_sort_key(cls, job):
1153
- """Get sort key for a job based on status and failure reason.
1154
-
1155
- Returns tuple (status_order, failure_order) for proper sorting.
1156
- """
1157
- state_name = job.state.name if job.state else "unknown"
1158
- status_order = cls.STATUS_ORDER.get(state_name, 99)
1159
-
1160
- # For error jobs, also sort by failure reason
1161
- if state_name == "error":
1162
- failure_reason = getattr(job, "failure_reason", None)
1163
- if failure_reason:
1164
- failure_order = cls.FAILURE_ORDER.get(failure_reason.name, 99)
1165
- else:
1166
- failure_order = 99 # Unknown failure at end
1167
- else:
1168
- failure_order = 0
1169
-
1170
- return (status_order, failure_order)
1171
-
1172
- # Column key to display name mapping
1173
- COLUMN_LABELS = {
1174
- "job_id": "ID",
1175
- "task": "Task",
1176
- "status": "⚑",
1177
- "tags": "Tags",
1178
- "submitted": "Submitted",
1179
- "duration": "Duration",
1180
- }
1181
-
1182
- # Columns that support sorting (column key -> sort column name)
1183
- SORTABLE_COLUMNS = {
1184
- "status": "status",
1185
- "task": "task",
1186
- "submitted": "submitted",
1187
- }
1188
-
1189
- def on_mount(self) -> None:
1190
- """Initialize the jobs table"""
1191
- table = self.query_one("#jobs-table", DataTable)
1192
- table.add_column("ID", key="job_id")
1193
- table.add_column("Task", key="task")
1194
- table.add_column("⚑", key="status", width=6)
1195
- table.add_column("Tags", key="tags")
1196
- table.add_column("Submitted", key="submitted")
1197
- table.add_column("Duration", key="duration")
1198
- table.cursor_type = "row"
1199
- table.zebra_stripes = True
1200
-
1201
- def _update_column_headers(self) -> None:
1202
- """Update column headers with sort indicators"""
1203
- table = self.query_one("#jobs-table", DataTable)
1204
- for column in table.columns.values():
1205
- col_key = str(column.key.value) if column.key else None
1206
- if col_key and col_key in self.COLUMN_LABELS:
1207
- label = self.COLUMN_LABELS[col_key]
1208
- sort_col = self.SORTABLE_COLUMNS.get(col_key)
1209
- if sort_col and self._sort_column == sort_col:
1210
- # Add sort indicator
1211
- indicator = "▼" if self._sort_reverse else "▲"
1212
- new_label = f"{label} {indicator}"
1213
- else:
1214
- new_label = label
1215
- column.label = new_label
1216
-
1217
- def on_data_table_header_selected(self, event: DataTable.HeaderSelected) -> None:
1218
- """Handle column header click for sorting"""
1219
- col_key = str(event.column_key.value) if event.column_key else None
1220
- if col_key and col_key in self.SORTABLE_COLUMNS:
1221
- sort_col = self.SORTABLE_COLUMNS[col_key]
1222
- if self._sort_column == sort_col:
1223
- self._sort_reverse = not self._sort_reverse
1224
- else:
1225
- self._sort_column = sort_col
1226
- self._sort_reverse = False
1227
- self._needs_rebuild = True
1228
- self._update_column_headers()
1229
- self.refresh_jobs()
1230
-
1231
- def set_experiment(self, experiment_id: Optional[str]) -> None:
1232
- """Set the current experiment and refresh jobs"""
1233
- self.current_experiment = experiment_id
1234
- self.refresh_jobs()
1235
-
1236
- def refresh_jobs(self) -> None: # noqa: C901
1237
- """Refresh the jobs list from state provider"""
1238
- table = self.query_one("#jobs-table", DataTable)
1239
-
1240
- if not self.current_experiment:
1241
- return
1242
-
1243
- jobs = self.state_provider.get_jobs(self.current_experiment)
1244
- self.log.debug(
1245
- f"Refreshing jobs for {self.current_experiment}: {len(jobs)} jobs"
1246
- )
1247
-
1248
- # Apply filter if set
1249
- if self.filter_fn:
1250
- jobs = [j for j in jobs if self.filter_fn(j)]
1251
- self.log.debug(f"After filter: {len(jobs)} jobs")
1252
-
1253
- # Sort jobs based on selected column
1254
- if self._sort_column == "status":
1255
- # Sort by status priority, then by failure reason for errors
1256
- jobs.sort(
1257
- key=self._get_status_sort_key,
1258
- reverse=self._sort_reverse,
1259
- )
1260
- elif self._sort_column == "task":
1261
- # Sort by task name
1262
- jobs.sort(
1263
- key=lambda j: j.task_id or "",
1264
- reverse=self._sort_reverse,
1265
- )
1266
- else:
1267
- # Default: sort by submission time (oldest first by default)
1268
- # Jobs without submittime go to the end
1269
- jobs.sort(
1270
- key=lambda j: j.submittime or float("inf"),
1271
- reverse=self._sort_reverse,
1272
- )
1273
-
1274
- # Check if we need to rebuild (new/removed jobs, or status changed when sorting by status)
1275
- from datetime import datetime
1276
- import time as time_module
1277
-
1278
- existing_keys = {str(k.value) for k in table.rows.keys()}
1279
- current_job_ids = {job.identifier for job in jobs}
1280
-
1281
- # Check if job set changed
1282
- jobs_changed = existing_keys != current_job_ids
1283
-
1284
- # Check if status changed when sorting by status
1285
- status_changed = False
1286
- if self._sort_column == "status" and not jobs_changed:
1287
- current_statuses = {
1288
- job.identifier: (job.state.name if job.state else "unknown")
1289
- for job in jobs
1290
- }
1291
- if (
1292
- hasattr(self, "_last_statuses")
1293
- and self._last_statuses != current_statuses
1294
- ):
1295
- status_changed = True
1296
- self._last_statuses = current_statuses
1297
-
1298
- needs_rebuild = self._needs_rebuild or jobs_changed or status_changed
1299
- self._needs_rebuild = False
1300
-
1301
- # Build row data for all jobs
1302
- rows_data = {}
1303
- for job in jobs:
1304
- job_id = job.identifier
1305
- task_id = job.task_id
1306
- status = job.state.name if job.state else "unknown"
1307
-
1308
- # Format status with icon (and progress % if running)
1309
- if status == "running":
1310
- progress_list = job.progress or []
1311
- if progress_list:
1312
- last_progress = progress_list[-1]
1313
- progress_pct = last_progress.get("progress", 0) * 100
1314
- status_text = f"▶ {progress_pct:.0f}%"
1315
- else:
1316
- status_text = "▶"
1317
- else:
1318
- failure_reason = getattr(job, "failure_reason", None)
1319
- status_text = get_status_icon(status, failure_reason)
1320
-
1321
- # Format tags - show all tags on single line
1322
- tags = job.tags
1323
- if tags:
1324
- tags_text = Text()
1325
- for i, (k, v) in enumerate(tags.items()):
1326
- if i > 0:
1327
- tags_text.append(", ")
1328
- tags_text.append(f"{k}", style="bold")
1329
- tags_text.append(f"={v}")
1330
- else:
1331
- tags_text = Text("-")
1332
-
1333
- submitted = "-"
1334
- if job.submittime:
1335
- submitted = datetime.fromtimestamp(job.submittime).strftime(
1336
- "%Y-%m-%d %H:%M"
1337
- )
1338
-
1339
- # Calculate duration
1340
- start = job.starttime
1341
- end = job.endtime
1342
- duration = "-"
1343
- if start:
1344
- if end:
1345
- elapsed = end - start
1346
- else:
1347
- elapsed = time_module.time() - start
1348
- duration = self._format_duration(elapsed)
1349
-
1350
- job_id_short = job_id[:7]
1351
- rows_data[job_id] = (
1352
- job_id_short,
1353
- task_id,
1354
- status_text,
1355
- tags_text,
1356
- submitted,
1357
- duration,
1358
- )
1359
-
1360
- if needs_rebuild:
1361
- # Full rebuild needed - save selection, clear, rebuild
1362
- selected_key = None
1363
- if table.cursor_row is not None and table.row_count > 0:
1364
- try:
1365
- row_keys = list(table.rows.keys())
1366
- if table.cursor_row < len(row_keys):
1367
- selected_key = str(row_keys[table.cursor_row].value)
1368
- except (IndexError, KeyError):
1369
- pass
1370
-
1371
- table.clear()
1372
- new_cursor_row = None
1373
- for idx, job in enumerate(jobs):
1374
- job_id = job.identifier
1375
- table.add_row(*rows_data[job_id], key=job_id)
1376
- if selected_key == job_id:
1377
- new_cursor_row = idx
1378
-
1379
- if new_cursor_row is not None and table.row_count > 0:
1380
- table.move_cursor(row=new_cursor_row)
1381
- else:
1382
- # Just update cells in place - no reordering needed
1383
- for job_id, row_data in rows_data.items():
1384
- (
1385
- job_id_short,
1386
- task_id,
1387
- status_text,
1388
- tags_text,
1389
- submitted,
1390
- duration,
1391
- ) = row_data
1392
- table.update_cell(job_id, "job_id", job_id_short, update_width=True)
1393
- table.update_cell(job_id, "task", task_id, update_width=True)
1394
- table.update_cell(job_id, "status", status_text, update_width=True)
1395
- table.update_cell(job_id, "tags", tags_text, update_width=True)
1396
- table.update_cell(job_id, "submitted", submitted, update_width=True)
1397
- table.update_cell(job_id, "duration", duration, update_width=True)
1398
-
1399
- self.log.debug(
1400
- f"Jobs table now has {table.row_count} rows (rebuild={needs_rebuild})"
1401
- )
1402
-
1403
- def _format_duration(self, seconds: float) -> str:
1404
- """Format duration in seconds to human-readable string"""
1405
- if seconds < 0:
1406
- return "-"
1407
-
1408
- seconds = int(seconds)
1409
- if seconds < 60:
1410
- return f"{seconds}s"
1411
- elif seconds < 3600:
1412
- minutes = seconds // 60
1413
- secs = seconds % 60
1414
- return f"{minutes}m {secs}s"
1415
- elif seconds < 86400:
1416
- hours = seconds // 3600
1417
- minutes = (seconds % 3600) // 60
1418
- return f"{hours}h {minutes}m"
1419
- else:
1420
- days = seconds // 86400
1421
- hours = (seconds % 86400) // 3600
1422
- return f"{days}d {hours}h"
1423
-
1424
- def on_data_table_row_selected(self, event: DataTable.RowSelected) -> None:
1425
- """Handle job selection"""
1426
- if event.row_key and self.current_experiment:
1427
- job_id = str(event.row_key.value)
1428
- self.post_message(JobSelected(job_id, self.current_experiment))
1429
-
1430
-
1431
- class SizeCalculated(Message):
1432
- """Message sent when a folder size has been calculated"""
1433
-
1434
- def __init__(self, job_id: str, size: str, size_bytes: int) -> None:
1435
- super().__init__()
1436
- self.job_id = job_id
1437
- self.size = size
1438
- self.size_bytes = size_bytes
1439
-
1440
-
1441
- class OrphanJobsScreen(Screen):
1442
- """Screen for viewing and managing orphan jobs"""
1443
-
1444
- BINDINGS = [
1445
- Binding("d", "delete_selected", "Delete"),
1446
- Binding("D", "delete_all", "Delete All", key_display="D"),
1447
- Binding("escape", "go_back", "Back"),
1448
- Binding("q", "go_back", "Quit"),
1449
- Binding("r", "refresh", "Refresh"),
1450
- Binding("f", "copy_path", "Copy Path", show=False),
1451
- Binding("T", "sort_by_task", "Sort Task", show=False),
1452
- Binding("Z", "sort_by_size", "Sort Size", show=False),
1453
- ]
1454
-
1455
- _size_cache: dict = {} # Class-level cache (formatted strings)
1456
- _size_bytes_cache: dict = {} # Class-level cache (raw bytes for sorting)
1457
-
1458
- def __init__(self, state_provider: WorkspaceStateProvider) -> None:
1459
- super().__init__()
1460
- self.state_provider = state_provider
1461
- self.orphan_jobs = []
1462
- self._pending_jobs = [] # Jobs waiting for size calculation
1463
- self._sort_column: Optional[str] = None
1464
- self._sort_reverse: bool = False
1465
-
1466
- def compose(self) -> ComposeResult:
1467
- yield Header()
1468
- with Vertical(id="orphan-container"):
1469
- yield Static("Orphan Jobs", id="orphan-title")
1470
- yield Static("", id="orphan-stats")
1471
- yield DataTable(id="orphan-table", cursor_type="row")
1472
- yield Static("", id="orphan-job-info")
1473
- yield Footer()
1474
-
1475
- def on_mount(self) -> None:
1476
- """Initialize the orphan jobs table"""
1477
- table = self.query_one("#orphan-table", DataTable)
1478
- table.add_column("⚑", key="status", width=3)
1479
- table.add_column("Job ID", key="job_id", width=10)
1480
- table.add_column("Task", key="task")
1481
- table.add_column("Size", key="size", width=10)
1482
- self.refresh_orphans()
1483
-
1484
- def action_sort_by_task(self) -> None:
1485
- """Sort by task name"""
1486
- if self._sort_column == "task":
1487
- self._sort_reverse = not self._sort_reverse
1488
- else:
1489
- self._sort_column = "task"
1490
- self._sort_reverse = False
1491
- self._rebuild_table()
1492
- order = "desc" if self._sort_reverse else "asc"
1493
- self.notify(f"Sorted by task ({order})", severity="information")
1494
-
1495
- def action_sort_by_size(self) -> None:
1496
- """Sort by size"""
1497
- if self._sort_column == "size":
1498
- self._sort_reverse = not self._sort_reverse
1499
- else:
1500
- self._sort_column = "size"
1501
- self._sort_reverse = True # Default: largest first
1502
- self._rebuild_table()
1503
- order = "largest first" if self._sort_reverse else "smallest first"
1504
- self.notify(f"Sorted by size ({order})", severity="information")
1505
-
1506
- def _get_sorted_jobs(self):
1507
- """Return jobs sorted by current sort column"""
1508
- jobs = self.orphan_jobs[:]
1509
- if self._sort_column == "task":
1510
- jobs.sort(key=lambda j: j.task_id or "", reverse=self._sort_reverse)
1511
- elif self._sort_column == "size":
1512
- # Sort by raw bytes, jobs not in cache go to end
1513
- jobs.sort(
1514
- key=lambda j: self._size_bytes_cache.get(j.identifier, -1),
1515
- reverse=self._sort_reverse,
1516
- )
1517
- return jobs
1518
-
1519
- def _rebuild_table(self) -> None:
1520
- """Rebuild the table with current sort order"""
1521
- table = self.query_one("#orphan-table", DataTable)
1522
- table.clear()
1523
-
1524
- for job in self._get_sorted_jobs():
1525
- failure_reason = getattr(job, "failure_reason", None)
1526
- status_icon = get_status_icon(
1527
- job.state.name if job.state else "unknown", failure_reason
1528
- )
1529
- if job.identifier in self._size_cache:
1530
- size_text = self._size_cache[job.identifier]
1531
- else:
1532
- size_text = "waiting"
1533
- table.add_row(
1534
- status_icon,
1535
- job.identifier[:7],
1536
- job.task_id,
1537
- size_text,
1538
- key=job.identifier,
1539
- )
1540
-
1541
- def refresh_orphans(self) -> None:
1542
- """Refresh the orphan jobs list"""
1543
- # Only include orphan jobs that have an existing folder
1544
- all_orphans = self.state_provider.get_orphan_jobs()
1545
- self.orphan_jobs = [j for j in all_orphans if j.path and j.path.exists()]
1546
-
1547
- # Update stats
1548
- stats = self.query_one("#orphan-stats", Static)
1549
- stats.update(f"Found {len(self.orphan_jobs)} orphan jobs")
1550
-
1551
- # Collect jobs needing size calculation
1552
- self._pending_jobs = [
1553
- j for j in self.orphan_jobs if j.identifier not in self._size_cache
1554
- ]
1555
-
1556
- # Rebuild table
1557
- self._rebuild_table()
1558
-
1559
- # Start calculating sizes
1560
- if self._pending_jobs:
1561
- self._calculate_next_size()
1562
-
1563
- def _calculate_next_size(self) -> None:
1564
- """Calculate size for the next pending job using a worker"""
1565
- if not self._pending_jobs:
1566
- return
1567
-
1568
- job = self._pending_jobs.pop(0)
1569
- # Update to "calc..."
1570
- self._update_size_cell(job.identifier, "calc...")
1571
- # Run calculation in worker thread
1572
- self.run_worker(
1573
- self._calc_size_worker(job.identifier, job.path),
1574
- thread=True,
1575
- )
1576
-
1577
- async def _calc_size_worker(self, job_id: str, path):
1578
- """Worker to calculate folder size"""
1579
- size_bytes = await self._get_folder_size_async(path)
1580
- size_str = self._format_size(size_bytes)
1581
- self._size_cache[job_id] = size_str
1582
- self._size_bytes_cache[job_id] = size_bytes
1583
- self.post_message(SizeCalculated(job_id, size_str, size_bytes))
1584
-
1585
- def on_size_calculated(self, message: SizeCalculated) -> None:
1586
- """Handle size calculation completion"""
1587
- self._size_bytes_cache[message.job_id] = message.size_bytes
1588
- self._update_size_cell(message.job_id, message.size)
1589
- # Calculate next one
1590
- self._calculate_next_size()
1591
-
1592
- @staticmethod
1593
- async def _get_folder_size_async(path) -> int:
1594
- """Calculate total size of a folder using du command if available"""
1595
- import asyncio
1596
- import shutil
1597
- import sys
1598
-
1599
- # Try using du command for better performance
1600
- if shutil.which("du"):
1601
- try:
1602
- if sys.platform == "darwin":
1603
- # macOS: du -sk gives size in KB
1604
- proc = await asyncio.create_subprocess_exec(
1605
- "du",
1606
- "-sk",
1607
- str(path),
1608
- stdout=asyncio.subprocess.PIPE,
1609
- stderr=asyncio.subprocess.DEVNULL,
1610
- )
1611
- stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=30)
1612
- if proc.returncode == 0 and stdout:
1613
- # Output format: "SIZE\tPATH"
1614
- size_kb = int(stdout.decode().split()[0])
1615
- return size_kb * 1024
1616
- else:
1617
- # Linux: du -sb gives size in bytes
1618
- proc = await asyncio.create_subprocess_exec(
1619
- "du",
1620
- "-sb",
1621
- str(path),
1622
- stdout=asyncio.subprocess.PIPE,
1623
- stderr=asyncio.subprocess.DEVNULL,
1624
- )
1625
- stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=30)
1626
- if proc.returncode == 0 and stdout:
1627
- # Output format: "SIZE\tPATH"
1628
- return int(stdout.decode().split()[0])
1629
- except (asyncio.TimeoutError, ValueError, IndexError, OSError):
1630
- pass # Fall back to Python implementation
1631
-
1632
- # Fallback: Python implementation
1633
- return OrphanJobsScreen._get_folder_size_sync(path)
1634
-
1635
- @staticmethod
1636
- def _get_folder_size_sync(path) -> int:
1637
- """Calculate total size of a folder using Python (fallback)"""
1638
- total = 0
1639
- try:
1640
- for entry in path.rglob("*"):
1641
- if entry.is_file():
1642
- total += entry.stat().st_size
1643
- except (OSError, PermissionError):
1644
- pass
1645
- return total
1646
-
1647
- @staticmethod
1648
- def _format_size(size: int) -> str:
1649
- """Format size in human-readable format"""
1650
- for unit in ["B", "KB", "MB", "GB"]:
1651
- if size < 1024:
1652
- return f"{size:.1f}{unit}" if unit != "B" else f"{size}{unit}"
1653
- size /= 1024
1654
- return f"{size:.1f}TB"
1655
-
1656
- def _update_size_cell(self, job_id: str, value: str = None) -> None:
1657
- """Update the size cell for a job"""
1658
- try:
1659
- table = self.query_one("#orphan-table", DataTable)
1660
- size_text = (
1661
- value if value is not None else self._size_cache.get(job_id, "-")
1662
- )
1663
- table.update_cell(job_id, "size", size_text)
1664
- except Exception:
1665
- pass # Table may have changed
1666
-
1667
- def on_data_table_row_selected(self, event: DataTable.RowSelected) -> None:
1668
- """Show job details when a row is selected"""
1669
- self._update_job_info()
1670
-
1671
- def on_data_table_row_highlighted(self, event: DataTable.RowHighlighted) -> None:
1672
- """Show job details when cursor moves"""
1673
- self._update_job_info()
1674
-
1675
- def _update_job_info(self) -> None:
1676
- """Update the job info display"""
1677
- table = self.query_one("#orphan-table", DataTable)
1678
- info = self.query_one("#orphan-job-info", Static)
1679
-
1680
- if table.cursor_row is None:
1681
- info.update("")
1682
- return
1683
-
1684
- row_key = list(table.rows.keys())[table.cursor_row]
1685
- if row_key:
1686
- job_id = str(row_key.value)
1687
- job = next((j for j in self.orphan_jobs if j.identifier == job_id), None)
1688
- if job and job.path:
1689
- size = self._size_cache.get(job.identifier, "calculating...")
1690
- info.update(f"Path: {job.path} | Size: {size}")
1691
- else:
1692
- info.update("")
1693
-
1694
- def action_copy_path(self) -> None:
1695
- """Copy the job folder path to clipboard"""
1696
- import pyperclip
1697
-
1698
- table = self.query_one("#orphan-table", DataTable)
1699
- if table.cursor_row is None:
1700
- return
1701
-
1702
- row_key = list(table.rows.keys())[table.cursor_row]
1703
- if row_key:
1704
- job_id = str(row_key.value)
1705
- job = next((j for j in self.orphan_jobs if j.identifier == job_id), None)
1706
- if job and job.path:
1707
- try:
1708
- pyperclip.copy(str(job.path))
1709
- self.notify("Path copied", severity="information")
1710
- except Exception as e:
1711
- self.notify(f"Failed to copy: {e}", severity="error")
1712
-
1713
- def action_delete_selected(self) -> None:
1714
- """Delete the selected orphan job"""
1715
- table = self.query_one("#orphan-table", DataTable)
1716
- if table.cursor_row is None:
1717
- return
1718
-
1719
- row_key = list(table.rows.keys())[table.cursor_row]
1720
- if row_key:
1721
- job_id = str(row_key.value)
1722
- job = next((j for j in self.orphan_jobs if j.identifier == job_id), None)
1723
- if job:
1724
- self._delete_job(job)
1725
-
1726
- def _delete_job(self, job) -> None:
1727
- """Delete a single orphan job with confirmation"""
1728
-
1729
- def handle_delete(confirmed: bool) -> None:
1730
- if confirmed:
1731
- success, msg = self.state_provider.delete_job_safely(job)
1732
- if success:
1733
- self.notify(msg, severity="information")
1734
- self.refresh_orphans()
1735
- else:
1736
- self.notify(msg, severity="error")
1737
-
1738
- self.app.push_screen(
1739
- DeleteConfirmScreen("orphan job", job.identifier),
1740
- handle_delete,
1741
- )
1742
-
1743
- def action_delete_all(self) -> None:
1744
- """Delete all orphan jobs"""
1745
- if not self.orphan_jobs:
1746
- self.notify("No orphan jobs to delete", severity="warning")
1747
- return
1748
-
1749
- # Filter out running jobs
1750
- deletable_jobs = [j for j in self.orphan_jobs if not j.state.running()]
1751
-
1752
- if not deletable_jobs:
1753
- self.notify("All orphan jobs are running", severity="warning")
1754
- return
1755
-
1756
- def handle_delete_all(confirmed: bool) -> None:
1757
- if confirmed:
1758
- deleted = 0
1759
- for job in deletable_jobs:
1760
- success, _ = self.state_provider.delete_job_safely(
1761
- job, cascade_orphans=False
1762
- )
1763
- if success:
1764
- deleted += 1
1765
-
1766
- # Clean up orphan partials once at the end
1767
- self.state_provider.cleanup_orphan_partials(perform=True)
1768
-
1769
- self.notify(f"Deleted {deleted} orphan jobs", severity="information")
1770
- self.refresh_orphans()
1771
-
1772
- self.app.push_screen(
1773
- DeleteConfirmScreen(
1774
- "all orphan jobs",
1775
- f"{len(deletable_jobs)} jobs",
1776
- "This action cannot be undone",
1777
- ),
1778
- handle_delete_all,
1779
- )
1780
-
1781
- def action_refresh(self) -> None:
1782
- """Refresh the orphan jobs list"""
1783
- self.refresh_orphans()
1784
-
1785
- def action_go_back(self) -> None:
1786
- """Go back to main screen"""
1787
- self.dismiss()
1788
-
1789
-
1790
- class HelpScreen(ModalScreen[None]):
1791
- """Modal screen showing keyboard shortcuts"""
1792
-
1793
- BINDINGS = [
1794
- Binding("escape", "close", "Close"),
1795
- Binding("?", "close", "Close"),
1796
- ]
1797
-
1798
- def compose(self) -> ComposeResult:
1799
- from textual.containers import VerticalScroll
1800
-
1801
- help_text = """
1802
- [bold]Keyboard Shortcuts[/bold]
1803
-
1804
- [bold cyan]Navigation[/bold cyan]
1805
- q Quit application
1806
- Esc Go back / Close dialog
1807
- r Refresh data
1808
- ? Show this help
1809
- j Switch to Jobs tab
1810
- s Switch to Services tab
1811
-
1812
- [bold cyan]Experiments[/bold cyan]
1813
- Enter Select experiment
1814
- d Delete experiment
1815
- k Kill all running jobs
1816
-
1817
- [bold cyan]Jobs[/bold cyan]
1818
- l View job logs
1819
- d Delete job
1820
- k Kill running job
1821
- / Open search filter
1822
- c Clear search filter
1823
- S Sort by status
1824
- T Sort by task
1825
- D Sort by date
1826
- f Copy folder path
1827
-
1828
- [bold cyan]Services[/bold cyan]
1829
- s Start service
1830
- x Stop service
1831
- u Copy URL
1832
-
1833
- [bold cyan]Search Filter[/bold cyan]
1834
- Enter Apply filter
1835
- Esc Close and clear filter
1836
-
1837
- [bold cyan]Orphan Jobs[/bold cyan]
1838
- o Show orphan jobs
1839
- T Sort by task
1840
- Z Sort by size
1841
- d Delete selected
1842
- D Delete all
1843
- f Copy folder path
1844
- """
1845
- with Vertical(id="help-dialog"):
1846
- yield Static("Experimaestro Help", id="help-title")
1847
- with VerticalScroll(id="help-scroll"):
1848
- yield Static(help_text, id="help-content")
1849
- yield Button("Close", id="help-close-btn")
1850
-
1851
- def on_button_pressed(self, event: Button.Pressed) -> None:
1852
- self.dismiss()
1853
-
1854
- def action_close(self) -> None:
1855
- self.dismiss()
29
+ from experimaestro.tui.utils import format_duration, get_status_icon # noqa: F401
30
+ from experimaestro.tui.messages import (
31
+ ExperimentSelected,
32
+ ExperimentDeselected,
33
+ JobSelected,
34
+ JobDeselected,
35
+ ViewJobLogs,
36
+ ViewJobLogsRequest,
37
+ DeleteJobRequest,
38
+ DeleteExperimentRequest,
39
+ KillJobRequest,
40
+ KillExperimentRequest,
41
+ FilterChanged, # noqa: F401
42
+ SearchApplied, # noqa: F401
43
+ SizeCalculated, # noqa: F401
44
+ ShowRunsRequest,
45
+ RunSelected,
46
+ )
47
+ from experimaestro.tui.dialogs import (
48
+ QuitConfirmScreen,
49
+ DeleteConfirmScreen,
50
+ KillConfirmScreen,
51
+ HelpScreen,
52
+ )
53
+ from experimaestro.tui.widgets import (
54
+ CaptureLog,
55
+ ExperimentsList,
56
+ ServicesList,
57
+ JobsTable,
58
+ JobDetailView,
59
+ RunsList,
60
+ GlobalServiceSyncs,
61
+ )
62
+ from experimaestro.tui.widgets.stray_jobs import OrphanJobsTab
1856
63
 
1857
64
 
1858
65
  class ExperimaestroUI(App):
@@ -1866,7 +73,6 @@ class ExperimaestroUI(App):
1866
73
  Binding("?", "show_help", "Help"),
1867
74
  Binding("escape", "go_back", "Back", show=False),
1868
75
  Binding("l", "view_logs", "Logs", show=False),
1869
- Binding("o", "show_orphans", "Orphans", show=False),
1870
76
  Binding("j", "focus_jobs", "Jobs", show=False),
1871
77
  Binding("s", "focus_services", "Services", show=False),
1872
78
  ]
@@ -1875,15 +81,19 @@ class ExperimaestroUI(App):
1875
81
  self,
1876
82
  workdir: Optional[Path] = None,
1877
83
  watch: bool = True,
1878
- state_provider: Optional[WorkspaceStateProvider] = None,
84
+ state_provider: Optional[StateProvider] = None,
1879
85
  show_logs: bool = False,
1880
86
  ):
1881
87
  """Initialize the TUI
1882
88
 
1883
89
  Args:
1884
- workdir: Workspace directory (required if state_provider not provided)
90
+ workdir: Workspace directory (required if state_provider not provided
91
+ and not using deferred mode)
1885
92
  watch: Enable filesystem watching for workspace mode
1886
- state_provider: Pre-initialized state provider (for active experiments)
93
+ state_provider: Pre-initialized state provider (for active experiments).
94
+ If None and workdir is provided, creates a WorkspaceStateProvider.
95
+ If None and workdir is None, starts in deferred mode (logs only)
96
+ and waits for set_state_provider() to be called.
1887
97
  show_logs: Whether to show the logs tab (for active experiments)
1888
98
  """
1889
99
  super().__init__()
@@ -1891,46 +101,73 @@ class ExperimaestroUI(App):
1891
101
  self.watch = watch
1892
102
  self.show_logs = show_logs
1893
103
  self._listener_registered = False
104
+ self._monitor_mounted = False
1894
105
 
1895
106
  # Initialize state provider before compose
1896
107
  if state_provider:
1897
108
  self.state_provider = state_provider
1898
- self.owns_provider = False # Don't close external provider
1899
- self._has_active_experiment = True # External provider = active experiment
1900
- else:
1901
- from experimaestro.scheduler.state_provider import WorkspaceStateProvider
109
+ elif workdir:
110
+ from experimaestro.scheduler.workspace_state_provider import (
111
+ WorkspaceStateProvider,
112
+ )
1902
113
 
1903
114
  # Get singleton provider instance for this workspace
1904
- self.state_provider = WorkspaceStateProvider.get_instance(
1905
- self.workdir,
1906
- read_only=False,
1907
- sync_on_start=True,
1908
- sync_interval_minutes=5,
1909
- )
1910
- self.owns_provider = False # Provider is singleton, don't close
1911
- self._has_active_experiment = False # Just viewing, no active experiment
115
+ self.state_provider = WorkspaceStateProvider.get_instance(self.workdir)
116
+ else:
117
+ # Deferred mode: no provider yet, will be set later via set_state_provider()
118
+ self.state_provider = None
119
+
120
+ # Set subtitle to show scheduler status
121
+ self._update_scheduler_status()
122
+
123
+ def _update_scheduler_status(self) -> None:
124
+ """Update the subtitle to reflect scheduler status"""
125
+ if self.state_provider is None:
126
+ self.sub_title = "○ Waiting for experiment..."
127
+ elif self.state_provider.is_live:
128
+ self.sub_title = "● Running experiment"
129
+ else:
130
+ self.sub_title = "○ Monitoring workspace"
1912
131
 
1913
132
  def compose(self) -> ComposeResult:
1914
133
  """Compose the TUI layout"""
1915
134
  yield Header()
1916
135
 
1917
- if self.show_logs:
1918
- # Tabbed layout with logs
136
+ if self.state_provider is None:
137
+ # Deferred mode: only show logs, monitor will be added later
138
+ with TabbedContent(id="main-tabs"):
139
+ with TabPane("Logs", id="logs-tab"):
140
+ yield CaptureLog(id="logs", auto_scroll=True, wrap=True)
141
+ elif self.show_logs:
142
+ # Tabbed layout with logs and services
1919
143
  with TabbedContent(id="main-tabs"):
1920
144
  with TabPane("Monitor", id="monitor-tab"):
1921
145
  yield from self._compose_monitor_view()
146
+ with TabPane("Services (0)", id="services-sync-tab"):
147
+ yield GlobalServiceSyncs(self.state_provider)
148
+ with TabPane("Orphans (0)", id="orphan-tab"):
149
+ yield OrphanJobsTab(self.state_provider)
1922
150
  with TabPane("Logs", id="logs-tab"):
1923
151
  yield CaptureLog(id="logs", auto_scroll=True, wrap=True)
152
+ self._monitor_mounted = True
1924
153
  else:
1925
- # Simple layout without logs
1926
- with Vertical(id="main-container"):
1927
- yield from self._compose_monitor_view()
154
+ # Simple layout without logs but with services
155
+ with TabbedContent(id="main-tabs"):
156
+ with TabPane("Monitor", id="monitor-tab"):
157
+ yield from self._compose_monitor_view()
158
+ with TabPane("Services (0)", id="services-sync-tab"):
159
+ yield GlobalServiceSyncs(self.state_provider)
160
+ with TabPane("Orphans (0)", id="orphan-tab"):
161
+ yield OrphanJobsTab(self.state_provider)
162
+ self._monitor_mounted = True
1928
163
 
1929
164
  yield Footer()
1930
165
 
1931
166
  def _compose_monitor_view(self):
1932
- """Compose the monitor view with experiments, jobs/services tabs, and job details"""
167
+ """Compose the monitor view with experiments, runs, jobs/services tabs, and job details"""
1933
168
  yield ExperimentsList(self.state_provider)
169
+ # Runs list (hidden initially, shown when 'd' pressed on experiment)
170
+ yield RunsList(self.state_provider)
1934
171
  # Tabbed view for jobs and services (hidden initially)
1935
172
  with TabbedContent(id="experiment-tabs", classes="hidden"):
1936
173
  with TabPane("Jobs", id="jobs-tab"):
@@ -1946,9 +183,10 @@ class ExperimaestroUI(App):
1946
183
  # Resets logging
1947
184
  logging.basicConfig(level=logging.INFO, force=True)
1948
185
 
1949
- # Get the widgets
1950
- experiments_list = self.query_one(ExperimentsList)
1951
- experiments_list.refresh_experiments()
186
+ # If monitor is mounted, refresh experiments
187
+ if self._monitor_mounted:
188
+ experiments_list = self.query_one(ExperimentsList)
189
+ experiments_list.refresh_experiments()
1952
190
 
1953
191
  # Register as listener for state change notifications
1954
192
  # The state provider handles its own notification strategy internally
@@ -1957,7 +195,154 @@ class ExperimaestroUI(App):
1957
195
  self._listener_registered = True
1958
196
  self.log("Registered state listener for notifications")
1959
197
 
1960
- def _on_state_event(self, event: StateEvent) -> None:
198
+ def set_state_provider(self, state_provider: StateProvider) -> None:
199
+ """Set the state provider and mount monitor widgets (for deferred mode)
200
+
201
+ Call this method from a background thread after starting the experiment.
202
+ The TUI will add the Monitor, Services, and Orphans tabs.
203
+
204
+ Args:
205
+ state_provider: The state provider (typically the Scheduler)
206
+ """
207
+ self.state_provider = state_provider
208
+ self._update_scheduler_status()
209
+
210
+ # Mount monitor widgets if not already done
211
+ if not self._monitor_mounted:
212
+ self._mount_monitor_widgets()
213
+
214
+ # Register listener
215
+ if not self._listener_registered:
216
+ self.state_provider.add_listener(self._on_state_event)
217
+ self._listener_registered = True
218
+ self.log("Registered state listener for notifications")
219
+
220
+ def _mount_monitor_widgets(self) -> None:
221
+ """Mount the monitor widgets dynamically (for deferred mode)"""
222
+ tabs = self.query_one("#main-tabs", TabbedContent)
223
+
224
+ # Create monitor pane with all its children composed
225
+ monitor_pane = TabPane("Monitor", id="monitor-tab")
226
+ tabs.add_pane(monitor_pane, before="logs-tab")
227
+
228
+ # Create widgets
229
+ experiments_list = ExperimentsList(self.state_provider)
230
+ runs_list = RunsList(self.state_provider)
231
+ jobs_table = JobsTable(self.state_provider)
232
+ services_list = ServicesList(self.state_provider)
233
+ job_detail_view = JobDetailView(self.state_provider)
234
+
235
+ # Mount experiments and runs lists
236
+ monitor_pane.mount(experiments_list)
237
+ monitor_pane.mount(runs_list)
238
+
239
+ # Create experiment tabs with children using compose_add_child
240
+ experiment_tabs = TabbedContent(id="experiment-tabs", classes="hidden")
241
+ jobs_pane = TabPane("Jobs", id="jobs-tab")
242
+ services_pane = TabPane("Services", id="services-tab")
243
+ jobs_pane.compose_add_child(jobs_table)
244
+ services_pane.compose_add_child(services_list)
245
+ experiment_tabs.compose_add_child(jobs_pane)
246
+ experiment_tabs.compose_add_child(services_pane)
247
+ monitor_pane.mount(experiment_tabs)
248
+
249
+ # Create job detail container
250
+ job_detail_container = Vertical(id="job-detail-container", classes="hidden")
251
+ job_detail_container.compose_add_child(job_detail_view)
252
+ monitor_pane.mount(job_detail_container)
253
+
254
+ # Create and mount services sync tab
255
+ services_sync_pane = TabPane("Services (0)", id="services-sync-tab")
256
+ services_sync_pane.compose_add_child(GlobalServiceSyncs(self.state_provider))
257
+ tabs.add_pane(services_sync_pane, before="logs-tab")
258
+
259
+ # Create and mount orphans tab (only if not live)
260
+ if not self.state_provider.is_live:
261
+ orphan_pane = TabPane("Orphans (0)", id="orphan-tab")
262
+ orphan_pane.compose_add_child(OrphanJobsTab(self.state_provider))
263
+ tabs.add_pane(orphan_pane, before="logs-tab")
264
+
265
+ self._monitor_mounted = True
266
+
267
+ # Refresh experiments list
268
+ experiments_list.refresh_experiments()
269
+
270
+ def update_services_tab_title(self) -> None:
271
+ """Update the Services tab title with running service count"""
272
+ try:
273
+ # Count running services from state provider
274
+ from experimaestro.scheduler.services import ServiceState
275
+
276
+ all_services = self.state_provider.get_services()
277
+ running_count = sum(
278
+ 1
279
+ for s in all_services
280
+ if hasattr(s, "state") and s.state == ServiceState.RUNNING
281
+ )
282
+
283
+ # Find and update the tab pane title
284
+ tabs = self.query_one("#main-tabs", TabbedContent)
285
+ tab = tabs.get_tab("services-sync-tab")
286
+ if tab:
287
+ tab.label = f"Services ({running_count})"
288
+ except Exception:
289
+ pass
290
+
291
+ def update_orphan_tab_title(self) -> None:
292
+ """Update the Orphans tab title with orphan job count
293
+
294
+ Format: Orphans (X/Y) where X=running (stray), Y=non-running (finished)
295
+ """
296
+ try:
297
+ orphan_tab = self.query_one(OrphanJobsTab)
298
+ running = orphan_tab.running_count
299
+ finished = orphan_tab.finished_count
300
+ # Find and update the tab pane title
301
+ tabs = self.query_one("#main-tabs", TabbedContent)
302
+ tab = tabs.get_tab("orphan-tab")
303
+ if tab:
304
+ tab.label = f"Orphans ({running}/{finished})"
305
+ except Exception:
306
+ pass
307
+
308
+ def update_logs_tab_title(self) -> None:
309
+ """Update the Logs tab title to show unread indicator (bold when unread)"""
310
+ if not self.show_logs:
311
+ return
312
+ try:
313
+ from rich.text import Text
314
+
315
+ log_widget = self.query_one(CaptureLog)
316
+ tabs = self.query_one("#main-tabs", TabbedContent)
317
+ tab = tabs.get_tab("logs-tab")
318
+ if tab:
319
+ if log_widget.has_unread:
320
+ tab.label = Text("Logs *", style="bold")
321
+ else:
322
+ tab.label = "Logs"
323
+ except Exception:
324
+ pass
325
+
326
+ def on_tabbed_content_tab_activated(
327
+ self, event: TabbedContent.TabActivated
328
+ ) -> None:
329
+ """Handle tab switching"""
330
+ # event.pane is the TabPane, event.tab is the Tab widget (header)
331
+ if event.pane.id == "logs-tab" and self.show_logs:
332
+ try:
333
+ log_widget = self.query_one(CaptureLog)
334
+ log_widget.mark_as_read()
335
+ except Exception:
336
+ pass
337
+ elif event.pane.id == "services-sync-tab":
338
+ # Refresh global services when switching to Services tab
339
+ try:
340
+ global_services = self.query_one(GlobalServiceSyncs)
341
+ global_services.refresh_services()
342
+ except Exception:
343
+ pass
344
+
345
+ def _on_state_event(self, event: EventBase) -> None:
1961
346
  """Handle state change events from the state provider
1962
347
 
1963
348
  This may be called from the state provider's thread or the main thread,
@@ -1965,6 +350,8 @@ class ExperimaestroUI(App):
1965
350
  """
1966
351
  import threading
1967
352
 
353
+ self.log.info(f"_on_state_event called with: {type(event).__name__}")
354
+
1968
355
  if threading.current_thread() is threading.main_thread():
1969
356
  # Already in main thread, call directly
1970
357
  self._handle_state_event(event)
@@ -1972,59 +359,148 @@ class ExperimaestroUI(App):
1972
359
  # From background thread, use call_from_thread
1973
360
  self.call_from_thread(self._handle_state_event, event)
1974
361
 
1975
- def _handle_state_event(self, event: StateEvent) -> None:
1976
- """Process state event on the main thread"""
1977
- # Use query() instead of query_one() to avoid NoMatches exception
1978
- # when widgets aren't visible yet
1979
- jobs_tables = self.query(JobsTable)
1980
- services_lists = self.query(ServicesList)
362
+ def _handle_state_event(self, event: EventBase) -> None:
363
+ """Process state event on the main thread using handler dispatch"""
364
+ self.log.info(f"State event: {event}")
1981
365
 
1982
- self.log.debug(
1983
- f"State event {event.event_type.name}, "
1984
- f"JobsTable found: {len(jobs_tables)}, ServicesList found: {len(services_lists)}"
366
+ # Dispatch to handler if one exists for this event type
367
+ handler = self.STATE_EVENT_HANDLERS.get(type(event))
368
+ if handler:
369
+ self.log.info(f"Dispatching to handler: {handler.__name__}")
370
+ try:
371
+ handler(self, event)
372
+ except Exception as e:
373
+ self.log.error(f"Error in handler: {e}")
374
+ else:
375
+ self.log.warning(f"No handler for event type: {type(event).__name__}")
376
+
377
+ def _handle_experiment_updated(self, event: ExperimentUpdatedEvent) -> None:
378
+ """Handle ExperimentUpdatedEvent - refresh experiments list and jobs"""
379
+ for exp_list in self.query(ExperimentsList):
380
+ exp_list.refresh_experiments()
381
+
382
+ # Also refresh jobs table if we're viewing the affected experiment
383
+ # (this handles the case when experiment finishes and events are deleted)
384
+ for jobs_table in self.query(JobsTable):
385
+ if jobs_table.current_experiment == event.experiment_id:
386
+ jobs_table.refresh_jobs()
387
+
388
+ def _handle_run_updated(self, event: RunUpdatedEvent) -> None:
389
+ """Handle RunUpdatedEvent - refresh experiments list"""
390
+ for exp_list in self.query(ExperimentsList):
391
+ exp_list.refresh_experiments()
392
+
393
+ def _handle_service_added(self, event: ServiceAddedEvent) -> None:
394
+ """Handle ServiceAddedEvent - refresh services list and update tab title"""
395
+ event_exp_id = event.experiment_id
396
+ self.log.info(
397
+ f"ServiceAddedEvent received: exp={event_exp_id}, service={event.service_id}"
1985
398
  )
1986
399
 
1987
- if event.event_type == StateEventType.EXPERIMENT_UPDATED:
1988
- # Refresh experiments list
1989
- for exp_list in self.query(ExperimentsList):
1990
- exp_list.refresh_experiments()
400
+ # Refresh the global services widget
401
+ try:
402
+ global_services = self.query_one(GlobalServiceSyncs)
403
+ self.log.info("Calling GlobalServiceSyncs.refresh_services()")
404
+ global_services.refresh_services()
405
+ except Exception as e:
406
+ self.log.warning(f"Failed to refresh global services: {e}")
1991
407
 
1992
- elif event.event_type == StateEventType.JOB_UPDATED:
1993
- event_exp_id = event.data.get("experimentId")
408
+ # Refresh per-experiment services list
409
+ for services_list in self.query(ServicesList):
410
+ if services_list.current_experiment == event_exp_id:
411
+ services_list.refresh_services()
1994
412
 
1995
- # Refresh jobs table if we're viewing the affected experiment
1996
- for jobs_table in jobs_tables:
1997
- if jobs_table.current_experiment == event_exp_id:
1998
- jobs_table.refresh_jobs()
413
+ def _handle_service_state_changed(self, event: ServiceStateChangedEvent) -> None:
414
+ """Handle ServiceStateChangedEvent - update tab title when service state changes"""
415
+ # Update the Services tab title (running count may have changed)
416
+ self.update_services_tab_title()
1999
417
 
2000
- # Also refresh job detail if we're viewing the affected job
2001
- for job_detail_container in self.query("#job-detail-container"):
2002
- if not job_detail_container.has_class("hidden"):
2003
- for job_detail_view in self.query(JobDetailView):
2004
- event_job_id = event.data.get("jobId")
2005
- if job_detail_view.current_job_id == event_job_id:
2006
- job_detail_view.refresh_job_detail()
418
+ # Also refresh global services widget if visible
419
+ try:
420
+ global_services = self.query_one(GlobalServiceSyncs)
421
+ global_services.refresh_services()
422
+ except Exception:
423
+ pass
2007
424
 
2008
- # Also update the experiment stats in the experiments list
2009
- for exp_list in self.query(ExperimentsList):
2010
- exp_list.refresh_experiments()
425
+ # Refresh per-experiment services list
426
+ for services_list in self.query(ServicesList):
427
+ if services_list.current_experiment == event.experiment_id:
428
+ services_list.refresh_services()
429
+
430
+ def _handle_job_submitted(self, event: JobSubmittedEvent) -> None:
431
+ """Handle JobSubmittedEvent - update tags, dependencies, and refresh job list"""
432
+ event_exp_id = event.experiment_id
433
+
434
+ # Update tags_map, dependencies_map, and refresh jobs for the affected experiment
435
+ for jobs_table in self.query(JobsTable):
436
+ if jobs_table.current_experiment == event_exp_id:
437
+ # Add the new job's tags to the cache
438
+ if event.tags:
439
+ jobs_table.tags_map[event.job_id] = {
440
+ tag.key: tag.value for tag in event.tags
441
+ }
442
+ # Add the new job's dependencies to the cache
443
+ if event.depends_on:
444
+ jobs_table.dependencies_map[event.job_id] = event.depends_on
445
+ # Refresh to show the new job
446
+ jobs_table.refresh_jobs()
447
+
448
+ # Also update experiment stats
449
+ for exp_list in self.query(ExperimentsList):
450
+ exp_list.refresh_experiments()
451
+
452
+ def _handle_job_state_changed(self, event: JobStateChangedEvent) -> None:
453
+ """Handle JobStateChangedEvent - refresh job display
454
+
455
+ This event is dispatched once per job state change.
456
+ Used for progress updates and state changes from job processes.
457
+ """
458
+ # Refresh all jobs tables that might contain this job
459
+ for jobs_table in self.query(JobsTable):
460
+ jobs_table.refresh_jobs()
2011
461
 
2012
- elif event.event_type == StateEventType.RUN_UPDATED:
2013
- # Refresh experiments list to show updated run info
2014
- for exp_list in self.query(ExperimentsList):
2015
- exp_list.refresh_experiments()
462
+ # Also refresh job detail if we're viewing this job
463
+ for job_detail_container in self.query("#job-detail-container"):
464
+ if not job_detail_container.has_class("hidden"):
465
+ for job_detail_view in self.query(JobDetailView):
466
+ if job_detail_view.current_job_id == event.job_id:
467
+ job_detail_view.refresh_job_detail()
2016
468
 
2017
- elif event.event_type == StateEventType.SERVICE_UPDATED:
2018
- event_exp_id = event.data.get("experimentId")
469
+ # Also update the experiment stats in the experiments list
470
+ for exp_list in self.query(ExperimentsList):
471
+ exp_list.refresh_experiments()
2019
472
 
2020
- # Refresh services list if we're viewing the affected experiment
2021
- for services_list in services_lists:
2022
- if services_list.current_experiment == event_exp_id:
2023
- services_list.refresh_services()
473
+ def _handle_job_progress(self, event: JobProgressEvent) -> None:
474
+ """Handle JobProgressEvent - refresh job progress display
475
+
476
+ This event is dispatched when a job reports progress updates.
477
+ """
478
+ # Refresh all jobs tables that might contain this job
479
+ for jobs_table in self.query(JobsTable):
480
+ jobs_table.refresh_jobs()
481
+
482
+ # Also refresh job detail if we're viewing this job
483
+ for job_detail_container in self.query("#job-detail-container"):
484
+ if not job_detail_container.has_class("hidden"):
485
+ for job_detail_view in self.query(JobDetailView):
486
+ if job_detail_view.current_job_id == event.job_id:
487
+ job_detail_view.refresh_job_detail()
488
+
489
+ STATE_EVENT_HANDLERS = {
490
+ ExperimentUpdatedEvent: _handle_experiment_updated,
491
+ JobStateChangedEvent: _handle_job_state_changed,
492
+ JobProgressEvent: _handle_job_progress,
493
+ RunUpdatedEvent: _handle_run_updated,
494
+ ServiceAddedEvent: _handle_service_added,
495
+ ServiceStateChangedEvent: _handle_service_state_changed,
496
+ JobSubmittedEvent: _handle_job_submitted,
497
+ }
2024
498
 
2025
499
  def on_experiment_selected(self, message: ExperimentSelected) -> None:
2026
500
  """Handle experiment selection - show jobs/services tabs"""
2027
- self.log(f"Experiment selected: {message.experiment_id}")
501
+ self.log(
502
+ f"Experiment selected: {message.experiment_id} (run: {message.run_id})"
503
+ )
2028
504
 
2029
505
  # Set up services list
2030
506
  services_list = self.query_one(ServicesList)
@@ -2032,7 +508,7 @@ class ExperimaestroUI(App):
2032
508
 
2033
509
  # Set up jobs table
2034
510
  jobs_table_widget = self.query_one(JobsTable)
2035
- jobs_table_widget.set_experiment(message.experiment_id)
511
+ jobs_table_widget.set_experiment(message.experiment_id, message.run_id)
2036
512
 
2037
513
  # Show the tabbed content
2038
514
  tabs = self.query_one("#experiment-tabs", TabbedContent)
@@ -2117,63 +593,28 @@ class ExperimaestroUI(App):
2117
593
  job_detail_view = self.query_one(JobDetailView)
2118
594
  job_detail_view.action_view_logs()
2119
595
 
2120
- def action_show_orphans(self) -> None:
2121
- """Show orphan jobs screen"""
2122
- self.push_screen(OrphanJobsScreen(self.state_provider))
2123
-
2124
- @work(thread=True, exclusive=True)
2125
- def _sync_and_view_logs(self, job_path: Path, task_id: str) -> None:
2126
- """Sync logs from remote and then view them (runs in worker thread)"""
2127
- try:
2128
- # Sync the job directory
2129
- local_path = self.state_provider.sync_path(str(job_path))
2130
- if not local_path:
2131
- self.post_message(LogsSyncFailed("Failed to sync logs from remote"))
2132
- return
2133
-
2134
- job_path = local_path
2135
-
2136
- # Log files are named after the last part of the task ID
2137
- task_name = task_id.split(".")[-1]
2138
- stdout_path = job_path / f"{task_name}.out"
2139
- stderr_path = job_path / f"{task_name}.err"
2140
-
2141
- # Collect existing log files
2142
- log_files = []
2143
- if stdout_path.exists():
2144
- log_files.append(str(stdout_path))
2145
- if stderr_path.exists():
2146
- log_files.append(str(stderr_path))
2147
-
2148
- if not log_files:
2149
- self.post_message(
2150
- LogsSyncFailed(f"No log files found: {task_name}.out/.err")
2151
- )
2152
- return
2153
-
2154
- # Signal completion via message
2155
- job_id = job_path.name
2156
- self.post_message(LogsSyncComplete(log_files, job_id))
2157
-
2158
- except Exception as e:
2159
- self.post_message(LogsSyncFailed(str(e)))
2160
-
2161
- def on_logs_sync_complete(self, message: LogsSyncComplete) -> None:
2162
- """Handle successful log sync - show log viewer"""
2163
- self.push_screen(LogViewerScreen(message.log_files, message.job_id))
2164
-
2165
- def on_logs_sync_failed(self, message: LogsSyncFailed) -> None:
2166
- """Handle failed log sync"""
2167
- self.notify(message.error, severity="warning")
2168
-
2169
596
  def on_view_job_logs(self, message: ViewJobLogs) -> None:
2170
- """Handle request to view job logs - push LogViewerScreen"""
597
+ """Handle request to view job logs - push LogViewerScreen
598
+
599
+ For remote monitoring, switches to log viewer immediately with loading state,
600
+ then starts adaptive sync in background.
601
+ """
2171
602
  job_path = Path(message.job_path)
603
+ job_id = job_path.name
2172
604
 
2173
- # For remote monitoring, sync the job directory first (in worker thread)
605
+ # For remote monitoring, switch screen immediately with loading state
2174
606
  if self.state_provider.is_remote:
2175
- self.notify("Syncing logs from remote...", timeout=5)
2176
- self._sync_and_view_logs(job_path, message.task_id)
607
+ # Push screen immediately - it will handle sync and show loading state
608
+ self.push_screen(
609
+ LogViewerScreen(
610
+ log_files=[], # Will be populated after sync
611
+ job_id=job_id,
612
+ sync_func=self.state_provider.sync_path,
613
+ remote_path=str(job_path),
614
+ task_id=message.task_id,
615
+ job_state=message.job_state,
616
+ )
617
+ )
2177
618
  return
2178
619
 
2179
620
  # Local monitoring - no sync needed
@@ -2196,7 +637,6 @@ class ExperimaestroUI(App):
2196
637
  return
2197
638
 
2198
639
  # Push the log viewer screen
2199
- job_id = job_path.name
2200
640
  self.push_screen(LogViewerScreen(log_files, job_id))
2201
641
 
2202
642
  def on_view_job_logs_request(self, message: ViewJobLogsRequest) -> None:
@@ -2205,7 +645,7 @@ class ExperimaestroUI(App):
2205
645
  if not job or not job.path or not job.task_id:
2206
646
  self.notify("Cannot find job logs", severity="warning")
2207
647
  return
2208
- self.post_message(ViewJobLogs(str(job.path), job.task_id))
648
+ self.post_message(ViewJobLogs(str(job.path), job.task_id, job.state))
2209
649
 
2210
650
  def on_delete_job_request(self, message: DeleteJobRequest) -> None:
2211
651
  """Handle job deletion request"""
@@ -2330,6 +770,42 @@ class ExperimaestroUI(App):
2330
770
  handle_kill_response,
2331
771
  )
2332
772
 
773
+ def on_show_runs_request(self, message: ShowRunsRequest) -> None:
774
+ """Handle request to show experiment runs"""
775
+ runs_list = self.query_one(RunsList)
776
+ runs_list.set_experiment(message.experiment_id, message.current_run_id)
777
+
778
+ def on_run_selected(self, message: RunSelected) -> None:
779
+ """Handle run selection - show jobs for the selected run"""
780
+ self.log(
781
+ f"Run selected: {message.run_id} (current={message.is_current}) "
782
+ f"for {message.experiment_id}"
783
+ )
784
+
785
+ # Set up jobs table with the selected run
786
+ jobs_table_widget = self.query_one(JobsTable)
787
+ jobs_table_widget.set_experiment(
788
+ message.experiment_id,
789
+ message.run_id,
790
+ is_past_run=not message.is_current,
791
+ )
792
+
793
+ # Set up services list
794
+ services_list = self.query_one(ServicesList)
795
+ services_list.set_experiment(message.experiment_id)
796
+
797
+ # Show the tabbed content
798
+ tabs = self.query_one("#experiment-tabs", TabbedContent)
799
+ tabs.remove_class("hidden")
800
+
801
+ # Collapse experiments list
802
+ experiments_list = self.query_one(ExperimentsList)
803
+ experiments_list.collapse_to_experiment(message.experiment_id)
804
+
805
+ # Focus the jobs table
806
+ jobs_table = self.query_one("#jobs-table", DataTable)
807
+ jobs_table.focus()
808
+
2333
809
  def action_focus_jobs(self) -> None:
2334
810
  """Switch to the jobs tab"""
2335
811
  tabs = self.query_one("#experiment-tabs", TabbedContent)
@@ -2374,7 +850,7 @@ class ExperimaestroUI(App):
2374
850
  self.exit()
2375
851
 
2376
852
  self.push_screen(
2377
- QuitConfirmScreen(has_active_experiment=self._has_active_experiment),
853
+ QuitConfirmScreen(has_active_experiment=self.state_provider.is_live),
2378
854
  handle_quit_response,
2379
855
  )
2380
856
 
@@ -2389,7 +865,3 @@ class ExperimaestroUI(App):
2389
865
  self.state_provider.remove_listener(self._on_state_event)
2390
866
  self._listener_registered = False
2391
867
  self.log("Unregistered state listener")
2392
-
2393
- # Only close state provider if we own it (not external/active experiment)
2394
- if self.state_provider and self.owns_provider:
2395
- self.state_provider.close()