experimaestro 2.0.0b4__py3-none-any.whl → 2.0.0b17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of experimaestro might be problematic. Click here for more details.

Files changed (154):
  1. experimaestro/__init__.py +12 -5
  2. experimaestro/cli/__init__.py +393 -134
  3. experimaestro/cli/filter.py +48 -23
  4. experimaestro/cli/jobs.py +253 -71
  5. experimaestro/cli/refactor.py +1 -2
  6. experimaestro/commandline.py +7 -4
  7. experimaestro/connectors/__init__.py +9 -1
  8. experimaestro/connectors/local.py +43 -3
  9. experimaestro/core/arguments.py +18 -18
  10. experimaestro/core/identifier.py +11 -11
  11. experimaestro/core/objects/config.py +96 -39
  12. experimaestro/core/objects/config_walk.py +3 -3
  13. experimaestro/core/{subparameters.py → partial.py} +16 -16
  14. experimaestro/core/partial_lock.py +394 -0
  15. experimaestro/core/types.py +12 -15
  16. experimaestro/dynamic.py +290 -0
  17. experimaestro/experiments/__init__.py +6 -2
  18. experimaestro/experiments/cli.py +223 -52
  19. experimaestro/experiments/configuration.py +24 -0
  20. experimaestro/generators.py +5 -5
  21. experimaestro/ipc.py +118 -1
  22. experimaestro/launcherfinder/__init__.py +2 -2
  23. experimaestro/launcherfinder/registry.py +6 -7
  24. experimaestro/launcherfinder/specs.py +2 -9
  25. experimaestro/launchers/slurm/__init__.py +2 -2
  26. experimaestro/launchers/slurm/base.py +62 -0
  27. experimaestro/locking.py +957 -1
  28. experimaestro/notifications.py +89 -201
  29. experimaestro/progress.py +63 -366
  30. experimaestro/rpyc.py +0 -2
  31. experimaestro/run.py +29 -2
  32. experimaestro/scheduler/__init__.py +8 -1
  33. experimaestro/scheduler/base.py +650 -53
  34. experimaestro/scheduler/dependencies.py +20 -16
  35. experimaestro/scheduler/experiment.py +764 -169
  36. experimaestro/scheduler/interfaces.py +338 -96
  37. experimaestro/scheduler/jobs.py +58 -20
  38. experimaestro/scheduler/remote/__init__.py +31 -0
  39. experimaestro/scheduler/remote/adaptive_sync.py +265 -0
  40. experimaestro/scheduler/remote/client.py +928 -0
  41. experimaestro/scheduler/remote/protocol.py +282 -0
  42. experimaestro/scheduler/remote/server.py +447 -0
  43. experimaestro/scheduler/remote/sync.py +144 -0
  44. experimaestro/scheduler/services.py +186 -35
  45. experimaestro/scheduler/state_provider.py +811 -2157
  46. experimaestro/scheduler/state_status.py +1247 -0
  47. experimaestro/scheduler/transient.py +31 -0
  48. experimaestro/scheduler/workspace.py +1 -1
  49. experimaestro/scheduler/workspace_state_provider.py +1273 -0
  50. experimaestro/scriptbuilder.py +4 -4
  51. experimaestro/settings.py +36 -0
  52. experimaestro/tests/conftest.py +33 -5
  53. experimaestro/tests/connectors/bin/executable.py +1 -1
  54. experimaestro/tests/fixtures/pre_experiment/experiment_check_env.py +16 -0
  55. experimaestro/tests/fixtures/pre_experiment/experiment_check_mock.py +14 -0
  56. experimaestro/tests/fixtures/pre_experiment/experiment_simple.py +12 -0
  57. experimaestro/tests/fixtures/pre_experiment/pre_setup_env.py +5 -0
  58. experimaestro/tests/fixtures/pre_experiment/pre_setup_error.py +3 -0
  59. experimaestro/tests/fixtures/pre_experiment/pre_setup_mock.py +8 -0
  60. experimaestro/tests/launchers/bin/test.py +1 -0
  61. experimaestro/tests/launchers/test_slurm.py +9 -9
  62. experimaestro/tests/partial_reschedule.py +46 -0
  63. experimaestro/tests/restart.py +3 -3
  64. experimaestro/tests/restart_main.py +1 -0
  65. experimaestro/tests/scripts/notifyandwait.py +1 -0
  66. experimaestro/tests/task_partial.py +38 -0
  67. experimaestro/tests/task_tokens.py +2 -2
  68. experimaestro/tests/tasks/test_dynamic.py +6 -6
  69. experimaestro/tests/test_dependencies.py +3 -3
  70. experimaestro/tests/test_deprecated.py +15 -15
  71. experimaestro/tests/test_dynamic_locking.py +317 -0
  72. experimaestro/tests/test_environment.py +24 -14
  73. experimaestro/tests/test_experiment.py +171 -36
  74. experimaestro/tests/test_identifier.py +25 -25
  75. experimaestro/tests/test_identifier_stability.py +3 -5
  76. experimaestro/tests/test_multitoken.py +2 -4
  77. experimaestro/tests/{test_subparameters.py → test_partial.py} +25 -25
  78. experimaestro/tests/test_partial_paths.py +81 -138
  79. experimaestro/tests/test_pre_experiment.py +219 -0
  80. experimaestro/tests/test_progress.py +2 -8
  81. experimaestro/tests/test_remote_state.py +1132 -0
  82. experimaestro/tests/test_stray_jobs.py +261 -0
  83. experimaestro/tests/test_tasks.py +1 -2
  84. experimaestro/tests/test_token_locking.py +52 -67
  85. experimaestro/tests/test_tokens.py +5 -6
  86. experimaestro/tests/test_transient.py +225 -0
  87. experimaestro/tests/test_workspace_state_provider.py +768 -0
  88. experimaestro/tests/token_reschedule.py +1 -3
  89. experimaestro/tests/utils.py +2 -7
  90. experimaestro/tokens.py +227 -372
  91. experimaestro/tools/diff.py +1 -0
  92. experimaestro/tools/documentation.py +4 -5
  93. experimaestro/tools/jobs.py +1 -2
  94. experimaestro/tui/app.py +459 -1895
  95. experimaestro/tui/app.tcss +162 -0
  96. experimaestro/tui/dialogs.py +172 -0
  97. experimaestro/tui/log_viewer.py +253 -3
  98. experimaestro/tui/messages.py +137 -0
  99. experimaestro/tui/utils.py +54 -0
  100. experimaestro/tui/widgets/__init__.py +23 -0
  101. experimaestro/tui/widgets/experiments.py +468 -0
  102. experimaestro/tui/widgets/global_services.py +238 -0
  103. experimaestro/tui/widgets/jobs.py +972 -0
  104. experimaestro/tui/widgets/log.py +156 -0
  105. experimaestro/tui/widgets/orphans.py +363 -0
  106. experimaestro/tui/widgets/runs.py +185 -0
  107. experimaestro/tui/widgets/services.py +314 -0
  108. experimaestro/tui/widgets/stray_jobs.py +528 -0
  109. experimaestro/utils/__init__.py +1 -1
  110. experimaestro/utils/environment.py +105 -22
  111. experimaestro/utils/fswatcher.py +124 -0
  112. experimaestro/utils/jobs.py +1 -2
  113. experimaestro/utils/jupyter.py +1 -2
  114. experimaestro/utils/logging.py +72 -0
  115. experimaestro/version.py +2 -2
  116. experimaestro/webui/__init__.py +9 -0
  117. experimaestro/webui/app.py +117 -0
  118. experimaestro/{server → webui}/data/index.css +66 -11
  119. experimaestro/webui/data/index.css.map +1 -0
  120. experimaestro/{server → webui}/data/index.js +82763 -87217
  121. experimaestro/webui/data/index.js.map +1 -0
  122. experimaestro/webui/routes/__init__.py +5 -0
  123. experimaestro/webui/routes/auth.py +53 -0
  124. experimaestro/webui/routes/proxy.py +117 -0
  125. experimaestro/webui/server.py +200 -0
  126. experimaestro/webui/state_bridge.py +152 -0
  127. experimaestro/webui/websocket.py +413 -0
  128. {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/METADATA +8 -9
  129. experimaestro-2.0.0b17.dist-info/RECORD +219 -0
  130. experimaestro/cli/progress.py +0 -269
  131. experimaestro/scheduler/state.py +0 -75
  132. experimaestro/scheduler/state_db.py +0 -388
  133. experimaestro/scheduler/state_sync.py +0 -834
  134. experimaestro/server/__init__.py +0 -467
  135. experimaestro/server/data/index.css.map +0 -1
  136. experimaestro/server/data/index.js.map +0 -1
  137. experimaestro/tests/test_cli_jobs.py +0 -615
  138. experimaestro/tests/test_file_progress.py +0 -425
  139. experimaestro/tests/test_file_progress_integration.py +0 -477
  140. experimaestro/tests/test_state_db.py +0 -434
  141. experimaestro-2.0.0b4.dist-info/RECORD +0 -181
  142. /experimaestro/{server → webui}/data/1815e00441357e01619e.ttf +0 -0
  143. /experimaestro/{server → webui}/data/2463b90d9a316e4e5294.woff2 +0 -0
  144. /experimaestro/{server → webui}/data/2582b0e4bcf85eceead0.ttf +0 -0
  145. /experimaestro/{server → webui}/data/89999bdf5d835c012025.woff2 +0 -0
  146. /experimaestro/{server → webui}/data/914997e1bdfc990d0897.ttf +0 -0
  147. /experimaestro/{server → webui}/data/c210719e60948b211a12.woff2 +0 -0
  148. /experimaestro/{server → webui}/data/favicon.ico +0 -0
  149. /experimaestro/{server → webui}/data/index.html +0 -0
  150. /experimaestro/{server → webui}/data/login.html +0 -0
  151. /experimaestro/{server → webui}/data/manifest.json +0 -0
  152. {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/WHEEL +0 -0
  153. {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/entry_points.txt +0 -0
  154. {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,928 @@
1
+ """SSH State Provider Client
2
+
3
+ Client that connects via SSH to a remote SSHStateProviderServer and implements
4
+ the StateProvider-like interface for local TUI/web UI usage.
5
+
6
+ Usage:
7
+ client = SSHStateProviderClient(host="server", remote_workspace="/path/to/workspace")
8
+ client.connect()
9
+ experiments = client.get_experiments()
10
+ """
11
+
12
+ import atexit
13
+ import logging
14
+ import shutil
15
+ import subprocess
16
+ import tempfile
17
+ import threading
18
+ from concurrent.futures import Future, TimeoutError as FutureTimeoutError
19
+ from datetime import datetime
20
+ from importlib.metadata import version as get_package_version
21
+ from pathlib import Path
22
+ from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Set
23
+
24
+ from termcolor import colored
25
+
26
+ from experimaestro.scheduler.state_provider import (
27
+ OfflineStateProvider,
28
+ StateListener,
29
+ MockJob,
30
+ MockExperiment,
31
+ MockService,
32
+ )
33
+ from experimaestro.scheduler.state_status import EventBase
34
+ from experimaestro.scheduler.interfaces import (
35
+ BaseJob,
36
+ BaseExperiment,
37
+ BaseService,
38
+ )
39
+ from experimaestro.scheduler.remote.protocol import (
40
+ RPCMethod,
41
+ NotificationMethod,
42
+ RPCResponse,
43
+ RPCNotification,
44
+ parse_message,
45
+ create_request,
46
+ serialize_datetime,
47
+ )
48
+
49
+ # Type for SSH output callback
50
+ OutputCallback = Optional["Callable[[str], None]"]
51
+
52
+ if TYPE_CHECKING:
53
+ from experimaestro.scheduler.remote.sync import RemoteFileSynchronizer
54
+
55
+
56
+ logger = logging.getLogger("xpm.remote.client")
57
+
58
+ # Default timeout for RPC requests (seconds)
59
+ DEFAULT_TIMEOUT = 30.0
60
+
61
+
62
+ def _strip_dev_version(version: str) -> str:
63
+ """Strip the .devN suffix from a version string.
64
+
65
+ Examples:
66
+ '2.0.0b3.dev8' -> '2.0.0b3'
67
+ '1.2.3.dev1' -> '1.2.3'
68
+ '1.2.3' -> '1.2.3' (unchanged)
69
+ """
70
+ import re
71
+
72
+ return re.sub(r"\.dev\d+$", "", version)
73
+
74
+
75
+ class SSHStateProviderClient(OfflineStateProvider):
76
+ """Client that connects to SSHStateProviderServer via SSH
77
+
78
+ This client implements the StateProvider interface for remote experiment
79
+ monitoring via SSH.
80
+
81
+ Features:
82
+ - JSON-RPC over SSH stdin/stdout
83
+ - Async request/response handling with futures
84
+ - Server push notifications converted to EventBases
85
+ - On-demand rsync for specific paths (used by services like TensorboardService)
86
+ """
87
+
88
    def __init__(
        self,
        host: str,
        remote_workspace: str,
        ssh_options: Optional[List[str]] = None,
        remote_xpm_path: Optional[str] = None,
        output_callback: Optional[Callable[[str], None]] = None,
    ):
        """Initialize the client

        Args:
            host: SSH host (user@host or host)
            remote_workspace: Path to workspace on the remote host
            ssh_options: Additional SSH options (e.g., ["-p", "2222"])
            remote_xpm_path: Path to experimaestro executable on remote host.
                If None, uses 'uv tool run experimaestro==<version>'.
            output_callback: Callback for SSH process output (stderr).
                If None, a default callback prints with colored prefix.
                Set to False (or a no-op lambda) to disable output display.
        """
        # Initialize base class (includes service cache)
        super().__init__()

        self.host = host
        self.remote_workspace = remote_workspace
        self.ssh_options = ssh_options or []
        self.remote_xpm_path = remote_xpm_path
        self._output_callback = output_callback

        # Session-specific temporary cache directory (created on connect)
        self._temp_dir: Optional[str] = None
        self.local_cache_dir: Optional[Path] = None
        self.workspace_path: Optional[Path] = None  # For compatibility

        # SSH subprocess handle and its pipes (populated by connect())
        self._process: Optional[subprocess.Popen] = None
        self._stdin = None
        self._stdout = None
        self._stderr = None

        # Registered state listeners; guarded by _listener_lock
        self._listeners: Set[StateListener] = set()
        self._listener_lock = threading.Lock()

        # Pending RPC futures keyed by request id; guarded by _response_lock
        self._response_handlers: Dict[int, Future] = {}
        self._response_lock = threading.Lock()
        self._request_id = 0

        # Background threads: stdout reader, throttled notifier, stderr pump
        self._read_thread: Optional[threading.Thread] = None
        self._notify_thread: Optional[threading.Thread] = None
        self._stderr_thread: Optional[threading.Thread] = None
        self._running = False
        self._connected = False

        self._synchronizer: Optional["RemoteFileSynchronizer"] = None

        # Throttled notification delivery to avoid flooding UI
        self._pending_events: List[EventBase] = []
        self._pending_events_lock = threading.Lock()
        self._notify_interval = 2.0  # Seconds between notification batches
146
+
147
    def connect(self, timeout: float = 30.0):
        """Establish SSH connection and start remote server

        Args:
            timeout: Connection timeout in seconds

        Raises:
            ConnectionError: if the SSH process cannot be spawned, or the
                initial handshake request fails within *timeout* seconds.
        """
        if self._connected:
            logger.warning("Already connected")
            return

        # Create session-specific temporary cache directory
        self._temp_dir = tempfile.mkdtemp(prefix="xpm_remote_")
        self.local_cache_dir = Path(self._temp_dir)
        self.workspace_path = self.local_cache_dir
        logger.debug("Created temporary cache directory: %s", self._temp_dir)

        # Register cleanup on exit (in case disconnect isn't called)
        atexit.register(self._cleanup_temp_dir)

        # Build SSH command
        cmd = ["ssh"]
        cmd.extend(self.ssh_options)
        cmd.append(self.host)

        # Build remote command (workdir is passed to experiments group)
        if self.remote_xpm_path:
            # Use specified path to experimaestro
            remote_cmd = f"{self.remote_xpm_path} experiments --workdir {self.remote_workspace} monitor-server"
        else:
            # Use uv tool run with version pinning
            try:
                xpm_version = get_package_version("experimaestro")
                # Strip .devN suffix for release compatibility
                xpm_version = _strip_dev_version(xpm_version)
            except Exception:
                xpm_version = None

            if xpm_version:
                remote_cmd = f"uv tool run experimaestro=={xpm_version} experiments --workdir {self.remote_workspace} monitor-server"
            else:
                remote_cmd = f"uv tool run experimaestro experiments --workdir {self.remote_workspace} monitor-server"
        cmd.append(remote_cmd)

        logger.info("Connecting to %s, workspace: %s", self.host, self.remote_workspace)
        logger.debug("SSH command: %s", " ".join(cmd))

        try:
            self._process = subprocess.Popen(
                cmd,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                bufsize=0,  # Unbuffered
            )
            self._stdin = self._process.stdin
            self._stdout = self._process.stdout
            self._stderr = self._process.stderr
        except Exception as e:
            logger.error("Failed to start SSH process: %s", e)
            raise ConnectionError(f"Failed to connect to {self.host}: {e}")

        # Threads check this flag; it must be set before they start
        self._running = True

        # Start read thread for responses and notifications
        self._read_thread = threading.Thread(
            target=self._read_loop, daemon=True, name="SSHClient-Read"
        )
        self._read_thread.start()

        # Start notification thread for throttled event delivery
        self._notify_thread = threading.Thread(
            target=self._notify_loop, daemon=True, name="SSHClient-Notify"
        )
        self._notify_thread.start()

        # Start stderr thread to display SSH output
        self._stderr_thread = threading.Thread(
            target=self._stderr_loop, daemon=True, name="SSHClient-Stderr"
        )
        self._stderr_thread.start()

        # Wait for connection to be established by sending a test request
        try:
            sync_info = self._call_sync(RPCMethod.GET_SYNC_INFO, {}, timeout=timeout)
            logger.info(
                "Connected to remote workspace: %s", sync_info.get("workspace_path")
            )
        except Exception as e:
            # Tear everything down (process, threads, temp dir) before failing
            self.disconnect()
            raise ConnectionError(f"Failed to establish connection: {e}")

        self._connected = True
239
+
240
    def disconnect(self):
        """Disconnect from the remote server

        Safe to call multiple times. Stops the background threads, terminates
        the SSH process, fails all in-flight requests and removes the local
        cache directory.
        """
        # Flip flags first so the reader/notifier loops exit cleanly
        self._running = False
        self._connected = False

        # Close stdin to signal EOF to remote server
        if self._stdin:
            try:
                self._stdin.close()
            except Exception:
                pass

        # Terminate the SSH process (escalate to kill if it doesn't stop)
        if self._process:
            try:
                self._process.terminate()
                self._process.wait(timeout=5.0)
            except Exception:
                try:
                    self._process.kill()
                except Exception:
                    pass

        # Wait for threads to finish (bounded joins; threads are daemons)
        if self._read_thread and self._read_thread.is_alive():
            self._read_thread.join(timeout=2.0)
        if self._notify_thread and self._notify_thread.is_alive():
            self._notify_thread.join(timeout=2.0)
        if self._stderr_thread and self._stderr_thread.is_alive():
            self._stderr_thread.join(timeout=2.0)

        # Cancel any pending requests so callers blocked in result() wake up
        with self._response_lock:
            for future in self._response_handlers.values():
                if not future.done():
                    future.set_exception(ConnectionError("Disconnected"))
            self._response_handlers.clear()

        # Clear service cache (using base class method)
        self._clear_service_cache()

        # Clean up temporary cache directory
        self._cleanup_temp_dir()

        logger.info("Disconnected from %s", self.host)
285
+
286
+ def _cleanup_temp_dir(self):
287
+ """Clean up the temporary cache directory"""
288
+ if self._temp_dir and Path(self._temp_dir).exists():
289
+ try:
290
+ shutil.rmtree(self._temp_dir)
291
+ logger.debug("Cleaned up temporary cache directory: %s", self._temp_dir)
292
+ except Exception as e:
293
+ logger.warning("Failed to clean up temp dir %s: %s", self._temp_dir, e)
294
+ finally:
295
+ self._temp_dir = None
296
+ self.local_cache_dir = None
297
+ # Unregister atexit handler if we cleaned up successfully
298
+ try:
299
+ atexit.unregister(self._cleanup_temp_dir)
300
+ except Exception:
301
+ pass
302
+
303
    def close(self):
        """Alias for disconnect() for compatibility with WorkspaceStateProvider"""
        self.disconnect()
306
+
307
    def _read_loop(self):
        """Read responses and notifications from SSH stdout

        Runs in a daemon thread until EOF, an unexpected error, or until
        disconnect() clears ``_running``. The protocol is line-delimited
        UTF-8 JSON; each non-empty line is dispatched to _process_message.
        """
        while self._running:
            try:
                line = self._stdout.readline()
                if not line:
                    # EOF - connection closed
                    logger.debug("SSH stdout closed")
                    break

                line_str = line.decode("utf-8").strip()
                if not line_str:
                    continue

                self._process_message(line_str)

            except Exception as e:
                if self._running:
                    logger.exception("Error in read loop: %s", e)
                break

        # Connection lost — only report it when we did not shut down on purpose
        if self._running:
            logger.warning("Connection to %s lost", self.host)
            self._connected = False
332
+
333
    def _stderr_loop(self):
        """Read and display SSH stderr output with colored prefix

        Runs in a daemon thread. Each line goes to the user-provided
        output callback when one was set, otherwise to stdout with a
        colored "[SSH]" prefix.
        """
        while self._running:
            try:
                line = self._stderr.readline()
                if not line:
                    # EOF - stderr closed
                    logger.debug("SSH stderr closed")
                    break

                line_str = line.decode("utf-8").rstrip("\n\r")
                if not line_str:
                    continue

                # Call output callback or use default
                if self._output_callback is not None:
                    self._output_callback(line_str)
                else:
                    # Default: print with colored prefix
                    prefix = colored("[SSH] ", "cyan", attrs=["bold"])
                    print(f"{prefix}{line_str}")  # noqa: T201

            except Exception as e:
                if self._running:
                    logger.debug("Error reading stderr: %s", e)
                break
359
+
360
+ def _process_message(self, line: str):
361
+ """Process a single message from the server"""
362
+ try:
363
+ msg = parse_message(line)
364
+ except ValueError as e:
365
+ logger.warning("Failed to parse message: %s", e)
366
+ return
367
+
368
+ if isinstance(msg, RPCResponse):
369
+ self._handle_response(msg)
370
+ elif isinstance(msg, RPCNotification):
371
+ self._handle_notification(msg)
372
+ else:
373
+ logger.debug("Unexpected message type: %s", type(msg).__name__)
374
+
375
+ def _handle_response(self, response: RPCResponse):
376
+ """Handle a response from the server"""
377
+ with self._response_lock:
378
+ future = self._response_handlers.pop(response.id, None)
379
+
380
+ if future is None:
381
+ logger.warning("Received response for unknown request ID: %s", response.id)
382
+ return
383
+
384
+ if response.error:
385
+ future.set_exception(
386
+ RuntimeError(
387
+ f"RPC error {response.error.code}: {response.error.message}"
388
+ )
389
+ )
390
+ else:
391
+ future.set_result(response.result)
392
+
393
    def _handle_notification(self, notification: RPCNotification):
        """Handle a notification from the server

        State events are queued for throttled delivery (see _notify_loop)
        to avoid flooding the UI; a SHUTDOWN notification is acted on
        immediately by marking the client disconnected.
        """
        method = notification.method
        params = notification.params

        logger.debug("Received notification: %s", method)

        # Convert notification to EventBase and queue for throttled delivery
        event = self._notification_to_event(method, params)
        if event:
            with self._pending_events_lock:
                self._pending_events.append(event)

        # Handle shutdown notification immediately
        if method == NotificationMethod.SHUTDOWN.value:
            reason = params.get("reason", "unknown")
            logger.info("Server shutdown: %s", reason)
            self._connected = False
414
+
415
    def _notify_loop(self):
        """Background thread that delivers pending events to listeners periodically

        This throttles notification delivery to avoid flooding the UI with
        rapid state changes. Every ``_notify_interval`` seconds the pending
        queue is drained and deduplicated by event type (only the most
        recent event of each type is delivered, in original order).
        """
        import time

        while self._running:
            time.sleep(self._notify_interval)

            if not self._running:
                break

            # Get and clear pending events atomically
            with self._pending_events_lock:
                if not self._pending_events:
                    continue
                events = self._pending_events.copy()
                self._pending_events.clear()

            # Deduplicate events by type (keep latest of each type)
            # This prevents redundant refreshes for rapidly changing state.
            # Iterating in reverse keeps the LAST event of each type; the
            # final reverse() restores chronological delivery order.
            seen_types = set()
            unique_events = []
            for event in reversed(events):
                event_type = type(event)
                if event_type not in seen_types:
                    seen_types.add(event_type)
                    unique_events.append(event)
            unique_events.reverse()

            # Notify listeners
            for event in unique_events:
                self._notify_listeners(event)
450
+
451
+ def _notification_to_event(self, method: str, params: Dict) -> Optional[EventBase]:
452
+ """Convert a notification to a EventBase"""
453
+ if method != NotificationMethod.STATE_EVENT.value:
454
+ # Don't warn for known control notifications (handled elsewhere)
455
+ if method not in (
456
+ NotificationMethod.SHUTDOWN.value,
457
+ NotificationMethod.FILE_CHANGED.value,
458
+ ):
459
+ logger.warning("Unhandled notification method: %s", method)
460
+ return None
461
+
462
+ event_type = params.get("event_type")
463
+ data = params.get("data", {})
464
+ event_class = EventBase.get_class(event_type)
465
+ if event_class is None:
466
+ logger.warning("Unknown event type: %s", event_type)
467
+ return None
468
+
469
+ try:
470
+ return event_class(**data)
471
+ except TypeError as e:
472
+ logger.warning("Error deserializing event %s: %s", event_type, e)
473
+ return None
474
+
475
+ def _notify_listeners(self, event: EventBase):
476
+ """Notify all registered listeners of a state event"""
477
+ with self._listener_lock:
478
+ listeners = list(self._listeners)
479
+
480
+ for listener in listeners:
481
+ try:
482
+ listener(event)
483
+ except Exception as e:
484
+ logger.exception("Error in listener: %s", e)
485
+
486
    def _call(self, method: RPCMethod, params: Dict) -> Future:
        """Send an RPC request and return a Future for the response

        Args:
            method: RPC method to call
            params: Method parameters

        Returns:
            Future that resolves to the response result (or to an exception
            when not connected or when the write to the SSH pipe fails)
        """
        if not self._running:
            future = Future()
            future.set_exception(ConnectionError("Not connected"))
            return future

        # Allocate a request id and register the future before writing, so
        # the read thread can never see a response for an unknown id
        with self._response_lock:
            self._request_id += 1
            request_id = self._request_id
            future = Future()
            self._response_handlers[request_id] = future

        request_json = create_request(method, params, request_id)
        try:
            self._stdin.write((request_json + "\n").encode("utf-8"))
            self._stdin.flush()
        except Exception as e:
            # Write failed: deregister and fail the future instead of raising
            with self._response_lock:
                self._response_handlers.pop(request_id, None)
            future.set_exception(e)

        return future
517
+
518
+ def _call_sync(
519
+ self, method: RPCMethod, params: Dict, timeout: float = DEFAULT_TIMEOUT
520
+ ):
521
+ """Send an RPC request and wait for the response
522
+
523
+ Args:
524
+ method: RPC method to call
525
+ params: Method parameters
526
+ timeout: Request timeout in seconds
527
+
528
+ Returns:
529
+ Response result
530
+
531
+ Raises:
532
+ TimeoutError: If the request times out
533
+ RuntimeError: If the RPC call returns an error
534
+ """
535
+ future = self._call(method, params)
536
+ try:
537
+ return future.result(timeout=timeout)
538
+ except FutureTimeoutError:
539
+ raise TimeoutError(f"Request {method.value} timed out after {timeout}s")
540
+
541
+ # -------------------------------------------------------------------------
542
+ # StateProvider-like Interface
543
+ # -------------------------------------------------------------------------
544
+
545
+ def add_listener(self, listener: StateListener):
546
+ """Register a listener for state change events"""
547
+ with self._listener_lock:
548
+ self._listeners.add(listener)
549
+
550
+ def remove_listener(self, listener: StateListener):
551
+ """Unregister a listener"""
552
+ with self._listener_lock:
553
+ self._listeners.discard(listener)
554
+
555
+ def get_experiments(self, since: Optional[datetime] = None) -> List[BaseExperiment]:
556
+ """Get list of all experiments"""
557
+ params = {"since": serialize_datetime(since)}
558
+ result = self._call_sync(RPCMethod.GET_EXPERIMENTS, params)
559
+ return [self._dict_to_experiment(d) for d in result]
560
+
561
+ def get_experiment(self, experiment_id: str) -> Optional[BaseExperiment]:
562
+ """Get a specific experiment by ID"""
563
+ params = {"experiment_id": experiment_id}
564
+ result = self._call_sync(RPCMethod.GET_EXPERIMENT, params)
565
+ if result is None:
566
+ return None
567
+ return self._dict_to_experiment(result)
568
+
569
+ def get_experiment_runs(self, experiment_id: str) -> List[Dict]:
570
+ """Get all runs for an experiment"""
571
+ params = {"experiment_id": experiment_id}
572
+ return self._call_sync(RPCMethod.GET_EXPERIMENT_RUNS, params)
573
+
574
+ def get_current_run(self, experiment_id: str) -> Optional[str]:
575
+ """Get the current run ID for an experiment"""
576
+ exp = self.get_experiment(experiment_id)
577
+ if exp is None:
578
+ return None
579
+ return exp.run_id
580
+
581
+ def get_jobs(
582
+ self,
583
+ experiment_id: Optional[str] = None,
584
+ run_id: Optional[str] = None,
585
+ task_id: Optional[str] = None,
586
+ state: Optional[str] = None,
587
+ tags: Optional[Dict[str, str]] = None,
588
+ since: Optional[datetime] = None,
589
+ ) -> List[BaseJob]:
590
+ """Query jobs with optional filters"""
591
+ params = {
592
+ "experiment_id": experiment_id,
593
+ "run_id": run_id,
594
+ "task_id": task_id,
595
+ "state": state,
596
+ "tags": tags,
597
+ "since": serialize_datetime(since),
598
+ }
599
+ result = self._call_sync(RPCMethod.GET_JOBS, params)
600
+ return [self._dict_to_job(d) for d in result]
601
+
602
+ def get_job(
603
+ self, job_id: str, experiment_id: str, run_id: Optional[str] = None
604
+ ) -> Optional[BaseJob]:
605
+ """Get a specific job"""
606
+ params = {
607
+ "job_id": job_id,
608
+ "experiment_id": experiment_id,
609
+ "run_id": run_id,
610
+ }
611
+ result = self._call_sync(RPCMethod.GET_JOB, params)
612
+ if result is None:
613
+ return None
614
+ return self._dict_to_job(result)
615
+
616
+ def get_all_jobs(
617
+ self,
618
+ state: Optional[str] = None,
619
+ tags: Optional[Dict[str, str]] = None,
620
+ since: Optional[datetime] = None,
621
+ ) -> List[BaseJob]:
622
+ """Get all jobs across all experiments"""
623
+ params = {
624
+ "state": state,
625
+ "tags": tags,
626
+ "since": serialize_datetime(since),
627
+ }
628
+ result = self._call_sync(RPCMethod.GET_ALL_JOBS, params)
629
+ return [self._dict_to_job(d) for d in result]
630
+
631
+ def get_tags_map(
632
+ self,
633
+ experiment_id: str,
634
+ run_id: Optional[str] = None,
635
+ ) -> Dict[str, Dict[str, str]]:
636
+ """Get tags map for jobs in an experiment/run"""
637
+ params = {
638
+ "experiment_id": experiment_id,
639
+ "run_id": run_id,
640
+ }
641
+ result = self._call_sync(RPCMethod.GET_TAGS_MAP, params)
642
+ return result or {}
643
+
644
+ def get_dependencies_map(
645
+ self,
646
+ experiment_id: str,
647
+ run_id: Optional[str] = None,
648
+ ) -> dict[str, list[str]]:
649
+ """Get dependencies map for jobs in an experiment/run"""
650
+ params = {
651
+ "experiment_id": experiment_id,
652
+ "run_id": run_id,
653
+ }
654
+ result = self._call_sync(RPCMethod.GET_DEPENDENCIES_MAP, params)
655
+ return result or {}
656
+
657
+ def _fetch_services_from_storage(
658
+ self, experiment_id: Optional[str], run_id: Optional[str]
659
+ ) -> List[BaseService]:
660
+ """Fetch services from remote server.
661
+
662
+ Called by base class get_services when cache is empty.
663
+ """
664
+ params = {
665
+ "experiment_id": experiment_id,
666
+ "run_id": run_id,
667
+ }
668
+ result = self._call_sync(RPCMethod.GET_SERVICES, params)
669
+
670
+ services = []
671
+ for d in result:
672
+ service = self._dict_to_service(d)
673
+ services.append(service)
674
+
675
+ return services
676
+
677
+ def kill_job(self, job: BaseJob, perform: bool = False) -> bool:
678
+ """Kill a running job"""
679
+ if not perform:
680
+ # Dry run - just check if job is running
681
+ return job.state.running()
682
+
683
+ params = {
684
+ "job_id": job.identifier,
685
+ "experiment_id": getattr(job, "experiment_id", ""),
686
+ "run_id": getattr(job, "run_id", ""),
687
+ }
688
+ result = self._call_sync(RPCMethod.KILL_JOB, params)
689
+ return result.get("success", False)
690
+
691
+ def clean_job(self, job: BaseJob, perform: bool = False) -> bool:
692
+ """Clean a finished job"""
693
+ if not perform:
694
+ # Dry run - just check if job is finished
695
+ return job.state.finished()
696
+
697
+ params = {
698
+ "job_id": job.identifier,
699
+ "experiment_id": getattr(job, "experiment_id", ""),
700
+ "run_id": getattr(job, "run_id", ""),
701
+ }
702
+ result = self._call_sync(RPCMethod.CLEAN_JOB, params)
703
+ return result.get("success", False)
704
+
705
    def get_process_info(self, job: BaseJob):
        """Get process information for a job

        Args:
            job: the job whose process should be looked up

        Returns:
            A ProcessInfo, or None when the job has no process information
            or the remote server doesn't support this method.
        """
        # Local import to avoid an import cycle with state_provider
        from experimaestro.scheduler.state_provider import ProcessInfo

        params = {
            "job_id": job.identifier,
            "experiment_id": getattr(job, "experiment_id", ""),
            "run_id": getattr(job, "run_id", ""),
        }

        try:
            result = self._call_sync(RPCMethod.GET_PROCESS_INFO, params)
        except RuntimeError:
            # Server doesn't support this method (older version)
            return None

        if result is None:
            return None

        return ProcessInfo(
            pid=result["pid"],
            type=result["type"],
            running=result.get("running", False),
        )
732
+
733
+ # -------------------------------------------------------------------------
734
+ # Data Conversion
735
+ # -------------------------------------------------------------------------
736
+
737
+ def _dict_to_job(self, d: Dict) -> MockJob:
738
+ """Convert a dictionary to a MockJob using from_state_dict"""
739
+ # Translate remote path to local cache path
740
+ if d.get("path"):
741
+ remote_path = d["path"]
742
+ if remote_path.startswith(self.remote_workspace):
743
+ relative = remote_path[len(self.remote_workspace) :].lstrip("/")
744
+ d["path"] = self.local_cache_dir / relative
745
+ else:
746
+ d["path"] = Path(remote_path)
747
+
748
+ # Convert ISO datetime strings back to timestamps (floats)
749
+ for key in ("submitted_time", "started_time", "ended_time"):
750
+ d[key] = self._parse_datetime_to_timestamp(d.get(key))
751
+
752
+ return MockJob.from_state_dict(d, self.local_cache_dir)
753
+
754
+ def _dict_to_experiment(self, d: Dict) -> MockExperiment:
755
+ """Convert a dictionary to a MockExperiment using from_state_dict"""
756
+ # Translate remote workdir to local cache path
757
+ if d.get("workdir"):
758
+ remote_path = d["workdir"]
759
+ if remote_path.startswith(self.remote_workspace):
760
+ relative = remote_path[len(self.remote_workspace) :].lstrip("/")
761
+ d["workdir"] = self.local_cache_dir / relative
762
+ else:
763
+ d["workdir"] = Path(remote_path)
764
+
765
+ # Convert ISO datetime strings back to timestamps (floats)
766
+ for key in ("started_at", "ended_at"):
767
+ d[key] = self._parse_datetime_to_timestamp(d.get(key))
768
+
769
+ return MockExperiment.from_state_dict(d, self.local_cache_dir)
770
+
771
+ def _dict_to_service(self, d: Dict) -> BaseService:
772
+ """Convert a dictionary to a Service or MockService
773
+
774
+ Tries to recreate the actual Service from state_dict first.
775
+ Falls back to MockService with error message if module is missing.
776
+ """
777
+ state_dict = d.get("state_dict", {})
778
+ service_class = d.get("class", "")
779
+ service_id = d.get("service_id", "")
780
+
781
+ # Check for unserializable marker
782
+ if state_dict.get("__unserializable__"):
783
+ reason = state_dict.get("__reason__", "Service cannot be recreated")
784
+ return MockService(
785
+ service_id=service_id,
786
+ description_text=f"[{reason}]",
787
+ state_dict_data=state_dict,
788
+ service_class=service_class,
789
+ experiment_id=d.get("experiment_id"),
790
+ run_id=d.get("run_id"),
791
+ url=d.get("url"),
792
+ )
793
+
794
+ # Try to recreate actual Service from state_dict
795
+ if service_class:
796
+ try:
797
+ from experimaestro.scheduler.services import Service
798
+
799
+ # Create path translator that syncs and translates paths
800
+ def path_translator(remote_path: str) -> Path:
801
+ """Translate remote path to local, syncing if needed"""
802
+ local_path = self.sync_path(remote_path)
803
+ if local_path:
804
+ return local_path
805
+ # Fallback: map to local cache without sync
806
+ if remote_path.startswith(self.remote_workspace):
807
+ relative = remote_path[len(self.remote_workspace) :].lstrip("/")
808
+ return self.local_cache_dir / relative
809
+ return Path(remote_path)
810
+
811
+ service = Service.from_state_dict(
812
+ service_class, state_dict, path_translator
813
+ )
814
+ service.id = service_id
815
+ # Copy additional attributes
816
+ if d.get("experiment_id"):
817
+ service.experiment_id = d["experiment_id"]
818
+ if d.get("run_id"):
819
+ service.run_id = d["run_id"]
820
+ return service
821
+ except ModuleNotFoundError as e:
822
+ # Module not available locally - show error in description
823
+ missing_module = str(e).replace("No module named ", "").strip("'\"")
824
+ return MockService(
825
+ service_id=service_id,
826
+ description_text=f"[Missing module: {missing_module}]",
827
+ state_dict_data=state_dict,
828
+ service_class=service_class,
829
+ experiment_id=d.get("experiment_id"),
830
+ run_id=d.get("run_id"),
831
+ url=d.get("url"),
832
+ )
833
+ except Exception as e:
834
+ # Other error - show in description
835
+ return MockService(
836
+ service_id=service_id,
837
+ description_text=f"[Error: {e}]",
838
+ state_dict_data=state_dict,
839
+ service_class=service_class,
840
+ experiment_id=d.get("experiment_id"),
841
+ run_id=d.get("run_id"),
842
+ url=d.get("url"),
843
+ )
844
+
845
+ # No class - use MockService.from_full_state_dict
846
+ return MockService.from_full_state_dict(d)
847
+
848
+ def _parse_datetime_to_timestamp(self, value) -> Optional[float]:
849
+ """Convert datetime value to Unix timestamp
850
+
851
+ Handles: None, ISO string, float timestamp, datetime object
852
+ """
853
+ if value is None:
854
+ return None
855
+ if isinstance(value, (int, float)):
856
+ return float(value)
857
+ if isinstance(value, str):
858
+ try:
859
+ dt = datetime.fromisoformat(value)
860
+ return dt.timestamp()
861
+ except ValueError:
862
+ return None
863
+ if isinstance(value, datetime):
864
+ return value.timestamp()
865
+ return None
866
+
867
+ # -------------------------------------------------------------------------
868
+ # File Synchronization
869
+ # -------------------------------------------------------------------------
870
+
871
+ def sync_path(self, path: str) -> Optional[Path]:
872
+ """Sync a specific path from remote on-demand
873
+
874
+ Used by services (e.g., TensorboardService) that need access to
875
+ specific remote directories.
876
+
877
+ Args:
878
+ path: Can be:
879
+ - Remote absolute path (e.g., /remote/workspace/jobs/xxx)
880
+ - Local cache path (e.g., /tmp/xpm_remote_xxx/jobs/xxx)
881
+ - Relative path within workspace (e.g., jobs/xxx)
882
+
883
+ Returns:
884
+ Local path where the files were synced to, or None if sync failed
885
+ """
886
+ if not self._connected or not self.local_cache_dir:
887
+ logger.warning("Cannot sync: not connected")
888
+ return None
889
+
890
+ # Convert local cache path back to remote path if needed
891
+ local_cache_str = str(self.local_cache_dir)
892
+ if path.startswith(local_cache_str):
893
+ # Path is in local cache - extract relative path
894
+ relative = path[len(local_cache_str) :].lstrip("/")
895
+ remote_path = f"{self.remote_workspace}/{relative}"
896
+ elif path.startswith(self.remote_workspace):
897
+ # Already a remote path
898
+ remote_path = path
899
+ else:
900
+ # Assume it's a relative path
901
+ remote_path = f"{self.remote_workspace}/{path.lstrip('/')}"
902
+
903
+ from experimaestro.scheduler.remote.sync import RemoteFileSynchronizer
904
+
905
+ # Create synchronizer lazily
906
+ if self._synchronizer is None:
907
+ self._synchronizer = RemoteFileSynchronizer(
908
+ host=self.host,
909
+ remote_workspace=Path(self.remote_workspace),
910
+ local_cache=self.local_cache_dir,
911
+ ssh_options=self.ssh_options,
912
+ )
913
+
914
+ try:
915
+ return self._synchronizer.sync_path(remote_path)
916
+ except Exception as e:
917
+ logger.warning("Failed to sync path %s: %s", remote_path, e)
918
+ return None
919
+
920
+ @property
921
+ def read_only(self) -> bool:
922
+ """Client is always read-only"""
923
+ return True
924
+
925
+ @property
926
+ def is_remote(self) -> bool:
927
+ """This is a remote provider"""
928
+ return True