experimaestro 2.0.0b4__py3-none-any.whl → 2.0.0b17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of experimaestro might be problematic. Click here for more details.

Files changed (154) hide show
  1. experimaestro/__init__.py +12 -5
  2. experimaestro/cli/__init__.py +393 -134
  3. experimaestro/cli/filter.py +48 -23
  4. experimaestro/cli/jobs.py +253 -71
  5. experimaestro/cli/refactor.py +1 -2
  6. experimaestro/commandline.py +7 -4
  7. experimaestro/connectors/__init__.py +9 -1
  8. experimaestro/connectors/local.py +43 -3
  9. experimaestro/core/arguments.py +18 -18
  10. experimaestro/core/identifier.py +11 -11
  11. experimaestro/core/objects/config.py +96 -39
  12. experimaestro/core/objects/config_walk.py +3 -3
  13. experimaestro/core/{subparameters.py → partial.py} +16 -16
  14. experimaestro/core/partial_lock.py +394 -0
  15. experimaestro/core/types.py +12 -15
  16. experimaestro/dynamic.py +290 -0
  17. experimaestro/experiments/__init__.py +6 -2
  18. experimaestro/experiments/cli.py +223 -52
  19. experimaestro/experiments/configuration.py +24 -0
  20. experimaestro/generators.py +5 -5
  21. experimaestro/ipc.py +118 -1
  22. experimaestro/launcherfinder/__init__.py +2 -2
  23. experimaestro/launcherfinder/registry.py +6 -7
  24. experimaestro/launcherfinder/specs.py +2 -9
  25. experimaestro/launchers/slurm/__init__.py +2 -2
  26. experimaestro/launchers/slurm/base.py +62 -0
  27. experimaestro/locking.py +957 -1
  28. experimaestro/notifications.py +89 -201
  29. experimaestro/progress.py +63 -366
  30. experimaestro/rpyc.py +0 -2
  31. experimaestro/run.py +29 -2
  32. experimaestro/scheduler/__init__.py +8 -1
  33. experimaestro/scheduler/base.py +650 -53
  34. experimaestro/scheduler/dependencies.py +20 -16
  35. experimaestro/scheduler/experiment.py +764 -169
  36. experimaestro/scheduler/interfaces.py +338 -96
  37. experimaestro/scheduler/jobs.py +58 -20
  38. experimaestro/scheduler/remote/__init__.py +31 -0
  39. experimaestro/scheduler/remote/adaptive_sync.py +265 -0
  40. experimaestro/scheduler/remote/client.py +928 -0
  41. experimaestro/scheduler/remote/protocol.py +282 -0
  42. experimaestro/scheduler/remote/server.py +447 -0
  43. experimaestro/scheduler/remote/sync.py +144 -0
  44. experimaestro/scheduler/services.py +186 -35
  45. experimaestro/scheduler/state_provider.py +811 -2157
  46. experimaestro/scheduler/state_status.py +1247 -0
  47. experimaestro/scheduler/transient.py +31 -0
  48. experimaestro/scheduler/workspace.py +1 -1
  49. experimaestro/scheduler/workspace_state_provider.py +1273 -0
  50. experimaestro/scriptbuilder.py +4 -4
  51. experimaestro/settings.py +36 -0
  52. experimaestro/tests/conftest.py +33 -5
  53. experimaestro/tests/connectors/bin/executable.py +1 -1
  54. experimaestro/tests/fixtures/pre_experiment/experiment_check_env.py +16 -0
  55. experimaestro/tests/fixtures/pre_experiment/experiment_check_mock.py +14 -0
  56. experimaestro/tests/fixtures/pre_experiment/experiment_simple.py +12 -0
  57. experimaestro/tests/fixtures/pre_experiment/pre_setup_env.py +5 -0
  58. experimaestro/tests/fixtures/pre_experiment/pre_setup_error.py +3 -0
  59. experimaestro/tests/fixtures/pre_experiment/pre_setup_mock.py +8 -0
  60. experimaestro/tests/launchers/bin/test.py +1 -0
  61. experimaestro/tests/launchers/test_slurm.py +9 -9
  62. experimaestro/tests/partial_reschedule.py +46 -0
  63. experimaestro/tests/restart.py +3 -3
  64. experimaestro/tests/restart_main.py +1 -0
  65. experimaestro/tests/scripts/notifyandwait.py +1 -0
  66. experimaestro/tests/task_partial.py +38 -0
  67. experimaestro/tests/task_tokens.py +2 -2
  68. experimaestro/tests/tasks/test_dynamic.py +6 -6
  69. experimaestro/tests/test_dependencies.py +3 -3
  70. experimaestro/tests/test_deprecated.py +15 -15
  71. experimaestro/tests/test_dynamic_locking.py +317 -0
  72. experimaestro/tests/test_environment.py +24 -14
  73. experimaestro/tests/test_experiment.py +171 -36
  74. experimaestro/tests/test_identifier.py +25 -25
  75. experimaestro/tests/test_identifier_stability.py +3 -5
  76. experimaestro/tests/test_multitoken.py +2 -4
  77. experimaestro/tests/{test_subparameters.py → test_partial.py} +25 -25
  78. experimaestro/tests/test_partial_paths.py +81 -138
  79. experimaestro/tests/test_pre_experiment.py +219 -0
  80. experimaestro/tests/test_progress.py +2 -8
  81. experimaestro/tests/test_remote_state.py +1132 -0
  82. experimaestro/tests/test_stray_jobs.py +261 -0
  83. experimaestro/tests/test_tasks.py +1 -2
  84. experimaestro/tests/test_token_locking.py +52 -67
  85. experimaestro/tests/test_tokens.py +5 -6
  86. experimaestro/tests/test_transient.py +225 -0
  87. experimaestro/tests/test_workspace_state_provider.py +768 -0
  88. experimaestro/tests/token_reschedule.py +1 -3
  89. experimaestro/tests/utils.py +2 -7
  90. experimaestro/tokens.py +227 -372
  91. experimaestro/tools/diff.py +1 -0
  92. experimaestro/tools/documentation.py +4 -5
  93. experimaestro/tools/jobs.py +1 -2
  94. experimaestro/tui/app.py +459 -1895
  95. experimaestro/tui/app.tcss +162 -0
  96. experimaestro/tui/dialogs.py +172 -0
  97. experimaestro/tui/log_viewer.py +253 -3
  98. experimaestro/tui/messages.py +137 -0
  99. experimaestro/tui/utils.py +54 -0
  100. experimaestro/tui/widgets/__init__.py +23 -0
  101. experimaestro/tui/widgets/experiments.py +468 -0
  102. experimaestro/tui/widgets/global_services.py +238 -0
  103. experimaestro/tui/widgets/jobs.py +972 -0
  104. experimaestro/tui/widgets/log.py +156 -0
  105. experimaestro/tui/widgets/orphans.py +363 -0
  106. experimaestro/tui/widgets/runs.py +185 -0
  107. experimaestro/tui/widgets/services.py +314 -0
  108. experimaestro/tui/widgets/stray_jobs.py +528 -0
  109. experimaestro/utils/__init__.py +1 -1
  110. experimaestro/utils/environment.py +105 -22
  111. experimaestro/utils/fswatcher.py +124 -0
  112. experimaestro/utils/jobs.py +1 -2
  113. experimaestro/utils/jupyter.py +1 -2
  114. experimaestro/utils/logging.py +72 -0
  115. experimaestro/version.py +2 -2
  116. experimaestro/webui/__init__.py +9 -0
  117. experimaestro/webui/app.py +117 -0
  118. experimaestro/{server → webui}/data/index.css +66 -11
  119. experimaestro/webui/data/index.css.map +1 -0
  120. experimaestro/{server → webui}/data/index.js +82763 -87217
  121. experimaestro/webui/data/index.js.map +1 -0
  122. experimaestro/webui/routes/__init__.py +5 -0
  123. experimaestro/webui/routes/auth.py +53 -0
  124. experimaestro/webui/routes/proxy.py +117 -0
  125. experimaestro/webui/server.py +200 -0
  126. experimaestro/webui/state_bridge.py +152 -0
  127. experimaestro/webui/websocket.py +413 -0
  128. {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/METADATA +8 -9
  129. experimaestro-2.0.0b17.dist-info/RECORD +219 -0
  130. experimaestro/cli/progress.py +0 -269
  131. experimaestro/scheduler/state.py +0 -75
  132. experimaestro/scheduler/state_db.py +0 -388
  133. experimaestro/scheduler/state_sync.py +0 -834
  134. experimaestro/server/__init__.py +0 -467
  135. experimaestro/server/data/index.css.map +0 -1
  136. experimaestro/server/data/index.js.map +0 -1
  137. experimaestro/tests/test_cli_jobs.py +0 -615
  138. experimaestro/tests/test_file_progress.py +0 -425
  139. experimaestro/tests/test_file_progress_integration.py +0 -477
  140. experimaestro/tests/test_state_db.py +0 -434
  141. experimaestro-2.0.0b4.dist-info/RECORD +0 -181
  142. /experimaestro/{server → webui}/data/1815e00441357e01619e.ttf +0 -0
  143. /experimaestro/{server → webui}/data/2463b90d9a316e4e5294.woff2 +0 -0
  144. /experimaestro/{server → webui}/data/2582b0e4bcf85eceead0.ttf +0 -0
  145. /experimaestro/{server → webui}/data/89999bdf5d835c012025.woff2 +0 -0
  146. /experimaestro/{server → webui}/data/914997e1bdfc990d0897.ttf +0 -0
  147. /experimaestro/{server → webui}/data/c210719e60948b211a12.woff2 +0 -0
  148. /experimaestro/{server → webui}/data/favicon.ico +0 -0
  149. /experimaestro/{server → webui}/data/index.html +0 -0
  150. /experimaestro/{server → webui}/data/login.html +0 -0
  151. /experimaestro/{server → webui}/data/manifest.json +0 -0
  152. {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/WHEEL +0 -0
  153. {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/entry_points.txt +0 -0
  154. {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,447 @@
1
+ """SSH State Provider Server
2
+
3
+ JSON-RPC server that wraps WorkspaceStateProvider and communicates via stdio.
4
+ Designed to be run over SSH for remote experiment monitoring.
5
+
6
+ Usage:
7
+ experimaestro experiments monitor-server --workdir /path/to/workspace
8
+ """
9
+
10
+ import logging
11
+ import sys
12
+ import threading
13
+ from pathlib import Path
14
+ from typing import IO, Callable, Dict, Optional
15
+
16
+ from experimaestro.scheduler.workspace_state_provider import WorkspaceStateProvider
17
+ from dataclasses import asdict
18
+
19
+ from experimaestro.scheduler.state_status import EventBase
20
+ from experimaestro.scheduler.remote.protocol import (
21
+ RPCMethod,
22
+ NotificationMethod,
23
+ parse_message,
24
+ create_success_response,
25
+ create_error_response,
26
+ create_notification,
27
+ serialize_datetime,
28
+ deserialize_datetime,
29
+ PARSE_ERROR,
30
+ METHOD_NOT_FOUND,
31
+ INVALID_PARAMS,
32
+ INTERNAL_ERROR,
33
+ WORKSPACE_NOT_FOUND,
34
+ )
35
+
36
+ logger = logging.getLogger("xpm.remote.server")
37
+
38
+
39
class SSHStateProviderServer:
    """JSON-RPC server that wraps WorkspaceStateProvider for SSH-based monitoring

    The server reads JSON-RPC requests from ``stdin`` (one message per line)
    and writes responses and notifications to ``stdout`` (one message per
    line). It registers itself as a listener on the state provider so that
    state events are pushed to the client as notifications.

    Thread safety:
    - Writes to stdout are serialized with a lock
    - The main read loop runs in the thread that calls :meth:`start`
    - Event notifications are delivered through :meth:`_on_state_event`,
      which may be called from another thread (the original author's note
      mentions the state provider's change detector thread)
    """

    def __init__(
        self,
        workspace_path: Path,
        stdin: Optional[IO[bytes]] = None,
        stdout: Optional[IO[bytes]] = None,
    ):
        """Initialize the server

        Args:
            workspace_path: Path to the workspace directory
            stdin: Input stream for reading requests (default: sys.stdin.buffer)
            stdout: Output stream for writing responses (default: sys.stdout.buffer)
        """
        self.workspace_path = workspace_path
        self.stdin = stdin if stdin is not None else sys.stdin.buffer
        self.stdout = stdout if stdout is not None else sys.stdout.buffer
        self._state_provider: Optional[WorkspaceStateProvider] = None
        self._running = False
        # Guards against sending the shutdown notification more than once
        self._shutdown_sent = False
        self._write_lock = threading.Lock()

        # Map of method names to handler functions
        self._handlers: Dict[str, Callable] = {
            RPCMethod.GET_EXPERIMENTS.value: self._handle_get_experiments,
            RPCMethod.GET_EXPERIMENT.value: self._handle_get_experiment,
            RPCMethod.GET_EXPERIMENT_RUNS.value: self._handle_get_experiment_runs,
            RPCMethod.GET_JOBS.value: self._handle_get_jobs,
            RPCMethod.GET_JOB.value: self._handle_get_job,
            RPCMethod.GET_ALL_JOBS.value: self._handle_get_all_jobs,
            RPCMethod.GET_SERVICES.value: self._handle_get_services,
            RPCMethod.GET_TAGS_MAP.value: self._handle_get_tags_map,
            RPCMethod.GET_DEPENDENCIES_MAP.value: self._handle_get_dependencies_map,
            RPCMethod.KILL_JOB.value: self._handle_kill_job,
            RPCMethod.CLEAN_JOB.value: self._handle_clean_job,
            RPCMethod.GET_SYNC_INFO.value: self._handle_get_sync_info,
            RPCMethod.GET_PROCESS_INFO.value: self._handle_get_process_info,
        }

    def start(self):
        """Start the server and begin processing requests

        This method blocks until the server is stopped or stdin is closed.
        If the workspace does not exist or the state provider cannot be
        created, a shutdown notification describing the error is sent and the
        method returns without entering the read loop.
        """
        # Verify workspace exists
        if not self.workspace_path.exists():
            logger.error("Workspace path does not exist: %s", self.workspace_path)
            self._send_error_and_exit(
                WORKSPACE_NOT_FOUND,
                f"Workspace path does not exist: {self.workspace_path}",
            )
            return

        # Obtain the state provider for this workspace.
        # NOTE(review): the original comment says "read-only mode with event
        # watcher"; that is not visible from this call - confirm in
        # WorkspaceStateProvider.get_instance.
        try:
            self._state_provider = WorkspaceStateProvider.get_instance(
                self.workspace_path
            )
        except Exception as e:
            logger.exception("Failed to initialize state provider")
            self._send_error_and_exit(INTERNAL_ERROR, f"Failed to initialize: {e}")
            return

        # Register as listener for state changes
        self._state_provider.add_listener(self._on_state_event)

        self._shutdown_sent = False
        self._running = True
        logger.info("SSH State Provider Server started for %s", self.workspace_path)

        try:
            self._read_loop()
        finally:
            self.stop()

    def stop(self):
        """Stop the server and clean up resources

        Safe to call several times: the listener removal is best-effort and
        the shutdown notification is sent at most once per run.
        """
        self._running = False

        # Unregister listener (best-effort)
        if self._state_provider is not None:
            try:
                self._state_provider.remove_listener(self._on_state_event)
            except Exception:
                logger.debug("Could not remove state listener", exc_info=True)

        if self._shutdown_sent:
            return
        self._shutdown_sent = True

        # Send shutdown notification (best-effort: the client may be gone)
        try:
            self._send_notification(
                NotificationMethod.SHUTDOWN, {"reason": "server_shutdown"}
            )
        except Exception:
            logger.debug("Could not send shutdown notification", exc_info=True)

        logger.info("SSH State Provider Server stopped")

    def _read_loop(self):
        """Main loop: read JSON-RPC requests from stdin and process them

        The loop ends on EOF, when ``_running`` becomes false, or when reading
        from stdin fails. Errors raised while handling a single line are
        logged and do not stop the loop.
        """
        while self._running:
            try:
                line = self.stdin.readline()
            except Exception:
                # A failing read (closed stream, broken pipe, ...) is not
                # recoverable; retrying would spin forever on the same error.
                logger.exception("Failed to read from stdin, stopping server")
                break

            if not line:
                # EOF - stdin closed
                logger.debug("stdin closed, stopping server")
                break

            try:
                try:
                    line_str = line.decode("utf-8").strip()
                except UnicodeDecodeError as e:
                    # Same reply as for malformed JSON (no request id known)
                    self._send_response(
                        create_error_response(0, PARSE_ERROR, str(e))
                    )
                    continue

                if not line_str:
                    continue

                self._process_request(line_str)

            except Exception as e:
                logger.exception("Error in read loop: %s", e)
                # Continue processing - don't crash on individual request errors

    def _process_request(self, line: str):
        """Process a single JSON-RPC request

        Sends exactly one response for a request that carries an id:
        the handler result, or an error (PARSE_ERROR, METHOD_NOT_FOUND,
        INVALID_PARAMS when the handler raises ``TypeError`` or params is not
        an object, INTERNAL_ERROR for any other exception). Messages that are
        not requests, and requests without an id, are ignored.
        """
        try:
            msg = parse_message(line)
        except ValueError as e:
            self._send_response(create_error_response(0, PARSE_ERROR, str(e)))
            return

        # We only handle requests (with id), not responses or notifications
        from experimaestro.scheduler.remote.protocol import RPCRequest

        if not isinstance(msg, RPCRequest):
            logger.warning("Received non-request message: %s", type(msg).__name__)
            return

        method = msg.method
        request_id = msg.id

        if request_id is None:
            # Notification from client - we don't handle these currently
            logger.debug("Received notification: %s", method)
            return

        # Dispatch to handler
        handler = self._handlers.get(method)
        if handler is None:
            self._send_response(
                create_error_response(
                    request_id, METHOD_NOT_FOUND, f"Unknown method: {method}"
                )
            )
            return

        # Every handler reads its arguments with params.get(...): treat
        # omitted params as empty and reject anything that is not an object.
        params = {} if msg.params is None else msg.params
        if not isinstance(params, dict):
            self._send_response(
                create_error_response(
                    request_id, INVALID_PARAMS, "params must be an object"
                )
            )
            return

        try:
            result = handler(params)
            self._send_response(create_success_response(request_id, result))
        except TypeError as e:
            # Handlers raise TypeError for missing required parameters
            self._send_response(
                create_error_response(request_id, INVALID_PARAMS, str(e))
            )
        except Exception as e:
            logger.exception("Error handling %s", method)
            self._send_response(
                create_error_response(request_id, INTERNAL_ERROR, str(e))
            )

    def _send_response(self, response: str):
        """Write one serialized JSON-RPC message followed by a newline (thread-safe)"""
        with self._write_lock:
            self.stdout.write((response + "\n").encode("utf-8"))
            self.stdout.flush()

    def _send_notification(self, method: NotificationMethod, params: Dict):
        """Send a JSON-RPC notification (thread-safe)"""
        notification = create_notification(method, params)
        self._send_response(notification)

    def _send_error_and_exit(self, code: int, message: str):
        """Send a shutdown notification describing a fatal error

        Despite its name this method only sends the notification; the caller
        is responsible for returning afterwards.
        """
        self._send_notification(
            NotificationMethod.SHUTDOWN,
            {"reason": "error", "code": code, "message": message},
        )

    def _on_state_event(self, event: EventBase):
        """Handle state change events from the state provider

        Converts the event to a dict with ``dataclasses.asdict`` and sends it
        as a STATE_EVENT notification. Fields that are ``None``, ``Path``
        instances, or not JSON-serializable are dropped. Any error is logged
        and never propagated to the caller.
        """
        import json

        try:
            event_dict = {}
            for key, value in asdict(event).items():
                # Skip None values and paths
                if value is None or isinstance(value, Path):
                    continue
                # Keep only values that can be encoded as JSON
                try:
                    json.dumps(value)
                except (TypeError, ValueError):
                    continue
                event_dict[key] = value

            self._send_notification(
                NotificationMethod.STATE_EVENT,
                {
                    "event_type": type(event).__name__,
                    "data": event_dict,
                },
            )
        except Exception as e:
            logger.exception("Error sending notification: %s", e)

    # -------------------------------------------------------------------------
    # Request Handlers
    # -------------------------------------------------------------------------

    def _handle_get_experiments(self, params: Dict) -> list:
        """Handle get_experiments request (optional ``since`` filter)"""
        since = deserialize_datetime(params.get("since"))
        experiments = self._state_provider.get_experiments(since=since)
        return [exp.state_dict() for exp in experiments]

    def _handle_get_experiment(self, params: Dict) -> Optional[Dict]:
        """Handle get_experiment request

        Returns the experiment state dict, or None when it is not found.

        Raises:
            TypeError: if ``experiment_id`` is missing
        """
        experiment_id = params.get("experiment_id")
        if not experiment_id:
            raise TypeError("experiment_id is required")

        experiment = self._state_provider.get_experiment(experiment_id)
        if experiment is None:
            return None
        return experiment.state_dict()

    def _handle_get_experiment_runs(self, params: Dict) -> list:
        """Handle get_experiment_runs request

        Raises:
            TypeError: if ``experiment_id`` is missing
        """
        experiment_id = params.get("experiment_id")
        if not experiment_id:
            raise TypeError("experiment_id is required")

        runs = self._state_provider.get_experiment_runs(experiment_id)
        return [run.state_dict() for run in runs]

    def _handle_get_jobs(self, params: Dict) -> list:
        """Handle get_jobs request (all filters are optional)"""
        since = deserialize_datetime(params.get("since"))
        jobs = self._state_provider.get_jobs(
            experiment_id=params.get("experiment_id"),
            run_id=params.get("run_id"),
            task_id=params.get("task_id"),
            state=params.get("state"),
            tags=params.get("tags"),
            since=since,
        )
        return [job.state_dict() for job in jobs]

    def _handle_get_job(self, params: Dict) -> Optional[Dict]:
        """Handle get_job request

        Returns the job state dict, or None when the job is not found.

        Raises:
            TypeError: if ``job_id`` or ``experiment_id`` is missing
        """
        job_id = params.get("job_id")
        experiment_id = params.get("experiment_id")
        if not job_id or not experiment_id:
            raise TypeError("job_id and experiment_id are required")

        job = self._state_provider.get_job(
            job_id=job_id,
            experiment_id=experiment_id,
            run_id=params.get("run_id"),
        )
        if job is None:
            return None
        return job.state_dict()

    def _handle_get_all_jobs(self, params: Dict) -> list:
        """Handle get_all_jobs request (all filters are optional)"""
        since = deserialize_datetime(params.get("since"))
        jobs = self._state_provider.get_all_jobs(
            state=params.get("state"),
            tags=params.get("tags"),
            since=since,
        )
        return [job.state_dict() for job in jobs]

    def _handle_get_services(self, params: Dict) -> list:
        """Handle get_services request

        Returns serialized service data using full_state_dict().
        """
        services = self._state_provider.get_services(
            experiment_id=params.get("experiment_id"),
            run_id=params.get("run_id"),
        )
        return [svc.full_state_dict() for svc in services]

    def _handle_get_tags_map(self, params: Dict) -> Dict[str, Dict[str, str]]:
        """Handle get_tags_map request

        Returns tags map for jobs in an experiment/run.

        Raises:
            TypeError: if ``experiment_id`` is missing
        """
        experiment_id = params.get("experiment_id")
        if not experiment_id:
            raise TypeError("experiment_id is required")

        return self._state_provider.get_tags_map(
            experiment_id=experiment_id,
            run_id=params.get("run_id"),
        )

    def _handle_get_dependencies_map(self, params: Dict) -> dict[str, list[str]]:
        """Handle get_dependencies_map request

        Returns dependencies map for jobs in an experiment/run.

        Raises:
            TypeError: if ``experiment_id`` is missing
        """
        experiment_id = params.get("experiment_id")
        if not experiment_id:
            raise TypeError("experiment_id is required")

        return self._state_provider.get_dependencies_map(
            experiment_id=experiment_id,
            run_id=params.get("run_id"),
        )

    def _run_job_action(self, params: Dict, action_name: str) -> Dict:
        """Look up a job and call ``state_provider.<action_name>(job, perform=True)``

        Shared implementation of the kill_job and clean_job handlers.

        Returns:
            ``{"success": <result>}`` on completion, or
            ``{"success": False, "error": <message>}`` when the job is not
            found or the action raises.

        Raises:
            TypeError: if ``job_id``, ``experiment_id`` or ``run_id`` is missing
        """
        job_id = params.get("job_id")
        experiment_id = params.get("experiment_id")
        run_id = params.get("run_id")

        if not job_id or not experiment_id or not run_id:
            raise TypeError("job_id, experiment_id, and run_id are required")

        # Get the job first
        job = self._state_provider.get_job(job_id, experiment_id, run_id)
        if job is None:
            return {"success": False, "error": "Job not found"}

        # Failures are reported to the client in the result, not as RPC errors
        try:
            action = getattr(self._state_provider, action_name)
            return {"success": action(job, perform=True)}
        except Exception as e:
            return {"success": False, "error": str(e)}

    def _handle_kill_job(self, params: Dict) -> Dict:
        """Handle kill_job request"""
        return self._run_job_action(params, "kill_job")

    def _handle_clean_job(self, params: Dict) -> Dict:
        """Handle clean_job request"""
        return self._run_job_action(params, "clean_job")

    def _handle_get_sync_info(self, params: Dict) -> Dict:
        """Handle get_sync_info request

        Returns the workspace path and, when the state provider exposes
        ``get_last_sync_time``, the serialized last sync time (else None).
        """
        if hasattr(self._state_provider, "get_last_sync_time"):
            last_sync_time = serialize_datetime(
                self._state_provider.get_last_sync_time()
            )
        else:
            last_sync_time = None

        return {
            "workspace_path": str(self.workspace_path),
            "last_sync_time": last_sync_time,
        }

    def _handle_get_process_info(self, params: Dict) -> Optional[Dict]:
        """Handle get_process_info request

        Returns a dict with ``pid``, ``type`` and ``running``, or None when
        the job or its process information is not available.

        Raises:
            TypeError: if ``job_id`` or ``experiment_id`` is missing
        """
        job_id = params.get("job_id")
        experiment_id = params.get("experiment_id")
        run_id = params.get("run_id")

        if not job_id or not experiment_id:
            raise TypeError("job_id and experiment_id are required")

        # Get the job first
        job = self._state_provider.get_job(job_id, experiment_id, run_id)
        if job is None:
            return None

        # Get process info
        pinfo = self._state_provider.get_process_info(job)
        if pinfo is None:
            return None

        # Serialize ProcessInfo to dict
        return {
            "pid": pinfo.pid,
            "type": pinfo.type,
            "running": pinfo.running,
        }
@@ -0,0 +1,144 @@
1
+ """Remote File Synchronizer
2
+
3
+ Handles rsync-based file synchronization between remote and local workspaces
4
+ for SSH-based experiment monitoring. Only syncs specific paths on-demand
5
+ when services need them (e.g., TensorboardService).
6
+ """
7
+
8
+ import logging
9
+ import subprocess
10
+ from pathlib import Path
11
+ from typing import List, Optional
12
+
13
+ logger = logging.getLogger("xpm.remote.sync")
14
+
15
+
16
class RemoteFileSynchronizer:
    """Handles rsync-based file synchronization for remote monitoring

    Syncs specific paths on-demand from a remote host to a local cache
    directory. Used when services need access to remote files.
    """

    #: Maximum duration of a single rsync invocation, in seconds
    RSYNC_TIMEOUT_SECONDS = 300

    def __init__(
        self,
        host: str,
        remote_workspace: Path,
        local_cache: Path,
        ssh_options: Optional[List[str]] = None,
    ):
        """Initialize the synchronizer

        Args:
            host: SSH host (user@host or just host)
            remote_workspace: Path to workspace on the remote host
            local_cache: Local directory to sync files to
            ssh_options: Additional SSH options (e.g., ["-p", "2222"])
        """
        self.host = host
        self.remote_workspace = remote_workspace
        self.local_cache = local_cache
        self.ssh_options = ssh_options or []

    def _workspace_root(self) -> str:
        """Return the remote workspace as a POSIX string without trailing slash"""
        return Path(self.remote_workspace).as_posix().rstrip("/")

    def _relative_to_workspace(self, remote_path: str) -> Optional[str]:
        """Return ``remote_path`` relative to the remote workspace

        The comparison is done on whole path components, so a sibling such as
        ``/data/ws2`` is not considered to be inside ``/data/ws``.

        Returns:
            The relative path ("" for the workspace itself), or None when
            ``remote_path`` is not located under the workspace.
        """
        root = self._workspace_root()
        if root and remote_path == root:
            return ""
        if remote_path.startswith(root + "/"):
            return remote_path[len(root) :].lstrip("/")
        return None

    def sync_path(self, remote_path: str) -> Path:
        """Sync a specific path from remote

        Args:
            remote_path: Absolute path on remote or path relative to workspace

        Returns:
            Local path where the files were synced to

        Raises:
            ValueError: if the path is empty (or is the workspace itself), or
                if it contains ``..`` components
            RuntimeError: if rsync is missing or fails
            subprocess.TimeoutExpired: if rsync exceeds the timeout
        """
        # Normalize the path - get relative path within workspace
        relative_path = self._relative_to_workspace(remote_path)
        if relative_path is None:
            # Not under the workspace: interpret it as workspace-relative
            relative_path = remote_path.lstrip("/")

        if not relative_path:
            raise ValueError("Cannot sync empty path")

        # The destination is created and rsync runs with --delete, so the
        # path must never climb out of the local cache directory.
        if ".." in relative_path.split("/"):
            raise ValueError(
                f"Cannot sync path with parent references: {remote_path}"
            )

        logger.info("Syncing path: %s", relative_path)

        # Build source and destination (trailing slashes: sync dir contents)
        source = f"{self.host}:{self._workspace_root()}/{relative_path}/"
        local_path = self.local_cache / relative_path
        local_path.mkdir(parents=True, exist_ok=True)
        dest = f"{local_path}/"

        self._rsync(source, dest)

        return local_path

    def _build_rsync_command(self, source: str, dest: str) -> List[str]:
        """Build the rsync argument list for copying ``source`` to ``dest``"""
        import shlex

        cmd = [
            "rsync",
            "--inplace",  # Update destination files in-place
            "--delete",  # Delete extraneous files from destination
            "-L",  # Transform symlinks into referent file/dir
            "-a",  # Archive mode (preserves permissions, times, etc.)
            "-z",  # Compress during transfer
            "-v",  # Verbose
        ]

        # SSH options: rsync splits the -e value on spaces and honours
        # quotes, so each option is quoted to keep embedded spaces intact.
        if self.ssh_options:
            cmd.extend(["-e", shlex.join(["ssh", *self.ssh_options])])

        cmd.extend([source, dest])
        return cmd

    def _rsync(self, source: str, dest: str):
        """Execute rsync command

        Exit codes 23 (partial transfer due to error) and 24 (source files
        vanished) are logged and tolerated; any other non-zero code raises.

        Args:
            source: Remote source path (host:path/)
            dest: Local destination path

        Raises:
            RuntimeError: if rsync is not installed or exits with an
                unexpected code
            subprocess.TimeoutExpired: if rsync exceeds the timeout
        """
        cmd = self._build_rsync_command(source, dest)

        logger.debug("Running rsync: %s", " ".join(cmd))

        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=self.RSYNC_TIMEOUT_SECONDS,
            )
        except subprocess.TimeoutExpired:
            logger.error("Rsync timed out")
            raise
        except FileNotFoundError as err:
            logger.error("rsync command not found - please install rsync")
            raise RuntimeError("rsync command not found") from err

        if result.returncode == 0:
            logger.debug("Rsync completed successfully")
        elif result.returncode == 23:
            # Partial transfer due to error - some files may be missing
            logger.warning("Rsync partial transfer: %s", result.stderr)
        elif result.returncode == 24:
            # Partial transfer due to vanished source files
            logger.debug("Rsync: some source files vanished")
        else:
            logger.error(
                "Rsync failed (code %d): %s",
                result.returncode,
                result.stderr,
            )
            raise RuntimeError(f"Rsync failed: {result.stderr}")

    def get_local_path(self, remote_path: str) -> Path:
        """Get the local cache path for a remote path

        Args:
            remote_path: Absolute path on the remote system

        Returns:
            Corresponding path in the local cache, or ``Path(remote_path)``
            unchanged when the path is not under the remote workspace
        """
        relative = self._relative_to_workspace(remote_path)
        if relative is None:
            return Path(remote_path)
        return self.local_cache / relative