experimaestro 2.0.0a8__py3-none-any.whl → 2.0.0b8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of experimaestro might be problematic. Click here for more details.
- experimaestro/__init__.py +10 -11
- experimaestro/annotations.py +167 -206
- experimaestro/cli/__init__.py +278 -7
- experimaestro/cli/filter.py +42 -74
- experimaestro/cli/jobs.py +157 -106
- experimaestro/cli/refactor.py +249 -0
- experimaestro/click.py +0 -1
- experimaestro/commandline.py +19 -3
- experimaestro/connectors/__init__.py +20 -1
- experimaestro/connectors/local.py +12 -0
- experimaestro/core/arguments.py +182 -46
- experimaestro/core/identifier.py +107 -6
- experimaestro/core/objects/__init__.py +6 -0
- experimaestro/core/objects/config.py +542 -25
- experimaestro/core/objects/config_walk.py +20 -0
- experimaestro/core/serialization.py +91 -34
- experimaestro/core/subparameters.py +164 -0
- experimaestro/core/types.py +175 -38
- experimaestro/exceptions.py +26 -0
- experimaestro/experiments/cli.py +111 -25
- experimaestro/generators.py +50 -9
- experimaestro/huggingface.py +3 -1
- experimaestro/launcherfinder/parser.py +29 -0
- experimaestro/launchers/__init__.py +26 -1
- experimaestro/launchers/direct.py +12 -0
- experimaestro/launchers/slurm/base.py +154 -2
- experimaestro/mkdocs/metaloader.py +0 -1
- experimaestro/mypy.py +452 -7
- experimaestro/notifications.py +63 -13
- experimaestro/progress.py +0 -2
- experimaestro/rpyc.py +0 -1
- experimaestro/run.py +19 -6
- experimaestro/scheduler/base.py +510 -125
- experimaestro/scheduler/dependencies.py +43 -28
- experimaestro/scheduler/dynamic_outputs.py +259 -130
- experimaestro/scheduler/experiment.py +256 -31
- experimaestro/scheduler/interfaces.py +501 -0
- experimaestro/scheduler/jobs.py +216 -206
- experimaestro/scheduler/remote/__init__.py +31 -0
- experimaestro/scheduler/remote/client.py +874 -0
- experimaestro/scheduler/remote/protocol.py +467 -0
- experimaestro/scheduler/remote/server.py +423 -0
- experimaestro/scheduler/remote/sync.py +144 -0
- experimaestro/scheduler/services.py +323 -23
- experimaestro/scheduler/state_db.py +437 -0
- experimaestro/scheduler/state_provider.py +2766 -0
- experimaestro/scheduler/state_sync.py +891 -0
- experimaestro/scheduler/workspace.py +52 -10
- experimaestro/scriptbuilder.py +7 -0
- experimaestro/server/__init__.py +147 -57
- experimaestro/server/data/index.css +0 -125
- experimaestro/server/data/index.css.map +1 -1
- experimaestro/server/data/index.js +194 -58
- experimaestro/server/data/index.js.map +1 -1
- experimaestro/settings.py +44 -5
- experimaestro/sphinx/__init__.py +3 -3
- experimaestro/taskglobals.py +20 -0
- experimaestro/tests/conftest.py +80 -0
- experimaestro/tests/core/test_generics.py +2 -2
- experimaestro/tests/identifier_stability.json +45 -0
- experimaestro/tests/launchers/bin/sacct +6 -2
- experimaestro/tests/launchers/bin/sbatch +4 -2
- experimaestro/tests/launchers/test_slurm.py +80 -0
- experimaestro/tests/tasks/test_dynamic.py +231 -0
- experimaestro/tests/test_cli_jobs.py +615 -0
- experimaestro/tests/test_deprecated.py +630 -0
- experimaestro/tests/test_environment.py +200 -0
- experimaestro/tests/test_file_progress_integration.py +1 -1
- experimaestro/tests/test_forward.py +3 -3
- experimaestro/tests/test_identifier.py +372 -41
- experimaestro/tests/test_identifier_stability.py +458 -0
- experimaestro/tests/test_instance.py +3 -3
- experimaestro/tests/test_multitoken.py +442 -0
- experimaestro/tests/test_mypy.py +433 -0
- experimaestro/tests/test_objects.py +312 -5
- experimaestro/tests/test_outputs.py +2 -2
- experimaestro/tests/test_param.py +8 -12
- experimaestro/tests/test_partial_paths.py +231 -0
- experimaestro/tests/test_progress.py +0 -48
- experimaestro/tests/test_remote_state.py +671 -0
- experimaestro/tests/test_resumable_task.py +480 -0
- experimaestro/tests/test_serializers.py +141 -1
- experimaestro/tests/test_state_db.py +434 -0
- experimaestro/tests/test_subparameters.py +160 -0
- experimaestro/tests/test_tags.py +136 -0
- experimaestro/tests/test_tasks.py +107 -121
- experimaestro/tests/test_token_locking.py +252 -0
- experimaestro/tests/test_tokens.py +17 -13
- experimaestro/tests/test_types.py +123 -1
- experimaestro/tests/test_workspace_triggers.py +158 -0
- experimaestro/tests/token_reschedule.py +4 -2
- experimaestro/tests/utils.py +2 -2
- experimaestro/tokens.py +154 -57
- experimaestro/tools/diff.py +1 -1
- experimaestro/tui/__init__.py +8 -0
- experimaestro/tui/app.py +2395 -0
- experimaestro/tui/app.tcss +353 -0
- experimaestro/tui/log_viewer.py +228 -0
- experimaestro/utils/__init__.py +23 -0
- experimaestro/utils/environment.py +148 -0
- experimaestro/utils/git.py +129 -0
- experimaestro/utils/resources.py +1 -1
- experimaestro/version.py +34 -0
- {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/METADATA +68 -38
- experimaestro-2.0.0b8.dist-info/RECORD +187 -0
- {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/WHEEL +1 -1
- experimaestro-2.0.0b8.dist-info/entry_points.txt +16 -0
- experimaestro/compat.py +0 -6
- experimaestro/core/objects.pyi +0 -221
- experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
- experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
- experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
- experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
- experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
- experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
- experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
- experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
- experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
- experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
- experimaestro-2.0.0a8.dist-info/RECORD +0 -166
- experimaestro-2.0.0a8.dist-info/entry_points.txt +0 -17
- {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,423 @@
|
|
|
1
|
+
"""SSH State Provider Server
|
|
2
|
+
|
|
3
|
+
JSON-RPC server that wraps WorkspaceStateProvider and communicates via stdio.
|
|
4
|
+
Designed to be run over SSH for remote experiment monitoring.
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
experimaestro experiments monitor-server --workdir /path/to/workspace
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
import sys
|
|
12
|
+
import threading
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import IO, Callable, Dict, Optional
|
|
15
|
+
|
|
16
|
+
from experimaestro.scheduler.state_provider import (
|
|
17
|
+
WorkspaceStateProvider,
|
|
18
|
+
StateEvent,
|
|
19
|
+
StateEventType,
|
|
20
|
+
)
|
|
21
|
+
from experimaestro.scheduler.remote.protocol import (
|
|
22
|
+
RPCMethod,
|
|
23
|
+
NotificationMethod,
|
|
24
|
+
parse_message,
|
|
25
|
+
create_success_response,
|
|
26
|
+
create_error_response,
|
|
27
|
+
create_notification,
|
|
28
|
+
serialize_job,
|
|
29
|
+
serialize_experiment,
|
|
30
|
+
serialize_run,
|
|
31
|
+
serialize_datetime,
|
|
32
|
+
deserialize_datetime,
|
|
33
|
+
PARSE_ERROR,
|
|
34
|
+
METHOD_NOT_FOUND,
|
|
35
|
+
INVALID_PARAMS,
|
|
36
|
+
INTERNAL_ERROR,
|
|
37
|
+
WORKSPACE_NOT_FOUND,
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
logger = logging.getLogger("xpm.remote.server")
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class SSHStateProviderServer:
    """JSON-RPC server that wraps WorkspaceStateProvider for SSH-based monitoring.

    This server reads JSON-RPC requests from stdin and writes responses to stdout.
    It registers as a listener with the WorkspaceStateProvider to push notifications
    when state changes occur.

    Thread safety:
    - Writes to stdout are serialized with a lock
    - The main read loop runs in the calling thread
    - Event notifications may come from the state provider's change detector thread
    """

    def __init__(
        self,
        workspace_path: Path,
        stdin: Optional[IO[bytes]] = None,
        stdout: Optional[IO[bytes]] = None,
    ):
        """Initialize the server

        Args:
            workspace_path: Path to the workspace directory
            stdin: Input stream for reading requests (default: sys.stdin.buffer)
            stdout: Output stream for writing responses (default: sys.stdout.buffer)
        """
        self.workspace_path = workspace_path
        self.stdin = stdin if stdin is not None else sys.stdin.buffer
        self.stdout = stdout if stdout is not None else sys.stdout.buffer
        # Created lazily in start(); handlers must not run before then
        self._state_provider: Optional["WorkspaceStateProvider"] = None
        self._running = False
        # Serializes writes to stdout: responses (main thread) and
        # notifications (state-provider listener thread) share one stream
        self._write_lock = threading.Lock()

        # Map of JSON-RPC method names to handler functions
        self._handlers: Dict[str, Callable] = {
            RPCMethod.GET_EXPERIMENTS.value: self._handle_get_experiments,
            RPCMethod.GET_EXPERIMENT.value: self._handle_get_experiment,
            RPCMethod.GET_EXPERIMENT_RUNS.value: self._handle_get_experiment_runs,
            RPCMethod.GET_JOBS.value: self._handle_get_jobs,
            RPCMethod.GET_JOB.value: self._handle_get_job,
            RPCMethod.GET_ALL_JOBS.value: self._handle_get_all_jobs,
            RPCMethod.GET_SERVICES.value: self._handle_get_services,
            RPCMethod.KILL_JOB.value: self._handle_kill_job,
            RPCMethod.CLEAN_JOB.value: self._handle_clean_job,
            RPCMethod.GET_SYNC_INFO.value: self._handle_get_sync_info,
        }

    def start(self):
        """Start the server and begin processing requests

        This method blocks until the server is stopped or stdin is closed.
        """
        # Verify workspace exists before touching the state provider
        if not self.workspace_path.exists():
            logger.error("Workspace path does not exist: %s", self.workspace_path)
            self._send_error_and_exit(
                WORKSPACE_NOT_FOUND,
                f"Workspace path does not exist: {self.workspace_path}",
            )
            return

        # Initialize state provider in read-only mode: the server only
        # observes and relays state, it never owns the workspace
        try:
            self._state_provider = WorkspaceStateProvider.get_instance(
                self.workspace_path,
                read_only=True,
                sync_on_start=True,
            )
        except Exception as e:
            logger.exception("Failed to initialize state provider")
            self._send_error_and_exit(INTERNAL_ERROR, f"Failed to initialize: {e}")
            return

        # Register as listener for state changes
        self._state_provider.add_listener(self._on_state_event)

        self._running = True
        logger.info("SSH State Provider Server started for %s", self.workspace_path)

        try:
            self._read_loop()
        finally:
            # Always unregister and notify the client, even on crash
            self.stop()

    def stop(self):
        """Stop the server and clean up resources.

        Idempotent; safe to call even if start() failed partway through.
        """
        self._running = False

        # Unregister listener (best-effort: the provider may already be gone)
        if self._state_provider is not None:
            try:
                self._state_provider.remove_listener(self._on_state_event)
            except Exception:
                pass

        # Send shutdown notification (best-effort: stdout may be closed)
        try:
            self._send_notification(
                NotificationMethod.SHUTDOWN, {"reason": "server_shutdown"}
            )
        except Exception:
            pass

        logger.info("SSH State Provider Server stopped")

    def _read_loop(self):
        """Main loop: read JSON-RPC requests from stdin and process them.

        One request per line (newline-delimited JSON). Returns when stdin
        reaches EOF or the server is stopped.
        """
        while self._running:
            try:
                line = self.stdin.readline()
                if not line:
                    # EOF - stdin closed (e.g. SSH connection dropped)
                    logger.debug("stdin closed, stopping server")
                    break

                line_str = line.decode("utf-8").strip()
                if not line_str:
                    # Skip blank keep-alive lines
                    continue

                self._process_request(line_str)

            except Exception as e:
                logger.exception("Error in read loop: %s", e)
                # Continue processing - don't crash on individual request errors

    def _process_request(self, line: str):
        """Parse and dispatch a single JSON-RPC request line.

        Sends exactly one response per request id; notifications and
        non-request messages are logged and dropped.
        """
        try:
            msg = parse_message(line)
        except ValueError as e:
            # NOTE(review): JSON-RPC 2.0 uses a null id for parse errors;
            # this sends 0 - confirm create_error_response accepts None
            self._send_response(create_error_response(0, PARSE_ERROR, str(e)))
            return

        # We only handle requests (with id), not responses or notifications.
        # Imported here to avoid a potential import cycle with the protocol module.
        from experimaestro.scheduler.remote.protocol import RPCRequest

        if not isinstance(msg, RPCRequest):
            logger.warning("Received non-request message: %s", type(msg).__name__)
            return

        request = msg
        method = request.method
        params = request.params
        request_id = request.id

        if request_id is None:
            # Notification from client - we don't handle these currently
            logger.debug("Received notification: %s", method)
            return

        # Dispatch to handler
        handler = self._handlers.get(method)
        if handler is None:
            self._send_response(
                create_error_response(
                    request_id, METHOD_NOT_FOUND, f"Unknown method: {method}"
                )
            )
            return

        try:
            result = handler(params)
            self._send_response(create_success_response(request_id, result))
        except TypeError as e:
            # Handlers raise TypeError for missing/invalid parameters.
            # Caveat: a TypeError from deeper inside a handler is also
            # reported as INVALID_PARAMS.
            self._send_response(
                create_error_response(request_id, INVALID_PARAMS, str(e))
            )
        except Exception as e:
            logger.exception("Error handling %s", method)
            self._send_response(
                create_error_response(request_id, INTERNAL_ERROR, str(e))
            )

    def _send_response(self, response: str):
        """Send a JSON-RPC response line (thread-safe, newline-delimited)."""
        with self._write_lock:
            self.stdout.write((response + "\n").encode("utf-8"))
            # Flush immediately: the client blocks on line-buffered reads
            self.stdout.flush()

    def _send_notification(self, method: "NotificationMethod", params: Dict):
        """Send a JSON-RPC notification (thread-safe)."""
        notification = create_notification(method, params)
        self._send_response(notification)

    def _send_error_and_exit(self, code: int, message: str):
        """Send a fatal-error SHUTDOWN notification.

        Despite the name this does not call sys.exit(); callers return
        immediately after invoking it.
        """
        self._send_notification(
            NotificationMethod.SHUTDOWN,
            {"reason": "error", "code": code, "message": message},
        )

    def _on_state_event(self, event: "StateEvent"):
        """Handle state change events from the state provider.

        Converts events to JSON-RPC notifications and sends them to the client.
        May run on the provider's change-detector thread; writes are
        serialized by _send_response.
        """
        try:
            if event.event_type == StateEventType.EXPERIMENT_UPDATED:
                self._send_notification(
                    NotificationMethod.EXPERIMENT_UPDATED,
                    {
                        "experiment_id": event.data.get("experiment_id"),
                        "data": event.data,
                    },
                )
            elif event.event_type == StateEventType.RUN_UPDATED:
                self._send_notification(
                    NotificationMethod.RUN_UPDATED,
                    {
                        "experiment_id": event.data.get("experiment_id"),
                        "run_id": event.data.get("run_id"),
                        "data": event.data,
                    },
                )
            elif event.event_type == StateEventType.JOB_UPDATED:
                self._send_notification(
                    NotificationMethod.JOB_UPDATED,
                    {
                        "job_id": event.data.get("job_id"),
                        "experiment_id": event.data.get("experiment_id"),
                        "run_id": event.data.get("run_id"),
                        "state": event.data.get("state"),
                        "data": event.data,
                    },
                )
                # Also send file_changed notification for job metadata
                if "path" in event.data and event.data["path"]:
                    self._send_notification(
                        NotificationMethod.FILE_CHANGED,
                        {
                            "path": f"{event.data['path']}/.experimaestro/",
                            "change_type": "modified",
                        },
                    )
            elif event.event_type == StateEventType.SERVICE_UPDATED:
                self._send_notification(
                    NotificationMethod.SERVICE_UPDATED,
                    {
                        "service_id": event.data.get("service_id"),
                        "experiment_id": event.data.get("experiment_id"),
                        "run_id": event.data.get("run_id"),
                        "state": event.data.get("state"),
                        "data": event.data,
                    },
                )
        except Exception as e:
            # Never let a notification failure kill the detector thread
            logger.exception("Error sending notification: %s", e)

    # -------------------------------------------------------------------------
    # Request Handlers
    # -------------------------------------------------------------------------

    def _handle_get_experiments(self, params: Dict) -> list:
        """Handle get_experiments request (optionally filtered by `since`)."""
        since = deserialize_datetime(params.get("since"))
        experiments = self._state_provider.get_experiments(since=since)
        return [serialize_experiment(exp) for exp in experiments]

    def _handle_get_experiment(self, params: Dict) -> Optional[Dict]:
        """Handle get_experiment request; returns None if not found."""
        experiment_id = params.get("experiment_id")
        if not experiment_id:
            raise TypeError("experiment_id is required")

        experiment = self._state_provider.get_experiment(experiment_id)
        if experiment is None:
            return None
        return serialize_experiment(experiment)

    def _handle_get_experiment_runs(self, params: Dict) -> list:
        """Handle get_experiment_runs request."""
        experiment_id = params.get("experiment_id")
        if not experiment_id:
            raise TypeError("experiment_id is required")

        runs = self._state_provider.get_experiment_runs(experiment_id)
        return [serialize_run(run) for run in runs]

    def _handle_get_jobs(self, params: Dict) -> list:
        """Handle get_jobs request with optional filters."""
        since = deserialize_datetime(params.get("since"))
        jobs = self._state_provider.get_jobs(
            experiment_id=params.get("experiment_id"),
            run_id=params.get("run_id"),
            task_id=params.get("task_id"),
            state=params.get("state"),
            tags=params.get("tags"),
            since=since,
        )
        return [serialize_job(job) for job in jobs]

    def _handle_get_job(self, params: Dict) -> Optional[Dict]:
        """Handle get_job request; returns None if not found."""
        job_id = params.get("job_id")
        experiment_id = params.get("experiment_id")
        if not job_id or not experiment_id:
            raise TypeError("job_id and experiment_id are required")

        job = self._state_provider.get_job(
            job_id=job_id,
            experiment_id=experiment_id,
            run_id=params.get("run_id"),
        )
        if job is None:
            return None
        return serialize_job(job)

    def _handle_get_all_jobs(self, params: Dict) -> list:
        """Handle get_all_jobs request (across all experiments)."""
        since = deserialize_datetime(params.get("since"))
        jobs = self._state_provider.get_all_jobs(
            state=params.get("state"),
            tags=params.get("tags"),
            since=since,
        )
        return [serialize_job(job) for job in jobs]

    def _handle_get_services(self, params: Dict) -> list:
        """Handle get_services request

        Uses get_services_raw to return raw service data without trying to
        recreate Service objects. This allows the client to handle module
        loading and show appropriate error messages.
        """
        return self._state_provider.get_services_raw(
            experiment_id=params.get("experiment_id"),
            run_id=params.get("run_id"),
        )

    def _handle_kill_job(self, params: Dict) -> Dict:
        """Handle kill_job request; returns a {"success": ...} dict."""
        job_id = params.get("job_id")
        experiment_id = params.get("experiment_id")
        run_id = params.get("run_id")

        if not job_id or not experiment_id or not run_id:
            raise TypeError("job_id, experiment_id, and run_id are required")

        # Get the job first
        job = self._state_provider.get_job(job_id, experiment_id, run_id)
        if job is None:
            return {"success": False, "error": "Job not found"}

        # Kill the job; errors are reported in-band rather than raised so
        # the client gets a structured result
        try:
            result = self._state_provider.kill_job(job, perform=True)
            return {"success": result}
        except Exception as e:
            return {"success": False, "error": str(e)}

    def _handle_clean_job(self, params: Dict) -> Dict:
        """Handle clean_job request; returns a {"success": ...} dict."""
        job_id = params.get("job_id")
        experiment_id = params.get("experiment_id")
        run_id = params.get("run_id")

        if not job_id or not experiment_id or not run_id:
            raise TypeError("job_id, experiment_id, and run_id are required")

        # Get the job first
        job = self._state_provider.get_job(job_id, experiment_id, run_id)
        if job is None:
            return {"success": False, "error": "Job not found"}

        # Clean the job; errors are reported in-band (see _handle_kill_job)
        try:
            result = self._state_provider.clean_job(job, perform=True)
            return {"success": result}
        except Exception as e:
            return {"success": False, "error": str(e)}

    def _handle_get_sync_info(self, params: Dict) -> Dict:
        """Handle get_sync_info request.

        `get_last_sync_time` may not exist on older providers, hence the
        hasattr guard.
        """
        return {
            "workspace_path": str(self.workspace_path),
            "last_sync_time": (
                serialize_datetime(self._state_provider.get_last_sync_time())
                if hasattr(self._state_provider, "get_last_sync_time")
                else None
            ),
        }
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
"""Remote File Synchronizer
|
|
2
|
+
|
|
3
|
+
Handles rsync-based file synchronization between remote and local workspaces
|
|
4
|
+
for SSH-based experiment monitoring. Only syncs specific paths on-demand
|
|
5
|
+
when services need them (e.g., TensorboardService).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
import subprocess
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import List, Optional
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger("xpm.remote.sync")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class RemoteFileSynchronizer:
    """Handles rsync-based file synchronization for remote monitoring

    Syncs specific paths on-demand from a remote host to a local cache
    directory. Used when services need access to remote files.
    """

    def __init__(
        self,
        host: str,
        remote_workspace: Path,
        local_cache: Path,
        ssh_options: Optional[List[str]] = None,
    ):
        """Initialize the synchronizer

        Args:
            host: SSH host (user@host or just host)
            remote_workspace: Path to workspace on the remote host
            local_cache: Local directory to sync files to
            ssh_options: Additional SSH options (e.g., ["-p", "2222"])
        """
        self.host = host
        self.remote_workspace = remote_workspace
        self.local_cache = local_cache
        self.ssh_options = ssh_options or []

    def _workspace_relative(self, remote_path: str) -> Optional[str]:
        """Return *remote_path* relative to the workspace, or None if it
        does not live under the remote workspace."""
        prefix = str(self.remote_workspace)
        if remote_path.startswith(prefix):
            return remote_path[len(prefix):].lstrip("/")
        return None

    def sync_path(self, remote_path: str) -> Path:
        """Sync a specific path from remote

        Args:
            remote_path: Absolute path on remote or path relative to workspace

        Returns:
            Local path where the files were synced to
        """
        # Normalize: either strip the workspace prefix, or treat the input
        # as already relative to the workspace root
        rel = self._workspace_relative(remote_path)
        if rel is None:
            rel = remote_path.lstrip("/")

        if not rel:
            raise ValueError("Cannot sync empty path")

        logger.info("Syncing path: %s", rel)

        # Trailing slashes: sync directory *contents* into the cache dir
        target = self.local_cache / rel
        target.mkdir(parents=True, exist_ok=True)
        self._rsync(
            f"{self.host}:{self.remote_workspace}/{rel}/",
            f"{target}/",
        )

        return target

    def _rsync(self, source: str, dest: str):
        """Execute rsync command

        Args:
            source: Remote source path (host:path/)
            dest: Local destination path

        Raises:
            RuntimeError: if rsync is missing or fails with a hard error
            subprocess.TimeoutExpired: if the transfer exceeds the timeout
        """
        flags = [
            "--inplace",  # Update destination files in-place
            "--delete",  # Delete extraneous files from destination
            "-L",  # Transform symlinks into referent file/dir
            "-a",  # Archive mode (preserves permissions, times, etc.)
            "-z",  # Compress during transfer
            "-v",  # Verbose
        ]
        cmd = ["rsync", *flags]

        # Route extra SSH options through rsync's remote-shell flag
        if self.ssh_options:
            cmd.extend(["-e", "ssh " + " ".join(self.ssh_options)])

        cmd.extend([source, dest])

        logger.debug("Running rsync: %s", " ".join(cmd))

        try:
            proc = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=300,  # 5 minute timeout
            )
        except subprocess.TimeoutExpired:
            logger.error("Rsync timed out")
            raise
        except FileNotFoundError:
            logger.error("rsync command not found - please install rsync")
            raise RuntimeError("rsync command not found")

        # rsync returns non-zero for some recoverable warnings
        if proc.returncode == 0:
            logger.debug("Rsync completed successfully")
        elif proc.returncode == 23:
            # Partial transfer due to error - some files may be missing
            logger.warning("Rsync partial transfer: %s", proc.stderr)
        elif proc.returncode == 24:
            # Partial transfer due to vanished source files
            logger.debug("Rsync: some source files vanished")
        else:
            logger.error(
                "Rsync failed (code %d): %s",
                proc.returncode,
                proc.stderr,
            )
            raise RuntimeError(f"Rsync failed: {proc.stderr}")

    def get_local_path(self, remote_path: str) -> Path:
        """Get the local cache path for a remote path

        Args:
            remote_path: Absolute path on the remote system

        Returns:
            Corresponding path in the local cache
        """
        rel = self._workspace_relative(remote_path)
        if rel is None:
            # Outside the workspace: no cache mapping, return as-is
            return Path(remote_path)
        return self.local_cache / rel
|