experimaestro 2.0.0b4__py3-none-any.whl → 2.0.0b17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of experimaestro might be problematic. Click here for more details.
- experimaestro/__init__.py +12 -5
- experimaestro/cli/__init__.py +393 -134
- experimaestro/cli/filter.py +48 -23
- experimaestro/cli/jobs.py +253 -71
- experimaestro/cli/refactor.py +1 -2
- experimaestro/commandline.py +7 -4
- experimaestro/connectors/__init__.py +9 -1
- experimaestro/connectors/local.py +43 -3
- experimaestro/core/arguments.py +18 -18
- experimaestro/core/identifier.py +11 -11
- experimaestro/core/objects/config.py +96 -39
- experimaestro/core/objects/config_walk.py +3 -3
- experimaestro/core/{subparameters.py → partial.py} +16 -16
- experimaestro/core/partial_lock.py +394 -0
- experimaestro/core/types.py +12 -15
- experimaestro/dynamic.py +290 -0
- experimaestro/experiments/__init__.py +6 -2
- experimaestro/experiments/cli.py +223 -52
- experimaestro/experiments/configuration.py +24 -0
- experimaestro/generators.py +5 -5
- experimaestro/ipc.py +118 -1
- experimaestro/launcherfinder/__init__.py +2 -2
- experimaestro/launcherfinder/registry.py +6 -7
- experimaestro/launcherfinder/specs.py +2 -9
- experimaestro/launchers/slurm/__init__.py +2 -2
- experimaestro/launchers/slurm/base.py +62 -0
- experimaestro/locking.py +957 -1
- experimaestro/notifications.py +89 -201
- experimaestro/progress.py +63 -366
- experimaestro/rpyc.py +0 -2
- experimaestro/run.py +29 -2
- experimaestro/scheduler/__init__.py +8 -1
- experimaestro/scheduler/base.py +650 -53
- experimaestro/scheduler/dependencies.py +20 -16
- experimaestro/scheduler/experiment.py +764 -169
- experimaestro/scheduler/interfaces.py +338 -96
- experimaestro/scheduler/jobs.py +58 -20
- experimaestro/scheduler/remote/__init__.py +31 -0
- experimaestro/scheduler/remote/adaptive_sync.py +265 -0
- experimaestro/scheduler/remote/client.py +928 -0
- experimaestro/scheduler/remote/protocol.py +282 -0
- experimaestro/scheduler/remote/server.py +447 -0
- experimaestro/scheduler/remote/sync.py +144 -0
- experimaestro/scheduler/services.py +186 -35
- experimaestro/scheduler/state_provider.py +811 -2157
- experimaestro/scheduler/state_status.py +1247 -0
- experimaestro/scheduler/transient.py +31 -0
- experimaestro/scheduler/workspace.py +1 -1
- experimaestro/scheduler/workspace_state_provider.py +1273 -0
- experimaestro/scriptbuilder.py +4 -4
- experimaestro/settings.py +36 -0
- experimaestro/tests/conftest.py +33 -5
- experimaestro/tests/connectors/bin/executable.py +1 -1
- experimaestro/tests/fixtures/pre_experiment/experiment_check_env.py +16 -0
- experimaestro/tests/fixtures/pre_experiment/experiment_check_mock.py +14 -0
- experimaestro/tests/fixtures/pre_experiment/experiment_simple.py +12 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_env.py +5 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_error.py +3 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_mock.py +8 -0
- experimaestro/tests/launchers/bin/test.py +1 -0
- experimaestro/tests/launchers/test_slurm.py +9 -9
- experimaestro/tests/partial_reschedule.py +46 -0
- experimaestro/tests/restart.py +3 -3
- experimaestro/tests/restart_main.py +1 -0
- experimaestro/tests/scripts/notifyandwait.py +1 -0
- experimaestro/tests/task_partial.py +38 -0
- experimaestro/tests/task_tokens.py +2 -2
- experimaestro/tests/tasks/test_dynamic.py +6 -6
- experimaestro/tests/test_dependencies.py +3 -3
- experimaestro/tests/test_deprecated.py +15 -15
- experimaestro/tests/test_dynamic_locking.py +317 -0
- experimaestro/tests/test_environment.py +24 -14
- experimaestro/tests/test_experiment.py +171 -36
- experimaestro/tests/test_identifier.py +25 -25
- experimaestro/tests/test_identifier_stability.py +3 -5
- experimaestro/tests/test_multitoken.py +2 -4
- experimaestro/tests/{test_subparameters.py → test_partial.py} +25 -25
- experimaestro/tests/test_partial_paths.py +81 -138
- experimaestro/tests/test_pre_experiment.py +219 -0
- experimaestro/tests/test_progress.py +2 -8
- experimaestro/tests/test_remote_state.py +1132 -0
- experimaestro/tests/test_stray_jobs.py +261 -0
- experimaestro/tests/test_tasks.py +1 -2
- experimaestro/tests/test_token_locking.py +52 -67
- experimaestro/tests/test_tokens.py +5 -6
- experimaestro/tests/test_transient.py +225 -0
- experimaestro/tests/test_workspace_state_provider.py +768 -0
- experimaestro/tests/token_reschedule.py +1 -3
- experimaestro/tests/utils.py +2 -7
- experimaestro/tokens.py +227 -372
- experimaestro/tools/diff.py +1 -0
- experimaestro/tools/documentation.py +4 -5
- experimaestro/tools/jobs.py +1 -2
- experimaestro/tui/app.py +459 -1895
- experimaestro/tui/app.tcss +162 -0
- experimaestro/tui/dialogs.py +172 -0
- experimaestro/tui/log_viewer.py +253 -3
- experimaestro/tui/messages.py +137 -0
- experimaestro/tui/utils.py +54 -0
- experimaestro/tui/widgets/__init__.py +23 -0
- experimaestro/tui/widgets/experiments.py +468 -0
- experimaestro/tui/widgets/global_services.py +238 -0
- experimaestro/tui/widgets/jobs.py +972 -0
- experimaestro/tui/widgets/log.py +156 -0
- experimaestro/tui/widgets/orphans.py +363 -0
- experimaestro/tui/widgets/runs.py +185 -0
- experimaestro/tui/widgets/services.py +314 -0
- experimaestro/tui/widgets/stray_jobs.py +528 -0
- experimaestro/utils/__init__.py +1 -1
- experimaestro/utils/environment.py +105 -22
- experimaestro/utils/fswatcher.py +124 -0
- experimaestro/utils/jobs.py +1 -2
- experimaestro/utils/jupyter.py +1 -2
- experimaestro/utils/logging.py +72 -0
- experimaestro/version.py +2 -2
- experimaestro/webui/__init__.py +9 -0
- experimaestro/webui/app.py +117 -0
- experimaestro/{server → webui}/data/index.css +66 -11
- experimaestro/webui/data/index.css.map +1 -0
- experimaestro/{server → webui}/data/index.js +82763 -87217
- experimaestro/webui/data/index.js.map +1 -0
- experimaestro/webui/routes/__init__.py +5 -0
- experimaestro/webui/routes/auth.py +53 -0
- experimaestro/webui/routes/proxy.py +117 -0
- experimaestro/webui/server.py +200 -0
- experimaestro/webui/state_bridge.py +152 -0
- experimaestro/webui/websocket.py +413 -0
- {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/METADATA +8 -9
- experimaestro-2.0.0b17.dist-info/RECORD +219 -0
- experimaestro/cli/progress.py +0 -269
- experimaestro/scheduler/state.py +0 -75
- experimaestro/scheduler/state_db.py +0 -388
- experimaestro/scheduler/state_sync.py +0 -834
- experimaestro/server/__init__.py +0 -467
- experimaestro/server/data/index.css.map +0 -1
- experimaestro/server/data/index.js.map +0 -1
- experimaestro/tests/test_cli_jobs.py +0 -615
- experimaestro/tests/test_file_progress.py +0 -425
- experimaestro/tests/test_file_progress_integration.py +0 -477
- experimaestro/tests/test_state_db.py +0 -434
- experimaestro-2.0.0b4.dist-info/RECORD +0 -181
- /experimaestro/{server → webui}/data/1815e00441357e01619e.ttf +0 -0
- /experimaestro/{server → webui}/data/2463b90d9a316e4e5294.woff2 +0 -0
- /experimaestro/{server → webui}/data/2582b0e4bcf85eceead0.ttf +0 -0
- /experimaestro/{server → webui}/data/89999bdf5d835c012025.woff2 +0 -0
- /experimaestro/{server → webui}/data/914997e1bdfc990d0897.ttf +0 -0
- /experimaestro/{server → webui}/data/c210719e60948b211a12.woff2 +0 -0
- /experimaestro/{server → webui}/data/favicon.ico +0 -0
- /experimaestro/{server → webui}/data/index.html +0 -0
- /experimaestro/{server → webui}/data/login.html +0 -0
- /experimaestro/{server → webui}/data/manifest.json +0 -0
- {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/WHEEL +0 -0
- {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/entry_points.txt +0 -0
- {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,928 @@
|
|
|
1
|
+
"""SSH State Provider Client
|
|
2
|
+
|
|
3
|
+
Client that connects via SSH to a remote SSHStateProviderServer and implements
|
|
4
|
+
the StateProvider-like interface for local TUI/web UI usage.
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
client = SSHStateProviderClient(host="server", remote_workspace="/path/to/workspace")
|
|
8
|
+
client.connect()
|
|
9
|
+
experiments = client.get_experiments()
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import atexit
|
|
13
|
+
import logging
|
|
14
|
+
import shutil
|
|
15
|
+
import subprocess
|
|
16
|
+
import tempfile
|
|
17
|
+
import threading
|
|
18
|
+
from concurrent.futures import Future, TimeoutError as FutureTimeoutError
|
|
19
|
+
from datetime import datetime
|
|
20
|
+
from importlib.metadata import version as get_package_version
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Set
|
|
23
|
+
|
|
24
|
+
from termcolor import colored
|
|
25
|
+
|
|
26
|
+
from experimaestro.scheduler.state_provider import (
|
|
27
|
+
OfflineStateProvider,
|
|
28
|
+
StateListener,
|
|
29
|
+
MockJob,
|
|
30
|
+
MockExperiment,
|
|
31
|
+
MockService,
|
|
32
|
+
)
|
|
33
|
+
from experimaestro.scheduler.state_status import EventBase
|
|
34
|
+
from experimaestro.scheduler.interfaces import (
|
|
35
|
+
BaseJob,
|
|
36
|
+
BaseExperiment,
|
|
37
|
+
BaseService,
|
|
38
|
+
)
|
|
39
|
+
from experimaestro.scheduler.remote.protocol import (
|
|
40
|
+
RPCMethod,
|
|
41
|
+
NotificationMethod,
|
|
42
|
+
RPCResponse,
|
|
43
|
+
RPCNotification,
|
|
44
|
+
parse_message,
|
|
45
|
+
create_request,
|
|
46
|
+
serialize_datetime,
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
# Type for SSH output callback
|
|
50
|
+
OutputCallback = Optional["Callable[[str], None]"]
|
|
51
|
+
|
|
52
|
+
if TYPE_CHECKING:
|
|
53
|
+
from experimaestro.scheduler.remote.sync import RemoteFileSynchronizer
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
logger = logging.getLogger("xpm.remote.client")
|
|
57
|
+
|
|
58
|
+
# Default timeout for RPC requests (seconds)
|
|
59
|
+
DEFAULT_TIMEOUT = 30.0
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _strip_dev_version(version: str) -> str:
|
|
63
|
+
"""Strip the .devN suffix from a version string.
|
|
64
|
+
|
|
65
|
+
Examples:
|
|
66
|
+
'2.0.0b3.dev8' -> '2.0.0b3'
|
|
67
|
+
'1.2.3.dev1' -> '1.2.3'
|
|
68
|
+
'1.2.3' -> '1.2.3' (unchanged)
|
|
69
|
+
"""
|
|
70
|
+
import re
|
|
71
|
+
|
|
72
|
+
return re.sub(r"\.dev\d+$", "", version)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class SSHStateProviderClient(OfflineStateProvider):
|
|
76
|
+
"""Client that connects to SSHStateProviderServer via SSH
|
|
77
|
+
|
|
78
|
+
This client implements the StateProvider interface for remote experiment
|
|
79
|
+
monitoring via SSH.
|
|
80
|
+
|
|
81
|
+
Features:
|
|
82
|
+
- JSON-RPC over SSH stdin/stdout
|
|
83
|
+
- Async request/response handling with futures
|
|
84
|
+
- Server push notifications converted to EventBases
|
|
85
|
+
- On-demand rsync for specific paths (used by services like TensorboardService)
|
|
86
|
+
"""
|
|
87
|
+
|
|
88
|
+
def __init__(
    self,
    host: str,
    remote_workspace: str,
    ssh_options: Optional[List[str]] = None,
    remote_xpm_path: Optional[str] = None,
    output_callback: Optional[Callable[[str], None]] = None,
):
    """Create a client for a remote workspace reachable over SSH.

    Args:
        host: SSH host (user@host or host)
        remote_workspace: Path to workspace on the remote host
        ssh_options: Additional SSH options (e.g., ["-p", "2222"])
        remote_xpm_path: Path to experimaestro executable on remote host.
            If None, uses 'uv tool run experimaestro==<version>'.
        output_callback: Callback for SSH process output (stderr).
            If None, a default callback prints with colored prefix.
            Set to False (or a no-op lambda) to disable output display.
    """
    # Base class initialization (sets up the service cache).
    super().__init__()

    # Connection parameters.
    self.host = host
    self.remote_workspace = remote_workspace
    self.ssh_options = ssh_options or []
    self.remote_xpm_path = remote_xpm_path
    self._output_callback = output_callback

    # Per-session temporary cache directory; populated by connect().
    self._temp_dir: Optional[str] = None
    self.local_cache_dir: Optional[Path] = None
    self.workspace_path: Optional[Path] = None  # For compatibility

    # SSH subprocess handles.
    self._process: Optional[subprocess.Popen] = None
    self._stdin = None
    self._stdout = None
    self._stderr = None

    # State-change listeners, guarded by a lock.
    self._listeners: Set[StateListener] = set()
    self._listener_lock = threading.Lock()

    # Pending RPC futures keyed by request id, guarded by a lock.
    self._response_handlers: Dict[int, Future] = {}
    self._response_lock = threading.Lock()
    self._request_id = 0

    # Worker threads and lifecycle flags.
    self._read_thread: Optional[threading.Thread] = None
    self._notify_thread: Optional[threading.Thread] = None
    self._stderr_thread: Optional[threading.Thread] = None
    self._running = False
    self._connected = False

    self._synchronizer: Optional["RemoteFileSynchronizer"] = None

    # Queue for throttled notification delivery (avoids flooding the UI).
    self._pending_events: List[EventBase] = []
    self._pending_events_lock = threading.Lock()
    self._notify_interval = 2.0  # Seconds between notification batches
|
|
146
|
+
|
|
147
|
+
def connect(self, timeout: float = 30.0):
    """Establish SSH connection and start the remote monitor server.

    Creates a per-session temporary cache directory, spawns ``ssh`` with the
    remote ``experiments ... monitor-server`` command, starts the reader /
    notifier / stderr worker threads, and performs an initial
    ``GET_SYNC_INFO`` request as a handshake.

    Args:
        timeout: Connection (handshake) timeout in seconds

    Raises:
        ConnectionError: if the SSH process cannot be started or the
            handshake request fails.
    """
    import shlex

    if self._connected:
        logger.warning("Already connected")
        return

    # Create session-specific temporary cache directory
    self._temp_dir = tempfile.mkdtemp(prefix="xpm_remote_")
    self.local_cache_dir = Path(self._temp_dir)
    self.workspace_path = self.local_cache_dir
    logger.debug("Created temporary cache directory: %s", self._temp_dir)

    # Register cleanup on exit (in case disconnect isn't called)
    atexit.register(self._cleanup_temp_dir)

    # Build SSH command
    cmd = ["ssh"]
    cmd.extend(self.ssh_options)
    cmd.append(self.host)

    # Quote the workspace path so paths containing spaces or shell
    # metacharacters survive the remote shell (fix: was interpolated raw).
    workspace = shlex.quote(self.remote_workspace)

    # Build remote command (workdir is passed to experiments group)
    if self.remote_xpm_path:
        # Use specified path to experimaestro
        remote_cmd = f"{self.remote_xpm_path} experiments --workdir {workspace} monitor-server"
    else:
        # Use uv tool run with version pinning
        try:
            xpm_version = get_package_version("experimaestro")
            # Strip .devN suffix for release compatibility
            xpm_version = _strip_dev_version(xpm_version)
        except Exception:
            xpm_version = None

        if xpm_version:
            remote_cmd = f"uv tool run experimaestro=={xpm_version} experiments --workdir {workspace} monitor-server"
        else:
            remote_cmd = f"uv tool run experimaestro experiments --workdir {workspace} monitor-server"
    cmd.append(remote_cmd)

    logger.info("Connecting to %s, workspace: %s", self.host, self.remote_workspace)
    logger.debug("SSH command: %s", " ".join(cmd))

    try:
        self._process = subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            bufsize=0,  # Unbuffered
        )
        self._stdin = self._process.stdin
        self._stdout = self._process.stdout
        self._stderr = self._process.stderr
    except Exception as e:
        logger.error("Failed to start SSH process: %s", e)
        # Chain the cause so the original failure stays in the traceback
        raise ConnectionError(f"Failed to connect to {self.host}: {e}") from e

    self._running = True

    # Start read thread for responses and notifications
    self._read_thread = threading.Thread(
        target=self._read_loop, daemon=True, name="SSHClient-Read"
    )
    self._read_thread.start()

    # Start notification thread for throttled event delivery
    self._notify_thread = threading.Thread(
        target=self._notify_loop, daemon=True, name="SSHClient-Notify"
    )
    self._notify_thread.start()

    # Start stderr thread to display SSH output
    self._stderr_thread = threading.Thread(
        target=self._stderr_loop, daemon=True, name="SSHClient-Stderr"
    )
    self._stderr_thread.start()

    # Wait for connection to be established by sending a test request
    try:
        sync_info = self._call_sync(RPCMethod.GET_SYNC_INFO, {}, timeout=timeout)
        logger.info(
            "Connected to remote workspace: %s", sync_info.get("workspace_path")
        )
    except Exception as e:
        self.disconnect()
        raise ConnectionError(f"Failed to establish connection: {e}") from e

    self._connected = True
|
|
239
|
+
|
|
240
|
+
def disconnect(self):
    """Tear down the connection to the remote server and release resources."""
    self._running = False
    self._connected = False

    # Closing stdin signals EOF so the remote server can exit cleanly.
    if self._stdin:
        try:
            self._stdin.close()
        except Exception:
            pass

    # Stop the SSH process; escalate to kill if it refuses to terminate.
    if self._process:
        try:
            self._process.terminate()
            self._process.wait(timeout=5.0)
        except Exception:
            try:
                self._process.kill()
            except Exception:
                pass

    # Give each worker thread a short grace period to exit.
    for worker in (self._read_thread, self._notify_thread, self._stderr_thread):
        if worker and worker.is_alive():
            worker.join(timeout=2.0)

    # Fail any in-flight RPC requests so callers are not left hanging.
    with self._response_lock:
        for pending in self._response_handlers.values():
            if not pending.done():
                pending.set_exception(ConnectionError("Disconnected"))
        self._response_handlers.clear()

    # Drop cached services (base class helper).
    self._clear_service_cache()

    # Remove the session's temporary cache directory.
    self._cleanup_temp_dir()

    logger.info("Disconnected from %s", self.host)
|
|
285
|
+
|
|
286
|
+
def _cleanup_temp_dir(self):
    """Remove the session's temporary cache directory (idempotent).

    Fix: the previous implementation only reset ``_temp_dir`` /
    ``local_cache_dir`` and unregistered the atexit hook when the directory
    still existed on disk; if it had already been removed externally the
    client kept stale state. State is now always reset once a temp dir had
    been assigned.
    """
    if self._temp_dir is None:
        return

    try:
        if Path(self._temp_dir).exists():
            shutil.rmtree(self._temp_dir)
            logger.debug("Cleaned up temporary cache directory: %s", self._temp_dir)
    except Exception as e:
        logger.warning("Failed to clean up temp dir %s: %s", self._temp_dir, e)
    finally:
        self._temp_dir = None
        self.local_cache_dir = None
        # Unregister the atexit handler; it may not be registered if
        # cleanup already ran once.
        try:
            atexit.unregister(self._cleanup_temp_dir)
        except Exception:
            pass
|
|
302
|
+
|
|
303
|
+
def close(self):
    """Compatibility alias (WorkspaceStateProvider API): same as disconnect()."""
    return self.disconnect()
|
|
306
|
+
|
|
307
|
+
def _read_loop(self):
    """Background thread: read responses and notifications from SSH stdout.

    Runs until EOF, disconnect(), or an unexpected error. If the stream
    ends while the client is still supposed to be running, the client is
    marked disconnected.
    """
    while self._running:
        try:
            line = self._stdout.readline()
            if not line:
                # EOF - connection closed
                logger.debug("SSH stdout closed")
                break

            # errors="replace" keeps a single invalid byte sequence from
            # raising UnicodeDecodeError and tearing down the whole
            # connection; a corrupted message simply fails to parse and
            # is logged by _process_message.
            line_str = line.decode("utf-8", errors="replace").strip()
            if not line_str:
                continue

            self._process_message(line_str)

        except Exception as e:
            if self._running:
                logger.exception("Error in read loop: %s", e)
            break

    # Connection lost
    if self._running:
        logger.warning("Connection to %s lost", self.host)
        self._connected = False
|
|
332
|
+
|
|
333
|
+
def _stderr_loop(self):
    """Background thread: forward SSH stderr lines to the output callback.

    When no callback was configured, each line is printed with a colored
    "[SSH] " prefix.
    """
    while self._running:
        try:
            line = self._stderr.readline()
            if not line:
                # EOF - stderr closed
                logger.debug("SSH stderr closed")
                break

            # errors="replace": stderr carries human-readable output, so
            # lossy decoding is preferable to aborting the reader thread
            # on an invalid byte sequence.
            line_str = line.decode("utf-8", errors="replace").rstrip("\n\r")
            if not line_str:
                continue

            # Call output callback or use default
            if self._output_callback is not None:
                self._output_callback(line_str)
            else:
                # Default: print with colored prefix
                prefix = colored("[SSH] ", "cyan", attrs=["bold"])
                print(f"{prefix}{line_str}")  # noqa: T201

        except Exception as e:
            if self._running:
                logger.debug("Error reading stderr: %s", e)
            break
|
|
359
|
+
|
|
360
|
+
def _process_message(self, line: str):
    """Parse one raw server line and dispatch it to the proper handler."""
    try:
        message = parse_message(line)
    except ValueError as e:
        logger.warning("Failed to parse message: %s", e)
        return

    if isinstance(message, RPCResponse):
        self._handle_response(message)
    elif isinstance(message, RPCNotification):
        self._handle_notification(message)
    else:
        logger.debug("Unexpected message type: %s", type(message).__name__)
|
|
374
|
+
|
|
375
|
+
def _handle_response(self, response: RPCResponse):
    """Resolve the pending Future registered for *response*'s request id."""
    with self._response_lock:
        pending = self._response_handlers.pop(response.id, None)

    if pending is None:
        logger.warning("Received response for unknown request ID: %s", response.id)
        return

    if response.error:
        pending.set_exception(
            RuntimeError(
                f"RPC error {response.error.code}: {response.error.message}"
            )
        )
    else:
        pending.set_result(response.result)
|
|
392
|
+
|
|
393
|
+
def _handle_notification(self, notification: RPCNotification):
    """Queue a server notification for throttled delivery to listeners.

    Shutdown notifications additionally mark the client disconnected
    immediately instead of waiting for the next notification batch.
    """
    method = notification.method
    params = notification.params

    logger.debug("Received notification: %s", method)

    # Convert to an EventBase and queue it; delivery happens in
    # _notify_loop to avoid flooding the UI.
    converted = self._notification_to_event(method, params)
    if converted:
        with self._pending_events_lock:
            self._pending_events.append(converted)

    # Shutdown is acted upon right away.
    if method == NotificationMethod.SHUTDOWN.value:
        reason = params.get("reason", "unknown")
        logger.info("Server shutdown: %s", reason)
        self._connected = False
|
|
414
|
+
|
|
415
|
+
def _notify_loop(self):
    """Background thread: flush queued events to listeners periodically.

    Batching plus per-type deduplication throttles delivery so rapid
    state changes do not flood the UI with redundant refreshes.
    """
    import time

    while self._running:
        time.sleep(self._notify_interval)

        if not self._running:
            break

        # Drain the queue atomically.
        with self._pending_events_lock:
            if not self._pending_events:
                continue
            batch = list(self._pending_events)
            self._pending_events.clear()

        # Keep only the newest event of each type, ordered by where each
        # type's latest occurrence appeared in the batch (move-to-end on
        # repeat, so insertion order tracks the last occurrence).
        latest = {}
        for item in batch:
            latest.pop(type(item), None)
            latest[type(item)] = item

        # Deliver the deduplicated events.
        for item in latest.values():
            self._notify_listeners(item)
|
|
450
|
+
|
|
451
|
+
def _notification_to_event(self, method: str, params: Dict) -> Optional[EventBase]:
    """Build an EventBase from a STATE_EVENT notification, or return None."""
    if method != NotificationMethod.STATE_EVENT.value:
        # SHUTDOWN and FILE_CHANGED are control notifications handled
        # elsewhere; anything else is unexpected and worth a warning.
        control_methods = (
            NotificationMethod.SHUTDOWN.value,
            NotificationMethod.FILE_CHANGED.value,
        )
        if method not in control_methods:
            logger.warning("Unhandled notification method: %s", method)
        return None

    event_type = params.get("event_type")
    event_class = EventBase.get_class(event_type)
    if event_class is None:
        logger.warning("Unknown event type: %s", event_type)
        return None

    try:
        return event_class(**params.get("data", {}))
    except TypeError as e:
        logger.warning("Error deserializing event %s: %s", event_type, e)
        return None
|
|
474
|
+
|
|
475
|
+
def _notify_listeners(self, event: EventBase):
    """Invoke every registered listener with *event*, isolating failures."""
    # Snapshot under the lock so listeners may (un)register during delivery.
    with self._listener_lock:
        snapshot = list(self._listeners)

    for callback in snapshot:
        try:
            callback(event)
        except Exception as e:
            logger.exception("Error in listener: %s", e)
|
|
485
|
+
|
|
486
|
+
def _call(self, method: RPCMethod, params: Dict) -> Future:
    """Send an RPC request and return a Future for the response.

    Args:
        method: RPC method to call
        params: Method parameters

    Returns:
        Future that resolves to the response result (or an exception on
        write failure / disconnect).
    """
    if not self._running:
        failed = Future()
        failed.set_exception(ConnectionError("Not connected"))
        return failed

    # Allocate a request id and register the pending future under the lock.
    with self._response_lock:
        self._request_id += 1
        request_id = self._request_id
        pending = Future()
        self._response_handlers[request_id] = pending

    payload = create_request(method, params, request_id) + "\n"
    try:
        self._stdin.write(payload.encode("utf-8"))
        self._stdin.flush()
    except Exception as e:
        # Write failed: withdraw the handler and fail the future directly.
        with self._response_lock:
            self._response_handlers.pop(request_id, None)
        pending.set_exception(e)

    return pending
|
|
517
|
+
|
|
518
|
+
def _call_sync(
    self, method: RPCMethod, params: Dict, timeout: float = DEFAULT_TIMEOUT
):
    """Send an RPC request and block until the response arrives.

    Args:
        method: RPC method to call
        params: Method parameters
        timeout: Maximum time to wait, in seconds

    Returns:
        The response result.

    Raises:
        TimeoutError: If no response arrives within *timeout* seconds
        RuntimeError: If the RPC call returns an error
    """
    try:
        return self._call(method, params).result(timeout=timeout)
    except FutureTimeoutError:
        raise TimeoutError(f"Request {method.value} timed out after {timeout}s")
|
|
540
|
+
|
|
541
|
+
# -------------------------------------------------------------------------
|
|
542
|
+
# StateProvider-like Interface
|
|
543
|
+
# -------------------------------------------------------------------------
|
|
544
|
+
|
|
545
|
+
def add_listener(self, listener: StateListener):
    """Subscribe *listener* to state change events (idempotent: set-based)."""
    with self._listener_lock:
        self._listeners.add(listener)
|
|
549
|
+
|
|
550
|
+
def remove_listener(self, listener: StateListener):
    """Unsubscribe *listener*; unknown listeners are ignored silently."""
    with self._listener_lock:
        self._listeners.discard(listener)
|
|
554
|
+
|
|
555
|
+
def get_experiments(self, since: Optional[datetime] = None) -> List[BaseExperiment]:
    """Return all experiments known to the remote workspace."""
    raw = self._call_sync(
        RPCMethod.GET_EXPERIMENTS, {"since": serialize_datetime(since)}
    )
    return [self._dict_to_experiment(entry) for entry in raw]
|
|
560
|
+
|
|
561
|
+
def get_experiment(self, experiment_id: str) -> Optional[BaseExperiment]:
    """Return the experiment with *experiment_id*, or None if unknown."""
    raw = self._call_sync(
        RPCMethod.GET_EXPERIMENT, {"experiment_id": experiment_id}
    )
    return None if raw is None else self._dict_to_experiment(raw)
|
|
568
|
+
|
|
569
|
+
def get_experiment_runs(self, experiment_id: str) -> List[Dict]:
    """Return every recorded run for the given experiment."""
    return self._call_sync(
        RPCMethod.GET_EXPERIMENT_RUNS, {"experiment_id": experiment_id}
    )
|
|
573
|
+
|
|
574
|
+
def get_current_run(self, experiment_id: str) -> Optional[str]:
    """Return the current run ID of an experiment, or None if not found."""
    experiment = self.get_experiment(experiment_id)
    return experiment.run_id if experiment is not None else None
|
|
580
|
+
|
|
581
|
+
def get_jobs(
    self,
    experiment_id: Optional[str] = None,
    run_id: Optional[str] = None,
    task_id: Optional[str] = None,
    state: Optional[str] = None,
    tags: Optional[Dict[str, str]] = None,
    since: Optional[datetime] = None,
) -> List[BaseJob]:
    """Query jobs on the remote workspace, applying any given filters."""
    query = dict(
        experiment_id=experiment_id,
        run_id=run_id,
        task_id=task_id,
        state=state,
        tags=tags,
        since=serialize_datetime(since),
    )
    raw = self._call_sync(RPCMethod.GET_JOBS, query)
    return [self._dict_to_job(entry) for entry in raw]
|
|
601
|
+
|
|
602
|
+
def get_job(
    self, job_id: str, experiment_id: str, run_id: Optional[str] = None
) -> Optional[BaseJob]:
    """Fetch one job by id, or None when the server does not know it."""
    raw = self._call_sync(
        RPCMethod.GET_JOB,
        {"job_id": job_id, "experiment_id": experiment_id, "run_id": run_id},
    )
    return None if raw is None else self._dict_to_job(raw)
|
|
615
|
+
|
|
616
|
+
def get_all_jobs(
    self,
    state: Optional[str] = None,
    tags: Optional[Dict[str, str]] = None,
    since: Optional[datetime] = None,
) -> List[BaseJob]:
    """Get all jobs across all experiments.

    Args:
        state: Restrict results to jobs in the given state.
        tags: Restrict results to jobs carrying these tag key/values.
        since: Only jobs submitted after this time.

    Returns:
        All matching jobs as local job objects. Always a list — a
        ``None`` RPC result yields ``[]``.
    """
    params = {
        "state": state,
        "tags": tags,
        "since": serialize_datetime(since),
    }
    result = self._call_sync(RPCMethod.GET_ALL_JOBS, params)
    # Guard against a None payload so the List return type holds.
    return [self._dict_to_job(d) for d in (result or [])]
|
|
630
|
+
|
|
631
|
+
def get_tags_map(
    self,
    experiment_id: str,
    run_id: Optional[str] = None,
) -> Dict[str, Dict[str, str]]:
    """Return the tags map for jobs in an experiment/run.

    Args:
        experiment_id: Experiment to query.
        run_id: Optional run to restrict to.

    Returns:
        Mapping of job identifiers to their tag dictionaries; empty
        when the server returns nothing.
    """
    request = {"experiment_id": experiment_id, "run_id": run_id}
    response = self._call_sync(RPCMethod.GET_TAGS_MAP, request)
    if not response:
        return {}
    return response
|
|
643
|
+
|
|
644
|
+
def get_dependencies_map(
    self,
    experiment_id: str,
    run_id: Optional[str] = None,
) -> dict[str, list[str]]:
    """Return the dependencies map for jobs in an experiment/run.

    Args:
        experiment_id: Experiment to query.
        run_id: Optional run to restrict to.

    Returns:
        Mapping of job identifiers to their dependency lists; empty
        when the server returns nothing.
    """
    request = {"experiment_id": experiment_id, "run_id": run_id}
    response = self._call_sync(RPCMethod.GET_DEPENDENCIES_MAP, request)
    if not response:
        return {}
    return response
|
|
656
|
+
|
|
657
|
+
def _fetch_services_from_storage(
    self, experiment_id: Optional[str], run_id: Optional[str]
) -> List[BaseService]:
    """Fetch services from the remote server.

    Called by the base class ``get_services`` when the cache is empty.

    Args:
        experiment_id: Optional experiment filter.
        run_id: Optional run filter.

    Returns:
        The reconstructed service objects. Always a list — a ``None``
        RPC result yields ``[]``.
    """
    params = {
        "experiment_id": experiment_id,
        "run_id": run_id,
    }
    result = self._call_sync(RPCMethod.GET_SERVICES, params)
    # Guard against a None payload (the original loop would raise
    # TypeError) and build the list in one pass.
    return [self._dict_to_service(d) for d in (result or [])]
|
|
676
|
+
|
|
677
|
+
def kill_job(self, job: BaseJob, perform: bool = False) -> bool:
    """Kill a running job.

    Args:
        job: The job to kill.
        perform: When False (default), this is a dry run that only
            reports whether the job is currently running.

    Returns:
        True when the kill succeeded (or, in dry-run mode, when the
        job is running).
    """
    if not perform:
        # Dry run: no server round-trip, just inspect the job state.
        return job.state.running()

    request = {
        "job_id": job.identifier,
        "experiment_id": getattr(job, "experiment_id", ""),
        "run_id": getattr(job, "run_id", ""),
    }
    response = self._call_sync(RPCMethod.KILL_JOB, request)
    return response.get("success", False)
|
|
690
|
+
|
|
691
|
+
def clean_job(self, job: BaseJob, perform: bool = False) -> bool:
    """Clean a finished job.

    Args:
        job: The job to clean.
        perform: When False (default), this is a dry run that only
            reports whether the job has finished.

    Returns:
        True when the cleanup succeeded (or, in dry-run mode, when the
        job is finished).
    """
    if not perform:
        # Dry run: no server round-trip, just inspect the job state.
        return job.state.finished()

    request = {
        "job_id": job.identifier,
        "experiment_id": getattr(job, "experiment_id", ""),
        "run_id": getattr(job, "run_id", ""),
    }
    response = self._call_sync(RPCMethod.CLEAN_JOB, request)
    return response.get("success", False)
|
|
704
|
+
|
|
705
|
+
def get_process_info(self, job: BaseJob):
    """Get process information for a job.

    Args:
        job: The job whose process should be described.

    Returns:
        A ``ProcessInfo``, or None when the job has no process info or
        the remote server doesn't support this method.
    """
    from experimaestro.scheduler.state_provider import ProcessInfo

    request = {
        "job_id": job.identifier,
        "experiment_id": getattr(job, "experiment_id", ""),
        "run_id": getattr(job, "run_id", ""),
    }

    try:
        payload = self._call_sync(RPCMethod.GET_PROCESS_INFO, request)
    except RuntimeError:
        # Older servers do not implement this RPC method.
        return None

    if payload is None:
        return None

    return ProcessInfo(
        pid=payload["pid"],
        type=payload["type"],
        running=payload.get("running", False),
    )
|
|
732
|
+
|
|
733
|
+
# -------------------------------------------------------------------------
|
|
734
|
+
# Data Conversion
|
|
735
|
+
# -------------------------------------------------------------------------
|
|
736
|
+
|
|
737
|
+
def _dict_to_job(self, d: Dict) -> MockJob:
    """Build a MockJob from a server dictionary via ``from_state_dict``.

    Note: mutates *d* in place (path translation and time normalization)
    before handing it to ``MockJob.from_state_dict``.
    """
    # Map the remote job path into the local cache when possible.
    raw_path = d.get("path")
    if raw_path:
        if raw_path.startswith(self.remote_workspace):
            relative = raw_path[len(self.remote_workspace) :].lstrip("/")
            d["path"] = self.local_cache_dir / relative
        else:
            d["path"] = Path(raw_path)

    # Normalize ISO datetime strings into float Unix timestamps.
    for field in ("submitted_time", "started_time", "ended_time"):
        d[field] = self._parse_datetime_to_timestamp(d.get(field))

    return MockJob.from_state_dict(d, self.local_cache_dir)
|
|
753
|
+
|
|
754
|
+
def _dict_to_experiment(self, d: Dict) -> MockExperiment:
    """Build a MockExperiment from a server dictionary via ``from_state_dict``.

    Note: mutates *d* in place (workdir translation and time
    normalization) before handing it to ``MockExperiment.from_state_dict``.
    """
    # Map the remote working directory into the local cache when possible.
    raw_workdir = d.get("workdir")
    if raw_workdir:
        if raw_workdir.startswith(self.remote_workspace):
            relative = raw_workdir[len(self.remote_workspace) :].lstrip("/")
            d["workdir"] = self.local_cache_dir / relative
        else:
            d["workdir"] = Path(raw_workdir)

    # Normalize ISO datetime strings into float Unix timestamps.
    for field in ("started_at", "ended_at"):
        d[field] = self._parse_datetime_to_timestamp(d.get(field))

    return MockExperiment.from_state_dict(d, self.local_cache_dir)
|
|
770
|
+
|
|
771
|
+
def _dict_to_service(self, d: Dict) -> BaseService:
    """Convert a dictionary to a Service or MockService.

    Tries to recreate the actual Service from ``state_dict`` first.
    Falls back to a MockService carrying an explanatory description when
    the service cannot be recreated (unserializable state, missing
    module, or any other reconstruction failure).
    """
    state_dict = d.get("state_dict", {})
    service_class = d.get("class", "")
    service_id = d.get("service_id", "")

    def _fallback(description_text: str) -> MockService:
        # Single construction point for all failure paths — the original
        # code repeated these keyword arguments three times.
        return MockService(
            service_id=service_id,
            description_text=description_text,
            state_dict_data=state_dict,
            service_class=service_class,
            experiment_id=d.get("experiment_id"),
            run_id=d.get("run_id"),
            url=d.get("url"),
        )

    # Check for unserializable marker set by the server-side serializer
    if state_dict.get("__unserializable__"):
        reason = state_dict.get("__reason__", "Service cannot be recreated")
        return _fallback(f"[{reason}]")

    # Try to recreate actual Service from state_dict
    if service_class:
        try:
            from experimaestro.scheduler.services import Service

            # Create path translator that syncs and translates paths
            def path_translator(remote_path: str) -> Path:
                """Translate remote path to local, syncing if needed"""
                local_path = self.sync_path(remote_path)
                if local_path:
                    return local_path
                # Fallback: map to local cache without sync
                if remote_path.startswith(self.remote_workspace):
                    relative = remote_path[len(self.remote_workspace) :].lstrip("/")
                    return self.local_cache_dir / relative
                return Path(remote_path)

            service = Service.from_state_dict(
                service_class, state_dict, path_translator
            )
            service.id = service_id
            # Copy additional attributes
            if d.get("experiment_id"):
                service.experiment_id = d["experiment_id"]
            if d.get("run_id"):
                service.run_id = d["run_id"]
            return service
        except ModuleNotFoundError as e:
            # Module not available locally - show error in description
            missing_module = str(e).replace("No module named ", "").strip("'\"")
            return _fallback(f"[Missing module: {missing_module}]")
        except Exception as e:
            # Other error - show in description
            return _fallback(f"[Error: {e}]")

    # No class - use MockService.from_full_state_dict
    return MockService.from_full_state_dict(d)
|
|
847
|
+
|
|
848
|
+
def _parse_datetime_to_timestamp(self, value) -> Optional[float]:
|
|
849
|
+
"""Convert datetime value to Unix timestamp
|
|
850
|
+
|
|
851
|
+
Handles: None, ISO string, float timestamp, datetime object
|
|
852
|
+
"""
|
|
853
|
+
if value is None:
|
|
854
|
+
return None
|
|
855
|
+
if isinstance(value, (int, float)):
|
|
856
|
+
return float(value)
|
|
857
|
+
if isinstance(value, str):
|
|
858
|
+
try:
|
|
859
|
+
dt = datetime.fromisoformat(value)
|
|
860
|
+
return dt.timestamp()
|
|
861
|
+
except ValueError:
|
|
862
|
+
return None
|
|
863
|
+
if isinstance(value, datetime):
|
|
864
|
+
return value.timestamp()
|
|
865
|
+
return None
|
|
866
|
+
|
|
867
|
+
# -------------------------------------------------------------------------
|
|
868
|
+
# File Synchronization
|
|
869
|
+
# -------------------------------------------------------------------------
|
|
870
|
+
|
|
871
|
+
def sync_path(self, path: str) -> Optional[Path]:
    """Sync a specific path from remote on-demand.

    Used by services (e.g., TensorboardService) that need access to
    specific remote directories.

    Args:
        path: Can be:
            - Remote absolute path (e.g., /remote/workspace/jobs/xxx)
            - Local cache path (e.g., /tmp/xpm_remote_xxx/jobs/xxx)
            - Relative path within workspace (e.g., jobs/xxx)

    Returns:
        Local path where the files were synced to, or None if sync failed
    """
    if not self._connected or not self.local_cache_dir:
        logger.warning("Cannot sync: not connected")
        return None

    # Normalize the input into a remote absolute path.
    cache_prefix = str(self.local_cache_dir)
    if path.startswith(cache_prefix):
        # Local cache path: recover the workspace-relative part first.
        relative = path[len(cache_prefix) :].lstrip("/")
        remote_path = f"{self.remote_workspace}/{relative}"
    elif path.startswith(self.remote_workspace):
        # Already an absolute remote path.
        remote_path = path
    else:
        # Anything else is treated as workspace-relative.
        remote_path = f"{self.remote_workspace}/{path.lstrip('/')}"

    from experimaestro.scheduler.remote.sync import RemoteFileSynchronizer

    # Build the synchronizer on first use and reuse it afterwards.
    if self._synchronizer is None:
        self._synchronizer = RemoteFileSynchronizer(
            host=self.host,
            remote_workspace=Path(self.remote_workspace),
            local_cache=self.local_cache_dir,
            ssh_options=self.ssh_options,
        )

    try:
        return self._synchronizer.sync_path(remote_path)
    except Exception as e:
        # Best-effort: a failed sync is reported, not raised.
        logger.warning("Failed to sync path %s: %s", remote_path, e)
        return None
|
|
919
|
+
|
|
920
|
+
@property
def read_only(self) -> bool:
    """This client never mutates server state — always read-only."""
    return True
|
|
924
|
+
|
|
925
|
+
@property
def is_remote(self) -> bool:
    """This provider talks to a remote server — always remote."""
    return True
|