experimaestro 2.0.0b8__py3-none-any.whl → 2.0.0b17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of experimaestro might be problematic. Click here for more details.
- experimaestro/__init__.py +12 -5
- experimaestro/cli/__init__.py +239 -126
- experimaestro/cli/filter.py +48 -23
- experimaestro/cli/jobs.py +253 -71
- experimaestro/cli/refactor.py +1 -2
- experimaestro/commandline.py +7 -4
- experimaestro/connectors/__init__.py +9 -1
- experimaestro/connectors/local.py +43 -3
- experimaestro/core/arguments.py +18 -18
- experimaestro/core/identifier.py +11 -11
- experimaestro/core/objects/config.py +96 -39
- experimaestro/core/objects/config_walk.py +3 -3
- experimaestro/core/{subparameters.py → partial.py} +16 -16
- experimaestro/core/partial_lock.py +394 -0
- experimaestro/core/types.py +12 -15
- experimaestro/dynamic.py +290 -0
- experimaestro/experiments/__init__.py +6 -2
- experimaestro/experiments/cli.py +217 -50
- experimaestro/experiments/configuration.py +24 -0
- experimaestro/generators.py +5 -5
- experimaestro/ipc.py +118 -1
- experimaestro/launcherfinder/__init__.py +2 -2
- experimaestro/launcherfinder/registry.py +6 -7
- experimaestro/launcherfinder/specs.py +2 -9
- experimaestro/launchers/slurm/__init__.py +2 -2
- experimaestro/launchers/slurm/base.py +62 -0
- experimaestro/locking.py +957 -1
- experimaestro/notifications.py +89 -201
- experimaestro/progress.py +63 -366
- experimaestro/rpyc.py +0 -2
- experimaestro/run.py +29 -2
- experimaestro/scheduler/__init__.py +8 -1
- experimaestro/scheduler/base.py +629 -53
- experimaestro/scheduler/dependencies.py +20 -16
- experimaestro/scheduler/experiment.py +732 -167
- experimaestro/scheduler/interfaces.py +316 -101
- experimaestro/scheduler/jobs.py +58 -20
- experimaestro/scheduler/remote/adaptive_sync.py +265 -0
- experimaestro/scheduler/remote/client.py +171 -117
- experimaestro/scheduler/remote/protocol.py +8 -193
- experimaestro/scheduler/remote/server.py +95 -71
- experimaestro/scheduler/services.py +53 -28
- experimaestro/scheduler/state_provider.py +663 -2430
- experimaestro/scheduler/state_status.py +1247 -0
- experimaestro/scheduler/transient.py +31 -0
- experimaestro/scheduler/workspace.py +1 -1
- experimaestro/scheduler/workspace_state_provider.py +1273 -0
- experimaestro/scriptbuilder.py +4 -4
- experimaestro/settings.py +36 -0
- experimaestro/tests/conftest.py +33 -5
- experimaestro/tests/connectors/bin/executable.py +1 -1
- experimaestro/tests/fixtures/pre_experiment/experiment_check_env.py +16 -0
- experimaestro/tests/fixtures/pre_experiment/experiment_check_mock.py +14 -0
- experimaestro/tests/fixtures/pre_experiment/experiment_simple.py +12 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_env.py +5 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_error.py +3 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_mock.py +8 -0
- experimaestro/tests/launchers/bin/test.py +1 -0
- experimaestro/tests/launchers/test_slurm.py +9 -9
- experimaestro/tests/partial_reschedule.py +46 -0
- experimaestro/tests/restart.py +3 -3
- experimaestro/tests/restart_main.py +1 -0
- experimaestro/tests/scripts/notifyandwait.py +1 -0
- experimaestro/tests/task_partial.py +38 -0
- experimaestro/tests/task_tokens.py +2 -2
- experimaestro/tests/tasks/test_dynamic.py +6 -6
- experimaestro/tests/test_dependencies.py +3 -3
- experimaestro/tests/test_deprecated.py +15 -15
- experimaestro/tests/test_dynamic_locking.py +317 -0
- experimaestro/tests/test_environment.py +24 -14
- experimaestro/tests/test_experiment.py +171 -36
- experimaestro/tests/test_identifier.py +25 -25
- experimaestro/tests/test_identifier_stability.py +3 -5
- experimaestro/tests/test_multitoken.py +2 -4
- experimaestro/tests/{test_subparameters.py → test_partial.py} +25 -25
- experimaestro/tests/test_partial_paths.py +81 -138
- experimaestro/tests/test_pre_experiment.py +219 -0
- experimaestro/tests/test_progress.py +2 -8
- experimaestro/tests/test_remote_state.py +560 -99
- experimaestro/tests/test_stray_jobs.py +261 -0
- experimaestro/tests/test_tasks.py +1 -2
- experimaestro/tests/test_token_locking.py +52 -67
- experimaestro/tests/test_tokens.py +5 -6
- experimaestro/tests/test_transient.py +225 -0
- experimaestro/tests/test_workspace_state_provider.py +768 -0
- experimaestro/tests/token_reschedule.py +1 -3
- experimaestro/tests/utils.py +2 -7
- experimaestro/tokens.py +227 -372
- experimaestro/tools/diff.py +1 -0
- experimaestro/tools/documentation.py +4 -5
- experimaestro/tools/jobs.py +1 -2
- experimaestro/tui/app.py +438 -1966
- experimaestro/tui/app.tcss +162 -0
- experimaestro/tui/dialogs.py +172 -0
- experimaestro/tui/log_viewer.py +253 -3
- experimaestro/tui/messages.py +137 -0
- experimaestro/tui/utils.py +54 -0
- experimaestro/tui/widgets/__init__.py +23 -0
- experimaestro/tui/widgets/experiments.py +468 -0
- experimaestro/tui/widgets/global_services.py +238 -0
- experimaestro/tui/widgets/jobs.py +972 -0
- experimaestro/tui/widgets/log.py +156 -0
- experimaestro/tui/widgets/orphans.py +363 -0
- experimaestro/tui/widgets/runs.py +185 -0
- experimaestro/tui/widgets/services.py +314 -0
- experimaestro/tui/widgets/stray_jobs.py +528 -0
- experimaestro/utils/__init__.py +1 -1
- experimaestro/utils/environment.py +105 -22
- experimaestro/utils/fswatcher.py +124 -0
- experimaestro/utils/jobs.py +1 -2
- experimaestro/utils/jupyter.py +1 -2
- experimaestro/utils/logging.py +72 -0
- experimaestro/version.py +2 -2
- experimaestro/webui/__init__.py +9 -0
- experimaestro/webui/app.py +117 -0
- experimaestro/{server → webui}/data/index.css +66 -11
- experimaestro/webui/data/index.css.map +1 -0
- experimaestro/{server → webui}/data/index.js +82763 -87217
- experimaestro/webui/data/index.js.map +1 -0
- experimaestro/webui/routes/__init__.py +5 -0
- experimaestro/webui/routes/auth.py +53 -0
- experimaestro/webui/routes/proxy.py +117 -0
- experimaestro/webui/server.py +200 -0
- experimaestro/webui/state_bridge.py +152 -0
- experimaestro/webui/websocket.py +413 -0
- {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/METADATA +5 -6
- experimaestro-2.0.0b17.dist-info/RECORD +219 -0
- experimaestro/cli/progress.py +0 -269
- experimaestro/scheduler/state.py +0 -75
- experimaestro/scheduler/state_db.py +0 -437
- experimaestro/scheduler/state_sync.py +0 -891
- experimaestro/server/__init__.py +0 -467
- experimaestro/server/data/index.css.map +0 -1
- experimaestro/server/data/index.js.map +0 -1
- experimaestro/tests/test_cli_jobs.py +0 -615
- experimaestro/tests/test_file_progress.py +0 -425
- experimaestro/tests/test_file_progress_integration.py +0 -477
- experimaestro/tests/test_state_db.py +0 -434
- experimaestro-2.0.0b8.dist-info/RECORD +0 -187
- /experimaestro/{server → webui}/data/1815e00441357e01619e.ttf +0 -0
- /experimaestro/{server → webui}/data/2463b90d9a316e4e5294.woff2 +0 -0
- /experimaestro/{server → webui}/data/2582b0e4bcf85eceead0.ttf +0 -0
- /experimaestro/{server → webui}/data/89999bdf5d835c012025.woff2 +0 -0
- /experimaestro/{server → webui}/data/914997e1bdfc990d0897.ttf +0 -0
- /experimaestro/{server → webui}/data/c210719e60948b211a12.woff2 +0 -0
- /experimaestro/{server → webui}/data/favicon.ico +0 -0
- /experimaestro/{server → webui}/data/index.html +0 -0
- /experimaestro/{server → webui}/data/login.html +0 -0
- /experimaestro/{server → webui}/data/manifest.json +0 -0
- {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/WHEEL +0 -0
- {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/entry_points.txt +0 -0
- {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/licenses/LICENSE +0 -0
|
@@ -19,17 +19,18 @@ from concurrent.futures import Future, TimeoutError as FutureTimeoutError
|
|
|
19
19
|
from datetime import datetime
|
|
20
20
|
from importlib.metadata import version as get_package_version
|
|
21
21
|
from pathlib import Path
|
|
22
|
-
from typing import TYPE_CHECKING, Dict, List, Optional, Set
|
|
22
|
+
from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Set
|
|
23
|
+
|
|
24
|
+
from termcolor import colored
|
|
23
25
|
|
|
24
26
|
from experimaestro.scheduler.state_provider import (
|
|
25
|
-
|
|
26
|
-
StateEvent,
|
|
27
|
-
StateEventType,
|
|
27
|
+
OfflineStateProvider,
|
|
28
28
|
StateListener,
|
|
29
29
|
MockJob,
|
|
30
30
|
MockExperiment,
|
|
31
31
|
MockService,
|
|
32
32
|
)
|
|
33
|
+
from experimaestro.scheduler.state_status import EventBase
|
|
33
34
|
from experimaestro.scheduler.interfaces import (
|
|
34
35
|
BaseJob,
|
|
35
36
|
BaseExperiment,
|
|
@@ -45,6 +46,9 @@ from experimaestro.scheduler.remote.protocol import (
|
|
|
45
46
|
serialize_datetime,
|
|
46
47
|
)
|
|
47
48
|
|
|
49
|
+
# Type for SSH output callback
|
|
50
|
+
OutputCallback = Optional["Callable[[str], None]"]
|
|
51
|
+
|
|
48
52
|
if TYPE_CHECKING:
|
|
49
53
|
from experimaestro.scheduler.remote.sync import RemoteFileSynchronizer
|
|
50
54
|
|
|
@@ -68,7 +72,7 @@ def _strip_dev_version(version: str) -> str:
|
|
|
68
72
|
return re.sub(r"\.dev\d+$", "", version)
|
|
69
73
|
|
|
70
74
|
|
|
71
|
-
class SSHStateProviderClient(
|
|
75
|
+
class SSHStateProviderClient(OfflineStateProvider):
|
|
72
76
|
"""Client that connects to SSHStateProviderServer via SSH
|
|
73
77
|
|
|
74
78
|
This client implements the StateProvider interface for remote experiment
|
|
@@ -77,7 +81,7 @@ class SSHStateProviderClient(StateProvider):
|
|
|
77
81
|
Features:
|
|
78
82
|
- JSON-RPC over SSH stdin/stdout
|
|
79
83
|
- Async request/response handling with futures
|
|
80
|
-
- Server push notifications converted to
|
|
84
|
+
- Server push notifications converted to EventBases
|
|
81
85
|
- On-demand rsync for specific paths (used by services like TensorboardService)
|
|
82
86
|
"""
|
|
83
87
|
|
|
@@ -87,6 +91,7 @@ class SSHStateProviderClient(StateProvider):
|
|
|
87
91
|
remote_workspace: str,
|
|
88
92
|
ssh_options: Optional[List[str]] = None,
|
|
89
93
|
remote_xpm_path: Optional[str] = None,
|
|
94
|
+
output_callback: Optional[Callable[[str], None]] = None,
|
|
90
95
|
):
|
|
91
96
|
"""Initialize the client
|
|
92
97
|
|
|
@@ -96,11 +101,18 @@ class SSHStateProviderClient(StateProvider):
|
|
|
96
101
|
ssh_options: Additional SSH options (e.g., ["-p", "2222"])
|
|
97
102
|
remote_xpm_path: Path to experimaestro executable on remote host.
|
|
98
103
|
If None, uses 'uv tool run experimaestro==<version>'.
|
|
104
|
+
output_callback: Callback for SSH process output (stderr).
|
|
105
|
+
If None, a default callback prints with colored prefix.
|
|
106
|
+
Set to False (or a no-op lambda) to disable output display.
|
|
99
107
|
"""
|
|
108
|
+
# Initialize base class (includes service cache)
|
|
109
|
+
super().__init__()
|
|
110
|
+
|
|
100
111
|
self.host = host
|
|
101
112
|
self.remote_workspace = remote_workspace
|
|
102
113
|
self.ssh_options = ssh_options or []
|
|
103
114
|
self.remote_xpm_path = remote_xpm_path
|
|
115
|
+
self._output_callback = output_callback
|
|
104
116
|
|
|
105
117
|
# Session-specific temporary cache directory (created on connect)
|
|
106
118
|
self._temp_dir: Optional[str] = None
|
|
@@ -121,19 +133,17 @@ class SSHStateProviderClient(StateProvider):
|
|
|
121
133
|
|
|
122
134
|
self._read_thread: Optional[threading.Thread] = None
|
|
123
135
|
self._notify_thread: Optional[threading.Thread] = None
|
|
136
|
+
self._stderr_thread: Optional[threading.Thread] = None
|
|
124
137
|
self._running = False
|
|
125
138
|
self._connected = False
|
|
126
139
|
|
|
127
140
|
self._synchronizer: Optional["RemoteFileSynchronizer"] = None
|
|
128
141
|
|
|
129
142
|
# Throttled notification delivery to avoid flooding UI
|
|
130
|
-
self._pending_events: List[
|
|
143
|
+
self._pending_events: List[EventBase] = []
|
|
131
144
|
self._pending_events_lock = threading.Lock()
|
|
132
145
|
self._notify_interval = 2.0 # Seconds between notification batches
|
|
133
146
|
|
|
134
|
-
# Service cache (from base class)
|
|
135
|
-
self._init_service_cache()
|
|
136
|
-
|
|
137
147
|
def connect(self, timeout: float = 30.0):
|
|
138
148
|
"""Establish SSH connection and start remote server
|
|
139
149
|
|
|
@@ -161,10 +171,7 @@ class SSHStateProviderClient(StateProvider):
|
|
|
161
171
|
# Build remote command (workdir is passed to experiments group)
|
|
162
172
|
if self.remote_xpm_path:
|
|
163
173
|
# Use specified path to experimaestro
|
|
164
|
-
remote_cmd =
|
|
165
|
-
f"{self.remote_xpm_path} experiments "
|
|
166
|
-
f"--workdir {self.remote_workspace} monitor-server"
|
|
167
|
-
)
|
|
174
|
+
remote_cmd = f"{self.remote_xpm_path} experiments --workdir {self.remote_workspace} monitor-server"
|
|
168
175
|
else:
|
|
169
176
|
# Use uv tool run with version pinning
|
|
170
177
|
try:
|
|
@@ -175,15 +182,9 @@ class SSHStateProviderClient(StateProvider):
|
|
|
175
182
|
xpm_version = None
|
|
176
183
|
|
|
177
184
|
if xpm_version:
|
|
178
|
-
remote_cmd =
|
|
179
|
-
f"uv tool run experimaestro=={xpm_version} experiments "
|
|
180
|
-
f"--workdir {self.remote_workspace} monitor-server"
|
|
181
|
-
)
|
|
185
|
+
remote_cmd = f"uv tool run experimaestro=={xpm_version} experiments --workdir {self.remote_workspace} monitor-server"
|
|
182
186
|
else:
|
|
183
|
-
remote_cmd =
|
|
184
|
-
f"uv tool run experimaestro experiments "
|
|
185
|
-
f"--workdir {self.remote_workspace} monitor-server"
|
|
186
|
-
)
|
|
187
|
+
remote_cmd = f"uv tool run experimaestro experiments --workdir {self.remote_workspace} monitor-server"
|
|
187
188
|
cmd.append(remote_cmd)
|
|
188
189
|
|
|
189
190
|
logger.info("Connecting to %s, workspace: %s", self.host, self.remote_workspace)
|
|
@@ -218,6 +219,12 @@ class SSHStateProviderClient(StateProvider):
|
|
|
218
219
|
)
|
|
219
220
|
self._notify_thread.start()
|
|
220
221
|
|
|
222
|
+
# Start stderr thread to display SSH output
|
|
223
|
+
self._stderr_thread = threading.Thread(
|
|
224
|
+
target=self._stderr_loop, daemon=True, name="SSHClient-Stderr"
|
|
225
|
+
)
|
|
226
|
+
self._stderr_thread.start()
|
|
227
|
+
|
|
221
228
|
# Wait for connection to be established by sending a test request
|
|
222
229
|
try:
|
|
223
230
|
sync_info = self._call_sync(RPCMethod.GET_SYNC_INFO, {}, timeout=timeout)
|
|
@@ -258,6 +265,8 @@ class SSHStateProviderClient(StateProvider):
|
|
|
258
265
|
self._read_thread.join(timeout=2.0)
|
|
259
266
|
if self._notify_thread and self._notify_thread.is_alive():
|
|
260
267
|
self._notify_thread.join(timeout=2.0)
|
|
268
|
+
if self._stderr_thread and self._stderr_thread.is_alive():
|
|
269
|
+
self._stderr_thread.join(timeout=2.0)
|
|
261
270
|
|
|
262
271
|
# Cancel any pending requests
|
|
263
272
|
with self._response_lock:
|
|
@@ -321,6 +330,33 @@ class SSHStateProviderClient(StateProvider):
|
|
|
321
330
|
logger.warning("Connection to %s lost", self.host)
|
|
322
331
|
self._connected = False
|
|
323
332
|
|
|
333
|
+
def _stderr_loop(self):
|
|
334
|
+
"""Read and display SSH stderr output with colored prefix"""
|
|
335
|
+
while self._running:
|
|
336
|
+
try:
|
|
337
|
+
line = self._stderr.readline()
|
|
338
|
+
if not line:
|
|
339
|
+
# EOF - stderr closed
|
|
340
|
+
logger.debug("SSH stderr closed")
|
|
341
|
+
break
|
|
342
|
+
|
|
343
|
+
line_str = line.decode("utf-8").rstrip("\n\r")
|
|
344
|
+
if not line_str:
|
|
345
|
+
continue
|
|
346
|
+
|
|
347
|
+
# Call output callback or use default
|
|
348
|
+
if self._output_callback is not None:
|
|
349
|
+
self._output_callback(line_str)
|
|
350
|
+
else:
|
|
351
|
+
# Default: print with colored prefix
|
|
352
|
+
prefix = colored("[SSH] ", "cyan", attrs=["bold"])
|
|
353
|
+
print(f"{prefix}{line_str}") # noqa: T201
|
|
354
|
+
|
|
355
|
+
except Exception as e:
|
|
356
|
+
if self._running:
|
|
357
|
+
logger.debug("Error reading stderr: %s", e)
|
|
358
|
+
break
|
|
359
|
+
|
|
324
360
|
def _process_message(self, line: str):
|
|
325
361
|
"""Process a single message from the server"""
|
|
326
362
|
try:
|
|
@@ -364,7 +400,7 @@ class SSHStateProviderClient(StateProvider):
|
|
|
364
400
|
|
|
365
401
|
logger.debug("Received notification: %s", method)
|
|
366
402
|
|
|
367
|
-
# Convert notification to
|
|
403
|
+
# Convert notification to EventBase and queue for throttled delivery
|
|
368
404
|
event = self._notification_to_event(method, params)
|
|
369
405
|
if event:
|
|
370
406
|
with self._pending_events_lock:
|
|
@@ -402,8 +438,9 @@ class SSHStateProviderClient(StateProvider):
|
|
|
402
438
|
seen_types = set()
|
|
403
439
|
unique_events = []
|
|
404
440
|
for event in reversed(events):
|
|
405
|
-
|
|
406
|
-
|
|
441
|
+
event_type = type(event)
|
|
442
|
+
if event_type not in seen_types:
|
|
443
|
+
seen_types.add(event_type)
|
|
407
444
|
unique_events.append(event)
|
|
408
445
|
unique_events.reverse()
|
|
409
446
|
|
|
@@ -411,31 +448,31 @@ class SSHStateProviderClient(StateProvider):
|
|
|
411
448
|
for event in unique_events:
|
|
412
449
|
self._notify_listeners(event)
|
|
413
450
|
|
|
414
|
-
def _notification_to_event(self, method: str, params: Dict) -> Optional[
|
|
415
|
-
"""Convert a notification to a
|
|
416
|
-
if method
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
elif method == NotificationMethod.SERVICE_UPDATED.value:
|
|
432
|
-
return StateEvent(
|
|
433
|
-
event_type=StateEventType.SERVICE_UPDATED,
|
|
434
|
-
data=params.get("data", params),
|
|
435
|
-
)
|
|
436
|
-
return None
|
|
451
|
+
def _notification_to_event(self, method: str, params: Dict) -> Optional[EventBase]:
|
|
452
|
+
"""Convert a notification to a EventBase"""
|
|
453
|
+
if method != NotificationMethod.STATE_EVENT.value:
|
|
454
|
+
# Don't warn for known control notifications (handled elsewhere)
|
|
455
|
+
if method not in (
|
|
456
|
+
NotificationMethod.SHUTDOWN.value,
|
|
457
|
+
NotificationMethod.FILE_CHANGED.value,
|
|
458
|
+
):
|
|
459
|
+
logger.warning("Unhandled notification method: %s", method)
|
|
460
|
+
return None
|
|
461
|
+
|
|
462
|
+
event_type = params.get("event_type")
|
|
463
|
+
data = params.get("data", {})
|
|
464
|
+
event_class = EventBase.get_class(event_type)
|
|
465
|
+
if event_class is None:
|
|
466
|
+
logger.warning("Unknown event type: %s", event_type)
|
|
467
|
+
return None
|
|
437
468
|
|
|
438
|
-
|
|
469
|
+
try:
|
|
470
|
+
return event_class(**data)
|
|
471
|
+
except TypeError as e:
|
|
472
|
+
logger.warning("Error deserializing event %s: %s", event_type, e)
|
|
473
|
+
return None
|
|
474
|
+
|
|
475
|
+
def _notify_listeners(self, event: EventBase):
|
|
439
476
|
"""Notify all registered listeners of a state event"""
|
|
440
477
|
with self._listener_lock:
|
|
441
478
|
listeners = list(self._listeners)
|
|
@@ -539,7 +576,7 @@ class SSHStateProviderClient(StateProvider):
|
|
|
539
576
|
exp = self.get_experiment(experiment_id)
|
|
540
577
|
if exp is None:
|
|
541
578
|
return None
|
|
542
|
-
return exp.
|
|
579
|
+
return exp.run_id
|
|
543
580
|
|
|
544
581
|
def get_jobs(
|
|
545
582
|
self,
|
|
@@ -591,6 +628,32 @@ class SSHStateProviderClient(StateProvider):
|
|
|
591
628
|
result = self._call_sync(RPCMethod.GET_ALL_JOBS, params)
|
|
592
629
|
return [self._dict_to_job(d) for d in result]
|
|
593
630
|
|
|
631
|
+
def get_tags_map(
|
|
632
|
+
self,
|
|
633
|
+
experiment_id: str,
|
|
634
|
+
run_id: Optional[str] = None,
|
|
635
|
+
) -> Dict[str, Dict[str, str]]:
|
|
636
|
+
"""Get tags map for jobs in an experiment/run"""
|
|
637
|
+
params = {
|
|
638
|
+
"experiment_id": experiment_id,
|
|
639
|
+
"run_id": run_id,
|
|
640
|
+
}
|
|
641
|
+
result = self._call_sync(RPCMethod.GET_TAGS_MAP, params)
|
|
642
|
+
return result or {}
|
|
643
|
+
|
|
644
|
+
def get_dependencies_map(
|
|
645
|
+
self,
|
|
646
|
+
experiment_id: str,
|
|
647
|
+
run_id: Optional[str] = None,
|
|
648
|
+
) -> dict[str, list[str]]:
|
|
649
|
+
"""Get dependencies map for jobs in an experiment/run"""
|
|
650
|
+
params = {
|
|
651
|
+
"experiment_id": experiment_id,
|
|
652
|
+
"run_id": run_id,
|
|
653
|
+
}
|
|
654
|
+
result = self._call_sync(RPCMethod.GET_DEPENDENCIES_MAP, params)
|
|
655
|
+
return result or {}
|
|
656
|
+
|
|
594
657
|
def _fetch_services_from_storage(
|
|
595
658
|
self, experiment_id: Optional[str], run_id: Optional[str]
|
|
596
659
|
) -> List[BaseService]:
|
|
@@ -611,16 +674,6 @@ class SSHStateProviderClient(StateProvider):
|
|
|
611
674
|
|
|
612
675
|
return services
|
|
613
676
|
|
|
614
|
-
def get_services_raw(
|
|
615
|
-
self, experiment_id: Optional[str] = None, run_id: Optional[str] = None
|
|
616
|
-
) -> List[Dict]:
|
|
617
|
-
"""Get raw service data as dictionaries"""
|
|
618
|
-
params = {
|
|
619
|
-
"experiment_id": experiment_id,
|
|
620
|
-
"run_id": run_id,
|
|
621
|
-
}
|
|
622
|
-
return self._call_sync(RPCMethod.GET_SERVICES, params)
|
|
623
|
-
|
|
624
677
|
def kill_job(self, job: BaseJob, perform: bool = False) -> bool:
|
|
625
678
|
"""Kill a running job"""
|
|
626
679
|
if not perform:
|
|
@@ -649,69 +702,71 @@ class SSHStateProviderClient(StateProvider):
|
|
|
649
702
|
result = self._call_sync(RPCMethod.CLEAN_JOB, params)
|
|
650
703
|
return result.get("success", False)
|
|
651
704
|
|
|
705
|
+
def get_process_info(self, job: BaseJob):
|
|
706
|
+
"""Get process information for a job
|
|
707
|
+
|
|
708
|
+
Returns None if the remote server doesn't support this method.
|
|
709
|
+
"""
|
|
710
|
+
from experimaestro.scheduler.state_provider import ProcessInfo
|
|
711
|
+
|
|
712
|
+
params = {
|
|
713
|
+
"job_id": job.identifier,
|
|
714
|
+
"experiment_id": getattr(job, "experiment_id", ""),
|
|
715
|
+
"run_id": getattr(job, "run_id", ""),
|
|
716
|
+
}
|
|
717
|
+
|
|
718
|
+
try:
|
|
719
|
+
result = self._call_sync(RPCMethod.GET_PROCESS_INFO, params)
|
|
720
|
+
except RuntimeError:
|
|
721
|
+
# Server doesn't support this method (older version)
|
|
722
|
+
return None
|
|
723
|
+
|
|
724
|
+
if result is None:
|
|
725
|
+
return None
|
|
726
|
+
|
|
727
|
+
return ProcessInfo(
|
|
728
|
+
pid=result["pid"],
|
|
729
|
+
type=result["type"],
|
|
730
|
+
running=result.get("running", False),
|
|
731
|
+
)
|
|
732
|
+
|
|
652
733
|
# -------------------------------------------------------------------------
|
|
653
734
|
# Data Conversion
|
|
654
735
|
# -------------------------------------------------------------------------
|
|
655
736
|
|
|
656
737
|
def _dict_to_job(self, d: Dict) -> MockJob:
|
|
657
|
-
"""Convert a dictionary to a MockJob"""
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
# Map local cache path for the job
|
|
661
|
-
path = None
|
|
738
|
+
"""Convert a dictionary to a MockJob using from_state_dict"""
|
|
739
|
+
# Translate remote path to local cache path
|
|
662
740
|
if d.get("path"):
|
|
663
|
-
# The path from remote is absolute on remote system
|
|
664
|
-
# We map it to local cache
|
|
665
741
|
remote_path = d["path"]
|
|
666
742
|
if remote_path.startswith(self.remote_workspace):
|
|
667
743
|
relative = remote_path[len(self.remote_workspace) :].lstrip("/")
|
|
668
|
-
path = self.local_cache_dir / relative
|
|
744
|
+
d["path"] = self.local_cache_dir / relative
|
|
669
745
|
else:
|
|
670
|
-
path = Path(remote_path)
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
state=state_str,
|
|
678
|
-
submittime=self._parse_datetime_to_timestamp(d.get("submittime")),
|
|
679
|
-
starttime=self._parse_datetime_to_timestamp(d.get("starttime")),
|
|
680
|
-
endtime=self._parse_datetime_to_timestamp(d.get("endtime")),
|
|
681
|
-
progress=d.get("progress", []),
|
|
682
|
-
tags=d.get("tags", {}),
|
|
683
|
-
experiment_id=d.get("experiment_id", ""),
|
|
684
|
-
run_id=d.get("run_id", ""),
|
|
685
|
-
updated_at="",
|
|
686
|
-
)
|
|
746
|
+
d["path"] = Path(remote_path)
|
|
747
|
+
|
|
748
|
+
# Convert ISO datetime strings back to timestamps (floats)
|
|
749
|
+
for key in ("submitted_time", "started_time", "ended_time"):
|
|
750
|
+
d[key] = self._parse_datetime_to_timestamp(d.get(key))
|
|
751
|
+
|
|
752
|
+
return MockJob.from_state_dict(d, self.local_cache_dir)
|
|
687
753
|
|
|
688
754
|
def _dict_to_experiment(self, d: Dict) -> MockExperiment:
|
|
689
|
-
"""Convert a dictionary to a MockExperiment"""
|
|
690
|
-
#
|
|
691
|
-
workdir = None
|
|
755
|
+
"""Convert a dictionary to a MockExperiment using from_state_dict"""
|
|
756
|
+
# Translate remote workdir to local cache path
|
|
692
757
|
if d.get("workdir"):
|
|
693
758
|
remote_path = d["workdir"]
|
|
694
759
|
if remote_path.startswith(self.remote_workspace):
|
|
695
760
|
relative = remote_path[len(self.remote_workspace) :].lstrip("/")
|
|
696
|
-
workdir = self.local_cache_dir / relative
|
|
761
|
+
d["workdir"] = self.local_cache_dir / relative
|
|
697
762
|
else:
|
|
698
|
-
workdir = Path(remote_path)
|
|
699
|
-
|
|
700
|
-
# Convert ISO datetime strings to
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
return MockExperiment(
|
|
705
|
-
workdir=workdir or self.local_cache_dir / "xp" / d["experiment_id"],
|
|
706
|
-
current_run_id=d.get("current_run_id"),
|
|
707
|
-
total_jobs=d.get("total_jobs", 0),
|
|
708
|
-
finished_jobs=d.get("finished_jobs", 0),
|
|
709
|
-
failed_jobs=d.get("failed_jobs", 0),
|
|
710
|
-
updated_at=d.get("updated_at", ""),
|
|
711
|
-
started_at=started_at,
|
|
712
|
-
ended_at=ended_at,
|
|
713
|
-
hostname=d.get("hostname"),
|
|
714
|
-
)
|
|
763
|
+
d["workdir"] = Path(remote_path)
|
|
764
|
+
|
|
765
|
+
# Convert ISO datetime strings back to timestamps (floats)
|
|
766
|
+
for key in ("started_at", "ended_at"):
|
|
767
|
+
d[key] = self._parse_datetime_to_timestamp(d.get(key))
|
|
768
|
+
|
|
769
|
+
return MockExperiment.from_state_dict(d, self.local_cache_dir)
|
|
715
770
|
|
|
716
771
|
def _dict_to_service(self, d: Dict) -> BaseService:
|
|
717
772
|
"""Convert a dictionary to a Service or MockService
|
|
@@ -720,6 +775,7 @@ class SSHStateProviderClient(StateProvider):
|
|
|
720
775
|
Falls back to MockService with error message if module is missing.
|
|
721
776
|
"""
|
|
722
777
|
state_dict = d.get("state_dict", {})
|
|
778
|
+
service_class = d.get("class", "")
|
|
723
779
|
service_id = d.get("service_id", "")
|
|
724
780
|
|
|
725
781
|
# Check for unserializable marker
|
|
@@ -729,13 +785,14 @@ class SSHStateProviderClient(StateProvider):
|
|
|
729
785
|
service_id=service_id,
|
|
730
786
|
description_text=f"[{reason}]",
|
|
731
787
|
state_dict_data=state_dict,
|
|
788
|
+
service_class=service_class,
|
|
732
789
|
experiment_id=d.get("experiment_id"),
|
|
733
790
|
run_id=d.get("run_id"),
|
|
734
791
|
url=d.get("url"),
|
|
735
792
|
)
|
|
736
793
|
|
|
737
794
|
# Try to recreate actual Service from state_dict
|
|
738
|
-
if
|
|
795
|
+
if service_class:
|
|
739
796
|
try:
|
|
740
797
|
from experimaestro.scheduler.services import Service
|
|
741
798
|
|
|
@@ -751,7 +808,9 @@ class SSHStateProviderClient(StateProvider):
|
|
|
751
808
|
return self.local_cache_dir / relative
|
|
752
809
|
return Path(remote_path)
|
|
753
810
|
|
|
754
|
-
service = Service.from_state_dict(
|
|
811
|
+
service = Service.from_state_dict(
|
|
812
|
+
service_class, state_dict, path_translator
|
|
813
|
+
)
|
|
755
814
|
service.id = service_id
|
|
756
815
|
# Copy additional attributes
|
|
757
816
|
if d.get("experiment_id"):
|
|
@@ -766,6 +825,7 @@ class SSHStateProviderClient(StateProvider):
|
|
|
766
825
|
service_id=service_id,
|
|
767
826
|
description_text=f"[Missing module: {missing_module}]",
|
|
768
827
|
state_dict_data=state_dict,
|
|
828
|
+
service_class=service_class,
|
|
769
829
|
experiment_id=d.get("experiment_id"),
|
|
770
830
|
run_id=d.get("run_id"),
|
|
771
831
|
url=d.get("url"),
|
|
@@ -776,20 +836,14 @@ class SSHStateProviderClient(StateProvider):
|
|
|
776
836
|
service_id=service_id,
|
|
777
837
|
description_text=f"[Error: {e}]",
|
|
778
838
|
state_dict_data=state_dict,
|
|
839
|
+
service_class=service_class,
|
|
779
840
|
experiment_id=d.get("experiment_id"),
|
|
780
841
|
run_id=d.get("run_id"),
|
|
781
842
|
url=d.get("url"),
|
|
782
843
|
)
|
|
783
844
|
|
|
784
|
-
# No
|
|
785
|
-
return MockService(
|
|
786
|
-
service_id=service_id,
|
|
787
|
-
description_text=d.get("description", ""),
|
|
788
|
-
state_dict_data=state_dict,
|
|
789
|
-
experiment_id=d.get("experiment_id"),
|
|
790
|
-
run_id=d.get("run_id"),
|
|
791
|
-
url=d.get("url"),
|
|
792
|
-
)
|
|
845
|
+
# No class - use MockService.from_full_state_dict
|
|
846
|
+
return MockService.from_full_state_dict(d)
|
|
793
847
|
|
|
794
848
|
def _parse_datetime_to_timestamp(self, value) -> Optional[float]:
|
|
795
849
|
"""Convert datetime value to Unix timestamp
|