experimaestro 2.0.0b4__py3-none-any.whl → 2.0.0b17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of experimaestro might be problematic. Click here for more details.
- experimaestro/__init__.py +12 -5
- experimaestro/cli/__init__.py +393 -134
- experimaestro/cli/filter.py +48 -23
- experimaestro/cli/jobs.py +253 -71
- experimaestro/cli/refactor.py +1 -2
- experimaestro/commandline.py +7 -4
- experimaestro/connectors/__init__.py +9 -1
- experimaestro/connectors/local.py +43 -3
- experimaestro/core/arguments.py +18 -18
- experimaestro/core/identifier.py +11 -11
- experimaestro/core/objects/config.py +96 -39
- experimaestro/core/objects/config_walk.py +3 -3
- experimaestro/core/{subparameters.py → partial.py} +16 -16
- experimaestro/core/partial_lock.py +394 -0
- experimaestro/core/types.py +12 -15
- experimaestro/dynamic.py +290 -0
- experimaestro/experiments/__init__.py +6 -2
- experimaestro/experiments/cli.py +223 -52
- experimaestro/experiments/configuration.py +24 -0
- experimaestro/generators.py +5 -5
- experimaestro/ipc.py +118 -1
- experimaestro/launcherfinder/__init__.py +2 -2
- experimaestro/launcherfinder/registry.py +6 -7
- experimaestro/launcherfinder/specs.py +2 -9
- experimaestro/launchers/slurm/__init__.py +2 -2
- experimaestro/launchers/slurm/base.py +62 -0
- experimaestro/locking.py +957 -1
- experimaestro/notifications.py +89 -201
- experimaestro/progress.py +63 -366
- experimaestro/rpyc.py +0 -2
- experimaestro/run.py +29 -2
- experimaestro/scheduler/__init__.py +8 -1
- experimaestro/scheduler/base.py +650 -53
- experimaestro/scheduler/dependencies.py +20 -16
- experimaestro/scheduler/experiment.py +764 -169
- experimaestro/scheduler/interfaces.py +338 -96
- experimaestro/scheduler/jobs.py +58 -20
- experimaestro/scheduler/remote/__init__.py +31 -0
- experimaestro/scheduler/remote/adaptive_sync.py +265 -0
- experimaestro/scheduler/remote/client.py +928 -0
- experimaestro/scheduler/remote/protocol.py +282 -0
- experimaestro/scheduler/remote/server.py +447 -0
- experimaestro/scheduler/remote/sync.py +144 -0
- experimaestro/scheduler/services.py +186 -35
- experimaestro/scheduler/state_provider.py +811 -2157
- experimaestro/scheduler/state_status.py +1247 -0
- experimaestro/scheduler/transient.py +31 -0
- experimaestro/scheduler/workspace.py +1 -1
- experimaestro/scheduler/workspace_state_provider.py +1273 -0
- experimaestro/scriptbuilder.py +4 -4
- experimaestro/settings.py +36 -0
- experimaestro/tests/conftest.py +33 -5
- experimaestro/tests/connectors/bin/executable.py +1 -1
- experimaestro/tests/fixtures/pre_experiment/experiment_check_env.py +16 -0
- experimaestro/tests/fixtures/pre_experiment/experiment_check_mock.py +14 -0
- experimaestro/tests/fixtures/pre_experiment/experiment_simple.py +12 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_env.py +5 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_error.py +3 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_mock.py +8 -0
- experimaestro/tests/launchers/bin/test.py +1 -0
- experimaestro/tests/launchers/test_slurm.py +9 -9
- experimaestro/tests/partial_reschedule.py +46 -0
- experimaestro/tests/restart.py +3 -3
- experimaestro/tests/restart_main.py +1 -0
- experimaestro/tests/scripts/notifyandwait.py +1 -0
- experimaestro/tests/task_partial.py +38 -0
- experimaestro/tests/task_tokens.py +2 -2
- experimaestro/tests/tasks/test_dynamic.py +6 -6
- experimaestro/tests/test_dependencies.py +3 -3
- experimaestro/tests/test_deprecated.py +15 -15
- experimaestro/tests/test_dynamic_locking.py +317 -0
- experimaestro/tests/test_environment.py +24 -14
- experimaestro/tests/test_experiment.py +171 -36
- experimaestro/tests/test_identifier.py +25 -25
- experimaestro/tests/test_identifier_stability.py +3 -5
- experimaestro/tests/test_multitoken.py +2 -4
- experimaestro/tests/{test_subparameters.py → test_partial.py} +25 -25
- experimaestro/tests/test_partial_paths.py +81 -138
- experimaestro/tests/test_pre_experiment.py +219 -0
- experimaestro/tests/test_progress.py +2 -8
- experimaestro/tests/test_remote_state.py +1132 -0
- experimaestro/tests/test_stray_jobs.py +261 -0
- experimaestro/tests/test_tasks.py +1 -2
- experimaestro/tests/test_token_locking.py +52 -67
- experimaestro/tests/test_tokens.py +5 -6
- experimaestro/tests/test_transient.py +225 -0
- experimaestro/tests/test_workspace_state_provider.py +768 -0
- experimaestro/tests/token_reschedule.py +1 -3
- experimaestro/tests/utils.py +2 -7
- experimaestro/tokens.py +227 -372
- experimaestro/tools/diff.py +1 -0
- experimaestro/tools/documentation.py +4 -5
- experimaestro/tools/jobs.py +1 -2
- experimaestro/tui/app.py +459 -1895
- experimaestro/tui/app.tcss +162 -0
- experimaestro/tui/dialogs.py +172 -0
- experimaestro/tui/log_viewer.py +253 -3
- experimaestro/tui/messages.py +137 -0
- experimaestro/tui/utils.py +54 -0
- experimaestro/tui/widgets/__init__.py +23 -0
- experimaestro/tui/widgets/experiments.py +468 -0
- experimaestro/tui/widgets/global_services.py +238 -0
- experimaestro/tui/widgets/jobs.py +972 -0
- experimaestro/tui/widgets/log.py +156 -0
- experimaestro/tui/widgets/orphans.py +363 -0
- experimaestro/tui/widgets/runs.py +185 -0
- experimaestro/tui/widgets/services.py +314 -0
- experimaestro/tui/widgets/stray_jobs.py +528 -0
- experimaestro/utils/__init__.py +1 -1
- experimaestro/utils/environment.py +105 -22
- experimaestro/utils/fswatcher.py +124 -0
- experimaestro/utils/jobs.py +1 -2
- experimaestro/utils/jupyter.py +1 -2
- experimaestro/utils/logging.py +72 -0
- experimaestro/version.py +2 -2
- experimaestro/webui/__init__.py +9 -0
- experimaestro/webui/app.py +117 -0
- experimaestro/{server → webui}/data/index.css +66 -11
- experimaestro/webui/data/index.css.map +1 -0
- experimaestro/{server → webui}/data/index.js +82763 -87217
- experimaestro/webui/data/index.js.map +1 -0
- experimaestro/webui/routes/__init__.py +5 -0
- experimaestro/webui/routes/auth.py +53 -0
- experimaestro/webui/routes/proxy.py +117 -0
- experimaestro/webui/server.py +200 -0
- experimaestro/webui/state_bridge.py +152 -0
- experimaestro/webui/websocket.py +413 -0
- {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/METADATA +8 -9
- experimaestro-2.0.0b17.dist-info/RECORD +219 -0
- experimaestro/cli/progress.py +0 -269
- experimaestro/scheduler/state.py +0 -75
- experimaestro/scheduler/state_db.py +0 -388
- experimaestro/scheduler/state_sync.py +0 -834
- experimaestro/server/__init__.py +0 -467
- experimaestro/server/data/index.css.map +0 -1
- experimaestro/server/data/index.js.map +0 -1
- experimaestro/tests/test_cli_jobs.py +0 -615
- experimaestro/tests/test_file_progress.py +0 -425
- experimaestro/tests/test_file_progress_integration.py +0 -477
- experimaestro/tests/test_state_db.py +0 -434
- experimaestro-2.0.0b4.dist-info/RECORD +0 -181
- /experimaestro/{server → webui}/data/1815e00441357e01619e.ttf +0 -0
- /experimaestro/{server → webui}/data/2463b90d9a316e4e5294.woff2 +0 -0
- /experimaestro/{server → webui}/data/2582b0e4bcf85eceead0.ttf +0 -0
- /experimaestro/{server → webui}/data/89999bdf5d835c012025.woff2 +0 -0
- /experimaestro/{server → webui}/data/914997e1bdfc990d0897.ttf +0 -0
- /experimaestro/{server → webui}/data/c210719e60948b211a12.woff2 +0 -0
- /experimaestro/{server → webui}/data/favicon.ico +0 -0
- /experimaestro/{server → webui}/data/index.html +0 -0
- /experimaestro/{server → webui}/data/login.html +0 -0
- /experimaestro/{server → webui}/data/manifest.json +0 -0
- {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/WHEEL +0 -0
- {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/entry_points.txt +0 -0
- {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/licenses/LICENSE +0 -0
experimaestro/scheduler/base.py
CHANGED
|
@@ -1,17 +1,34 @@
|
|
|
1
|
+
import json
|
|
1
2
|
import threading
|
|
2
3
|
import time
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from pathlib import Path
|
|
3
6
|
from typing import (
|
|
7
|
+
Dict,
|
|
8
|
+
List,
|
|
4
9
|
Optional,
|
|
5
10
|
Set,
|
|
6
11
|
ClassVar,
|
|
7
12
|
TYPE_CHECKING,
|
|
8
13
|
)
|
|
9
14
|
import asyncio
|
|
10
|
-
from typing import Dict
|
|
11
15
|
|
|
12
16
|
from experimaestro.scheduler import experiment
|
|
13
|
-
from experimaestro.scheduler.jobs import Job, JobState, JobError
|
|
17
|
+
from experimaestro.scheduler.jobs import Job, JobState, JobError, JobDependency
|
|
14
18
|
from experimaestro.scheduler.services import Service
|
|
19
|
+
from experimaestro.scheduler.interfaces import (
|
|
20
|
+
BaseJob,
|
|
21
|
+
BaseExperiment,
|
|
22
|
+
BaseService,
|
|
23
|
+
)
|
|
24
|
+
from experimaestro.scheduler.state_provider import StateProvider
|
|
25
|
+
from experimaestro.scheduler.state_status import (
|
|
26
|
+
EventReader,
|
|
27
|
+
JobProgressEvent,
|
|
28
|
+
JobStateChangedEvent,
|
|
29
|
+
WatchedDirectory,
|
|
30
|
+
job_entity_id_extractor,
|
|
31
|
+
)
|
|
15
32
|
|
|
16
33
|
|
|
17
34
|
from experimaestro.utils import logger
|
|
@@ -19,7 +36,7 @@ from experimaestro.utils.asyncio import asyncThreadcheck
|
|
|
19
36
|
import concurrent.futures
|
|
20
37
|
|
|
21
38
|
if TYPE_CHECKING:
|
|
22
|
-
from experimaestro.server import Server
|
|
39
|
+
from experimaestro.webui import WebUIServer
|
|
23
40
|
from experimaestro.settings import ServerSettings
|
|
24
41
|
from experimaestro.scheduler.workspace import Workspace
|
|
25
42
|
|
|
@@ -36,18 +53,25 @@ class Listener:
|
|
|
36
53
|
pass
|
|
37
54
|
|
|
38
55
|
|
|
39
|
-
class Scheduler(threading.Thread):
|
|
40
|
-
"""A job scheduler (singleton)
|
|
56
|
+
class Scheduler(StateProvider, threading.Thread):
|
|
57
|
+
"""A job scheduler (singleton) that provides live state
|
|
41
58
|
|
|
42
59
|
The scheduler is based on asyncio for easy concurrency handling.
|
|
43
60
|
This is a singleton - only one scheduler instance exists per process.
|
|
61
|
+
|
|
62
|
+
Inherits from StateProvider to allow TUI/Web interfaces to access
|
|
63
|
+
live job and experiment state during experiment execution.
|
|
44
64
|
"""
|
|
45
65
|
|
|
46
66
|
_instance: ClassVar[Optional["Scheduler"]] = None
|
|
47
67
|
_lock: ClassVar[threading.Lock] = threading.Lock()
|
|
48
68
|
|
|
69
|
+
#: Scheduler is always live
|
|
70
|
+
is_live: bool = True
|
|
71
|
+
|
|
49
72
|
def __init__(self, name: str = "Global"):
|
|
50
|
-
|
|
73
|
+
StateProvider.__init__(self) # Initialize state listener management
|
|
74
|
+
threading.Thread.__init__(self, name=f"Scheduler ({name})", daemon=True)
|
|
51
75
|
self._ready = threading.Event()
|
|
52
76
|
|
|
53
77
|
# Name of the scheduler
|
|
@@ -62,10 +86,19 @@ class Scheduler(threading.Thread):
|
|
|
62
86
|
# List of all jobs
|
|
63
87
|
self.jobs: Dict[str, "Job"] = {}
|
|
64
88
|
|
|
89
|
+
# Services: (experiment_id, run_id) -> {service_id -> Service}
|
|
90
|
+
self.services: Dict[tuple[str, str], Dict[str, Service]] = {}
|
|
91
|
+
|
|
92
|
+
# Tags map: (experiment_id, run_id) -> {job_id -> {tag_key: tag_value}}
|
|
93
|
+
self._tags_map: dict[tuple[str, str], dict[str, dict[str, str]]] = {}
|
|
94
|
+
|
|
95
|
+
# Dependencies map: (experiment_id, run_id) -> {job_id -> [depends_on_job_ids]}
|
|
96
|
+
self._dependencies_map: dict[tuple[str, str], dict[str, list[str]]] = {}
|
|
97
|
+
|
|
65
98
|
# List of jobs
|
|
66
99
|
self.waitingjobs: Set[Job] = set()
|
|
67
100
|
|
|
68
|
-
#
|
|
101
|
+
# Legacy listeners with thread-safe access
|
|
69
102
|
self._listeners: Set[Listener] = set()
|
|
70
103
|
self._listeners_lock = threading.Lock()
|
|
71
104
|
|
|
@@ -75,7 +108,12 @@ class Scheduler(threading.Thread):
|
|
|
75
108
|
)
|
|
76
109
|
|
|
77
110
|
# Server (managed by scheduler)
|
|
78
|
-
self.server: Optional["Server"] = None
|
|
111
|
+
self.server: Optional["WebUIServer"] = None
|
|
112
|
+
|
|
113
|
+
# Job event readers per workspace
|
|
114
|
+
# Uses EventReader to watch .events/jobs/ directory
|
|
115
|
+
self._job_event_readers: Dict[Path, EventReader] = {}
|
|
116
|
+
self._job_event_readers_lock = threading.Lock()
|
|
79
117
|
|
|
80
118
|
@staticmethod
|
|
81
119
|
def has_instance() -> bool:
|
|
@@ -114,51 +152,58 @@ class Scheduler(threading.Thread):
|
|
|
114
152
|
|
|
115
153
|
def register_experiment(self, xp: "experiment"):
|
|
116
154
|
"""Register an experiment with the scheduler"""
|
|
117
|
-
# Use experiment name as key
|
|
118
|
-
key = xp.workdir.name
|
|
155
|
+
# Use experiment name as key (not workdir.name which is now run_id)
|
|
156
|
+
key = xp.name
|
|
119
157
|
self.experiments[key] = xp
|
|
120
158
|
|
|
159
|
+
# Start watching job events for this workspace
|
|
160
|
+
self._start_job_event_reader(xp.workspace.path)
|
|
161
|
+
|
|
121
162
|
logger.debug("Registered experiment %s with scheduler", key)
|
|
122
163
|
|
|
123
164
|
def unregister_experiment(self, xp: "experiment"):
|
|
124
165
|
"""Unregister an experiment from the scheduler"""
|
|
125
|
-
key = xp.workdir.name
|
|
166
|
+
key = xp.name
|
|
126
167
|
if key in self.experiments:
|
|
127
168
|
del self.experiments[key]
|
|
128
169
|
logger.debug("Unregistered experiment %s from scheduler", key)
|
|
129
170
|
|
|
130
171
|
def start_server(
|
|
131
|
-
self,
|
|
172
|
+
self,
|
|
173
|
+
settings: "ServerSettings" = None,
|
|
174
|
+
workspace: "Workspace" = None, # noqa: ARG002 - kept for backward compat
|
|
175
|
+
wait_for_quit: bool = False,
|
|
132
176
|
):
|
|
133
|
-
"""Start the
|
|
177
|
+
"""Start the web server (if not already running)
|
|
134
178
|
|
|
135
179
|
Args:
|
|
136
180
|
settings: Server settings
|
|
137
|
-
workspace: Workspace instance (
|
|
181
|
+
workspace: Workspace instance (deprecated, not used)
|
|
182
|
+
wait_for_quit: If True, server waits for explicit quit from web UI
|
|
138
183
|
"""
|
|
139
184
|
if self.server is None:
|
|
140
|
-
from experimaestro.server import Server
|
|
141
|
-
from experimaestro.scheduler.state_provider import WorkspaceStateProvider
|
|
185
|
+
from experimaestro.webui import WebUIServer
|
|
142
186
|
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
# Get the workspace state provider singleton
|
|
147
|
-
state_provider = WorkspaceStateProvider.get_instance(
|
|
148
|
-
workspace.path, read_only=False, sync_on_start=False
|
|
149
|
-
)
|
|
150
|
-
|
|
151
|
-
self.server = Server.instance(settings, state_provider)
|
|
187
|
+
# Use the Scheduler itself as the StateProvider for live state access
|
|
188
|
+
self.server = WebUIServer.instance(settings, self, wait_for_quit)
|
|
152
189
|
self.server.start()
|
|
153
|
-
logger.info("
|
|
190
|
+
logger.info("Web server started by scheduler")
|
|
154
191
|
else:
|
|
155
|
-
logger.debug("
|
|
192
|
+
logger.debug("Web server already running")
|
|
156
193
|
|
|
157
194
|
def stop_server(self):
|
|
158
|
-
"""Stop the
|
|
195
|
+
"""Stop the web server"""
|
|
159
196
|
if self.server is not None:
|
|
160
197
|
self.server.stop()
|
|
161
|
-
logger.info("
|
|
198
|
+
logger.info("Web server stopped by scheduler")
|
|
199
|
+
|
|
200
|
+
def wait_for_server_quit(self):
|
|
201
|
+
"""Wait for explicit quit from web interface
|
|
202
|
+
|
|
203
|
+
Only blocks if server was started with wait_for_quit=True.
|
|
204
|
+
"""
|
|
205
|
+
if self.server is not None:
|
|
206
|
+
self.server.wait()
|
|
162
207
|
|
|
163
208
|
def run(self):
|
|
164
209
|
"""Run the event loop forever"""
|
|
@@ -197,6 +242,27 @@ class Scheduler(threading.Thread):
|
|
|
197
242
|
with self._listeners_lock:
|
|
198
243
|
self._listeners.clear()
|
|
199
244
|
|
|
245
|
+
def wait_for_notifications(self, timeout: float = 5.0) -> bool:
|
|
246
|
+
"""Wait for all pending notifications to be processed.
|
|
247
|
+
|
|
248
|
+
This submits a sentinel task and waits for it to complete,
|
|
249
|
+
ensuring all previously submitted notifications have been processed.
|
|
250
|
+
|
|
251
|
+
Args:
|
|
252
|
+
timeout: Maximum time to wait in seconds
|
|
253
|
+
|
|
254
|
+
Returns:
|
|
255
|
+
True if all notifications were processed, False if timeout occurred
|
|
256
|
+
"""
|
|
257
|
+
try:
|
|
258
|
+
# Submit a no-op and wait for it to complete
|
|
259
|
+
future = self._notification_executor.submit(lambda: None)
|
|
260
|
+
future.result(timeout=timeout)
|
|
261
|
+
return True
|
|
262
|
+
except concurrent.futures.TimeoutError:
|
|
263
|
+
logger.warning("Timeout waiting for notification queue to drain")
|
|
264
|
+
return False
|
|
265
|
+
|
|
200
266
|
def getJobState(self, job: Job) -> "concurrent.futures.Future[JobState]":
|
|
201
267
|
# Check if the job belongs to this scheduler
|
|
202
268
|
if job.identifier not in self.jobs:
|
|
@@ -256,12 +322,35 @@ class Scheduler(threading.Thread):
|
|
|
256
322
|
xp = experiment.current()
|
|
257
323
|
xp.add_job(other)
|
|
258
324
|
|
|
325
|
+
# Merge transient modes: more conservative mode wins
|
|
326
|
+
# NONE(0) > TRANSIENT(1) > REMOVE(2) - lower value wins
|
|
327
|
+
was_transient = other.transient.is_transient
|
|
328
|
+
if job.transient < other.transient:
|
|
329
|
+
other.transient = job.transient
|
|
330
|
+
# If job was transient and is now non-transient, mark it as needed
|
|
331
|
+
# This flag tells aio_submit not to skip the job
|
|
332
|
+
if was_transient and not other.transient.is_transient:
|
|
333
|
+
other._needed_transient = True
|
|
334
|
+
|
|
259
335
|
# Copy watched outputs from new job to existing job
|
|
260
336
|
# This ensures new callbacks are registered even for resubmitted jobs
|
|
261
337
|
other.watched_outputs.extend(job.watched_outputs)
|
|
262
338
|
|
|
339
|
+
# Check if job needs to be re-started
|
|
340
|
+
need_restart = False
|
|
263
341
|
if other.state.is_error():
|
|
264
|
-
logger.info("Re-submitting job")
|
|
342
|
+
logger.info("Re-submitting job (was in error state)")
|
|
343
|
+
need_restart = True
|
|
344
|
+
elif (
|
|
345
|
+
was_transient
|
|
346
|
+
and not other.transient.is_transient
|
|
347
|
+
and other.state == JobState.UNSCHEDULED
|
|
348
|
+
):
|
|
349
|
+
# Job was transient and skipped, but now is non-transient - restart it
|
|
350
|
+
logger.info("Re-submitting job (was transient, now non-transient)")
|
|
351
|
+
need_restart = True
|
|
352
|
+
|
|
353
|
+
if need_restart:
|
|
265
354
|
# Clean up old process info so it will be re-started
|
|
266
355
|
other._process = None
|
|
267
356
|
if other.pidpath.is_file():
|
|
@@ -269,6 +358,7 @@ class Scheduler(threading.Thread):
|
|
|
269
358
|
# Use set_state to handle experiment statistics updates
|
|
270
359
|
other.set_state(JobState.WAITING)
|
|
271
360
|
self.notify_job_state(other) # Notify listeners of re-submit
|
|
361
|
+
# The calling aio_submit will continue with this job and start it
|
|
272
362
|
else:
|
|
273
363
|
logger.warning("Job %s already submitted", job.identifier)
|
|
274
364
|
|
|
@@ -283,13 +373,145 @@ class Scheduler(threading.Thread):
|
|
|
283
373
|
job.submittime = time.time()
|
|
284
374
|
xp.add_job(job)
|
|
285
375
|
|
|
376
|
+
# Update tags map for this experiment/run
|
|
377
|
+
if job.tags:
|
|
378
|
+
exp_run_key = (xp.name, xp.run_id)
|
|
379
|
+
if exp_run_key not in self._tags_map:
|
|
380
|
+
self._tags_map[exp_run_key] = {}
|
|
381
|
+
self._tags_map[exp_run_key][job.identifier] = dict(job.tags)
|
|
382
|
+
|
|
383
|
+
# Update dependencies map for this experiment/run
|
|
384
|
+
exp_run_key = (xp.name, xp.run_id)
|
|
385
|
+
if exp_run_key not in self._dependencies_map:
|
|
386
|
+
self._dependencies_map[exp_run_key] = {}
|
|
387
|
+
depends_on_ids = [
|
|
388
|
+
dep.origin.identifier
|
|
389
|
+
for dep in job.dependencies
|
|
390
|
+
if isinstance(dep, JobDependency)
|
|
391
|
+
]
|
|
392
|
+
if depends_on_ids:
|
|
393
|
+
self._dependencies_map[exp_run_key][job.identifier] = depends_on_ids
|
|
394
|
+
|
|
286
395
|
# Set up dependencies
|
|
287
396
|
for dependency in job.dependencies:
|
|
288
397
|
dependency.target = job
|
|
289
|
-
|
|
398
|
+
# Some dependencies (like PartialDependency) don't have an origin resource
|
|
399
|
+
if dependency.origin is not None:
|
|
400
|
+
dependency.origin.dependents.add(dependency)
|
|
290
401
|
|
|
291
402
|
return None
|
|
292
403
|
|
|
404
|
+
def _start_job_event_reader(self, workspace_path: Path) -> None:
|
|
405
|
+
"""Start watching job events in a workspace
|
|
406
|
+
|
|
407
|
+
Uses EventReader to watch .events/jobs/ for job progress events.
|
|
408
|
+
Job state events are emitted by the job process itself.
|
|
409
|
+
Only starts one reader per workspace.
|
|
410
|
+
|
|
411
|
+
Args:
|
|
412
|
+
workspace_path: Path to the workspace directory
|
|
413
|
+
"""
|
|
414
|
+
with self._job_event_readers_lock:
|
|
415
|
+
# Already watching this workspace
|
|
416
|
+
if workspace_path in self._job_event_readers:
|
|
417
|
+
return
|
|
418
|
+
|
|
419
|
+
jobs_dir = workspace_path / ".events" / "jobs"
|
|
420
|
+
|
|
421
|
+
# Create new reader for this workspace
|
|
422
|
+
reader = EventReader(
|
|
423
|
+
[
|
|
424
|
+
WatchedDirectory(
|
|
425
|
+
path=jobs_dir,
|
|
426
|
+
glob_pattern="*/event-*-*.jsonl",
|
|
427
|
+
entity_id_extractor=job_entity_id_extractor,
|
|
428
|
+
)
|
|
429
|
+
]
|
|
430
|
+
)
|
|
431
|
+
reader.start_watching(
|
|
432
|
+
on_event=self._on_job_event,
|
|
433
|
+
)
|
|
434
|
+
self._job_event_readers[workspace_path] = reader
|
|
435
|
+
logger.debug("Started job event reader for %s", jobs_dir)
|
|
436
|
+
|
|
437
|
+
def _stop_job_event_reader(self, workspace_path: Optional[Path] = None) -> None:
|
|
438
|
+
"""Stop watching job events
|
|
439
|
+
|
|
440
|
+
Args:
|
|
441
|
+
workspace_path: If provided, stop only this workspace's reader.
|
|
442
|
+
If None, stop all readers.
|
|
443
|
+
"""
|
|
444
|
+
with self._job_event_readers_lock:
|
|
445
|
+
if workspace_path is not None:
|
|
446
|
+
reader = self._job_event_readers.pop(workspace_path, None)
|
|
447
|
+
if reader is not None:
|
|
448
|
+
reader.stop_watching()
|
|
449
|
+
logger.debug("Stopped job event reader for %s", workspace_path)
|
|
450
|
+
else:
|
|
451
|
+
# Stop all readers
|
|
452
|
+
for path, reader in self._job_event_readers.items():
|
|
453
|
+
reader.stop_watching()
|
|
454
|
+
logger.debug("Stopped job event reader for %s", path)
|
|
455
|
+
self._job_event_readers.clear()
|
|
456
|
+
|
|
457
|
+
def _on_job_event(self, entity_id: str, event) -> None:
|
|
458
|
+
"""Handle job events from EventReader
|
|
459
|
+
|
|
460
|
+
Updates job state from file-based events and notifies listeners.
|
|
461
|
+
|
|
462
|
+
Args:
|
|
463
|
+
entity_id: The job ID
|
|
464
|
+
event: The event (JobProgressEvent or JobStateChangedEvent)
|
|
465
|
+
"""
|
|
466
|
+
job = self.jobs.get(entity_id)
|
|
467
|
+
if job is None:
|
|
468
|
+
logger.debug(
|
|
469
|
+
"Job event for unknown job %s",
|
|
470
|
+
entity_id,
|
|
471
|
+
)
|
|
472
|
+
return
|
|
473
|
+
logger.debug("Received event for job %s: %s", job, event)
|
|
474
|
+
|
|
475
|
+
if isinstance(event, JobProgressEvent):
|
|
476
|
+
# Update job's in-memory progress and notify legacy listeners
|
|
477
|
+
job.set_progress(event.level, event.progress, event.desc)
|
|
478
|
+
self.notify_job_state(job)
|
|
479
|
+
|
|
480
|
+
# Notify StateProvider-style listeners (TUI/WebUI)
|
|
481
|
+
state_event = JobStateChangedEvent(
|
|
482
|
+
job_id=job.identifier,
|
|
483
|
+
state=job.state.name.lower(),
|
|
484
|
+
)
|
|
485
|
+
self._notify_state_listeners_async(state_event)
|
|
486
|
+
|
|
487
|
+
def _cleanup_job_event_files(self, job: Job) -> None:
|
|
488
|
+
"""Clean up old job event files from previous runs
|
|
489
|
+
|
|
490
|
+
Removes event files at .events/jobs/{task_id}/event-{job_id}-*.jsonl
|
|
491
|
+
Called when a job is about to start to ensure clean state.
|
|
492
|
+
|
|
493
|
+
Args:
|
|
494
|
+
job: The job being started
|
|
495
|
+
"""
|
|
496
|
+
# Get the workspace path from the job's path
|
|
497
|
+
# job.path is workspace/jobs/task_id/job_id
|
|
498
|
+
workspace_path = job.path.parent.parent.parent
|
|
499
|
+
task_id = str(job.type.identifier)
|
|
500
|
+
job_id = job.identifier
|
|
501
|
+
|
|
502
|
+
events_dir = workspace_path / ".events" / "jobs" / task_id
|
|
503
|
+
if not events_dir.exists():
|
|
504
|
+
return
|
|
505
|
+
|
|
506
|
+
# Find and delete old event files for this job
|
|
507
|
+
pattern = f"event-{job_id}-*.jsonl"
|
|
508
|
+
for event_file in events_dir.glob(pattern):
|
|
509
|
+
try:
|
|
510
|
+
event_file.unlink()
|
|
511
|
+
logger.debug("Removed old job event file: %s", event_file)
|
|
512
|
+
except OSError as e:
|
|
513
|
+
logger.warning("Failed to remove job event file %s: %s", event_file, e)
|
|
514
|
+
|
|
293
515
|
def _notify_listeners(self, notification_func, job: Job):
|
|
294
516
|
"""Execute notification in thread pool with error isolation.
|
|
295
517
|
|
|
@@ -310,18 +532,103 @@ class Scheduler(threading.Thread):
|
|
|
310
532
|
|
|
311
533
|
self._notification_executor.submit(_do_notify)
|
|
312
534
|
|
|
535
|
+
def _notify_state_listeners_async(self, event):
|
|
536
|
+
"""Notify StateProvider-style listeners asynchronously with error isolation.
|
|
537
|
+
|
|
538
|
+
This runs notifications in the same thread pool as _notify_listeners
|
|
539
|
+
to avoid blocking the scheduler and isolate errors.
|
|
540
|
+
"""
|
|
541
|
+
|
|
542
|
+
def _do_notify():
|
|
543
|
+
# Get a snapshot of listeners with the lock
|
|
544
|
+
with self._state_listener_lock:
|
|
545
|
+
listeners_snapshot = list(self._state_listeners)
|
|
546
|
+
|
|
547
|
+
for listener in listeners_snapshot:
|
|
548
|
+
try:
|
|
549
|
+
listener(event)
|
|
550
|
+
except Exception:
|
|
551
|
+
logger.exception("Got an error with state listener %s", listener)
|
|
552
|
+
|
|
553
|
+
self._notification_executor.submit(_do_notify)
|
|
554
|
+
|
|
313
555
|
def notify_job_submitted(self, job: Job):
|
|
314
556
|
"""Notify the listeners that a job has been submitted"""
|
|
315
557
|
self._notify_listeners(lambda lst, j: lst.job_submitted(j), job)
|
|
316
558
|
|
|
559
|
+
# Also notify StateProvider-style listeners (for TUI etc.)
|
|
560
|
+
from experimaestro.scheduler.state_status import JobSubmittedEvent, JobTag
|
|
561
|
+
|
|
562
|
+
# Get experiment info from job's experiments list
|
|
563
|
+
for exp in job.experiments:
|
|
564
|
+
experiment_id = exp.experiment_id
|
|
565
|
+
run_id = exp.run_id
|
|
566
|
+
|
|
567
|
+
# Get tags and dependencies for this job
|
|
568
|
+
exp_run_key = (experiment_id, run_id)
|
|
569
|
+
tags_dict = self._tags_map.get(exp_run_key, {}).get(job.identifier, {})
|
|
570
|
+
tags = [JobTag(key=k, value=v) for k, v in tags_dict.items()]
|
|
571
|
+
depends_on = self._dependencies_map.get(exp_run_key, {}).get(
|
|
572
|
+
job.identifier, []
|
|
573
|
+
)
|
|
574
|
+
|
|
575
|
+
event = JobSubmittedEvent(
|
|
576
|
+
experiment_id=experiment_id,
|
|
577
|
+
run_id=run_id,
|
|
578
|
+
job_id=job.identifier,
|
|
579
|
+
tags=tags,
|
|
580
|
+
depends_on=depends_on,
|
|
581
|
+
)
|
|
582
|
+
self._notify_state_listeners_async(event)
|
|
583
|
+
|
|
317
584
|
def notify_job_state(self, job: Job):
|
|
318
|
-
"""Notify the listeners that a job has changed state
|
|
585
|
+
"""Notify the listeners that a job has changed state
|
|
586
|
+
|
|
587
|
+
Note: This does NOT write to job event files. Job events are written
|
|
588
|
+
by the job process itself. The scheduler only forwards notifications
|
|
589
|
+
to listeners.
|
|
590
|
+
"""
|
|
591
|
+
# Legacy listener notification (per-experiment)
|
|
319
592
|
self._notify_listeners(lambda lst, j: lst.job_state(j), job)
|
|
320
593
|
|
|
321
|
-
|
|
594
|
+
# Notify StateProvider-style listeners with experiment-independent event
|
|
595
|
+
from experimaestro.scheduler.state_status import JobStateChangedEvent
|
|
596
|
+
|
|
597
|
+
event = JobStateChangedEvent(
|
|
598
|
+
job_id=job.identifier,
|
|
599
|
+
state=job.state.name.lower(),
|
|
600
|
+
)
|
|
601
|
+
self._notify_state_listeners_async(event)
|
|
602
|
+
|
|
603
|
+
def notify_service_add(
|
|
604
|
+
self, service: Service, experiment_id: str = "", run_id: str = ""
|
|
605
|
+
):
|
|
322
606
|
"""Notify the listeners that a service has been added"""
|
|
323
607
|
self._notify_listeners(lambda lst, s: lst.service_add(s), service)
|
|
324
608
|
|
|
609
|
+
# Store experiment info on the service for later retrieval
|
|
610
|
+
if experiment_id:
|
|
611
|
+
service._experiment_id = experiment_id
|
|
612
|
+
service._run_id = run_id or ""
|
|
613
|
+
|
|
614
|
+
# Store service in scheduler's services dict (persists after experiment ends)
|
|
615
|
+
if experiment_id:
|
|
616
|
+
key = (experiment_id, run_id or "")
|
|
617
|
+
if key not in self.services:
|
|
618
|
+
self.services[key] = {}
|
|
619
|
+
self.services[key][service.id] = service
|
|
620
|
+
|
|
621
|
+
# Also notify StateProvider-style listeners (for TUI etc.)
|
|
622
|
+
from experimaestro.scheduler.state_status import ServiceAddedEvent
|
|
623
|
+
|
|
624
|
+
if experiment_id:
|
|
625
|
+
event = ServiceAddedEvent(
|
|
626
|
+
experiment_id=experiment_id,
|
|
627
|
+
run_id=run_id or "",
|
|
628
|
+
service_id=service.id,
|
|
629
|
+
)
|
|
630
|
+
self._notify_state_listeners_async(event)
|
|
631
|
+
|
|
325
632
|
async def aio_submit(self, job: Job) -> JobState:
|
|
326
633
|
"""Main scheduler function: submit a job, run it (if needed), and returns
|
|
327
634
|
the status code
|
|
@@ -341,7 +648,7 @@ class Scheduler(threading.Thread):
|
|
|
341
648
|
# Check that we don't have a completed job in
|
|
342
649
|
# alternate directories
|
|
343
650
|
for jobspath in experiment.current().alt_jobspaths:
|
|
344
|
-
#
|
|
651
|
+
# Future enhancement: check if done
|
|
345
652
|
pass
|
|
346
653
|
|
|
347
654
|
# Creates a link into the experiment folder
|
|
@@ -367,10 +674,6 @@ class Scheduler(threading.Thread):
|
|
|
367
674
|
job.set_state(JobState.RUNNING)
|
|
368
675
|
self.notify_job_state(job)
|
|
369
676
|
|
|
370
|
-
# Adds to the listeners
|
|
371
|
-
if self.server is not None:
|
|
372
|
-
job.add_notification_server(self.server)
|
|
373
|
-
|
|
374
677
|
# And now, we wait...
|
|
375
678
|
logger.info("Got a process for job %s - waiting to complete", job)
|
|
376
679
|
code = await process.aio_code()
|
|
@@ -413,19 +716,27 @@ class Scheduler(threading.Thread):
|
|
|
413
716
|
|
|
414
717
|
# If not done or running, start the job
|
|
415
718
|
if not job.state.finished():
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
719
|
+
# Check if this is a transient job that is not needed
|
|
720
|
+
if job.transient.is_transient and not job._needed_transient:
|
|
721
|
+
job.set_state(JobState.UNSCHEDULED)
|
|
722
|
+
|
|
723
|
+
# Start the job if not skipped (state is still WAITING)
|
|
724
|
+
if job.state == JobState.WAITING:
|
|
725
|
+
try:
|
|
726
|
+
state = await self.aio_start(job)
|
|
727
|
+
if state is not None:
|
|
728
|
+
job.endtime = time.time()
|
|
729
|
+
job.set_state(state)
|
|
730
|
+
except Exception:
|
|
731
|
+
logger.exception("Got an exception while starting the job")
|
|
732
|
+
raise
|
|
424
733
|
|
|
425
734
|
# Job is finished - experiment statistics already updated by set_state
|
|
426
735
|
|
|
427
736
|
# Write final metadata with end time and final state
|
|
428
|
-
|
|
737
|
+
# Only for jobs that actually started (starttime is set in aio_start)
|
|
738
|
+
if job.starttime is not None:
|
|
739
|
+
job.status_path.write_text(json.dumps(job.state_dict()))
|
|
429
740
|
|
|
430
741
|
if job in self.waitingjobs:
|
|
431
742
|
self.waitingjobs.remove(job)
|
|
@@ -457,7 +768,7 @@ class Scheduler(threading.Thread):
|
|
|
457
768
|
or process creation
|
|
458
769
|
"""
|
|
459
770
|
from experimaestro.scheduler.jobs import JobStateError
|
|
460
|
-
from experimaestro.locking import
|
|
771
|
+
from experimaestro.locking import DynamicDependencyLocks, LockError
|
|
461
772
|
from experimaestro.scheduler.jobs import JobFailureStatus
|
|
462
773
|
|
|
463
774
|
# Assert preconditions
|
|
@@ -489,7 +800,7 @@ class Scheduler(threading.Thread):
|
|
|
489
800
|
return JobStateError(JobFailureStatus.DEPENDENCY)
|
|
490
801
|
|
|
491
802
|
# We first lock the job before proceeding
|
|
492
|
-
with
|
|
803
|
+
with DynamicDependencyLocks() as locks:
|
|
493
804
|
logger.debug("[starting] Locking job %s", job)
|
|
494
805
|
async with job.launcher.connector.lock(job.lockpath):
|
|
495
806
|
logger.debug("[starting] Locked job %s", job)
|
|
@@ -562,12 +873,15 @@ class Scheduler(threading.Thread):
|
|
|
562
873
|
if not directory.is_dir():
|
|
563
874
|
directory.mkdir(parents=True, exist_ok=True)
|
|
564
875
|
|
|
876
|
+
# Clean up old job event files from previous runs
|
|
877
|
+
self._cleanup_job_event_files(job)
|
|
878
|
+
|
|
565
879
|
# Write metadata with submit and start time (after directory creation)
|
|
566
|
-
job.
|
|
880
|
+
job.status_path.parent.mkdir(parents=True, exist_ok=True)
|
|
881
|
+
job.status_path.write_text(json.dumps(job.state_dict()))
|
|
567
882
|
|
|
568
|
-
#
|
|
569
|
-
|
|
570
|
-
job.add_notification_server(self.server)
|
|
883
|
+
# Notify locks before job starts (e.g., create symlinks)
|
|
884
|
+
await locks.aio_job_before_start(job)
|
|
571
885
|
|
|
572
886
|
except Exception:
|
|
573
887
|
logger.warning("Error while locking job", exc_info=True)
|
|
@@ -576,6 +890,30 @@ class Scheduler(threading.Thread):
|
|
|
576
890
|
try:
|
|
577
891
|
# Runs the job
|
|
578
892
|
process = await job.aio_run()
|
|
893
|
+
|
|
894
|
+
# Notify locks that job has started
|
|
895
|
+
await locks.aio_job_started(job, process)
|
|
896
|
+
|
|
897
|
+
# Write locks.json for job process (if there are dynamic locks)
|
|
898
|
+
if locks.locks:
|
|
899
|
+
import tempfile
|
|
900
|
+
|
|
901
|
+
locks_path = job.path / "locks.json"
|
|
902
|
+
locks_data = {"dynamic_locks": locks.to_json()}
|
|
903
|
+
# Atomic write: write to temp file then rename
|
|
904
|
+
with tempfile.NamedTemporaryFile(
|
|
905
|
+
mode="w",
|
|
906
|
+
dir=job.path,
|
|
907
|
+
prefix=".locks.",
|
|
908
|
+
suffix=".json",
|
|
909
|
+
delete=False,
|
|
910
|
+
) as tmp:
|
|
911
|
+
json.dump(locks_data, tmp)
|
|
912
|
+
tmp_path = tmp.name
|
|
913
|
+
# Rename is atomic on POSIX
|
|
914
|
+
import os
|
|
915
|
+
|
|
916
|
+
os.rename(tmp_path, locks_path)
|
|
579
917
|
except Exception:
|
|
580
918
|
logger.warning("Error while starting job", exc_info=True)
|
|
581
919
|
return JobState.ERROR
|
|
@@ -633,6 +971,9 @@ class Scheduler(threading.Thread):
|
|
|
633
971
|
)
|
|
634
972
|
state = JobState.ERROR
|
|
635
973
|
|
|
974
|
+
# Notify locks that job has finished (before releasing)
|
|
975
|
+
await locks.aio_job_finished(job)
|
|
976
|
+
|
|
636
977
|
# Locks are released here after job completes
|
|
637
978
|
|
|
638
979
|
# Check if we should restart a resumable task that timed out
|
|
@@ -672,3 +1013,259 @@ class Scheduler(threading.Thread):
|
|
|
672
1013
|
# Notify scheduler listeners of job state after job completes
|
|
673
1014
|
self.notify_job_state(job)
|
|
674
1015
|
return state
|
|
1016
|
+
|
|
1017
|
+
# =========================================================================
|
|
1018
|
+
# StateProvider abstract method implementations
|
|
1019
|
+
# =========================================================================
|
|
1020
|
+
|
|
1021
|
+
def get_experiments(
    self,
    since: Optional[datetime] = None,  # noqa: ARG002
) -> List[BaseExperiment]:
    """Return every experiment currently registered with this scheduler.

    :param since: accepted for interface compatibility only; the live
        scheduler does not filter on it.
    :return: a new list holding the values of ``self.experiments``.
    """
    # The mapping is copied into a fresh list so callers cannot mutate
    # the scheduler's own registry through the return value.
    live_experiments = self.experiments.values()
    return [*live_experiments]
|
|
1028
|
+
|
|
1029
|
+
def get_experiment(self, experiment_id: str) -> Optional[BaseExperiment]:
    """Look up a live experiment by its identifier.

    :param experiment_id: key into ``self.experiments``.
    :return: the matching experiment, or ``None`` when the identifier
        is not registered.
    """
    registry = self.experiments
    return registry.get(experiment_id)
|
|
1032
|
+
|
|
1033
|
+
def get_experiment_runs(self, experiment_id: str) -> List[BaseExperiment]:
    """Get all runs for an experiment.

    For a live scheduler, the live experiment itself is the only run
    that can be returned.

    :param experiment_id: key into ``self.experiments``.
    :return: a one-element list with the live experiment, or an empty
        list when the identifier is not registered.
    """
    exp = self.experiments.get(experiment_id)
    # Compare against None explicitly: a registered experiment must be
    # returned even if the object happens to evaluate as falsy.
    if exp is None:
        return []

    return [exp]
|
|
1044
|
+
|
|
1045
|
+
def get_current_run(self, experiment_id: str) -> Optional[str]:
    """Get the current run ID for an experiment.

    :param experiment_id: key into ``self.experiments``.
    :return: the ``run_id`` attribute of the registered experiment, or
        ``None`` when the identifier is not registered.
    """
    exp = self.experiments.get(experiment_id)
    # Explicit None check so a registered-but-falsy experiment object
    # still reports its run id.
    if exp is None:
        return None
    return exp.run_id
|
|
1049
|
+
|
|
1050
|
+
def get_jobs(
    self,
    experiment_id: Optional[str] = None,
    run_id: Optional[str] = None,  # noqa: ARG002 - not used in live scheduler
    task_id: Optional[str] = None,
    state: Optional[str] = None,
    tags: Optional[Dict[str, str]] = None,
    since: Optional[datetime] = None,  # noqa: ARG002 - not used in live scheduler
) -> List[BaseJob]:
    """Query jobs with optional filters.

    Every filter that is provided (truthy) narrows the result; filters
    left empty are skipped.

    :param experiment_id: keep only jobs whose ``experiments`` contain the
        experiment registered under this identifier; an unknown
        identifier yields an empty list.
    :param run_id: ignored by the live scheduler.
    :param task_id: keep only jobs whose ``task_id`` equals this value.
    :param state: keep only jobs whose state name matches,
        case-insensitively.
    :param tags: keep only jobs whose tags match every given key/value.
    :param since: ignored by the live scheduler.
    :return: a new list of matching jobs.
    """
    jobs: List[BaseJob] = list(self.jobs.values())

    # Filter by experiment
    if experiment_id:
        exp = self.experiments.get(experiment_id)
        # Explicit None check: only a missing experiment empties the
        # result, and no further filtering is needed in that case.
        if exp is None:
            return []
        jobs = [j for j in jobs if j.experiments and exp in j.experiments]

    # Filter by task_id
    if task_id:
        jobs = [j for j in jobs if j.task_id == task_id]

    # Filter by state (normalise the requested name once, not per job)
    if state:
        wanted_state = state.lower()
        jobs = [j for j in jobs if j.state.name.lower() == wanted_state]

    # Filter by tags (all tags must match)
    if tags:
        jobs = [j for j in jobs if all(j.tags.get(k) == v for k, v in tags.items())]

    return jobs
|
|
1083
|
+
|
|
1084
|
+
def get_job(
    self,
    job_id: str,
    experiment_id: str,  # noqa: ARG002 - job_id is sufficient in live scheduler
    run_id: Optional[str] = None,  # noqa: ARG002 - job_id is sufficient in live scheduler
) -> Optional[BaseJob]:
    """Fetch a single job by its identifier.

    :param job_id: key into ``self.jobs``.
    :param experiment_id: ignored; the job identifier alone is enough here.
    :param run_id: ignored; the job identifier alone is enough here.
    :return: the matching job, or ``None`` when the identifier is unknown.
    """
    known_jobs = self.jobs
    return known_jobs.get(job_id)
|
|
1092
|
+
|
|
1093
|
+
def get_all_jobs(
    self,
    state: Optional[str] = None,
    tags: Optional[Dict[str, str]] = None,
    since: Optional[datetime] = None,  # noqa: ARG002 - not used in live scheduler
) -> List[BaseJob]:
    """Get all jobs across all experiments.

    :param state: when given, keep only jobs whose state name matches,
        case-insensitively.
    :param tags: when given, keep only jobs whose tags match every
        key/value.
    :param since: ignored by the live scheduler.
    :return: a new list of matching jobs.
    """
    jobs: List[BaseJob] = list(self.jobs.values())

    if state:
        # Normalise the requested state once instead of once per job
        wanted_state = state.lower()
        jobs = [j for j in jobs if j.state.name.lower() == wanted_state]

    if tags:
        jobs = [j for j in jobs if all(j.tags.get(k) == v for k, v in tags.items())]

    return jobs
|
|
1109
|
+
|
|
1110
|
+
def get_services(
    self,
    experiment_id: Optional[str] = None,
    run_id: Optional[str] = None,
) -> List[BaseService]:
    """Collect services, optionally restricted to an experiment and run.

    ``self.services`` maps ``(experiment_id, run_id)`` keys to a
    dictionary of services. Services are stored in the scheduler and
    persist after experiments finish.

    :param experiment_id: when ``None``, services of every experiment and
        run are returned.
    :param run_id: when given together with ``experiment_id``, only that
        run is considered; otherwise all runs of the experiment are.
    :return: a new list of services.
    """
    registry = self.services

    # No experiment requested: flatten everything that is registered
    if experiment_id is None:
        return [svc for per_run in registry.values() for svc in per_run.values()]

    if run_id is None:
        # Gather services from every run belonging to this experiment
        found = [
            svc
            for (owner_id, _run_id), per_run in registry.items()
            if owner_id == experiment_id
            for svc in per_run.values()
        ]
    else:
        # A specific run was requested
        found = list(registry.get((experiment_id, run_id), {}).values())

    logger.debug(
        "get_services(%s, %s): returning %d services",
        experiment_id,
        run_id,
        len(found),
    )
    return found
|
|
1146
|
+
|
|
1147
|
+
def get_tags_map(
    self,
    experiment_id: str,
    run_id: Optional[str] = None,
) -> dict[str, dict[str, str]]:
    """Get tags map for jobs in an experiment/run.

    :param experiment_id: key into ``self.experiments``.
    :param run_id: run to look up; defaults to the experiment's own
        ``run_id`` when omitted.
    :return: a map from job_id to ``{tag_key: tag_value}``; empty when the
        experiment is not registered or nothing is recorded for the run.
    """
    exp = self.experiments.get(experiment_id)
    # Explicit None check: a registered experiment must not be treated as
    # missing just because the object evaluates as falsy.
    if exp is None:
        return {}

    # Use current run if not specified
    if run_id is None:
        run_id = exp.run_id

    return self._tags_map.get((experiment_id, run_id), {})
|
|
1166
|
+
|
|
1167
|
+
def get_dependencies_map(
    self,
    experiment_id: str,
    run_id: Optional[str] = None,
) -> dict[str, list[str]]:
    """Get dependencies map for jobs in an experiment/run.

    :param experiment_id: key into ``self.experiments``.
    :param run_id: run to look up; defaults to the experiment's own
        ``run_id`` when omitted.
    :return: a map from job_id to the list of job_ids it depends on; empty
        when the experiment is not registered or nothing is recorded for
        the run.
    """
    exp = self.experiments.get(experiment_id)
    # Explicit None check: a registered experiment must not be treated as
    # missing just because the object evaluates as falsy.
    if exp is None:
        return {}

    # Use current run if not specified
    if run_id is None:
        run_id = exp.run_id

    return self._dependencies_map.get((experiment_id, run_id), {})
|
|
1186
|
+
|
|
1187
|
+
def kill_job(self, job: BaseJob, perform: bool = False) -> bool:
    """Kill a running job, or report whether it could be killed.

    For the scheduler, this is a live operation.

    :param job: job to kill; its ``identifier`` is used to find the
        scheduler's own job object in ``self.jobs``.
    :param perform: when ``False`` nothing is killed and the return value
        only tells whether the job is in the RUNNING state.
    :return: ``True`` when the job is killable (dry run) or its process
        was asked to die; ``False`` otherwise.
    """
    is_running = job.state == JobState.RUNNING

    # Dry run: only report whether a kill would be attempted
    if not perform:
        return is_running

    if not is_running:
        return False

    # Resolve the scheduler's own job object, then its process handle
    # (the attribute may be absent, hence getattr with a default)
    tracked = self.jobs.get(job.identifier)
    handle = None if tracked is None else getattr(tracked, "process", None)
    if handle is None:
        return False

    try:
        handle.kill()
    except Exception:
        logger.exception("Failed to kill job %s", job.identifier)
        return False
    return True
|
|
1213
|
+
|
|
1214
|
+
def clean_job(
    self,
    job: BaseJob,  # noqa: ARG002
    perform: bool = False,  # noqa: ARG002
) -> bool:
    """Clean a finished job (not supported by the live scheduler).

    Both arguments are accepted for interface compatibility and ignored.

    :return: always ``False``.
    """
    # Nothing is ever cleaned here, whatever the arguments are
    cleaned = False
    return cleaned
|
|
1225
|
+
|
|
1226
|
+
def get_process_info(self, job: BaseJob):
    """Get process information for a job.

    The scheduler's own job object is looked up through ``job.identifier``
    and its PID file (``pidpath``), expected to hold a JSON object, is
    read.

    :param job: job whose process information is requested.
    :return: a ``ProcessInfo`` built from the ``pid`` and ``type`` entries
        of the PID file (``type`` defaults to ``"unknown"``), with
        ``running`` reflecting whether the job state is RUNNING; ``None``
        when the job is unknown, the PID file is missing, it holds no
        ``pid``, or reading it fails.
    """
    from experimaestro.scheduler.state_provider import ProcessInfo

    # Get the actual Job from our jobs dict
    actual_job = self.jobs.get(job.identifier)
    if actual_job is None:
        return None

    # Try to read the PID file
    try:
        pidpath = getattr(actual_job, "pidpath", None)
        if pidpath is None or not pidpath.exists():
            return None

        pinfo = json.loads(pidpath.read_text())
        pid = pinfo.get("pid")
        if pid is None:
            return None

        return ProcessInfo(
            pid=pid,
            type=pinfo.get("type", "unknown"),
            # Running status is derived from the job state, not the OS
            running=actual_job.state == JobState.RUNNING,
        )
    except Exception:
        # Best effort: callers still get None, but the reason is no
        # longer lost silently.
        logger.debug(
            "Could not read process information for job %s",
            job.identifier,
            exc_info=True,
        )
        return None
|
|
1257
|
+
|
|
1258
|
+
def close(self) -> None:
    """Shut down this state provider by stopping the job event readers."""
    stop_readers = self._stop_job_event_reader
    stop_readers()
|
|
1262
|
+
|
|
1263
|
+
@property
def read_only(self) -> bool:
    """Whether this provider is read-only; the live scheduler is read-write.

    :return: always ``False``.
    """
    return False
|
|
1267
|
+
|
|
1268
|
+
@property
def is_remote(self) -> bool:
    """Whether this provider is remote; the live scheduler is local.

    :return: always ``False``.
    """
    return False
|