experimaestro 2.0.0b8__py3-none-any.whl → 2.0.0b17__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Potentially problematic release.
- experimaestro/__init__.py +12 -5
- experimaestro/cli/__init__.py +239 -126
- experimaestro/cli/filter.py +48 -23
- experimaestro/cli/jobs.py +253 -71
- experimaestro/cli/refactor.py +1 -2
- experimaestro/commandline.py +7 -4
- experimaestro/connectors/__init__.py +9 -1
- experimaestro/connectors/local.py +43 -3
- experimaestro/core/arguments.py +18 -18
- experimaestro/core/identifier.py +11 -11
- experimaestro/core/objects/config.py +96 -39
- experimaestro/core/objects/config_walk.py +3 -3
- experimaestro/core/{subparameters.py → partial.py} +16 -16
- experimaestro/core/partial_lock.py +394 -0
- experimaestro/core/types.py +12 -15
- experimaestro/dynamic.py +290 -0
- experimaestro/experiments/__init__.py +6 -2
- experimaestro/experiments/cli.py +217 -50
- experimaestro/experiments/configuration.py +24 -0
- experimaestro/generators.py +5 -5
- experimaestro/ipc.py +118 -1
- experimaestro/launcherfinder/__init__.py +2 -2
- experimaestro/launcherfinder/registry.py +6 -7
- experimaestro/launcherfinder/specs.py +2 -9
- experimaestro/launchers/slurm/__init__.py +2 -2
- experimaestro/launchers/slurm/base.py +62 -0
- experimaestro/locking.py +957 -1
- experimaestro/notifications.py +89 -201
- experimaestro/progress.py +63 -366
- experimaestro/rpyc.py +0 -2
- experimaestro/run.py +29 -2
- experimaestro/scheduler/__init__.py +8 -1
- experimaestro/scheduler/base.py +629 -53
- experimaestro/scheduler/dependencies.py +20 -16
- experimaestro/scheduler/experiment.py +732 -167
- experimaestro/scheduler/interfaces.py +316 -101
- experimaestro/scheduler/jobs.py +58 -20
- experimaestro/scheduler/remote/adaptive_sync.py +265 -0
- experimaestro/scheduler/remote/client.py +171 -117
- experimaestro/scheduler/remote/protocol.py +8 -193
- experimaestro/scheduler/remote/server.py +95 -71
- experimaestro/scheduler/services.py +53 -28
- experimaestro/scheduler/state_provider.py +663 -2430
- experimaestro/scheduler/state_status.py +1247 -0
- experimaestro/scheduler/transient.py +31 -0
- experimaestro/scheduler/workspace.py +1 -1
- experimaestro/scheduler/workspace_state_provider.py +1273 -0
- experimaestro/scriptbuilder.py +4 -4
- experimaestro/settings.py +36 -0
- experimaestro/tests/conftest.py +33 -5
- experimaestro/tests/connectors/bin/executable.py +1 -1
- experimaestro/tests/fixtures/pre_experiment/experiment_check_env.py +16 -0
- experimaestro/tests/fixtures/pre_experiment/experiment_check_mock.py +14 -0
- experimaestro/tests/fixtures/pre_experiment/experiment_simple.py +12 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_env.py +5 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_error.py +3 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_mock.py +8 -0
- experimaestro/tests/launchers/bin/test.py +1 -0
- experimaestro/tests/launchers/test_slurm.py +9 -9
- experimaestro/tests/partial_reschedule.py +46 -0
- experimaestro/tests/restart.py +3 -3
- experimaestro/tests/restart_main.py +1 -0
- experimaestro/tests/scripts/notifyandwait.py +1 -0
- experimaestro/tests/task_partial.py +38 -0
- experimaestro/tests/task_tokens.py +2 -2
- experimaestro/tests/tasks/test_dynamic.py +6 -6
- experimaestro/tests/test_dependencies.py +3 -3
- experimaestro/tests/test_deprecated.py +15 -15
- experimaestro/tests/test_dynamic_locking.py +317 -0
- experimaestro/tests/test_environment.py +24 -14
- experimaestro/tests/test_experiment.py +171 -36
- experimaestro/tests/test_identifier.py +25 -25
- experimaestro/tests/test_identifier_stability.py +3 -5
- experimaestro/tests/test_multitoken.py +2 -4
- experimaestro/tests/{test_subparameters.py → test_partial.py} +25 -25
- experimaestro/tests/test_partial_paths.py +81 -138
- experimaestro/tests/test_pre_experiment.py +219 -0
- experimaestro/tests/test_progress.py +2 -8
- experimaestro/tests/test_remote_state.py +560 -99
- experimaestro/tests/test_stray_jobs.py +261 -0
- experimaestro/tests/test_tasks.py +1 -2
- experimaestro/tests/test_token_locking.py +52 -67
- experimaestro/tests/test_tokens.py +5 -6
- experimaestro/tests/test_transient.py +225 -0
- experimaestro/tests/test_workspace_state_provider.py +768 -0
- experimaestro/tests/token_reschedule.py +1 -3
- experimaestro/tests/utils.py +2 -7
- experimaestro/tokens.py +227 -372
- experimaestro/tools/diff.py +1 -0
- experimaestro/tools/documentation.py +4 -5
- experimaestro/tools/jobs.py +1 -2
- experimaestro/tui/app.py +438 -1966
- experimaestro/tui/app.tcss +162 -0
- experimaestro/tui/dialogs.py +172 -0
- experimaestro/tui/log_viewer.py +253 -3
- experimaestro/tui/messages.py +137 -0
- experimaestro/tui/utils.py +54 -0
- experimaestro/tui/widgets/__init__.py +23 -0
- experimaestro/tui/widgets/experiments.py +468 -0
- experimaestro/tui/widgets/global_services.py +238 -0
- experimaestro/tui/widgets/jobs.py +972 -0
- experimaestro/tui/widgets/log.py +156 -0
- experimaestro/tui/widgets/orphans.py +363 -0
- experimaestro/tui/widgets/runs.py +185 -0
- experimaestro/tui/widgets/services.py +314 -0
- experimaestro/tui/widgets/stray_jobs.py +528 -0
- experimaestro/utils/__init__.py +1 -1
- experimaestro/utils/environment.py +105 -22
- experimaestro/utils/fswatcher.py +124 -0
- experimaestro/utils/jobs.py +1 -2
- experimaestro/utils/jupyter.py +1 -2
- experimaestro/utils/logging.py +72 -0
- experimaestro/version.py +2 -2
- experimaestro/webui/__init__.py +9 -0
- experimaestro/webui/app.py +117 -0
- experimaestro/{server → webui}/data/index.css +66 -11
- experimaestro/webui/data/index.css.map +1 -0
- experimaestro/{server → webui}/data/index.js +82763 -87217
- experimaestro/webui/data/index.js.map +1 -0
- experimaestro/webui/routes/__init__.py +5 -0
- experimaestro/webui/routes/auth.py +53 -0
- experimaestro/webui/routes/proxy.py +117 -0
- experimaestro/webui/server.py +200 -0
- experimaestro/webui/state_bridge.py +152 -0
- experimaestro/webui/websocket.py +413 -0
- {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/METADATA +5 -6
- experimaestro-2.0.0b17.dist-info/RECORD +219 -0
- experimaestro/cli/progress.py +0 -269
- experimaestro/scheduler/state.py +0 -75
- experimaestro/scheduler/state_db.py +0 -437
- experimaestro/scheduler/state_sync.py +0 -891
- experimaestro/server/__init__.py +0 -467
- experimaestro/server/data/index.css.map +0 -1
- experimaestro/server/data/index.js.map +0 -1
- experimaestro/tests/test_cli_jobs.py +0 -615
- experimaestro/tests/test_file_progress.py +0 -425
- experimaestro/tests/test_file_progress_integration.py +0 -477
- experimaestro/tests/test_state_db.py +0 -434
- experimaestro-2.0.0b8.dist-info/RECORD +0 -187
- /experimaestro/{server → webui}/data/1815e00441357e01619e.ttf +0 -0
- /experimaestro/{server → webui}/data/2463b90d9a316e4e5294.woff2 +0 -0
- /experimaestro/{server → webui}/data/2582b0e4bcf85eceead0.ttf +0 -0
- /experimaestro/{server → webui}/data/89999bdf5d835c012025.woff2 +0 -0
- /experimaestro/{server → webui}/data/914997e1bdfc990d0897.ttf +0 -0
- /experimaestro/{server → webui}/data/c210719e60948b211a12.woff2 +0 -0
- /experimaestro/{server → webui}/data/favicon.ico +0 -0
- /experimaestro/{server → webui}/data/index.html +0 -0
- /experimaestro/{server → webui}/data/login.html +0 -0
- /experimaestro/{server → webui}/data/manifest.json +0 -0
- {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/WHEEL +0 -0
- {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/entry_points.txt +0 -0
- {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/licenses/LICENSE +0 -0
experimaestro/scheduler/base.py
CHANGED
@@ -1,17 +1,34 @@
+import json
 import threading
 import time
+from datetime import datetime
+from pathlib import Path
 from typing import (
+    Dict,
+    List,
     Optional,
     Set,
     ClassVar,
     TYPE_CHECKING,
 )
 import asyncio
-from typing import Dict
 
 from experimaestro.scheduler import experiment
-from experimaestro.scheduler.jobs import Job, JobState, JobError
+from experimaestro.scheduler.jobs import Job, JobState, JobError, JobDependency
 from experimaestro.scheduler.services import Service
+from experimaestro.scheduler.interfaces import (
+    BaseJob,
+    BaseExperiment,
+    BaseService,
+)
+from experimaestro.scheduler.state_provider import StateProvider
+from experimaestro.scheduler.state_status import (
+    EventReader,
+    JobProgressEvent,
+    JobStateChangedEvent,
+    WatchedDirectory,
+    job_entity_id_extractor,
+)
 
 
 from experimaestro.utils import logger
@@ -19,7 +36,7 @@ from experimaestro.utils.asyncio import asyncThreadcheck
 import concurrent.futures
 
 if TYPE_CHECKING:
-    from experimaestro.
+    from experimaestro.webui import WebUIServer
     from experimaestro.settings import ServerSettings
     from experimaestro.scheduler.workspace import Workspace
 
@@ -36,18 +53,25 @@ class Listener:
         pass
 
 
-class Scheduler(threading.Thread):
-    """A job scheduler (singleton)
+class Scheduler(StateProvider, threading.Thread):
+    """A job scheduler (singleton) that provides live state
 
     The scheduler is based on asyncio for easy concurrency handling.
     This is a singleton - only one scheduler instance exists per process.
+
+    Inherits from StateProvider to allow TUI/Web interfaces to access
+    live job and experiment state during experiment execution.
     """
 
     _instance: ClassVar[Optional["Scheduler"]] = None
     _lock: ClassVar[threading.Lock] = threading.Lock()
 
+    #: Scheduler is always live
+    is_live: bool = True
+
     def __init__(self, name: str = "Global"):
-
+        StateProvider.__init__(self)  # Initialize state listener management
+        threading.Thread.__init__(self, name=f"Scheduler ({name})", daemon=True)
         self._ready = threading.Event()
 
         # Name of the scheduler
@@ -62,10 +86,19 @@ class Scheduler(threading.Thread):
         # List of all jobs
         self.jobs: Dict[str, "Job"] = {}
 
+        # Services: (experiment_id, run_id) -> {service_id -> Service}
+        self.services: Dict[tuple[str, str], Dict[str, Service]] = {}
+
+        # Tags map: (experiment_id, run_id) -> {job_id -> {tag_key: tag_value}}
+        self._tags_map: dict[tuple[str, str], dict[str, dict[str, str]]] = {}
+
+        # Dependencies map: (experiment_id, run_id) -> {job_id -> [depends_on_job_ids]}
+        self._dependencies_map: dict[tuple[str, str], dict[str, list[str]]] = {}
+
         # List of jobs
         self.waitingjobs: Set[Job] = set()
 
-        #
+        # Legacy listeners with thread-safe access
         self._listeners: Set[Listener] = set()
         self._listeners_lock = threading.Lock()
 
@@ -75,7 +108,12 @@ class Scheduler(threading.Thread):
         )
 
         # Server (managed by scheduler)
-        self.server: Optional["
+        self.server: Optional["WebUIServer"] = None
+
+        # Job event readers per workspace
+        # Uses EventReader to watch .events/jobs/ directory
+        self._job_event_readers: Dict[Path, EventReader] = {}
+        self._job_event_readers_lock = threading.Lock()
 
     @staticmethod
     def has_instance() -> bool:
@@ -114,51 +152,58 @@ class Scheduler(threading.Thread):
 
     def register_experiment(self, xp: "experiment"):
         """Register an experiment with the scheduler"""
-        # Use experiment name as key
-        key = xp.
+        # Use experiment name as key (not workdir.name which is now run_id)
+        key = xp.name
         self.experiments[key] = xp
 
+        # Start watching job events for this workspace
+        self._start_job_event_reader(xp.workspace.path)
+
         logger.debug("Registered experiment %s with scheduler", key)
 
     def unregister_experiment(self, xp: "experiment"):
         """Unregister an experiment from the scheduler"""
-        key = xp.
+        key = xp.name
         if key in self.experiments:
             del self.experiments[key]
             logger.debug("Unregistered experiment %s from scheduler", key)
 
     def start_server(
-        self,
+        self,
+        settings: "ServerSettings" = None,
+        workspace: "Workspace" = None,  # noqa: ARG002 - kept for backward compat
+        wait_for_quit: bool = False,
     ):
-        """Start the
+        """Start the web server (if not already running)
 
         Args:
             settings: Server settings
-            workspace: Workspace instance (
+            workspace: Workspace instance (deprecated, not used)
+            wait_for_quit: If True, server waits for explicit quit from web UI
         """
         if self.server is None:
-            from experimaestro.
-            from experimaestro.scheduler.state_provider import WorkspaceStateProvider
+            from experimaestro.webui import WebUIServer
 
-
-
-
-            # Get the workspace state provider singleton
-            state_provider = WorkspaceStateProvider.get_instance(
-                workspace.path, read_only=False, sync_on_start=False
-            )
-
-            self.server = Server.instance(settings, state_provider)
+            # Use the Scheduler itself as the StateProvider for live state access
+            self.server = WebUIServer.instance(settings, self, wait_for_quit)
             self.server.start()
-            logger.info("
+            logger.info("Web server started by scheduler")
         else:
-            logger.debug("
+            logger.debug("Web server already running")
 
     def stop_server(self):
-        """Stop the
+        """Stop the web server"""
         if self.server is not None:
             self.server.stop()
-            logger.info("
+            logger.info("Web server stopped by scheduler")
+
+    def wait_for_server_quit(self):
+        """Wait for explicit quit from web interface
+
+        Only blocks if server was started with wait_for_quit=True.
+        """
+        if self.server is not None:
+            self.server.wait()
 
     def run(self):
         """Run the event loop forever"""
@@ -277,12 +322,35 @@ class Scheduler(threading.Thread):
             xp = experiment.current()
             xp.add_job(other)
 
+            # Merge transient modes: more conservative mode wins
+            # NONE(0) > TRANSIENT(1) > REMOVE(2) - lower value wins
+            was_transient = other.transient.is_transient
+            if job.transient < other.transient:
+                other.transient = job.transient
+                # If job was transient and is now non-transient, mark it as needed
+                # This flag tells aio_submit not to skip the job
+                if was_transient and not other.transient.is_transient:
+                    other._needed_transient = True
+
             # Copy watched outputs from new job to existing job
             # This ensures new callbacks are registered even for resubmitted jobs
             other.watched_outputs.extend(job.watched_outputs)
 
+            # Check if job needs to be re-started
+            need_restart = False
             if other.state.is_error():
-                logger.info("Re-submitting job")
+                logger.info("Re-submitting job (was in error state)")
+                need_restart = True
+            elif (
+                was_transient
+                and not other.transient.is_transient
+                and other.state == JobState.UNSCHEDULED
+            ):
+                # Job was transient and skipped, but now is non-transient - restart it
+                logger.info("Re-submitting job (was transient, now non-transient)")
+                need_restart = True
+
+            if need_restart:
                 # Clean up old process info so it will be re-started
                 other._process = None
                 if other.pidpath.is_file():
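The transient-mode merge above relies on the ordering spelled out in its comment (NONE(0), TRANSIENT(1), REMOVE(2), lower value wins), so `job.transient < other.transient` picks the more conservative mode. A minimal sketch of that rule, assuming a TransientMode-like ordered enum with an `is_transient` property; the enum name, member names and the helper below are illustrative, not taken from the package:

    from enum import IntEnum

    class TransientMode(IntEnum):
        # Hypothetical mirror of the ordering used in the hunk above:
        # lower value = more conservative, and it wins when jobs are merged.
        NONE = 0
        TRANSIENT = 1
        REMOVE = 2

        @property
        def is_transient(self) -> bool:
            return self is not TransientMode.NONE

    def merge_transient(existing: TransientMode, resubmitted: TransientMode) -> TransientMode:
        # Same comparison as `if job.transient < other.transient` in the scheduler
        return min(existing, resubmitted)

    assert merge_transient(TransientMode.TRANSIENT, TransientMode.NONE) is TransientMode.NONE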
@@ -290,6 +358,7 @@ class Scheduler(threading.Thread):
                 # Use set_state to handle experiment statistics updates
                 other.set_state(JobState.WAITING)
                 self.notify_job_state(other)  # Notify listeners of re-submit
+                # The calling aio_submit will continue with this job and start it
             else:
                 logger.warning("Job %s already submitted", job.identifier)
 
@@ -304,13 +373,145 @@ class Scheduler(threading.Thread):
         job.submittime = time.time()
         xp.add_job(job)
 
+        # Update tags map for this experiment/run
+        if job.tags:
+            exp_run_key = (xp.name, xp.run_id)
+            if exp_run_key not in self._tags_map:
+                self._tags_map[exp_run_key] = {}
+            self._tags_map[exp_run_key][job.identifier] = dict(job.tags)
+
+        # Update dependencies map for this experiment/run
+        exp_run_key = (xp.name, xp.run_id)
+        if exp_run_key not in self._dependencies_map:
+            self._dependencies_map[exp_run_key] = {}
+        depends_on_ids = [
+            dep.origin.identifier
+            for dep in job.dependencies
+            if isinstance(dep, JobDependency)
+        ]
+        if depends_on_ids:
+            self._dependencies_map[exp_run_key][job.identifier] = depends_on_ids
+
         # Set up dependencies
         for dependency in job.dependencies:
             dependency.target = job
-
+            # Some dependencies (like PartialDependency) don't have an origin resource
+            if dependency.origin is not None:
+                dependency.origin.dependents.add(dependency)
 
         return None
 
+    def _start_job_event_reader(self, workspace_path: Path) -> None:
+        """Start watching job events in a workspace
+
+        Uses EventReader to watch .events/jobs/ for job progress events.
+        Job state events are emitted by the job process itself.
+        Only starts one reader per workspace.
+
+        Args:
+            workspace_path: Path to the workspace directory
+        """
+        with self._job_event_readers_lock:
+            # Already watching this workspace
+            if workspace_path in self._job_event_readers:
+                return
+
+            jobs_dir = workspace_path / ".events" / "jobs"
+
+            # Create new reader for this workspace
+            reader = EventReader(
+                [
+                    WatchedDirectory(
+                        path=jobs_dir,
+                        glob_pattern="*/event-*-*.jsonl",
+                        entity_id_extractor=job_entity_id_extractor,
+                    )
+                ]
+            )
+            reader.start_watching(
+                on_event=self._on_job_event,
+            )
+            self._job_event_readers[workspace_path] = reader
+            logger.debug("Started job event reader for %s", jobs_dir)
+
+    def _stop_job_event_reader(self, workspace_path: Optional[Path] = None) -> None:
+        """Stop watching job events
+
+        Args:
+            workspace_path: If provided, stop only this workspace's reader.
+                If None, stop all readers.
+        """
+        with self._job_event_readers_lock:
+            if workspace_path is not None:
+                reader = self._job_event_readers.pop(workspace_path, None)
+                if reader is not None:
+                    reader.stop_watching()
+                    logger.debug("Stopped job event reader for %s", workspace_path)
+            else:
+                # Stop all readers
+                for path, reader in self._job_event_readers.items():
+                    reader.stop_watching()
+                    logger.debug("Stopped job event reader for %s", path)
+                self._job_event_readers.clear()
+
+    def _on_job_event(self, entity_id: str, event) -> None:
+        """Handle job events from EventReader
+
+        Updates job state from file-based events and notifies listeners.
+
+        Args:
+            entity_id: The job ID
+            event: The event (JobProgressEvent or JobStateChangedEvent)
+        """
+        job = self.jobs.get(entity_id)
+        if job is None:
+            logger.debug(
+                "Job event for unknown job %s",
+                entity_id,
+            )
+            return
+        logger.debug("Received event for job %s: %s", job, event)
+
+        if isinstance(event, JobProgressEvent):
+            # Update job's in-memory progress and notify legacy listeners
+            job.set_progress(event.level, event.progress, event.desc)
+            self.notify_job_state(job)
+
+            # Notify StateProvider-style listeners (TUI/WebUI)
+            state_event = JobStateChangedEvent(
+                job_id=job.identifier,
+                state=job.state.name.lower(),
+            )
+            self._notify_state_listeners_async(state_event)
+
+    def _cleanup_job_event_files(self, job: Job) -> None:
+        """Clean up old job event files from previous runs
+
+        Removes event files at .events/jobs/{task_id}/event-{job_id}-*.jsonl
+        Called when a job is about to start to ensure clean state.
+
+        Args:
+            job: The job being started
+        """
+        # Get the workspace path from the job's path
+        # job.path is workspace/jobs/task_id/job_id
+        workspace_path = job.path.parent.parent.parent
+        task_id = str(job.type.identifier)
+        job_id = job.identifier
+
+        events_dir = workspace_path / ".events" / "jobs" / task_id
+        if not events_dir.exists():
+            return
+
+        # Find and delete old event files for this job
+        pattern = f"event-{job_id}-*.jsonl"
+        for event_file in events_dir.glob(pattern):
+            try:
+                event_file.unlink()
+                logger.debug("Removed old job event file: %s", event_file)
+            except OSError as e:
+                logger.warning("Failed to remove job event file %s: %s", event_file, e)
+
     def _notify_listeners(self, notification_func, job: Job):
         """Execute notification in thread pool with error isolation.
 
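The event-reader hunk above pins down the on-disk layout for job events: {workspace}/.events/jobs/{task_id}/event-{job_id}-*.jsonl, watched with the glob `*/event-*-*.jsonl`, while `_cleanup_job_event_files` derives the directory from `job.path` (workspace/jobs/task_id/job_id). A small sketch of that path arithmetic, using only what the hunk states; the helper names are illustrative, not part of the package:

    from pathlib import Path

    def job_events_dir(job_path: Path) -> Path:
        # job_path is workspace/jobs/<task_id>/<job_id>, so the workspace is
        # three levels up and the task id is the parent directory name
        workspace = job_path.parent.parent.parent
        task_id = job_path.parent.name
        return workspace / ".events" / "jobs" / task_id

    def job_event_files(job_path: Path):
        # Same pattern the scheduler cleans up before a job restarts
        job_id = job_path.name
        return sorted(job_events_dir(job_path).glob(f"event-{job_id}-*.jsonl"))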
@@ -331,18 +532,103 @@ class Scheduler(threading.Thread):
 
         self._notification_executor.submit(_do_notify)
 
+    def _notify_state_listeners_async(self, event):
+        """Notify StateProvider-style listeners asynchronously with error isolation.
+
+        This runs notifications in the same thread pool as _notify_listeners
+        to avoid blocking the scheduler and isolate errors.
+        """
+
+        def _do_notify():
+            # Get a snapshot of listeners with the lock
+            with self._state_listener_lock:
+                listeners_snapshot = list(self._state_listeners)
+
+            for listener in listeners_snapshot:
+                try:
+                    listener(event)
+                except Exception:
+                    logger.exception("Got an error with state listener %s", listener)
+
+        self._notification_executor.submit(_do_notify)
+
     def notify_job_submitted(self, job: Job):
         """Notify the listeners that a job has been submitted"""
         self._notify_listeners(lambda lst, j: lst.job_submitted(j), job)
 
+        # Also notify StateProvider-style listeners (for TUI etc.)
+        from experimaestro.scheduler.state_status import JobSubmittedEvent, JobTag
+
+        # Get experiment info from job's experiments list
+        for exp in job.experiments:
+            experiment_id = exp.experiment_id
+            run_id = exp.run_id
+
+            # Get tags and dependencies for this job
+            exp_run_key = (experiment_id, run_id)
+            tags_dict = self._tags_map.get(exp_run_key, {}).get(job.identifier, {})
+            tags = [JobTag(key=k, value=v) for k, v in tags_dict.items()]
+            depends_on = self._dependencies_map.get(exp_run_key, {}).get(
+                job.identifier, []
+            )
+
+            event = JobSubmittedEvent(
+                experiment_id=experiment_id,
+                run_id=run_id,
+                job_id=job.identifier,
+                tags=tags,
+                depends_on=depends_on,
+            )
+            self._notify_state_listeners_async(event)
+
     def notify_job_state(self, job: Job):
-        """Notify the listeners that a job has changed state
+        """Notify the listeners that a job has changed state
+
+        Note: This does NOT write to job event files. Job events are written
+        by the job process itself. The scheduler only forwards notifications
+        to listeners.
+        """
+        # Legacy listener notification (per-experiment)
         self._notify_listeners(lambda lst, j: lst.job_state(j), job)
 
-
+        # Notify StateProvider-style listeners with experiment-independent event
+        from experimaestro.scheduler.state_status import JobStateChangedEvent
+
+        event = JobStateChangedEvent(
+            job_id=job.identifier,
+            state=job.state.name.lower(),
+        )
+        self._notify_state_listeners_async(event)
+
+    def notify_service_add(
+        self, service: Service, experiment_id: str = "", run_id: str = ""
+    ):
         """Notify the listeners that a service has been added"""
         self._notify_listeners(lambda lst, s: lst.service_add(s), service)
 
+        # Store experiment info on the service for later retrieval
+        if experiment_id:
+            service._experiment_id = experiment_id
+            service._run_id = run_id or ""
+
+        # Store service in scheduler's services dict (persists after experiment ends)
+        if experiment_id:
+            key = (experiment_id, run_id or "")
+            if key not in self.services:
+                self.services[key] = {}
+            self.services[key][service.id] = service
+
+        # Also notify StateProvider-style listeners (for TUI etc.)
+        from experimaestro.scheduler.state_status import ServiceAddedEvent
+
+        if experiment_id:
+            event = ServiceAddedEvent(
+                experiment_id=experiment_id,
+                run_id=run_id or "",
+                service_id=service.id,
+            )
+            self._notify_state_listeners_async(event)
+
     async def aio_submit(self, job: Job) -> JobState:
         """Main scheduler function: submit a job, run it (if needed), and returns
         the status code
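The scheduler now fans every notification out twice: to the legacy Listener objects and to StateProvider-style listeners, which are plain callables invoked as `listener(event)` from the notification thread pool with per-listener error isolation. A minimal consumer sketch using only the event fields visible above; how a listener is registered is not shown in this diff, so the registration call is left as an assumption:

    from experimaestro.scheduler.state_status import JobStateChangedEvent, JobSubmittedEvent

    def on_state_event(event) -> None:
        # Runs on the scheduler's notification executor: keep it quick, and note
        # that exceptions are caught and logged per listener, not propagated.
        if isinstance(event, JobSubmittedEvent):
            print(f"submitted {event.job_id} in {event.experiment_id}/{event.run_id}")
        elif isinstance(event, JobStateChangedEvent):
            print(f"{event.job_id} -> {event.state}")

    # Hypothetical registration - the scheduler keeps callables in
    # self._state_listeners guarded by self._state_listener_lock:
    # scheduler.add_state_listener(on_state_event)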
@@ -362,7 +648,7 @@ class Scheduler(threading.Thread):
         # Check that we don't have a completed job in
         # alternate directories
         for jobspath in experiment.current().alt_jobspaths:
-            #
+            # Future enhancement: check if done
             pass
 
         # Creates a link into the experiment folder
@@ -388,10 +674,6 @@ class Scheduler(threading.Thread):
             job.set_state(JobState.RUNNING)
             self.notify_job_state(job)
 
-            # Adds to the listeners
-            if self.server is not None:
-                job.add_notification_server(self.server)
-
             # And now, we wait...
             logger.info("Got a process for job %s - waiting to complete", job)
             code = await process.aio_code()
@@ -434,19 +716,27 @@ class Scheduler(threading.Thread):
 
         # If not done or running, start the job
         if not job.state.finished():
-
-
-
-
-
-
-
-
+            # Check if this is a transient job that is not needed
+            if job.transient.is_transient and not job._needed_transient:
+                job.set_state(JobState.UNSCHEDULED)
+
+            # Start the job if not skipped (state is still WAITING)
+            if job.state == JobState.WAITING:
+                try:
+                    state = await self.aio_start(job)
+                    if state is not None:
+                        job.endtime = time.time()
+                        job.set_state(state)
+                except Exception:
+                    logger.exception("Got an exception while starting the job")
+                    raise
 
         # Job is finished - experiment statistics already updated by set_state
 
         # Write final metadata with end time and final state
-
+        # Only for jobs that actually started (starttime is set in aio_start)
+        if job.starttime is not None:
+            job.status_path.write_text(json.dumps(job.state_dict()))
 
         if job in self.waitingjobs:
             self.waitingjobs.remove(job)
@@ -478,7 +768,7 @@ class Scheduler(threading.Thread):
         or process creation
         """
         from experimaestro.scheduler.jobs import JobStateError
-        from experimaestro.locking import
+        from experimaestro.locking import DynamicDependencyLocks, LockError
         from experimaestro.scheduler.jobs import JobFailureStatus
 
         # Assert preconditions
@@ -510,7 +800,7 @@ class Scheduler(threading.Thread):
             return JobStateError(JobFailureStatus.DEPENDENCY)
 
         # We first lock the job before proceeding
-        with
+        with DynamicDependencyLocks() as locks:
             logger.debug("[starting] Locking job %s", job)
             async with job.launcher.connector.lock(job.lockpath):
                 logger.debug("[starting] Locked job %s", job)
@@ -583,12 +873,15 @@ class Scheduler(threading.Thread):
                 if not directory.is_dir():
                     directory.mkdir(parents=True, exist_ok=True)
 
+                # Clean up old job event files from previous runs
+                self._cleanup_job_event_files(job)
+
                 # Write metadata with submit and start time (after directory creation)
-                job.
+                job.status_path.parent.mkdir(parents=True, exist_ok=True)
+                job.status_path.write_text(json.dumps(job.state_dict()))
 
-                #
-
-                job.add_notification_server(self.server)
+                # Notify locks before job starts (e.g., create symlinks)
+                await locks.aio_job_before_start(job)
 
             except Exception:
                 logger.warning("Error while locking job", exc_info=True)
@@ -597,6 +890,30 @@ class Scheduler(threading.Thread):
            try:
                # Runs the job
                process = await job.aio_run()
+
+                # Notify locks that job has started
+                await locks.aio_job_started(job, process)
+
+                # Write locks.json for job process (if there are dynamic locks)
+                if locks.locks:
+                    import tempfile
+
+                    locks_path = job.path / "locks.json"
+                    locks_data = {"dynamic_locks": locks.to_json()}
+                    # Atomic write: write to temp file then rename
+                    with tempfile.NamedTemporaryFile(
+                        mode="w",
+                        dir=job.path,
+                        prefix=".locks.",
+                        suffix=".json",
+                        delete=False,
+                    ) as tmp:
+                        json.dump(locks_data, tmp)
+                        tmp_path = tmp.name
+                    # Rename is atomic on POSIX
+                    import os
+
+                    os.rename(tmp_path, locks_path)
            except Exception:
                logger.warning("Error while starting job", exc_info=True)
                return JobState.ERROR
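The locks.json write above uses the temp-file-then-rename idiom so the job process can never observe a half-written file. The same pattern, distilled into a standalone helper (a sketch; the scheduler inlines this logic rather than calling such a function):

    import json
    import os
    import tempfile
    from pathlib import Path

    def atomic_write_json(path: Path, data: dict) -> None:
        # Write to a temporary file in the same directory, then rename over the
        # target; os.rename within one filesystem is atomic on POSIX.
        with tempfile.NamedTemporaryFile(
            mode="w", dir=path.parent, prefix=f".{path.name}.", suffix=".tmp", delete=False
        ) as tmp:
            json.dump(data, tmp)
            tmp_path = tmp.name
        os.rename(tmp_path, path)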
@@ -654,6 +971,9 @@ class Scheduler(threading.Thread):
                 )
                 state = JobState.ERROR
 
+            # Notify locks that job has finished (before releasing)
+            await locks.aio_job_finished(job)
+
             # Locks are released here after job completes
 
             # Check if we should restart a resumable task that timed out
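Taken together, the `locks` calls added in the last few hunks bracket a job's lifetime: `aio_job_before_start` runs before the process is launched, `aio_job_started` right after (when locks.json is written), and `aio_job_finished` once the job ends but before the `with` block releases the locks. Schematically (a sketch of the ordering only; the real `aio_start` also handles lock errors, retries and state bookkeeping):

    from experimaestro.locking import DynamicDependencyLocks

    async def run_with_dynamic_locks(job):
        with DynamicDependencyLocks() as locks:
            await locks.aio_job_before_start(job)      # e.g. create symlinks
            process = await job.aio_run()              # launch the job process
            await locks.aio_job_started(job, process)  # locks learn about the process
            code = await process.aio_code()            # wait for completion
            await locks.aio_job_finished(job)          # notify before locks are released
        return code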
@@ -693,3 +1013,259 @@ class Scheduler(threading.Thread):
         # Notify scheduler listeners of job state after job completes
         self.notify_job_state(job)
         return state
+
+    # =========================================================================
+    # StateProvider abstract method implementations
+    # =========================================================================
+
+    def get_experiments(
+        self,
+        since: Optional[datetime] = None,  # noqa: ARG002
+    ) -> List[BaseExperiment]:
+        """Get list of all live experiments"""
+        # Note: 'since' filter not applicable for live scheduler
+        return list(self.experiments.values())
+
+    def get_experiment(self, experiment_id: str) -> Optional[BaseExperiment]:
+        """Get a specific experiment by ID"""
+        return self.experiments.get(experiment_id)
+
+    def get_experiment_runs(self, experiment_id: str) -> List[BaseExperiment]:
+        """Get all runs for an experiment
+
+        For a live scheduler, returns the live experiment directly.
+        """
+        exp = self.experiments.get(experiment_id)
+        if not exp:
+            return []
+
+        # Return the live experiment (it already implements BaseExperiment)
+        return [exp]
+
+    def get_current_run(self, experiment_id: str) -> Optional[str]:
+        """Get the current run ID for an experiment"""
+        exp = self.experiments.get(experiment_id)
+        return exp.run_id if exp else None
+
+    def get_jobs(
+        self,
+        experiment_id: Optional[str] = None,
+        run_id: Optional[str] = None,  # noqa: ARG002 - not used in live scheduler
+        task_id: Optional[str] = None,
+        state: Optional[str] = None,
+        tags: Optional[Dict[str, str]] = None,
+        since: Optional[datetime] = None,  # noqa: ARG002 - not used in live scheduler
+    ) -> List[BaseJob]:
+        """Query jobs with optional filters"""
+        jobs: List[BaseJob] = list(self.jobs.values())
+
+        # Filter by experiment
+        if experiment_id:
+            exp = self.experiments.get(experiment_id)
+            if exp:
+                jobs = [j for j in jobs if j.experiments and exp in j.experiments]
+            else:
+                jobs = []
+
+        # Filter by task_id
+        if task_id:
+            jobs = [j for j in jobs if j.task_id == task_id]
+
+        # Filter by state
+        if state:
+            jobs = [j for j in jobs if j.state.name.lower() == state.lower()]
+
+        # Filter by tags (all tags must match)
+        if tags:
+            jobs = [j for j in jobs if all(j.tags.get(k) == v for k, v in tags.items())]
+
+        return jobs
+
+    def get_job(
+        self,
+        job_id: str,
+        experiment_id: str,  # noqa: ARG002 - job_id is sufficient in live scheduler
+        run_id: Optional[str] = None,  # noqa: ARG002 - job_id is sufficient in live scheduler
+    ) -> Optional[BaseJob]:
+        """Get a specific job"""
+        return self.jobs.get(job_id)
+
+    def get_all_jobs(
+        self,
+        state: Optional[str] = None,
+        tags: Optional[Dict[str, str]] = None,
+        since: Optional[datetime] = None,  # noqa: ARG002 - not used in live scheduler
+    ) -> List[BaseJob]:
+        """Get all jobs across all experiments"""
+        jobs: List[BaseJob] = list(self.jobs.values())
+
+        if state:
+            jobs = [j for j in jobs if j.state.name.lower() == state.lower()]
+
+        if tags:
+            jobs = [j for j in jobs if all(j.tags.get(k) == v for k, v in tags.items())]
+
+        return jobs
+
+    def get_services(
+        self,
+        experiment_id: Optional[str] = None,
+        run_id: Optional[str] = None,
+    ) -> List[BaseService]:
+        """Get services for an experiment
+
+        Services are stored in the scheduler and persist after experiments finish.
+        """
+        if experiment_id is None:
+            # Return all services from all experiments
+            services = []
+            for services_dict in self.services.values():
+                services.extend(services_dict.values())
+            return services
+
+        # Get services for specific experiment
+        services = []
+        if run_id is not None:
+            # Specific run requested
+            key = (experiment_id, run_id)
+            services_dict = self.services.get(key, {})
+            services = list(services_dict.values())
+        else:
+            # No run_id specified - return services from all runs of this experiment
+            for (exp_id, _run_id), services_dict in self.services.items():
+                if exp_id == experiment_id:
+                    services.extend(services_dict.values())
+
+        logger.debug(
+            "get_services(%s, %s): returning %d services",
+            experiment_id,
+            run_id,
+            len(services),
+        )
+        return services
+
+    def get_tags_map(
+        self,
+        experiment_id: str,
+        run_id: Optional[str] = None,
+    ) -> dict[str, dict[str, str]]:
+        """Get tags map for jobs in an experiment/run
+
+        Returns a map from job_id to {tag_key: tag_value}.
+        """
+        exp = self.experiments.get(experiment_id)
+        if not exp:
+            return {}
+
+        # Use current run if not specified
+        if run_id is None:
+            run_id = exp.run_id
+
+        exp_run_key = (experiment_id, run_id)
+        return self._tags_map.get(exp_run_key, {})
+
+    def get_dependencies_map(
+        self,
+        experiment_id: str,
+        run_id: Optional[str] = None,
+    ) -> dict[str, list[str]]:
+        """Get dependencies map for jobs in an experiment/run
+
+        Returns a map from job_id to list of job_ids it depends on.
+        """
+        exp = self.experiments.get(experiment_id)
+        if not exp:
+            return {}
+
+        # Use current run if not specified
+        if run_id is None:
+            run_id = exp.run_id
+
+        exp_run_key = (experiment_id, run_id)
+        return self._dependencies_map.get(exp_run_key, {})
+
+    def kill_job(self, job: BaseJob, perform: bool = False) -> bool:
+        """Kill a running job
+
+        For the scheduler, this is a live operation.
+        """
+        if not perform:
+            # Just check if the job can be killed
+            return job.state == JobState.RUNNING
+
+        if job.state != JobState.RUNNING:
+            return False
+
+        # Get the actual Job from our jobs dict
+        actual_job = self.jobs.get(job.identifier)
+        if actual_job is None:
+            return False
+
+        # Try to kill the process via the process attribute
+        process = getattr(actual_job, "process", None)
+        if process is not None:
+            try:
+                process.kill()
+                return True
+            except Exception:
+                logger.exception("Failed to kill job %s", job.identifier)
+        return False
+
+    def clean_job(
+        self,
+        job: BaseJob,  # noqa: ARG002
+        perform: bool = False,  # noqa: ARG002
+    ) -> bool:
+        """Clean a finished job
+
+        For the scheduler, jobs are automatically cleaned when they finish.
+        """
+        # Live scheduler doesn't support cleaning jobs
+        return False
+
+    def get_process_info(self, job: BaseJob):
+        """Get process information for a job
+
+        For the scheduler, we can access the actual Job and read its PID file.
+        """
+        from experimaestro.scheduler.state_provider import ProcessInfo
+
+        # Get the actual Job from our jobs dict
+        actual_job = self.jobs.get(job.identifier)
+        if actual_job is None:
+            return None
+
+        # Try to read the PID file
+        try:
+            pidpath = getattr(actual_job, "pidpath", None)
+            if pidpath is None or not pidpath.exists():
+                return None
+
+            pinfo = json.loads(pidpath.read_text())
+            pid = pinfo.get("pid")
+            proc_type = pinfo.get("type", "unknown")
+
+            if pid is None:
+                return None
+
+            # Check if running based on job state
+            running = actual_job.state == JobState.RUNNING
+
+            return ProcessInfo(pid=pid, type=proc_type, running=running)
+        except Exception:
+            return None
+
+    def close(self) -> None:
+        """Close the state provider and clean up resources"""
+        # Stop all job event readers
+        self._stop_job_event_reader()
+
+    @property
+    def read_only(self) -> bool:
+        """Live scheduler is read-write"""
+        return False
+
+    @property
+    def is_remote(self) -> bool:
+        """Live scheduler is local"""
+        return False
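Because the Scheduler itself now implements StateProvider, a TUI or web front-end can query live state directly instead of reading workspace files. A usage sketch built from the signatures in this hunk; the experiment name and tag values are placeholders:

    def show_running_jobs(scheduler, experiment_id: str) -> None:
        # Live query, filtered by state (and optionally task_id / tags)
        running = scheduler.get_jobs(experiment_id=experiment_id, state="running")
        for job in running:
            info = scheduler.get_process_info(job)  # ProcessInfo(pid, type, running) or None
            print(job.identifier, job.state.name, info.pid if info else "-")

        # Per-run metadata maps maintained at submit time
        tags_by_job = scheduler.get_tags_map(experiment_id)
        deps_by_job = scheduler.get_dependencies_map(experiment_id)
        print(f"{len(tags_by_job)} tagged jobs, {len(deps_by_job)} jobs with dependencies")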