experimaestro 2.0.0a8__py3-none-any.whl → 2.0.0b8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of experimaestro might be problematic. Click here for more details.
- experimaestro/__init__.py +10 -11
- experimaestro/annotations.py +167 -206
- experimaestro/cli/__init__.py +278 -7
- experimaestro/cli/filter.py +42 -74
- experimaestro/cli/jobs.py +157 -106
- experimaestro/cli/refactor.py +249 -0
- experimaestro/click.py +0 -1
- experimaestro/commandline.py +19 -3
- experimaestro/connectors/__init__.py +20 -1
- experimaestro/connectors/local.py +12 -0
- experimaestro/core/arguments.py +182 -46
- experimaestro/core/identifier.py +107 -6
- experimaestro/core/objects/__init__.py +6 -0
- experimaestro/core/objects/config.py +542 -25
- experimaestro/core/objects/config_walk.py +20 -0
- experimaestro/core/serialization.py +91 -34
- experimaestro/core/subparameters.py +164 -0
- experimaestro/core/types.py +175 -38
- experimaestro/exceptions.py +26 -0
- experimaestro/experiments/cli.py +111 -25
- experimaestro/generators.py +50 -9
- experimaestro/huggingface.py +3 -1
- experimaestro/launcherfinder/parser.py +29 -0
- experimaestro/launchers/__init__.py +26 -1
- experimaestro/launchers/direct.py +12 -0
- experimaestro/launchers/slurm/base.py +154 -2
- experimaestro/mkdocs/metaloader.py +0 -1
- experimaestro/mypy.py +452 -7
- experimaestro/notifications.py +63 -13
- experimaestro/progress.py +0 -2
- experimaestro/rpyc.py +0 -1
- experimaestro/run.py +19 -6
- experimaestro/scheduler/base.py +510 -125
- experimaestro/scheduler/dependencies.py +43 -28
- experimaestro/scheduler/dynamic_outputs.py +259 -130
- experimaestro/scheduler/experiment.py +256 -31
- experimaestro/scheduler/interfaces.py +501 -0
- experimaestro/scheduler/jobs.py +216 -206
- experimaestro/scheduler/remote/__init__.py +31 -0
- experimaestro/scheduler/remote/client.py +874 -0
- experimaestro/scheduler/remote/protocol.py +467 -0
- experimaestro/scheduler/remote/server.py +423 -0
- experimaestro/scheduler/remote/sync.py +144 -0
- experimaestro/scheduler/services.py +323 -23
- experimaestro/scheduler/state_db.py +437 -0
- experimaestro/scheduler/state_provider.py +2766 -0
- experimaestro/scheduler/state_sync.py +891 -0
- experimaestro/scheduler/workspace.py +52 -10
- experimaestro/scriptbuilder.py +7 -0
- experimaestro/server/__init__.py +147 -57
- experimaestro/server/data/index.css +0 -125
- experimaestro/server/data/index.css.map +1 -1
- experimaestro/server/data/index.js +194 -58
- experimaestro/server/data/index.js.map +1 -1
- experimaestro/settings.py +44 -5
- experimaestro/sphinx/__init__.py +3 -3
- experimaestro/taskglobals.py +20 -0
- experimaestro/tests/conftest.py +80 -0
- experimaestro/tests/core/test_generics.py +2 -2
- experimaestro/tests/identifier_stability.json +45 -0
- experimaestro/tests/launchers/bin/sacct +6 -2
- experimaestro/tests/launchers/bin/sbatch +4 -2
- experimaestro/tests/launchers/test_slurm.py +80 -0
- experimaestro/tests/tasks/test_dynamic.py +231 -0
- experimaestro/tests/test_cli_jobs.py +615 -0
- experimaestro/tests/test_deprecated.py +630 -0
- experimaestro/tests/test_environment.py +200 -0
- experimaestro/tests/test_file_progress_integration.py +1 -1
- experimaestro/tests/test_forward.py +3 -3
- experimaestro/tests/test_identifier.py +372 -41
- experimaestro/tests/test_identifier_stability.py +458 -0
- experimaestro/tests/test_instance.py +3 -3
- experimaestro/tests/test_multitoken.py +442 -0
- experimaestro/tests/test_mypy.py +433 -0
- experimaestro/tests/test_objects.py +312 -5
- experimaestro/tests/test_outputs.py +2 -2
- experimaestro/tests/test_param.py +8 -12
- experimaestro/tests/test_partial_paths.py +231 -0
- experimaestro/tests/test_progress.py +0 -48
- experimaestro/tests/test_remote_state.py +671 -0
- experimaestro/tests/test_resumable_task.py +480 -0
- experimaestro/tests/test_serializers.py +141 -1
- experimaestro/tests/test_state_db.py +434 -0
- experimaestro/tests/test_subparameters.py +160 -0
- experimaestro/tests/test_tags.py +136 -0
- experimaestro/tests/test_tasks.py +107 -121
- experimaestro/tests/test_token_locking.py +252 -0
- experimaestro/tests/test_tokens.py +17 -13
- experimaestro/tests/test_types.py +123 -1
- experimaestro/tests/test_workspace_triggers.py +158 -0
- experimaestro/tests/token_reschedule.py +4 -2
- experimaestro/tests/utils.py +2 -2
- experimaestro/tokens.py +154 -57
- experimaestro/tools/diff.py +1 -1
- experimaestro/tui/__init__.py +8 -0
- experimaestro/tui/app.py +2395 -0
- experimaestro/tui/app.tcss +353 -0
- experimaestro/tui/log_viewer.py +228 -0
- experimaestro/utils/__init__.py +23 -0
- experimaestro/utils/environment.py +148 -0
- experimaestro/utils/git.py +129 -0
- experimaestro/utils/resources.py +1 -1
- experimaestro/version.py +34 -0
- {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/METADATA +68 -38
- experimaestro-2.0.0b8.dist-info/RECORD +187 -0
- {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/WHEEL +1 -1
- experimaestro-2.0.0b8.dist-info/entry_points.txt +16 -0
- experimaestro/compat.py +0 -6
- experimaestro/core/objects.pyi +0 -221
- experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
- experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
- experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
- experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
- experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
- experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
- experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
- experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
- experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
- experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
- experimaestro-2.0.0a8.dist-info/RECORD +0 -166
- experimaestro-2.0.0a8.dist-info/entry_points.txt +0 -17
- {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/licenses/LICENSE +0 -0
experimaestro/scheduler/base.py
CHANGED
|
@@ -1,15 +1,16 @@
|
|
|
1
|
-
import logging
|
|
2
1
|
import threading
|
|
3
2
|
import time
|
|
4
3
|
from typing import (
|
|
5
4
|
Optional,
|
|
6
5
|
Set,
|
|
6
|
+
ClassVar,
|
|
7
|
+
TYPE_CHECKING,
|
|
7
8
|
)
|
|
8
9
|
import asyncio
|
|
9
10
|
from typing import Dict
|
|
10
11
|
|
|
11
12
|
from experimaestro.scheduler import experiment
|
|
12
|
-
from experimaestro.scheduler.jobs import Job, JobState
|
|
13
|
+
from experimaestro.scheduler.jobs import Job, JobState, JobError
|
|
13
14
|
from experimaestro.scheduler.services import Service
|
|
14
15
|
|
|
15
16
|
|
|
@@ -17,6 +18,11 @@ from experimaestro.utils import logger
|
|
|
17
18
|
from experimaestro.utils.asyncio import asyncThreadcheck
|
|
18
19
|
import concurrent.futures
|
|
19
20
|
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
from experimaestro.server import Server
|
|
23
|
+
from experimaestro.settings import ServerSettings
|
|
24
|
+
from experimaestro.scheduler.workspace import Workspace
|
|
25
|
+
|
|
20
26
|
|
|
21
27
|
class Listener:
|
|
22
28
|
def job_submitted(self, job):
|
|
@@ -31,18 +37,24 @@ class Listener:
|
|
|
31
37
|
|
|
32
38
|
|
|
33
39
|
class Scheduler(threading.Thread):
|
|
34
|
-
"""A job scheduler
|
|
40
|
+
"""A job scheduler (singleton)
|
|
35
41
|
|
|
36
|
-
The scheduler is based on asyncio for easy concurrency handling
|
|
42
|
+
The scheduler is based on asyncio for easy concurrency handling.
|
|
43
|
+
This is a singleton - only one scheduler instance exists per process.
|
|
37
44
|
"""
|
|
38
45
|
|
|
39
|
-
|
|
46
|
+
_instance: ClassVar[Optional["Scheduler"]] = None
|
|
47
|
+
_lock: ClassVar[threading.Lock] = threading.Lock()
|
|
48
|
+
|
|
49
|
+
def __init__(self, name: str = "Global"):
|
|
40
50
|
super().__init__(name=f"Scheduler ({name})", daemon=True)
|
|
41
51
|
self._ready = threading.Event()
|
|
42
52
|
|
|
43
|
-
# Name of the
|
|
53
|
+
# Name of the scheduler
|
|
44
54
|
self.name = name
|
|
45
|
-
|
|
55
|
+
|
|
56
|
+
# Track experiments (simple dict for now)
|
|
57
|
+
self.experiments: Dict[str, "experiment"] = {}
|
|
46
58
|
|
|
47
59
|
# Exit mode activated
|
|
48
60
|
self.exitmode = False
|
|
@@ -53,16 +65,101 @@ class Scheduler(threading.Thread):
|
|
|
53
65
|
# List of jobs
|
|
54
66
|
self.waitingjobs: Set[Job] = set()
|
|
55
67
|
|
|
56
|
-
# Listeners
|
|
57
|
-
self.
|
|
68
|
+
# Listeners with thread-safe access
|
|
69
|
+
self._listeners: Set[Listener] = set()
|
|
70
|
+
self._listeners_lock = threading.Lock()
|
|
71
|
+
|
|
72
|
+
# Notification thread pool (single worker to serialize notifications)
|
|
73
|
+
self._notification_executor = concurrent.futures.ThreadPoolExecutor(
|
|
74
|
+
max_workers=1, thread_name_prefix="NotificationWorker"
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
# Server (managed by scheduler)
|
|
78
|
+
self.server: Optional["Server"] = None
|
|
58
79
|
|
|
59
80
|
@staticmethod
|
|
60
|
-
def
|
|
61
|
-
instance
|
|
81
|
+
def has_instance() -> bool:
|
|
82
|
+
"""Check if a scheduler instance exists without creating one"""
|
|
83
|
+
return Scheduler._instance is not None
|
|
84
|
+
|
|
85
|
+
@staticmethod
|
|
86
|
+
def instance() -> "Scheduler":
|
|
87
|
+
"""Get or create the global scheduler instance"""
|
|
88
|
+
if Scheduler._instance is None:
|
|
89
|
+
with Scheduler._lock:
|
|
90
|
+
if Scheduler._instance is None:
|
|
91
|
+
Scheduler._instance = Scheduler._create()
|
|
92
|
+
return Scheduler._instance
|
|
93
|
+
|
|
94
|
+
@staticmethod
|
|
95
|
+
def _create(name: str = "Global"):
|
|
96
|
+
"""Internal method to create and start scheduler"""
|
|
97
|
+
instance = Scheduler(name)
|
|
62
98
|
instance.start()
|
|
63
99
|
instance._ready.wait()
|
|
64
100
|
return instance
|
|
65
101
|
|
|
102
|
+
@staticmethod
|
|
103
|
+
def create(xp: "experiment" = None, name: str = "Global"):
|
|
104
|
+
"""Create or get the scheduler instance
|
|
105
|
+
|
|
106
|
+
Args:
|
|
107
|
+
xp: (Deprecated) Experiment reference, ignored
|
|
108
|
+
name: Name for the scheduler (only used on first creation)
|
|
109
|
+
|
|
110
|
+
Returns:
|
|
111
|
+
The global scheduler instance
|
|
112
|
+
"""
|
|
113
|
+
return Scheduler.instance()
|
|
114
|
+
|
|
115
|
+
def register_experiment(self, xp: "experiment"):
|
|
116
|
+
"""Register an experiment with the scheduler"""
|
|
117
|
+
# Use experiment name as key for now
|
|
118
|
+
key = xp.workdir.name
|
|
119
|
+
self.experiments[key] = xp
|
|
120
|
+
|
|
121
|
+
logger.debug("Registered experiment %s with scheduler", key)
|
|
122
|
+
|
|
123
|
+
def unregister_experiment(self, xp: "experiment"):
|
|
124
|
+
"""Unregister an experiment from the scheduler"""
|
|
125
|
+
key = xp.workdir.name
|
|
126
|
+
if key in self.experiments:
|
|
127
|
+
del self.experiments[key]
|
|
128
|
+
logger.debug("Unregistered experiment %s from scheduler", key)
|
|
129
|
+
|
|
130
|
+
def start_server(
|
|
131
|
+
self, settings: "ServerSettings" = None, workspace: "Workspace" = None
|
|
132
|
+
):
|
|
133
|
+
"""Start the notification server (if not already running)
|
|
134
|
+
|
|
135
|
+
Args:
|
|
136
|
+
settings: Server settings
|
|
137
|
+
workspace: Workspace instance (required to get workspace path)
|
|
138
|
+
"""
|
|
139
|
+
if self.server is None:
|
|
140
|
+
from experimaestro.server import Server
|
|
141
|
+
from experimaestro.scheduler.state_provider import WorkspaceStateProvider
|
|
142
|
+
|
|
143
|
+
if workspace is None:
|
|
144
|
+
raise ValueError("workspace parameter is required to start server")
|
|
145
|
+
|
|
146
|
+
# Get the workspace state provider singleton
|
|
147
|
+
state_provider = WorkspaceStateProvider.get_instance(
|
|
148
|
+
workspace.path, read_only=False, sync_on_start=False
|
|
149
|
+
)
|
|
150
|
+
|
|
151
|
+
self.server = Server.instance(settings, state_provider)
|
|
152
|
+
self.server.start()
|
|
153
|
+
logger.info("Server started by scheduler")
|
|
154
|
+
else:
|
|
155
|
+
logger.debug("Server already running")
|
|
156
|
+
|
|
157
|
+
def stop_server(self):
|
|
158
|
+
"""Stop the notification server"""
|
|
159
|
+
if self.server is not None:
|
|
160
|
+
self.server.stop()
|
|
161
|
+
logger.info("Server stopped by scheduler")
|
|
162
|
+
|
|
66
163
|
def run(self):
|
|
67
164
|
"""Run the event loop forever"""
|
|
68
165
|
logger.debug("Starting event loop thread")
|
|
@@ -72,6 +169,10 @@ class Scheduler(threading.Thread):
|
|
|
72
169
|
# Set loop-dependent variables
|
|
73
170
|
self.exitCondition = asyncio.Condition()
|
|
74
171
|
self.dependencyLock = asyncio.Lock()
|
|
172
|
+
|
|
173
|
+
# Note: State provider removed - now managed at workspace level
|
|
174
|
+
# Each experiment has its own workspace with database
|
|
175
|
+
|
|
75
176
|
self._ready.set()
|
|
76
177
|
self.loop.run_forever()
|
|
77
178
|
|
|
@@ -84,10 +185,38 @@ class Scheduler(threading.Thread):
|
|
|
84
185
|
logger.warning("Scheduler already started")
|
|
85
186
|
|
|
86
187
|
def addlistener(self, listener: Listener):
|
|
87
|
-
self.
|
|
188
|
+
with self._listeners_lock:
|
|
189
|
+
self._listeners.add(listener)
|
|
88
190
|
|
|
89
191
|
def removelistener(self, listener: Listener):
|
|
90
|
-
self.
|
|
192
|
+
with self._listeners_lock:
|
|
193
|
+
self._listeners.discard(listener)
|
|
194
|
+
|
|
195
|
+
def clear_listeners(self):
|
|
196
|
+
"""Clear all listeners (for testing purposes)"""
|
|
197
|
+
with self._listeners_lock:
|
|
198
|
+
self._listeners.clear()
|
|
199
|
+
|
|
200
|
+
def wait_for_notifications(self, timeout: float = 5.0) -> bool:
|
|
201
|
+
"""Wait for all pending notifications to be processed.
|
|
202
|
+
|
|
203
|
+
This submits a sentinel task and waits for it to complete,
|
|
204
|
+
ensuring all previously submitted notifications have been processed.
|
|
205
|
+
|
|
206
|
+
Args:
|
|
207
|
+
timeout: Maximum time to wait in seconds
|
|
208
|
+
|
|
209
|
+
Returns:
|
|
210
|
+
True if all notifications were processed, False if timeout occurred
|
|
211
|
+
"""
|
|
212
|
+
try:
|
|
213
|
+
# Submit a no-op and wait for it to complete
|
|
214
|
+
future = self._notification_executor.submit(lambda: None)
|
|
215
|
+
future.result(timeout=timeout)
|
|
216
|
+
return True
|
|
217
|
+
except concurrent.futures.TimeoutError:
|
|
218
|
+
logger.warning("Timeout waiting for notification queue to drain")
|
|
219
|
+
return False
|
|
91
220
|
|
|
92
221
|
def getJobState(self, job: Job) -> "concurrent.futures.Future[JobState]":
|
|
93
222
|
# Check if the job belongs to this scheduler
|
|
@@ -104,17 +233,25 @@ class Scheduler(threading.Thread):
|
|
|
104
233
|
|
|
105
234
|
def submit(self, job: Job) -> Optional[Job]:
|
|
106
235
|
# Wait for the future containing the submitted job
|
|
107
|
-
logger.debug("
|
|
236
|
+
logger.debug("Submit job %s to the scheduler", job)
|
|
108
237
|
otherFuture = asyncio.run_coroutine_threadsafe(
|
|
109
238
|
self.aio_registerJob(job), self.loop
|
|
110
239
|
)
|
|
111
240
|
other = otherFuture.result()
|
|
112
241
|
logger.debug("Job already submitted" if other else "First submission")
|
|
113
|
-
|
|
114
|
-
|
|
242
|
+
|
|
243
|
+
# Only returns if job was already submitted and doesn't need reprocessing
|
|
244
|
+
if other is not None:
|
|
245
|
+
# If state is WAITING, it was just reset for resubmission and needs processing
|
|
246
|
+
# If state is RUNNING or finished (DONE), no need to reprocess
|
|
247
|
+
if other.state != JobState.WAITING:
|
|
248
|
+
return other
|
|
249
|
+
# Use 'other' for resubmission since it has the correct experiments list
|
|
250
|
+
job = other
|
|
115
251
|
|
|
116
252
|
job._future = asyncio.run_coroutine_threadsafe(self.aio_submit(job), self.loop)
|
|
117
|
-
|
|
253
|
+
|
|
254
|
+
return other
|
|
118
255
|
|
|
119
256
|
def prepare(self, job: Job):
|
|
120
257
|
"""Prepares the job for running"""
|
|
@@ -129,49 +266,99 @@ class Scheduler(threading.Thread):
|
|
|
129
266
|
|
|
130
267
|
if self.exitmode:
|
|
131
268
|
logger.warning("Exit mode: not submitting")
|
|
269
|
+
return
|
|
132
270
|
|
|
133
|
-
|
|
271
|
+
# Job was already submitted
|
|
272
|
+
if job.identifier in self.jobs:
|
|
134
273
|
other = self.jobs[job.identifier]
|
|
135
274
|
assert job.type == other.type
|
|
136
|
-
|
|
275
|
+
|
|
276
|
+
# Add current experiment to the existing job's experiments list
|
|
277
|
+
xp = experiment.current()
|
|
278
|
+
xp.add_job(other)
|
|
279
|
+
|
|
280
|
+
# Copy watched outputs from new job to existing job
|
|
281
|
+
# This ensures new callbacks are registered even for resubmitted jobs
|
|
282
|
+
other.watched_outputs.extend(job.watched_outputs)
|
|
283
|
+
|
|
284
|
+
if other.state.is_error():
|
|
137
285
|
logger.info("Re-submitting job")
|
|
286
|
+
# Clean up old process info so it will be re-started
|
|
287
|
+
other._process = None
|
|
288
|
+
if other.pidpath.is_file():
|
|
289
|
+
other.pidpath.unlink()
|
|
290
|
+
# Use set_state to handle experiment statistics updates
|
|
291
|
+
other.set_state(JobState.WAITING)
|
|
292
|
+
self.notify_job_state(other) # Notify listeners of re-submit
|
|
138
293
|
else:
|
|
139
294
|
logger.warning("Job %s already submitted", job.identifier)
|
|
140
|
-
return other
|
|
141
295
|
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
296
|
+
# Returns the previous job
|
|
297
|
+
return other
|
|
298
|
+
|
|
299
|
+
# Register this job
|
|
300
|
+
xp = experiment.current()
|
|
301
|
+
self.jobs[job.identifier] = job
|
|
302
|
+
# Set submittime now so that add_job can record it in the database
|
|
303
|
+
# (aio_submit may update this later for re-submitted jobs)
|
|
304
|
+
job.submittime = time.time()
|
|
305
|
+
xp.add_job(job)
|
|
306
|
+
|
|
307
|
+
# Set up dependencies
|
|
308
|
+
for dependency in job.dependencies:
|
|
309
|
+
dependency.target = job
|
|
310
|
+
dependency.origin.dependents.add(dependency)
|
|
146
311
|
|
|
147
312
|
return None
|
|
148
313
|
|
|
314
|
+
def _notify_listeners(self, notification_func, job: Job):
|
|
315
|
+
"""Execute notification in thread pool with error isolation.
|
|
316
|
+
|
|
317
|
+
This runs notifications in a dedicated thread pool to avoid blocking
|
|
318
|
+
the scheduler and to isolate errors from affecting other listeners.
|
|
319
|
+
"""
|
|
320
|
+
|
|
321
|
+
def _do_notify():
|
|
322
|
+
# Get a snapshot of listeners with the lock
|
|
323
|
+
with self._listeners_lock:
|
|
324
|
+
listeners_snapshot = list(self._listeners)
|
|
325
|
+
|
|
326
|
+
for listener in listeners_snapshot:
|
|
327
|
+
try:
|
|
328
|
+
notification_func(listener, job)
|
|
329
|
+
except Exception:
|
|
330
|
+
logger.exception("Got an error with listener %s", listener)
|
|
331
|
+
|
|
332
|
+
self._notification_executor.submit(_do_notify)
|
|
333
|
+
|
|
149
334
|
def notify_job_submitted(self, job: Job):
|
|
150
335
|
"""Notify the listeners that a job has been submitted"""
|
|
151
|
-
|
|
152
|
-
try:
|
|
153
|
-
listener.job_submitted(job)
|
|
154
|
-
except Exception:
|
|
155
|
-
logger.exception("Got an error with listener %s", listener)
|
|
336
|
+
self._notify_listeners(lambda lst, j: lst.job_submitted(j), job)
|
|
156
337
|
|
|
157
338
|
def notify_job_state(self, job: Job):
|
|
158
339
|
"""Notify the listeners that a job has changed state"""
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
340
|
+
self._notify_listeners(lambda lst, j: lst.job_state(j), job)
|
|
341
|
+
|
|
342
|
+
def notify_service_add(self, service: Service):
|
|
343
|
+
"""Notify the listeners that a service has been added"""
|
|
344
|
+
self._notify_listeners(lambda lst, s: lst.service_add(s), service)
|
|
164
345
|
|
|
165
|
-
async def aio_submit(self, job: Job) -> JobState:
|
|
346
|
+
async def aio_submit(self, job: Job) -> JobState:
|
|
166
347
|
"""Main scheduler function: submit a job, run it (if needed), and returns
|
|
167
348
|
the status code
|
|
168
349
|
"""
|
|
350
|
+
from experimaestro.scheduler.jobs import JobStateError, JobFailureStatus
|
|
351
|
+
|
|
169
352
|
logger.info("Submitting job %s", job)
|
|
170
|
-
job._readyEvent = asyncio.Event()
|
|
171
353
|
job.submittime = time.time()
|
|
172
354
|
job.scheduler = self
|
|
173
355
|
self.waitingjobs.add(job)
|
|
174
356
|
|
|
357
|
+
# Register watched outputs now that the job has a scheduler
|
|
358
|
+
job.register_watched_outputs()
|
|
359
|
+
|
|
360
|
+
# Note: Job metadata will be written after directory is created in aio_start
|
|
361
|
+
|
|
175
362
|
# Check that we don't have a completed job in
|
|
176
363
|
# alternate directories
|
|
177
364
|
for jobspath in experiment.current().alt_jobspaths:
|
|
@@ -185,126 +372,324 @@ class Scheduler(threading.Thread):
|
|
|
185
372
|
path.unlink()
|
|
186
373
|
path.symlink_to(job.path)
|
|
187
374
|
|
|
188
|
-
job.
|
|
189
|
-
|
|
375
|
+
job.set_state(JobState.WAITING)
|
|
190
376
|
self.notify_job_submitted(job)
|
|
191
377
|
|
|
192
|
-
#
|
|
193
|
-
if job.dependencies:
|
|
194
|
-
job.unsatisfied = len(job.dependencies)
|
|
195
|
-
|
|
196
|
-
for dependency in job.dependencies:
|
|
197
|
-
dependency.target = job
|
|
198
|
-
dependency.loop = self.loop
|
|
199
|
-
dependency.origin.dependents.add(dependency)
|
|
200
|
-
dependency.check()
|
|
201
|
-
else:
|
|
202
|
-
job._readyEvent.set()
|
|
203
|
-
job.state = JobState.READY
|
|
204
|
-
|
|
378
|
+
# Check if already done
|
|
205
379
|
if job.donepath.exists():
|
|
206
|
-
job.
|
|
380
|
+
job.set_state(JobState.DONE)
|
|
381
|
+
self.notify_job_state(job) # Notify listeners of done state
|
|
207
382
|
|
|
208
383
|
# Check if we have a running process
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
384
|
+
if not job.state.finished():
|
|
385
|
+
process = await job.aio_process()
|
|
386
|
+
if process is not None:
|
|
387
|
+
# Notify listeners that job is running
|
|
388
|
+
job.set_state(JobState.RUNNING)
|
|
389
|
+
self.notify_job_state(job)
|
|
390
|
+
|
|
391
|
+
# Adds to the listeners
|
|
392
|
+
if self.server is not None:
|
|
393
|
+
job.add_notification_server(self.server)
|
|
394
|
+
|
|
395
|
+
# And now, we wait...
|
|
396
|
+
logger.info("Got a process for job %s - waiting to complete", job)
|
|
397
|
+
code = await process.aio_code()
|
|
398
|
+
logger.info("Job %s completed with code %s", job, code)
|
|
399
|
+
|
|
400
|
+
# Record exit code if available
|
|
401
|
+
if code is not None:
|
|
402
|
+
job.exit_code = code
|
|
403
|
+
|
|
404
|
+
# Read state from .done/.failed files (contains detailed failure reason)
|
|
405
|
+
state = JobState.from_path(job.path, job.name)
|
|
406
|
+
|
|
407
|
+
# If state is a generic FAILED error, let the process determine
|
|
408
|
+
# the state (it may detect launcher-specific failures like SLURM timeout)
|
|
409
|
+
if (
|
|
410
|
+
state is not None
|
|
411
|
+
and isinstance(state, JobStateError)
|
|
412
|
+
and state.failure_reason == JobFailureStatus.FAILED
|
|
413
|
+
and code is not None
|
|
414
|
+
):
|
|
415
|
+
process_state = process.get_job_state(code)
|
|
416
|
+
if (
|
|
417
|
+
isinstance(process_state, JobStateError)
|
|
418
|
+
and process_state.failure_reason != JobFailureStatus.FAILED
|
|
419
|
+
):
|
|
420
|
+
# Process detected a more specific failure reason
|
|
421
|
+
state = process_state
|
|
242
422
|
|
|
243
423
|
if state is None:
|
|
244
|
-
|
|
245
|
-
|
|
424
|
+
if code is not None:
|
|
425
|
+
# Fall back to process-specific state detection
|
|
426
|
+
state = process.get_job_state(code)
|
|
427
|
+
else:
|
|
428
|
+
logger.error("No .done or .failed file found for job %s", job)
|
|
429
|
+
state = JobState.ERROR
|
|
430
|
+
# Set endtime before set_state so database gets the timestamp
|
|
431
|
+
job.endtime = time.time()
|
|
432
|
+
job.set_state(state)
|
|
433
|
+
self.notify_job_state(job) # Notify listeners of final state
|
|
434
|
+
|
|
435
|
+
# If not done or running, start the job
|
|
436
|
+
if not job.state.finished():
|
|
437
|
+
try:
|
|
438
|
+
state = await self.aio_start(job)
|
|
439
|
+
# Set endtime before set_state so database gets the timestamp
|
|
440
|
+
job.endtime = time.time()
|
|
441
|
+
job.set_state(state)
|
|
442
|
+
except Exception:
|
|
443
|
+
logger.exception("Got an exception while starting the job")
|
|
444
|
+
raise
|
|
246
445
|
|
|
247
|
-
|
|
446
|
+
# Job is finished - experiment statistics already updated by set_state
|
|
248
447
|
|
|
249
|
-
|
|
448
|
+
# Write final metadata with end time and final state
|
|
449
|
+
job.write_metadata()
|
|
250
450
|
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
self.xp.failedJobs[job.identifier] = job
|
|
451
|
+
if job in self.waitingjobs:
|
|
452
|
+
self.waitingjobs.remove(job)
|
|
254
453
|
|
|
255
|
-
# Process all remaining
|
|
454
|
+
# Process all remaining task outputs BEFORE notifying exit condition
|
|
455
|
+
# This ensures taskOutputQueueSize is updated before wait() can check it,
|
|
456
|
+
# preventing a race where wait() sees both unfinishedJobs==0 and
|
|
457
|
+
# taskOutputQueueSize==0 before callbacks have been queued.
|
|
256
458
|
await asyncThreadcheck("End of job processing", job.done_handler)
|
|
257
459
|
|
|
258
|
-
#
|
|
259
|
-
self.xp.unfinishedJobs -= 1
|
|
460
|
+
# Now notify - wait() will see the correct taskOutputQueueSize
|
|
260
461
|
async with self.exitCondition:
|
|
261
|
-
logging.debug("Updated number of unfinished jobs")
|
|
262
462
|
self.exitCondition.notify_all()
|
|
263
463
|
|
|
264
|
-
job.endtime = time.time()
|
|
265
|
-
if job in self.waitingjobs:
|
|
266
|
-
self.waitingjobs.remove(job)
|
|
267
|
-
|
|
268
|
-
with job.dependents as dependents:
|
|
269
|
-
logger.info("Processing %d dependent jobs", len(dependents))
|
|
270
|
-
for dependency in dependents:
|
|
271
|
-
logger.debug("Checking dependency %s", dependency)
|
|
272
|
-
self.loop.call_soon(dependency.check)
|
|
273
|
-
|
|
274
464
|
return job.state
|
|
275
465
|
|
|
276
|
-
async def aio_start(self, job: Job) -> Optional[JobState]:
|
|
277
|
-
"""Start a job
|
|
466
|
+
async def aio_start(self, job: Job) -> Optional[JobState]: # noqa: C901
|
|
467
|
+
"""Start a job with full job starting logic
|
|
278
468
|
|
|
279
|
-
This method
|
|
280
|
-
job
|
|
281
|
-
|
|
469
|
+
This method handles job locking, dependency acquisition, directory setup,
|
|
470
|
+
and job execution while using the scheduler's coordination lock to prevent
|
|
471
|
+
race conditions between multiple jobs.
|
|
282
472
|
|
|
283
473
|
:param job: The job to start
|
|
284
474
|
:return: JobState.WAITING if dependencies could not be locked, JobState.DONE
|
|
285
475
|
if job completed successfully, JobState.ERROR if job failed during execution,
|
|
286
476
|
or None (should not occur in normal operation)
|
|
287
|
-
:raises Exception: Various exceptions during
|
|
477
|
+
:raises Exception: Various exceptions during job execution, dependency locking,
|
|
478
|
+
or process creation
|
|
288
479
|
"""
|
|
480
|
+
from experimaestro.scheduler.jobs import JobStateError
|
|
481
|
+
from experimaestro.locking import Locks, LockError
|
|
482
|
+
from experimaestro.scheduler.jobs import JobFailureStatus
|
|
289
483
|
|
|
290
484
|
# Assert preconditions
|
|
291
485
|
assert job.launcher is not None
|
|
292
486
|
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
487
|
+
# Restart loop for resumable tasks that timeout
|
|
488
|
+
while True:
|
|
489
|
+
logger.debug(
|
|
490
|
+
"Starting job %s with %d dependencies",
|
|
491
|
+
job,
|
|
492
|
+
len(job.dependencies),
|
|
298
493
|
)
|
|
299
494
|
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
495
|
+
# Separate static and dynamic dependencies
|
|
496
|
+
static_deps = [d for d in job.dependencies if not d.is_dynamic()]
|
|
497
|
+
dynamic_deps = [d for d in job.dependencies if d.is_dynamic()]
|
|
303
498
|
|
|
304
|
-
#
|
|
499
|
+
# First, wait for all static dependencies (jobs) to complete
|
|
500
|
+
# These don't need the dependency lock as they can't change state
|
|
501
|
+
# Static dependency locks don't need to be added to locks list
|
|
502
|
+
logger.debug("Waiting for %d static dependencies", len(static_deps))
|
|
503
|
+
for dependency in static_deps:
|
|
504
|
+
logger.debug("Waiting for static dependency %s", dependency)
|
|
505
|
+
try:
|
|
506
|
+
await dependency.aio_lock()
|
|
507
|
+
except RuntimeError as e:
|
|
508
|
+
# Dependency failed - mark job as failed due to dependency
|
|
509
|
+
logger.warning("Dependency failed: %s", e)
|
|
510
|
+
return JobStateError(JobFailureStatus.DEPENDENCY)
|
|
511
|
+
|
|
512
|
+
# We first lock the job before proceeding
|
|
513
|
+
with Locks() as locks:
|
|
514
|
+
logger.debug("[starting] Locking job %s", job)
|
|
515
|
+
async with job.launcher.connector.lock(job.lockpath):
|
|
516
|
+
logger.debug("[starting] Locked job %s", job)
|
|
517
|
+
|
|
518
|
+
state = None
|
|
519
|
+
try:
|
|
520
|
+
# Now handle dynamic dependencies (tokens) with retry logic
|
|
521
|
+
# CRITICAL: Only one task at a time can acquire dynamic dependencies
|
|
522
|
+
# to prevent deadlocks (e.g., Task A holds Token1 waiting for Token2,
|
|
523
|
+
# Task B holds Token2 waiting for Token1)
|
|
524
|
+
if dynamic_deps:
|
|
525
|
+
async with self.dependencyLock:
|
|
526
|
+
logger.debug(
|
|
527
|
+
"Locking %d dynamic dependencies (tokens)",
|
|
528
|
+
len(dynamic_deps),
|
|
529
|
+
)
|
|
530
|
+
while True:
|
|
531
|
+
all_locked = True
|
|
532
|
+
for idx, dependency in enumerate(dynamic_deps):
|
|
533
|
+
try:
|
|
534
|
+
# Use timeout=0 for first dependency, 0.1s for subsequent
|
|
535
|
+
timeout = 0 if idx == 0 else 0.1
|
|
536
|
+
# Acquire the lock (this might block on IPC locks)
|
|
537
|
+
lock = await dependency.aio_lock(
|
|
538
|
+
timeout=timeout
|
|
539
|
+
)
|
|
540
|
+
locks.append(lock)
|
|
541
|
+
except LockError:
|
|
542
|
+
logger.info(
|
|
543
|
+
"Could not lock %s, retrying",
|
|
544
|
+
dependency,
|
|
545
|
+
)
|
|
546
|
+
# Release all locks and restart
|
|
547
|
+
for lock in locks.locks:
|
|
548
|
+
lock.release()
|
|
549
|
+
locks.locks.clear()
|
|
550
|
+
# Put failed dependency first
|
|
551
|
+
dynamic_deps.remove(dependency)
|
|
552
|
+
dynamic_deps.insert(0, dependency)
|
|
553
|
+
all_locked = False
|
|
554
|
+
break
|
|
555
|
+
|
|
556
|
+
if all_locked:
|
|
557
|
+
# All locks acquired successfully
|
|
558
|
+
break
|
|
559
|
+
|
|
560
|
+
# Dependencies have been locked, we can start the job
|
|
561
|
+
job.starttime = time.time()
|
|
562
|
+
|
|
563
|
+
# Creates the main directory
|
|
564
|
+
directory = job.path
|
|
565
|
+
logger.debug("Making directories job %s...", directory)
|
|
566
|
+
|
|
567
|
+
# Warn about directory cleanup for non-resumable tasks
|
|
568
|
+
# (only once per task type)
|
|
569
|
+
xpmtype = job.config.__xpmtype__
|
|
570
|
+
if (
|
|
571
|
+
directory.is_dir()
|
|
572
|
+
and not job.resumable
|
|
573
|
+
and not xpmtype.warned_clean_not_resumable
|
|
574
|
+
):
|
|
575
|
+
xpmtype.warned_clean_not_resumable = True
|
|
576
|
+
logger.warning(
|
|
577
|
+
"In a future version, directory will be cleaned up for "
|
|
578
|
+
"non-resumable tasks (%s). Use ResumableTask if you want "
|
|
579
|
+
"to preserve the directory contents.",
|
|
580
|
+
xpmtype.identifier,
|
|
581
|
+
)
|
|
582
|
+
|
|
583
|
+
if not directory.is_dir():
|
|
584
|
+
directory.mkdir(parents=True, exist_ok=True)
|
|
585
|
+
|
|
586
|
+
# Write metadata with submit and start time (after directory creation)
|
|
587
|
+
job.write_metadata()
|
|
588
|
+
|
|
589
|
+
# Sets up the notification URL
|
|
590
|
+
if self.server is not None:
|
|
591
|
+
job.add_notification_server(self.server)
|
|
592
|
+
|
|
593
|
+
except Exception:
|
|
594
|
+
logger.warning("Error while locking job", exc_info=True)
|
|
595
|
+
return JobState.WAITING
|
|
596
|
+
|
|
597
|
+
try:
|
|
598
|
+
# Runs the job
|
|
599
|
+
process = await job.aio_run()
|
|
600
|
+
except Exception:
|
|
601
|
+
logger.warning("Error while starting job", exc_info=True)
|
|
602
|
+
return JobState.ERROR
|
|
603
|
+
|
|
604
|
+
# Wait for job to complete while holding locks
|
|
605
|
+
try:
|
|
606
|
+
logger.debug("Waiting for job %s process to end", job)
|
|
607
|
+
|
|
608
|
+
code = await process.aio_code()
|
|
609
|
+
logger.debug("Got return code %s for %s", code, job)
|
|
610
|
+
|
|
611
|
+
# Record exit code if available
|
|
612
|
+
if code is not None:
|
|
613
|
+
logger.info("Job %s ended with code %s", job, code)
|
|
614
|
+
job.exit_code = code
|
|
615
|
+
else:
|
|
616
|
+
logger.info("Job %s ended, reading state from files", job)
|
|
617
|
+
|
|
618
|
+
# Read state from .done/.failed files (contains detailed failure reason)
|
|
619
|
+
state = JobState.from_path(job.path, job.name)
|
|
620
|
+
|
|
621
|
+
# If state is a generic FAILED error, let the process determine
|
|
622
|
+
# the state (it may detect launcher-specific failures like SLURM timeout)
|
|
623
|
+
if (
|
|
624
|
+
state is not None
|
|
625
|
+
and isinstance(state, JobStateError)
|
|
626
|
+
and state.failure_reason == JobFailureStatus.FAILED
|
|
627
|
+
and code is not None
|
|
628
|
+
):
|
|
629
|
+
process_state = process.get_job_state(code)
|
|
630
|
+
if (
|
|
631
|
+
isinstance(process_state, JobStateError)
|
|
632
|
+
and process_state.failure_reason != JobFailureStatus.FAILED
|
|
633
|
+
):
|
|
634
|
+
# Process detected a more specific failure reason
|
|
635
|
+
state = process_state
|
|
636
|
+
|
|
637
|
+
if state is None:
|
|
638
|
+
if code is not None:
|
|
639
|
+
# Fall back to process-specific state detection
|
|
640
|
+
state = process.get_job_state(code)
|
|
641
|
+
else:
|
|
642
|
+
logger.error(
|
|
643
|
+
"No .done or .failed file found for job %s", job
|
|
644
|
+
)
|
|
645
|
+
state = JobState.ERROR
|
|
646
|
+
|
|
647
|
+
except JobError:
|
|
648
|
+
logger.warning("Error while running job")
|
|
649
|
+
state = JobState.ERROR
|
|
650
|
+
|
|
651
|
+
except Exception:
|
|
652
|
+
logger.warning(
|
|
653
|
+
"Error while running job (in experimaestro)", exc_info=True
|
|
654
|
+
)
|
|
655
|
+
state = JobState.ERROR
|
|
656
|
+
|
|
657
|
+
# Locks are released here after job completes
|
|
658
|
+
|
|
659
|
+
# Check if we should restart a resumable task that timed out
|
|
660
|
+
from experimaestro.scheduler.jobs import JobStateError
|
|
661
|
+
|
|
662
|
+
if (
|
|
663
|
+
isinstance(state, JobStateError)
|
|
664
|
+
and state.failure_reason == JobFailureStatus.TIMEOUT
|
|
665
|
+
and job.resumable
|
|
666
|
+
):
|
|
667
|
+
job.retry_count += 1
|
|
668
|
+
if job.retry_count <= job.max_retries:
|
|
669
|
+
logger.info(
|
|
670
|
+
"Resumable task %s timed out - restarting (attempt %d/%d)",
|
|
671
|
+
job,
|
|
672
|
+
job.retry_count,
|
|
673
|
+
job.max_retries,
|
|
674
|
+
)
|
|
675
|
+
# Rotate log files to preserve previous run's logs
|
|
676
|
+
job.rotate_logs()
|
|
677
|
+
# Clear cached process so aio_run() will create a new one
|
|
678
|
+
job._process = None
|
|
679
|
+
# Delete PID file so the job will be resubmitted
|
|
680
|
+
if job.pidpath.exists():
|
|
681
|
+
job.pidpath.unlink()
|
|
682
|
+
# Continue the loop to restart
|
|
683
|
+
continue
|
|
684
|
+
else:
|
|
685
|
+
logger.warning(
|
|
686
|
+
"Resumable task %s exceeded max retries (%d), marking as failed",
|
|
687
|
+
job,
|
|
688
|
+
job.max_retries,
|
|
689
|
+
)
|
|
690
|
+
# Fall through to return the error state
|
|
691
|
+
|
|
692
|
+
# Job finished (success or non-recoverable error)
|
|
693
|
+
# Notify scheduler listeners of job state after job completes
|
|
305
694
|
self.notify_job_state(job)
|
|
306
695
|
return state
|
|
307
|
-
|
|
308
|
-
except Exception:
|
|
309
|
-
logger.warning("Error in scheduler job coordination", exc_info=True)
|
|
310
|
-
return JobState.ERROR
|