experimaestro 2.0.0a8__py3-none-any.whl → 2.0.0b4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of experimaestro might be problematic. Click here for more details.
- experimaestro/__init__.py +10 -11
- experimaestro/annotations.py +167 -206
- experimaestro/cli/__init__.py +130 -5
- experimaestro/cli/filter.py +42 -74
- experimaestro/cli/jobs.py +157 -106
- experimaestro/cli/refactor.py +249 -0
- experimaestro/click.py +0 -1
- experimaestro/commandline.py +19 -3
- experimaestro/connectors/__init__.py +20 -1
- experimaestro/connectors/local.py +12 -0
- experimaestro/core/arguments.py +182 -46
- experimaestro/core/identifier.py +107 -6
- experimaestro/core/objects/__init__.py +6 -0
- experimaestro/core/objects/config.py +542 -25
- experimaestro/core/objects/config_walk.py +20 -0
- experimaestro/core/serialization.py +91 -34
- experimaestro/core/subparameters.py +164 -0
- experimaestro/core/types.py +175 -38
- experimaestro/exceptions.py +26 -0
- experimaestro/experiments/cli.py +107 -25
- experimaestro/generators.py +50 -9
- experimaestro/huggingface.py +3 -1
- experimaestro/launcherfinder/parser.py +29 -0
- experimaestro/launchers/__init__.py +26 -1
- experimaestro/launchers/direct.py +12 -0
- experimaestro/launchers/slurm/base.py +154 -2
- experimaestro/mkdocs/metaloader.py +0 -1
- experimaestro/mypy.py +452 -7
- experimaestro/notifications.py +63 -13
- experimaestro/progress.py +0 -2
- experimaestro/rpyc.py +0 -1
- experimaestro/run.py +19 -6
- experimaestro/scheduler/base.py +489 -125
- experimaestro/scheduler/dependencies.py +43 -28
- experimaestro/scheduler/dynamic_outputs.py +259 -130
- experimaestro/scheduler/experiment.py +225 -30
- experimaestro/scheduler/interfaces.py +474 -0
- experimaestro/scheduler/jobs.py +216 -206
- experimaestro/scheduler/services.py +186 -12
- experimaestro/scheduler/state_db.py +388 -0
- experimaestro/scheduler/state_provider.py +2345 -0
- experimaestro/scheduler/state_sync.py +834 -0
- experimaestro/scheduler/workspace.py +52 -10
- experimaestro/scriptbuilder.py +7 -0
- experimaestro/server/__init__.py +147 -57
- experimaestro/server/data/index.css +0 -125
- experimaestro/server/data/index.css.map +1 -1
- experimaestro/server/data/index.js +194 -58
- experimaestro/server/data/index.js.map +1 -1
- experimaestro/settings.py +44 -5
- experimaestro/sphinx/__init__.py +3 -3
- experimaestro/taskglobals.py +20 -0
- experimaestro/tests/conftest.py +80 -0
- experimaestro/tests/core/test_generics.py +2 -2
- experimaestro/tests/identifier_stability.json +45 -0
- experimaestro/tests/launchers/bin/sacct +6 -2
- experimaestro/tests/launchers/bin/sbatch +4 -2
- experimaestro/tests/launchers/test_slurm.py +80 -0
- experimaestro/tests/tasks/test_dynamic.py +231 -0
- experimaestro/tests/test_cli_jobs.py +615 -0
- experimaestro/tests/test_deprecated.py +630 -0
- experimaestro/tests/test_environment.py +200 -0
- experimaestro/tests/test_file_progress_integration.py +1 -1
- experimaestro/tests/test_forward.py +3 -3
- experimaestro/tests/test_identifier.py +372 -41
- experimaestro/tests/test_identifier_stability.py +458 -0
- experimaestro/tests/test_instance.py +3 -3
- experimaestro/tests/test_multitoken.py +442 -0
- experimaestro/tests/test_mypy.py +433 -0
- experimaestro/tests/test_objects.py +312 -5
- experimaestro/tests/test_outputs.py +2 -2
- experimaestro/tests/test_param.py +8 -12
- experimaestro/tests/test_partial_paths.py +231 -0
- experimaestro/tests/test_progress.py +0 -48
- experimaestro/tests/test_resumable_task.py +480 -0
- experimaestro/tests/test_serializers.py +141 -1
- experimaestro/tests/test_state_db.py +434 -0
- experimaestro/tests/test_subparameters.py +160 -0
- experimaestro/tests/test_tags.py +136 -0
- experimaestro/tests/test_tasks.py +107 -121
- experimaestro/tests/test_token_locking.py +252 -0
- experimaestro/tests/test_tokens.py +17 -13
- experimaestro/tests/test_types.py +123 -1
- experimaestro/tests/test_workspace_triggers.py +158 -0
- experimaestro/tests/token_reschedule.py +4 -2
- experimaestro/tests/utils.py +2 -2
- experimaestro/tokens.py +154 -57
- experimaestro/tools/diff.py +1 -1
- experimaestro/tui/__init__.py +8 -0
- experimaestro/tui/app.py +2303 -0
- experimaestro/tui/app.tcss +353 -0
- experimaestro/tui/log_viewer.py +228 -0
- experimaestro/utils/__init__.py +23 -0
- experimaestro/utils/environment.py +148 -0
- experimaestro/utils/git.py +129 -0
- experimaestro/utils/resources.py +1 -1
- experimaestro/version.py +34 -0
- {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b4.dist-info}/METADATA +68 -38
- experimaestro-2.0.0b4.dist-info/RECORD +181 -0
- {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b4.dist-info}/WHEEL +1 -1
- experimaestro-2.0.0b4.dist-info/entry_points.txt +16 -0
- experimaestro/compat.py +0 -6
- experimaestro/core/objects.pyi +0 -221
- experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
- experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
- experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
- experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
- experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
- experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
- experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
- experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
- experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
- experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
- experimaestro-2.0.0a8.dist-info/RECORD +0 -166
- experimaestro-2.0.0a8.dist-info/entry_points.txt +0 -17
- {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b4.dist-info}/licenses/LICENSE +0 -0
experimaestro/scheduler/base.py
CHANGED
|
@@ -1,15 +1,16 @@
|
|
|
1
|
-
import logging
|
|
2
1
|
import threading
|
|
3
2
|
import time
|
|
4
3
|
from typing import (
|
|
5
4
|
Optional,
|
|
6
5
|
Set,
|
|
6
|
+
ClassVar,
|
|
7
|
+
TYPE_CHECKING,
|
|
7
8
|
)
|
|
8
9
|
import asyncio
|
|
9
10
|
from typing import Dict
|
|
10
11
|
|
|
11
12
|
from experimaestro.scheduler import experiment
|
|
12
|
-
from experimaestro.scheduler.jobs import Job, JobState
|
|
13
|
+
from experimaestro.scheduler.jobs import Job, JobState, JobError
|
|
13
14
|
from experimaestro.scheduler.services import Service
|
|
14
15
|
|
|
15
16
|
|
|
@@ -17,6 +18,11 @@ from experimaestro.utils import logger
|
|
|
17
18
|
from experimaestro.utils.asyncio import asyncThreadcheck
|
|
18
19
|
import concurrent.futures
|
|
19
20
|
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
from experimaestro.server import Server
|
|
23
|
+
from experimaestro.settings import ServerSettings
|
|
24
|
+
from experimaestro.scheduler.workspace import Workspace
|
|
25
|
+
|
|
20
26
|
|
|
21
27
|
class Listener:
|
|
22
28
|
def job_submitted(self, job):
|
|
@@ -31,18 +37,24 @@ class Listener:
|
|
|
31
37
|
|
|
32
38
|
|
|
33
39
|
class Scheduler(threading.Thread):
|
|
34
|
-
"""A job scheduler
|
|
40
|
+
"""A job scheduler (singleton)
|
|
35
41
|
|
|
36
|
-
The scheduler is based on asyncio for easy concurrency handling
|
|
42
|
+
The scheduler is based on asyncio for easy concurrency handling.
|
|
43
|
+
This is a singleton - only one scheduler instance exists per process.
|
|
37
44
|
"""
|
|
38
45
|
|
|
39
|
-
|
|
46
|
+
_instance: ClassVar[Optional["Scheduler"]] = None
|
|
47
|
+
_lock: ClassVar[threading.Lock] = threading.Lock()
|
|
48
|
+
|
|
49
|
+
def __init__(self, name: str = "Global"):
|
|
40
50
|
super().__init__(name=f"Scheduler ({name})", daemon=True)
|
|
41
51
|
self._ready = threading.Event()
|
|
42
52
|
|
|
43
|
-
# Name of the
|
|
53
|
+
# Name of the scheduler
|
|
44
54
|
self.name = name
|
|
45
|
-
|
|
55
|
+
|
|
56
|
+
# Track experiments (simple dict for now)
|
|
57
|
+
self.experiments: Dict[str, "experiment"] = {}
|
|
46
58
|
|
|
47
59
|
# Exit mode activated
|
|
48
60
|
self.exitmode = False
|
|
@@ -53,16 +65,101 @@ class Scheduler(threading.Thread):
|
|
|
53
65
|
# List of jobs
|
|
54
66
|
self.waitingjobs: Set[Job] = set()
|
|
55
67
|
|
|
56
|
-
# Listeners
|
|
57
|
-
self.
|
|
68
|
+
# Listeners with thread-safe access
|
|
69
|
+
self._listeners: Set[Listener] = set()
|
|
70
|
+
self._listeners_lock = threading.Lock()
|
|
71
|
+
|
|
72
|
+
# Notification thread pool (single worker to serialize notifications)
|
|
73
|
+
self._notification_executor = concurrent.futures.ThreadPoolExecutor(
|
|
74
|
+
max_workers=1, thread_name_prefix="NotificationWorker"
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
# Server (managed by scheduler)
|
|
78
|
+
self.server: Optional["Server"] = None
|
|
58
79
|
|
|
59
80
|
@staticmethod
|
|
60
|
-
def
|
|
61
|
-
instance
|
|
81
|
+
def has_instance() -> bool:
    """Report whether the global scheduler has already been created.

    This only reads ``Scheduler._instance``; it never creates a scheduler.
    """
    exists = Scheduler._instance is not None
    return exists
|
|
84
|
+
|
|
85
|
+
@staticmethod
|
|
86
|
+
def instance() -> "Scheduler":
    """Return the global scheduler, creating and starting it on first use.

    The first, unlocked test is the fast path once a scheduler exists. The
    second test, made while holding ``Scheduler._lock``, ensures that only
    one caller runs ``Scheduler._create()`` when several threads race on
    the first call.
    """
    if Scheduler._instance is not None:
        return Scheduler._instance

    with Scheduler._lock:
        if Scheduler._instance is None:
            Scheduler._instance = Scheduler._create()
        return Scheduler._instance
|
|
93
|
+
|
|
94
|
+
@staticmethod
|
|
95
|
+
def _create(name: str = "Global"):
    """Build a scheduler, start its thread and wait until it is ready.

    Args:
        name: Name passed to the ``Scheduler`` constructor

    Returns:
        The started scheduler
    """
    scheduler = Scheduler(name)
    scheduler.start()
    # run() sets ``_ready`` once its loop-dependent attributes exist, so
    # callers never receive a scheduler that is only half initialised
    scheduler._ready.wait()
    return scheduler
|
|
65
101
|
|
|
102
|
+
@staticmethod
|
|
103
|
+
def create(xp: "experiment" = None, name: str = "Global"):
    """Create or get the scheduler instance

    Args:
        xp: (Deprecated) Experiment reference, ignored
        name: Name for the scheduler (only used on first creation)

    Returns:
        The global scheduler instance
    """
    # Same check / lock / re-check sequence as instance(), but forwarding
    # ``name`` so that it is honoured when this call is the one that
    # creates the scheduler (it used to be silently dropped).
    if Scheduler._instance is None:
        with Scheduler._lock:
            if Scheduler._instance is None:
                Scheduler._instance = Scheduler._create(name)
    return Scheduler._instance
|
|
114
|
+
|
|
115
|
+
def register_experiment(self, xp: "experiment"):
    """Record an experiment in ``self.experiments``.

    The entry is keyed by ``xp.workdir.name``; registering another
    experiment under the same key replaces the previous entry.
    """
    xp_key = xp.workdir.name
    self.experiments[xp_key] = xp
    logger.debug("Registered experiment %s with scheduler", xp_key)
|
|
122
|
+
|
|
123
|
+
def unregister_experiment(self, xp: "experiment"):
    """Remove an experiment from ``self.experiments`` if it is registered.

    The lookup key is ``xp.workdir.name``; an unknown key is ignored.
    """
    xp_key = xp.workdir.name
    if xp_key not in self.experiments:
        return

    del self.experiments[xp_key]
    # NOTE(review): assumes the log call belongs to the "was registered"
    # branch - confirm against the original indentation
    logger.debug("Unregistered experiment %s from scheduler", xp_key)
|
|
129
|
+
|
|
130
|
+
def start_server(
    self, settings: "ServerSettings" = None, workspace: "Workspace" = None
):
    """Start the notification server unless one is already attached.

    Args:
        settings: Server settings, forwarded to ``Server.instance``
        workspace: Workspace whose ``path`` selects the state provider

    Raises:
        ValueError: if a server must be started and ``workspace`` is None
    """
    if self.server is not None:
        logger.debug("Server already running")
        return

    # Imported here rather than at module level (the module only imports
    # Server under TYPE_CHECKING)
    from experimaestro.server import Server
    from experimaestro.scheduler.state_provider import WorkspaceStateProvider

    if workspace is None:
        raise ValueError("workspace parameter is required to start server")

    # Get the workspace state provider singleton
    provider = WorkspaceStateProvider.get_instance(
        workspace.path, read_only=False, sync_on_start=False
    )

    # The attribute is assigned before start(), so it stays set even if
    # start() raises
    server = Server.instance(settings, provider)
    self.server = server
    server.start()
    logger.info("Server started by scheduler")
|
|
156
|
+
|
|
157
|
+
def stop_server(self):
    """Stop the notification server if one was started; otherwise no-op."""
    server = self.server
    if server is None:
        return

    # NOTE(review): self.server is left set after stop(), so a later
    # start_server() only logs "Server already running" - confirm that
    # this is intended
    server.stop()
    logger.info("Server stopped by scheduler")
|
|
162
|
+
|
|
66
163
|
def run(self):
|
|
67
164
|
"""Run the event loop forever"""
|
|
68
165
|
logger.debug("Starting event loop thread")
|
|
@@ -72,6 +169,10 @@ class Scheduler(threading.Thread):
|
|
|
72
169
|
# Set loop-dependent variables
|
|
73
170
|
self.exitCondition = asyncio.Condition()
|
|
74
171
|
self.dependencyLock = asyncio.Lock()
|
|
172
|
+
|
|
173
|
+
# Note: State provider removed - now managed at workspace level
|
|
174
|
+
# Each experiment has its own workspace with database
|
|
175
|
+
|
|
75
176
|
self._ready.set()
|
|
76
177
|
self.loop.run_forever()
|
|
77
178
|
|
|
@@ -84,10 +185,17 @@ class Scheduler(threading.Thread):
|
|
|
84
185
|
logger.warning("Scheduler already started")
|
|
85
186
|
|
|
86
187
|
def addlistener(self, listener: Listener):
|
|
87
|
-
self.
|
|
188
|
+
with self._listeners_lock:
|
|
189
|
+
self._listeners.add(listener)
|
|
88
190
|
|
|
89
191
|
def removelistener(self, listener: Listener):
|
|
90
|
-
self.
|
|
192
|
+
with self._listeners_lock:
|
|
193
|
+
self._listeners.discard(listener)
|
|
194
|
+
|
|
195
|
+
def clear_listeners(self):
    """Remove every registered listener (intended for tests)."""
    guard = self._listeners_lock
    with guard:
        registered = self._listeners
        registered.clear()
|
|
91
199
|
|
|
92
200
|
def getJobState(self, job: Job) -> "concurrent.futures.Future[JobState]":
|
|
93
201
|
# Check if the job belongs to this scheduler
|
|
@@ -104,17 +212,25 @@ class Scheduler(threading.Thread):
|
|
|
104
212
|
|
|
105
213
|
def submit(self, job: Job) -> Optional[Job]:
|
|
106
214
|
# Wait for the future containing the submitted job
|
|
107
|
-
logger.debug("
|
|
215
|
+
logger.debug("Submit job %s to the scheduler", job)
|
|
108
216
|
otherFuture = asyncio.run_coroutine_threadsafe(
|
|
109
217
|
self.aio_registerJob(job), self.loop
|
|
110
218
|
)
|
|
111
219
|
other = otherFuture.result()
|
|
112
220
|
logger.debug("Job already submitted" if other else "First submission")
|
|
113
|
-
|
|
114
|
-
|
|
221
|
+
|
|
222
|
+
# Only returns if job was already submitted and doesn't need reprocessing
|
|
223
|
+
if other is not None:
|
|
224
|
+
# If state is WAITING, it was just reset for resubmission and needs processing
|
|
225
|
+
# If state is RUNNING or finished (DONE), no need to reprocess
|
|
226
|
+
if other.state != JobState.WAITING:
|
|
227
|
+
return other
|
|
228
|
+
# Use 'other' for resubmission since it has the correct experiments list
|
|
229
|
+
job = other
|
|
115
230
|
|
|
116
231
|
job._future = asyncio.run_coroutine_threadsafe(self.aio_submit(job), self.loop)
|
|
117
|
-
|
|
232
|
+
|
|
233
|
+
return other
|
|
118
234
|
|
|
119
235
|
def prepare(self, job: Job):
|
|
120
236
|
"""Prepares the job for running"""
|
|
@@ -129,49 +245,99 @@ class Scheduler(threading.Thread):
|
|
|
129
245
|
|
|
130
246
|
if self.exitmode:
|
|
131
247
|
logger.warning("Exit mode: not submitting")
|
|
248
|
+
return
|
|
132
249
|
|
|
133
|
-
|
|
250
|
+
# Job was already submitted
|
|
251
|
+
if job.identifier in self.jobs:
|
|
134
252
|
other = self.jobs[job.identifier]
|
|
135
253
|
assert job.type == other.type
|
|
136
|
-
|
|
254
|
+
|
|
255
|
+
# Add current experiment to the existing job's experiments list
|
|
256
|
+
xp = experiment.current()
|
|
257
|
+
xp.add_job(other)
|
|
258
|
+
|
|
259
|
+
# Copy watched outputs from new job to existing job
|
|
260
|
+
# This ensures new callbacks are registered even for resubmitted jobs
|
|
261
|
+
other.watched_outputs.extend(job.watched_outputs)
|
|
262
|
+
|
|
263
|
+
if other.state.is_error():
|
|
137
264
|
logger.info("Re-submitting job")
|
|
265
|
+
# Clean up old process info so it will be re-started
|
|
266
|
+
other._process = None
|
|
267
|
+
if other.pidpath.is_file():
|
|
268
|
+
other.pidpath.unlink()
|
|
269
|
+
# Use set_state to handle experiment statistics updates
|
|
270
|
+
other.set_state(JobState.WAITING)
|
|
271
|
+
self.notify_job_state(other) # Notify listeners of re-submit
|
|
138
272
|
else:
|
|
139
273
|
logger.warning("Job %s already submitted", job.identifier)
|
|
140
|
-
return other
|
|
141
274
|
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
275
|
+
# Returns the previous job
|
|
276
|
+
return other
|
|
277
|
+
|
|
278
|
+
# Register this job
|
|
279
|
+
xp = experiment.current()
|
|
280
|
+
self.jobs[job.identifier] = job
|
|
281
|
+
# Set submittime now so that add_job can record it in the database
|
|
282
|
+
# (aio_submit may update this later for re-submitted jobs)
|
|
283
|
+
job.submittime = time.time()
|
|
284
|
+
xp.add_job(job)
|
|
285
|
+
|
|
286
|
+
# Set up dependencies
|
|
287
|
+
for dependency in job.dependencies:
|
|
288
|
+
dependency.target = job
|
|
289
|
+
dependency.origin.dependents.add(dependency)
|
|
146
290
|
|
|
147
291
|
return None
|
|
148
292
|
|
|
293
|
+
def _notify_listeners(self, notification_func, job: "Job | Service"):
    """Queue one notification for delivery to every registered listener.

    The delivery is submitted to ``self._notification_executor``, so this
    method returns without waiting for the listeners to be called.

    Args:
        notification_func: Callable invoked as
            ``notification_func(listener, job)`` for each listener
        job: Object handed to the listeners; despite its name this is not
            always a job, since ``notify_service_add`` passes a ``Service``
    """

    def _deliver():
        # Copy under the lock so that listeners can be added or removed
        # while the callbacks below are running
        with self._listeners_lock:
            targets = list(self._listeners)

        for target in targets:
            try:
                notification_func(target, job)
            except Exception:
                # A failing listener must not prevent the remaining ones
                # from being notified
                logger.exception("Got an error with listener %s", target)

    self._notification_executor.submit(_deliver)
|
|
312
|
+
|
|
149
313
|
def notify_job_submitted(self, job: Job):
|
|
150
314
|
"""Notify the listeners that a job has been submitted"""
|
|
151
|
-
|
|
152
|
-
try:
|
|
153
|
-
listener.job_submitted(job)
|
|
154
|
-
except Exception:
|
|
155
|
-
logger.exception("Got an error with listener %s", listener)
|
|
315
|
+
self._notify_listeners(lambda lst, j: lst.job_submitted(j), job)
|
|
156
316
|
|
|
157
317
|
def notify_job_state(self, job: Job):
|
|
158
318
|
"""Notify the listeners that a job has changed state"""
|
|
159
|
-
|
|
160
|
-
try:
|
|
161
|
-
listener.job_state(job)
|
|
162
|
-
except Exception:
|
|
163
|
-
logger.exception("Got an error with listener %s", listener)
|
|
319
|
+
self._notify_listeners(lambda lst, j: lst.job_state(j), job)
|
|
164
320
|
|
|
165
|
-
|
|
321
|
+
def notify_service_add(self, service: Service):
    """Tell every listener, through ``_notify_listeners``, about a new service."""

    def _announce(listener, added):
        return listener.service_add(added)

    self._notify_listeners(_announce, service)
|
|
324
|
+
|
|
325
|
+
async def aio_submit(self, job: Job) -> JobState:
|
|
166
326
|
"""Main scheduler function: submit a job, run it (if needed), and returns
|
|
167
327
|
the status code
|
|
168
328
|
"""
|
|
329
|
+
from experimaestro.scheduler.jobs import JobStateError, JobFailureStatus
|
|
330
|
+
|
|
169
331
|
logger.info("Submitting job %s", job)
|
|
170
|
-
job._readyEvent = asyncio.Event()
|
|
171
332
|
job.submittime = time.time()
|
|
172
333
|
job.scheduler = self
|
|
173
334
|
self.waitingjobs.add(job)
|
|
174
335
|
|
|
336
|
+
# Register watched outputs now that the job has a scheduler
|
|
337
|
+
job.register_watched_outputs()
|
|
338
|
+
|
|
339
|
+
# Note: Job metadata will be written after directory is created in aio_start
|
|
340
|
+
|
|
175
341
|
# Check that we don't have a completed job in
|
|
176
342
|
# alternate directories
|
|
177
343
|
for jobspath in experiment.current().alt_jobspaths:
|
|
@@ -185,126 +351,324 @@ class Scheduler(threading.Thread):
|
|
|
185
351
|
path.unlink()
|
|
186
352
|
path.symlink_to(job.path)
|
|
187
353
|
|
|
188
|
-
job.
|
|
189
|
-
|
|
354
|
+
job.set_state(JobState.WAITING)
|
|
190
355
|
self.notify_job_submitted(job)
|
|
191
356
|
|
|
192
|
-
#
|
|
193
|
-
if job.dependencies:
|
|
194
|
-
job.unsatisfied = len(job.dependencies)
|
|
195
|
-
|
|
196
|
-
for dependency in job.dependencies:
|
|
197
|
-
dependency.target = job
|
|
198
|
-
dependency.loop = self.loop
|
|
199
|
-
dependency.origin.dependents.add(dependency)
|
|
200
|
-
dependency.check()
|
|
201
|
-
else:
|
|
202
|
-
job._readyEvent.set()
|
|
203
|
-
job.state = JobState.READY
|
|
204
|
-
|
|
357
|
+
# Check if already done
|
|
205
358
|
if job.donepath.exists():
|
|
206
|
-
job.
|
|
359
|
+
job.set_state(JobState.DONE)
|
|
360
|
+
self.notify_job_state(job) # Notify listeners of done state
|
|
207
361
|
|
|
208
362
|
# Check if we have a running process
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
363
|
+
if not job.state.finished():
|
|
364
|
+
process = await job.aio_process()
|
|
365
|
+
if process is not None:
|
|
366
|
+
# Notify listeners that job is running
|
|
367
|
+
job.set_state(JobState.RUNNING)
|
|
368
|
+
self.notify_job_state(job)
|
|
369
|
+
|
|
370
|
+
# Adds to the listeners
|
|
371
|
+
if self.server is not None:
|
|
372
|
+
job.add_notification_server(self.server)
|
|
373
|
+
|
|
374
|
+
# And now, we wait...
|
|
375
|
+
logger.info("Got a process for job %s - waiting to complete", job)
|
|
376
|
+
code = await process.aio_code()
|
|
377
|
+
logger.info("Job %s completed with code %s", job, code)
|
|
378
|
+
|
|
379
|
+
# Record exit code if available
|
|
380
|
+
if code is not None:
|
|
381
|
+
job.exit_code = code
|
|
382
|
+
|
|
383
|
+
# Read state from .done/.failed files (contains detailed failure reason)
|
|
384
|
+
state = JobState.from_path(job.path, job.name)
|
|
385
|
+
|
|
386
|
+
# If state is a generic FAILED error, let the process determine
|
|
387
|
+
# the state (it may detect launcher-specific failures like SLURM timeout)
|
|
388
|
+
if (
|
|
389
|
+
state is not None
|
|
390
|
+
and isinstance(state, JobStateError)
|
|
391
|
+
and state.failure_reason == JobFailureStatus.FAILED
|
|
392
|
+
and code is not None
|
|
393
|
+
):
|
|
394
|
+
process_state = process.get_job_state(code)
|
|
395
|
+
if (
|
|
396
|
+
isinstance(process_state, JobStateError)
|
|
397
|
+
and process_state.failure_reason != JobFailureStatus.FAILED
|
|
398
|
+
):
|
|
399
|
+
# Process detected a more specific failure reason
|
|
400
|
+
state = process_state
|
|
242
401
|
|
|
243
402
|
if state is None:
|
|
244
|
-
|
|
245
|
-
|
|
403
|
+
if code is not None:
|
|
404
|
+
# Fall back to process-specific state detection
|
|
405
|
+
state = process.get_job_state(code)
|
|
406
|
+
else:
|
|
407
|
+
logger.error("No .done or .failed file found for job %s", job)
|
|
408
|
+
state = JobState.ERROR
|
|
409
|
+
# Set endtime before set_state so database gets the timestamp
|
|
410
|
+
job.endtime = time.time()
|
|
411
|
+
job.set_state(state)
|
|
412
|
+
self.notify_job_state(job) # Notify listeners of final state
|
|
413
|
+
|
|
414
|
+
# If not done or running, start the job
|
|
415
|
+
if not job.state.finished():
|
|
416
|
+
try:
|
|
417
|
+
state = await self.aio_start(job)
|
|
418
|
+
# Set endtime before set_state so database gets the timestamp
|
|
419
|
+
job.endtime = time.time()
|
|
420
|
+
job.set_state(state)
|
|
421
|
+
except Exception:
|
|
422
|
+
logger.exception("Got an exception while starting the job")
|
|
423
|
+
raise
|
|
246
424
|
|
|
247
|
-
|
|
425
|
+
# Job is finished - experiment statistics already updated by set_state
|
|
248
426
|
|
|
249
|
-
|
|
427
|
+
# Write final metadata with end time and final state
|
|
428
|
+
job.write_metadata()
|
|
250
429
|
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
self.xp.failedJobs[job.identifier] = job
|
|
430
|
+
if job in self.waitingjobs:
|
|
431
|
+
self.waitingjobs.remove(job)
|
|
254
432
|
|
|
255
|
-
# Process all remaining
|
|
433
|
+
# Process all remaining task outputs BEFORE notifying exit condition
|
|
434
|
+
# This ensures taskOutputQueueSize is updated before wait() can check it,
|
|
435
|
+
# preventing a race where wait() sees both unfinishedJobs==0 and
|
|
436
|
+
# taskOutputQueueSize==0 before callbacks have been queued.
|
|
256
437
|
await asyncThreadcheck("End of job processing", job.done_handler)
|
|
257
438
|
|
|
258
|
-
#
|
|
259
|
-
self.xp.unfinishedJobs -= 1
|
|
439
|
+
# Now notify - wait() will see the correct taskOutputQueueSize
|
|
260
440
|
async with self.exitCondition:
|
|
261
|
-
logging.debug("Updated number of unfinished jobs")
|
|
262
441
|
self.exitCondition.notify_all()
|
|
263
442
|
|
|
264
|
-
job.endtime = time.time()
|
|
265
|
-
if job in self.waitingjobs:
|
|
266
|
-
self.waitingjobs.remove(job)
|
|
267
|
-
|
|
268
|
-
with job.dependents as dependents:
|
|
269
|
-
logger.info("Processing %d dependent jobs", len(dependents))
|
|
270
|
-
for dependency in dependents:
|
|
271
|
-
logger.debug("Checking dependency %s", dependency)
|
|
272
|
-
self.loop.call_soon(dependency.check)
|
|
273
|
-
|
|
274
443
|
return job.state
|
|
275
444
|
|
|
276
|
-
async def aio_start(self, job: Job) -> Optional[JobState]:
|
|
277
|
-
"""Start a job
|
|
445
|
+
async def aio_start(self, job: Job) -> Optional[JobState]: # noqa: C901
|
|
446
|
+
"""Start a job with full job starting logic
|
|
278
447
|
|
|
279
|
-
This method
|
|
280
|
-
job
|
|
281
|
-
|
|
448
|
+
This method handles job locking, dependency acquisition, directory setup,
|
|
449
|
+
and job execution while using the scheduler's coordination lock to prevent
|
|
450
|
+
race conditions between multiple jobs.
|
|
282
451
|
|
|
283
452
|
:param job: The job to start
|
|
284
453
|
:return: JobState.WAITING if dependencies could not be locked, JobState.DONE
|
|
285
454
|
if job completed successfully, JobState.ERROR if job failed during execution,
|
|
286
455
|
or None (should not occur in normal operation)
|
|
287
|
-
:raises Exception: Various exceptions during
|
|
456
|
+
:raises Exception: Various exceptions during job execution, dependency locking,
|
|
457
|
+
or process creation
|
|
288
458
|
"""
|
|
459
|
+
from experimaestro.scheduler.jobs import JobStateError
|
|
460
|
+
from experimaestro.locking import Locks, LockError
|
|
461
|
+
from experimaestro.scheduler.jobs import JobFailureStatus
|
|
289
462
|
|
|
290
463
|
# Assert preconditions
|
|
291
464
|
assert job.launcher is not None
|
|
292
465
|
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
466
|
+
# Restart loop for resumable tasks that timeout
|
|
467
|
+
while True:
|
|
468
|
+
logger.debug(
|
|
469
|
+
"Starting job %s with %d dependencies",
|
|
470
|
+
job,
|
|
471
|
+
len(job.dependencies),
|
|
298
472
|
)
|
|
299
473
|
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
474
|
+
# Separate static and dynamic dependencies
|
|
475
|
+
static_deps = [d for d in job.dependencies if not d.is_dynamic()]
|
|
476
|
+
dynamic_deps = [d for d in job.dependencies if d.is_dynamic()]
|
|
303
477
|
|
|
304
|
-
#
|
|
478
|
+
# First, wait for all static dependencies (jobs) to complete
|
|
479
|
+
# These don't need the dependency lock as they can't change state
|
|
480
|
+
# Static dependency locks don't need to be added to locks list
|
|
481
|
+
logger.debug("Waiting for %d static dependencies", len(static_deps))
|
|
482
|
+
for dependency in static_deps:
|
|
483
|
+
logger.debug("Waiting for static dependency %s", dependency)
|
|
484
|
+
try:
|
|
485
|
+
await dependency.aio_lock()
|
|
486
|
+
except RuntimeError as e:
|
|
487
|
+
# Dependency failed - mark job as failed due to dependency
|
|
488
|
+
logger.warning("Dependency failed: %s", e)
|
|
489
|
+
return JobStateError(JobFailureStatus.DEPENDENCY)
|
|
490
|
+
|
|
491
|
+
# We first lock the job before proceeding
|
|
492
|
+
with Locks() as locks:
|
|
493
|
+
logger.debug("[starting] Locking job %s", job)
|
|
494
|
+
async with job.launcher.connector.lock(job.lockpath):
|
|
495
|
+
logger.debug("[starting] Locked job %s", job)
|
|
496
|
+
|
|
497
|
+
state = None
|
|
498
|
+
try:
|
|
499
|
+
# Now handle dynamic dependencies (tokens) with retry logic
|
|
500
|
+
# CRITICAL: Only one task at a time can acquire dynamic dependencies
|
|
501
|
+
# to prevent deadlocks (e.g., Task A holds Token1 waiting for Token2,
|
|
502
|
+
# Task B holds Token2 waiting for Token1)
|
|
503
|
+
if dynamic_deps:
|
|
504
|
+
async with self.dependencyLock:
|
|
505
|
+
logger.debug(
|
|
506
|
+
"Locking %d dynamic dependencies (tokens)",
|
|
507
|
+
len(dynamic_deps),
|
|
508
|
+
)
|
|
509
|
+
while True:
|
|
510
|
+
all_locked = True
|
|
511
|
+
for idx, dependency in enumerate(dynamic_deps):
|
|
512
|
+
try:
|
|
513
|
+
# Use timeout=0 for first dependency, 0.1s for subsequent
|
|
514
|
+
timeout = 0 if idx == 0 else 0.1
|
|
515
|
+
# Acquire the lock (this might block on IPC locks)
|
|
516
|
+
lock = await dependency.aio_lock(
|
|
517
|
+
timeout=timeout
|
|
518
|
+
)
|
|
519
|
+
locks.append(lock)
|
|
520
|
+
except LockError:
|
|
521
|
+
logger.info(
|
|
522
|
+
"Could not lock %s, retrying",
|
|
523
|
+
dependency,
|
|
524
|
+
)
|
|
525
|
+
# Release all locks and restart
|
|
526
|
+
for lock in locks.locks:
|
|
527
|
+
lock.release()
|
|
528
|
+
locks.locks.clear()
|
|
529
|
+
# Put failed dependency first
|
|
530
|
+
dynamic_deps.remove(dependency)
|
|
531
|
+
dynamic_deps.insert(0, dependency)
|
|
532
|
+
all_locked = False
|
|
533
|
+
break
|
|
534
|
+
|
|
535
|
+
if all_locked:
|
|
536
|
+
# All locks acquired successfully
|
|
537
|
+
break
|
|
538
|
+
|
|
539
|
+
# Dependencies have been locked, we can start the job
|
|
540
|
+
job.starttime = time.time()
|
|
541
|
+
|
|
542
|
+
# Creates the main directory
|
|
543
|
+
directory = job.path
|
|
544
|
+
logger.debug("Making directories job %s...", directory)
|
|
545
|
+
|
|
546
|
+
# Warn about directory cleanup for non-resumable tasks
|
|
547
|
+
# (only once per task type)
|
|
548
|
+
xpmtype = job.config.__xpmtype__
|
|
549
|
+
if (
|
|
550
|
+
directory.is_dir()
|
|
551
|
+
and not job.resumable
|
|
552
|
+
and not xpmtype.warned_clean_not_resumable
|
|
553
|
+
):
|
|
554
|
+
xpmtype.warned_clean_not_resumable = True
|
|
555
|
+
logger.warning(
|
|
556
|
+
"In a future version, directory will be cleaned up for "
|
|
557
|
+
"non-resumable tasks (%s). Use ResumableTask if you want "
|
|
558
|
+
"to preserve the directory contents.",
|
|
559
|
+
xpmtype.identifier,
|
|
560
|
+
)
|
|
561
|
+
|
|
562
|
+
if not directory.is_dir():
|
|
563
|
+
directory.mkdir(parents=True, exist_ok=True)
|
|
564
|
+
|
|
565
|
+
# Write metadata with submit and start time (after directory creation)
|
|
566
|
+
job.write_metadata()
|
|
567
|
+
|
|
568
|
+
# Sets up the notification URL
|
|
569
|
+
if self.server is not None:
|
|
570
|
+
job.add_notification_server(self.server)
|
|
571
|
+
|
|
572
|
+
except Exception:
|
|
573
|
+
logger.warning("Error while locking job", exc_info=True)
|
|
574
|
+
return JobState.WAITING
|
|
575
|
+
|
|
576
|
+
try:
|
|
577
|
+
# Runs the job
|
|
578
|
+
process = await job.aio_run()
|
|
579
|
+
except Exception:
|
|
580
|
+
logger.warning("Error while starting job", exc_info=True)
|
|
581
|
+
return JobState.ERROR
|
|
582
|
+
|
|
583
|
+
# Wait for job to complete while holding locks
|
|
584
|
+
try:
|
|
585
|
+
logger.debug("Waiting for job %s process to end", job)
|
|
586
|
+
|
|
587
|
+
code = await process.aio_code()
|
|
588
|
+
logger.debug("Got return code %s for %s", code, job)
|
|
589
|
+
|
|
590
|
+
# Record exit code if available
|
|
591
|
+
if code is not None:
|
|
592
|
+
logger.info("Job %s ended with code %s", job, code)
|
|
593
|
+
job.exit_code = code
|
|
594
|
+
else:
|
|
595
|
+
logger.info("Job %s ended, reading state from files", job)
|
|
596
|
+
|
|
597
|
+
# Read state from .done/.failed files (contains detailed failure reason)
|
|
598
|
+
state = JobState.from_path(job.path, job.name)
|
|
599
|
+
|
|
600
|
+
# If state is a generic FAILED error, let the process determine
|
|
601
|
+
# the state (it may detect launcher-specific failures like SLURM timeout)
|
|
602
|
+
if (
|
|
603
|
+
state is not None
|
|
604
|
+
and isinstance(state, JobStateError)
|
|
605
|
+
and state.failure_reason == JobFailureStatus.FAILED
|
|
606
|
+
and code is not None
|
|
607
|
+
):
|
|
608
|
+
process_state = process.get_job_state(code)
|
|
609
|
+
if (
|
|
610
|
+
isinstance(process_state, JobStateError)
|
|
611
|
+
and process_state.failure_reason != JobFailureStatus.FAILED
|
|
612
|
+
):
|
|
613
|
+
# Process detected a more specific failure reason
|
|
614
|
+
state = process_state
|
|
615
|
+
|
|
616
|
+
if state is None:
|
|
617
|
+
if code is not None:
|
|
618
|
+
# Fall back to process-specific state detection
|
|
619
|
+
state = process.get_job_state(code)
|
|
620
|
+
else:
|
|
621
|
+
logger.error(
|
|
622
|
+
"No .done or .failed file found for job %s", job
|
|
623
|
+
)
|
|
624
|
+
state = JobState.ERROR
|
|
625
|
+
|
|
626
|
+
except JobError:
|
|
627
|
+
logger.warning("Error while running job")
|
|
628
|
+
state = JobState.ERROR
|
|
629
|
+
|
|
630
|
+
except Exception:
|
|
631
|
+
logger.warning(
|
|
632
|
+
"Error while running job (in experimaestro)", exc_info=True
|
|
633
|
+
)
|
|
634
|
+
state = JobState.ERROR
|
|
635
|
+
|
|
636
|
+
# Locks are released here after job completes
|
|
637
|
+
|
|
638
|
+
# Check if we should restart a resumable task that timed out
|
|
639
|
+
from experimaestro.scheduler.jobs import JobStateError
|
|
640
|
+
|
|
641
|
+
if (
|
|
642
|
+
isinstance(state, JobStateError)
|
|
643
|
+
and state.failure_reason == JobFailureStatus.TIMEOUT
|
|
644
|
+
and job.resumable
|
|
645
|
+
):
|
|
646
|
+
job.retry_count += 1
|
|
647
|
+
if job.retry_count <= job.max_retries:
|
|
648
|
+
logger.info(
|
|
649
|
+
"Resumable task %s timed out - restarting (attempt %d/%d)",
|
|
650
|
+
job,
|
|
651
|
+
job.retry_count,
|
|
652
|
+
job.max_retries,
|
|
653
|
+
)
|
|
654
|
+
# Rotate log files to preserve previous run's logs
|
|
655
|
+
job.rotate_logs()
|
|
656
|
+
# Clear cached process so aio_run() will create a new one
|
|
657
|
+
job._process = None
|
|
658
|
+
# Delete PID file so the job will be resubmitted
|
|
659
|
+
if job.pidpath.exists():
|
|
660
|
+
job.pidpath.unlink()
|
|
661
|
+
# Continue the loop to restart
|
|
662
|
+
continue
|
|
663
|
+
else:
|
|
664
|
+
logger.warning(
|
|
665
|
+
"Resumable task %s exceeded max retries (%d), marking as failed",
|
|
666
|
+
job,
|
|
667
|
+
job.max_retries,
|
|
668
|
+
)
|
|
669
|
+
# Fall through to return the error state
|
|
670
|
+
|
|
671
|
+
# Job finished (success or non-recoverable error)
|
|
672
|
+
# Notify scheduler listeners of job state after job completes
|
|
305
673
|
self.notify_job_state(job)
|
|
306
674
|
return state
|
|
307
|
-
|
|
308
|
-
except Exception:
|
|
309
|
-
logger.warning("Error in scheduler job coordination", exc_info=True)
|
|
310
|
-
return JobState.ERROR
|