experimaestro 2.0.0a3__py3-none-any.whl → 2.0.0a4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of experimaestro might be problematic. Click here for more details.
- experimaestro/connectors/__init__.py +2 -2
- experimaestro/core/objects/config.py +28 -9
- experimaestro/scheduler/__init__.py +18 -1
- experimaestro/scheduler/base.py +87 -906
- experimaestro/scheduler/experiment.py +387 -0
- experimaestro/scheduler/jobs.py +475 -0
- experimaestro/scheduler/signal_handler.py +32 -0
- experimaestro/scheduler/state.py +1 -1
- experimaestro/server/__init__.py +36 -5
- experimaestro/tests/test_dependencies.py +1 -1
- experimaestro/tests/test_generators.py +34 -9
- experimaestro/typingutils.py +11 -2
- {experimaestro-2.0.0a3.dist-info → experimaestro-2.0.0a4.dist-info}/METADATA +3 -2
- {experimaestro-2.0.0a3.dist-info → experimaestro-2.0.0a4.dist-info}/RECORD +17 -14
- {experimaestro-2.0.0a3.dist-info → experimaestro-2.0.0a4.dist-info}/WHEEL +1 -1
- {experimaestro-2.0.0a3.dist-info → experimaestro-2.0.0a4.dist-info}/entry_points.txt +0 -0
- {experimaestro-2.0.0a3.dist-info → experimaestro-2.0.0a4.dist-info/licenses}/LICENSE +0 -0
experimaestro/scheduler/base.py
CHANGED
|
@@ -1,392 +1,23 @@
|
|
|
1
|
-
from collections import ChainMap
|
|
2
|
-
from functools import cached_property
|
|
3
|
-
import itertools
|
|
4
1
|
import logging
|
|
5
|
-
import os
|
|
6
|
-
from pathlib import Path
|
|
7
|
-
from shutil import rmtree
|
|
8
2
|
import threading
|
|
9
3
|
import time
|
|
10
4
|
from typing import (
|
|
11
|
-
Any,
|
|
12
|
-
Iterator,
|
|
13
|
-
List,
|
|
14
5
|
Optional,
|
|
15
6
|
Set,
|
|
16
|
-
TypeVar,
|
|
17
|
-
Union,
|
|
18
|
-
TYPE_CHECKING,
|
|
19
7
|
)
|
|
20
|
-
import enum
|
|
21
|
-
import signal
|
|
22
8
|
import asyncio
|
|
23
|
-
from experimaestro.exceptions import HandledException
|
|
24
|
-
from experimaestro.notifications import LevelInformation, Reporter
|
|
25
9
|
from typing import Dict
|
|
10
|
+
|
|
11
|
+
from experimaestro.scheduler import experiment
|
|
12
|
+
from experimaestro.scheduler.jobs import Job, JobState
|
|
26
13
|
from experimaestro.scheduler.services import Service
|
|
27
|
-
from experimaestro.settings import WorkspaceSettings, get_settings
|
|
28
14
|
|
|
29
15
|
|
|
30
|
-
from experimaestro.core.objects import Config, ConfigWalkContext, WatchedOutput
|
|
31
16
|
from experimaestro.utils import logger
|
|
32
|
-
from experimaestro.locking import Locks, LockError, Lock
|
|
33
17
|
from experimaestro.utils.asyncio import asyncThreadcheck
|
|
34
|
-
from .workspace import RunMode, Workspace
|
|
35
|
-
from .dependencies import Dependency, DependencyStatus, Resource
|
|
36
18
|
import concurrent.futures
|
|
37
19
|
|
|
38
20
|
|
|
39
|
-
if TYPE_CHECKING:
|
|
40
|
-
from experimaestro.connectors import Process
|
|
41
|
-
from experimaestro.launchers import Launcher
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
class FailedExperiment(HandledException):
|
|
45
|
-
"""Raised when an experiment failed"""
|
|
46
|
-
|
|
47
|
-
pass
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
class JobState(enum.Enum):
|
|
51
|
-
# Job is not yet scheduled
|
|
52
|
-
UNSCHEDULED = 0
|
|
53
|
-
|
|
54
|
-
# Job is waiting for dependencies to be done
|
|
55
|
-
WAITING = 1
|
|
56
|
-
|
|
57
|
-
# Job is ready to run
|
|
58
|
-
READY = 2
|
|
59
|
-
|
|
60
|
-
# Job is scheduled (e.g. slurm)
|
|
61
|
-
SCHEDULED = 3
|
|
62
|
-
|
|
63
|
-
# Job is running
|
|
64
|
-
RUNNING = 4
|
|
65
|
-
|
|
66
|
-
# Job is done (finished)
|
|
67
|
-
DONE = 5
|
|
68
|
-
|
|
69
|
-
# Job failed (finished)
|
|
70
|
-
ERROR = 6
|
|
71
|
-
|
|
72
|
-
def notstarted(self):
|
|
73
|
-
return self.value <= JobState.READY.value
|
|
74
|
-
|
|
75
|
-
def running(self):
|
|
76
|
-
return (
|
|
77
|
-
self.value == JobState.RUNNING.value
|
|
78
|
-
or self.value == JobState.SCHEDULED.value
|
|
79
|
-
)
|
|
80
|
-
|
|
81
|
-
def finished(self):
|
|
82
|
-
return self.value >= JobState.DONE.value
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
class JobFailureStatus(enum.Enum):
|
|
86
|
-
#: Job failed
|
|
87
|
-
DEPENDENCY = 0
|
|
88
|
-
|
|
89
|
-
#: Job dependency failed
|
|
90
|
-
FAILED = 1
|
|
91
|
-
|
|
92
|
-
#: Memory
|
|
93
|
-
MEMORY = 2
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
class JobLock(Lock):
|
|
97
|
-
def __init__(self, job):
|
|
98
|
-
super().__init__()
|
|
99
|
-
self.job = job
|
|
100
|
-
|
|
101
|
-
def _acquire(self):
|
|
102
|
-
return self.job.state == JobState.DONE
|
|
103
|
-
|
|
104
|
-
def _release(self):
|
|
105
|
-
return False
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
class JobDependency(Dependency):
|
|
109
|
-
def __init__(self, job):
|
|
110
|
-
super().__init__(job)
|
|
111
|
-
|
|
112
|
-
def status(self) -> DependencyStatus:
|
|
113
|
-
if self.origin.state == JobState.DONE:
|
|
114
|
-
return DependencyStatus.OK
|
|
115
|
-
elif self.origin.state == JobState.ERROR:
|
|
116
|
-
return DependencyStatus.FAIL
|
|
117
|
-
return DependencyStatus.WAIT
|
|
118
|
-
|
|
119
|
-
def lock(self):
|
|
120
|
-
return JobLock(self.origin)
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
class Job(Resource):
|
|
124
|
-
"""A job is a resource that is produced by the execution of some code"""
|
|
125
|
-
|
|
126
|
-
# Set by the scheduler
|
|
127
|
-
_readyEvent: Optional[asyncio.Event]
|
|
128
|
-
_future: Optional["concurrent.futures.Future"]
|
|
129
|
-
|
|
130
|
-
def __init__(
|
|
131
|
-
self,
|
|
132
|
-
config: Config,
|
|
133
|
-
*,
|
|
134
|
-
workspace: Workspace = None,
|
|
135
|
-
launcher: "Launcher" = None,
|
|
136
|
-
run_mode: RunMode = RunMode.NORMAL,
|
|
137
|
-
):
|
|
138
|
-
super().__init__()
|
|
139
|
-
|
|
140
|
-
self.workspace = workspace or Workspace.CURRENT
|
|
141
|
-
self.launcher = launcher or self.workspace.launcher if self.workspace else None
|
|
142
|
-
|
|
143
|
-
if run_mode == RunMode.NORMAL:
|
|
144
|
-
assert self.workspace is not None, "No experiment has been defined"
|
|
145
|
-
assert self.launcher is not None, (
|
|
146
|
-
"No launcher, and no default defined for the workspace %s" % workspace
|
|
147
|
-
)
|
|
148
|
-
|
|
149
|
-
self.type = config.__xpmtype__
|
|
150
|
-
self.name = str(self.type.identifier).rsplit(".", 1)[-1]
|
|
151
|
-
|
|
152
|
-
self.scheduler: Optional["Scheduler"] = None
|
|
153
|
-
self.config = config
|
|
154
|
-
self.state: JobState = JobState.UNSCHEDULED
|
|
155
|
-
|
|
156
|
-
#: If a job has failed, indicates the failure status
|
|
157
|
-
self.failure_status: JobFailureStatus = None
|
|
158
|
-
|
|
159
|
-
# Dependencies
|
|
160
|
-
self.dependencies: Set[Dependency] = set() # as target
|
|
161
|
-
|
|
162
|
-
# Watched outputs
|
|
163
|
-
self.watched_outputs = {}
|
|
164
|
-
for watched in config.__xpm__.watched_outputs:
|
|
165
|
-
self.watch_output(watched)
|
|
166
|
-
|
|
167
|
-
# Process
|
|
168
|
-
self._process = None
|
|
169
|
-
self.unsatisfied = 0
|
|
170
|
-
|
|
171
|
-
# Meta-information
|
|
172
|
-
self.starttime: Optional[float] = None
|
|
173
|
-
self.submittime: Optional[float] = None
|
|
174
|
-
self.endtime: Optional[float] = None
|
|
175
|
-
self._progress: List[LevelInformation] = []
|
|
176
|
-
self.tags = config.tags()
|
|
177
|
-
|
|
178
|
-
def watch_output(self, watched: "WatchedOutput"):
|
|
179
|
-
"""Monitor task outputs
|
|
180
|
-
|
|
181
|
-
:param watched: A description of the watched output
|
|
182
|
-
"""
|
|
183
|
-
self.scheduler.xp.watch_output(watched)
|
|
184
|
-
|
|
185
|
-
def task_output_update(self, subpath: Path):
|
|
186
|
-
"""Notification of an updated task output"""
|
|
187
|
-
if watcher := self.watched_outputs.get(subpath, None):
|
|
188
|
-
watcher.update()
|
|
189
|
-
|
|
190
|
-
def done_handler(self):
|
|
191
|
-
"""The task has been completed"""
|
|
192
|
-
for watcher in self.watched_outputs.values():
|
|
193
|
-
watcher.update()
|
|
194
|
-
|
|
195
|
-
def __str__(self):
|
|
196
|
-
return "Job[{}]".format(self.identifier)
|
|
197
|
-
|
|
198
|
-
def wait(self) -> JobState:
|
|
199
|
-
assert self._future, "Cannot wait a not submitted job"
|
|
200
|
-
return self._future.result()
|
|
201
|
-
|
|
202
|
-
@cached_property
|
|
203
|
-
def python_path(self) -> Iterator[str]:
|
|
204
|
-
"""Returns an iterator over python path"""
|
|
205
|
-
return itertools.chain(self.workspace.python_path)
|
|
206
|
-
|
|
207
|
-
@cached_property
|
|
208
|
-
def environ(self):
|
|
209
|
-
"""Returns the job environment
|
|
210
|
-
|
|
211
|
-
It is made of (by order of priority):
|
|
212
|
-
|
|
213
|
-
1. The job environment
|
|
214
|
-
1. The launcher environment
|
|
215
|
-
1. The workspace environment
|
|
216
|
-
|
|
217
|
-
"""
|
|
218
|
-
return ChainMap(
|
|
219
|
-
{},
|
|
220
|
-
self.launcher.environ if self.launcher else {},
|
|
221
|
-
self.workspace.env if self.workspace else {},
|
|
222
|
-
)
|
|
223
|
-
|
|
224
|
-
@property
|
|
225
|
-
def progress(self):
|
|
226
|
-
return self._progress
|
|
227
|
-
|
|
228
|
-
def set_progress(self, level: int, value: float, desc: Optional[str]):
|
|
229
|
-
if value < 0:
|
|
230
|
-
logger.warning(f"Progress value out of bounds ({value})")
|
|
231
|
-
value = 0
|
|
232
|
-
elif value > 1:
|
|
233
|
-
logger.warning(f"Progress value out of bounds ({value})")
|
|
234
|
-
value = 1
|
|
235
|
-
|
|
236
|
-
# Adjust the length of the array
|
|
237
|
-
self._progress = self._progress[: (level + 1)]
|
|
238
|
-
while len(self._progress) <= level:
|
|
239
|
-
self._progress.append(LevelInformation(len(self._progress), None, 0.0))
|
|
240
|
-
|
|
241
|
-
if desc:
|
|
242
|
-
self._progress[-1].desc = desc
|
|
243
|
-
self._progress[-1].progress = value
|
|
244
|
-
|
|
245
|
-
for listener in self.scheduler.listeners:
|
|
246
|
-
listener.job_state(self)
|
|
247
|
-
|
|
248
|
-
def add_notification_server(self, server):
|
|
249
|
-
"""Adds a notification server"""
|
|
250
|
-
key, baseurl = server.getNotificationSpec()
|
|
251
|
-
dirpath = self.path / Reporter.NOTIFICATION_FOLDER
|
|
252
|
-
dirpath.mkdir(exist_ok=True)
|
|
253
|
-
(dirpath / key).write_text(f"{baseurl}/{self.identifier}")
|
|
254
|
-
|
|
255
|
-
@property
|
|
256
|
-
def ready(self):
|
|
257
|
-
return self.state == JobState.READY
|
|
258
|
-
|
|
259
|
-
@property
|
|
260
|
-
def jobpath(self) -> Path:
|
|
261
|
-
"""Deprecated, use `path`"""
|
|
262
|
-
return self.workspace.jobspath / self.relpath
|
|
263
|
-
|
|
264
|
-
@property
|
|
265
|
-
def path(self) -> Path:
|
|
266
|
-
return self.workspace.jobspath / self.relpath
|
|
267
|
-
|
|
268
|
-
@property
|
|
269
|
-
def experimaestro_path(self) -> Path:
|
|
270
|
-
return (self.path / ".experimaestro").resolve()
|
|
271
|
-
|
|
272
|
-
@cached_property
|
|
273
|
-
def task_outputs_path(self) -> Path:
|
|
274
|
-
return self.experimaestro_path / "task-outputs.jsonl"
|
|
275
|
-
|
|
276
|
-
@property
|
|
277
|
-
def relpath(self):
|
|
278
|
-
identifier = self.config.__xpm__.identifier
|
|
279
|
-
base = Path(str(self.type.identifier))
|
|
280
|
-
return base / identifier.all.hex()
|
|
281
|
-
|
|
282
|
-
@property
|
|
283
|
-
def relmainpath(self):
|
|
284
|
-
identifier = self.config.__xpm__.identifier
|
|
285
|
-
base = Path(str(self.type.identifier))
|
|
286
|
-
return base / identifier.main.hex()
|
|
287
|
-
|
|
288
|
-
@property
|
|
289
|
-
def hashidentifier(self):
|
|
290
|
-
return self.config.__xpm__.identifier
|
|
291
|
-
|
|
292
|
-
@property
|
|
293
|
-
def identifier(self):
|
|
294
|
-
return self.config.__xpm__.identifier.all.hex()
|
|
295
|
-
|
|
296
|
-
def prepare(self, overwrite=False):
|
|
297
|
-
"""Prepare all files before starting a task
|
|
298
|
-
|
|
299
|
-
:param overwrite: if True, overwrite files even if the task has been run
|
|
300
|
-
"""
|
|
301
|
-
pass
|
|
302
|
-
|
|
303
|
-
async def aio_run(self):
|
|
304
|
-
"""Actually run the code"""
|
|
305
|
-
raise NotImplementedError(f"Method aio_run not implemented in {self.__class__}")
|
|
306
|
-
|
|
307
|
-
async def aio_process(self) -> Optional["Process"]:
|
|
308
|
-
"""Returns the process if it exists"""
|
|
309
|
-
raise NotImplementedError("Not implemented")
|
|
310
|
-
|
|
311
|
-
@property
|
|
312
|
-
def pidpath(self):
|
|
313
|
-
"""This file contains the file PID"""
|
|
314
|
-
return self.jobpath / ("%s.pid" % self.name)
|
|
315
|
-
|
|
316
|
-
@property
|
|
317
|
-
def lockpath(self):
|
|
318
|
-
"""This file is used as a lock for running the job"""
|
|
319
|
-
return self.workspace.jobspath / self.relmainpath / ("%s.lock" % self.name)
|
|
320
|
-
|
|
321
|
-
@property
|
|
322
|
-
def donepath(self) -> Path:
|
|
323
|
-
"""When a job has been successful, this file is written"""
|
|
324
|
-
return self.jobpath / ("%s.done" % self.name)
|
|
325
|
-
|
|
326
|
-
@property
|
|
327
|
-
def failedpath(self):
|
|
328
|
-
"""When a job has been unsuccessful, this file is written with an error
|
|
329
|
-
code inside"""
|
|
330
|
-
return self.jobpath / ("%s.failed" % self.name)
|
|
331
|
-
|
|
332
|
-
@property
|
|
333
|
-
def stdout(self) -> Path:
|
|
334
|
-
return self.jobpath / ("%s.out" % self.name)
|
|
335
|
-
|
|
336
|
-
@property
|
|
337
|
-
def stderr(self) -> Path:
|
|
338
|
-
return self.jobpath / ("%s.err" % self.name)
|
|
339
|
-
|
|
340
|
-
@property
|
|
341
|
-
def basepath(self) -> Path:
|
|
342
|
-
return self.jobpath / self.name
|
|
343
|
-
|
|
344
|
-
def dependencychanged(self, dependency, oldstatus, status):
|
|
345
|
-
"""Called when a dependency has changed"""
|
|
346
|
-
|
|
347
|
-
def value(s):
|
|
348
|
-
return 1 if s == DependencyStatus.OK else 0
|
|
349
|
-
|
|
350
|
-
self.unsatisfied -= value(status) - value(oldstatus)
|
|
351
|
-
|
|
352
|
-
logger.debug("Job %s: unsatisfied %d", self, self.unsatisfied)
|
|
353
|
-
|
|
354
|
-
if status == DependencyStatus.FAIL:
|
|
355
|
-
# Job completed
|
|
356
|
-
if not self.state.finished():
|
|
357
|
-
self.state = JobState.ERROR
|
|
358
|
-
self.failure_status = JobFailureStatus.DEPENDENCY
|
|
359
|
-
self._readyEvent.set()
|
|
360
|
-
|
|
361
|
-
if self.unsatisfied == 0:
|
|
362
|
-
logger.info("Job %s is ready to run", self)
|
|
363
|
-
# We are ready
|
|
364
|
-
self.state = JobState.READY
|
|
365
|
-
self._readyEvent.set()
|
|
366
|
-
|
|
367
|
-
def finalState(self) -> "concurrent.futures.Future[JobState]":
|
|
368
|
-
assert self._future is not None
|
|
369
|
-
return self._future
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
class JobContext(ConfigWalkContext):
|
|
373
|
-
def __init__(self, job: Job):
|
|
374
|
-
super().__init__()
|
|
375
|
-
self.job = job
|
|
376
|
-
|
|
377
|
-
@property
|
|
378
|
-
def name(self):
|
|
379
|
-
return self.job.name
|
|
380
|
-
|
|
381
|
-
@property
|
|
382
|
-
def path(self):
|
|
383
|
-
return self.job.path
|
|
384
|
-
|
|
385
|
-
@property
|
|
386
|
-
def task(self):
|
|
387
|
-
return self.job.config
|
|
388
|
-
|
|
389
|
-
|
|
390
21
|
class Listener:
|
|
391
22
|
def job_submitted(self, job):
|
|
392
23
|
pass
|
|
@@ -399,78 +30,16 @@ class Listener:
|
|
|
399
30
|
pass
|
|
400
31
|
|
|
401
32
|
|
|
402
|
-
class JobError(Exception):
|
|
403
|
-
def __init__(self, code):
|
|
404
|
-
super().__init__(f"Job exited with code {code}")
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
class SignalHandler:
|
|
408
|
-
def __init__(self):
|
|
409
|
-
self.experiments: Set["experiment"] = set()
|
|
410
|
-
self.original_sigint_handler = None
|
|
411
|
-
|
|
412
|
-
def add(self, xp: "experiment"):
|
|
413
|
-
if not self.experiments:
|
|
414
|
-
self.original_sigint_handler = signal.getsignal(signal.SIGINT)
|
|
415
|
-
|
|
416
|
-
signal.signal(signal.SIGINT, self)
|
|
417
|
-
|
|
418
|
-
self.experiments.add(xp)
|
|
419
|
-
|
|
420
|
-
def remove(self, xp):
|
|
421
|
-
self.experiments.remove(xp)
|
|
422
|
-
if not self.experiments:
|
|
423
|
-
signal.signal(signal.SIGINT, self.original_sigint_handler)
|
|
424
|
-
|
|
425
|
-
def __call__(self, signum, frame):
|
|
426
|
-
"""SIGINT signal handler"""
|
|
427
|
-
logger.warning("Signal received")
|
|
428
|
-
for xp in self.experiments:
|
|
429
|
-
xp.stop()
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
SIGNAL_HANDLER = SignalHandler()
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
class SchedulerCentral(threading.Thread):
|
|
436
|
-
loop: asyncio.AbstractEventLoop
|
|
437
|
-
|
|
438
|
-
"""The event loop thread used by the scheduler"""
|
|
439
|
-
|
|
440
|
-
def __init__(self, name: str):
|
|
441
|
-
# Daemon thread so it is non blocking
|
|
442
|
-
super().__init__(name=f"Scheduler EL ({name})", daemon=True)
|
|
443
|
-
|
|
444
|
-
self._ready = threading.Event()
|
|
445
|
-
|
|
446
|
-
def run(self):
|
|
447
|
-
logger.debug("Starting event loop thread")
|
|
448
|
-
self.loop = asyncio.new_event_loop()
|
|
449
|
-
asyncio.set_event_loop(self.loop)
|
|
450
|
-
|
|
451
|
-
# Set loop-dependent variables
|
|
452
|
-
self.exitCondition = asyncio.Condition()
|
|
453
|
-
self.dependencyLock = asyncio.Lock()
|
|
454
|
-
|
|
455
|
-
# Start the event loop
|
|
456
|
-
self._ready.set()
|
|
457
|
-
self.loop.run_forever()
|
|
458
|
-
|
|
459
|
-
@staticmethod
|
|
460
|
-
def create(name: str):
|
|
461
|
-
instance = SchedulerCentral(name)
|
|
462
|
-
instance.start()
|
|
463
|
-
instance._ready.wait()
|
|
464
|
-
return instance
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
class Scheduler:
|
|
33
|
+
class Scheduler(threading.Thread):
|
|
468
34
|
"""A job scheduler
|
|
469
35
|
|
|
470
36
|
The scheduler is based on asyncio for easy concurrency handling
|
|
471
37
|
"""
|
|
472
38
|
|
|
473
39
|
def __init__(self, xp: "experiment", name: str):
|
|
40
|
+
super().__init__(name=f"Scheduler ({name})", daemon=True)
|
|
41
|
+
self._ready = threading.Event()
|
|
42
|
+
|
|
474
43
|
# Name of the experiment
|
|
475
44
|
self.name = name
|
|
476
45
|
self.xp = xp
|
|
@@ -487,9 +56,32 @@ class Scheduler:
|
|
|
487
56
|
# Listeners
|
|
488
57
|
self.listeners: Set[Listener] = set()
|
|
489
58
|
|
|
490
|
-
@
|
|
491
|
-
def
|
|
492
|
-
|
|
59
|
+
@staticmethod
|
|
60
|
+
def create(xp: "experiment", name: str):
|
|
61
|
+
instance = Scheduler(xp, name)
|
|
62
|
+
instance.start()
|
|
63
|
+
instance._ready.wait()
|
|
64
|
+
return instance
|
|
65
|
+
|
|
66
|
+
def run(self):
|
|
67
|
+
"""Run the event loop forever"""
|
|
68
|
+
logger.debug("Starting event loop thread")
|
|
69
|
+
# Ported from SchedulerCentral
|
|
70
|
+
self.loop = asyncio.new_event_loop()
|
|
71
|
+
asyncio.set_event_loop(self.loop)
|
|
72
|
+
# Set loop-dependent variables
|
|
73
|
+
self.exitCondition = asyncio.Condition()
|
|
74
|
+
self.dependencyLock = asyncio.Lock()
|
|
75
|
+
self._ready.set()
|
|
76
|
+
self.loop.run_forever()
|
|
77
|
+
|
|
78
|
+
def start_scheduler(self):
|
|
79
|
+
"""Start the scheduler event loop in a thread"""
|
|
80
|
+
if not self.is_alive():
|
|
81
|
+
self.start()
|
|
82
|
+
self._ready.wait()
|
|
83
|
+
else:
|
|
84
|
+
logger.warning("Scheduler already started")
|
|
493
85
|
|
|
494
86
|
def addlistener(self, listener: Listener):
|
|
495
87
|
self.listeners.add(listener)
|
|
@@ -498,6 +90,13 @@ class Scheduler:
|
|
|
498
90
|
self.listeners.remove(listener)
|
|
499
91
|
|
|
500
92
|
def getJobState(self, job: Job) -> "concurrent.futures.Future[JobState]":
|
|
93
|
+
# Check if the job belongs to this scheduler
|
|
94
|
+
if job.identifier not in self.jobs:
|
|
95
|
+
# If job is not in this scheduler, return its current state directly
|
|
96
|
+
future = concurrent.futures.Future()
|
|
97
|
+
future.set_result(job.state)
|
|
98
|
+
return future
|
|
99
|
+
|
|
501
100
|
return asyncio.run_coroutine_threadsafe(self.aio_getjobstate(job), self.loop)
|
|
502
101
|
|
|
503
102
|
async def aio_getjobstate(self, job: Job):
|
|
@@ -547,6 +146,22 @@ class Scheduler:
|
|
|
547
146
|
|
|
548
147
|
return None
|
|
549
148
|
|
|
149
|
+
def notify_job_submitted(self, job: Job):
|
|
150
|
+
"""Notify the listeners that a job has been submitted"""
|
|
151
|
+
for listener in self.listeners:
|
|
152
|
+
try:
|
|
153
|
+
listener.job_submitted(job)
|
|
154
|
+
except Exception:
|
|
155
|
+
logger.exception("Got an error with listener %s", listener)
|
|
156
|
+
|
|
157
|
+
def notify_job_state(self, job: Job):
|
|
158
|
+
"""Notify the listeners that a job has changed state"""
|
|
159
|
+
for listener in self.listeners:
|
|
160
|
+
try:
|
|
161
|
+
listener.job_state(job)
|
|
162
|
+
except Exception:
|
|
163
|
+
logger.exception("Got an error with listener %s", listener)
|
|
164
|
+
|
|
550
165
|
async def aio_submit(self, job: Job) -> JobState: # noqa: C901
|
|
551
166
|
"""Main scheduler function: submit a job, run it (if needed), and returns
|
|
552
167
|
the status code
|
|
@@ -571,11 +186,8 @@ class Scheduler:
|
|
|
571
186
|
path.symlink_to(job.path)
|
|
572
187
|
|
|
573
188
|
job.state = JobState.WAITING
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
listener.job_submitted(job)
|
|
577
|
-
except Exception:
|
|
578
|
-
logger.exception("Got an error with listener %s", listener)
|
|
189
|
+
|
|
190
|
+
self.notify_job_submitted(job)
|
|
579
191
|
|
|
580
192
|
# Add dependencies, and add to blocking resources
|
|
581
193
|
if job.dependencies:
|
|
@@ -598,11 +210,8 @@ class Scheduler:
|
|
|
598
210
|
if process is not None:
|
|
599
211
|
# Yep! First we notify the listeners
|
|
600
212
|
job.state = JobState.RUNNING
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
listener.job_state(job)
|
|
604
|
-
except Exception:
|
|
605
|
-
logger.exception("Got an error with listener %s", listener)
|
|
213
|
+
# Notify the listeners
|
|
214
|
+
self.notify_job_state(job)
|
|
606
215
|
|
|
607
216
|
# Adds to the listeners
|
|
608
217
|
if self.xp.server is not None:
|
|
@@ -637,11 +246,7 @@ class Scheduler:
|
|
|
637
246
|
|
|
638
247
|
job.state = state
|
|
639
248
|
|
|
640
|
-
|
|
641
|
-
try:
|
|
642
|
-
listener.job_state(job)
|
|
643
|
-
except Exception as e:
|
|
644
|
-
logger.exception("Listener %s did raise an exception", e)
|
|
249
|
+
self.notify_job_state(job)
|
|
645
250
|
|
|
646
251
|
# Job is finished
|
|
647
252
|
if job.state != JobState.DONE:
|
|
@@ -652,9 +257,9 @@ class Scheduler:
|
|
|
652
257
|
|
|
653
258
|
# Decrement the number of unfinished jobs and notify
|
|
654
259
|
self.xp.unfinishedJobs -= 1
|
|
655
|
-
async with self.xp.central.exitCondition:
|
|
260
|
+
async with self.exitCondition:
|
|
656
261
|
logging.debug("Updated number of unfinished jobs")
|
|
657
|
-
self.xp.central.exitCondition.notify_all()
|
|
262
|
+
self.exitCondition.notify_all()
|
|
658
263
|
|
|
659
264
|
job.endtime = time.time()
|
|
660
265
|
if job in self.waitingjobs:
|
|
@@ -669,461 +274,37 @@ class Scheduler:
|
|
|
669
274
|
return job.state
|
|
670
275
|
|
|
671
276
|
async def aio_start(self, job: Job) -> Optional[JobState]:
|
|
672
|
-
"""Start a job
|
|
673
|
-
|
|
674
|
-
Returns None if the dependencies could not be locked after all
|
|
675
|
-
Returns DONE/ERROR depending on the process outcome
|
|
676
|
-
"""
|
|
677
|
-
|
|
678
|
-
# We first lock the job before proceeding
|
|
679
|
-
assert job.launcher is not None
|
|
680
|
-
assert self.xp.central is not None
|
|
681
|
-
|
|
682
|
-
with Locks() as locks:
|
|
683
|
-
logger.debug("[starting] Locking job %s", job)
|
|
684
|
-
async with job.launcher.connector.lock(job.lockpath):
|
|
685
|
-
logger.debug("[starting] Locked job %s", job)
|
|
686
|
-
|
|
687
|
-
state = None
|
|
688
|
-
try:
|
|
689
|
-
logger.debug(
|
|
690
|
-
"Starting job %s with %d dependencies",
|
|
691
|
-
job,
|
|
692
|
-
len(job.dependencies),
|
|
693
|
-
)
|
|
694
|
-
|
|
695
|
-
async with self.xp.central.dependencyLock:
|
|
696
|
-
for dependency in job.dependencies:
|
|
697
|
-
try:
|
|
698
|
-
locks.append(dependency.lock().acquire())
|
|
699
|
-
except LockError:
|
|
700
|
-
logger.warning(
|
|
701
|
-
"Could not lock %s, aborting start for job %s",
|
|
702
|
-
dependency,
|
|
703
|
-
job,
|
|
704
|
-
)
|
|
705
|
-
dependency.check()
|
|
706
|
-
return JobState.WAITING
|
|
707
|
-
|
|
708
|
-
for listener in self.listeners:
|
|
709
|
-
listener.job_state(job)
|
|
710
|
-
|
|
711
|
-
job.starttime = time.time()
|
|
712
|
-
|
|
713
|
-
# Creates the main directory
|
|
714
|
-
directory = job.path
|
|
715
|
-
logger.debug("Making directories job %s...", directory)
|
|
716
|
-
if not directory.is_dir():
|
|
717
|
-
directory.mkdir(parents=True, exist_ok=True)
|
|
718
|
-
|
|
719
|
-
# Sets up the notification URL
|
|
720
|
-
if self.xp.server is not None:
|
|
721
|
-
job.add_notification_server(self.xp.server)
|
|
722
|
-
|
|
723
|
-
except Exception:
|
|
724
|
-
logger.warning("Error while locking job", exc_info=True)
|
|
725
|
-
return JobState.WAITING
|
|
726
|
-
|
|
727
|
-
try:
|
|
728
|
-
# Runs the job
|
|
729
|
-
process = await job.aio_run()
|
|
730
|
-
except Exception:
|
|
731
|
-
logger.warning("Error while starting job", exc_info=True)
|
|
732
|
-
return JobState.ERROR
|
|
733
|
-
|
|
734
|
-
try:
|
|
735
|
-
if isinstance(process, JobState):
|
|
736
|
-
state = process
|
|
737
|
-
logger.debug("Job %s ended (state %s)", job, state)
|
|
738
|
-
else:
|
|
739
|
-
logger.debug("Waiting for job %s process to end", job)
|
|
740
|
-
|
|
741
|
-
code = await process.aio_code()
|
|
742
|
-
logger.debug("Got return code %s for %s", code, job)
|
|
743
|
-
|
|
744
|
-
# Check the file if there is no return code
|
|
745
|
-
if code is None:
|
|
746
|
-
# Case where we cannot retrieve the code right away
|
|
747
|
-
if job.donepath.is_file():
|
|
748
|
-
code = 0
|
|
749
|
-
else:
|
|
750
|
-
code = int(job.failedpath.read_text())
|
|
751
|
-
|
|
752
|
-
logger.debug("Job %s ended with code %s", job, code)
|
|
753
|
-
state = JobState.DONE if code == 0 else JobState.ERROR
|
|
754
|
-
|
|
755
|
-
except JobError:
|
|
756
|
-
logger.warning("Error while running job")
|
|
757
|
-
state = JobState.ERROR
|
|
758
|
-
|
|
759
|
-
except Exception:
|
|
760
|
-
logger.warning(
|
|
761
|
-
"Error while running job (in experimaestro)", exc_info=True
|
|
762
|
-
)
|
|
763
|
-
state = JobState.ERROR
|
|
764
|
-
|
|
765
|
-
return state
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
ServiceClass = TypeVar("ServiceClass", bound=Service)
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
class experiment:
|
|
772
|
-
"""Main experiment object
|
|
773
|
-
|
|
774
|
-
It is a context object, i.e. experiments is run with
|
|
775
|
-
|
|
776
|
-
```py
|
|
777
|
-
with experiment(...) as xp:
|
|
778
|
-
...
|
|
779
|
-
```
|
|
780
|
-
"""
|
|
781
|
-
|
|
782
|
-
#: Current experiment
|
|
783
|
-
CURRENT: Optional["experiment"] = None
|
|
784
|
-
|
|
785
|
-
@staticmethod
|
|
786
|
-
def current() -> "experiment":
|
|
787
|
-
"""Returns the current experiment, but checking first if set
|
|
788
|
-
|
|
789
|
-
If there is no current experiment, raises an AssertError
|
|
790
|
-
"""
|
|
791
|
-
assert experiment.CURRENT is not None, "No current experiment defined"
|
|
792
|
-
return experiment.CURRENT
|
|
793
|
-
|
|
794
|
-
def __init__(
|
|
795
|
-
self,
|
|
796
|
-
env: Union[Path, str, WorkspaceSettings],
|
|
797
|
-
name: str,
|
|
798
|
-
*,
|
|
799
|
-
host: Optional[str] = None,
|
|
800
|
-
port: Optional[int] = None,
|
|
801
|
-
token: Optional[str] = None,
|
|
802
|
-
run_mode: Optional[RunMode] = None,
|
|
803
|
-
launcher=None,
|
|
804
|
-
):
|
|
805
|
-
"""
|
|
806
|
-
:param env: an environment -- or a working directory for a local
|
|
807
|
-
environment
|
|
277
|
+
"""Start a job (scheduler coordination layer)
|
|
808
278
|
|
|
809
|
-
|
|
279
|
+
This method serves as a coordination layer that delegates the actual
|
|
280
|
+
job starting logic to the job itself while handling scheduler-specific
|
|
281
|
+
concerns like state notifications and providing coordination context.
|
|
810
282
|
|
|
811
|
-
:param
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
:
|
|
816
|
-
set). Use negative number to avoid running a web server (default when dry run).
|
|
817
|
-
|
|
818
|
-
:param run_mode: The run mode for the experiment (normal, generate run
|
|
819
|
-
files, dry run)
|
|
283
|
+
:param job: The job to start
|
|
284
|
+
:return: JobState.WAITING if dependencies could not be locked, JobState.DONE
|
|
285
|
+
if job completed successfully, JobState.ERROR if job failed during execution,
|
|
286
|
+
or None (should not occur in normal operation)
|
|
287
|
+
:raises Exception: Various exceptions during scheduler coordination
|
|
820
288
|
"""
|
|
821
289
|
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
settings = get_settings()
|
|
826
|
-
if not isinstance(env, WorkspaceSettings):
|
|
827
|
-
env = WorkspaceSettings(id=None, path=Path(env))
|
|
828
|
-
|
|
829
|
-
# Creates the workspace
|
|
830
|
-
run_mode = run_mode or RunMode.NORMAL
|
|
831
|
-
self.workspace = Workspace(settings, env, launcher=launcher, run_mode=run_mode)
|
|
832
|
-
|
|
833
|
-
# Mark the directory has an experimaestro folder
|
|
834
|
-
self.workdir = self.workspace.experimentspath / name
|
|
835
|
-
self.workdir.mkdir(parents=True, exist_ok=True)
|
|
836
|
-
self.xplockpath = self.workdir / "lock"
|
|
837
|
-
self.xplock = None
|
|
838
|
-
self.old_experiment = None
|
|
839
|
-
self.services: Dict[str, Service] = {}
|
|
840
|
-
self._job_listener: Optional[Listener] = None
|
|
841
|
-
|
|
842
|
-
# Get configuration settings
|
|
843
|
-
|
|
844
|
-
if host is not None:
|
|
845
|
-
settings.server.host = host
|
|
846
|
-
|
|
847
|
-
if port is not None:
|
|
848
|
-
settings.server.port = port
|
|
849
|
-
|
|
850
|
-
if token is not None:
|
|
851
|
-
settings.server.token = token
|
|
852
|
-
|
|
853
|
-
# Create the scheduler
|
|
854
|
-
self.scheduler = Scheduler(self, name)
|
|
855
|
-
self.server = (
|
|
856
|
-
Server(self.scheduler, settings.server)
|
|
857
|
-
if (settings.server.port is not None and settings.server.port >= 0)
|
|
858
|
-
and self.workspace.run_mode == RunMode.NORMAL
|
|
859
|
-
else None
|
|
860
|
-
)
|
|
861
|
-
|
|
862
|
-
if os.environ.get("XPM_ENABLEFAULTHANDLER", "0") == "1":
|
|
863
|
-
import faulthandler
|
|
864
|
-
|
|
865
|
-
logger.info("Enabling fault handler")
|
|
866
|
-
faulthandler.enable(all_threads=True)
|
|
867
|
-
|
|
868
|
-
def submit(self, job: Job):
|
|
869
|
-
return self.scheduler.submit(job)
|
|
290
|
+
# Assert preconditions
|
|
291
|
+
assert job.launcher is not None
|
|
870
292
|
|
|
871
|
-
def prepare(self, job: Job):
|
|
872
|
-
"""Generate the file"""
|
|
873
|
-
return self.scheduler.prepare(job)
|
|
874
|
-
|
|
875
|
-
@property
|
|
876
|
-
def run_mode(self):
|
|
877
|
-
return self.workspace.run_mode
|
|
878
|
-
|
|
879
|
-
@property
|
|
880
|
-
def loop(self):
|
|
881
|
-
assert self.central is not None
|
|
882
|
-
return self.central.loop
|
|
883
|
-
|
|
884
|
-
@property
|
|
885
|
-
def resultspath(self):
|
|
886
|
-
"""Return the directory in which results can be stored for this experiment"""
|
|
887
|
-
return self.workdir / "results"
|
|
888
|
-
|
|
889
|
-
@property
|
|
890
|
-
def jobspath(self):
|
|
891
|
-
"""Return the directory in which results can be stored for this experiment"""
|
|
892
|
-
return self.workdir / "jobs"
|
|
893
|
-
|
|
894
|
-
@property
|
|
895
|
-
def alt_jobspaths(self):
|
|
896
|
-
"""Return potential other directories"""
|
|
897
|
-
for alt_workdir in self.workspace.alt_workdirs:
|
|
898
|
-
yield alt_workdir / "jobs"
|
|
899
|
-
|
|
900
|
-
@property
|
|
901
|
-
def jobsbakpath(self):
|
|
902
|
-
"""Return the directory in which results can be stored for this experiment"""
|
|
903
|
-
return self.workdir / "jobs.bak"
|
|
904
|
-
|
|
905
|
-
def stop(self):
|
|
906
|
-
"""Stop the experiment as soon as possible"""
|
|
907
|
-
|
|
908
|
-
async def doStop():
|
|
909
|
-
assert self.central is not None
|
|
910
|
-
async with self.central.exitCondition:
|
|
911
|
-
self.exitMode = True
|
|
912
|
-
logging.debug("Setting exit mode to true")
|
|
913
|
-
self.central.exitCondition.notify_all()
|
|
914
|
-
|
|
915
|
-
assert self.central is not None and self.central.loop is not None
|
|
916
|
-
asyncio.run_coroutine_threadsafe(doStop(), self.central.loop)
|
|
917
|
-
|
|
918
|
-
def wait(self):
|
|
919
|
-
"""Wait until the running processes have finished"""
|
|
920
|
-
|
|
921
|
-
async def awaitcompletion():
|
|
922
|
-
assert self.central is not None
|
|
923
|
-
logger.debug("Waiting to exit scheduler...")
|
|
924
|
-
async with self.central.exitCondition:
|
|
925
|
-
while True:
|
|
926
|
-
if self.exitMode:
|
|
927
|
-
break
|
|
928
|
-
|
|
929
|
-
# If we have still unfinished jobs or possible new tasks, wait
|
|
930
|
-
logger.debug(
|
|
931
|
-
"Checking exit condition: unfinished jobs=%d, task output queue size=%d",
|
|
932
|
-
self.unfinishedJobs,
|
|
933
|
-
self.taskOutputQueueSize,
|
|
934
|
-
)
|
|
935
|
-
if self.unfinishedJobs == 0 and self.taskOutputQueueSize == 0:
|
|
936
|
-
break
|
|
937
|
-
|
|
938
|
-
# Wait for more news...
|
|
939
|
-
await self.central.exitCondition.wait()
|
|
940
|
-
|
|
941
|
-
if self.failedJobs:
|
|
942
|
-
# Show some more information
|
|
943
|
-
count = 0
|
|
944
|
-
for job in self.failedJobs.values():
|
|
945
|
-
if job.failure_status != JobFailureStatus.DEPENDENCY:
|
|
946
|
-
count += 1
|
|
947
|
-
logger.error(
|
|
948
|
-
"Job %s failed, check the log file %s",
|
|
949
|
-
job.relpath,
|
|
950
|
-
job.stderr,
|
|
951
|
-
)
|
|
952
|
-
raise FailedExperiment(f"{count} failed jobs")
|
|
953
|
-
|
|
954
|
-
future = asyncio.run_coroutine_threadsafe(awaitcompletion(), self.loop)
|
|
955
|
-
return future.result()
|
|
956
|
-
|
|
957
|
-
def setenv(self, name, value, override=True):
|
|
958
|
-
"""Shortcut to set the environment value"""
|
|
959
|
-
if override or name not in self.workspace.env:
|
|
960
|
-
logging.info("Setting environment: %s=%s", name, value)
|
|
961
|
-
self.workspace.env[name] = value
|
|
962
|
-
|
|
963
|
-
def token(self, name: str, count: int):
|
|
964
|
-
"""Returns a token for this experiment
|
|
965
|
-
|
|
966
|
-
The token is the default token of the workspace connector"""
|
|
967
|
-
return self.workspace.connector.createtoken(name, count)
|
|
968
|
-
|
|
969
|
-
def __enter__(self):
|
|
970
|
-
from .dynamic_outputs import TaskOutputsWorker
|
|
971
|
-
|
|
972
|
-
if self.workspace.run_mode != RunMode.DRY_RUN:
|
|
973
|
-
logger.info("Locking experiment %s", self.xplockpath)
|
|
974
|
-
self.xplock = self.workspace.connector.lock(self.xplockpath, 0).__enter__()
|
|
975
|
-
logger.info("Experiment locked")
|
|
976
|
-
|
|
977
|
-
# Move old jobs into "jobs.bak"
|
|
978
|
-
if self.workspace.run_mode == RunMode.NORMAL:
|
|
979
|
-
self.jobsbakpath.mkdir(exist_ok=True)
|
|
980
|
-
for p in self.jobspath.glob("*/*"):
|
|
981
|
-
if p.is_symlink():
|
|
982
|
-
target = self.jobsbakpath / p.relative_to(self.jobspath)
|
|
983
|
-
if target.is_symlink():
|
|
984
|
-
# Remove if duplicate
|
|
985
|
-
p.unlink()
|
|
986
|
-
else:
|
|
987
|
-
# Rename otherwise
|
|
988
|
-
target.parent.mkdir(parents=True, exist_ok=True)
|
|
989
|
-
p.rename(target)
|
|
990
|
-
|
|
991
|
-
if self.server:
|
|
992
|
-
self.server.start()
|
|
993
|
-
|
|
994
|
-
self.workspace.__enter__()
|
|
995
|
-
(self.workspace.path / ".__experimaestro__").touch()
|
|
996
|
-
|
|
997
|
-
global SIGNAL_HANDLER
|
|
998
|
-
# Number of unfinished jobs
|
|
999
|
-
self.unfinishedJobs = 0
|
|
1000
|
-
self.taskOutputQueueSize = 0
|
|
1001
|
-
|
|
1002
|
-
# List of failed jobs
|
|
1003
|
-
self.failedJobs: Dict[str, Job] = {}
|
|
1004
|
-
|
|
1005
|
-
# Exit mode when catching signals
|
|
1006
|
-
self.exitMode = False
|
|
1007
|
-
|
|
1008
|
-
self.central = SchedulerCentral.create(self.scheduler.name)
|
|
1009
|
-
self.taskOutputsWorker = TaskOutputsWorker(self)
|
|
1010
|
-
self.taskOutputsWorker.start()
|
|
1011
|
-
|
|
1012
|
-
SIGNAL_HANDLER.add(self)
|
|
1013
|
-
|
|
1014
|
-
self.old_experiment = experiment.CURRENT
|
|
1015
|
-
experiment.CURRENT = self
|
|
1016
|
-
return self
|
|
1017
|
-
|
|
1018
|
-
def __exit__(self, exc_type, exc_value, traceback):
|
|
1019
|
-
logger.debug("Exiting scheduler context")
|
|
1020
|
-
# If no exception and normal run mode, remove old "jobs"
|
|
1021
|
-
if self.workspace.run_mode == RunMode.NORMAL:
|
|
1022
|
-
if exc_type is None and self.jobsbakpath.is_dir():
|
|
1023
|
-
rmtree(self.jobsbakpath)
|
|
1024
|
-
|
|
1025
|
-
# Close the different locks
|
|
1026
293
|
try:
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
"Not waiting since an exception was thrown"
|
|
1032
|
-
" (some jobs may be running)"
|
|
1033
|
-
)
|
|
1034
|
-
else:
|
|
1035
|
-
self.wait()
|
|
1036
|
-
finally:
|
|
1037
|
-
SIGNAL_HANDLER.remove(self)
|
|
1038
|
-
|
|
1039
|
-
# Stop services
|
|
1040
|
-
for service in self.services.values():
|
|
1041
|
-
logger.info("Closing service %s", service.description())
|
|
1042
|
-
service.stop()
|
|
1043
|
-
|
|
1044
|
-
if self.central is not None:
|
|
1045
|
-
logger.info("Stopping scheduler event loop")
|
|
1046
|
-
self.central.loop.stop()
|
|
1047
|
-
|
|
1048
|
-
if self.taskOutputsWorker is not None:
|
|
1049
|
-
logger.info("Stopping tasks outputs worker")
|
|
1050
|
-
self.taskOutputsWorker.queue.put(None)
|
|
1051
|
-
|
|
1052
|
-
self.central = None
|
|
1053
|
-
self.workspace.__exit__(exc_type, exc_value, traceback)
|
|
1054
|
-
if self.xplock:
|
|
1055
|
-
self.xplock.__exit__(exc_type, exc_value, traceback)
|
|
1056
|
-
|
|
1057
|
-
# Put back old experiment as current one
|
|
1058
|
-
experiment.CURRENT = self.old_experiment
|
|
1059
|
-
if self.server:
|
|
1060
|
-
logger.info("Stopping web server")
|
|
1061
|
-
self.server.stop()
|
|
1062
|
-
|
|
1063
|
-
if self.workspace.run_mode == RunMode.NORMAL:
|
|
1064
|
-
# Write the state
|
|
1065
|
-
logging.info("Saving the experiment state")
|
|
1066
|
-
from experimaestro.scheduler.state import ExperimentState
|
|
1067
|
-
|
|
1068
|
-
ExperimentState.save(
|
|
1069
|
-
self.workdir / "state.json", self.scheduler.jobs.values()
|
|
294
|
+
# Call job's start method with scheduler context
|
|
295
|
+
state = await job.aio_start(
|
|
296
|
+
sched_dependency_lock=self.dependencyLock,
|
|
297
|
+
notification_server=self.xp.server if self.xp else None,
|
|
1070
298
|
)
|
|
1071
299
|
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
|
|
1075
|
-
self.taskOutputQueueSize += delta
|
|
1076
|
-
logging.debug(
|
|
1077
|
-
"Updating queue size with %d => %d", delta, self.taskOutputQueueSize
|
|
1078
|
-
)
|
|
1079
|
-
if self.taskOutputQueueSize == 0:
|
|
1080
|
-
self.central.exitCondition.notify_all()
|
|
1081
|
-
|
|
1082
|
-
def watch_output(self, watched: "WatchedOutput"):
|
|
1083
|
-
"""Watch an output
|
|
1084
|
-
|
|
1085
|
-
:param watched: The watched output specification
|
|
1086
|
-
"""
|
|
1087
|
-
|
|
1088
|
-
self.taskOutputsWorker.watch_output(watched)
|
|
1089
|
-
|
|
1090
|
-
def add_service(self, service: ServiceClass) -> ServiceClass:
|
|
1091
|
-
"""Adds a service (e.g. tensorboard viewer) to the experiment
|
|
1092
|
-
|
|
1093
|
-
:param service: A service instance
|
|
1094
|
-
:return: The same service instance
|
|
1095
|
-
"""
|
|
1096
|
-
self.services[service.id] = service
|
|
1097
|
-
for listener in self.scheduler.listeners:
|
|
1098
|
-
listener.service_add(service)
|
|
1099
|
-
return service
|
|
1100
|
-
|
|
1101
|
-
def save(self, obj: Any, name: str = "default"):
|
|
1102
|
-
"""Serializes configurations.
|
|
1103
|
-
|
|
1104
|
-
Saves configuration objects within the experimental directory
|
|
1105
|
-
|
|
1106
|
-
:param obj: The object to save
|
|
1107
|
-
:param name: The name of the saving directory (default to `default`)
|
|
1108
|
-
"""
|
|
1109
|
-
|
|
1110
|
-
if self.workspace.run_mode == RunMode.NORMAL:
|
|
1111
|
-
from experimaestro import save
|
|
1112
|
-
|
|
1113
|
-
save_dir = self.workdir / "data" / name
|
|
1114
|
-
save_dir.mkdir(exist_ok=True, parents=True)
|
|
1115
|
-
|
|
1116
|
-
save(obj, save_dir)
|
|
300
|
+
if state is None:
|
|
301
|
+
# Dependencies couldn't be locked, return WAITING state
|
|
302
|
+
return JobState.WAITING
|
|
1117
303
|
|
|
1118
|
-
|
|
1119
|
-
|
|
1120
|
-
|
|
1121
|
-
Loads configuration objects from an experimental directory
|
|
1122
|
-
|
|
1123
|
-
:param reference: The name of the experiment
|
|
1124
|
-
:param name: The name of the saving directory (default to `default`)
|
|
1125
|
-
"""
|
|
1126
|
-
from experimaestro import load
|
|
304
|
+
# Notify scheduler listeners of job state after successful start
|
|
305
|
+
self.notify_job_state(job)
|
|
306
|
+
return state
|
|
1127
307
|
|
|
1128
|
-
|
|
1129
|
-
|
|
308
|
+
except Exception:
|
|
309
|
+
logger.warning("Error in scheduler job coordination", exc_info=True)
|
|
310
|
+
return JobState.ERROR
|