experimaestro 1.5.1__py3-none-any.whl → 2.0.0a8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of experimaestro might be problematic.
- experimaestro/__init__.py +14 -4
- experimaestro/__main__.py +3 -423
- experimaestro/annotations.py +14 -4
- experimaestro/cli/__init__.py +311 -0
- experimaestro/{filter.py → cli/filter.py} +23 -9
- experimaestro/cli/jobs.py +268 -0
- experimaestro/cli/progress.py +269 -0
- experimaestro/click.py +0 -35
- experimaestro/commandline.py +3 -7
- experimaestro/connectors/__init__.py +29 -14
- experimaestro/connectors/local.py +19 -10
- experimaestro/connectors/ssh.py +27 -8
- experimaestro/core/arguments.py +45 -3
- experimaestro/core/callbacks.py +52 -0
- experimaestro/core/context.py +8 -9
- experimaestro/core/identifier.py +310 -0
- experimaestro/core/objects/__init__.py +44 -0
- experimaestro/core/{objects.py → objects/config.py} +399 -772
- experimaestro/core/objects/config_utils.py +58 -0
- experimaestro/core/objects/config_walk.py +151 -0
- experimaestro/core/objects.pyi +15 -45
- experimaestro/core/serialization.py +63 -9
- experimaestro/core/serializers.py +1 -8
- experimaestro/core/types.py +104 -66
- experimaestro/experiments/cli.py +154 -72
- experimaestro/experiments/configuration.py +10 -1
- experimaestro/generators.py +6 -1
- experimaestro/ipc.py +4 -1
- experimaestro/launcherfinder/__init__.py +1 -1
- experimaestro/launcherfinder/base.py +2 -18
- experimaestro/launcherfinder/parser.py +8 -3
- experimaestro/launcherfinder/registry.py +52 -140
- experimaestro/launcherfinder/specs.py +49 -10
- experimaestro/launchers/direct.py +0 -47
- experimaestro/launchers/slurm/base.py +54 -14
- experimaestro/mkdocs/__init__.py +1 -1
- experimaestro/mkdocs/base.py +6 -8
- experimaestro/notifications.py +38 -12
- experimaestro/progress.py +406 -0
- experimaestro/run.py +24 -3
- experimaestro/scheduler/__init__.py +18 -1
- experimaestro/scheduler/base.py +108 -808
- experimaestro/scheduler/dynamic_outputs.py +184 -0
- experimaestro/scheduler/experiment.py +387 -0
- experimaestro/scheduler/jobs.py +475 -0
- experimaestro/scheduler/signal_handler.py +32 -0
- experimaestro/scheduler/state.py +75 -0
- experimaestro/scheduler/workspace.py +27 -8
- experimaestro/scriptbuilder.py +18 -3
- experimaestro/server/__init__.py +36 -5
- experimaestro/server/data/1815e00441357e01619e.ttf +0 -0
- experimaestro/server/data/2463b90d9a316e4e5294.woff2 +0 -0
- experimaestro/server/data/2582b0e4bcf85eceead0.ttf +0 -0
- experimaestro/server/data/89999bdf5d835c012025.woff2 +0 -0
- experimaestro/server/data/914997e1bdfc990d0897.ttf +0 -0
- experimaestro/server/data/c210719e60948b211a12.woff2 +0 -0
- experimaestro/server/data/index.css +5187 -5068
- experimaestro/server/data/index.css.map +1 -1
- experimaestro/server/data/index.js +68887 -68064
- experimaestro/server/data/index.js.map +1 -1
- experimaestro/settings.py +45 -5
- experimaestro/sphinx/__init__.py +7 -17
- experimaestro/taskglobals.py +7 -2
- experimaestro/tests/core/__init__.py +0 -0
- experimaestro/tests/core/test_generics.py +206 -0
- experimaestro/tests/definitions_types.py +5 -3
- experimaestro/tests/launchers/bin/sbatch +34 -7
- experimaestro/tests/launchers/bin/srun +5 -0
- experimaestro/tests/launchers/common.py +17 -5
- experimaestro/tests/launchers/config_slurm/launchers.py +25 -0
- experimaestro/tests/restart.py +10 -5
- experimaestro/tests/tasks/all.py +23 -10
- experimaestro/tests/tasks/foreign.py +2 -4
- experimaestro/tests/test_checkers.py +2 -2
- experimaestro/tests/test_dependencies.py +11 -17
- experimaestro/tests/test_experiment.py +73 -0
- experimaestro/tests/test_file_progress.py +425 -0
- experimaestro/tests/test_file_progress_integration.py +477 -0
- experimaestro/tests/test_findlauncher.py +12 -5
- experimaestro/tests/test_forward.py +5 -5
- experimaestro/tests/test_generators.py +93 -0
- experimaestro/tests/test_identifier.py +182 -158
- experimaestro/tests/test_instance.py +19 -27
- experimaestro/tests/test_objects.py +13 -20
- experimaestro/tests/test_outputs.py +6 -6
- experimaestro/tests/test_param.py +68 -30
- experimaestro/tests/test_progress.py +4 -4
- experimaestro/tests/test_serializers.py +24 -64
- experimaestro/tests/test_ssh.py +7 -0
- experimaestro/tests/test_tags.py +50 -21
- experimaestro/tests/test_tasks.py +42 -51
- experimaestro/tests/test_tokens.py +11 -8
- experimaestro/tests/test_types.py +24 -21
- experimaestro/tests/test_validation.py +67 -110
- experimaestro/tests/token_reschedule.py +1 -1
- experimaestro/tokens.py +24 -13
- experimaestro/tools/diff.py +8 -1
- experimaestro/typingutils.py +20 -11
- experimaestro/utils/asyncio.py +6 -2
- experimaestro/utils/multiprocessing.py +44 -0
- experimaestro/utils/resources.py +11 -3
- {experimaestro-1.5.1.dist-info → experimaestro-2.0.0a8.dist-info}/METADATA +28 -36
- experimaestro-2.0.0a8.dist-info/RECORD +166 -0
- {experimaestro-1.5.1.dist-info → experimaestro-2.0.0a8.dist-info}/WHEEL +1 -1
- {experimaestro-1.5.1.dist-info → experimaestro-2.0.0a8.dist-info}/entry_points.txt +0 -4
- experimaestro/launchers/slurm/cli.py +0 -29
- experimaestro/launchers/slurm/configuration.py +0 -597
- experimaestro/scheduler/environment.py +0 -94
- experimaestro/server/data/016b4a6cdced82ab3aa1.ttf +0 -0
- experimaestro/server/data/50701fbb8177c2dde530.ttf +0 -0
- experimaestro/server/data/878f31251d960bd6266f.woff2 +0 -0
- experimaestro/server/data/b041b1fa4fe241b23445.woff2 +0 -0
- experimaestro/server/data/b6879d41b0852f01ed5b.woff2 +0 -0
- experimaestro/server/data/d75e3fd1eb12e9bd6655.ttf +0 -0
- experimaestro/tests/launchers/config_slurm/launchers.yaml +0 -134
- experimaestro/utils/yaml.py +0 -202
- experimaestro-1.5.1.dist-info/RECORD +0 -148
- {experimaestro-1.5.1.dist-info → experimaestro-2.0.0a8.dist-info/licenses}/LICENSE +0 -0
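The most visible change in this release is the break-up of the scheduler and CLI modules: `scheduler/base.py` loses roughly 800 lines while new `scheduler/jobs.py`, `scheduler/experiment.py`, `scheduler/signal_handler.py` and `scheduler/state.py` modules appear, and the command-line entry points appear to move from `__main__.py` into the new `cli/` package. For code that imported these classes directly, the import locations suggested by the diff below are sketched here; only `Job`, `JobState` and `experiment` are confirmed by the new imports inside `base.py`, the other destinations are inferred from the new file names.

```python
# Import paths under 2.0.0a8 as suggested by this diff (illustrative, not
# authoritative): base.py itself performs the first two imports, and
# Scheduler is still defined in scheduler/base.py.
from experimaestro.scheduler.jobs import Job, JobState  # moved out of base.py
from experimaestro.scheduler import experiment           # presumably defined in scheduler/experiment.py
from experimaestro.scheduler.base import Scheduler       # now a threading.Thread subclass
```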
experimaestro/scheduler/base.py
CHANGED
@@ -1,346 +1,23 @@
-
-from functools import cached_property
-import os
-from pathlib import Path
-from shutil import rmtree
+import logging
 import threading
 import time
-from typing import …
+from typing import (
+    Optional,
+    Set,
+)
 import asyncio
-from experimaestro.exceptions import HandledException
-from experimaestro.notifications import LevelInformation, Reporter
 from typing import Dict
+
+from experimaestro.scheduler import experiment
+from experimaestro.scheduler.jobs import Job, JobState
 from experimaestro.scheduler.services import Service
-from experimaestro.settings import get_settings


-from experimaestro.core.objects import Config, ConfigWalkContext
 from experimaestro.utils import logger
-from experimaestro.…
-from .environment import Environment
-from .workspace import RunMode, Workspace
-from .dependencies import Dependency, DependencyStatus, Resource
+from experimaestro.utils.asyncio import asyncThreadcheck
 import concurrent.futures


-if TYPE_CHECKING:
-    from experimaestro.connectors import Process
-    from experimaestro.launchers import Launcher
-
-
-class FailedExperiment(HandledException):
-    """Raised when an experiment failed"""
-
-    pass
-
-
-class JobState(enum.Enum):
-    UNSCHEDULED = 0
-    WAITING = 1
-    READY = 2
-    SCHEDULED = 3
-    RUNNING = 4
-    DONE = 5
-    ERROR = 6
-    … (plus the notstarted() / running() / finished() helpers)
-
-
-class JobFailureStatus(enum.Enum):
-    DEPENDENCY = 0
-    FAILED = 1
-    MEMORY = 2
-
-
-class JobLock(Lock):
-    … (acquired only when the job is DONE, never released)
-
-
-class JobDependency(Dependency):
-    … (status() maps the origin job state to OK/FAIL/WAIT; lock() returns a JobLock)
-
-
-class Job(Resource):
-    """A job is a resouce that is produced by the execution of some code"""
-    … (≈230 lines removed: constructor taking a Config, workspace and launcher;
-       the environ, progress and set_progress members; add_notification_server;
-       the path, relpath, identifier, pid/lock/done/failed and stdout/stderr
-       properties; prepare(), aio_run(), aio_process(); dependencychanged()
-       and finalState())
-
-
-class JobContext(ConfigWalkContext):
-    … (name / path / task properties delegating to the wrapped Job)
 class Listener:
     def job_submitted(self, job):
         pass
@@ -353,78 +30,16 @@ class Listener:
         pass


-class …:
-    def __init__(self, code):
-        super().__init__(f"Job exited with code {code}")
-
-
-class SignalHandler:
-    … (keeps a set of experiments, installs itself as the SIGINT handler and
-       calls xp.stop() on each registered experiment when a signal is received)
-
-
-SIGNAL_HANDLER = SignalHandler()
-
-
-class SchedulerCentral(threading.Thread):
-    loop: asyncio.AbstractEventLoop
-
-    """The event loop thread used by the scheduler"""
-    … (daemon thread creating the asyncio loop, exitCondition and
-       dependencyLock; create(name) starts the thread and waits until ready)
-
-
-class Scheduler:
+class Scheduler(threading.Thread):
     """A job scheduler

     The scheduler is based on asyncio for easy concurrency handling
     """

     def __init__(self, xp: "experiment", name: str):
+        super().__init__(name=f"Scheduler ({name})", daemon=True)
+        self._ready = threading.Event()
+
         # Name of the experiment
         self.name = name
         self.xp = xp
@@ -436,14 +51,37 @@ class Scheduler:
         self.jobs: Dict[str, "Job"] = {}

         # List of jobs
-        self.waitingjobs = set()
+        self.waitingjobs: Set[Job] = set()

         # Listeners
         self.listeners: Set[Listener] = set()

-    @…
-    def …
+    @staticmethod
+    def create(xp: "experiment", name: str):
+        instance = Scheduler(xp, name)
+        instance.start()
+        instance._ready.wait()
+        return instance
+
+    def run(self):
+        """Run the event loop forever"""
+        logger.debug("Starting event loop thread")
+        # Ported from SchedulerCentral
+        self.loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(self.loop)
+        # Set loop-dependent variables
+        self.exitCondition = asyncio.Condition()
+        self.dependencyLock = asyncio.Lock()
+        self._ready.set()
+        self.loop.run_forever()
+
+    def start_scheduler(self):
+        """Start the scheduler event loop in a thread"""
+        if not self.is_alive():
+            self.start()
+            self._ready.wait()
+        else:
+            logger.warning("Scheduler already started")

     def addlistener(self, listener: Listener):
         self.listeners.add(listener)
@@ -452,6 +90,13 @@ class Scheduler:
         self.listeners.remove(listener)

     def getJobState(self, job: Job) -> "concurrent.futures.Future[JobState]":
+        # Check if the job belongs to this scheduler
+        if job.identifier not in self.jobs:
+            # If job is not in this scheduler, return its current state directly
+            future = concurrent.futures.Future()
+            future.set_result(job.state)
+            return future
+
         return asyncio.run_coroutine_threadsafe(self.aio_getjobstate(job), self.loop)

     async def aio_getjobstate(self, job: Job):
@@ -459,14 +104,17 @@ class Scheduler:

     def submit(self, job: Job) -> Optional[Job]:
         # Wait for the future containing the submitted job
+        logger.debug("Registering the job %s within the scheduler", job)
         otherFuture = asyncio.run_coroutine_threadsafe(
             self.aio_registerJob(job), self.loop
         )
         other = otherFuture.result()
+        logger.debug("Job already submitted" if other else "First submission")
         if other:
             return other

         job._future = asyncio.run_coroutine_threadsafe(self.aio_submit(job), self.loop)
+        return None

     def prepare(self, job: Job):
         """Prepares the job for running"""
@@ -498,6 +146,22 @@ class Scheduler:

         return None

+    def notify_job_submitted(self, job: Job):
+        """Notify the listeners that a job has been submitted"""
+        for listener in self.listeners:
+            try:
+                listener.job_submitted(job)
+            except Exception:
+                logger.exception("Got an error with listener %s", listener)
+
+    def notify_job_state(self, job: Job):
+        """Notify the listeners that a job has changed state"""
+        for listener in self.listeners:
+            try:
+                listener.job_state(job)
+            except Exception:
+                logger.exception("Got an error with listener %s", listener)
+
     async def aio_submit(self, job: Job) -> JobState:  # noqa: C901
         """Main scheduler function: submit a job, run it (if needed), and returns
         the status code
@@ -508,6 +172,12 @@
         job.scheduler = self
         self.waitingjobs.add(job)

+        # Check that we don't have a completed job in
+        # alternate directories
+        for jobspath in experiment.current().alt_jobspaths:
+            # FIXME: check if done
+            pass
+
         # Creates a link into the experiment folder
         path = experiment.current().jobspath / job.relpath
         path.parent.mkdir(parents=True, exist_ok=True)
@@ -516,11 +186,8 @@
             path.symlink_to(job.path)

         job.state = JobState.WAITING
-        …
-                listener.job_submitted(job)
-            except Exception:
-                logger.exception("Got an error with listener %s", listener)
+
+        self.notify_job_submitted(job)

         # Add dependencies, and add to blocking resources
         if job.dependencies:
@@ -543,11 +210,8 @@
         if process is not None:
             # Yep! First we notify the listeners
             job.state = JobState.RUNNING
-            …
-                    listener.job_state(job)
-                except Exception:
-                    logger.exception("Got an error with listener %s", listener)
+            # Notify the listeners
+            self.notify_job_state(job)

             # Adds to the listeners
             if self.xp.server is not None:
@@ -582,20 +246,20 @@

         job.state = state

-        …
-            try:
-                listener.job_state(job)
-            except Exception as e:
-                logger.exception("Listener %s did raise an exception", e)
+        self.notify_job_state(job)

         # Job is finished
         if job.state != JobState.DONE:
             self.xp.failedJobs[job.identifier] = job

+        # Process all remaining tasks outputs
+        await asyncThreadcheck("End of job processing", job.done_handler)
+
         # Decrement the number of unfinished jobs and notify
         self.xp.unfinishedJobs -= 1
-        async with self.…
+        async with self.exitCondition:
+            logging.debug("Updated number of unfinished jobs")
+            self.exitCondition.notify_all()

         job.endtime = time.time()
         if job in self.waitingjobs:
@@ -610,401 +274,37 @@
         return job.state

     async def aio_start(self, job: Job) -> Optional[JobState]:
-        """Start a job
-
-        Returns None if the dependencies could not be locked after all
-        Returns DONE/ERROR depending on the process outcome
-        """
-
-        # We first lock the job before proceeding
-        assert job.launcher is not None
-        assert self.xp.central is not None
-
-        with Locks() as locks:
-            logger.debug("[starting] Locking job %s", job)
-            async with job.launcher.connector.lock(job.lockpath):
-                logger.debug("[starting] Locked job %s", job)
-                … (≈55 lines removed: acquire every dependency lock under
-                   self.xp.central.dependencyLock, notify the listeners, create
-                   the job directory, register the notification URL, run the job
-                   with job.aio_run(), wait for the process and derive the final
-                   JobState from the .failed file / JobError)
-
-
-ServiceClass = TypeVar("ServiceClass", bound=Service)
-
-
-class experiment:
-    """Main experiment object"""
-    … (≈290 lines removed: the experiment context manager — CURRENT / current(),
-       the constructor building the Environment, Workspace, Scheduler,
-       SchedulerCentral and web Server, submit()/prepare(), the results/jobs
-       path properties, stop() and wait(), setenv(), token(),
-       __enter__/__exit__, add_service(), save() and load())
+        """Start a job (scheduler coordination layer)

+        This method serves as a coordination layer that delegates the actual
+        job starting logic to the job itself while handling scheduler-specific
+        concerns like state notifications and providing coordination context.

+        :param job: The job to start
+        :return: JobState.WAITING if dependencies could not be locked, JobState.DONE
+        if job completed successfully, JobState.ERROR if job failed during execution,
+        or None (should not occur in normal operation)
+        :raises Exception: Various exceptions during scheduler coordination
         """

+        # Assert preconditions
+        assert job.launcher is not None

         try:
+            # Call job's start method with scheduler context
+            state = await job.aio_start(
+                sched_dependency_lock=self.dependencyLock,
+                notification_server=self.xp.server if self.xp else None,
+            )

+            if state is None:
+                # Dependencies couldn't be locked, return WAITING state
+                return JobState.WAITING

+            # Notify scheduler listeners of job state after successful start
+            self.notify_job_state(job)
+            return state

+        except Exception:
+            logger.warning("Error in scheduler job coordination", exc_info=True)
+            return JobState.ERROR