experimaestro 2.0.0a3__py3-none-any.whl → 2.0.0a4__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
Potentially problematic release.
This version of experimaestro might be problematic.
- experimaestro/connectors/__init__.py +2 -2
- experimaestro/core/objects/config.py +28 -9
- experimaestro/scheduler/__init__.py +18 -1
- experimaestro/scheduler/base.py +87 -906
- experimaestro/scheduler/experiment.py +387 -0
- experimaestro/scheduler/jobs.py +475 -0
- experimaestro/scheduler/signal_handler.py +32 -0
- experimaestro/scheduler/state.py +1 -1
- experimaestro/server/__init__.py +36 -5
- experimaestro/tests/test_dependencies.py +1 -1
- experimaestro/tests/test_generators.py +34 -9
- experimaestro/typingutils.py +11 -2
- {experimaestro-2.0.0a3.dist-info → experimaestro-2.0.0a4.dist-info}/METADATA +3 -2
- {experimaestro-2.0.0a3.dist-info → experimaestro-2.0.0a4.dist-info}/RECORD +17 -14
- {experimaestro-2.0.0a3.dist-info → experimaestro-2.0.0a4.dist-info}/WHEEL +1 -1
- {experimaestro-2.0.0a3.dist-info → experimaestro-2.0.0a4.dist-info}/entry_points.txt +0 -0
- {experimaestro-2.0.0a3.dist-info → experimaestro-2.0.0a4.dist-info/licenses}/LICENSE +0 -0

experimaestro/scheduler/jobs.py
ADDED

@@ -0,0 +1,475 @@
+import asyncio
+import time
+from collections import ChainMap
+import enum
+from functools import cached_property
+import itertools
+from pathlib import Path
+from typing import TYPE_CHECKING, Iterator, List, Optional, Set
+
+import concurrent
+
+from experimaestro.core.objects import Config, ConfigWalkContext, WatchedOutput
+from experimaestro.notifications import LevelInformation, Reporter
+
+# from experimaestro.scheduler.base import Scheduler
+from experimaestro.scheduler.dependencies import Dependency, DependencyStatus, Resource
+from experimaestro.scheduler.workspace import RunMode, Workspace
+from experimaestro.locking import Lock, LockError, Locks
+from experimaestro.utils import logger
+
+if TYPE_CHECKING:
+    from experimaestro.connectors import Process
+    from experimaestro.launchers import Launcher
+
+
+class JobState(enum.Enum):
+    # Job is not yet scheduled
+    UNSCHEDULED = 0
+
+    # Job is waiting for dependencies to be done
+    WAITING = 1
+
+    # Job is ready to run
+    READY = 2
+
+    # Job is scheduled (e.g. slurm)
+    SCHEDULED = 3
+
+    # Job is running
+    RUNNING = 4
+
+    # Job is done (finished)
+    DONE = 5
+
+    # Job failed (finished)
+    ERROR = 6
+
+    def notstarted(self):
+        return self.value <= JobState.READY.value
+
+    def running(self):
+        return (
+            self.value == JobState.RUNNING.value
+            or self.value == JobState.SCHEDULED.value
+        )
+
+    def finished(self):
+        return self.value >= JobState.DONE.value
+
+
+class JobFailureStatus(enum.Enum):
+    #: Job failed
+    DEPENDENCY = 0
+
+    #: Job dependency failed
+    FAILED = 1
+
+    #: Memory
+    MEMORY = 2
+
+
+class JobLock(Lock):
+    def __init__(self, job):
+        super().__init__()
+        self.job = job
+
+    def _acquire(self):
+        return self.job.state == JobState.DONE
+
+    def _release(self):
+        return False
+
+
+class JobDependency(Dependency):
+    def __init__(self, job):
+        super().__init__(job)
+
+    def status(self) -> DependencyStatus:
+        if self.origin.state == JobState.DONE:
+            return DependencyStatus.OK
+        elif self.origin.state == JobState.ERROR:
+            return DependencyStatus.FAIL
+        return DependencyStatus.WAIT
+
+    def lock(self):
+        return JobLock(self.origin)
+
+
+class Job(Resource):
+    """A job is a resource that is produced by the execution of some code"""
+
+    # Set by the scheduler
+    _readyEvent: Optional[asyncio.Event]
+    _future: Optional["concurrent.futures.Future"]
+
+    def __init__(
+        self,
+        config: Config,
+        *,
+        workspace: Workspace = None,
+        launcher: "Launcher" = None,
+        run_mode: RunMode = RunMode.NORMAL,
+    ):
+        from experimaestro.scheduler.base import Scheduler
+
+        super().__init__()
+
+        self.workspace = workspace or Workspace.CURRENT
+        self.launcher = launcher or self.workspace.launcher if self.workspace else None
+
+        if run_mode == RunMode.NORMAL:
+            assert self.workspace is not None, "No experiment has been defined"
+            assert self.launcher is not None, (
+                "No launcher, and no default defined for the workspace %s" % workspace
+            )
+
+        self.type = config.__xpmtype__
+        self.name = str(self.type.identifier).rsplit(".", 1)[-1]
+
+        self.scheduler: Optional["Scheduler"] = None
+        self.config = config
+        self.state: JobState = JobState.UNSCHEDULED
+
+        #: If a job has failed, indicates the failure status
+        self.failure_status: JobFailureStatus = None
+
+        # Dependencies
+        self.dependencies: Set[Dependency] = set()  # as target
+
+        # Watched outputs
+        self.watched_outputs = {}
+        for watched in config.__xpm__.watched_outputs:
+            self.watch_output(watched)
+
+        # Process
+        self._process = None
+        self.unsatisfied = 0
+
+        # Meta-information
+        self.starttime: Optional[float] = None
+        self.submittime: Optional[float] = None
+        self.endtime: Optional[float] = None
+        self._progress: List[LevelInformation] = []
+        self.tags = config.tags()
+
+    def watch_output(self, watched: "WatchedOutput"):
+        """Monitor task outputs
+
+        :param watched: A description of the watched output
+        """
+        self.scheduler.xp.watch_output(watched)
+
+    def task_output_update(self, subpath: Path):
+        """Notification of an updated task output"""
+        if watcher := self.watched_outputs.get(subpath, None):
+            watcher.update()
+
+    def done_handler(self):
+        """The task has been completed"""
+        for watcher in self.watched_outputs.values():
+            watcher.update()
+
+    def __str__(self):
+        return "Job[{}]".format(self.identifier)
+
+    def wait(self) -> JobState:
+        assert self._future, "Cannot wait a not submitted job"
+        return self._future.result()
+
+    @cached_property
+    def python_path(self) -> Iterator[str]:
+        """Returns an iterator over python path"""
+        return itertools.chain(self.workspace.python_path)
+
+    @cached_property
+    def environ(self):
+        """Returns the job environment
+
+        It is made of (by order of priority):
+
+        1. The job environment
+        1. The launcher environment
+        1. The workspace environment
+
+        """
+        return ChainMap(
+            {},
+            self.launcher.environ if self.launcher else {},
+            self.workspace.env if self.workspace else {},
+        )
+
+    @property
+    def progress(self):
+        return self._progress
+
+    def set_progress(self, level: int, value: float, desc: Optional[str]):
+        if value < 0:
+            logger.warning(f"Progress value out of bounds ({value})")
+            value = 0
+        elif value > 1:
+            logger.warning(f"Progress value out of bounds ({value})")
+            value = 1
+
+        # Adjust the length of the array
+        self._progress = self._progress[: (level + 1)]
+        while len(self._progress) <= level:
+            self._progress.append(LevelInformation(len(self._progress), None, 0.0))
+
+        if desc:
+            self._progress[-1].desc = desc
+        self._progress[-1].progress = value
+
+        for listener in self.scheduler.listeners:
+            listener.job_state(self)
+
+    def add_notification_server(self, server):
+        """Adds a notification server"""
+        key, baseurl = server.getNotificationSpec()
+        dirpath = self.path / Reporter.NOTIFICATION_FOLDER
+        dirpath.mkdir(exist_ok=True)
+        (dirpath / key).write_text(f"{baseurl}/{self.identifier}")
+
+    @property
+    def ready(self):
+        return self.state == JobState.READY
+
+    @property
+    def jobpath(self) -> Path:
+        """Deprecated, use `path`"""
+        return self.workspace.jobspath / self.relpath
+
+    @property
+    def path(self) -> Path:
+        return self.workspace.jobspath / self.relpath
+
+    @property
+    def experimaestro_path(self) -> Path:
+        return (self.path / ".experimaestro").resolve()
+
+    @cached_property
+    def task_outputs_path(self) -> Path:
+        return self.experimaestro_path / "task-outputs.jsonl"
+
+    @property
+    def relpath(self):
+        identifier = self.config.__xpm__.identifier
+        base = Path(str(self.type.identifier))
+        return base / identifier.all.hex()
+
+    @property
+    def relmainpath(self):
+        identifier = self.config.__xpm__.identifier
+        base = Path(str(self.type.identifier))
+        return base / identifier.main.hex()
+
+    @property
+    def hashidentifier(self):
+        return self.config.__xpm__.identifier
+
+    @property
+    def identifier(self):
+        return self.config.__xpm__.identifier.all.hex()
+
+    def prepare(self, overwrite=False):
+        """Prepare all files before starting a task
+
+        :param overwrite: if True, overwrite files even if the task has been run
+        """
+        pass
+
+    async def aio_start(self, sched_dependency_lock, notification_server=None):
+        """Start the job with core job starting logic
+
+        This method contains the core logic for starting a job that was previously
+        located in Scheduler.aio_start(). It handles job locking, dependency
+        acquisition, directory setup, and job execution while using the scheduler's
+        coordination lock to prevent race conditions between multiple jobs.
+
+        :param sched_dependency_lock: The scheduler's dependency lock for coordination
+            between jobs to prevent race conditions during dependency acquisition
+        :param notification_server: Optional notification server from the experiment
+            for job progress reporting
+        :return: JobState.DONE if job completed successfully, JobState.ERROR if job
+            failed during execution, or None if dependencies couldn't be locked
+            (signals WAITING state to scheduler)
+        :raises Exception: Various exceptions during job execution, dependency locking,
+            or process creation
+        """
+        # We first lock the job before proceeding
+        assert self.launcher is not None
+
+        with Locks() as locks:
+            logger.debug("[starting] Locking job %s", self)
+            async with self.launcher.connector.lock(self.lockpath):
+                logger.debug("[starting] Locked job %s", self)
+
+                state = None
+                try:
+                    logger.debug(
+                        "Starting job %s with %d dependencies",
+                        self,
+                        len(self.dependencies),
+                    )
+
+                    # Individual dependency lock acquisition
+                    # We use the scheduler-wide lock to avoid cross-jobs race conditions
+                    async with sched_dependency_lock:
+                        for dependency in self.dependencies:
+                            try:
+                                locks.append(dependency.lock().acquire())
+                            except LockError:
+                                logger.warning(
+                                    "Could not lock %s, aborting start for job %s",
+                                    dependency,
+                                    self,
+                                )
+                                dependency.check()
+                                return None  # Signal to scheduler that dependencies couldn't be locked
+
+                    # Dependencies have been locked, we can start the job
+                    self.starttime = time.time()
+
+                    # Creates the main directory
+                    directory = self.path
+                    logger.debug("Making directories job %s...", directory)
+                    if not directory.is_dir():
+                        directory.mkdir(parents=True, exist_ok=True)
+
+                    # Sets up the notification URL
+                    if notification_server is not None:
+                        self.add_notification_server(notification_server)
+
+                except Exception:
+                    logger.warning("Error while locking job", exc_info=True)
+                    return None  # Signal waiting state to scheduler
+
+                try:
+                    # Runs the job
+                    process = await self.aio_run()
+                except Exception:
+                    logger.warning("Error while starting job", exc_info=True)
+                    return JobState.ERROR
+
+                try:
+                    if isinstance(process, JobState):
+                        state = process
+                        logger.debug("Job %s ended (state %s)", self, state)
+                    else:
+                        logger.debug("Waiting for job %s process to end", self)
+
+                        code = await process.aio_code()
+                        logger.debug("Got return code %s for %s", code, self)
+
+                        # Check the file if there is no return code
+                        if code is None:
+                            # Case where we cannot retrieve the code right away
+                            if self.donepath.is_file():
+                                code = 0
+                            else:
+                                code = int(self.failedpath.read_text())
+
+                        logger.debug("Job %s ended with code %s", self, code)
+                        state = JobState.DONE if code == 0 else JobState.ERROR
+
+                except JobError:
+                    logger.warning("Error while running job")
+                    state = JobState.ERROR
+
+                except Exception:
+                    logger.warning(
+                        "Error while running job (in experimaestro)", exc_info=True
+                    )
+                    state = JobState.ERROR
+        return state
+
+    async def aio_run(self):
+        """Actually run the code"""
+        raise NotImplementedError(f"Method aio_run not implemented in {self.__class__}")
+
+    async def aio_process(self) -> Optional["Process"]:
+        """Returns the process if it exists"""
+        raise NotImplementedError("Not implemented")
+
+    @property
+    def pidpath(self):
+        """This file contains the file PID"""
+        return self.jobpath / ("%s.pid" % self.name)
+
+    @property
+    def lockpath(self):
+        """This file is used as a lock for running the job"""
+        return self.workspace.jobspath / self.relmainpath / ("%s.lock" % self.name)
+
+    @property
+    def donepath(self) -> Path:
+        """When a job has been successful, this file is written"""
+        return self.jobpath / ("%s.done" % self.name)
+
+    @property
+    def failedpath(self):
+        """When a job has been unsuccessful, this file is written with an error
+        code inside"""
+        return self.jobpath / ("%s.failed" % self.name)
+
+    @property
+    def stdout(self) -> Path:
+        return self.jobpath / ("%s.out" % self.name)
+
+    @property
+    def stderr(self) -> Path:
+        return self.jobpath / ("%s.err" % self.name)
+
+    @property
+    def basepath(self) -> Path:
+        return self.jobpath / self.name
+
+    def dependencychanged(self, dependency, oldstatus, status):
+        """Called when a dependency has changed"""
+
+        def value(s):
+            return 1 if s == DependencyStatus.OK else 0
+
+        self.unsatisfied -= value(status) - value(oldstatus)
+
+        logger.debug("Job %s: unsatisfied %d", self, self.unsatisfied)
+
+        if status == DependencyStatus.FAIL:
+            # Job completed
+            if not self.state.finished():
+                self.state = JobState.ERROR
+                self.failure_status = JobFailureStatus.DEPENDENCY
+                self._readyEvent.set()
+
+        if self.unsatisfied == 0:
+            logger.info("Job %s is ready to run", self)
+            # We are ready
+            self.state = JobState.READY
+            self._readyEvent.set()
+
+    def finalState(self) -> "concurrent.futures.Future[JobState]":
+        assert self._future is not None
+        return self._future
+
+
+class JobContext(ConfigWalkContext):
+    def __init__(self, job: Job):
+        super().__init__()
+        self.job = job
+
+    @property
+    def name(self):
+        return self.job.name
+
+    @property
+    def path(self):
+        return self.job.path
+
+    @property
+    def task(self):
+        return self.job.config
+
+
+class JobError(Exception):
+    def __init__(self, code):
+        super().__init__(f"Job exited with code {code}")
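
To make the new state machine easier to follow, here is a minimal illustrative snippet (not part of the diff) exercising the JobState helper methods defined above; the import path assumes the module shown here, experimaestro.scheduler.jobs.

from experimaestro.scheduler.jobs import JobState

state = JobState.SCHEDULED
print(state.notstarted())  # False: SCHEDULED comes after READY
print(state.running())     # True: SCHEDULED and RUNNING both count as "running"
print(state.finished())    # False: only DONE and ERROR are terminal

print(JobState.ERROR.finished())  # True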

experimaestro/scheduler/signal_handler.py
ADDED

@@ -0,0 +1,32 @@
+import signal
+from typing import Set
+from experimaestro.scheduler import experiment
+from experimaestro.utils import logger
+
+
+class SignalHandler:
+    def __init__(self):
+        self.experiments: Set["experiment"] = set()
+        self.original_sigint_handler = None
+
+    def add(self, xp: "experiment"):
+        if not self.experiments:
+            self.original_sigint_handler = signal.getsignal(signal.SIGINT)
+
+            signal.signal(signal.SIGINT, self)
+
+        self.experiments.add(xp)
+
+    def remove(self, xp):
+        self.experiments.remove(xp)
+        if not self.experiments:
+            signal.signal(signal.SIGINT, self.original_sigint_handler)
+
+    def __call__(self, signum, frame):
+        """SIGINT signal handler"""
+        logger.warning("Signal received")
+        for xp in self.experiments:
+            xp.stop()
+
+
+SIGNAL_HANDLER = SignalHandler()
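
As a usage sketch (illustrative, not part of the diff): an experiment registers itself with the module-level SIGNAL_HANDLER while it runs, so a SIGINT stops every live experiment. The DummyExperiment class below is a hypothetical stand-in for experimaestro's experiment object and only provides the stop() method the handler calls.

from experimaestro.scheduler.signal_handler import SIGNAL_HANDLER


class DummyExperiment:  # hypothetical stand-in exposing the stop() hook
    def stop(self):
        print("stopping running jobs...")


xp = DummyExperiment()
SIGNAL_HANDLER.add(xp)  # first registration installs the SIGINT handler
try:
    pass  # run the experiment here
finally:
    SIGNAL_HANDLER.remove(xp)  # last removal restores the original handler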

experimaestro/scheduler/state.py
CHANGED

@@ -5,7 +5,7 @@ from typing import Iterable, Optional, Type
 from experimaestro import Task
 
 from experimaestro.core.context import SerializationContext
-from experimaestro.scheduler.
+from experimaestro.scheduler.jobs import Job, JobDependency
 from experimaestro.settings import find_workspace
 from experimaestro.core.serialization import from_state_dict, save_definition
 

experimaestro/server/__init__.py
CHANGED

@@ -5,10 +5,25 @@ import platform
 import socket
 import uuid
 from experimaestro.scheduler.base import Job
-import
+import sys
 import http
 import threading
 from typing import Optional, Tuple
+
+if sys.version_info >= (3, 9):
+    from importlib.resources import files
+
+    pkg_resources = None
+else:
+    try:
+        from importlib_resources import files
+
+        pkg_resources = None
+    except ImportError:
+        # Fallback to pkg_resources if importlib_resources not available
+        import pkg_resources
+
+        files = None
 from experimaestro.scheduler import Scheduler, Listener as BaseListener
 from experimaestro.scheduler.services import Service, ServiceListener
 from experimaestro.settings import ServerSettings

@@ -143,6 +158,7 @@ def proxy_response(base_url: str, request: Request, path: str):
     return flask_response
 
 
+# flake8: noqa: C901
 def start_app(server: "Server"):
     logging.debug("Starting Flask server...")
     app = Flask("experimaestro")

@@ -256,10 +272,25 @@ def start_app(server: "Server"):
 
         datapath = "data/%s" % path
         logging.debug("Looking for %s", datapath)
-
-
-
-
+
+        if files is not None:
+            try:
+                package_files = files("experimaestro.server")
+                resource_file = package_files / datapath
+                if resource_file.is_file():
+                    mimetype = MIMETYPES[datapath.rsplit(".", 1)[1]]
+                    content = resource_file.read_bytes()
+                    return Response(content, mimetype=mimetype)
+            except (FileNotFoundError, KeyError):
+                pass
+        elif pkg_resources is not None:
+            # Fallback to pkg_resources
+            if pkg_resources.resource_exists("experimaestro.server", datapath):
+                mimetype = MIMETYPES[datapath.rsplit(".", 1)[1]]
+                content = pkg_resources.resource_string(
+                    "experimaestro.server", datapath
+                )
+                return Response(content, mimetype=mimetype)
         return Response("Page not found", status=404)
 
     # Start the app
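
The server change above switches static-asset loading to importlib.resources, keeping pkg_resources only as a legacy fallback. Below is a self-contained sketch of that pattern under the same assumptions; read_packaged_data and its arguments are placeholders, not experimaestro APIs.

import sys

if sys.version_info >= (3, 9):
    from importlib.resources import files
    pkg_resources = None
else:
    try:
        from importlib_resources import files  # backport package
        pkg_resources = None
    except ImportError:
        import pkg_resources
        files = None


def read_packaged_data(package: str, relpath: str) -> bytes:
    """Read a data file shipped inside `package` (placeholder helper)."""
    if files is not None:
        resource = files(package) / relpath
        if resource.is_file():
            return resource.read_bytes()
        raise FileNotFoundError(relpath)
    # Legacy path for environments without importlib.resources / the backport
    if not pkg_resources.resource_exists(package, relpath):
        raise FileNotFoundError(relpath)
    return pkg_resources.resource_string(package, relpath)

For example, read_packaged_data("mypackage", "data/index.html") returns the file bytes regardless of which loader is available.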

experimaestro/tests/test_dependencies.py
CHANGED

@@ -1,7 +1,7 @@
 from typing import Any, Callable
 import pytest
 from experimaestro import Config, Param, Task, RunMode
-from experimaestro.scheduler.
+from experimaestro.scheduler.jobs import JobDependency
 from experimaestro.tests.utils import TemporaryExperiment
 
 

experimaestro/tests/test_generators.py
CHANGED

@@ -13,12 +13,35 @@ class Learner(Task):
     validation: Param[Validation]
     x: Param[int]
 
+    @staticmethod
+    def create(x: int, validation: Param[Validation]):
+        return Learner.C(x=x, validation=validation)
+
+
+class LearnerList(Task):
+    validation: Param[list[Validation]]
+    x: Param[int]
+
+    @staticmethod
+    def create(x: int, validation: Param[Validation]):
+        return LearnerList.C(x=x, validation=[validation])
+
+
+class LearnerDict(Task):
+    validation: Param[dict[str, Validation]]
+    x: Param[int]
+
+    @staticmethod
+    def create(x: int, validation: Param[Validation]):
+        return LearnerDict.C(x=x, validation={"key": validation})
+
 
 class ModuleLoader(Task):
     validation: Param[Validation] = field(ignore_generated=True)
 
 
-
+@pytest.mark.parametrize("cls", [Learner, LearnerDict, LearnerList])
+def test_generators_reuse_on_submit(cls):
     # We have one way to select the best model
     validation = Validation.C()
 

@@ -29,40 +52,42 @@ def test_generators_reuse_on_submit():
     )
 
     # OK, the path is generated depending on Learner with x=1
-
+    cls.create(1, validation).submit(workspace=workspace)
 
     with pytest.raises((AttributeError)):
         # Here we have a problem...
        # the path is still the previous one
-
+        cls.create(2, validation).submit(workspace=workspace)
 
 
-
+@pytest.mark.parametrize("cls", [Learner, LearnerDict, LearnerList])
+def test_generators_delayed_submit(cls):
     workspace = Workspace(
         Settings(),
         WorkspaceSettings("test_generators_simple", path=Path("/tmp")),
         run_mode=RunMode.DRY_RUN,
     )
     validation = Validation.C()
-    task1 =
-    task2 =
+    task1 = cls.create(1, validation)
+    task2 = cls.create(2, validation)
     task1.submit(workspace=workspace)
     with pytest.raises((AttributeError)):
         task2.submit(workspace=workspace)
 
 
-
+@pytest.mark.parametrize("cls", [Learner, LearnerDict, LearnerList])
+def test_generators_reuse_on_set(cls):
     workspace = Workspace(
         Settings(),
         WorkspaceSettings("test_generators_simple", path=Path("/tmp")),
         run_mode=RunMode.DRY_RUN,
     )
     validation = Validation.C()
-
+    cls.create(1, validation).submit(workspace=workspace)
     with pytest.raises((AttributeError)):
         # We should not be able to *create* a second task with the same validation,
         # even without submitting it
-
+        cls.create(2, validation)
 
     # This should run OK
     ModuleLoader.C(validation=validation)