experimaestro 2.0.0a3__py3-none-any.whl → 2.0.0a4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of experimaestro might be problematic; see the release notes of this version for more details.
- experimaestro/connectors/__init__.py +2 -2
- experimaestro/core/objects/config.py +28 -9
- experimaestro/scheduler/__init__.py +18 -1
- experimaestro/scheduler/base.py +87 -906
- experimaestro/scheduler/experiment.py +387 -0
- experimaestro/scheduler/jobs.py +475 -0
- experimaestro/scheduler/signal_handler.py +32 -0
- experimaestro/scheduler/state.py +1 -1
- experimaestro/server/__init__.py +36 -5
- experimaestro/tests/test_dependencies.py +1 -1
- experimaestro/tests/test_generators.py +34 -9
- experimaestro/typingutils.py +11 -2
- {experimaestro-2.0.0a3.dist-info → experimaestro-2.0.0a4.dist-info}/METADATA +3 -2
- {experimaestro-2.0.0a3.dist-info → experimaestro-2.0.0a4.dist-info}/RECORD +17 -14
- {experimaestro-2.0.0a3.dist-info → experimaestro-2.0.0a4.dist-info}/WHEEL +1 -1
- {experimaestro-2.0.0a3.dist-info → experimaestro-2.0.0a4.dist-info}/entry_points.txt +0 -0
- {experimaestro-2.0.0a3.dist-info → experimaestro-2.0.0a4.dist-info/licenses}/LICENSE +0 -0
|
@@ -0,0 +1,387 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import logging
|
|
3
|
+
import os
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from shutil import rmtree
|
|
6
|
+
from typing import Any, Dict, Optional, TypeVar, Union
|
|
7
|
+
|
|
8
|
+
from experimaestro.core.objects import WatchedOutput
|
|
9
|
+
from experimaestro.exceptions import HandledException
|
|
10
|
+
|
|
11
|
+
from experimaestro.scheduler.signal_handler import SIGNAL_HANDLER
|
|
12
|
+
from experimaestro.scheduler.jobs import Job, JobFailureStatus
|
|
13
|
+
from experimaestro.scheduler.services import Service
|
|
14
|
+
from experimaestro.scheduler.workspace import RunMode, Workspace
|
|
15
|
+
from experimaestro.settings import WorkspaceSettings, get_settings
|
|
16
|
+
from experimaestro.utils import logger
|
|
17
|
+
|
|
18
|
+
ServiceClass = TypeVar("ServiceClass", bound=Service)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class FailedExperiment(HandledException):
    """Raised when an experiment has failed jobs.

    This exception is raised while waiting for job completion once at least
    one (non-dependency) job failure has been recorded.
    """
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class experiment:
    """Main experiment object.

    It is a context object, i.e. an experiment is run with

    ```py
    with experiment(...) as xp:
        ...
    ```
    """

    #: Current experiment (set while inside the context manager)
    CURRENT: Optional["experiment"] = None

    @staticmethod
    def current() -> "experiment":
        """Return the current experiment, checking first that one is set.

        If there is no current experiment, raises an AssertionError.
        """
        active = experiment.CURRENT
        assert active is not None, "No current experiment defined"
        return active
|
|
49
|
+
|
|
50
|
+
    def __init__(
        self,
        env: Union[Path, str, WorkspaceSettings],
        name: str,
        *,
        host: Optional[str] = None,
        port: Optional[int] = None,
        token: Optional[str] = None,
        run_mode: Optional[RunMode] = None,
        launcher=None,
    ):
        """Set up the experiment workspace, scheduler and (optional) web server.

        :param env: an environment -- or a working directory for a local
            environment

        :param name: the identifier of the experiment

        :param launcher: The launcher (if not provided, inferred from path)

        :param host: The host for the web server (overrides the environment if
            set)
        :param port: the port for the web server (overrides the environment if
            set). Use negative number to avoid running a web server (default when dry run).

        :param token: the authentication token for the web server (overrides
            the settings if set)

        :param run_mode: The run mode for the experiment (normal, generate run
            files, dry run)
        """

        # Local imports avoid a circular dependency with the server/scheduler
        from experimaestro.server import Server
        from experimaestro.scheduler import Listener, Scheduler

        settings = get_settings()
        if not isinstance(env, WorkspaceSettings):
            # A bare path/str means: local workspace rooted at that path
            env = WorkspaceSettings(id=None, path=Path(env))

        # Creates the workspace
        run_mode = run_mode or RunMode.NORMAL
        self.workspace = Workspace(settings, env, launcher=launcher, run_mode=run_mode)

        # Create the experiment working directory (one per experiment name)
        self.workdir = self.workspace.experimentspath / name
        self.workdir.mkdir(parents=True, exist_ok=True)
        self.xplockpath = self.workdir / "lock"
        self.xplock = None
        self.old_experiment = None
        self.services: Dict[str, Service] = {}
        self._job_listener: Optional[Listener] = None

        # Command-line/keyword arguments override the configuration settings

        if host is not None:
            settings.server.host = host

        if port is not None:
            settings.server.port = port

        if token is not None:
            settings.server.token = token

        # Create the scheduler
        self.scheduler = Scheduler.create(self, name)
        # Only start a web server for normal runs with a non-negative port
        self.server = (
            Server(self.scheduler, settings.server)
            if (settings.server.port is not None and settings.server.port >= 0)
            and self.workspace.run_mode == RunMode.NORMAL
            else None
        )

        # Opt-in low-level debugging: dump tracebacks on crashes/deadlocks
        if os.environ.get("XPM_ENABLEFAULTHANDLER", "0") == "1":
            import faulthandler

            logger.info("Enabling fault handler")
            faulthandler.enable(all_threads=True)
|
|
123
|
+
|
|
124
|
+
def submit(self, job: Job):
|
|
125
|
+
return self.scheduler.submit(job)
|
|
126
|
+
|
|
127
|
+
def prepare(self, job: Job):
|
|
128
|
+
"""Generate the file"""
|
|
129
|
+
return self.scheduler.prepare(job)
|
|
130
|
+
|
|
131
|
+
@property
|
|
132
|
+
def run_mode(self):
|
|
133
|
+
return self.workspace.run_mode
|
|
134
|
+
|
|
135
|
+
@property
|
|
136
|
+
def loop(self):
|
|
137
|
+
assert self.scheduler is not None, "No scheduler defined"
|
|
138
|
+
return self.scheduler.loop
|
|
139
|
+
|
|
140
|
+
@property
|
|
141
|
+
def resultspath(self):
|
|
142
|
+
"""Return the directory in which results can be stored for this experiment"""
|
|
143
|
+
return self.workdir / "results"
|
|
144
|
+
|
|
145
|
+
@property
|
|
146
|
+
def jobspath(self):
|
|
147
|
+
"""Return the directory in which results can be stored for this experiment"""
|
|
148
|
+
return self.workdir / "jobs"
|
|
149
|
+
|
|
150
|
+
@property
|
|
151
|
+
def alt_jobspaths(self):
|
|
152
|
+
"""Return potential other directories"""
|
|
153
|
+
for alt_workdir in self.workspace.alt_workdirs:
|
|
154
|
+
yield alt_workdir / "jobs"
|
|
155
|
+
|
|
156
|
+
@property
|
|
157
|
+
def jobsbakpath(self):
|
|
158
|
+
"""Return the directory in which results can be stored for this experiment"""
|
|
159
|
+
return self.workdir / "jobs.bak"
|
|
160
|
+
|
|
161
|
+
    def stop(self):
        """Stop the experiment as soon as possible.

        Sets the exit flag and wakes up any coroutine waiting on the
        scheduler's exit condition (see :meth:`wait`). Safe to call from a
        thread other than the scheduler's event-loop thread.
        """

        async def doStop():
            assert self.scheduler is not None
            async with self.scheduler.exitCondition:
                # wait() checks this flag and stops waiting when it is set
                self.exitMode = True
                logging.debug("Setting exit mode to true")
                self.scheduler.exitCondition.notify_all()

        assert self.scheduler is not None and self.scheduler.loop is not None
        # Schedule on the scheduler loop: this method may run on another thread
        asyncio.run_coroutine_threadsafe(doStop(), self.scheduler.loop)
|
|
173
|
+
|
|
174
|
+
    def wait(self):
        """Wait until the running processes have finished.

        Blocks until either (a) the exit flag has been set via :meth:`stop`,
        or (b) there are no unfinished jobs and the task output queue is
        empty.

        :raises FailedExperiment: if at least one job failed (jobs that
            failed only because of a failed dependency are not counted)
        """

        async def awaitcompletion():
            assert self.scheduler is not None, "No scheduler defined"
            logger.debug("Waiting to exit scheduler...")
            async with self.scheduler.exitCondition:
                while True:
                    # stop() requested an early exit
                    if self.exitMode:
                        break

                    # If we have still unfinished jobs or possible new tasks, wait
                    logger.debug(
                        "Checking exit condition: unfinished jobs=%d, task output queue size=%d",
                        self.unfinishedJobs,
                        self.taskOutputQueueSize,
                    )
                    if self.unfinishedJobs == 0 and self.taskOutputQueueSize == 0:
                        break

                    # Wait for more news...
                    await self.scheduler.exitCondition.wait()

            if self.failedJobs:
                # Report failures, ignoring jobs that only failed because a
                # dependency failed
                count = 0
                for job in self.failedJobs.values():
                    if job.failure_status != JobFailureStatus.DEPENDENCY:
                        count += 1
                        logger.error(
                            "Job %s failed, check the log file %s",
                            job.relpath,
                            job.stderr,
                        )
                raise FailedExperiment(f"{count} failed jobs")

        # Run on the scheduler loop and block the calling thread on the result
        future = asyncio.run_coroutine_threadsafe(awaitcompletion(), self.loop)
        return future.result()
|
|
212
|
+
|
|
213
|
+
def setenv(self, name, value, override=True):
|
|
214
|
+
"""Shortcut to set the environment value"""
|
|
215
|
+
if override or name not in self.workspace.env:
|
|
216
|
+
logging.info("Setting environment: %s=%s", name, value)
|
|
217
|
+
self.workspace.env[name] = value
|
|
218
|
+
|
|
219
|
+
def token(self, name: str, count: int):
|
|
220
|
+
"""Returns a token for this experiment
|
|
221
|
+
|
|
222
|
+
The token is the default token of the workspace connector"""
|
|
223
|
+
return self.workspace.connector.createtoken(name, count)
|
|
224
|
+
|
|
225
|
+
    def __enter__(self):
        """Enter the experiment context.

        Acquires the experiment lock (unless in dry-run mode), archives old
        job links, starts the web server, scheduler and task-output worker,
        registers signal handling, and installs this experiment as the
        current one.
        """
        from .dynamic_outputs import TaskOutputsWorker

        if self.workspace.run_mode != RunMode.DRY_RUN:
            # Prevent two concurrent runs of the same experiment
            logger.info("Locking experiment %s", self.xplockpath)
            self.xplock = self.workspace.connector.lock(self.xplockpath, 0).__enter__()
            logger.info("Experiment locked")

        # Move old jobs into "jobs.bak"
        if self.workspace.run_mode == RunMode.NORMAL:
            self.jobsbakpath.mkdir(exist_ok=True)
            for p in self.jobspath.glob("*/*"):
                if p.is_symlink():
                    target = self.jobsbakpath / p.relative_to(self.jobspath)
                    if target.is_symlink():
                        # Remove if duplicate
                        p.unlink()
                    else:
                        # Rename otherwise
                        target.parent.mkdir(parents=True, exist_ok=True)
                        p.rename(target)

        if self.server:
            self.server.start()

        self.workspace.__enter__()
        # Mark the directory as an experimaestro workspace
        (self.workspace.path / ".__experimaestro__").touch()

        # Number of unfinished jobs
        self.unfinishedJobs = 0
        # Number of task outputs still to be processed
        self.taskOutputQueueSize = 0

        # List of failed jobs
        self.failedJobs: Dict[str, Job] = {}

        # Exit mode when catching signals
        self.exitMode = False

        self.scheduler.start_scheduler()
        self.taskOutputsWorker = TaskOutputsWorker(self)
        self.taskOutputsWorker.start()

        # Let signal handling (e.g. Ctrl-C) stop this experiment cleanly
        SIGNAL_HANDLER.add(self)

        # Install this experiment as the current one (restored on exit)
        self.old_experiment = experiment.CURRENT
        experiment.CURRENT = self
        return self
|
|
272
|
+
|
|
273
|
+
    def __exit__(self, exc_type, exc_value, traceback):
        """Leave the experiment context.

        Waits for job completion (unless an exception was raised), then tears
        everything down in order: signal handler, services, scheduler loop,
        task-output worker, workspace, experiment lock, current-experiment
        marker, web server; finally persists the experiment state for normal
        runs.
        """
        logger.debug("Exiting scheduler context")
        # If no exception and normal run mode, remove old "jobs"
        if self.workspace.run_mode == RunMode.NORMAL:
            if exc_type is None and self.jobsbakpath.is_dir():
                rmtree(self.jobsbakpath)

        # Close the different locks
        try:
            if exc_type:
                # Do not block on running jobs when unwinding from an error
                logger.error(
                    "Not waiting since an exception was thrown"
                    " (some jobs may be running)"
                )
            else:
                self.wait()
        finally:
            SIGNAL_HANDLER.remove(self)

            # Stop services
            for service in self.services.values():
                logger.info("Closing service %s", service.description())
                service.stop()

            if self.scheduler is not None:
                logger.info("Stopping scheduler event loop")
                self.scheduler.loop.stop()

            # NOTE(review): assumes __enter__ completed and set
            # taskOutputsWorker; a failure before that point would raise
            # AttributeError here — confirm intended
            if self.taskOutputsWorker is not None:
                logger.info("Stopping tasks outputs worker")
                # None is the worker's sentinel for "stop processing"
                self.taskOutputsWorker.queue.put(None)

            self.workspace.__exit__(exc_type, exc_value, traceback)
            if self.xplock:
                self.xplock.__exit__(exc_type, exc_value, traceback)

            # Put back old experiment as current one
            experiment.CURRENT = self.old_experiment
            if self.server:
                logger.info("Stopping web server")
                self.server.stop()

            if self.workspace.run_mode == RunMode.NORMAL:
                # Write the state
                logging.info("Saving the experiment state")
                from experimaestro.scheduler.state import ExperimentState

                ExperimentState.save(
                    self.workdir / "state.json", self.scheduler.jobs.values()
                )
|
|
325
|
+
|
|
326
|
+
async def update_task_output_count(self, delta: int):
|
|
327
|
+
"""Change in the number of task outputs to process"""
|
|
328
|
+
async with self.scheduler.exitCondition:
|
|
329
|
+
self.taskOutputQueueSize += delta
|
|
330
|
+
logging.debug(
|
|
331
|
+
"Updating queue size with %d => %d", delta, self.taskOutputQueueSize
|
|
332
|
+
)
|
|
333
|
+
if self.taskOutputQueueSize == 0:
|
|
334
|
+
self.scheduler.exitCondition.notify_all()
|
|
335
|
+
|
|
336
|
+
def watch_output(self, watched: "WatchedOutput"):
|
|
337
|
+
"""Watch an output
|
|
338
|
+
|
|
339
|
+
:param watched: The watched output specification
|
|
340
|
+
"""
|
|
341
|
+
|
|
342
|
+
self.taskOutputsWorker.watch_output(watched)
|
|
343
|
+
|
|
344
|
+
def add_service(self, service: ServiceClass) -> ServiceClass:
|
|
345
|
+
"""Adds a service (e.g. tensorboard viewer) to the experiment
|
|
346
|
+
|
|
347
|
+
:param service: A service instance
|
|
348
|
+
:return: The same service instance
|
|
349
|
+
"""
|
|
350
|
+
self.services[service.id] = service
|
|
351
|
+
for listener in self.scheduler.listeners:
|
|
352
|
+
listener.service_add(service)
|
|
353
|
+
return service
|
|
354
|
+
|
|
355
|
+
def save(self, obj: Any, name: str = "default"):
|
|
356
|
+
"""Serializes configurations.
|
|
357
|
+
|
|
358
|
+
Saves configuration objects within the experimental directory
|
|
359
|
+
|
|
360
|
+
:param obj: The object to save
|
|
361
|
+
:param name: The name of the saving directory (default to `default`)
|
|
362
|
+
"""
|
|
363
|
+
|
|
364
|
+
if self.workspace.run_mode == RunMode.NORMAL:
|
|
365
|
+
from experimaestro import save
|
|
366
|
+
|
|
367
|
+
save_dir = self.workdir / "data" / name
|
|
368
|
+
save_dir.mkdir(exist_ok=True, parents=True)
|
|
369
|
+
|
|
370
|
+
save(obj, save_dir)
|
|
371
|
+
|
|
372
|
+
def load(self, reference: str, name: str = "default"):
|
|
373
|
+
"""Serializes configurations.
|
|
374
|
+
|
|
375
|
+
Loads configuration objects from an experimental directory
|
|
376
|
+
|
|
377
|
+
:param reference: The name of the experiment
|
|
378
|
+
:param name: The name of the saving directory (default to `default`)
|
|
379
|
+
"""
|
|
380
|
+
from experimaestro import load
|
|
381
|
+
|
|
382
|
+
path = self.workspace.experimentspath / reference / "data" / name
|
|
383
|
+
return load(path)
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
# Re-export at the module level: alias of `experiment.current`, so callers
# can obtain the active experiment without importing the class
current = experiment.current
|