experimaestro 1.5.1__py3-none-any.whl → 2.0.0a8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of experimaestro has been flagged as potentially problematic; see the advisory details below for more information.
- experimaestro/__init__.py +14 -4
- experimaestro/__main__.py +3 -423
- experimaestro/annotations.py +14 -4
- experimaestro/cli/__init__.py +311 -0
- experimaestro/{filter.py → cli/filter.py} +23 -9
- experimaestro/cli/jobs.py +268 -0
- experimaestro/cli/progress.py +269 -0
- experimaestro/click.py +0 -35
- experimaestro/commandline.py +3 -7
- experimaestro/connectors/__init__.py +29 -14
- experimaestro/connectors/local.py +19 -10
- experimaestro/connectors/ssh.py +27 -8
- experimaestro/core/arguments.py +45 -3
- experimaestro/core/callbacks.py +52 -0
- experimaestro/core/context.py +8 -9
- experimaestro/core/identifier.py +310 -0
- experimaestro/core/objects/__init__.py +44 -0
- experimaestro/core/{objects.py → objects/config.py} +399 -772
- experimaestro/core/objects/config_utils.py +58 -0
- experimaestro/core/objects/config_walk.py +151 -0
- experimaestro/core/objects.pyi +15 -45
- experimaestro/core/serialization.py +63 -9
- experimaestro/core/serializers.py +1 -8
- experimaestro/core/types.py +104 -66
- experimaestro/experiments/cli.py +154 -72
- experimaestro/experiments/configuration.py +10 -1
- experimaestro/generators.py +6 -1
- experimaestro/ipc.py +4 -1
- experimaestro/launcherfinder/__init__.py +1 -1
- experimaestro/launcherfinder/base.py +2 -18
- experimaestro/launcherfinder/parser.py +8 -3
- experimaestro/launcherfinder/registry.py +52 -140
- experimaestro/launcherfinder/specs.py +49 -10
- experimaestro/launchers/direct.py +0 -47
- experimaestro/launchers/slurm/base.py +54 -14
- experimaestro/mkdocs/__init__.py +1 -1
- experimaestro/mkdocs/base.py +6 -8
- experimaestro/notifications.py +38 -12
- experimaestro/progress.py +406 -0
- experimaestro/run.py +24 -3
- experimaestro/scheduler/__init__.py +18 -1
- experimaestro/scheduler/base.py +108 -808
- experimaestro/scheduler/dynamic_outputs.py +184 -0
- experimaestro/scheduler/experiment.py +387 -0
- experimaestro/scheduler/jobs.py +475 -0
- experimaestro/scheduler/signal_handler.py +32 -0
- experimaestro/scheduler/state.py +75 -0
- experimaestro/scheduler/workspace.py +27 -8
- experimaestro/scriptbuilder.py +18 -3
- experimaestro/server/__init__.py +36 -5
- experimaestro/server/data/1815e00441357e01619e.ttf +0 -0
- experimaestro/server/data/2463b90d9a316e4e5294.woff2 +0 -0
- experimaestro/server/data/2582b0e4bcf85eceead0.ttf +0 -0
- experimaestro/server/data/89999bdf5d835c012025.woff2 +0 -0
- experimaestro/server/data/914997e1bdfc990d0897.ttf +0 -0
- experimaestro/server/data/c210719e60948b211a12.woff2 +0 -0
- experimaestro/server/data/index.css +5187 -5068
- experimaestro/server/data/index.css.map +1 -1
- experimaestro/server/data/index.js +68887 -68064
- experimaestro/server/data/index.js.map +1 -1
- experimaestro/settings.py +45 -5
- experimaestro/sphinx/__init__.py +7 -17
- experimaestro/taskglobals.py +7 -2
- experimaestro/tests/core/__init__.py +0 -0
- experimaestro/tests/core/test_generics.py +206 -0
- experimaestro/tests/definitions_types.py +5 -3
- experimaestro/tests/launchers/bin/sbatch +34 -7
- experimaestro/tests/launchers/bin/srun +5 -0
- experimaestro/tests/launchers/common.py +17 -5
- experimaestro/tests/launchers/config_slurm/launchers.py +25 -0
- experimaestro/tests/restart.py +10 -5
- experimaestro/tests/tasks/all.py +23 -10
- experimaestro/tests/tasks/foreign.py +2 -4
- experimaestro/tests/test_checkers.py +2 -2
- experimaestro/tests/test_dependencies.py +11 -17
- experimaestro/tests/test_experiment.py +73 -0
- experimaestro/tests/test_file_progress.py +425 -0
- experimaestro/tests/test_file_progress_integration.py +477 -0
- experimaestro/tests/test_findlauncher.py +12 -5
- experimaestro/tests/test_forward.py +5 -5
- experimaestro/tests/test_generators.py +93 -0
- experimaestro/tests/test_identifier.py +182 -158
- experimaestro/tests/test_instance.py +19 -27
- experimaestro/tests/test_objects.py +13 -20
- experimaestro/tests/test_outputs.py +6 -6
- experimaestro/tests/test_param.py +68 -30
- experimaestro/tests/test_progress.py +4 -4
- experimaestro/tests/test_serializers.py +24 -64
- experimaestro/tests/test_ssh.py +7 -0
- experimaestro/tests/test_tags.py +50 -21
- experimaestro/tests/test_tasks.py +42 -51
- experimaestro/tests/test_tokens.py +11 -8
- experimaestro/tests/test_types.py +24 -21
- experimaestro/tests/test_validation.py +67 -110
- experimaestro/tests/token_reschedule.py +1 -1
- experimaestro/tokens.py +24 -13
- experimaestro/tools/diff.py +8 -1
- experimaestro/typingutils.py +20 -11
- experimaestro/utils/asyncio.py +6 -2
- experimaestro/utils/multiprocessing.py +44 -0
- experimaestro/utils/resources.py +11 -3
- {experimaestro-1.5.1.dist-info → experimaestro-2.0.0a8.dist-info}/METADATA +28 -36
- experimaestro-2.0.0a8.dist-info/RECORD +166 -0
- {experimaestro-1.5.1.dist-info → experimaestro-2.0.0a8.dist-info}/WHEEL +1 -1
- {experimaestro-1.5.1.dist-info → experimaestro-2.0.0a8.dist-info}/entry_points.txt +0 -4
- experimaestro/launchers/slurm/cli.py +0 -29
- experimaestro/launchers/slurm/configuration.py +0 -597
- experimaestro/scheduler/environment.py +0 -94
- experimaestro/server/data/016b4a6cdced82ab3aa1.ttf +0 -0
- experimaestro/server/data/50701fbb8177c2dde530.ttf +0 -0
- experimaestro/server/data/878f31251d960bd6266f.woff2 +0 -0
- experimaestro/server/data/b041b1fa4fe241b23445.woff2 +0 -0
- experimaestro/server/data/b6879d41b0852f01ed5b.woff2 +0 -0
- experimaestro/server/data/d75e3fd1eb12e9bd6655.ttf +0 -0
- experimaestro/tests/launchers/config_slurm/launchers.yaml +0 -134
- experimaestro/utils/yaml.py +0 -202
- experimaestro-1.5.1.dist-info/RECORD +0 -148
- {experimaestro-1.5.1.dist-info → experimaestro-2.0.0a8.dist-info/licenses}/LICENSE +0 -0
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
"""Handles dynamic task outputs"""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import json
|
|
5
|
+
import logging
|
|
6
|
+
import queue
|
|
7
|
+
import threading
|
|
8
|
+
from collections import defaultdict
|
|
9
|
+
from functools import cached_property
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Callable, TYPE_CHECKING
|
|
12
|
+
|
|
13
|
+
from watchdog.events import FileSystemEventHandler
|
|
14
|
+
|
|
15
|
+
from experimaestro.ipc import ipcom
|
|
16
|
+
from experimaestro.utils import logger
|
|
17
|
+
|
|
18
|
+
from .base import Job, experiment
|
|
19
|
+
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
from experimaestro.core.objects import WatchedOutput
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class TaskOutputCallbackHandler:
    """Wraps the converter callable attached to a watched task output.

    NOTE(review): the original ``__init__`` silently discarded its argument;
    the converter is now kept so that the handler can actually use it when
    processing events.
    """

    def __init__(self, converter: Callable):
        """:param converter: callable used to transform raw output events"""
        #: The converter callable (was previously dropped)
        self.converter = converter
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class TaskOutputs(FileSystemEventHandler):
    """Represent and monitors dynamic outputs generated by one task"""

    #: Global dictionary for handles (one monitor per task-outputs file)
    HANDLERS: dict[Path, "TaskOutputs"] = {}

    #: Global lock to access current HANDLERS
    LOCK = threading.Lock()

    @staticmethod
    def create(job: Job) -> "TaskOutputs":
        """Return the unique monitor for this job's output file, creating it
        if needed.

        BUGFIX: the registry lives in ``TaskOutputs.HANDLERS`` — the original
        code subscripted/``get``-ed the class object itself, which raises a
        ``TypeError``/``AttributeError``.
        """
        with TaskOutputs.LOCK:
            if instance := TaskOutputs.HANDLERS.get(job.task_outputs_path, None):
                return instance

            instance = TaskOutputs(job.task_outputs_path)
            TaskOutputs.HANDLERS[job.task_outputs_path] = instance
            return instance

    def __init__(self, path: Path):
        """Monitors an event path

        :param path: the task-outputs file to monitor
        """
        logger.debug("Watching dynamic task outputs in %s", path)
        self.path = path
        # File-system watch handle. Renamed from ``handle`` to avoid
        # shadowing the handle() method (the original on_modified/on_created
        # called self.handle, which was the attribute, not the method).
        self._watch_handle = None
        # Lazily-opened file handle on self.path (see the ``fh`` property).
        # BUGFIX: the original never initialized _fh, so the first access to
        # ``fh`` raised AttributeError.
        self._fh = None
        # Reference count of active __enter__ contexts
        self.count = 0
        self.lock = threading.Lock()
        # key -> method -> set of callbacks
        self.listeners: dict[str, dict[Callable, set[Callable]]] = defaultdict(
            lambda: defaultdict(set)
        )

        #: The events registered so far
        self.events = []

    def __enter__(self):
        """Starts monitoring task outputs"""
        # BUGFIX: was ``self.job.task_outputs_path`` but there is no
        # ``self.job`` attribute — the path was stored in __init__.
        self.path.parent.mkdir(parents=True, exist_ok=True)
        with self.lock:
            if self._watch_handle is None:
                assert self.count == 0
                self._watch_handle = ipcom().fswatch(self, self.path.parent, False)
            self.count += 1
        return self

    def __exit__(self, *args):
        """Stops monitoring task outputs"""
        with self.lock:
            self.count -= 1
            if self.count == 0:
                ipcom().fsunwatch(self._watch_handle)
                if self._fh is not None:
                    self._fh.close()

                self._watch_handle = None
                self._fh = None

    def watch_output(self, watched: "WatchedOutput"):
        """Add a new listener"""
        key = f"{watched.config.__identifier__}/{watched.method_name}"
        with self.lock:
            # Process events so far
            listener = self.listeners[key].get(watched.method, None)
            if listener is None:
                # NOTE(review): this handler is created but never stored or
                # used — presumably future work; kept as in the original.
                listener = TaskOutputCallbackHandler(watched.method)

            # Register
            self.listeners[key][watched.method].add(watched.callback)

    #
    # --- Events
    #

    @property
    def fh(self):
        """Lazily-opened read handle on the task-outputs file.

        A plain property (instead of the original cached_property guarded by
        ``self._fh``) so that __exit__ can reset ``_fh`` and a later access
        re-opens the file.
        """
        if self._fh is None:
            self._fh = self.path.open("rt")
        return self._fh

    def on_modified(self, event):
        # watchdog callback: delegate to handle()
        self.handle(Path(event.src_path))

    def on_created(self, event):
        # watchdog callback: delegate to handle()
        self.handle(Path(event.src_path))

    def handle(self, path: Path):
        """Read and record any newly appended JSON events for our file."""
        if path != self.path:
            return

        with self.lock:
            logger.debug("[TASK OUTPUT] Handling task output for %s", self.path)

            while json_line := self.fh.readline():
                # Read the event (one JSON document per line)
                event = json.loads(json_line)
                logger.debug("Event: %s", event)

                # FIXME: move event processing elsewhere (see original
                # commented-out config_method call)

                self.events.append(event)
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
class TaskOutputsWorker(threading.Thread):
    """This worker process dynamic output queue for one experiment"""

    def __init__(self, xp: experiment):
        # Daemon thread: does not prevent interpreter exit
        super().__init__(name="task outputs worker", daemon=True)
        # Work queue of (watcher, event) pairs; a None sentinel stops run()
        self.queue = queue.Queue()
        # The owning experiment
        self.xp = xp

    def watch_output(self, watched: "WatchedOutput"):
        """Watch an output

        :param watched: The watched output specification
        """
        logger.debug("Registering task output listener %s", watched)

        # path = watched.job.tasks_output_path
        # Delegate to the per-job TaskOutputs monitor
        TaskOutputs.create(watched.job).watch_output(watched)

    def add(self, watcher, event):
        """Enqueue an event; bumps the experiment's pending-output counter
        first (blocking until the scheduler loop has acknowledged it)."""
        asyncio.run_coroutine_threadsafe(
            self.xp.update_task_output_count(1),
            self.xp.scheduler.loop,
        ).result()
        self.queue.put((watcher, event))

    def run(self):
        """Consume the queue until a None sentinel is received."""
        logging.debug("Starting output listener queue")
        while True:
            # Get the next element in the queue
            element = self.queue.get()
            if element is None:
                # end of processing
                break

            # Call all the listeners
            # NOTE(review): iterating ``watcher.listeners`` yields its keys;
            # whether those are the callables to invoke depends on the
            # watcher's listener structure — verify against TaskOutputs.
            logging.debug("Got one event: %s", element)
            watcher, event = element
            for listener in watcher.listeners:
                try:
                    logger.debug("Calling listener [%s] with %s", listener, event)
                    listener(event)
                    logger.debug(
                        "[done] Calling listener [%s] with %s", listener, event
                    )
                except Exception:
                    # A failing listener must not kill the worker thread
                    logging.exception("Exception while calling the listener")
            self.queue.task_done()

            # Decrement the pending-output counter on the scheduler loop
            asyncio.run_coroutine_threadsafe(
                self.xp.update_task_output_count(-1), self.xp.scheduler.loop
            ).result()
|
|
@@ -0,0 +1,387 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import logging
|
|
3
|
+
import os
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from shutil import rmtree
|
|
6
|
+
from typing import Any, Dict, Optional, TypeVar, Union
|
|
7
|
+
|
|
8
|
+
from experimaestro.core.objects import WatchedOutput
|
|
9
|
+
from experimaestro.exceptions import HandledException
|
|
10
|
+
|
|
11
|
+
from experimaestro.scheduler.signal_handler import SIGNAL_HANDLER
|
|
12
|
+
from experimaestro.scheduler.jobs import Job, JobFailureStatus
|
|
13
|
+
from experimaestro.scheduler.services import Service
|
|
14
|
+
from experimaestro.scheduler.workspace import RunMode, Workspace
|
|
15
|
+
from experimaestro.settings import WorkspaceSettings, get_settings
|
|
16
|
+
from experimaestro.utils import logger
|
|
17
|
+
|
|
18
|
+
ServiceClass = TypeVar("ServiceClass", bound=Service)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class FailedExperiment(HandledException):
    """Signals that the experiment ended with at least one failed job."""
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class experiment:
    """Main experiment object

    It is a context object, i.e. an experiment is run with

    ```py
    with experiment(...) as xp:
        ...
    ```
    """

    #: Current experiment (set while inside the ``with experiment(...)`` block)
    CURRENT: Optional["experiment"] = None

    @staticmethod
    def current() -> "experiment":
        """Returns the current experiment, but checking first if set

        If there is no current experiment, raises an AssertionError
        """
        assert experiment.CURRENT is not None, "No current experiment defined"
        return experiment.CURRENT

    def __init__(
        self,
        env: Union[Path, str, WorkspaceSettings],
        name: str,
        *,
        host: Optional[str] = None,
        port: Optional[int] = None,
        token: Optional[str] = None,
        run_mode: Optional[RunMode] = None,
        launcher=None,
    ):
        """
        :param env: an environment -- or a working directory for a local
            environment

        :param name: the identifier of the experiment

        :param launcher: The launcher (if not provided, inferred from path)

        :param host: The host for the web server (overrides the environment if
            set)
        :param port: the port for the web server (overrides the environment if
            set). Use negative number to avoid running a web server (default when dry run).

        :param run_mode: The run mode for the experiment (normal, generate run
            files, dry run)
        """

        # Imported locally, presumably to avoid circular imports -- TODO confirm
        from experimaestro.server import Server
        from experimaestro.scheduler import Listener, Scheduler

        settings = get_settings()
        # A bare path/str is wrapped into an anonymous workspace settings object
        if not isinstance(env, WorkspaceSettings):
            env = WorkspaceSettings(id=None, path=Path(env))

        # Creates the workspace
        run_mode = run_mode or RunMode.NORMAL
        self.workspace = Workspace(settings, env, launcher=launcher, run_mode=run_mode)

        # Mark the directory as an experimaestro folder
        self.workdir = self.workspace.experimentspath / name
        self.workdir.mkdir(parents=True, exist_ok=True)
        self.xplockpath = self.workdir / "lock"
        self.xplock = None
        self.old_experiment = None
        self.services: Dict[str, Service] = {}
        self._job_listener: Optional[Listener] = None

        # Get configuration settings
        # (explicit host/port/token arguments override the stored settings)
        if host is not None:
            settings.server.host = host

        if port is not None:
            settings.server.port = port

        if token is not None:
            settings.server.token = token

        # Create the scheduler
        self.scheduler = Scheduler.create(self, name)
        # The web server is only created for a normal run with a non-negative port
        self.server = (
            Server(self.scheduler, settings.server)
            if (settings.server.port is not None and settings.server.port >= 0)
            and self.workspace.run_mode == RunMode.NORMAL
            else None
        )

        # Opt-in low-level crash diagnostics via environment variable
        if os.environ.get("XPM_ENABLEFAULTHANDLER", "0") == "1":
            import faulthandler

            logger.info("Enabling fault handler")
            faulthandler.enable(all_threads=True)

    def submit(self, job: Job):
        """Submit a job to the scheduler"""
        return self.scheduler.submit(job)

    def prepare(self, job: Job):
        """Generate the file"""
        return self.scheduler.prepare(job)

    @property
    def run_mode(self):
        # Delegated to the workspace
        return self.workspace.run_mode

    @property
    def loop(self):
        # Event loop of the scheduler
        assert self.scheduler is not None, "No scheduler defined"
        return self.scheduler.loop

    @property
    def resultspath(self):
        """Return the directory in which results can be stored for this experiment"""
        return self.workdir / "results"

    @property
    def jobspath(self):
        """Return the directory holding the job links for this experiment"""
        return self.workdir / "jobs"

    @property
    def alt_jobspaths(self):
        """Return potential other directories"""
        for alt_workdir in self.workspace.alt_workdirs:
            yield alt_workdir / "jobs"

    @property
    def jobsbakpath(self):
        """Return the backup directory for job links of a previous run"""
        return self.workdir / "jobs.bak"

    def stop(self):
        """Stop the experiment as soon as possible"""

        async def doStop():
            assert self.scheduler is not None
            async with self.scheduler.exitCondition:
                # Flag checked by wait(); waiters are then woken up
                self.exitMode = True
                logging.debug("Setting exit mode to true")
                self.scheduler.exitCondition.notify_all()

        # Must run on the scheduler's event loop (called from another thread)
        assert self.scheduler is not None and self.scheduler.loop is not None
        asyncio.run_coroutine_threadsafe(doStop(), self.scheduler.loop)

    def wait(self):
        """Wait until the running processes have finished"""

        async def awaitcompletion():
            assert self.scheduler is not None, "No scheduler defined"
            logger.debug("Waiting to exit scheduler...")
            async with self.scheduler.exitCondition:
                while True:
                    # stop() forces an early exit
                    if self.exitMode:
                        break

                    # If we have still unfinished jobs or possible new tasks, wait
                    logger.debug(
                        "Checking exit condition: unfinished jobs=%d, task output queue size=%d",
                        self.unfinishedJobs,
                        self.taskOutputQueueSize,
                    )
                    if self.unfinishedJobs == 0 and self.taskOutputQueueSize == 0:
                        break

                    # Wait for more news...
                    await self.scheduler.exitCondition.wait()

            if self.failedJobs:
                # Show some more information; dependency-induced failures are
                # not counted as first-class failures
                count = 0
                for job in self.failedJobs.values():
                    if job.failure_status != JobFailureStatus.DEPENDENCY:
                        count += 1
                        logger.error(
                            "Job %s failed, check the log file %s",
                            job.relpath,
                            job.stderr,
                        )
                raise FailedExperiment(f"{count} failed jobs")

        # Run the waiter on the scheduler loop and propagate any exception
        future = asyncio.run_coroutine_threadsafe(awaitcompletion(), self.loop)
        return future.result()

    def setenv(self, name, value, override=True):
        """Shortcut to set the environment value"""
        if override or name not in self.workspace.env:
            logging.info("Setting environment: %s=%s", name, value)
            self.workspace.env[name] = value

    def token(self, name: str, count: int):
        """Returns a token for this experiment

        The token is the default token of the workspace connector"""
        return self.workspace.connector.createtoken(name, count)

    def __enter__(self):
        from .dynamic_outputs import TaskOutputsWorker

        # Take the experiment lock (skipped for dry runs)
        if self.workspace.run_mode != RunMode.DRY_RUN:
            logger.info("Locking experiment %s", self.xplockpath)
            self.xplock = self.workspace.connector.lock(self.xplockpath, 0).__enter__()
            logger.info("Experiment locked")

        # Move old jobs into "jobs.bak"
        if self.workspace.run_mode == RunMode.NORMAL:
            self.jobsbakpath.mkdir(exist_ok=True)
            for p in self.jobspath.glob("*/*"):
                if p.is_symlink():
                    target = self.jobsbakpath / p.relative_to(self.jobspath)
                    if target.is_symlink():
                        # Remove if duplicate
                        p.unlink()
                    else:
                        # Rename otherwise
                        target.parent.mkdir(parents=True, exist_ok=True)
                        p.rename(target)

        if self.server:
            self.server.start()

        self.workspace.__enter__()
        # Marker file identifying an experimaestro-managed workspace
        (self.workspace.path / ".__experimaestro__").touch()

        # Number of unfinished jobs
        self.unfinishedJobs = 0
        # Number of task-output events still to be processed
        self.taskOutputQueueSize = 0

        # List of failed jobs
        self.failedJobs: Dict[str, Job] = {}

        # Exit mode when catching signals
        self.exitMode = False

        self.scheduler.start_scheduler()
        self.taskOutputsWorker = TaskOutputsWorker(self)
        self.taskOutputsWorker.start()

        # Register for clean shutdown on signals
        SIGNAL_HANDLER.add(self)

        # Install self as the current experiment, remembering the previous one
        self.old_experiment = experiment.CURRENT
        experiment.CURRENT = self
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # NOTE(review): several attributes used below (taskOutputsWorker,
        # exitMode, ...) are only set in __enter__ — this method assumes
        # __enter__ ran first.
        logger.debug("Exiting scheduler context")
        # If no exception and normal run mode, remove old "jobs"
        if self.workspace.run_mode == RunMode.NORMAL:
            if exc_type is None and self.jobsbakpath.is_dir():
                rmtree(self.jobsbakpath)

        # Close the different locks
        try:
            if exc_type:
                # import faulthandler
                # faulthandler.dump_traceback()
                logger.error(
                    "Not waiting since an exception was thrown"
                    " (some jobs may be running)"
                )
            else:
                self.wait()
        finally:
            SIGNAL_HANDLER.remove(self)

            # Stop services
            for service in self.services.values():
                logger.info("Closing service %s", service.description())
                service.stop()

            if self.scheduler is not None:
                logger.info("Stopping scheduler event loop")
                self.scheduler.loop.stop()

            if self.taskOutputsWorker is not None:
                logger.info("Stopping tasks outputs worker")
                # None is the sentinel that stops the worker's run() loop
                self.taskOutputsWorker.queue.put(None)

            self.workspace.__exit__(exc_type, exc_value, traceback)
            if self.xplock:
                self.xplock.__exit__(exc_type, exc_value, traceback)

            # Put back old experiment as current one
            experiment.CURRENT = self.old_experiment
            if self.server:
                logger.info("Stopping web server")
                self.server.stop()

            if self.workspace.run_mode == RunMode.NORMAL:
                # Write the state
                logging.info("Saving the experiment state")
                from experimaestro.scheduler.state import ExperimentState

                ExperimentState.save(
                    self.workdir / "state.json", self.scheduler.jobs.values()
                )

    async def update_task_output_count(self, delta: int):
        """Change in the number of task outputs to process"""
        async with self.scheduler.exitCondition:
            self.taskOutputQueueSize += delta
            logging.debug(
                "Updating queue size with %d => %d", delta, self.taskOutputQueueSize
            )
            # Wake up wait() when the queue drains
            if self.taskOutputQueueSize == 0:
                self.scheduler.exitCondition.notify_all()

    def watch_output(self, watched: "WatchedOutput"):
        """Watch an output

        :param watched: The watched output specification
        """

        self.taskOutputsWorker.watch_output(watched)

    def add_service(self, service: ServiceClass) -> ServiceClass:
        """Adds a service (e.g. tensorboard viewer) to the experiment

        :param service: A service instance
        :return: The same service instance
        """
        self.services[service.id] = service
        # Notify scheduler listeners so UIs can pick the service up
        for listener in self.scheduler.listeners:
            listener.service_add(service)
        return service

    def save(self, obj: Any, name: str = "default"):
        """Serializes configurations.

        Saves configuration objects within the experimental directory

        :param obj: The object to save
        :param name: The name of the saving directory (default to `default`)
        """

        # Saving is a no-op unless this is a normal run
        if self.workspace.run_mode == RunMode.NORMAL:
            from experimaestro import save

            save_dir = self.workdir / "data" / name
            save_dir.mkdir(exist_ok=True, parents=True)

            save(obj, save_dir)

    def load(self, reference: str, name: str = "default"):
        """Deserializes configurations.

        Loads configuration objects from an experimental directory

        :param reference: The name of the experiment
        :param name: The name of the saving directory (default to `default`)
        """
        from experimaestro import load

        path = self.workspace.experimentspath / reference / "data" / name
        return load(path)
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
# re-export at the module level
# Shortcut so callers can use `current()` instead of `experiment.current()`
current = experiment.current
|