experimaestro 1.6.1__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of experimaestro might be problematic. Click here for more details.
- experimaestro/__init__.py +3 -1
- experimaestro/annotations.py +13 -3
- experimaestro/cli/filter.py +3 -3
- experimaestro/cli/jobs.py +1 -1
- experimaestro/commandline.py +3 -7
- experimaestro/connectors/__init__.py +22 -10
- experimaestro/connectors/local.py +17 -8
- experimaestro/connectors/ssh.py +1 -1
- experimaestro/core/arguments.py +26 -3
- experimaestro/core/objects.py +90 -6
- experimaestro/core/objects.pyi +7 -1
- experimaestro/core/types.py +33 -2
- experimaestro/experiments/cli.py +21 -9
- experimaestro/generators.py +6 -1
- experimaestro/ipc.py +4 -1
- experimaestro/launcherfinder/registry.py +23 -5
- experimaestro/launchers/slurm/base.py +47 -9
- experimaestro/notifications.py +1 -1
- experimaestro/run.py +1 -1
- experimaestro/scheduler/base.py +102 -6
- experimaestro/scheduler/dynamic_outputs.py +184 -0
- experimaestro/scheduler/workspace.py +2 -1
- experimaestro/scriptbuilder.py +13 -2
- experimaestro/server/data/016b4a6cdced82ab3aa1.ttf +0 -0
- experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
- experimaestro/server/data/1815e00441357e01619e.ttf +0 -0
- experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
- experimaestro/server/data/2463b90d9a316e4e5294.woff2 +0 -0
- experimaestro/server/data/2582b0e4bcf85eceead0.ttf +0 -0
- experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
- experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
- experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
- experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
- experimaestro/server/data/50701fbb8177c2dde530.ttf +0 -0
- experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
- experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
- experimaestro/server/data/878f31251d960bd6266f.woff2 +0 -0
- experimaestro/server/data/89999bdf5d835c012025.woff2 +0 -0
- experimaestro/server/data/914997e1bdfc990d0897.ttf +0 -0
- experimaestro/server/data/b041b1fa4fe241b23445.woff2 +0 -0
- experimaestro/server/data/b6879d41b0852f01ed5b.woff2 +0 -0
- experimaestro/server/data/c210719e60948b211a12.woff2 +0 -0
- experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
- experimaestro/server/data/d75e3fd1eb12e9bd6655.ttf +0 -0
- experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
- experimaestro/server/data/favicon.ico +0 -0
- experimaestro/server/data/index.css +22963 -0
- experimaestro/server/data/index.css.map +1 -0
- experimaestro/server/data/index.html +27 -0
- experimaestro/server/data/index.js +101770 -0
- experimaestro/server/data/index.js.map +1 -0
- experimaestro/server/data/login.html +22 -0
- experimaestro/server/data/manifest.json +15 -0
- experimaestro/settings.py +2 -2
- experimaestro/sphinx/__init__.py +7 -17
- experimaestro/taskglobals.py +7 -2
- experimaestro/tests/definitions_types.py +5 -3
- experimaestro/tests/launchers/bin/sbatch +34 -7
- experimaestro/tests/launchers/bin/srun +5 -0
- experimaestro/tests/launchers/common.py +16 -4
- experimaestro/tests/restart.py +6 -3
- experimaestro/tests/tasks/all.py +16 -10
- experimaestro/tests/tasks/foreign.py +2 -4
- experimaestro/tests/test_forward.py +5 -5
- experimaestro/tests/test_identifier.py +61 -66
- experimaestro/tests/test_instance.py +3 -6
- experimaestro/tests/test_param.py +40 -22
- experimaestro/tests/test_tags.py +5 -11
- experimaestro/tests/test_tokens.py +3 -2
- experimaestro/tests/test_types.py +17 -14
- experimaestro/tests/test_validation.py +48 -91
- experimaestro/tokens.py +16 -5
- experimaestro/typingutils.py +7 -0
- experimaestro/utils/asyncio.py +6 -2
- experimaestro/utils/resources.py +7 -3
- {experimaestro-1.6.1.dist-info → experimaestro-1.7.0.dist-info}/METADATA +3 -4
- experimaestro-1.7.0.dist-info/RECORD +154 -0
- {experimaestro-1.6.1.dist-info → experimaestro-1.7.0.dist-info}/WHEEL +1 -1
- experimaestro-1.6.1.dist-info/RECORD +0 -122
- {experimaestro-1.6.1.dist-info → experimaestro-1.7.0.dist-info}/LICENSE +0 -0
- {experimaestro-1.6.1.dist-info → experimaestro-1.7.0.dist-info}/entry_points.txt +0 -0
experimaestro/experiments/cli.py
CHANGED
|
@@ -59,7 +59,7 @@ class ExperimentCallable(Protocol):
|
|
|
59
59
|
class ConfigurationLoader:
|
|
60
60
|
def __init__(self):
|
|
61
61
|
self.yamls = []
|
|
62
|
-
self.
|
|
62
|
+
self.python_path = set()
|
|
63
63
|
|
|
64
64
|
def load(self, yaml_file: Path):
|
|
65
65
|
"""Loads a YAML file, and parents one if they exist"""
|
|
@@ -76,9 +76,9 @@ class ConfigurationLoader:
|
|
|
76
76
|
for path in _data.get("pythonpath", []):
|
|
77
77
|
path = Path(path)
|
|
78
78
|
if path.is_absolute():
|
|
79
|
-
self.
|
|
79
|
+
self.python_path.add(path.resolve())
|
|
80
80
|
else:
|
|
81
|
-
self.
|
|
81
|
+
self.python_path.add((yaml_file.parent / path).resolve())
|
|
82
82
|
|
|
83
83
|
|
|
84
84
|
@click.option("--debug", is_flag=True, help="Print debug information")
|
|
@@ -181,7 +181,7 @@ def experiments_cli( # noqa: C901
|
|
|
181
181
|
configuration.merge_with(OmegaConf.from_dotlist(extra_conf))
|
|
182
182
|
|
|
183
183
|
# --- Get the XP file
|
|
184
|
-
|
|
184
|
+
python_path = list(conf_loader.python_path)
|
|
185
185
|
if module_name is None:
|
|
186
186
|
module_name = configuration.get("module", None)
|
|
187
187
|
|
|
@@ -192,9 +192,13 @@ def experiments_cli( # noqa: C901
|
|
|
192
192
|
not module_name
|
|
193
193
|
), "Module name and experiment file are mutually exclusive options"
|
|
194
194
|
xp_file = Path(xp_file)
|
|
195
|
-
if not
|
|
196
|
-
|
|
197
|
-
logging.info(
|
|
195
|
+
if not python_path:
|
|
196
|
+
python_path.append(xp_file.parent)
|
|
197
|
+
logging.info(
|
|
198
|
+
"Using python path: %s", ", ".join(str(s) for s in python_path)
|
|
199
|
+
)
|
|
200
|
+
else:
|
|
201
|
+
xp_file = Path(xp_file)
|
|
198
202
|
|
|
199
203
|
assert (
|
|
200
204
|
module_name or xp_file
|
|
@@ -209,7 +213,7 @@ def experiments_cli( # noqa: C901
|
|
|
209
213
|
# --- Finds the "run" function
|
|
210
214
|
|
|
211
215
|
# Modifies the Python path
|
|
212
|
-
for path in
|
|
216
|
+
for path in python_path:
|
|
213
217
|
sys.path.append(str(path))
|
|
214
218
|
|
|
215
219
|
if xp_file:
|
|
@@ -226,7 +230,11 @@ def experiments_cli( # noqa: C901
|
|
|
226
230
|
)
|
|
227
231
|
else:
|
|
228
232
|
# Module
|
|
229
|
-
|
|
233
|
+
try:
|
|
234
|
+
mod = importlib.import_module(module_name)
|
|
235
|
+
except ModuleNotFoundError as e:
|
|
236
|
+
logging.error("Module not found: %s with python path %s", e, sys.path)
|
|
237
|
+
raise
|
|
230
238
|
|
|
231
239
|
helper = getattr(mod, "run", None)
|
|
232
240
|
|
|
@@ -265,6 +273,7 @@ def experiments_cli( # noqa: C901
|
|
|
265
273
|
|
|
266
274
|
# Define the workspace
|
|
267
275
|
ws_env = find_workspace(workdir=workdir, workspace=workspace)
|
|
276
|
+
|
|
268
277
|
workdir = ws_env.path
|
|
269
278
|
|
|
270
279
|
logging.info("Using working directory %s", str(workdir.resolve()))
|
|
@@ -278,6 +287,9 @@ def experiments_cli( # noqa: C901
|
|
|
278
287
|
for key, value in env:
|
|
279
288
|
xp.setenv(key, value)
|
|
280
289
|
|
|
290
|
+
# Sets the python path
|
|
291
|
+
xp.workspace.python_path.extend(python_path)
|
|
292
|
+
|
|
281
293
|
try:
|
|
282
294
|
# Run the experiment
|
|
283
295
|
helper.xp = xp
|
experimaestro/generators.py
CHANGED
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
import inspect
|
|
2
2
|
from pathlib import Path
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
3
4
|
from typing import Callable, Union
|
|
4
5
|
from experimaestro.core.arguments import ArgumentOptions, TypeAnnotation
|
|
5
6
|
from experimaestro.core.objects import ConfigWalkContext, Config
|
|
6
7
|
|
|
7
8
|
|
|
8
|
-
class Generator:
|
|
9
|
+
class Generator(ABC):
|
|
9
10
|
"""Base class for all generators"""
|
|
10
11
|
|
|
11
12
|
def isoutput(self):
|
|
@@ -13,6 +14,10 @@ class Generator:
|
|
|
13
14
|
path within the job folder)"""
|
|
14
15
|
return False
|
|
15
16
|
|
|
17
|
+
@abstractmethod
|
|
18
|
+
def __call__(self, context: ConfigWalkContext, config: Config):
|
|
19
|
+
...
|
|
20
|
+
|
|
16
21
|
|
|
17
22
|
class PathGenerator(Generator):
|
|
18
23
|
"""Generates a path"""
|
experimaestro/ipc.py
CHANGED
|
@@ -7,6 +7,7 @@ import sys
|
|
|
7
7
|
import logging
|
|
8
8
|
from .utils import logger
|
|
9
9
|
from watchdog.observers import Observer
|
|
10
|
+
from watchdog.observers.api import ObservedWatch
|
|
10
11
|
from watchdog.events import FileSystemEventHandler
|
|
11
12
|
|
|
12
13
|
|
|
@@ -20,7 +21,9 @@ class IPCom:
|
|
|
20
21
|
self.observer.start()
|
|
21
22
|
self.pid = os.getpid()
|
|
22
23
|
|
|
23
|
-
def fswatch(
|
|
24
|
+
def fswatch(
|
|
25
|
+
self, watcher: FileSystemEventHandler, path: Path, recursive=False
|
|
26
|
+
) -> ObservedWatch:
|
|
24
27
|
if not self.observer.is_alive():
|
|
25
28
|
logging.error("Observer is not alive")
|
|
26
29
|
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
# Configuration registers
|
|
2
2
|
|
|
3
|
+
from contextlib import contextmanager
|
|
3
4
|
from typing import ClassVar, Dict, Optional, Set, Type, Union
|
|
4
5
|
|
|
5
6
|
from pathlib import Path
|
|
@@ -7,7 +8,6 @@ import typing
|
|
|
7
8
|
from omegaconf import DictConfig, OmegaConf, SCMode
|
|
8
9
|
import pkg_resources
|
|
9
10
|
from experimaestro.utils import logger
|
|
10
|
-
|
|
11
11
|
from .base import ConnectorConfiguration, TokenConfiguration
|
|
12
12
|
from .specs import HostRequirement
|
|
13
13
|
|
|
@@ -36,6 +36,16 @@ def load_yaml(schema, path: Path):
|
|
|
36
36
|
)
|
|
37
37
|
|
|
38
38
|
|
|
39
|
+
@contextmanager
|
|
40
|
+
def ensure_enter(fp):
|
|
41
|
+
"""Behaves as a resource, whether it is one or not"""
|
|
42
|
+
if hasattr(fp, "__enter__"):
|
|
43
|
+
with fp as _fp:
|
|
44
|
+
yield _fp
|
|
45
|
+
else:
|
|
46
|
+
yield fp
|
|
47
|
+
|
|
48
|
+
|
|
39
49
|
class LauncherRegistry:
|
|
40
50
|
INSTANCES: ClassVar[Dict[Path, "LauncherRegistry"]] = {}
|
|
41
51
|
CURRENT_CONFIG_DIR: ClassVar[Optional[Path]] = None
|
|
@@ -78,13 +88,16 @@ class LauncherRegistry:
|
|
|
78
88
|
|
|
79
89
|
from importlib import util
|
|
80
90
|
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
91
|
+
with ensure_enter(launchers_py.__fspath__()) as fp:
|
|
92
|
+
spec = util.spec_from_file_location("xpm_launchers_conf", fp)
|
|
93
|
+
module = util.module_from_spec(spec)
|
|
94
|
+
spec.loader.exec_module(module)
|
|
84
95
|
|
|
85
96
|
self.find_launcher_fn = getattr(module, "find_launcher", None)
|
|
86
97
|
if self.find_launcher_fn is None:
|
|
87
|
-
logger.
|
|
98
|
+
logger.warning(
|
|
99
|
+
"No find_launcher() function was found in %s", launchers_py
|
|
100
|
+
)
|
|
88
101
|
|
|
89
102
|
# Read the configuration file
|
|
90
103
|
self.connectors = load_yaml(
|
|
@@ -144,9 +157,14 @@ class LauncherRegistry:
|
|
|
144
157
|
specs.append(spec)
|
|
145
158
|
|
|
146
159
|
# Use launcher function
|
|
160
|
+
from experimaestro.launchers import Launcher
|
|
161
|
+
|
|
147
162
|
if self.find_launcher_fn is not None:
|
|
148
163
|
for spec in specs:
|
|
149
164
|
if launcher := self.find_launcher_fn(spec, tags):
|
|
165
|
+
assert isinstance(
|
|
166
|
+
launcher, Launcher
|
|
167
|
+
), "f{self.find_launcher_fn} did not return a Launcher but {type(launcher)}"
|
|
150
168
|
return launcher
|
|
151
169
|
|
|
152
170
|
return None
|
|
@@ -11,6 +11,7 @@ from typing import (
|
|
|
11
11
|
)
|
|
12
12
|
from experimaestro.connectors.local import LocalConnector
|
|
13
13
|
import re
|
|
14
|
+
from shlex import quote as shquote
|
|
14
15
|
from contextlib import contextmanager
|
|
15
16
|
from dataclasses import dataclass
|
|
16
17
|
from experimaestro.launcherfinder.registry import (
|
|
@@ -235,15 +236,15 @@ class SlurmProcessBuilder(ProcessBuilder):
|
|
|
235
236
|
super().__init__()
|
|
236
237
|
self.launcher = launcher
|
|
237
238
|
|
|
238
|
-
def start(self) -> BatchSlurmProcess:
|
|
239
|
+
def start(self, task_mode: bool = False) -> BatchSlurmProcess:
|
|
239
240
|
"""Start the process"""
|
|
240
241
|
builder = self.launcher.connector.processbuilder()
|
|
241
|
-
builder.workingDirectory = self.workingDirectory
|
|
242
242
|
builder.environ = self.launcher.launcherenv
|
|
243
243
|
builder.detach = False
|
|
244
244
|
|
|
245
245
|
if not self.detach:
|
|
246
246
|
# Simplest case: we wait for the output
|
|
247
|
+
builder.workingDirectory = self.workingDirectory
|
|
247
248
|
builder.command = [f"{self.launcher.binpath}/srun"]
|
|
248
249
|
builder.command.extend(self.launcher.options.args())
|
|
249
250
|
builder.command.extend(self.command)
|
|
@@ -255,11 +256,17 @@ class SlurmProcessBuilder(ProcessBuilder):
|
|
|
255
256
|
return builder.start()
|
|
256
257
|
|
|
257
258
|
builder.command = [f"{self.launcher.binpath}/sbatch", "--parsable"]
|
|
258
|
-
builder.command.extend(self.launcher.options.args())
|
|
259
259
|
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
260
|
+
if not task_mode:
|
|
261
|
+
# Use command line parameters when not running a task
|
|
262
|
+
builder.command.extend(self.launcher.options.args())
|
|
263
|
+
|
|
264
|
+
if self.workingDirectory:
|
|
265
|
+
workdir = self.launcher.connector.resolve(self.workingDirectory)
|
|
266
|
+
builder.command.append(f"--chdir={workdir}")
|
|
267
|
+
addstream(builder.command, "-e", self.stderr)
|
|
268
|
+
addstream(builder.command, "-o", self.stdout)
|
|
269
|
+
addstream(builder.command, "-i", self.stdin)
|
|
263
270
|
|
|
264
271
|
builder.command.extend(self.command)
|
|
265
272
|
logger.info(
|
|
@@ -427,12 +434,43 @@ class SlurmLauncher(Launcher):
|
|
|
427
434
|
|
|
428
435
|
We assume *nix, but should be changed to PythonScriptBuilder when working
|
|
429
436
|
"""
|
|
430
|
-
|
|
431
|
-
builder.processtype = "slurm"
|
|
432
|
-
return builder
|
|
437
|
+
return SlurmScriptBuilder(self)
|
|
433
438
|
|
|
434
439
|
def processbuilder(self) -> SlurmProcessBuilder:
|
|
435
440
|
"""Returns the process builder for this launcher
|
|
436
441
|
|
|
437
442
|
By default, returns the associated connector builder"""
|
|
438
443
|
return SlurmProcessBuilder(self)
|
|
444
|
+
|
|
445
|
+
|
|
446
|
+
class SlurmScriptBuilder(PythonScriptBuilder):
|
|
447
|
+
def __init__(self, launcher: SlurmLauncher, pythonpath=None):
|
|
448
|
+
super().__init__(pythonpath)
|
|
449
|
+
self.launcher = launcher
|
|
450
|
+
self.processtype = "slurm"
|
|
451
|
+
|
|
452
|
+
def write(self, job):
|
|
453
|
+
py_path = super().write(job)
|
|
454
|
+
main_path = py_path.parent
|
|
455
|
+
|
|
456
|
+
def relpath(path: Path):
|
|
457
|
+
return shquote(self.launcher.connector.resolve(path, main_path))
|
|
458
|
+
|
|
459
|
+
# Writes the sbatch shell script containing all the options
|
|
460
|
+
sh_path = job.jobpath / ("%s.sh" % job.name)
|
|
461
|
+
with sh_path.open("wt") as out:
|
|
462
|
+
out.write("""#!/bin/sh\n\n""")
|
|
463
|
+
|
|
464
|
+
workdir = self.launcher.connector.resolve(main_path)
|
|
465
|
+
out.write(f"#SBATCH --chdir={shquote(workdir)}\n")
|
|
466
|
+
out.write(f"""#SBATCH --error={relpath(job.stderr)}\n""")
|
|
467
|
+
out.write(f"""#SBATCH --output={relpath(job.stdout)}\n""")
|
|
468
|
+
|
|
469
|
+
for arg in self.launcher.options.args():
|
|
470
|
+
out.write(f"""#SBATCH {arg}\n""")
|
|
471
|
+
|
|
472
|
+
# We finish by the call to srun
|
|
473
|
+
out.write(f"""\nsrun ./{relpath(py_path)}\n\n""")
|
|
474
|
+
|
|
475
|
+
self.launcher.connector.setExecutable(sh_path, True)
|
|
476
|
+
return sh_path
|
experimaestro/notifications.py
CHANGED
|
@@ -78,7 +78,6 @@ class Reporter(threading.Thread):
|
|
|
78
78
|
|
|
79
79
|
self.progress_threshold = 0.01
|
|
80
80
|
self.cv = threading.Condition()
|
|
81
|
-
self.start()
|
|
82
81
|
|
|
83
82
|
def stop(self):
|
|
84
83
|
self.stopping = True
|
|
@@ -222,6 +221,7 @@ class Reporter(threading.Thread):
|
|
|
222
221
|
taskpath = TaskEnv.instance().taskpath
|
|
223
222
|
assert taskpath is not None, "Task path is not defined"
|
|
224
223
|
Reporter.INSTANCE = Reporter(taskpath)
|
|
224
|
+
Reporter.INSTANCE.start()
|
|
225
225
|
return Reporter.INSTANCE
|
|
226
226
|
|
|
227
227
|
|
experimaestro/run.py
CHANGED
|
@@ -140,10 +140,10 @@ class TaskRunner:
|
|
|
140
140
|
run(workdir / "params.json")
|
|
141
141
|
|
|
142
142
|
# ... remove the handlers
|
|
143
|
-
logger.info("Task ended successfully")
|
|
144
143
|
remove_signal_handlers(remove_cleanup=False)
|
|
145
144
|
|
|
146
145
|
# Everything went OK
|
|
146
|
+
logger.info("Task ended successfully")
|
|
147
147
|
sys.exit(0)
|
|
148
148
|
except Exception:
|
|
149
149
|
logger.exception("Got exception while running")
|
experimaestro/scheduler/base.py
CHANGED
|
@@ -1,12 +1,22 @@
|
|
|
1
1
|
from collections import ChainMap
|
|
2
2
|
from functools import cached_property
|
|
3
|
+
import itertools
|
|
3
4
|
import logging
|
|
4
5
|
import os
|
|
5
6
|
from pathlib import Path
|
|
6
7
|
from shutil import rmtree
|
|
7
8
|
import threading
|
|
8
9
|
import time
|
|
9
|
-
from typing import
|
|
10
|
+
from typing import (
|
|
11
|
+
Any,
|
|
12
|
+
Iterator,
|
|
13
|
+
List,
|
|
14
|
+
Optional,
|
|
15
|
+
Set,
|
|
16
|
+
TypeVar,
|
|
17
|
+
Union,
|
|
18
|
+
TYPE_CHECKING,
|
|
19
|
+
)
|
|
10
20
|
import enum
|
|
11
21
|
import signal
|
|
12
22
|
import asyncio
|
|
@@ -17,9 +27,10 @@ from experimaestro.scheduler.services import Service
|
|
|
17
27
|
from experimaestro.settings import WorkspaceSettings, get_settings
|
|
18
28
|
|
|
19
29
|
|
|
20
|
-
from experimaestro.core.objects import Config, ConfigWalkContext
|
|
30
|
+
from experimaestro.core.objects import Config, ConfigWalkContext, WatchedOutput
|
|
21
31
|
from experimaestro.utils import logger
|
|
22
32
|
from experimaestro.locking import Locks, LockError, Lock
|
|
33
|
+
from experimaestro.utils.asyncio import asyncThreadcheck
|
|
23
34
|
from .workspace import RunMode, Workspace
|
|
24
35
|
from .dependencies import Dependency, DependencyStatus, Resource
|
|
25
36
|
import concurrent.futures
|
|
@@ -110,7 +121,7 @@ class JobDependency(Dependency):
|
|
|
110
121
|
|
|
111
122
|
|
|
112
123
|
class Job(Resource):
|
|
113
|
-
"""A job is a
|
|
124
|
+
"""A job is a resource that is produced by the execution of some code"""
|
|
114
125
|
|
|
115
126
|
# Set by the scheduler
|
|
116
127
|
_readyEvent: Optional[asyncio.Event]
|
|
@@ -148,6 +159,11 @@ class Job(Resource):
|
|
|
148
159
|
# Dependencies
|
|
149
160
|
self.dependencies: Set[Dependency] = set() # as target
|
|
150
161
|
|
|
162
|
+
# Watched outputs
|
|
163
|
+
self.watched_outputs = {}
|
|
164
|
+
for watched in config.__xpm__.watched_outputs:
|
|
165
|
+
self.watch_output(watched)
|
|
166
|
+
|
|
151
167
|
# Process
|
|
152
168
|
self._process = None
|
|
153
169
|
self.unsatisfied = 0
|
|
@@ -159,6 +175,23 @@ class Job(Resource):
|
|
|
159
175
|
self._progress: List[LevelInformation] = []
|
|
160
176
|
self.tags = config.tags()
|
|
161
177
|
|
|
178
|
+
def watch_output(self, watched: "WatchedOutput"):
|
|
179
|
+
"""Monitor task outputs
|
|
180
|
+
|
|
181
|
+
:param watched: A description of the watched output
|
|
182
|
+
"""
|
|
183
|
+
self.scheduler.xp.watch_output(watched)
|
|
184
|
+
|
|
185
|
+
def task_output_update(self, subpath: Path):
|
|
186
|
+
"""Notification of an updated task output"""
|
|
187
|
+
if watcher := self.watched_outputs.get(subpath, None):
|
|
188
|
+
watcher.update()
|
|
189
|
+
|
|
190
|
+
def done_handler(self):
|
|
191
|
+
"""The task has been completed"""
|
|
192
|
+
for watcher in self.watched_outputs.values():
|
|
193
|
+
watcher.update()
|
|
194
|
+
|
|
162
195
|
def __str__(self):
|
|
163
196
|
return "Job[{}]".format(self.identifier)
|
|
164
197
|
|
|
@@ -166,6 +199,11 @@ class Job(Resource):
|
|
|
166
199
|
assert self._future, "Cannot wait a not submitted job"
|
|
167
200
|
return self._future.result()
|
|
168
201
|
|
|
202
|
+
@cached_property
|
|
203
|
+
def python_path(self) -> Iterator[str]:
|
|
204
|
+
"""Returns an iterator over python path"""
|
|
205
|
+
return itertools.chain(self.workspace.python_path)
|
|
206
|
+
|
|
169
207
|
@cached_property
|
|
170
208
|
def environ(self):
|
|
171
209
|
"""Returns the job environment
|
|
@@ -219,7 +257,7 @@ class Job(Resource):
|
|
|
219
257
|
return self.state == JobState.READY
|
|
220
258
|
|
|
221
259
|
@property
|
|
222
|
-
def jobpath(self):
|
|
260
|
+
def jobpath(self) -> Path:
|
|
223
261
|
"""Deprecated, use `path`"""
|
|
224
262
|
return self.workspace.jobspath / self.relpath
|
|
225
263
|
|
|
@@ -227,6 +265,14 @@ class Job(Resource):
|
|
|
227
265
|
def path(self) -> Path:
|
|
228
266
|
return self.workspace.jobspath / self.relpath
|
|
229
267
|
|
|
268
|
+
@property
|
|
269
|
+
def experimaestro_path(self) -> Path:
|
|
270
|
+
return (self.path / ".experimaestro").resolve()
|
|
271
|
+
|
|
272
|
+
@cached_property
|
|
273
|
+
def task_outputs_path(self) -> Path:
|
|
274
|
+
return self.experimaestro_path / "task-outputs.jsonl"
|
|
275
|
+
|
|
230
276
|
@property
|
|
231
277
|
def relpath(self):
|
|
232
278
|
identifier = self.config.__xpm__.identifier
|
|
@@ -436,7 +482,7 @@ class Scheduler:
|
|
|
436
482
|
self.jobs: Dict[str, "Job"] = {}
|
|
437
483
|
|
|
438
484
|
# List of jobs
|
|
439
|
-
self.waitingjobs = set()
|
|
485
|
+
self.waitingjobs: Set[Job] = set()
|
|
440
486
|
|
|
441
487
|
# Listeners
|
|
442
488
|
self.listeners: Set[Listener] = set()
|
|
@@ -459,10 +505,12 @@ class Scheduler:
|
|
|
459
505
|
|
|
460
506
|
def submit(self, job: Job) -> Optional[Job]:
|
|
461
507
|
# Wait for the future containing the submitted job
|
|
508
|
+
logger.debug("Registering the job %s within the scheduler", job)
|
|
462
509
|
otherFuture = asyncio.run_coroutine_threadsafe(
|
|
463
510
|
self.aio_registerJob(job), self.loop
|
|
464
511
|
)
|
|
465
512
|
other = otherFuture.result()
|
|
513
|
+
logger.debug("Job already submitted" if other else "First submission")
|
|
466
514
|
if other:
|
|
467
515
|
return other
|
|
468
516
|
|
|
@@ -598,9 +646,13 @@ class Scheduler:
|
|
|
598
646
|
if job.state != JobState.DONE:
|
|
599
647
|
self.xp.failedJobs[job.identifier] = job
|
|
600
648
|
|
|
649
|
+
# Process all remaining tasks outputs
|
|
650
|
+
await asyncThreadcheck("End of job processing", job.done_handler)
|
|
651
|
+
|
|
601
652
|
# Decrement the number of unfinished jobs and notify
|
|
602
653
|
self.xp.unfinishedJobs -= 1
|
|
603
654
|
async with self.xp.central.exitCondition:
|
|
655
|
+
logging.debug("Updated number of unfinished jobs")
|
|
604
656
|
self.xp.central.exitCondition.notify_all()
|
|
605
657
|
|
|
606
658
|
job.endtime = time.time()
|
|
@@ -688,6 +740,7 @@ class Scheduler:
|
|
|
688
740
|
code = await process.aio_code()
|
|
689
741
|
logger.debug("Got return code %s for %s", code, job)
|
|
690
742
|
|
|
743
|
+
# Check the file if there is no return code
|
|
691
744
|
if code is None:
|
|
692
745
|
# Case where we cannot retrieve the code right away
|
|
693
746
|
if job.donepath.is_file():
|
|
@@ -853,6 +906,7 @@ class experiment:
|
|
|
853
906
|
assert self.central is not None
|
|
854
907
|
async with self.central.exitCondition:
|
|
855
908
|
self.exitMode = True
|
|
909
|
+
logging.debug("Setting exit mode to true")
|
|
856
910
|
self.central.exitCondition.notify_all()
|
|
857
911
|
|
|
858
912
|
assert self.central is not None and self.central.loop is not None
|
|
@@ -863,10 +917,22 @@ class experiment:
|
|
|
863
917
|
|
|
864
918
|
async def awaitcompletion():
|
|
865
919
|
assert self.central is not None
|
|
920
|
+
logger.debug("Waiting to exit scheduler...")
|
|
866
921
|
async with self.central.exitCondition:
|
|
867
922
|
while True:
|
|
868
|
-
if self.
|
|
923
|
+
if self.exitMode:
|
|
869
924
|
break
|
|
925
|
+
|
|
926
|
+
# If we have still unfinished jobs or possible new tasks, wait
|
|
927
|
+
logger.debug(
|
|
928
|
+
"Checking exit condition: unfinished jobs=%d, task output queue size=%d",
|
|
929
|
+
self.unfinishedJobs,
|
|
930
|
+
self.taskOutputQueueSize,
|
|
931
|
+
)
|
|
932
|
+
if self.unfinishedJobs == 0 and self.taskOutputQueueSize == 0:
|
|
933
|
+
break
|
|
934
|
+
|
|
935
|
+
# Wait for more news...
|
|
870
936
|
await self.central.exitCondition.wait()
|
|
871
937
|
|
|
872
938
|
if self.failedJobs:
|
|
@@ -898,6 +964,8 @@ class experiment:
|
|
|
898
964
|
return self.workspace.connector.createtoken(name, count)
|
|
899
965
|
|
|
900
966
|
def __enter__(self):
|
|
967
|
+
from .dynamic_outputs import TaskOutputsWorker
|
|
968
|
+
|
|
901
969
|
if self.workspace.run_mode != RunMode.DRY_RUN:
|
|
902
970
|
logger.info("Locking experiment %s", self.xplockpath)
|
|
903
971
|
self.xplock = self.workspace.connector.lock(self.xplockpath, 0).__enter__()
|
|
@@ -926,6 +994,7 @@ class experiment:
|
|
|
926
994
|
global SIGNAL_HANDLER
|
|
927
995
|
# Number of unfinished jobs
|
|
928
996
|
self.unfinishedJobs = 0
|
|
997
|
+
self.taskOutputQueueSize = 0
|
|
929
998
|
|
|
930
999
|
# List of failed jobs
|
|
931
1000
|
self.failedJobs: Dict[str, Job] = {}
|
|
@@ -934,6 +1003,8 @@ class experiment:
|
|
|
934
1003
|
self.exitMode = False
|
|
935
1004
|
|
|
936
1005
|
self.central = SchedulerCentral.create(self.scheduler.name)
|
|
1006
|
+
self.taskOutputsWorker = TaskOutputsWorker(self)
|
|
1007
|
+
self.taskOutputsWorker.start()
|
|
937
1008
|
|
|
938
1009
|
SIGNAL_HANDLER.add(self)
|
|
939
1010
|
|
|
@@ -942,6 +1013,7 @@ class experiment:
|
|
|
942
1013
|
return self
|
|
943
1014
|
|
|
944
1015
|
def __exit__(self, exc_type, exc_value, traceback):
|
|
1016
|
+
logger.debug("Exiting scheduler context")
|
|
945
1017
|
# If no exception and normal run mode, remove old "jobs"
|
|
946
1018
|
if self.workspace.run_mode == RunMode.NORMAL:
|
|
947
1019
|
if exc_type is None and self.jobsbakpath.is_dir():
|
|
@@ -967,8 +1039,13 @@ class experiment:
|
|
|
967
1039
|
service.stop()
|
|
968
1040
|
|
|
969
1041
|
if self.central is not None:
|
|
1042
|
+
logger.info("Stopping scheduler event loop")
|
|
970
1043
|
self.central.loop.stop()
|
|
971
1044
|
|
|
1045
|
+
if self.taskOutputsWorker is not None:
|
|
1046
|
+
logger.info("Stopping tasks outputs worker")
|
|
1047
|
+
self.taskOutputsWorker.queue.put(None)
|
|
1048
|
+
|
|
972
1049
|
self.central = None
|
|
973
1050
|
self.workspace.__exit__(exc_type, exc_value, traceback)
|
|
974
1051
|
if self.xplock:
|
|
@@ -977,8 +1054,27 @@ class experiment:
|
|
|
977
1054
|
# Put back old experiment as current one
|
|
978
1055
|
experiment.CURRENT = self.old_experiment
|
|
979
1056
|
if self.server:
|
|
1057
|
+
logger.info("Stopping web server")
|
|
980
1058
|
self.server.stop()
|
|
981
1059
|
|
|
1060
|
+
async def update_task_output_count(self, delta: int):
|
|
1061
|
+
"""Change in the number of task outputs to process"""
|
|
1062
|
+
async with self.central.exitCondition:
|
|
1063
|
+
self.taskOutputQueueSize += delta
|
|
1064
|
+
logging.debug(
|
|
1065
|
+
"Updating queue size with %d => %d", delta, self.taskOutputQueueSize
|
|
1066
|
+
)
|
|
1067
|
+
if self.taskOutputQueueSize == 0:
|
|
1068
|
+
self.central.exitCondition.notify_all()
|
|
1069
|
+
|
|
1070
|
+
def watch_output(self, watched: "WatchedOutput"):
|
|
1071
|
+
"""Watch an output
|
|
1072
|
+
|
|
1073
|
+
:param watched: The watched output specification
|
|
1074
|
+
"""
|
|
1075
|
+
|
|
1076
|
+
self.taskOutputsWorker.watch_output(watched)
|
|
1077
|
+
|
|
982
1078
|
def add_service(self, service: ServiceClass) -> ServiceClass:
|
|
983
1079
|
"""Adds a service (e.g. tensorboard viewer) to the experiment
|
|
984
1080
|
|