experimaestro 1.6.2__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of experimaestro might be problematic. Click here for more details.
- experimaestro/__init__.py +3 -1
- experimaestro/annotations.py +13 -3
- experimaestro/cli/filter.py +3 -3
- experimaestro/cli/jobs.py +1 -1
- experimaestro/commandline.py +3 -7
- experimaestro/connectors/__init__.py +22 -10
- experimaestro/connectors/local.py +17 -8
- experimaestro/connectors/ssh.py +1 -1
- experimaestro/core/arguments.py +26 -3
- experimaestro/core/objects.py +90 -6
- experimaestro/core/objects.pyi +7 -1
- experimaestro/core/types.py +33 -2
- experimaestro/experiments/cli.py +7 -3
- experimaestro/generators.py +6 -1
- experimaestro/ipc.py +4 -1
- experimaestro/launcherfinder/registry.py +23 -5
- experimaestro/launchers/slurm/base.py +47 -9
- experimaestro/notifications.py +1 -1
- experimaestro/run.py +1 -1
- experimaestro/scheduler/base.py +98 -10
- experimaestro/scheduler/dynamic_outputs.py +184 -0
- experimaestro/scriptbuilder.py +3 -1
- experimaestro/server/data/016b4a6cdced82ab3aa1.ttf +0 -0
- experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
- experimaestro/server/data/1815e00441357e01619e.ttf +0 -0
- experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
- experimaestro/server/data/2463b90d9a316e4e5294.woff2 +0 -0
- experimaestro/server/data/2582b0e4bcf85eceead0.ttf +0 -0
- experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
- experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
- experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
- experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
- experimaestro/server/data/50701fbb8177c2dde530.ttf +0 -0
- experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
- experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
- experimaestro/server/data/878f31251d960bd6266f.woff2 +0 -0
- experimaestro/server/data/89999bdf5d835c012025.woff2 +0 -0
- experimaestro/server/data/914997e1bdfc990d0897.ttf +0 -0
- experimaestro/server/data/b041b1fa4fe241b23445.woff2 +0 -0
- experimaestro/server/data/b6879d41b0852f01ed5b.woff2 +0 -0
- experimaestro/server/data/c210719e60948b211a12.woff2 +0 -0
- experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
- experimaestro/server/data/d75e3fd1eb12e9bd6655.ttf +0 -0
- experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
- experimaestro/server/data/favicon.ico +0 -0
- experimaestro/server/data/index.css +22963 -0
- experimaestro/server/data/index.css.map +1 -0
- experimaestro/server/data/index.html +27 -0
- experimaestro/server/data/index.js +101770 -0
- experimaestro/server/data/index.js.map +1 -0
- experimaestro/server/data/login.html +22 -0
- experimaestro/server/data/manifest.json +15 -0
- experimaestro/sphinx/__init__.py +7 -17
- experimaestro/taskglobals.py +7 -2
- experimaestro/tests/definitions_types.py +5 -3
- experimaestro/tests/launchers/bin/sbatch +34 -7
- experimaestro/tests/launchers/bin/srun +5 -0
- experimaestro/tests/launchers/common.py +16 -4
- experimaestro/tests/restart.py +6 -3
- experimaestro/tests/tasks/all.py +16 -10
- experimaestro/tests/tasks/foreign.py +2 -4
- experimaestro/tests/test_forward.py +5 -5
- experimaestro/tests/test_identifier.py +61 -66
- experimaestro/tests/test_instance.py +3 -6
- experimaestro/tests/test_param.py +40 -22
- experimaestro/tests/test_tags.py +5 -11
- experimaestro/tests/test_tokens.py +3 -2
- experimaestro/tests/test_types.py +17 -14
- experimaestro/tests/test_validation.py +48 -91
- experimaestro/tokens.py +16 -5
- experimaestro/typingutils.py +7 -0
- experimaestro/utils/asyncio.py +6 -2
- experimaestro/utils/resources.py +7 -3
- {experimaestro-1.6.2.dist-info → experimaestro-1.7.0.dist-info}/METADATA +3 -4
- experimaestro-1.7.0.dist-info/RECORD +154 -0
- {experimaestro-1.6.2.dist-info → experimaestro-1.7.0.dist-info}/WHEEL +1 -1
- experimaestro-1.6.2.dist-info/RECORD +0 -122
- {experimaestro-1.6.2.dist-info → experimaestro-1.7.0.dist-info}/LICENSE +0 -0
- {experimaestro-1.6.2.dist-info → experimaestro-1.7.0.dist-info}/entry_points.txt +0 -0
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
# Configuration registers
|
|
2
2
|
|
|
3
|
+
from contextlib import contextmanager
|
|
3
4
|
from typing import ClassVar, Dict, Optional, Set, Type, Union
|
|
4
5
|
|
|
5
6
|
from pathlib import Path
|
|
@@ -7,7 +8,6 @@ import typing
|
|
|
7
8
|
from omegaconf import DictConfig, OmegaConf, SCMode
|
|
8
9
|
import pkg_resources
|
|
9
10
|
from experimaestro.utils import logger
|
|
10
|
-
|
|
11
11
|
from .base import ConnectorConfiguration, TokenConfiguration
|
|
12
12
|
from .specs import HostRequirement
|
|
13
13
|
|
|
@@ -36,6 +36,16 @@ def load_yaml(schema, path: Path):
|
|
|
36
36
|
)
|
|
37
37
|
|
|
38
38
|
|
|
39
|
+
@contextmanager
|
|
40
|
+
def ensure_enter(fp):
|
|
41
|
+
"""Behaves as a resource, whether it is one or not"""
|
|
42
|
+
if hasattr(fp, "__enter__"):
|
|
43
|
+
with fp as _fp:
|
|
44
|
+
yield _fp
|
|
45
|
+
else:
|
|
46
|
+
yield fp
|
|
47
|
+
|
|
48
|
+
|
|
39
49
|
class LauncherRegistry:
|
|
40
50
|
INSTANCES: ClassVar[Dict[Path, "LauncherRegistry"]] = {}
|
|
41
51
|
CURRENT_CONFIG_DIR: ClassVar[Optional[Path]] = None
|
|
@@ -78,13 +88,16 @@ class LauncherRegistry:
|
|
|
78
88
|
|
|
79
89
|
from importlib import util
|
|
80
90
|
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
91
|
+
with ensure_enter(launchers_py.__fspath__()) as fp:
|
|
92
|
+
spec = util.spec_from_file_location("xpm_launchers_conf", fp)
|
|
93
|
+
module = util.module_from_spec(spec)
|
|
94
|
+
spec.loader.exec_module(module)
|
|
84
95
|
|
|
85
96
|
self.find_launcher_fn = getattr(module, "find_launcher", None)
|
|
86
97
|
if self.find_launcher_fn is None:
|
|
87
|
-
logger.
|
|
98
|
+
logger.warning(
|
|
99
|
+
"No find_launcher() function was found in %s", launchers_py
|
|
100
|
+
)
|
|
88
101
|
|
|
89
102
|
# Read the configuration file
|
|
90
103
|
self.connectors = load_yaml(
|
|
@@ -144,9 +157,14 @@ class LauncherRegistry:
|
|
|
144
157
|
specs.append(spec)
|
|
145
158
|
|
|
146
159
|
# Use launcher function
|
|
160
|
+
from experimaestro.launchers import Launcher
|
|
161
|
+
|
|
147
162
|
if self.find_launcher_fn is not None:
|
|
148
163
|
for spec in specs:
|
|
149
164
|
if launcher := self.find_launcher_fn(spec, tags):
|
|
165
|
+
assert isinstance(
|
|
166
|
+
launcher, Launcher
|
|
167
|
+
), "f{self.find_launcher_fn} did not return a Launcher but {type(launcher)}"
|
|
150
168
|
return launcher
|
|
151
169
|
|
|
152
170
|
return None
|
|
@@ -11,6 +11,7 @@ from typing import (
|
|
|
11
11
|
)
|
|
12
12
|
from experimaestro.connectors.local import LocalConnector
|
|
13
13
|
import re
|
|
14
|
+
from shlex import quote as shquote
|
|
14
15
|
from contextlib import contextmanager
|
|
15
16
|
from dataclasses import dataclass
|
|
16
17
|
from experimaestro.launcherfinder.registry import (
|
|
@@ -235,15 +236,15 @@ class SlurmProcessBuilder(ProcessBuilder):
|
|
|
235
236
|
super().__init__()
|
|
236
237
|
self.launcher = launcher
|
|
237
238
|
|
|
238
|
-
def start(self) -> BatchSlurmProcess:
|
|
239
|
+
def start(self, task_mode: bool = False) -> BatchSlurmProcess:
|
|
239
240
|
"""Start the process"""
|
|
240
241
|
builder = self.launcher.connector.processbuilder()
|
|
241
|
-
builder.workingDirectory = self.workingDirectory
|
|
242
242
|
builder.environ = self.launcher.launcherenv
|
|
243
243
|
builder.detach = False
|
|
244
244
|
|
|
245
245
|
if not self.detach:
|
|
246
246
|
# Simplest case: we wait for the output
|
|
247
|
+
builder.workingDirectory = self.workingDirectory
|
|
247
248
|
builder.command = [f"{self.launcher.binpath}/srun"]
|
|
248
249
|
builder.command.extend(self.launcher.options.args())
|
|
249
250
|
builder.command.extend(self.command)
|
|
@@ -255,11 +256,17 @@ class SlurmProcessBuilder(ProcessBuilder):
|
|
|
255
256
|
return builder.start()
|
|
256
257
|
|
|
257
258
|
builder.command = [f"{self.launcher.binpath}/sbatch", "--parsable"]
|
|
258
|
-
builder.command.extend(self.launcher.options.args())
|
|
259
259
|
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
260
|
+
if not task_mode:
|
|
261
|
+
# Use command line parameters when not running a task
|
|
262
|
+
builder.command.extend(self.launcher.options.args())
|
|
263
|
+
|
|
264
|
+
if self.workingDirectory:
|
|
265
|
+
workdir = self.launcher.connector.resolve(self.workingDirectory)
|
|
266
|
+
builder.command.append(f"--chdir={workdir}")
|
|
267
|
+
addstream(builder.command, "-e", self.stderr)
|
|
268
|
+
addstream(builder.command, "-o", self.stdout)
|
|
269
|
+
addstream(builder.command, "-i", self.stdin)
|
|
263
270
|
|
|
264
271
|
builder.command.extend(self.command)
|
|
265
272
|
logger.info(
|
|
@@ -427,12 +434,43 @@ class SlurmLauncher(Launcher):
|
|
|
427
434
|
|
|
428
435
|
We assume *nix, but should be changed to PythonScriptBuilder when working
|
|
429
436
|
"""
|
|
430
|
-
|
|
431
|
-
builder.processtype = "slurm"
|
|
432
|
-
return builder
|
|
437
|
+
return SlurmScriptBuilder(self)
|
|
433
438
|
|
|
434
439
|
def processbuilder(self) -> SlurmProcessBuilder:
|
|
435
440
|
"""Returns the process builder for this launcher
|
|
436
441
|
|
|
437
442
|
By default, returns the associated connector builder"""
|
|
438
443
|
return SlurmProcessBuilder(self)
|
|
444
|
+
|
|
445
|
+
|
|
446
|
+
class SlurmScriptBuilder(PythonScriptBuilder):
|
|
447
|
+
def __init__(self, launcher: SlurmLauncher, pythonpath=None):
|
|
448
|
+
super().__init__(pythonpath)
|
|
449
|
+
self.launcher = launcher
|
|
450
|
+
self.processtype = "slurm"
|
|
451
|
+
|
|
452
|
+
def write(self, job):
|
|
453
|
+
py_path = super().write(job)
|
|
454
|
+
main_path = py_path.parent
|
|
455
|
+
|
|
456
|
+
def relpath(path: Path):
|
|
457
|
+
return shquote(self.launcher.connector.resolve(path, main_path))
|
|
458
|
+
|
|
459
|
+
# Writes the sbatch shell script containing all the options
|
|
460
|
+
sh_path = job.jobpath / ("%s.sh" % job.name)
|
|
461
|
+
with sh_path.open("wt") as out:
|
|
462
|
+
out.write("""#!/bin/sh\n\n""")
|
|
463
|
+
|
|
464
|
+
workdir = self.launcher.connector.resolve(main_path)
|
|
465
|
+
out.write(f"#SBATCH --chdir={shquote(workdir)}\n")
|
|
466
|
+
out.write(f"""#SBATCH --error={relpath(job.stderr)}\n""")
|
|
467
|
+
out.write(f"""#SBATCH --output={relpath(job.stdout)}\n""")
|
|
468
|
+
|
|
469
|
+
for arg in self.launcher.options.args():
|
|
470
|
+
out.write(f"""#SBATCH {arg}\n""")
|
|
471
|
+
|
|
472
|
+
# We finish by the call to srun
|
|
473
|
+
out.write(f"""\nsrun ./{relpath(py_path)}\n\n""")
|
|
474
|
+
|
|
475
|
+
self.launcher.connector.setExecutable(sh_path, True)
|
|
476
|
+
return sh_path
|
experimaestro/notifications.py
CHANGED
|
@@ -78,7 +78,6 @@ class Reporter(threading.Thread):
|
|
|
78
78
|
|
|
79
79
|
self.progress_threshold = 0.01
|
|
80
80
|
self.cv = threading.Condition()
|
|
81
|
-
self.start()
|
|
82
81
|
|
|
83
82
|
def stop(self):
|
|
84
83
|
self.stopping = True
|
|
@@ -222,6 +221,7 @@ class Reporter(threading.Thread):
|
|
|
222
221
|
taskpath = TaskEnv.instance().taskpath
|
|
223
222
|
assert taskpath is not None, "Task path is not defined"
|
|
224
223
|
Reporter.INSTANCE = Reporter(taskpath)
|
|
224
|
+
Reporter.INSTANCE.start()
|
|
225
225
|
return Reporter.INSTANCE
|
|
226
226
|
|
|
227
227
|
|
experimaestro/run.py
CHANGED
|
@@ -140,10 +140,10 @@ class TaskRunner:
|
|
|
140
140
|
run(workdir / "params.json")
|
|
141
141
|
|
|
142
142
|
# ... remove the handlers
|
|
143
|
-
logger.info("Task ended successfully")
|
|
144
143
|
remove_signal_handlers(remove_cleanup=False)
|
|
145
144
|
|
|
146
145
|
# Everything went OK
|
|
146
|
+
logger.info("Task ended successfully")
|
|
147
147
|
sys.exit(0)
|
|
148
148
|
except Exception:
|
|
149
149
|
logger.exception("Got exception while running")
|
experimaestro/scheduler/base.py
CHANGED
|
@@ -7,7 +7,16 @@ from pathlib import Path
|
|
|
7
7
|
from shutil import rmtree
|
|
8
8
|
import threading
|
|
9
9
|
import time
|
|
10
|
-
from typing import
|
|
10
|
+
from typing import (
|
|
11
|
+
Any,
|
|
12
|
+
Iterator,
|
|
13
|
+
List,
|
|
14
|
+
Optional,
|
|
15
|
+
Set,
|
|
16
|
+
TypeVar,
|
|
17
|
+
Union,
|
|
18
|
+
TYPE_CHECKING,
|
|
19
|
+
)
|
|
11
20
|
import enum
|
|
12
21
|
import signal
|
|
13
22
|
import asyncio
|
|
@@ -18,9 +27,10 @@ from experimaestro.scheduler.services import Service
|
|
|
18
27
|
from experimaestro.settings import WorkspaceSettings, get_settings
|
|
19
28
|
|
|
20
29
|
|
|
21
|
-
from experimaestro.core.objects import Config, ConfigWalkContext
|
|
30
|
+
from experimaestro.core.objects import Config, ConfigWalkContext, WatchedOutput
|
|
22
31
|
from experimaestro.utils import logger
|
|
23
32
|
from experimaestro.locking import Locks, LockError, Lock
|
|
33
|
+
from experimaestro.utils.asyncio import asyncThreadcheck
|
|
24
34
|
from .workspace import RunMode, Workspace
|
|
25
35
|
from .dependencies import Dependency, DependencyStatus, Resource
|
|
26
36
|
import concurrent.futures
|
|
@@ -111,7 +121,7 @@ class JobDependency(Dependency):
|
|
|
111
121
|
|
|
112
122
|
|
|
113
123
|
class Job(Resource):
|
|
114
|
-
"""A job is a
|
|
124
|
+
"""A job is a resource that is produced by the execution of some code"""
|
|
115
125
|
|
|
116
126
|
# Set by the scheduler
|
|
117
127
|
_readyEvent: Optional[asyncio.Event]
|
|
@@ -149,6 +159,11 @@ class Job(Resource):
|
|
|
149
159
|
# Dependencies
|
|
150
160
|
self.dependencies: Set[Dependency] = set() # as target
|
|
151
161
|
|
|
162
|
+
# Watched outputs
|
|
163
|
+
self.watched_outputs = {}
|
|
164
|
+
for watched in config.__xpm__.watched_outputs:
|
|
165
|
+
self.watch_output(watched)
|
|
166
|
+
|
|
152
167
|
# Process
|
|
153
168
|
self._process = None
|
|
154
169
|
self.unsatisfied = 0
|
|
@@ -160,6 +175,23 @@ class Job(Resource):
|
|
|
160
175
|
self._progress: List[LevelInformation] = []
|
|
161
176
|
self.tags = config.tags()
|
|
162
177
|
|
|
178
|
+
def watch_output(self, watched: "WatchedOutput"):
|
|
179
|
+
"""Monitor task outputs
|
|
180
|
+
|
|
181
|
+
:param watched: A description of the watched output
|
|
182
|
+
"""
|
|
183
|
+
self.scheduler.xp.watch_output(watched)
|
|
184
|
+
|
|
185
|
+
def task_output_update(self, subpath: Path):
|
|
186
|
+
"""Notification of an updated task output"""
|
|
187
|
+
if watcher := self.watched_outputs.get(subpath, None):
|
|
188
|
+
watcher.update()
|
|
189
|
+
|
|
190
|
+
def done_handler(self):
|
|
191
|
+
"""The task has been completed"""
|
|
192
|
+
for watcher in self.watched_outputs.values():
|
|
193
|
+
watcher.update()
|
|
194
|
+
|
|
163
195
|
def __str__(self):
|
|
164
196
|
return "Job[{}]".format(self.identifier)
|
|
165
197
|
|
|
@@ -170,10 +202,8 @@ class Job(Resource):
|
|
|
170
202
|
@cached_property
|
|
171
203
|
def python_path(self) -> Iterator[str]:
|
|
172
204
|
"""Returns an iterator over python path"""
|
|
173
|
-
return itertools.chain(
|
|
174
|
-
|
|
175
|
-
)
|
|
176
|
-
|
|
205
|
+
return itertools.chain(self.workspace.python_path)
|
|
206
|
+
|
|
177
207
|
@cached_property
|
|
178
208
|
def environ(self):
|
|
179
209
|
"""Returns the job environment
|
|
@@ -227,7 +257,7 @@ class Job(Resource):
|
|
|
227
257
|
return self.state == JobState.READY
|
|
228
258
|
|
|
229
259
|
@property
|
|
230
|
-
def jobpath(self):
|
|
260
|
+
def jobpath(self) -> Path:
|
|
231
261
|
"""Deprecated, use `path`"""
|
|
232
262
|
return self.workspace.jobspath / self.relpath
|
|
233
263
|
|
|
@@ -235,6 +265,14 @@ class Job(Resource):
|
|
|
235
265
|
def path(self) -> Path:
|
|
236
266
|
return self.workspace.jobspath / self.relpath
|
|
237
267
|
|
|
268
|
+
@property
|
|
269
|
+
def experimaestro_path(self) -> Path:
|
|
270
|
+
return (self.path / ".experimaestro").resolve()
|
|
271
|
+
|
|
272
|
+
@cached_property
|
|
273
|
+
def task_outputs_path(self) -> Path:
|
|
274
|
+
return self.experimaestro_path / "task-outputs.jsonl"
|
|
275
|
+
|
|
238
276
|
@property
|
|
239
277
|
def relpath(self):
|
|
240
278
|
identifier = self.config.__xpm__.identifier
|
|
@@ -444,7 +482,7 @@ class Scheduler:
|
|
|
444
482
|
self.jobs: Dict[str, "Job"] = {}
|
|
445
483
|
|
|
446
484
|
# List of jobs
|
|
447
|
-
self.waitingjobs = set()
|
|
485
|
+
self.waitingjobs: Set[Job] = set()
|
|
448
486
|
|
|
449
487
|
# Listeners
|
|
450
488
|
self.listeners: Set[Listener] = set()
|
|
@@ -467,10 +505,12 @@ class Scheduler:
|
|
|
467
505
|
|
|
468
506
|
def submit(self, job: Job) -> Optional[Job]:
|
|
469
507
|
# Wait for the future containing the submitted job
|
|
508
|
+
logger.debug("Registering the job %s within the scheduler", job)
|
|
470
509
|
otherFuture = asyncio.run_coroutine_threadsafe(
|
|
471
510
|
self.aio_registerJob(job), self.loop
|
|
472
511
|
)
|
|
473
512
|
other = otherFuture.result()
|
|
513
|
+
logger.debug("Job already submitted" if other else "First submission")
|
|
474
514
|
if other:
|
|
475
515
|
return other
|
|
476
516
|
|
|
@@ -606,9 +646,13 @@ class Scheduler:
|
|
|
606
646
|
if job.state != JobState.DONE:
|
|
607
647
|
self.xp.failedJobs[job.identifier] = job
|
|
608
648
|
|
|
649
|
+
# Process all remaining tasks outputs
|
|
650
|
+
await asyncThreadcheck("End of job processing", job.done_handler)
|
|
651
|
+
|
|
609
652
|
# Decrement the number of unfinished jobs and notify
|
|
610
653
|
self.xp.unfinishedJobs -= 1
|
|
611
654
|
async with self.xp.central.exitCondition:
|
|
655
|
+
logging.debug("Updated number of unfinished jobs")
|
|
612
656
|
self.xp.central.exitCondition.notify_all()
|
|
613
657
|
|
|
614
658
|
job.endtime = time.time()
|
|
@@ -696,6 +740,7 @@ class Scheduler:
|
|
|
696
740
|
code = await process.aio_code()
|
|
697
741
|
logger.debug("Got return code %s for %s", code, job)
|
|
698
742
|
|
|
743
|
+
# Check the file if there is no return code
|
|
699
744
|
if code is None:
|
|
700
745
|
# Case where we cannot retrieve the code right away
|
|
701
746
|
if job.donepath.is_file():
|
|
@@ -861,6 +906,7 @@ class experiment:
|
|
|
861
906
|
assert self.central is not None
|
|
862
907
|
async with self.central.exitCondition:
|
|
863
908
|
self.exitMode = True
|
|
909
|
+
logging.debug("Setting exit mode to true")
|
|
864
910
|
self.central.exitCondition.notify_all()
|
|
865
911
|
|
|
866
912
|
assert self.central is not None and self.central.loop is not None
|
|
@@ -871,10 +917,22 @@ class experiment:
|
|
|
871
917
|
|
|
872
918
|
async def awaitcompletion():
|
|
873
919
|
assert self.central is not None
|
|
920
|
+
logger.debug("Waiting to exit scheduler...")
|
|
874
921
|
async with self.central.exitCondition:
|
|
875
922
|
while True:
|
|
876
|
-
if self.
|
|
923
|
+
if self.exitMode:
|
|
877
924
|
break
|
|
925
|
+
|
|
926
|
+
# If we have still unfinished jobs or possible new tasks, wait
|
|
927
|
+
logger.debug(
|
|
928
|
+
"Checking exit condition: unfinished jobs=%d, task output queue size=%d",
|
|
929
|
+
self.unfinishedJobs,
|
|
930
|
+
self.taskOutputQueueSize,
|
|
931
|
+
)
|
|
932
|
+
if self.unfinishedJobs == 0 and self.taskOutputQueueSize == 0:
|
|
933
|
+
break
|
|
934
|
+
|
|
935
|
+
# Wait for more news...
|
|
878
936
|
await self.central.exitCondition.wait()
|
|
879
937
|
|
|
880
938
|
if self.failedJobs:
|
|
@@ -906,6 +964,8 @@ class experiment:
|
|
|
906
964
|
return self.workspace.connector.createtoken(name, count)
|
|
907
965
|
|
|
908
966
|
def __enter__(self):
|
|
967
|
+
from .dynamic_outputs import TaskOutputsWorker
|
|
968
|
+
|
|
909
969
|
if self.workspace.run_mode != RunMode.DRY_RUN:
|
|
910
970
|
logger.info("Locking experiment %s", self.xplockpath)
|
|
911
971
|
self.xplock = self.workspace.connector.lock(self.xplockpath, 0).__enter__()
|
|
@@ -934,6 +994,7 @@ class experiment:
|
|
|
934
994
|
global SIGNAL_HANDLER
|
|
935
995
|
# Number of unfinished jobs
|
|
936
996
|
self.unfinishedJobs = 0
|
|
997
|
+
self.taskOutputQueueSize = 0
|
|
937
998
|
|
|
938
999
|
# List of failed jobs
|
|
939
1000
|
self.failedJobs: Dict[str, Job] = {}
|
|
@@ -942,6 +1003,8 @@ class experiment:
|
|
|
942
1003
|
self.exitMode = False
|
|
943
1004
|
|
|
944
1005
|
self.central = SchedulerCentral.create(self.scheduler.name)
|
|
1006
|
+
self.taskOutputsWorker = TaskOutputsWorker(self)
|
|
1007
|
+
self.taskOutputsWorker.start()
|
|
945
1008
|
|
|
946
1009
|
SIGNAL_HANDLER.add(self)
|
|
947
1010
|
|
|
@@ -950,6 +1013,7 @@ class experiment:
|
|
|
950
1013
|
return self
|
|
951
1014
|
|
|
952
1015
|
def __exit__(self, exc_type, exc_value, traceback):
|
|
1016
|
+
logger.debug("Exiting scheduler context")
|
|
953
1017
|
# If no exception and normal run mode, remove old "jobs"
|
|
954
1018
|
if self.workspace.run_mode == RunMode.NORMAL:
|
|
955
1019
|
if exc_type is None and self.jobsbakpath.is_dir():
|
|
@@ -975,8 +1039,13 @@ class experiment:
|
|
|
975
1039
|
service.stop()
|
|
976
1040
|
|
|
977
1041
|
if self.central is not None:
|
|
1042
|
+
logger.info("Stopping scheduler event loop")
|
|
978
1043
|
self.central.loop.stop()
|
|
979
1044
|
|
|
1045
|
+
if self.taskOutputsWorker is not None:
|
|
1046
|
+
logger.info("Stopping tasks outputs worker")
|
|
1047
|
+
self.taskOutputsWorker.queue.put(None)
|
|
1048
|
+
|
|
980
1049
|
self.central = None
|
|
981
1050
|
self.workspace.__exit__(exc_type, exc_value, traceback)
|
|
982
1051
|
if self.xplock:
|
|
@@ -985,8 +1054,27 @@ class experiment:
|
|
|
985
1054
|
# Put back old experiment as current one
|
|
986
1055
|
experiment.CURRENT = self.old_experiment
|
|
987
1056
|
if self.server:
|
|
1057
|
+
logger.info("Stopping web server")
|
|
988
1058
|
self.server.stop()
|
|
989
1059
|
|
|
1060
|
+
async def update_task_output_count(self, delta: int):
|
|
1061
|
+
"""Change in the number of task outputs to process"""
|
|
1062
|
+
async with self.central.exitCondition:
|
|
1063
|
+
self.taskOutputQueueSize += delta
|
|
1064
|
+
logging.debug(
|
|
1065
|
+
"Updating queue size with %d => %d", delta, self.taskOutputQueueSize
|
|
1066
|
+
)
|
|
1067
|
+
if self.taskOutputQueueSize == 0:
|
|
1068
|
+
self.central.exitCondition.notify_all()
|
|
1069
|
+
|
|
1070
|
+
def watch_output(self, watched: "WatchedOutput"):
|
|
1071
|
+
"""Watch an output
|
|
1072
|
+
|
|
1073
|
+
:param watched: The watched output specification
|
|
1074
|
+
"""
|
|
1075
|
+
|
|
1076
|
+
self.taskOutputsWorker.watch_output(watched)
|
|
1077
|
+
|
|
990
1078
|
def add_service(self, service: ServiceClass) -> ServiceClass:
|
|
991
1079
|
"""Adds a service (e.g. tensorboard viewer) to the experiment
|
|
992
1080
|
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
"""Handles dynamic task outputs"""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import json
|
|
5
|
+
import logging
|
|
6
|
+
import queue
|
|
7
|
+
import threading
|
|
8
|
+
from collections import defaultdict
|
|
9
|
+
from functools import cached_property
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Callable, TYPE_CHECKING
|
|
12
|
+
|
|
13
|
+
from watchdog.events import FileSystemEventHandler
|
|
14
|
+
|
|
15
|
+
from experimaestro.ipc import ipcom
|
|
16
|
+
from experimaestro.utils import logger
|
|
17
|
+
|
|
18
|
+
from .base import Job, experiment
|
|
19
|
+
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
from experimaestro.core.objects import WatchedOutput
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class TaskOutputCallbackHandler:
|
|
25
|
+
def __init__(self, converter: Callable):
|
|
26
|
+
pass
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class TaskOutputs(FileSystemEventHandler):
|
|
30
|
+
"""Represent and monitors dynamic outputs generated by one task"""
|
|
31
|
+
|
|
32
|
+
#: Global dictionary for handles
|
|
33
|
+
HANDLERS: dict[Path, "TaskOutputs"] = {}
|
|
34
|
+
|
|
35
|
+
#: Global lock to access current HANDLERS
|
|
36
|
+
LOCK = threading.Lock()
|
|
37
|
+
|
|
38
|
+
def create(job: Job):
|
|
39
|
+
with TaskOutputs.LOCK:
|
|
40
|
+
if instance := TaskOutputs.get(job.task_outputs_path, None):
|
|
41
|
+
return instance
|
|
42
|
+
|
|
43
|
+
instance = TaskOutputs(job.task_outputs_path)
|
|
44
|
+
TaskOutputs[job.task_outputs_path] = instance
|
|
45
|
+
return instance
|
|
46
|
+
|
|
47
|
+
def __init__(self, path: Path):
|
|
48
|
+
"""Monitors an event path"""
|
|
49
|
+
logger.debug("Watching dynamic task outputs in %s", path)
|
|
50
|
+
self.path = path
|
|
51
|
+
self.handle = None
|
|
52
|
+
self.count = 0
|
|
53
|
+
self.lock = threading.Lock()
|
|
54
|
+
self.listeners: dict[str, dict[Callable, set[Callable]]] = defaultdict(
|
|
55
|
+
lambda: defaultdict(set)
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
#: The events registered so far
|
|
59
|
+
self.events = []
|
|
60
|
+
|
|
61
|
+
def __enter__(self):
|
|
62
|
+
"""Starts monitoring task outputs"""
|
|
63
|
+
self.job.task_outputs_path.parent.mkdir(parents=True, exist_ok=True)
|
|
64
|
+
with self.lock:
|
|
65
|
+
if self.handle is None:
|
|
66
|
+
assert self.count == 0
|
|
67
|
+
self.handle = ipcom().fswatch(self, self.path.parent, False)
|
|
68
|
+
self.count += 1
|
|
69
|
+
return self
|
|
70
|
+
|
|
71
|
+
def __exit__(self, *args):
|
|
72
|
+
"""Stops monitoring task outputs"""
|
|
73
|
+
with self.lock:
|
|
74
|
+
self.count -= 1
|
|
75
|
+
if self.count == 0:
|
|
76
|
+
ipcom().fsunwatch(self.handle)
|
|
77
|
+
self.fh.close()
|
|
78
|
+
|
|
79
|
+
self.handle = None
|
|
80
|
+
self._fh = None
|
|
81
|
+
|
|
82
|
+
def watch_output(self, watched: "WatchedOutput"):
|
|
83
|
+
"""Add a new listener"""
|
|
84
|
+
key = f"{watched.config.__identifier__}/{watched.method_name}"
|
|
85
|
+
with self.lock:
|
|
86
|
+
# Process events so far
|
|
87
|
+
listener = self.listeners[key].get(watched.method, None)
|
|
88
|
+
if listener is None:
|
|
89
|
+
listener = TaskOutputCallbackHandler(watched.method)
|
|
90
|
+
|
|
91
|
+
# Register
|
|
92
|
+
self.listeners[key][watched.method].add(watched.callback)
|
|
93
|
+
|
|
94
|
+
#
|
|
95
|
+
# --- Events
|
|
96
|
+
#
|
|
97
|
+
|
|
98
|
+
@cached_property
|
|
99
|
+
def fh(self):
|
|
100
|
+
if self._fh is None:
|
|
101
|
+
self._fh = self.path.open("rt")
|
|
102
|
+
return self._fh
|
|
103
|
+
|
|
104
|
+
def on_modified(self, event):
|
|
105
|
+
self.handle(Path(event.src_path))
|
|
106
|
+
|
|
107
|
+
def on_created(self, event):
|
|
108
|
+
self.handle(Path(event.src_path))
|
|
109
|
+
|
|
110
|
+
def handle(self, path: Path):
|
|
111
|
+
if path != self.path:
|
|
112
|
+
return
|
|
113
|
+
|
|
114
|
+
with self.lock:
|
|
115
|
+
logger.debug("[TASK OUTPUT] Handling task output for %s", self.path)
|
|
116
|
+
|
|
117
|
+
while json_line := self.fh.readline():
|
|
118
|
+
# Read the event
|
|
119
|
+
event = json.loads(json_line)
|
|
120
|
+
logger.debug("Event: %s", event)
|
|
121
|
+
|
|
122
|
+
# FIXME: move elsewhere
|
|
123
|
+
# # Process the event
|
|
124
|
+
# event = self.config_method(
|
|
125
|
+
# self.job.config.__xpm__.mark_output,
|
|
126
|
+
# *event["args"],
|
|
127
|
+
# **event["kwargs"],
|
|
128
|
+
# )
|
|
129
|
+
|
|
130
|
+
self.events.append(event)
|
|
131
|
+
# self.job.scheduler.xp.taskOutputsWorker.add(self, event)
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
class TaskOutputsWorker(threading.Thread):
|
|
135
|
+
"""This worker process dynamic output queue for one experiment"""
|
|
136
|
+
|
|
137
|
+
def __init__(self, xp: experiment):
|
|
138
|
+
super().__init__(name="task outputs worker", daemon=True)
|
|
139
|
+
self.queue = queue.Queue()
|
|
140
|
+
self.xp = xp
|
|
141
|
+
|
|
142
|
+
def watch_output(self, watched: "WatchedOutput"):
|
|
143
|
+
"""Watch an output
|
|
144
|
+
|
|
145
|
+
:param watched: The watched output specification
|
|
146
|
+
"""
|
|
147
|
+
logger.debug("Registering task output listener %s", watched)
|
|
148
|
+
|
|
149
|
+
# path = watched.job.tasks_output_path
|
|
150
|
+
TaskOutputs.create(watched.job).watch_output(watched)
|
|
151
|
+
|
|
152
|
+
def add(self, watcher, event):
|
|
153
|
+
asyncio.run_coroutine_threadsafe(
|
|
154
|
+
self.xp.update_task_output_count(1),
|
|
155
|
+
self.xp.scheduler.loop,
|
|
156
|
+
).result()
|
|
157
|
+
self.queue.put((watcher, event))
|
|
158
|
+
|
|
159
|
+
def run(self):
|
|
160
|
+
logging.debug("Starting output listener queue")
|
|
161
|
+
while True:
|
|
162
|
+
# Get the next element in the queue
|
|
163
|
+
element = self.queue.get()
|
|
164
|
+
if element is None:
|
|
165
|
+
# end of processing
|
|
166
|
+
break
|
|
167
|
+
|
|
168
|
+
# Call all the listeners
|
|
169
|
+
logging.debug("Got one event: %s", element)
|
|
170
|
+
watcher, event = element
|
|
171
|
+
for listener in watcher.listeners:
|
|
172
|
+
try:
|
|
173
|
+
logger.debug("Calling listener [%s] with %s", listener, event)
|
|
174
|
+
listener(event)
|
|
175
|
+
logger.debug(
|
|
176
|
+
"[done] Calling listener [%s] with %s", listener, event
|
|
177
|
+
)
|
|
178
|
+
except Exception:
|
|
179
|
+
logging.exception("Exception while calling the listener")
|
|
180
|
+
self.queue.task_done()
|
|
181
|
+
|
|
182
|
+
asyncio.run_coroutine_threadsafe(
|
|
183
|
+
self.xp.update_task_output_count(-1), self.xp.scheduler.loop
|
|
184
|
+
).result()
|
experimaestro/scriptbuilder.py
CHANGED
|
@@ -51,6 +51,8 @@ class PythonScriptBuilder:
|
|
|
51
51
|
self.lockfiles: List[Path] = []
|
|
52
52
|
self.notificationURL: Optional[str] = None
|
|
53
53
|
self.command: Optional[AbstractCommand] = None
|
|
54
|
+
|
|
55
|
+
# This is used to serialize the full process identifier on disk
|
|
54
56
|
self.processtype = "local"
|
|
55
57
|
|
|
56
58
|
def write(self, job: CommandLineJob):
|
|
@@ -63,7 +65,7 @@ class PythonScriptBuilder:
|
|
|
63
65
|
job {CommandLineJob} -- [description]
|
|
64
66
|
|
|
65
67
|
Returns:
|
|
66
|
-
|
|
68
|
+
str -- The script path on disk
|
|
67
69
|
"""
|
|
68
70
|
assert isinstance(
|
|
69
71
|
job, CommandLineJob
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|