experimaestro 1.6.1__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of experimaestro might be problematic. Click here for more details.

Files changed (81) hide show
  1. experimaestro/__init__.py +3 -1
  2. experimaestro/annotations.py +13 -3
  3. experimaestro/cli/filter.py +3 -3
  4. experimaestro/cli/jobs.py +1 -1
  5. experimaestro/commandline.py +3 -7
  6. experimaestro/connectors/__init__.py +22 -10
  7. experimaestro/connectors/local.py +17 -8
  8. experimaestro/connectors/ssh.py +1 -1
  9. experimaestro/core/arguments.py +26 -3
  10. experimaestro/core/objects.py +90 -6
  11. experimaestro/core/objects.pyi +7 -1
  12. experimaestro/core/types.py +33 -2
  13. experimaestro/experiments/cli.py +21 -9
  14. experimaestro/generators.py +6 -1
  15. experimaestro/ipc.py +4 -1
  16. experimaestro/launcherfinder/registry.py +23 -5
  17. experimaestro/launchers/slurm/base.py +47 -9
  18. experimaestro/notifications.py +1 -1
  19. experimaestro/run.py +1 -1
  20. experimaestro/scheduler/base.py +102 -6
  21. experimaestro/scheduler/dynamic_outputs.py +184 -0
  22. experimaestro/scheduler/workspace.py +2 -1
  23. experimaestro/scriptbuilder.py +13 -2
  24. experimaestro/server/data/016b4a6cdced82ab3aa1.ttf +0 -0
  25. experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
  26. experimaestro/server/data/1815e00441357e01619e.ttf +0 -0
  27. experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
  28. experimaestro/server/data/2463b90d9a316e4e5294.woff2 +0 -0
  29. experimaestro/server/data/2582b0e4bcf85eceead0.ttf +0 -0
  30. experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
  31. experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
  32. experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
  33. experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
  34. experimaestro/server/data/50701fbb8177c2dde530.ttf +0 -0
  35. experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
  36. experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
  37. experimaestro/server/data/878f31251d960bd6266f.woff2 +0 -0
  38. experimaestro/server/data/89999bdf5d835c012025.woff2 +0 -0
  39. experimaestro/server/data/914997e1bdfc990d0897.ttf +0 -0
  40. experimaestro/server/data/b041b1fa4fe241b23445.woff2 +0 -0
  41. experimaestro/server/data/b6879d41b0852f01ed5b.woff2 +0 -0
  42. experimaestro/server/data/c210719e60948b211a12.woff2 +0 -0
  43. experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
  44. experimaestro/server/data/d75e3fd1eb12e9bd6655.ttf +0 -0
  45. experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
  46. experimaestro/server/data/favicon.ico +0 -0
  47. experimaestro/server/data/index.css +22963 -0
  48. experimaestro/server/data/index.css.map +1 -0
  49. experimaestro/server/data/index.html +27 -0
  50. experimaestro/server/data/index.js +101770 -0
  51. experimaestro/server/data/index.js.map +1 -0
  52. experimaestro/server/data/login.html +22 -0
  53. experimaestro/server/data/manifest.json +15 -0
  54. experimaestro/settings.py +2 -2
  55. experimaestro/sphinx/__init__.py +7 -17
  56. experimaestro/taskglobals.py +7 -2
  57. experimaestro/tests/definitions_types.py +5 -3
  58. experimaestro/tests/launchers/bin/sbatch +34 -7
  59. experimaestro/tests/launchers/bin/srun +5 -0
  60. experimaestro/tests/launchers/common.py +16 -4
  61. experimaestro/tests/restart.py +6 -3
  62. experimaestro/tests/tasks/all.py +16 -10
  63. experimaestro/tests/tasks/foreign.py +2 -4
  64. experimaestro/tests/test_forward.py +5 -5
  65. experimaestro/tests/test_identifier.py +61 -66
  66. experimaestro/tests/test_instance.py +3 -6
  67. experimaestro/tests/test_param.py +40 -22
  68. experimaestro/tests/test_tags.py +5 -11
  69. experimaestro/tests/test_tokens.py +3 -2
  70. experimaestro/tests/test_types.py +17 -14
  71. experimaestro/tests/test_validation.py +48 -91
  72. experimaestro/tokens.py +16 -5
  73. experimaestro/typingutils.py +7 -0
  74. experimaestro/utils/asyncio.py +6 -2
  75. experimaestro/utils/resources.py +7 -3
  76. {experimaestro-1.6.1.dist-info → experimaestro-1.7.0.dist-info}/METADATA +3 -4
  77. experimaestro-1.7.0.dist-info/RECORD +154 -0
  78. {experimaestro-1.6.1.dist-info → experimaestro-1.7.0.dist-info}/WHEEL +1 -1
  79. experimaestro-1.6.1.dist-info/RECORD +0 -122
  80. {experimaestro-1.6.1.dist-info → experimaestro-1.7.0.dist-info}/LICENSE +0 -0
  81. {experimaestro-1.6.1.dist-info → experimaestro-1.7.0.dist-info}/entry_points.txt +0 -0
@@ -59,7 +59,7 @@ class ExperimentCallable(Protocol):
59
59
  class ConfigurationLoader:
60
60
  def __init__(self):
61
61
  self.yamls = []
62
- self.pythonpath = set()
62
+ self.python_path = set()
63
63
 
64
64
  def load(self, yaml_file: Path):
65
65
  """Loads a YAML file, and parents one if they exist"""
@@ -76,9 +76,9 @@ class ConfigurationLoader:
76
76
  for path in _data.get("pythonpath", []):
77
77
  path = Path(path)
78
78
  if path.is_absolute():
79
- self.pythonpath.add(path.resolve())
79
+ self.python_path.add(path.resolve())
80
80
  else:
81
- self.pythonpath.add((yaml_file.parent / path).resolve())
81
+ self.python_path.add((yaml_file.parent / path).resolve())
82
82
 
83
83
 
84
84
  @click.option("--debug", is_flag=True, help="Print debug information")
@@ -181,7 +181,7 @@ def experiments_cli( # noqa: C901
181
181
  configuration.merge_with(OmegaConf.from_dotlist(extra_conf))
182
182
 
183
183
  # --- Get the XP file
184
- pythonpath = list(conf_loader.pythonpath)
184
+ python_path = list(conf_loader.python_path)
185
185
  if module_name is None:
186
186
  module_name = configuration.get("module", None)
187
187
 
@@ -192,9 +192,13 @@ def experiments_cli( # noqa: C901
192
192
  not module_name
193
193
  ), "Module name and experiment file are mutually exclusive options"
194
194
  xp_file = Path(xp_file)
195
- if not pythonpath:
196
- pythonpath.append(xp_file.parent)
197
- logging.info("Using python path: %s", ", ".join(str(s) for s in pythonpath))
195
+ if not python_path:
196
+ python_path.append(xp_file.parent)
197
+ logging.info(
198
+ "Using python path: %s", ", ".join(str(s) for s in python_path)
199
+ )
200
+ else:
201
+ xp_file = Path(xp_file)
198
202
 
199
203
  assert (
200
204
  module_name or xp_file
@@ -209,7 +213,7 @@ def experiments_cli( # noqa: C901
209
213
  # --- Finds the "run" function
210
214
 
211
215
  # Modifies the Python path
212
- for path in pythonpath:
216
+ for path in python_path:
213
217
  sys.path.append(str(path))
214
218
 
215
219
  if xp_file:
@@ -226,7 +230,11 @@ def experiments_cli( # noqa: C901
226
230
  )
227
231
  else:
228
232
  # Module
229
- mod = importlib.import_module(module_name)
233
+ try:
234
+ mod = importlib.import_module(module_name)
235
+ except ModuleNotFoundError as e:
236
+ logging.error("Module not found: %s with python path %s", e, sys.path)
237
+ raise
230
238
 
231
239
  helper = getattr(mod, "run", None)
232
240
 
@@ -265,6 +273,7 @@ def experiments_cli( # noqa: C901
265
273
 
266
274
  # Define the workspace
267
275
  ws_env = find_workspace(workdir=workdir, workspace=workspace)
276
+
268
277
  workdir = ws_env.path
269
278
 
270
279
  logging.info("Using working directory %s", str(workdir.resolve()))
@@ -278,6 +287,9 @@ def experiments_cli( # noqa: C901
278
287
  for key, value in env:
279
288
  xp.setenv(key, value)
280
289
 
290
+ # Sets the python path
291
+ xp.workspace.python_path.extend(python_path)
292
+
281
293
  try:
282
294
  # Run the experiment
283
295
  helper.xp = xp
@@ -1,11 +1,12 @@
1
1
  import inspect
2
2
  from pathlib import Path
3
+ from abc import ABC, abstractmethod
3
4
  from typing import Callable, Union
4
5
  from experimaestro.core.arguments import ArgumentOptions, TypeAnnotation
5
6
  from experimaestro.core.objects import ConfigWalkContext, Config
6
7
 
7
8
 
8
- class Generator:
9
+ class Generator(ABC):
9
10
  """Base class for all generators"""
10
11
 
11
12
  def isoutput(self):
@@ -13,6 +14,10 @@ class Generator:
13
14
  path within the job folder)"""
14
15
  return False
15
16
 
17
+ @abstractmethod
18
+ def __call__(self, context: ConfigWalkContext, config: Config):
19
+ ...
20
+
16
21
 
17
22
  class PathGenerator(Generator):
18
23
  """Generates a path"""
experimaestro/ipc.py CHANGED
@@ -7,6 +7,7 @@ import sys
7
7
  import logging
8
8
  from .utils import logger
9
9
  from watchdog.observers import Observer
10
+ from watchdog.observers.api import ObservedWatch
10
11
  from watchdog.events import FileSystemEventHandler
11
12
 
12
13
 
@@ -20,7 +21,9 @@ class IPCom:
20
21
  self.observer.start()
21
22
  self.pid = os.getpid()
22
23
 
23
- def fswatch(self, watcher: FileSystemEventHandler, path: Path, recursive=False):
24
+ def fswatch(
25
+ self, watcher: FileSystemEventHandler, path: Path, recursive=False
26
+ ) -> ObservedWatch:
24
27
  if not self.observer.is_alive():
25
28
  logging.error("Observer is not alive")
26
29
 
@@ -1,5 +1,6 @@
1
1
  # Configuration registers
2
2
 
3
+ from contextlib import contextmanager
3
4
  from typing import ClassVar, Dict, Optional, Set, Type, Union
4
5
 
5
6
  from pathlib import Path
@@ -7,7 +8,6 @@ import typing
7
8
  from omegaconf import DictConfig, OmegaConf, SCMode
8
9
  import pkg_resources
9
10
  from experimaestro.utils import logger
10
-
11
11
  from .base import ConnectorConfiguration, TokenConfiguration
12
12
  from .specs import HostRequirement
13
13
 
@@ -36,6 +36,16 @@ def load_yaml(schema, path: Path):
36
36
  )
37
37
 
38
38
 
39
+ @contextmanager
40
+ def ensure_enter(fp):
41
+ """Behaves as a resource, whether it is one or not"""
42
+ if hasattr(fp, "__enter__"):
43
+ with fp as _fp:
44
+ yield _fp
45
+ else:
46
+ yield fp
47
+
48
+
39
49
  class LauncherRegistry:
40
50
  INSTANCES: ClassVar[Dict[Path, "LauncherRegistry"]] = {}
41
51
  CURRENT_CONFIG_DIR: ClassVar[Optional[Path]] = None
@@ -78,13 +88,16 @@ class LauncherRegistry:
78
88
 
79
89
  from importlib import util
80
90
 
81
- spec = util.spec_from_file_location("xpm_launchers_conf", launchers_py)
82
- module = util.module_from_spec(spec)
83
- spec.loader.exec_module(module)
91
+ with ensure_enter(launchers_py.__fspath__()) as fp:
92
+ spec = util.spec_from_file_location("xpm_launchers_conf", fp)
93
+ module = util.module_from_spec(spec)
94
+ spec.loader.exec_module(module)
84
95
 
85
96
  self.find_launcher_fn = getattr(module, "find_launcher", None)
86
97
  if self.find_launcher_fn is None:
87
- logger.warn("No find_launcher() function was found in %s", launchers_py)
98
+ logger.warning(
99
+ "No find_launcher() function was found in %s", launchers_py
100
+ )
88
101
 
89
102
  # Read the configuration file
90
103
  self.connectors = load_yaml(
@@ -144,9 +157,14 @@ class LauncherRegistry:
144
157
  specs.append(spec)
145
158
 
146
159
  # Use launcher function
160
+ from experimaestro.launchers import Launcher
161
+
147
162
  if self.find_launcher_fn is not None:
148
163
  for spec in specs:
149
164
  if launcher := self.find_launcher_fn(spec, tags):
165
+ assert isinstance(
166
+ launcher, Launcher
167
+ ), "f{self.find_launcher_fn} did not return a Launcher but {type(launcher)}"
150
168
  return launcher
151
169
 
152
170
  return None
@@ -11,6 +11,7 @@ from typing import (
11
11
  )
12
12
  from experimaestro.connectors.local import LocalConnector
13
13
  import re
14
+ from shlex import quote as shquote
14
15
  from contextlib import contextmanager
15
16
  from dataclasses import dataclass
16
17
  from experimaestro.launcherfinder.registry import (
@@ -235,15 +236,15 @@ class SlurmProcessBuilder(ProcessBuilder):
235
236
  super().__init__()
236
237
  self.launcher = launcher
237
238
 
238
- def start(self) -> BatchSlurmProcess:
239
+ def start(self, task_mode: bool = False) -> BatchSlurmProcess:
239
240
  """Start the process"""
240
241
  builder = self.launcher.connector.processbuilder()
241
- builder.workingDirectory = self.workingDirectory
242
242
  builder.environ = self.launcher.launcherenv
243
243
  builder.detach = False
244
244
 
245
245
  if not self.detach:
246
246
  # Simplest case: we wait for the output
247
+ builder.workingDirectory = self.workingDirectory
247
248
  builder.command = [f"{self.launcher.binpath}/srun"]
248
249
  builder.command.extend(self.launcher.options.args())
249
250
  builder.command.extend(self.command)
@@ -255,11 +256,17 @@ class SlurmProcessBuilder(ProcessBuilder):
255
256
  return builder.start()
256
257
 
257
258
  builder.command = [f"{self.launcher.binpath}/sbatch", "--parsable"]
258
- builder.command.extend(self.launcher.options.args())
259
259
 
260
- addstream(builder.command, "-e", self.stderr)
261
- addstream(builder.command, "-o", self.stdout)
262
- addstream(builder.command, "-i", self.stdin)
260
+ if not task_mode:
261
+ # Use command line parameters when not running a task
262
+ builder.command.extend(self.launcher.options.args())
263
+
264
+ if self.workingDirectory:
265
+ workdir = self.launcher.connector.resolve(self.workingDirectory)
266
+ builder.command.append(f"--chdir={workdir}")
267
+ addstream(builder.command, "-e", self.stderr)
268
+ addstream(builder.command, "-o", self.stdout)
269
+ addstream(builder.command, "-i", self.stdin)
263
270
 
264
271
  builder.command.extend(self.command)
265
272
  logger.info(
@@ -427,12 +434,43 @@ class SlurmLauncher(Launcher):
427
434
 
428
435
  We assume *nix, but should be changed to PythonScriptBuilder when working
429
436
  """
430
- builder = PythonScriptBuilder()
431
- builder.processtype = "slurm"
432
- return builder
437
+ return SlurmScriptBuilder(self)
433
438
 
434
439
  def processbuilder(self) -> SlurmProcessBuilder:
435
440
  """Returns the process builder for this launcher
436
441
 
437
442
  By default, returns the associated connector builder"""
438
443
  return SlurmProcessBuilder(self)
444
+
445
+
446
+ class SlurmScriptBuilder(PythonScriptBuilder):
447
+ def __init__(self, launcher: SlurmLauncher, pythonpath=None):
448
+ super().__init__(pythonpath)
449
+ self.launcher = launcher
450
+ self.processtype = "slurm"
451
+
452
+ def write(self, job):
453
+ py_path = super().write(job)
454
+ main_path = py_path.parent
455
+
456
+ def relpath(path: Path):
457
+ return shquote(self.launcher.connector.resolve(path, main_path))
458
+
459
+ # Writes the sbatch shell script containing all the options
460
+ sh_path = job.jobpath / ("%s.sh" % job.name)
461
+ with sh_path.open("wt") as out:
462
+ out.write("""#!/bin/sh\n\n""")
463
+
464
+ workdir = self.launcher.connector.resolve(main_path)
465
+ out.write(f"#SBATCH --chdir={shquote(workdir)}\n")
466
+ out.write(f"""#SBATCH --error={relpath(job.stderr)}\n""")
467
+ out.write(f"""#SBATCH --output={relpath(job.stdout)}\n""")
468
+
469
+ for arg in self.launcher.options.args():
470
+ out.write(f"""#SBATCH {arg}\n""")
471
+
472
+ # We finish by the call to srun
473
+ out.write(f"""\nsrun ./{relpath(py_path)}\n\n""")
474
+
475
+ self.launcher.connector.setExecutable(sh_path, True)
476
+ return sh_path
@@ -78,7 +78,6 @@ class Reporter(threading.Thread):
78
78
 
79
79
  self.progress_threshold = 0.01
80
80
  self.cv = threading.Condition()
81
- self.start()
82
81
 
83
82
  def stop(self):
84
83
  self.stopping = True
@@ -222,6 +221,7 @@ class Reporter(threading.Thread):
222
221
  taskpath = TaskEnv.instance().taskpath
223
222
  assert taskpath is not None, "Task path is not defined"
224
223
  Reporter.INSTANCE = Reporter(taskpath)
224
+ Reporter.INSTANCE.start()
225
225
  return Reporter.INSTANCE
226
226
 
227
227
 
experimaestro/run.py CHANGED
@@ -140,10 +140,10 @@ class TaskRunner:
140
140
  run(workdir / "params.json")
141
141
 
142
142
  # ... remove the handlers
143
- logger.info("Task ended successfully")
144
143
  remove_signal_handlers(remove_cleanup=False)
145
144
 
146
145
  # Everything went OK
146
+ logger.info("Task ended successfully")
147
147
  sys.exit(0)
148
148
  except Exception:
149
149
  logger.exception("Got exception while running")
@@ -1,12 +1,22 @@
1
1
  from collections import ChainMap
2
2
  from functools import cached_property
3
+ import itertools
3
4
  import logging
4
5
  import os
5
6
  from pathlib import Path
6
7
  from shutil import rmtree
7
8
  import threading
8
9
  import time
9
- from typing import Any, List, Optional, Set, TypeVar, Union, TYPE_CHECKING
10
+ from typing import (
11
+ Any,
12
+ Iterator,
13
+ List,
14
+ Optional,
15
+ Set,
16
+ TypeVar,
17
+ Union,
18
+ TYPE_CHECKING,
19
+ )
10
20
  import enum
11
21
  import signal
12
22
  import asyncio
@@ -17,9 +27,10 @@ from experimaestro.scheduler.services import Service
17
27
  from experimaestro.settings import WorkspaceSettings, get_settings
18
28
 
19
29
 
20
- from experimaestro.core.objects import Config, ConfigWalkContext
30
+ from experimaestro.core.objects import Config, ConfigWalkContext, WatchedOutput
21
31
  from experimaestro.utils import logger
22
32
  from experimaestro.locking import Locks, LockError, Lock
33
+ from experimaestro.utils.asyncio import asyncThreadcheck
23
34
  from .workspace import RunMode, Workspace
24
35
  from .dependencies import Dependency, DependencyStatus, Resource
25
36
  import concurrent.futures
@@ -110,7 +121,7 @@ class JobDependency(Dependency):
110
121
 
111
122
 
112
123
  class Job(Resource):
113
- """A job is a resouce that is produced by the execution of some code"""
124
+ """A job is a resource that is produced by the execution of some code"""
114
125
 
115
126
  # Set by the scheduler
116
127
  _readyEvent: Optional[asyncio.Event]
@@ -148,6 +159,11 @@ class Job(Resource):
148
159
  # Dependencies
149
160
  self.dependencies: Set[Dependency] = set() # as target
150
161
 
162
+ # Watched outputs
163
+ self.watched_outputs = {}
164
+ for watched in config.__xpm__.watched_outputs:
165
+ self.watch_output(watched)
166
+
151
167
  # Process
152
168
  self._process = None
153
169
  self.unsatisfied = 0
@@ -159,6 +175,23 @@ class Job(Resource):
159
175
  self._progress: List[LevelInformation] = []
160
176
  self.tags = config.tags()
161
177
 
178
+ def watch_output(self, watched: "WatchedOutput"):
179
+ """Monitor task outputs
180
+
181
+ :param watched: A description of the watched output
182
+ """
183
+ self.scheduler.xp.watch_output(watched)
184
+
185
+ def task_output_update(self, subpath: Path):
186
+ """Notification of an updated task output"""
187
+ if watcher := self.watched_outputs.get(subpath, None):
188
+ watcher.update()
189
+
190
+ def done_handler(self):
191
+ """The task has been completed"""
192
+ for watcher in self.watched_outputs.values():
193
+ watcher.update()
194
+
162
195
  def __str__(self):
163
196
  return "Job[{}]".format(self.identifier)
164
197
 
@@ -166,6 +199,11 @@ class Job(Resource):
166
199
  assert self._future, "Cannot wait a not submitted job"
167
200
  return self._future.result()
168
201
 
202
+ @cached_property
203
+ def python_path(self) -> Iterator[str]:
204
+ """Returns an iterator over python path"""
205
+ return itertools.chain(self.workspace.python_path)
206
+
169
207
  @cached_property
170
208
  def environ(self):
171
209
  """Returns the job environment
@@ -219,7 +257,7 @@ class Job(Resource):
219
257
  return self.state == JobState.READY
220
258
 
221
259
  @property
222
- def jobpath(self):
260
+ def jobpath(self) -> Path:
223
261
  """Deprecated, use `path`"""
224
262
  return self.workspace.jobspath / self.relpath
225
263
 
@@ -227,6 +265,14 @@ class Job(Resource):
227
265
  def path(self) -> Path:
228
266
  return self.workspace.jobspath / self.relpath
229
267
 
268
+ @property
269
+ def experimaestro_path(self) -> Path:
270
+ return (self.path / ".experimaestro").resolve()
271
+
272
+ @cached_property
273
+ def task_outputs_path(self) -> Path:
274
+ return self.experimaestro_path / "task-outputs.jsonl"
275
+
230
276
  @property
231
277
  def relpath(self):
232
278
  identifier = self.config.__xpm__.identifier
@@ -436,7 +482,7 @@ class Scheduler:
436
482
  self.jobs: Dict[str, "Job"] = {}
437
483
 
438
484
  # List of jobs
439
- self.waitingjobs = set()
485
+ self.waitingjobs: Set[Job] = set()
440
486
 
441
487
  # Listeners
442
488
  self.listeners: Set[Listener] = set()
@@ -459,10 +505,12 @@ class Scheduler:
459
505
 
460
506
  def submit(self, job: Job) -> Optional[Job]:
461
507
  # Wait for the future containing the submitted job
508
+ logger.debug("Registering the job %s within the scheduler", job)
462
509
  otherFuture = asyncio.run_coroutine_threadsafe(
463
510
  self.aio_registerJob(job), self.loop
464
511
  )
465
512
  other = otherFuture.result()
513
+ logger.debug("Job already submitted" if other else "First submission")
466
514
  if other:
467
515
  return other
468
516
 
@@ -598,9 +646,13 @@ class Scheduler:
598
646
  if job.state != JobState.DONE:
599
647
  self.xp.failedJobs[job.identifier] = job
600
648
 
649
+ # Process all remaining tasks outputs
650
+ await asyncThreadcheck("End of job processing", job.done_handler)
651
+
601
652
  # Decrement the number of unfinished jobs and notify
602
653
  self.xp.unfinishedJobs -= 1
603
654
  async with self.xp.central.exitCondition:
655
+ logging.debug("Updated number of unfinished jobs")
604
656
  self.xp.central.exitCondition.notify_all()
605
657
 
606
658
  job.endtime = time.time()
@@ -688,6 +740,7 @@ class Scheduler:
688
740
  code = await process.aio_code()
689
741
  logger.debug("Got return code %s for %s", code, job)
690
742
 
743
+ # Check the file if there is no return code
691
744
  if code is None:
692
745
  # Case where we cannot retrieve the code right away
693
746
  if job.donepath.is_file():
@@ -853,6 +906,7 @@ class experiment:
853
906
  assert self.central is not None
854
907
  async with self.central.exitCondition:
855
908
  self.exitMode = True
909
+ logging.debug("Setting exit mode to true")
856
910
  self.central.exitCondition.notify_all()
857
911
 
858
912
  assert self.central is not None and self.central.loop is not None
@@ -863,10 +917,22 @@ class experiment:
863
917
 
864
918
  async def awaitcompletion():
865
919
  assert self.central is not None
920
+ logger.debug("Waiting to exit scheduler...")
866
921
  async with self.central.exitCondition:
867
922
  while True:
868
- if self.unfinishedJobs == 0 or self.exitMode:
923
+ if self.exitMode:
869
924
  break
925
+
926
+ # If we have still unfinished jobs or possible new tasks, wait
927
+ logger.debug(
928
+ "Checking exit condition: unfinished jobs=%d, task output queue size=%d",
929
+ self.unfinishedJobs,
930
+ self.taskOutputQueueSize,
931
+ )
932
+ if self.unfinishedJobs == 0 and self.taskOutputQueueSize == 0:
933
+ break
934
+
935
+ # Wait for more news...
870
936
  await self.central.exitCondition.wait()
871
937
 
872
938
  if self.failedJobs:
@@ -898,6 +964,8 @@ class experiment:
898
964
  return self.workspace.connector.createtoken(name, count)
899
965
 
900
966
  def __enter__(self):
967
+ from .dynamic_outputs import TaskOutputsWorker
968
+
901
969
  if self.workspace.run_mode != RunMode.DRY_RUN:
902
970
  logger.info("Locking experiment %s", self.xplockpath)
903
971
  self.xplock = self.workspace.connector.lock(self.xplockpath, 0).__enter__()
@@ -926,6 +994,7 @@ class experiment:
926
994
  global SIGNAL_HANDLER
927
995
  # Number of unfinished jobs
928
996
  self.unfinishedJobs = 0
997
+ self.taskOutputQueueSize = 0
929
998
 
930
999
  # List of failed jobs
931
1000
  self.failedJobs: Dict[str, Job] = {}
@@ -934,6 +1003,8 @@ class experiment:
934
1003
  self.exitMode = False
935
1004
 
936
1005
  self.central = SchedulerCentral.create(self.scheduler.name)
1006
+ self.taskOutputsWorker = TaskOutputsWorker(self)
1007
+ self.taskOutputsWorker.start()
937
1008
 
938
1009
  SIGNAL_HANDLER.add(self)
939
1010
 
@@ -942,6 +1013,7 @@ class experiment:
942
1013
  return self
943
1014
 
944
1015
  def __exit__(self, exc_type, exc_value, traceback):
1016
+ logger.debug("Exiting scheduler context")
945
1017
  # If no exception and normal run mode, remove old "jobs"
946
1018
  if self.workspace.run_mode == RunMode.NORMAL:
947
1019
  if exc_type is None and self.jobsbakpath.is_dir():
@@ -967,8 +1039,13 @@ class experiment:
967
1039
  service.stop()
968
1040
 
969
1041
  if self.central is not None:
1042
+ logger.info("Stopping scheduler event loop")
970
1043
  self.central.loop.stop()
971
1044
 
1045
+ if self.taskOutputsWorker is not None:
1046
+ logger.info("Stopping tasks outputs worker")
1047
+ self.taskOutputsWorker.queue.put(None)
1048
+
972
1049
  self.central = None
973
1050
  self.workspace.__exit__(exc_type, exc_value, traceback)
974
1051
  if self.xplock:
@@ -977,8 +1054,27 @@ class experiment:
977
1054
  # Put back old experiment as current one
978
1055
  experiment.CURRENT = self.old_experiment
979
1056
  if self.server:
1057
+ logger.info("Stopping web server")
980
1058
  self.server.stop()
981
1059
 
1060
+ async def update_task_output_count(self, delta: int):
1061
+ """Change in the number of task outputs to process"""
1062
+ async with self.central.exitCondition:
1063
+ self.taskOutputQueueSize += delta
1064
+ logging.debug(
1065
+ "Updating queue size with %d => %d", delta, self.taskOutputQueueSize
1066
+ )
1067
+ if self.taskOutputQueueSize == 0:
1068
+ self.central.exitCondition.notify_all()
1069
+
1070
+ def watch_output(self, watched: "WatchedOutput"):
1071
+ """Watch an output
1072
+
1073
+ :param watched: The watched output specification
1074
+ """
1075
+
1076
+ self.taskOutputsWorker.watch_output(watched)
1077
+
982
1078
  def add_service(self, service: ServiceClass) -> ServiceClass:
983
1079
  """Adds a service (e.g. tensorboard viewer) to the experiment
984
1080