experimaestro 1.5.1__py3-none-any.whl → 2.0.0a8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of experimaestro might be problematic.

Files changed (118)
  1. experimaestro/__init__.py +14 -4
  2. experimaestro/__main__.py +3 -423
  3. experimaestro/annotations.py +14 -4
  4. experimaestro/cli/__init__.py +311 -0
  5. experimaestro/{filter.py → cli/filter.py} +23 -9
  6. experimaestro/cli/jobs.py +268 -0
  7. experimaestro/cli/progress.py +269 -0
  8. experimaestro/click.py +0 -35
  9. experimaestro/commandline.py +3 -7
  10. experimaestro/connectors/__init__.py +29 -14
  11. experimaestro/connectors/local.py +19 -10
  12. experimaestro/connectors/ssh.py +27 -8
  13. experimaestro/core/arguments.py +45 -3
  14. experimaestro/core/callbacks.py +52 -0
  15. experimaestro/core/context.py +8 -9
  16. experimaestro/core/identifier.py +310 -0
  17. experimaestro/core/objects/__init__.py +44 -0
  18. experimaestro/core/{objects.py → objects/config.py} +399 -772
  19. experimaestro/core/objects/config_utils.py +58 -0
  20. experimaestro/core/objects/config_walk.py +151 -0
  21. experimaestro/core/objects.pyi +15 -45
  22. experimaestro/core/serialization.py +63 -9
  23. experimaestro/core/serializers.py +1 -8
  24. experimaestro/core/types.py +104 -66
  25. experimaestro/experiments/cli.py +154 -72
  26. experimaestro/experiments/configuration.py +10 -1
  27. experimaestro/generators.py +6 -1
  28. experimaestro/ipc.py +4 -1
  29. experimaestro/launcherfinder/__init__.py +1 -1
  30. experimaestro/launcherfinder/base.py +2 -18
  31. experimaestro/launcherfinder/parser.py +8 -3
  32. experimaestro/launcherfinder/registry.py +52 -140
  33. experimaestro/launcherfinder/specs.py +49 -10
  34. experimaestro/launchers/direct.py +0 -47
  35. experimaestro/launchers/slurm/base.py +54 -14
  36. experimaestro/mkdocs/__init__.py +1 -1
  37. experimaestro/mkdocs/base.py +6 -8
  38. experimaestro/notifications.py +38 -12
  39. experimaestro/progress.py +406 -0
  40. experimaestro/run.py +24 -3
  41. experimaestro/scheduler/__init__.py +18 -1
  42. experimaestro/scheduler/base.py +108 -808
  43. experimaestro/scheduler/dynamic_outputs.py +184 -0
  44. experimaestro/scheduler/experiment.py +387 -0
  45. experimaestro/scheduler/jobs.py +475 -0
  46. experimaestro/scheduler/signal_handler.py +32 -0
  47. experimaestro/scheduler/state.py +75 -0
  48. experimaestro/scheduler/workspace.py +27 -8
  49. experimaestro/scriptbuilder.py +18 -3
  50. experimaestro/server/__init__.py +36 -5
  51. experimaestro/server/data/1815e00441357e01619e.ttf +0 -0
  52. experimaestro/server/data/2463b90d9a316e4e5294.woff2 +0 -0
  53. experimaestro/server/data/2582b0e4bcf85eceead0.ttf +0 -0
  54. experimaestro/server/data/89999bdf5d835c012025.woff2 +0 -0
  55. experimaestro/server/data/914997e1bdfc990d0897.ttf +0 -0
  56. experimaestro/server/data/c210719e60948b211a12.woff2 +0 -0
  57. experimaestro/server/data/index.css +5187 -5068
  58. experimaestro/server/data/index.css.map +1 -1
  59. experimaestro/server/data/index.js +68887 -68064
  60. experimaestro/server/data/index.js.map +1 -1
  61. experimaestro/settings.py +45 -5
  62. experimaestro/sphinx/__init__.py +7 -17
  63. experimaestro/taskglobals.py +7 -2
  64. experimaestro/tests/core/__init__.py +0 -0
  65. experimaestro/tests/core/test_generics.py +206 -0
  66. experimaestro/tests/definitions_types.py +5 -3
  67. experimaestro/tests/launchers/bin/sbatch +34 -7
  68. experimaestro/tests/launchers/bin/srun +5 -0
  69. experimaestro/tests/launchers/common.py +17 -5
  70. experimaestro/tests/launchers/config_slurm/launchers.py +25 -0
  71. experimaestro/tests/restart.py +10 -5
  72. experimaestro/tests/tasks/all.py +23 -10
  73. experimaestro/tests/tasks/foreign.py +2 -4
  74. experimaestro/tests/test_checkers.py +2 -2
  75. experimaestro/tests/test_dependencies.py +11 -17
  76. experimaestro/tests/test_experiment.py +73 -0
  77. experimaestro/tests/test_file_progress.py +425 -0
  78. experimaestro/tests/test_file_progress_integration.py +477 -0
  79. experimaestro/tests/test_findlauncher.py +12 -5
  80. experimaestro/tests/test_forward.py +5 -5
  81. experimaestro/tests/test_generators.py +93 -0
  82. experimaestro/tests/test_identifier.py +182 -158
  83. experimaestro/tests/test_instance.py +19 -27
  84. experimaestro/tests/test_objects.py +13 -20
  85. experimaestro/tests/test_outputs.py +6 -6
  86. experimaestro/tests/test_param.py +68 -30
  87. experimaestro/tests/test_progress.py +4 -4
  88. experimaestro/tests/test_serializers.py +24 -64
  89. experimaestro/tests/test_ssh.py +7 -0
  90. experimaestro/tests/test_tags.py +50 -21
  91. experimaestro/tests/test_tasks.py +42 -51
  92. experimaestro/tests/test_tokens.py +11 -8
  93. experimaestro/tests/test_types.py +24 -21
  94. experimaestro/tests/test_validation.py +67 -110
  95. experimaestro/tests/token_reschedule.py +1 -1
  96. experimaestro/tokens.py +24 -13
  97. experimaestro/tools/diff.py +8 -1
  98. experimaestro/typingutils.py +20 -11
  99. experimaestro/utils/asyncio.py +6 -2
  100. experimaestro/utils/multiprocessing.py +44 -0
  101. experimaestro/utils/resources.py +11 -3
  102. {experimaestro-1.5.1.dist-info → experimaestro-2.0.0a8.dist-info}/METADATA +28 -36
  103. experimaestro-2.0.0a8.dist-info/RECORD +166 -0
  104. {experimaestro-1.5.1.dist-info → experimaestro-2.0.0a8.dist-info}/WHEEL +1 -1
  105. {experimaestro-1.5.1.dist-info → experimaestro-2.0.0a8.dist-info}/entry_points.txt +0 -4
  106. experimaestro/launchers/slurm/cli.py +0 -29
  107. experimaestro/launchers/slurm/configuration.py +0 -597
  108. experimaestro/scheduler/environment.py +0 -94
  109. experimaestro/server/data/016b4a6cdced82ab3aa1.ttf +0 -0
  110. experimaestro/server/data/50701fbb8177c2dde530.ttf +0 -0
  111. experimaestro/server/data/878f31251d960bd6266f.woff2 +0 -0
  112. experimaestro/server/data/b041b1fa4fe241b23445.woff2 +0 -0
  113. experimaestro/server/data/b6879d41b0852f01ed5b.woff2 +0 -0
  114. experimaestro/server/data/d75e3fd1eb12e9bd6655.ttf +0 -0
  115. experimaestro/tests/launchers/config_slurm/launchers.yaml +0 -134
  116. experimaestro/utils/yaml.py +0 -202
  117. experimaestro-1.5.1.dist-info/RECORD +0 -148
  118. {experimaestro-1.5.1.dist-info → experimaestro-2.0.0a8.dist-info/licenses}/LICENSE +0 -0
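
The moves above split the old monolithic modules: job and experiment handling leave the scheduler base module for scheduler/jobs.py and scheduler/experiment.py, identifier computation gets its own core/identifier.py, and core/objects.py becomes the core/objects/ package. A minimal import sketch of what this implies for downstream code (an assumption based only on the file moves and the imports visible in the diff below; the public names are taken to keep their old spellings):

    # Hypothetical sketch: import paths implied by the file reorganisation above
    from experimaestro.scheduler.jobs import Job, JobState   # previously defined next to the Scheduler
    from experimaestro.scheduler import experiment            # experiment now lives in scheduler/experiment.py
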
@@ -1,346 +1,23 @@
- from collections import ChainMap
- from functools import cached_property
- import os
- from pathlib import Path
- from shutil import rmtree
+ import logging
  import threading
  import time
- from typing import Any, List, Optional, Set, TypeVar, Union, TYPE_CHECKING
- import enum
- import signal
+ from typing import (
+     Optional,
+     Set,
+ )
  import asyncio
- from experimaestro.exceptions import HandledException
- from experimaestro.notifications import LevelInformation, Reporter
  from typing import Dict
+
+ from experimaestro.scheduler import experiment
+ from experimaestro.scheduler.jobs import Job, JobState
  from experimaestro.scheduler.services import Service
- from experimaestro.settings import get_settings


- from experimaestro.core.objects import Config, ConfigWalkContext
  from experimaestro.utils import logger
- from experimaestro.locking import Locks, LockError, Lock
- from .environment import Environment
- from .workspace import RunMode, Workspace
- from .dependencies import Dependency, DependencyStatus, Resource
+ from experimaestro.utils.asyncio import asyncThreadcheck
  import concurrent.futures


- if TYPE_CHECKING:
-     from experimaestro.connectors import Process
-     from experimaestro.launchers import Launcher
-
-
- class FailedExperiment(HandledException):
-     """Raised when an experiment failed"""
-
-     pass
-
-
- class JobState(enum.Enum):
-     # Job is not yet scheduled
-     UNSCHEDULED = 0
-
-     # Job is waiting for dependencies to be done
-     WAITING = 1
-
-     # Job is ready to run
-     READY = 2
-
-     # Job is scheduled (e.g. slurm)
-     SCHEDULED = 3
-
-     # Job is running
-     RUNNING = 4
-
-     # Job is done (finished)
-     DONE = 5
-
-     # Job failed (finished)
-     ERROR = 6
-
-     def notstarted(self):
-         return self.value <= JobState.READY.value
-
-     def running(self):
-         return (
-             self.value == JobState.RUNNING.value
-             or self.value == JobState.SCHEDULED.value
-         )
-
-     def finished(self):
-         return self.value >= JobState.DONE.value
-
-
- class JobFailureStatus(enum.Enum):
-     #: Job failed
-     DEPENDENCY = 0
-
-     #: Job dependency failed
-     FAILED = 1
-
-     #: Memory
-     MEMORY = 2
-
-
- class JobLock(Lock):
-     def __init__(self, job):
-         super().__init__()
-         self.job = job
-
-     def _acquire(self):
-         return self.job.state == JobState.DONE
-
-     def _release(self):
-         return False
-
-
- class JobDependency(Dependency):
-     def __init__(self, job):
-         super().__init__(job)
-
-     def status(self) -> DependencyStatus:
-         if self.origin.state == JobState.DONE:
-             return DependencyStatus.OK
-         elif self.origin.state == JobState.ERROR:
-             return DependencyStatus.FAIL
-         return DependencyStatus.WAIT
-
-     def lock(self):
-         return JobLock(self.origin)
-
-
- class Job(Resource):
-     """A job is a resouce that is produced by the execution of some code"""
-
-     # Set by the scheduler
-     _readyEvent: Optional[asyncio.Event]
-     _future: Optional["concurrent.futures.Future"]
-
-     def __init__(
-         self,
-         config: Config,
-         *,
-         workspace: Workspace = None,
-         launcher: "Launcher" = None,
-         run_mode: RunMode = RunMode.NORMAL,
-     ):
-         super().__init__()
-
-         self.workspace = workspace or Workspace.CURRENT
-         self.launcher = launcher or self.workspace.launcher if self.workspace else None
-
-         if run_mode == RunMode.NORMAL:
-             assert self.workspace is not None, "No experiment has been defined"
-             assert self.launcher is not None, (
-                 "No launcher, and no default defined for the workspace %s" % workspace
-             )
-
-         self.type = config.__xpmtype__
-         self.name = str(self.type.identifier).rsplit(".", 1)[-1]
-
-         self.scheduler: Optional["Scheduler"] = None
-         self.config = config
-         self.state: JobState = JobState.UNSCHEDULED
-
-         #: If a job has failed, indicates the failure status
-         self.failure_status: JobFailureStatus = None
-
-         # Dependencies
-         self.dependencies: Set[Dependency] = set()  # as target
-
-         # Process
-         self._process = None
-         self.unsatisfied = 0
-
-         # Meta-information
-         self.starttime: Optional[float] = None
-         self.submittime: Optional[float] = None
-         self.endtime: Optional[float] = None
-         self._progress: List[LevelInformation] = []
-         self.tags = config.tags()
-
-     def __str__(self):
-         return "Job[{}]".format(self.identifier)
-
-     def wait(self) -> JobState:
-         assert self._future, "Cannot wait a not submitted job"
-         return self._future.result()
-
-     @cached_property
-     def environ(self):
-         """Returns the job environment
-
-         It is made of (by order of priority):
-
-         1. The job environment
-         1. The launcher environment
-         1. The workspace environment
-
-         """
-         return ChainMap(
-             {},
-             self.launcher.environ if self.launcher else {},
-             self.workspace.environment.environ if self.workspace else {},
-         )
-
-     @property
-     def progress(self):
-         return self._progress
-
-     def set_progress(self, level: int, value: float, desc: Optional[str]):
-         if value < 0:
-             logger.warning(f"Progress value out of bounds ({value})")
-             value = 0
-         elif value > 1:
-             logger.warning(f"Progress value out of bounds ({value})")
-             value = 1
-
-         # Adjust the length of the array
-         self._progress = self._progress[: (level + 1)]
-         while len(self._progress) <= level:
-             self._progress.append(LevelInformation(len(self._progress), None, 0.0))
-
-         if desc:
-             self._progress[-1].desc = desc
-         self._progress[-1].progress = value
-
-         for listener in self.scheduler.listeners:
-             listener.job_state(self)
-
-     def add_notification_server(self, server):
-         """Adds a notification server"""
-         key, baseurl = server.getNotificationSpec()
-         dirpath = self.path / Reporter.NOTIFICATION_FOLDER
-         dirpath.mkdir(exist_ok=True)
-         (dirpath / key).write_text(f"{baseurl}/{self.identifier}")
-
-     @property
-     def ready(self):
-         return self.state == JobState.READY
-
-     @property
-     def jobpath(self):
-         """Deprecated, use `path`"""
-         return self.workspace.jobspath / self.relpath
-
-     @property
-     def path(self) -> Path:
-         return self.workspace.jobspath / self.relpath
-
-     @property
-     def relpath(self):
-         identifier = self.config.__xpm__.identifier
-         base = Path(str(self.type.identifier))
-         return base / identifier.all.hex()
-
-     @property
-     def relmainpath(self):
-         identifier = self.config.__xpm__.identifier
-         base = Path(str(self.type.identifier))
-         return base / identifier.main.hex()
-
-     @property
-     def hashidentifier(self):
-         return self.config.__xpm__.identifier
-
-     @property
-     def identifier(self):
-         return self.config.__xpm__.identifier.all.hex()
-
-     def prepare(self, overwrite=False):
-         """Prepare all files before starting a task
-
-         :param overwrite: if True, overwrite files even if the task has been run
-         """
-         pass
-
-     async def aio_run(self):
-         """Actually run the code"""
-         raise NotImplementedError(f"Method aio_run not implemented in {self.__class__}")
-
-     async def aio_process(self) -> Optional["Process"]:
-         """Returns the process if it exists"""
-         raise NotImplementedError("Not implemented")
-
-     @property
-     def pidpath(self):
-         """This file contains the file PID"""
-         return self.jobpath / ("%s.pid" % self.name)
-
-     @property
-     def lockpath(self):
-         """This file is used as a lock for running the job"""
-         return self.workspace.jobspath / self.relmainpath / ("%s.lock" % self.name)
-
-     @property
-     def donepath(self) -> Path:
-         """When a job has been successful, this file is written"""
-         return self.jobpath / ("%s.done" % self.name)
-
-     @property
-     def failedpath(self):
-         """When a job has been unsuccessful, this file is written with an error
-         code inside"""
-         return self.jobpath / ("%s.failed" % self.name)
-
-     @property
-     def stdout(self) -> Path:
-         return self.jobpath / ("%s.out" % self.name)
-
-     @property
-     def stderr(self) -> Path:
-         return self.jobpath / ("%s.err" % self.name)
-
-     @property
-     def basepath(self) -> Path:
-         return self.jobpath / self.name
-
-     def dependencychanged(self, dependency, oldstatus, status):
-         """Called when a dependency has changed"""
-
-         def value(s):
-             return 1 if s == DependencyStatus.OK else 0
-
-         self.unsatisfied -= value(status) - value(oldstatus)
-
-         logger.debug("Job %s: unsatisfied %d", self, self.unsatisfied)
-
-         if status == DependencyStatus.FAIL:
-             # Job completed
-             if not self.state.finished():
-                 self.state = JobState.ERROR
-                 self.failure_status = JobFailureStatus.DEPENDENCY
-                 self._readyEvent.set()
-
-         if self.unsatisfied == 0:
-             logger.info("Job %s is ready to run", self)
-             # We are ready
-             self.state = JobState.READY
-             self._readyEvent.set()
-
-     def finalState(self) -> "concurrent.futures.Future[JobState]":
-         assert self._future is not None
-         return self._future
-
-
- class JobContext(ConfigWalkContext):
-     def __init__(self, job: Job):
-         super().__init__()
-         self.job = job
-
-     @property
-     def name(self):
-         return self.job.name
-
-     @property
-     def path(self):
-         return self.job.path
-
-     @property
-     def task(self):
-         return self.job.config
-
-
  class Listener:
      def job_submitted(self, job):
          pass
@@ -353,78 +30,16 @@ class Listener:
          pass


- class JobError(Exception):
-     def __init__(self, code):
-         super().__init__(f"Job exited with code {code}")
-
-
- class SignalHandler:
-     def __init__(self):
-         self.experiments: Set["experiment"] = set()
-         self.original_sigint_handler = None
-
-     def add(self, xp: "experiment"):
-         if not self.experiments:
-             self.original_sigint_handler = signal.getsignal(signal.SIGINT)
-
-             signal.signal(signal.SIGINT, self)
-
-         self.experiments.add(xp)
-
-     def remove(self, xp):
-         self.experiments.remove(xp)
-         if not self.experiments:
-             signal.signal(signal.SIGINT, self.original_sigint_handler)
-
-     def __call__(self, signum, frame):
-         """SIGINT signal handler"""
-         logger.warning("Signal received")
-         for xp in self.experiments:
-             xp.stop()
-
-
- SIGNAL_HANDLER = SignalHandler()
-
-
- class SchedulerCentral(threading.Thread):
-     loop: asyncio.AbstractEventLoop
-
-     """The event loop thread used by the scheduler"""
-
-     def __init__(self, name: str):
-         # Daemon thread so it is non blocking
-         super().__init__(name=f"Scheduler EL ({name})", daemon=True)
-
-         self._ready = threading.Event()
-
-     def run(self):
-         logger.debug("Starting event loop thread")
-         self.loop = asyncio.new_event_loop()
-         asyncio.set_event_loop(self.loop)
-
-         # Set loop-dependent variables
-         self.exitCondition = asyncio.Condition()
-         self.dependencyLock = asyncio.Lock()
-
-         # Start the event loop
-         self._ready.set()
-         self.loop.run_forever()
-
-     @staticmethod
-     def create(name: str):
-         instance = SchedulerCentral(name)
-         instance.start()
-         instance._ready.wait()
-         return instance
-
-
- class Scheduler:
+ class Scheduler(threading.Thread):
      """A job scheduler

      The scheduler is based on asyncio for easy concurrency handling
      """

      def __init__(self, xp: "experiment", name: str):
+         super().__init__(name=f"Scheduler ({name})", daemon=True)
+         self._ready = threading.Event()
+
          # Name of the experiment
          self.name = name
          self.xp = xp
@@ -436,14 +51,37 @@ class Scheduler:
          self.jobs: Dict[str, "Job"] = {}

          # List of jobs
-         self.waitingjobs = set()
+         self.waitingjobs: Set[Job] = set()

          # Listeners
          self.listeners: Set[Listener] = set()

-     @property
-     def loop(self):
-         return self.xp.loop
+     @staticmethod
+     def create(xp: "experiment", name: str):
+         instance = Scheduler(xp, name)
+         instance.start()
+         instance._ready.wait()
+         return instance
+
+     def run(self):
+         """Run the event loop forever"""
+         logger.debug("Starting event loop thread")
+         # Ported from SchedulerCentral
+         self.loop = asyncio.new_event_loop()
+         asyncio.set_event_loop(self.loop)
+         # Set loop-dependent variables
+         self.exitCondition = asyncio.Condition()
+         self.dependencyLock = asyncio.Lock()
+         self._ready.set()
+         self.loop.run_forever()
+
+     def start_scheduler(self):
+         """Start the scheduler event loop in a thread"""
+         if not self.is_alive():
+             self.start()
+             self._ready.wait()
+         else:
+             logger.warning("Scheduler already started")

      def addlistener(self, listener: Listener):
          self.listeners.add(listener)
@@ -452,6 +90,13 @@ class Scheduler:
          self.listeners.remove(listener)

      def getJobState(self, job: Job) -> "concurrent.futures.Future[JobState]":
+         # Check if the job belongs to this scheduler
+         if job.identifier not in self.jobs:
+             # If job is not in this scheduler, return its current state directly
+             future = concurrent.futures.Future()
+             future.set_result(job.state)
+             return future
+
          return asyncio.run_coroutine_threadsafe(self.aio_getjobstate(job), self.loop)

      async def aio_getjobstate(self, job: Job):
@@ -459,14 +104,17 @@

      def submit(self, job: Job) -> Optional[Job]:
          # Wait for the future containing the submitted job
+         logger.debug("Registering the job %s within the scheduler", job)
          otherFuture = asyncio.run_coroutine_threadsafe(
              self.aio_registerJob(job), self.loop
          )
          other = otherFuture.result()
+         logger.debug("Job already submitted" if other else "First submission")
          if other:
              return other

          job._future = asyncio.run_coroutine_threadsafe(self.aio_submit(job), self.loop)
+         return None

      def prepare(self, job: Job):
          """Prepares the job for running"""
@@ -498,6 +146,22 @@

          return None

+     def notify_job_submitted(self, job: Job):
+         """Notify the listeners that a job has been submitted"""
+         for listener in self.listeners:
+             try:
+                 listener.job_submitted(job)
+             except Exception:
+                 logger.exception("Got an error with listener %s", listener)
+
+     def notify_job_state(self, job: Job):
+         """Notify the listeners that a job has changed state"""
+         for listener in self.listeners:
+             try:
+                 listener.job_state(job)
+             except Exception:
+                 logger.exception("Got an error with listener %s", listener)
+
      async def aio_submit(self, job: Job) -> JobState:  # noqa: C901
          """Main scheduler function: submit a job, run it (if needed), and returns
          the status code
@@ -508,6 +172,12 @@
          job.scheduler = self
          self.waitingjobs.add(job)

+         # Check that we don't have a completed job in
+         # alternate directories
+         for jobspath in experiment.current().alt_jobspaths:
+             # FIXME: check if done
+             pass
+
          # Creates a link into the experiment folder
          path = experiment.current().jobspath / job.relpath
          path.parent.mkdir(parents=True, exist_ok=True)
@@ -516,11 +186,8 @@
          path.symlink_to(job.path)

          job.state = JobState.WAITING
-         for listener in self.listeners:
-             try:
-                 listener.job_submitted(job)
-             except Exception:
-                 logger.exception("Got an error with listener %s", listener)
+
+         self.notify_job_submitted(job)

          # Add dependencies, and add to blocking resources
          if job.dependencies:
@@ -543,11 +210,8 @@
          if process is not None:
              # Yep! First we notify the listeners
              job.state = JobState.RUNNING
-             for listener in self.listeners:
-                 try:
-                     listener.job_state(job)
-                 except Exception:
-                     logger.exception("Got an error with listener %s", listener)
+             # Notify the listeners
+             self.notify_job_state(job)

              # Adds to the listeners
              if self.xp.server is not None:
@@ -582,20 +246,20 @@

          job.state = state

-         for listener in self.listeners:
-             try:
-                 listener.job_state(job)
-             except Exception as e:
-                 logger.exception("Listener %s did raise an exception", e)
+         self.notify_job_state(job)

          # Job is finished
          if job.state != JobState.DONE:
              self.xp.failedJobs[job.identifier] = job

+         # Process all remaining tasks outputs
+         await asyncThreadcheck("End of job processing", job.done_handler)
+
          # Decrement the number of unfinished jobs and notify
          self.xp.unfinishedJobs -= 1
-         async with self.xp.central.exitCondition:
-             self.xp.central.exitCondition.notify_all()
+         async with self.exitCondition:
+             logging.debug("Updated number of unfinished jobs")
+             self.exitCondition.notify_all()

          job.endtime = time.time()
          if job in self.waitingjobs:
@@ -610,401 +274,37 @@
          return job.state

      async def aio_start(self, job: Job) -> Optional[JobState]:
-         """Start a job
-
-         Returns None if the dependencies could not be locked after all
-         Returns DONE/ERROR depending on the process outcome
-         """
-
-         # We first lock the job before proceeding
-         assert job.launcher is not None
-         assert self.xp.central is not None
-
-         with Locks() as locks:
-             logger.debug("[starting] Locking job %s", job)
-             async with job.launcher.connector.lock(job.lockpath):
-                 logger.debug("[starting] Locked job %s", job)
-
-                 state = None
-                 try:
-                     logger.debug(
-                         "Starting job %s with %d dependencies",
-                         job,
-                         len(job.dependencies),
-                     )
-
-                     async with self.xp.central.dependencyLock:
-                         for dependency in job.dependencies:
-                             try:
-                                 locks.append(dependency.lock().acquire())
-                             except LockError:
-                                 logger.warning(
-                                     "Could not lock %s, aborting start for job %s",
-                                     dependency,
-                                     job,
-                                 )
-                                 dependency.check()
-                                 return JobState.WAITING
-
-                     for listener in self.listeners:
-                         listener.job_state(job)
-
-                     job.starttime = time.time()
-
-                     # Creates the main directory
-                     directory = job.path
-                     logger.debug("Making directories job %s...", directory)
-                     if not directory.is_dir():
-                         directory.mkdir(parents=True, exist_ok=True)
-
-                     # Sets up the notification URL
-                     if self.xp.server is not None:
-                         job.add_notification_server(self.xp.server)
-
-                 except Exception:
-                     logger.warning("Error while locking job", exc_info=True)
-                     return JobState.WAITING
-
-                 try:
-                     # Runs the job
-                     process = await job.aio_run()
-                 except Exception:
-                     logger.warning("Error while starting job", exc_info=True)
-                     return JobState.ERROR
-
-                 try:
-                     if isinstance(process, JobState):
-                         state = process
-                         logger.debug("Job %s ended (state %s)", job, state)
-                     else:
-                         logger.debug("Waiting for job %s process to end", job)
+         """Start a job (scheduler coordination layer)

-                         code = await process.aio_code()
-                         logger.debug("Got return code %s for %s", code, job)
+         This method serves as a coordination layer that delegates the actual
+         job starting logic to the job itself while handling scheduler-specific
+         concerns like state notifications and providing coordination context.

-                         if code is None:
-                             # Case where we cannot retrieve the code right away
-                             if job.donepath.is_file():
-                                 code = 0
-                             else:
-                                 code = int(job.failedpath.read_text())
-
-                         logger.debug("Job %s ended with code %s", job, code)
-                         state = JobState.DONE if code == 0 else JobState.ERROR
-
-                 except JobError:
-                     logger.warning("Error while running job")
-                     state = JobState.ERROR
-
-                 except Exception:
-                     logger.warning(
-                         "Error while running job (in experimaestro)", exc_info=True
-                     )
-                     state = JobState.ERROR
-
-         return state
-
-
- ServiceClass = TypeVar("ServiceClass", bound=Service)
-
-
- class experiment:
-     """Main experiment object
-
-     It is a context object, i.e. experiments is run with
-
-     ```py
-     with experiment(...) as xp:
-         ...
-     ```
-     """
-
-     # Current experiment
-     CURRENT: Optional["experiment"] = None
-
-     @staticmethod
-     def current() -> "experiment":
-         """Returns the current experiment, but checking first if set
-
-         If there is no current experiment, raises an AssertError
-         """
-         assert experiment.CURRENT is not None, "No current experiment defined"
-         return experiment.CURRENT
-
-     def __init__(
-         self,
-         env: Union[Path, str, Environment],
-         name: str,
-         *,
-         host: Optional[str] = None,
-         port: Optional[int] = None,
-         token: Optional[str] = None,
-         run_mode: Optional[RunMode] = None,
-         launcher=None,
-     ):
-         """
-         :param env: an environment -- or a working directory for a local
-             environment
-
-         :param name: the identifier of the experiment
-
-         :param launcher: The launcher (if not provided, inferred from path)
-
-         :param host: The host for the web server (overrides the environment if
-             set)
-         :param port: the port for the web server (overrides the environment if
-             set). Use negative number to avoid running a web server (default when dry run).
-
-         :param run_mode: The run mode for the experiment (normal, generate run
-             files, dry run)
+         :param job: The job to start
+         :return: JobState.WAITING if dependencies could not be locked, JobState.DONE
+             if job completed successfully, JobState.ERROR if job failed during execution,
+             or None (should not occur in normal operation)
+         :raises Exception: Various exceptions during scheduler coordination
          """

-         from experimaestro.server import Server
-
-         if isinstance(env, Environment):
-             self.environment = env
-         else:
-             self.environment = Environment(workdir=env)
-
-         # Creates the workspace
-         run_mode = run_mode or RunMode.NORMAL
-         self.workspace = Workspace(
-             self.environment, launcher=launcher, run_mode=run_mode
-         )
-
-         # Mark the directory has an experimaestro folder
-         self.workdir = self.workspace.experimentspath / name
-         self.workdir.mkdir(parents=True, exist_ok=True)
-         self.xplockpath = self.workdir / "lock"
-         self.xplock = None
-         self.old_experiment = None
-         self.services: Dict[str, Service] = {}
-
-         settings = get_settings()
-
-         if host is not None:
-             settings.server.host = host
-
-         if port is not None:
-             settings.server.port = port
-
-         if token is not None:
-             settings.server.token = token
-
-         # Create the scheduler
-         self.scheduler = Scheduler(self, name)
-         self.server = (
-             Server(self.scheduler, settings.server)
-             if (settings.server.port is not None and settings.server.port >= 0)
-             and self.workspace.run_mode == RunMode.NORMAL
-             else None
-         )
-
-         if os.environ.get("XPM_ENABLEFAULTHANDLER", "0") == "1":
-             import faulthandler
-
-             logger.info("Enabling fault handler")
-             faulthandler.enable(all_threads=True)
-
-     def submit(self, job: Job):
-         return self.scheduler.submit(job)
+         # Assert preconditions
+         assert job.launcher is not None

-     def prepare(self, job: Job):
-         """Generate the file"""
-         return self.scheduler.prepare(job)
-
-     @property
-     def run_mode(self):
-         return self.workspace.run_mode
-
-     @property
-     def loop(self):
-         assert self.central is not None
-         return self.central.loop
-
-     @property
-     def resultspath(self):
-         """Return the directory in which results can be stored for this experiment"""
-         return self.workdir / "results"
-
-     @property
-     def jobspath(self):
-         """Return the directory in which results can be stored for this experiment"""
-         return self.workdir / "jobs"
-
-     @property
-     def jobsbakpath(self):
-         """Return the directory in which results can be stored for this experiment"""
-         return self.workdir / "jobs.bak"
-
-     def stop(self):
-         """Stop the experiment as soon as possible"""
-
-         async def doStop():
-             assert self.central is not None
-             async with self.central.exitCondition:
-                 self.exitMode = True
-                 self.central.exitCondition.notify_all()
-
-         assert self.central is not None and self.central.loop is not None
-         asyncio.run_coroutine_threadsafe(doStop(), self.central.loop)
-
-     def wait(self):
-         """Wait until the running processes have finished"""
-
-         async def awaitcompletion():
-             assert self.central is not None
-             async with self.central.exitCondition:
-                 while True:
-                     if self.unfinishedJobs == 0 or self.exitMode:
-                         break
-                     await self.central.exitCondition.wait()
-
-             if self.failedJobs:
-                 # Show some more information
-                 count = 0
-                 for job in self.failedJobs.values():
-                     if job.failure_status != JobFailureStatus.DEPENDENCY:
-                         count += 1
-                         logger.error(
-                             "Job %s failed, check the log file %s",
-                             job.relpath,
-                             job.stderr,
-                         )
-                 raise FailedExperiment(f"{count} failed jobs")
-
-         future = asyncio.run_coroutine_threadsafe(awaitcompletion(), self.loop)
-         return future.result()
-
-     def setenv(self, name, value):
-         """Shortcut to set the environment value"""
-         self.environment.setenv(name, value)
-
-     def token(self, name: str, count: int):
-         """Returns a token for this experiment
-
-         The token is the default token of the workspace connector"""
-         return self.workspace.connector.createtoken(name, count)
-
-     def __enter__(self):
-         if self.workspace.run_mode != RunMode.DRY_RUN:
-             logger.info("Locking experiment %s", self.xplockpath)
-             self.xplock = self.workspace.connector.lock(self.xplockpath, 0).__enter__()
-             logger.info("Experiment locked")
-
-         # Move old jobs into "jobs.bak"
-         if self.workspace.run_mode == RunMode.NORMAL:
-             self.jobsbakpath.mkdir(exist_ok=True)
-             for p in self.jobspath.glob("*/*"):
-                 if p.is_symlink():
-                     target = self.jobsbakpath / p.relative_to(self.jobspath)
-                     if target.is_symlink():
-                         # Remove if duplicate
-                         p.unlink()
-                     else:
-                         # Rename otherwise
-                         target.parent.mkdir(parents=True, exist_ok=True)
-                         p.rename(target)
-
-         if self.server:
-             self.server.start()
-
-         self.workspace.__enter__()
-         (self.workspace.path / ".__experimaestro__").touch()
-
-         global SIGNAL_HANDLER
-         # Number of unfinished jobs
-         self.unfinishedJobs = 0
-
-         # List of failed jobs
-         self.failedJobs: Dict[str, Job] = {}
-
-         # Exit mode when catching signals
-         self.exitMode = False
-
-         self.central = SchedulerCentral.create(self.scheduler.name)
-
-         SIGNAL_HANDLER.add(self)
-
-         self.old_experiment = experiment.CURRENT
-         experiment.CURRENT = self
-         return self
-
-     def __exit__(self, exc_type, exc_value, traceback):
-         # If no exception and normal run mode, remove old "jobs"
-         if self.workspace.run_mode == RunMode.NORMAL:
-             if exc_type is None and self.jobsbakpath.is_dir():
-                 rmtree(self.jobsbakpath)
-
-         # Close the different locks
          try:
-             if exc_type:
-                 # import faulthandler
-                 # faulthandler.dump_traceback()
-                 logger.error(
-                     "Not waiting since an exception was thrown"
-                     " (some jobs may be running)"
-                 )
-             else:
-                 self.wait()
-         finally:
-             SIGNAL_HANDLER.remove(self)
-
-             # Stop services
-             for service in self.services.values():
-                 logger.info("Closing service %s", service.description())
-                 service.stop()
-
-             if self.central is not None:
-                 self.central.loop.stop()
-
-             self.central = None
-             self.workspace.__exit__(exc_type, exc_value, traceback)
-             if self.xplock:
-                 self.xplock.__exit__(exc_type, exc_value, traceback)
-
-             # Put back old experiment as current one
-             experiment.CURRENT = self.old_experiment
-             if self.server:
-                 self.server.stop()
-
-     def add_service(self, service: ServiceClass) -> ServiceClass:
-         """Adds a service (e.g. tensorboard viewer) to the experiment
-
-         :param service: A service instance
-         :return: The same service instance
-         """
-         self.services[service.id] = service
-         for listener in self.scheduler.listeners:
-             listener.service_add(service)
-         return service
-
-     def save(self, obj: Any, name: str = "default"):
-         """Serializes configurations.
-
-         Saves configuration objects within the experimental directory
-
-         :param obj: The object to save
-         :param name: The name of the saving directory (default to `default`)
-         """
-
-         if self.workspace.run_mode == RunMode.NORMAL:
-             from experimaestro import save
-
-             save_dir = self.workdir / "data" / name
-             save_dir.mkdir(exist_ok=True, parents=True)
-
-             save(obj, save_dir)
-
-     def load(self, reference: str, name: str = "default"):
-         """Serializes configurations.
+             # Call job's start method with scheduler context
+             state = await job.aio_start(
+                 sched_dependency_lock=self.dependencyLock,
+                 notification_server=self.xp.server if self.xp else None,
+             )

-         Loads configuration objects from an experimental directory
+             if state is None:
+                 # Dependencies couldn't be locked, return WAITING state
+                 return JobState.WAITING

-         :param reference: The name of the experiment
-         :param name: The name of the saving directory (default to `default`)
-         """
-         from experimaestro import load
+             # Notify scheduler listeners of job state after successful start
+             self.notify_job_state(job)
+             return state

-         path = self.workspace.experimentspath / reference / "data" / name
-         return load(path)
+         except Exception:
+             logger.warning("Error in scheduler job coordination", exc_info=True)
+             return JobState.ERROR
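
Taken together, these hunks remove the separate SchedulerCentral event-loop thread: Scheduler now subclasses threading.Thread, owns the asyncio loop, the exit condition and the dependency lock, and delegates the actual job start to Job.aio_start. A rough usage sketch, inferred only from the methods added above (the experiment xp and the job are placeholders, not defined in this diff):

    # Hypothetical sketch of driving the 2.0.0a8 scheduler thread
    scheduler = Scheduler.create(xp, "my-experiment")  # starts the daemon thread, waits for its event loop
    other = scheduler.submit(job)                      # None on first submission, the existing Job otherwise
    print(scheduler.getJobState(job).result())         # concurrent.futures.Future[JobState] resolved on the loop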