experimaestro 2.0.0a3__py3-none-any.whl → 2.0.0a4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of experimaestro might be problematic. Click here for more details.

@@ -1,392 +1,23 @@
1
- from collections import ChainMap
2
- from functools import cached_property
3
- import itertools
4
1
  import logging
5
- import os
6
- from pathlib import Path
7
- from shutil import rmtree
8
2
  import threading
9
3
  import time
10
4
  from typing import (
11
- Any,
12
- Iterator,
13
- List,
14
5
  Optional,
15
6
  Set,
16
- TypeVar,
17
- Union,
18
- TYPE_CHECKING,
19
7
  )
20
- import enum
21
- import signal
22
8
  import asyncio
23
- from experimaestro.exceptions import HandledException
24
- from experimaestro.notifications import LevelInformation, Reporter
25
9
  from typing import Dict
10
+
11
+ from experimaestro.scheduler import experiment
12
+ from experimaestro.scheduler.jobs import Job, JobState
26
13
  from experimaestro.scheduler.services import Service
27
- from experimaestro.settings import WorkspaceSettings, get_settings
28
14
 
29
15
 
30
- from experimaestro.core.objects import Config, ConfigWalkContext, WatchedOutput
31
16
  from experimaestro.utils import logger
32
- from experimaestro.locking import Locks, LockError, Lock
33
17
  from experimaestro.utils.asyncio import asyncThreadcheck
34
- from .workspace import RunMode, Workspace
35
- from .dependencies import Dependency, DependencyStatus, Resource
36
18
  import concurrent.futures
37
19
 
38
20
 
39
- if TYPE_CHECKING:
40
- from experimaestro.connectors import Process
41
- from experimaestro.launchers import Launcher
42
-
43
-
44
- class FailedExperiment(HandledException):
45
- """Raised when an experiment failed"""
46
-
47
- pass
48
-
49
-
50
- class JobState(enum.Enum):
51
- # Job is not yet scheduled
52
- UNSCHEDULED = 0
53
-
54
- # Job is waiting for dependencies to be done
55
- WAITING = 1
56
-
57
- # Job is ready to run
58
- READY = 2
59
-
60
- # Job is scheduled (e.g. slurm)
61
- SCHEDULED = 3
62
-
63
- # Job is running
64
- RUNNING = 4
65
-
66
- # Job is done (finished)
67
- DONE = 5
68
-
69
- # Job failed (finished)
70
- ERROR = 6
71
-
72
- def notstarted(self):
73
- return self.value <= JobState.READY.value
74
-
75
- def running(self):
76
- return (
77
- self.value == JobState.RUNNING.value
78
- or self.value == JobState.SCHEDULED.value
79
- )
80
-
81
- def finished(self):
82
- return self.value >= JobState.DONE.value
83
-
84
-
85
- class JobFailureStatus(enum.Enum):
86
- #: Job failed
87
- DEPENDENCY = 0
88
-
89
- #: Job dependency failed
90
- FAILED = 1
91
-
92
- #: Memory
93
- MEMORY = 2
94
-
95
-
96
- class JobLock(Lock):
97
- def __init__(self, job):
98
- super().__init__()
99
- self.job = job
100
-
101
- def _acquire(self):
102
- return self.job.state == JobState.DONE
103
-
104
- def _release(self):
105
- return False
106
-
107
-
108
- class JobDependency(Dependency):
109
- def __init__(self, job):
110
- super().__init__(job)
111
-
112
- def status(self) -> DependencyStatus:
113
- if self.origin.state == JobState.DONE:
114
- return DependencyStatus.OK
115
- elif self.origin.state == JobState.ERROR:
116
- return DependencyStatus.FAIL
117
- return DependencyStatus.WAIT
118
-
119
- def lock(self):
120
- return JobLock(self.origin)
121
-
122
-
123
- class Job(Resource):
124
- """A job is a resource that is produced by the execution of some code"""
125
-
126
- # Set by the scheduler
127
- _readyEvent: Optional[asyncio.Event]
128
- _future: Optional["concurrent.futures.Future"]
129
-
130
- def __init__(
131
- self,
132
- config: Config,
133
- *,
134
- workspace: Workspace = None,
135
- launcher: "Launcher" = None,
136
- run_mode: RunMode = RunMode.NORMAL,
137
- ):
138
- super().__init__()
139
-
140
- self.workspace = workspace or Workspace.CURRENT
141
- self.launcher = launcher or self.workspace.launcher if self.workspace else None
142
-
143
- if run_mode == RunMode.NORMAL:
144
- assert self.workspace is not None, "No experiment has been defined"
145
- assert self.launcher is not None, (
146
- "No launcher, and no default defined for the workspace %s" % workspace
147
- )
148
-
149
- self.type = config.__xpmtype__
150
- self.name = str(self.type.identifier).rsplit(".", 1)[-1]
151
-
152
- self.scheduler: Optional["Scheduler"] = None
153
- self.config = config
154
- self.state: JobState = JobState.UNSCHEDULED
155
-
156
- #: If a job has failed, indicates the failure status
157
- self.failure_status: JobFailureStatus = None
158
-
159
- # Dependencies
160
- self.dependencies: Set[Dependency] = set() # as target
161
-
162
- # Watched outputs
163
- self.watched_outputs = {}
164
- for watched in config.__xpm__.watched_outputs:
165
- self.watch_output(watched)
166
-
167
- # Process
168
- self._process = None
169
- self.unsatisfied = 0
170
-
171
- # Meta-information
172
- self.starttime: Optional[float] = None
173
- self.submittime: Optional[float] = None
174
- self.endtime: Optional[float] = None
175
- self._progress: List[LevelInformation] = []
176
- self.tags = config.tags()
177
-
178
- def watch_output(self, watched: "WatchedOutput"):
179
- """Monitor task outputs
180
-
181
- :param watched: A description of the watched output
182
- """
183
- self.scheduler.xp.watch_output(watched)
184
-
185
- def task_output_update(self, subpath: Path):
186
- """Notification of an updated task output"""
187
- if watcher := self.watched_outputs.get(subpath, None):
188
- watcher.update()
189
-
190
- def done_handler(self):
191
- """The task has been completed"""
192
- for watcher in self.watched_outputs.values():
193
- watcher.update()
194
-
195
- def __str__(self):
196
- return "Job[{}]".format(self.identifier)
197
-
198
- def wait(self) -> JobState:
199
- assert self._future, "Cannot wait a not submitted job"
200
- return self._future.result()
201
-
202
- @cached_property
203
- def python_path(self) -> Iterator[str]:
204
- """Returns an iterator over python path"""
205
- return itertools.chain(self.workspace.python_path)
206
-
207
- @cached_property
208
- def environ(self):
209
- """Returns the job environment
210
-
211
- It is made of (by order of priority):
212
-
213
- 1. The job environment
214
- 1. The launcher environment
215
- 1. The workspace environment
216
-
217
- """
218
- return ChainMap(
219
- {},
220
- self.launcher.environ if self.launcher else {},
221
- self.workspace.env if self.workspace else {},
222
- )
223
-
224
- @property
225
- def progress(self):
226
- return self._progress
227
-
228
- def set_progress(self, level: int, value: float, desc: Optional[str]):
229
- if value < 0:
230
- logger.warning(f"Progress value out of bounds ({value})")
231
- value = 0
232
- elif value > 1:
233
- logger.warning(f"Progress value out of bounds ({value})")
234
- value = 1
235
-
236
- # Adjust the length of the array
237
- self._progress = self._progress[: (level + 1)]
238
- while len(self._progress) <= level:
239
- self._progress.append(LevelInformation(len(self._progress), None, 0.0))
240
-
241
- if desc:
242
- self._progress[-1].desc = desc
243
- self._progress[-1].progress = value
244
-
245
- for listener in self.scheduler.listeners:
246
- listener.job_state(self)
247
-
248
- def add_notification_server(self, server):
249
- """Adds a notification server"""
250
- key, baseurl = server.getNotificationSpec()
251
- dirpath = self.path / Reporter.NOTIFICATION_FOLDER
252
- dirpath.mkdir(exist_ok=True)
253
- (dirpath / key).write_text(f"{baseurl}/{self.identifier}")
254
-
255
- @property
256
- def ready(self):
257
- return self.state == JobState.READY
258
-
259
- @property
260
- def jobpath(self) -> Path:
261
- """Deprecated, use `path`"""
262
- return self.workspace.jobspath / self.relpath
263
-
264
- @property
265
- def path(self) -> Path:
266
- return self.workspace.jobspath / self.relpath
267
-
268
- @property
269
- def experimaestro_path(self) -> Path:
270
- return (self.path / ".experimaestro").resolve()
271
-
272
- @cached_property
273
- def task_outputs_path(self) -> Path:
274
- return self.experimaestro_path / "task-outputs.jsonl"
275
-
276
- @property
277
- def relpath(self):
278
- identifier = self.config.__xpm__.identifier
279
- base = Path(str(self.type.identifier))
280
- return base / identifier.all.hex()
281
-
282
- @property
283
- def relmainpath(self):
284
- identifier = self.config.__xpm__.identifier
285
- base = Path(str(self.type.identifier))
286
- return base / identifier.main.hex()
287
-
288
- @property
289
- def hashidentifier(self):
290
- return self.config.__xpm__.identifier
291
-
292
- @property
293
- def identifier(self):
294
- return self.config.__xpm__.identifier.all.hex()
295
-
296
- def prepare(self, overwrite=False):
297
- """Prepare all files before starting a task
298
-
299
- :param overwrite: if True, overwrite files even if the task has been run
300
- """
301
- pass
302
-
303
- async def aio_run(self):
304
- """Actually run the code"""
305
- raise NotImplementedError(f"Method aio_run not implemented in {self.__class__}")
306
-
307
- async def aio_process(self) -> Optional["Process"]:
308
- """Returns the process if it exists"""
309
- raise NotImplementedError("Not implemented")
310
-
311
- @property
312
- def pidpath(self):
313
- """This file contains the file PID"""
314
- return self.jobpath / ("%s.pid" % self.name)
315
-
316
- @property
317
- def lockpath(self):
318
- """This file is used as a lock for running the job"""
319
- return self.workspace.jobspath / self.relmainpath / ("%s.lock" % self.name)
320
-
321
- @property
322
- def donepath(self) -> Path:
323
- """When a job has been successful, this file is written"""
324
- return self.jobpath / ("%s.done" % self.name)
325
-
326
- @property
327
- def failedpath(self):
328
- """When a job has been unsuccessful, this file is written with an error
329
- code inside"""
330
- return self.jobpath / ("%s.failed" % self.name)
331
-
332
- @property
333
- def stdout(self) -> Path:
334
- return self.jobpath / ("%s.out" % self.name)
335
-
336
- @property
337
- def stderr(self) -> Path:
338
- return self.jobpath / ("%s.err" % self.name)
339
-
340
- @property
341
- def basepath(self) -> Path:
342
- return self.jobpath / self.name
343
-
344
- def dependencychanged(self, dependency, oldstatus, status):
345
- """Called when a dependency has changed"""
346
-
347
- def value(s):
348
- return 1 if s == DependencyStatus.OK else 0
349
-
350
- self.unsatisfied -= value(status) - value(oldstatus)
351
-
352
- logger.debug("Job %s: unsatisfied %d", self, self.unsatisfied)
353
-
354
- if status == DependencyStatus.FAIL:
355
- # Job completed
356
- if not self.state.finished():
357
- self.state = JobState.ERROR
358
- self.failure_status = JobFailureStatus.DEPENDENCY
359
- self._readyEvent.set()
360
-
361
- if self.unsatisfied == 0:
362
- logger.info("Job %s is ready to run", self)
363
- # We are ready
364
- self.state = JobState.READY
365
- self._readyEvent.set()
366
-
367
- def finalState(self) -> "concurrent.futures.Future[JobState]":
368
- assert self._future is not None
369
- return self._future
370
-
371
-
372
- class JobContext(ConfigWalkContext):
373
- def __init__(self, job: Job):
374
- super().__init__()
375
- self.job = job
376
-
377
- @property
378
- def name(self):
379
- return self.job.name
380
-
381
- @property
382
- def path(self):
383
- return self.job.path
384
-
385
- @property
386
- def task(self):
387
- return self.job.config
388
-
389
-
390
21
  class Listener:
391
22
  def job_submitted(self, job):
392
23
  pass
@@ -399,78 +30,16 @@ class Listener:
399
30
  pass
400
31
 
401
32
 
402
- class JobError(Exception):
403
- def __init__(self, code):
404
- super().__init__(f"Job exited with code {code}")
405
-
406
-
407
- class SignalHandler:
408
- def __init__(self):
409
- self.experiments: Set["experiment"] = set()
410
- self.original_sigint_handler = None
411
-
412
- def add(self, xp: "experiment"):
413
- if not self.experiments:
414
- self.original_sigint_handler = signal.getsignal(signal.SIGINT)
415
-
416
- signal.signal(signal.SIGINT, self)
417
-
418
- self.experiments.add(xp)
419
-
420
- def remove(self, xp):
421
- self.experiments.remove(xp)
422
- if not self.experiments:
423
- signal.signal(signal.SIGINT, self.original_sigint_handler)
424
-
425
- def __call__(self, signum, frame):
426
- """SIGINT signal handler"""
427
- logger.warning("Signal received")
428
- for xp in self.experiments:
429
- xp.stop()
430
-
431
-
432
- SIGNAL_HANDLER = SignalHandler()
433
-
434
-
435
- class SchedulerCentral(threading.Thread):
436
- loop: asyncio.AbstractEventLoop
437
-
438
- """The event loop thread used by the scheduler"""
439
-
440
- def __init__(self, name: str):
441
- # Daemon thread so it is non blocking
442
- super().__init__(name=f"Scheduler EL ({name})", daemon=True)
443
-
444
- self._ready = threading.Event()
445
-
446
- def run(self):
447
- logger.debug("Starting event loop thread")
448
- self.loop = asyncio.new_event_loop()
449
- asyncio.set_event_loop(self.loop)
450
-
451
- # Set loop-dependent variables
452
- self.exitCondition = asyncio.Condition()
453
- self.dependencyLock = asyncio.Lock()
454
-
455
- # Start the event loop
456
- self._ready.set()
457
- self.loop.run_forever()
458
-
459
- @staticmethod
460
- def create(name: str):
461
- instance = SchedulerCentral(name)
462
- instance.start()
463
- instance._ready.wait()
464
- return instance
465
-
466
-
467
- class Scheduler:
33
+ class Scheduler(threading.Thread):
468
34
  """A job scheduler
469
35
 
470
36
  The scheduler is based on asyncio for easy concurrency handling
471
37
  """
472
38
 
473
39
  def __init__(self, xp: "experiment", name: str):
40
+ super().__init__(name=f"Scheduler ({name})", daemon=True)
41
+ self._ready = threading.Event()
42
+
474
43
  # Name of the experiment
475
44
  self.name = name
476
45
  self.xp = xp
@@ -487,9 +56,32 @@ class Scheduler:
487
56
  # Listeners
488
57
  self.listeners: Set[Listener] = set()
489
58
 
490
- @property
491
- def loop(self):
492
- return self.xp.loop
59
+ @staticmethod
60
+ def create(xp: "experiment", name: str):
61
+ instance = Scheduler(xp, name)
62
+ instance.start()
63
+ instance._ready.wait()
64
+ return instance
65
+
66
+ def run(self):
67
+ """Run the event loop forever"""
68
+ logger.debug("Starting event loop thread")
69
+ # Ported from SchedulerCentral
70
+ self.loop = asyncio.new_event_loop()
71
+ asyncio.set_event_loop(self.loop)
72
+ # Set loop-dependent variables
73
+ self.exitCondition = asyncio.Condition()
74
+ self.dependencyLock = asyncio.Lock()
75
+ self._ready.set()
76
+ self.loop.run_forever()
77
+
78
+ def start_scheduler(self):
79
+ """Start the scheduler event loop in a thread"""
80
+ if not self.is_alive():
81
+ self.start()
82
+ self._ready.wait()
83
+ else:
84
+ logger.warning("Scheduler already started")
493
85
 
494
86
  def addlistener(self, listener: Listener):
495
87
  self.listeners.add(listener)
@@ -498,6 +90,13 @@ class Scheduler:
498
90
  self.listeners.remove(listener)
499
91
 
500
92
  def getJobState(self, job: Job) -> "concurrent.futures.Future[JobState]":
93
+ # Check if the job belongs to this scheduler
94
+ if job.identifier not in self.jobs:
95
+ # If job is not in this scheduler, return its current state directly
96
+ future = concurrent.futures.Future()
97
+ future.set_result(job.state)
98
+ return future
99
+
501
100
  return asyncio.run_coroutine_threadsafe(self.aio_getjobstate(job), self.loop)
502
101
 
503
102
  async def aio_getjobstate(self, job: Job):
@@ -547,6 +146,22 @@ class Scheduler:
547
146
 
548
147
  return None
549
148
 
149
+ def notify_job_submitted(self, job: Job):
150
+ """Notify the listeners that a job has been submitted"""
151
+ for listener in self.listeners:
152
+ try:
153
+ listener.job_submitted(job)
154
+ except Exception:
155
+ logger.exception("Got an error with listener %s", listener)
156
+
157
+ def notify_job_state(self, job: Job):
158
+ """Notify the listeners that a job has changed state"""
159
+ for listener in self.listeners:
160
+ try:
161
+ listener.job_state(job)
162
+ except Exception:
163
+ logger.exception("Got an error with listener %s", listener)
164
+
550
165
  async def aio_submit(self, job: Job) -> JobState: # noqa: C901
551
166
  """Main scheduler function: submit a job, run it (if needed), and returns
552
167
  the status code
@@ -571,11 +186,8 @@ class Scheduler:
571
186
  path.symlink_to(job.path)
572
187
 
573
188
  job.state = JobState.WAITING
574
- for listener in self.listeners:
575
- try:
576
- listener.job_submitted(job)
577
- except Exception:
578
- logger.exception("Got an error with listener %s", listener)
189
+
190
+ self.notify_job_submitted(job)
579
191
 
580
192
  # Add dependencies, and add to blocking resources
581
193
  if job.dependencies:
@@ -598,11 +210,8 @@ class Scheduler:
598
210
  if process is not None:
599
211
  # Yep! First we notify the listeners
600
212
  job.state = JobState.RUNNING
601
- for listener in self.listeners:
602
- try:
603
- listener.job_state(job)
604
- except Exception:
605
- logger.exception("Got an error with listener %s", listener)
213
+ # Notify the listeners
214
+ self.notify_job_state(job)
606
215
 
607
216
  # Adds to the listeners
608
217
  if self.xp.server is not None:
@@ -637,11 +246,7 @@ class Scheduler:
637
246
 
638
247
  job.state = state
639
248
 
640
- for listener in self.listeners:
641
- try:
642
- listener.job_state(job)
643
- except Exception as e:
644
- logger.exception("Listener %s did raise an exception", e)
249
+ self.notify_job_state(job)
645
250
 
646
251
  # Job is finished
647
252
  if job.state != JobState.DONE:
@@ -652,9 +257,9 @@ class Scheduler:
652
257
 
653
258
  # Decrement the number of unfinished jobs and notify
654
259
  self.xp.unfinishedJobs -= 1
655
- async with self.xp.central.exitCondition:
260
+ async with self.exitCondition:
656
261
  logging.debug("Updated number of unfinished jobs")
657
- self.xp.central.exitCondition.notify_all()
262
+ self.exitCondition.notify_all()
658
263
 
659
264
  job.endtime = time.time()
660
265
  if job in self.waitingjobs:
@@ -669,461 +274,37 @@ class Scheduler:
669
274
  return job.state
670
275
 
671
276
  async def aio_start(self, job: Job) -> Optional[JobState]:
672
- """Start a job
673
-
674
- Returns None if the dependencies could not be locked after all
675
- Returns DONE/ERROR depending on the process outcome
676
- """
677
-
678
- # We first lock the job before proceeding
679
- assert job.launcher is not None
680
- assert self.xp.central is not None
681
-
682
- with Locks() as locks:
683
- logger.debug("[starting] Locking job %s", job)
684
- async with job.launcher.connector.lock(job.lockpath):
685
- logger.debug("[starting] Locked job %s", job)
686
-
687
- state = None
688
- try:
689
- logger.debug(
690
- "Starting job %s with %d dependencies",
691
- job,
692
- len(job.dependencies),
693
- )
694
-
695
- async with self.xp.central.dependencyLock:
696
- for dependency in job.dependencies:
697
- try:
698
- locks.append(dependency.lock().acquire())
699
- except LockError:
700
- logger.warning(
701
- "Could not lock %s, aborting start for job %s",
702
- dependency,
703
- job,
704
- )
705
- dependency.check()
706
- return JobState.WAITING
707
-
708
- for listener in self.listeners:
709
- listener.job_state(job)
710
-
711
- job.starttime = time.time()
712
-
713
- # Creates the main directory
714
- directory = job.path
715
- logger.debug("Making directories job %s...", directory)
716
- if not directory.is_dir():
717
- directory.mkdir(parents=True, exist_ok=True)
718
-
719
- # Sets up the notification URL
720
- if self.xp.server is not None:
721
- job.add_notification_server(self.xp.server)
722
-
723
- except Exception:
724
- logger.warning("Error while locking job", exc_info=True)
725
- return JobState.WAITING
726
-
727
- try:
728
- # Runs the job
729
- process = await job.aio_run()
730
- except Exception:
731
- logger.warning("Error while starting job", exc_info=True)
732
- return JobState.ERROR
733
-
734
- try:
735
- if isinstance(process, JobState):
736
- state = process
737
- logger.debug("Job %s ended (state %s)", job, state)
738
- else:
739
- logger.debug("Waiting for job %s process to end", job)
740
-
741
- code = await process.aio_code()
742
- logger.debug("Got return code %s for %s", code, job)
743
-
744
- # Check the file if there is no return code
745
- if code is None:
746
- # Case where we cannot retrieve the code right away
747
- if job.donepath.is_file():
748
- code = 0
749
- else:
750
- code = int(job.failedpath.read_text())
751
-
752
- logger.debug("Job %s ended with code %s", job, code)
753
- state = JobState.DONE if code == 0 else JobState.ERROR
754
-
755
- except JobError:
756
- logger.warning("Error while running job")
757
- state = JobState.ERROR
758
-
759
- except Exception:
760
- logger.warning(
761
- "Error while running job (in experimaestro)", exc_info=True
762
- )
763
- state = JobState.ERROR
764
-
765
- return state
766
-
767
-
768
- ServiceClass = TypeVar("ServiceClass", bound=Service)
769
-
770
-
771
- class experiment:
772
- """Main experiment object
773
-
774
- It is a context object, i.e. experiments is run with
775
-
776
- ```py
777
- with experiment(...) as xp:
778
- ...
779
- ```
780
- """
781
-
782
- #: Current experiment
783
- CURRENT: Optional["experiment"] = None
784
-
785
- @staticmethod
786
- def current() -> "experiment":
787
- """Returns the current experiment, but checking first if set
788
-
789
- If there is no current experiment, raises an AssertError
790
- """
791
- assert experiment.CURRENT is not None, "No current experiment defined"
792
- return experiment.CURRENT
793
-
794
- def __init__(
795
- self,
796
- env: Union[Path, str, WorkspaceSettings],
797
- name: str,
798
- *,
799
- host: Optional[str] = None,
800
- port: Optional[int] = None,
801
- token: Optional[str] = None,
802
- run_mode: Optional[RunMode] = None,
803
- launcher=None,
804
- ):
805
- """
806
- :param env: an environment -- or a working directory for a local
807
- environment
277
+ """Start a job (scheduler coordination layer)
808
278
 
809
- :param name: the identifier of the experiment
279
+ This method serves as a coordination layer that delegates the actual
280
+ job starting logic to the job itself while handling scheduler-specific
281
+ concerns like state notifications and providing coordination context.
810
282
 
811
- :param launcher: The launcher (if not provided, inferred from path)
812
-
813
- :param host: The host for the web server (overrides the environment if
814
- set)
815
- :param port: the port for the web server (overrides the environment if
816
- set). Use negative number to avoid running a web server (default when dry run).
817
-
818
- :param run_mode: The run mode for the experiment (normal, generate run
819
- files, dry run)
283
+ :param job: The job to start
284
+ :return: JobState.WAITING if dependencies could not be locked, JobState.DONE
285
+ if job completed successfully, JobState.ERROR if job failed during execution,
286
+ or None (should not occur in normal operation)
287
+ :raises Exception: Various exceptions during scheduler coordination
820
288
  """
821
289
 
822
- from experimaestro.server import Server
823
- from experimaestro.scheduler import Listener
824
-
825
- settings = get_settings()
826
- if not isinstance(env, WorkspaceSettings):
827
- env = WorkspaceSettings(id=None, path=Path(env))
828
-
829
- # Creates the workspace
830
- run_mode = run_mode or RunMode.NORMAL
831
- self.workspace = Workspace(settings, env, launcher=launcher, run_mode=run_mode)
832
-
833
- # Mark the directory has an experimaestro folder
834
- self.workdir = self.workspace.experimentspath / name
835
- self.workdir.mkdir(parents=True, exist_ok=True)
836
- self.xplockpath = self.workdir / "lock"
837
- self.xplock = None
838
- self.old_experiment = None
839
- self.services: Dict[str, Service] = {}
840
- self._job_listener: Optional[Listener] = None
841
-
842
- # Get configuration settings
843
-
844
- if host is not None:
845
- settings.server.host = host
846
-
847
- if port is not None:
848
- settings.server.port = port
849
-
850
- if token is not None:
851
- settings.server.token = token
852
-
853
- # Create the scheduler
854
- self.scheduler = Scheduler(self, name)
855
- self.server = (
856
- Server(self.scheduler, settings.server)
857
- if (settings.server.port is not None and settings.server.port >= 0)
858
- and self.workspace.run_mode == RunMode.NORMAL
859
- else None
860
- )
861
-
862
- if os.environ.get("XPM_ENABLEFAULTHANDLER", "0") == "1":
863
- import faulthandler
864
-
865
- logger.info("Enabling fault handler")
866
- faulthandler.enable(all_threads=True)
867
-
868
- def submit(self, job: Job):
869
- return self.scheduler.submit(job)
290
+ # Assert preconditions
291
+ assert job.launcher is not None
870
292
 
871
- def prepare(self, job: Job):
872
- """Generate the file"""
873
- return self.scheduler.prepare(job)
874
-
875
- @property
876
- def run_mode(self):
877
- return self.workspace.run_mode
878
-
879
- @property
880
- def loop(self):
881
- assert self.central is not None
882
- return self.central.loop
883
-
884
- @property
885
- def resultspath(self):
886
- """Return the directory in which results can be stored for this experiment"""
887
- return self.workdir / "results"
888
-
889
- @property
890
- def jobspath(self):
891
- """Return the directory in which results can be stored for this experiment"""
892
- return self.workdir / "jobs"
893
-
894
- @property
895
- def alt_jobspaths(self):
896
- """Return potential other directories"""
897
- for alt_workdir in self.workspace.alt_workdirs:
898
- yield alt_workdir / "jobs"
899
-
900
- @property
901
- def jobsbakpath(self):
902
- """Return the directory in which results can be stored for this experiment"""
903
- return self.workdir / "jobs.bak"
904
-
905
- def stop(self):
906
- """Stop the experiment as soon as possible"""
907
-
908
- async def doStop():
909
- assert self.central is not None
910
- async with self.central.exitCondition:
911
- self.exitMode = True
912
- logging.debug("Setting exit mode to true")
913
- self.central.exitCondition.notify_all()
914
-
915
- assert self.central is not None and self.central.loop is not None
916
- asyncio.run_coroutine_threadsafe(doStop(), self.central.loop)
917
-
918
- def wait(self):
919
- """Wait until the running processes have finished"""
920
-
921
- async def awaitcompletion():
922
- assert self.central is not None
923
- logger.debug("Waiting to exit scheduler...")
924
- async with self.central.exitCondition:
925
- while True:
926
- if self.exitMode:
927
- break
928
-
929
- # If we have still unfinished jobs or possible new tasks, wait
930
- logger.debug(
931
- "Checking exit condition: unfinished jobs=%d, task output queue size=%d",
932
- self.unfinishedJobs,
933
- self.taskOutputQueueSize,
934
- )
935
- if self.unfinishedJobs == 0 and self.taskOutputQueueSize == 0:
936
- break
937
-
938
- # Wait for more news...
939
- await self.central.exitCondition.wait()
940
-
941
- if self.failedJobs:
942
- # Show some more information
943
- count = 0
944
- for job in self.failedJobs.values():
945
- if job.failure_status != JobFailureStatus.DEPENDENCY:
946
- count += 1
947
- logger.error(
948
- "Job %s failed, check the log file %s",
949
- job.relpath,
950
- job.stderr,
951
- )
952
- raise FailedExperiment(f"{count} failed jobs")
953
-
954
- future = asyncio.run_coroutine_threadsafe(awaitcompletion(), self.loop)
955
- return future.result()
956
-
957
- def setenv(self, name, value, override=True):
958
- """Shortcut to set the environment value"""
959
- if override or name not in self.workspace.env:
960
- logging.info("Setting environment: %s=%s", name, value)
961
- self.workspace.env[name] = value
962
-
963
- def token(self, name: str, count: int):
964
- """Returns a token for this experiment
965
-
966
- The token is the default token of the workspace connector"""
967
- return self.workspace.connector.createtoken(name, count)
968
-
969
- def __enter__(self):
970
- from .dynamic_outputs import TaskOutputsWorker
971
-
972
- if self.workspace.run_mode != RunMode.DRY_RUN:
973
- logger.info("Locking experiment %s", self.xplockpath)
974
- self.xplock = self.workspace.connector.lock(self.xplockpath, 0).__enter__()
975
- logger.info("Experiment locked")
976
-
977
- # Move old jobs into "jobs.bak"
978
- if self.workspace.run_mode == RunMode.NORMAL:
979
- self.jobsbakpath.mkdir(exist_ok=True)
980
- for p in self.jobspath.glob("*/*"):
981
- if p.is_symlink():
982
- target = self.jobsbakpath / p.relative_to(self.jobspath)
983
- if target.is_symlink():
984
- # Remove if duplicate
985
- p.unlink()
986
- else:
987
- # Rename otherwise
988
- target.parent.mkdir(parents=True, exist_ok=True)
989
- p.rename(target)
990
-
991
- if self.server:
992
- self.server.start()
993
-
994
- self.workspace.__enter__()
995
- (self.workspace.path / ".__experimaestro__").touch()
996
-
997
- global SIGNAL_HANDLER
998
- # Number of unfinished jobs
999
- self.unfinishedJobs = 0
1000
- self.taskOutputQueueSize = 0
1001
-
1002
- # List of failed jobs
1003
- self.failedJobs: Dict[str, Job] = {}
1004
-
1005
- # Exit mode when catching signals
1006
- self.exitMode = False
1007
-
1008
- self.central = SchedulerCentral.create(self.scheduler.name)
1009
- self.taskOutputsWorker = TaskOutputsWorker(self)
1010
- self.taskOutputsWorker.start()
1011
-
1012
- SIGNAL_HANDLER.add(self)
1013
-
1014
- self.old_experiment = experiment.CURRENT
1015
- experiment.CURRENT = self
1016
- return self
1017
-
1018
- def __exit__(self, exc_type, exc_value, traceback):
1019
- logger.debug("Exiting scheduler context")
1020
- # If no exception and normal run mode, remove old "jobs"
1021
- if self.workspace.run_mode == RunMode.NORMAL:
1022
- if exc_type is None and self.jobsbakpath.is_dir():
1023
- rmtree(self.jobsbakpath)
1024
-
1025
- # Close the different locks
1026
293
  try:
1027
- if exc_type:
1028
- # import faulthandler
1029
- # faulthandler.dump_traceback()
1030
- logger.error(
1031
- "Not waiting since an exception was thrown"
1032
- " (some jobs may be running)"
1033
- )
1034
- else:
1035
- self.wait()
1036
- finally:
1037
- SIGNAL_HANDLER.remove(self)
1038
-
1039
- # Stop services
1040
- for service in self.services.values():
1041
- logger.info("Closing service %s", service.description())
1042
- service.stop()
1043
-
1044
- if self.central is not None:
1045
- logger.info("Stopping scheduler event loop")
1046
- self.central.loop.stop()
1047
-
1048
- if self.taskOutputsWorker is not None:
1049
- logger.info("Stopping tasks outputs worker")
1050
- self.taskOutputsWorker.queue.put(None)
1051
-
1052
- self.central = None
1053
- self.workspace.__exit__(exc_type, exc_value, traceback)
1054
- if self.xplock:
1055
- self.xplock.__exit__(exc_type, exc_value, traceback)
1056
-
1057
- # Put back old experiment as current one
1058
- experiment.CURRENT = self.old_experiment
1059
- if self.server:
1060
- logger.info("Stopping web server")
1061
- self.server.stop()
1062
-
1063
- if self.workspace.run_mode == RunMode.NORMAL:
1064
- # Write the state
1065
- logging.info("Saving the experiment state")
1066
- from experimaestro.scheduler.state import ExperimentState
1067
-
1068
- ExperimentState.save(
1069
- self.workdir / "state.json", self.scheduler.jobs.values()
294
+ # Call job's start method with scheduler context
295
+ state = await job.aio_start(
296
+ sched_dependency_lock=self.dependencyLock,
297
+ notification_server=self.xp.server if self.xp else None,
1070
298
  )
1071
299
 
1072
- async def update_task_output_count(self, delta: int):
1073
- """Change in the number of task outputs to process"""
1074
- async with self.central.exitCondition:
1075
- self.taskOutputQueueSize += delta
1076
- logging.debug(
1077
- "Updating queue size with %d => %d", delta, self.taskOutputQueueSize
1078
- )
1079
- if self.taskOutputQueueSize == 0:
1080
- self.central.exitCondition.notify_all()
1081
-
1082
- def watch_output(self, watched: "WatchedOutput"):
1083
- """Watch an output
1084
-
1085
- :param watched: The watched output specification
1086
- """
1087
-
1088
- self.taskOutputsWorker.watch_output(watched)
1089
-
1090
- def add_service(self, service: ServiceClass) -> ServiceClass:
1091
- """Adds a service (e.g. tensorboard viewer) to the experiment
1092
-
1093
- :param service: A service instance
1094
- :return: The same service instance
1095
- """
1096
- self.services[service.id] = service
1097
- for listener in self.scheduler.listeners:
1098
- listener.service_add(service)
1099
- return service
1100
-
1101
- def save(self, obj: Any, name: str = "default"):
1102
- """Serializes configurations.
1103
-
1104
- Saves configuration objects within the experimental directory
1105
-
1106
- :param obj: The object to save
1107
- :param name: The name of the saving directory (default to `default`)
1108
- """
1109
-
1110
- if self.workspace.run_mode == RunMode.NORMAL:
1111
- from experimaestro import save
1112
-
1113
- save_dir = self.workdir / "data" / name
1114
- save_dir.mkdir(exist_ok=True, parents=True)
1115
-
1116
- save(obj, save_dir)
300
+ if state is None:
301
+ # Dependencies couldn't be locked, return WAITING state
302
+ return JobState.WAITING
1117
303
 
1118
- def load(self, reference: str, name: str = "default"):
1119
- """Serializes configurations.
1120
-
1121
- Loads configuration objects from an experimental directory
1122
-
1123
- :param reference: The name of the experiment
1124
- :param name: The name of the saving directory (default to `default`)
1125
- """
1126
- from experimaestro import load
304
+ # Notify scheduler listeners of job state after successful start
305
+ self.notify_job_state(job)
306
+ return state
1127
307
 
1128
- path = self.workspace.experimentspath / reference / "data" / name
1129
- return load(path)
308
+ except Exception:
309
+ logger.warning("Error in scheduler job coordination", exc_info=True)
310
+ return JobState.ERROR