experimaestro 1.11.1__py3-none-any.whl → 2.0.0b4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of experimaestro might be problematic. Click here for more details.

Files changed (133) hide show
  1. experimaestro/__init__.py +10 -11
  2. experimaestro/annotations.py +167 -206
  3. experimaestro/cli/__init__.py +140 -16
  4. experimaestro/cli/filter.py +42 -74
  5. experimaestro/cli/jobs.py +157 -106
  6. experimaestro/cli/progress.py +269 -0
  7. experimaestro/cli/refactor.py +249 -0
  8. experimaestro/click.py +0 -1
  9. experimaestro/commandline.py +19 -3
  10. experimaestro/connectors/__init__.py +22 -3
  11. experimaestro/connectors/local.py +12 -0
  12. experimaestro/core/arguments.py +192 -37
  13. experimaestro/core/identifier.py +127 -12
  14. experimaestro/core/objects/__init__.py +6 -0
  15. experimaestro/core/objects/config.py +702 -285
  16. experimaestro/core/objects/config_walk.py +24 -6
  17. experimaestro/core/serialization.py +91 -34
  18. experimaestro/core/serializers.py +1 -8
  19. experimaestro/core/subparameters.py +164 -0
  20. experimaestro/core/types.py +198 -83
  21. experimaestro/exceptions.py +26 -0
  22. experimaestro/experiments/cli.py +107 -25
  23. experimaestro/generators.py +50 -9
  24. experimaestro/huggingface.py +3 -1
  25. experimaestro/launcherfinder/parser.py +29 -0
  26. experimaestro/launcherfinder/registry.py +3 -3
  27. experimaestro/launchers/__init__.py +26 -1
  28. experimaestro/launchers/direct.py +12 -0
  29. experimaestro/launchers/slurm/base.py +154 -2
  30. experimaestro/mkdocs/base.py +6 -8
  31. experimaestro/mkdocs/metaloader.py +0 -1
  32. experimaestro/mypy.py +452 -7
  33. experimaestro/notifications.py +75 -16
  34. experimaestro/progress.py +404 -0
  35. experimaestro/rpyc.py +0 -1
  36. experimaestro/run.py +19 -6
  37. experimaestro/scheduler/__init__.py +18 -1
  38. experimaestro/scheduler/base.py +504 -959
  39. experimaestro/scheduler/dependencies.py +43 -28
  40. experimaestro/scheduler/dynamic_outputs.py +259 -130
  41. experimaestro/scheduler/experiment.py +582 -0
  42. experimaestro/scheduler/interfaces.py +474 -0
  43. experimaestro/scheduler/jobs.py +485 -0
  44. experimaestro/scheduler/services.py +186 -12
  45. experimaestro/scheduler/signal_handler.py +32 -0
  46. experimaestro/scheduler/state.py +1 -1
  47. experimaestro/scheduler/state_db.py +388 -0
  48. experimaestro/scheduler/state_provider.py +2345 -0
  49. experimaestro/scheduler/state_sync.py +834 -0
  50. experimaestro/scheduler/workspace.py +52 -10
  51. experimaestro/scriptbuilder.py +7 -0
  52. experimaestro/server/__init__.py +153 -32
  53. experimaestro/server/data/index.css +0 -125
  54. experimaestro/server/data/index.css.map +1 -1
  55. experimaestro/server/data/index.js +194 -58
  56. experimaestro/server/data/index.js.map +1 -1
  57. experimaestro/settings.py +47 -6
  58. experimaestro/sphinx/__init__.py +3 -3
  59. experimaestro/taskglobals.py +20 -0
  60. experimaestro/tests/conftest.py +80 -0
  61. experimaestro/tests/core/test_generics.py +2 -2
  62. experimaestro/tests/identifier_stability.json +45 -0
  63. experimaestro/tests/launchers/bin/sacct +6 -2
  64. experimaestro/tests/launchers/bin/sbatch +4 -2
  65. experimaestro/tests/launchers/common.py +2 -2
  66. experimaestro/tests/launchers/test_slurm.py +80 -0
  67. experimaestro/tests/restart.py +1 -1
  68. experimaestro/tests/tasks/all.py +7 -0
  69. experimaestro/tests/tasks/test_dynamic.py +231 -0
  70. experimaestro/tests/test_checkers.py +2 -2
  71. experimaestro/tests/test_cli_jobs.py +615 -0
  72. experimaestro/tests/test_dependencies.py +11 -17
  73. experimaestro/tests/test_deprecated.py +630 -0
  74. experimaestro/tests/test_environment.py +200 -0
  75. experimaestro/tests/test_experiment.py +3 -3
  76. experimaestro/tests/test_file_progress.py +425 -0
  77. experimaestro/tests/test_file_progress_integration.py +477 -0
  78. experimaestro/tests/test_forward.py +3 -3
  79. experimaestro/tests/test_generators.py +93 -0
  80. experimaestro/tests/test_identifier.py +520 -169
  81. experimaestro/tests/test_identifier_stability.py +458 -0
  82. experimaestro/tests/test_instance.py +16 -21
  83. experimaestro/tests/test_multitoken.py +442 -0
  84. experimaestro/tests/test_mypy.py +433 -0
  85. experimaestro/tests/test_objects.py +314 -30
  86. experimaestro/tests/test_outputs.py +8 -8
  87. experimaestro/tests/test_param.py +22 -26
  88. experimaestro/tests/test_partial_paths.py +231 -0
  89. experimaestro/tests/test_progress.py +2 -50
  90. experimaestro/tests/test_resumable_task.py +480 -0
  91. experimaestro/tests/test_serializers.py +141 -60
  92. experimaestro/tests/test_state_db.py +434 -0
  93. experimaestro/tests/test_subparameters.py +160 -0
  94. experimaestro/tests/test_tags.py +151 -15
  95. experimaestro/tests/test_tasks.py +137 -160
  96. experimaestro/tests/test_token_locking.py +252 -0
  97. experimaestro/tests/test_tokens.py +25 -19
  98. experimaestro/tests/test_types.py +133 -11
  99. experimaestro/tests/test_validation.py +19 -19
  100. experimaestro/tests/test_workspace_triggers.py +158 -0
  101. experimaestro/tests/token_reschedule.py +5 -3
  102. experimaestro/tests/utils.py +2 -2
  103. experimaestro/tokens.py +154 -57
  104. experimaestro/tools/diff.py +8 -1
  105. experimaestro/tui/__init__.py +8 -0
  106. experimaestro/tui/app.py +2303 -0
  107. experimaestro/tui/app.tcss +353 -0
  108. experimaestro/tui/log_viewer.py +228 -0
  109. experimaestro/typingutils.py +11 -2
  110. experimaestro/utils/__init__.py +23 -0
  111. experimaestro/utils/environment.py +148 -0
  112. experimaestro/utils/git.py +129 -0
  113. experimaestro/utils/resources.py +1 -1
  114. experimaestro/version.py +34 -0
  115. {experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info}/METADATA +70 -39
  116. experimaestro-2.0.0b4.dist-info/RECORD +181 -0
  117. {experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info}/WHEEL +1 -1
  118. experimaestro-2.0.0b4.dist-info/entry_points.txt +16 -0
  119. experimaestro/compat.py +0 -6
  120. experimaestro/core/objects.pyi +0 -225
  121. experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
  122. experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
  123. experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
  124. experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
  125. experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
  126. experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
  127. experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
  128. experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
  129. experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
  130. experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
  131. experimaestro-1.11.1.dist-info/RECORD +0 -158
  132. experimaestro-1.11.1.dist-info/entry_points.txt +0 -17
  133. {experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info/licenses}/LICENSE +0 -0
@@ -1,503 +1,210 @@
1
- from collections import ChainMap
2
- from functools import cached_property
3
- import itertools
4
- import logging
5
- import os
6
- from pathlib import Path
7
- from shutil import rmtree
8
1
  import threading
9
2
  import time
10
3
  from typing import (
11
- Any,
12
- Iterator,
13
- List,
14
4
  Optional,
15
5
  Set,
16
- TypeVar,
17
- Union,
6
+ ClassVar,
18
7
  TYPE_CHECKING,
19
8
  )
20
- import enum
21
- import signal
22
9
  import asyncio
23
- from experimaestro.exceptions import HandledException
24
- from experimaestro.notifications import LevelInformation, Reporter
25
10
  from typing import Dict
11
+
12
+ from experimaestro.scheduler import experiment
13
+ from experimaestro.scheduler.jobs import Job, JobState, JobError
26
14
  from experimaestro.scheduler.services import Service
27
- from experimaestro.settings import WorkspaceSettings, get_settings
28
15
 
29
16
 
30
- from experimaestro.core.objects import Config, ConfigWalkContext, WatchedOutput
31
17
  from experimaestro.utils import logger
32
- from experimaestro.locking import Locks, LockError, Lock
33
18
  from experimaestro.utils.asyncio import asyncThreadcheck
34
- from .workspace import RunMode, Workspace
35
- from .dependencies import Dependency, DependencyStatus, Resource
36
19
  import concurrent.futures
37
20
 
38
-
39
21
  if TYPE_CHECKING:
40
- from experimaestro.connectors import Process
41
- from experimaestro.launchers import Launcher
42
-
43
-
44
- class FailedExperiment(HandledException):
45
- """Raised when an experiment failed"""
46
-
47
- pass
48
-
49
-
50
- class JobState(enum.Enum):
51
- # Job is not yet scheduled
52
- UNSCHEDULED = 0
53
-
54
- # Job is waiting for dependencies to be done
55
- WAITING = 1
56
-
57
- # Job is ready to run
58
- READY = 2
59
-
60
- # Job is scheduled (e.g. slurm)
61
- SCHEDULED = 3
62
-
63
- # Job is running
64
- RUNNING = 4
65
-
66
- # Job is done (finished)
67
- DONE = 5
68
-
69
- # Job failed (finished)
70
- ERROR = 6
71
-
72
- def notstarted(self):
73
- return self.value <= JobState.READY.value
74
-
75
- def running(self):
76
- return (
77
- self.value == JobState.RUNNING.value
78
- or self.value == JobState.SCHEDULED.value
79
- )
80
-
81
- def finished(self):
82
- return self.value >= JobState.DONE.value
22
+ from experimaestro.server import Server
23
+ from experimaestro.settings import ServerSettings
24
+ from experimaestro.scheduler.workspace import Workspace
83
25
 
84
26
 
85
- class JobFailureStatus(enum.Enum):
86
- #: Job failed
87
- DEPENDENCY = 0
88
-
89
- #: Job dependency failed
90
- FAILED = 1
91
-
92
- #: Memory
93
- MEMORY = 2
94
-
95
-
96
- class JobLock(Lock):
97
- def __init__(self, job):
98
- super().__init__()
99
- self.job = job
27
+ class Listener:
28
+ def job_submitted(self, job):
29
+ pass
100
30
 
101
- def _acquire(self):
102
- return self.job.state == JobState.DONE
31
+ def job_state(self, job):
32
+ pass
103
33
 
104
- def _release(self):
105
- return False
34
+ def service_add(self, service: Service):
35
+ """Notify when a new service is added"""
36
+ pass
106
37
 
107
38
 
108
- class JobDependency(Dependency):
109
- def __init__(self, job):
110
- super().__init__(job)
39
+ class Scheduler(threading.Thread):
40
+ """A job scheduler (singleton)
111
41
 
112
- def status(self) -> DependencyStatus:
113
- if self.origin.state == JobState.DONE:
114
- return DependencyStatus.OK
115
- elif self.origin.state == JobState.ERROR:
116
- return DependencyStatus.FAIL
117
- return DependencyStatus.WAIT
42
+ The scheduler is based on asyncio for easy concurrency handling.
43
+ This is a singleton - only one scheduler instance exists per process.
44
+ """
118
45
 
119
- def lock(self):
120
- return JobLock(self.origin)
46
+ _instance: ClassVar[Optional["Scheduler"]] = None
47
+ _lock: ClassVar[threading.Lock] = threading.Lock()
121
48
 
49
+ def __init__(self, name: str = "Global"):
50
+ super().__init__(name=f"Scheduler ({name})", daemon=True)
51
+ self._ready = threading.Event()
122
52
 
123
- class Job(Resource):
124
- """A job is a resource that is produced by the execution of some code"""
53
+ # Name of the scheduler
54
+ self.name = name
125
55
 
126
- # Set by the scheduler
127
- _readyEvent: Optional[asyncio.Event]
128
- _future: Optional["concurrent.futures.Future"]
56
+ # Track experiments (simple dict for now)
57
+ self.experiments: Dict[str, "experiment"] = {}
129
58
 
130
- def __init__(
131
- self,
132
- config: Config,
133
- *,
134
- workspace: Workspace = None,
135
- launcher: "Launcher" = None,
136
- run_mode: RunMode = RunMode.NORMAL,
137
- ):
138
- super().__init__()
59
+ # Exit mode activated
60
+ self.exitmode = False
139
61
 
140
- self.workspace = workspace or Workspace.CURRENT
141
- self.launcher = launcher or self.workspace.launcher if self.workspace else None
62
+ # List of all jobs
63
+ self.jobs: Dict[str, "Job"] = {}
142
64
 
143
- if run_mode == RunMode.NORMAL:
144
- assert self.workspace is not None, "No experiment has been defined"
145
- assert self.launcher is not None, (
146
- "No launcher, and no default defined for the workspace %s" % workspace
147
- )
65
+ # List of jobs
66
+ self.waitingjobs: Set[Job] = set()
148
67
 
149
- self.type = config.__xpmtype__
150
- self.name = str(self.type.identifier).rsplit(".", 1)[-1]
68
+ # Listeners with thread-safe access
69
+ self._listeners: Set[Listener] = set()
70
+ self._listeners_lock = threading.Lock()
151
71
 
152
- self.scheduler: Optional["Scheduler"] = None
153
- self.config = config
154
- self.state: JobState = JobState.UNSCHEDULED
72
+ # Notification thread pool (single worker to serialize notifications)
73
+ self._notification_executor = concurrent.futures.ThreadPoolExecutor(
74
+ max_workers=1, thread_name_prefix="NotificationWorker"
75
+ )
155
76
 
156
- #: If a job has failed, indicates the failure status
157
- self.failure_status: JobFailureStatus = None
77
+ # Server (managed by scheduler)
78
+ self.server: Optional["Server"] = None
158
79
 
159
- # Dependencies
160
- self.dependencies: Set[Dependency] = set() # as target
80
+ @staticmethod
81
+ def has_instance() -> bool:
82
+ """Check if a scheduler instance exists without creating one"""
83
+ return Scheduler._instance is not None
161
84
 
162
- # Watched outputs
163
- self.watched_outputs = {}
164
- for watched in config.__xpm__.watched_outputs:
165
- self.watch_output(watched)
85
+ @staticmethod
86
+ def instance() -> "Scheduler":
87
+ """Get or create the global scheduler instance"""
88
+ if Scheduler._instance is None:
89
+ with Scheduler._lock:
90
+ if Scheduler._instance is None:
91
+ Scheduler._instance = Scheduler._create()
92
+ return Scheduler._instance
166
93
 
167
- # Process
168
- self._process = None
169
- self.unsatisfied = 0
94
+ @staticmethod
95
+ def _create(name: str = "Global"):
96
+ """Internal method to create and start scheduler"""
97
+ instance = Scheduler(name)
98
+ instance.start()
99
+ instance._ready.wait()
100
+ return instance
170
101
 
171
- # Meta-information
172
- self.starttime: Optional[float] = None
173
- self.submittime: Optional[float] = None
174
- self.endtime: Optional[float] = None
175
- self._progress: List[LevelInformation] = []
176
- self.tags = config.tags()
102
+ @staticmethod
103
+ def create(xp: "experiment" = None, name: str = "Global"):
104
+ """Create or get the scheduler instance
177
105
 
178
- def watch_output(self, watched: "WatchedOutput"):
179
- """Monitor task outputs
106
+ Args:
107
+ xp: (Deprecated) Experiment reference, ignored
108
+ name: Name for the scheduler (only used on first creation)
180
109
 
181
- :param watched: A description of the watched output
110
+ Returns:
111
+ The global scheduler instance
182
112
  """
183
- self.scheduler.xp.watch_output(watched)
184
-
185
- def task_output_update(self, subpath: Path):
186
- """Notification of an updated task output"""
187
- if watcher := self.watched_outputs.get(subpath, None):
188
- watcher.update()
189
-
190
- def done_handler(self):
191
- """The task has been completed"""
192
- for watcher in self.watched_outputs.values():
193
- watcher.update()
194
-
195
- def __str__(self):
196
- return "Job[{}]".format(self.identifier)
113
+ return Scheduler.instance()
197
114
 
198
- def wait(self) -> JobState:
199
- assert self._future, "Cannot wait a not submitted job"
200
- return self._future.result()
115
+ def register_experiment(self, xp: "experiment"):
116
+ """Register an experiment with the scheduler"""
117
+ # Use experiment name as key for now
118
+ key = xp.workdir.name
119
+ self.experiments[key] = xp
201
120
 
202
- @cached_property
203
- def python_path(self) -> Iterator[str]:
204
- """Returns an iterator over python path"""
205
- return itertools.chain(self.workspace.python_path)
121
+ logger.debug("Registered experiment %s with scheduler", key)
206
122
 
207
- @cached_property
208
- def environ(self):
209
- """Returns the job environment
123
+ def unregister_experiment(self, xp: "experiment"):
124
+ """Unregister an experiment from the scheduler"""
125
+ key = xp.workdir.name
126
+ if key in self.experiments:
127
+ del self.experiments[key]
128
+ logger.debug("Unregistered experiment %s from scheduler", key)
210
129
 
211
- It is made of (by order of priority):
212
-
213
- 1. The job environment
214
- 1. The launcher environment
215
- 1. The workspace environment
216
-
217
- """
218
- return ChainMap(
219
- {},
220
- self.launcher.environ if self.launcher else {},
221
- self.workspace.env if self.workspace else {},
222
- )
130
+ def start_server(
131
+ self, settings: "ServerSettings" = None, workspace: "Workspace" = None
132
+ ):
133
+ """Start the notification server (if not already running)
223
134
 
224
- @property
225
- def progress(self):
226
- return self._progress
227
-
228
- def set_progress(self, level: int, value: float, desc: Optional[str]):
229
- if value < 0:
230
- logger.warning(f"Progress value out of bounds ({value})")
231
- value = 0
232
- elif value > 1:
233
- logger.warning(f"Progress value out of bounds ({value})")
234
- value = 1
235
-
236
- # Adjust the length of the array
237
- self._progress = self._progress[: (level + 1)]
238
- while len(self._progress) <= level:
239
- self._progress.append(LevelInformation(len(self._progress), None, 0.0))
240
-
241
- if desc:
242
- self._progress[-1].desc = desc
243
- self._progress[-1].progress = value
244
-
245
- for listener in self.scheduler.listeners:
246
- listener.job_state(self)
247
-
248
- def add_notification_server(self, server):
249
- """Adds a notification server"""
250
- key, baseurl = server.getNotificationSpec()
251
- dirpath = self.path / Reporter.NOTIFICATION_FOLDER
252
- dirpath.mkdir(exist_ok=True)
253
- (dirpath / key).write_text(f"{baseurl}/{self.identifier}")
254
-
255
- @property
256
- def ready(self):
257
- return self.state == JobState.READY
258
-
259
- @property
260
- def jobpath(self) -> Path:
261
- """Deprecated, use `path`"""
262
- return self.workspace.jobspath / self.relpath
263
-
264
- @property
265
- def path(self) -> Path:
266
- return self.workspace.jobspath / self.relpath
267
-
268
- @property
269
- def experimaestro_path(self) -> Path:
270
- return (self.path / ".experimaestro").resolve()
271
-
272
- @cached_property
273
- def task_outputs_path(self) -> Path:
274
- return self.experimaestro_path / "task-outputs.jsonl"
275
-
276
- @property
277
- def relpath(self):
278
- identifier = self.config.__xpm__.identifier
279
- base = Path(str(self.type.identifier))
280
- return base / identifier.all.hex()
281
-
282
- @property
283
- def relmainpath(self):
284
- identifier = self.config.__xpm__.identifier
285
- base = Path(str(self.type.identifier))
286
- return base / identifier.main.hex()
287
-
288
- @property
289
- def hashidentifier(self):
290
- return self.config.__xpm__.identifier
291
-
292
- @property
293
- def identifier(self):
294
- return self.config.__xpm__.identifier.all.hex()
295
-
296
- def prepare(self, overwrite=False):
297
- """Prepare all files before starting a task
298
-
299
- :param overwrite: if True, overwrite files even if the task has been run
135
+ Args:
136
+ settings: Server settings
137
+ workspace: Workspace instance (required to get workspace path)
300
138
  """
301
- pass
302
-
303
- async def aio_run(self):
304
- """Actually run the code"""
305
- raise NotImplementedError(f"Method aio_run not implemented in {self.__class__}")
306
-
307
- async def aio_process(self) -> Optional["Process"]:
308
- """Returns the process if it exists"""
309
- raise NotImplementedError("Not implemented")
310
-
311
- @property
312
- def pidpath(self):
313
- """This file contains the file PID"""
314
- return self.jobpath / ("%s.pid" % self.name)
315
-
316
- @property
317
- def lockpath(self):
318
- """This file is used as a lock for running the job"""
319
- return self.workspace.jobspath / self.relmainpath / ("%s.lock" % self.name)
320
-
321
- @property
322
- def donepath(self) -> Path:
323
- """When a job has been successful, this file is written"""
324
- return self.jobpath / ("%s.done" % self.name)
325
-
326
- @property
327
- def failedpath(self):
328
- """When a job has been unsuccessful, this file is written with an error
329
- code inside"""
330
- return self.jobpath / ("%s.failed" % self.name)
331
-
332
- @property
333
- def stdout(self) -> Path:
334
- return self.jobpath / ("%s.out" % self.name)
335
-
336
- @property
337
- def stderr(self) -> Path:
338
- return self.jobpath / ("%s.err" % self.name)
339
-
340
- @property
341
- def basepath(self) -> Path:
342
- return self.jobpath / self.name
343
-
344
- def dependencychanged(self, dependency, oldstatus, status):
345
- """Called when a dependency has changed"""
346
-
347
- def value(s):
348
- return 1 if s == DependencyStatus.OK else 0
139
+ if self.server is None:
140
+ from experimaestro.server import Server
141
+ from experimaestro.scheduler.state_provider import WorkspaceStateProvider
349
142
 
350
- self.unsatisfied -= value(status) - value(oldstatus)
143
+ if workspace is None:
144
+ raise ValueError("workspace parameter is required to start server")
351
145
 
352
- logger.debug("Job %s: unsatisfied %d", self, self.unsatisfied)
353
-
354
- if status == DependencyStatus.FAIL:
355
- # Job completed
356
- if not self.state.finished():
357
- self.state = JobState.ERROR
358
- self.failure_status = JobFailureStatus.DEPENDENCY
359
- self._readyEvent.set()
360
-
361
- if self.unsatisfied == 0:
362
- logger.info("Job %s is ready to run", self)
363
- # We are ready
364
- self.state = JobState.READY
365
- self._readyEvent.set()
366
-
367
- def finalState(self) -> "concurrent.futures.Future[JobState]":
368
- assert self._future is not None
369
- return self._future
370
-
371
-
372
- class JobContext(ConfigWalkContext):
373
- def __init__(self, job: Job):
374
- super().__init__()
375
- self.job = job
376
-
377
- @property
378
- def name(self):
379
- return self.job.name
380
-
381
- @property
382
- def path(self):
383
- return self.job.path
384
-
385
- @property
386
- def task(self):
387
- return self.job.config
388
-
389
-
390
- class Listener:
391
- def job_submitted(self, job):
392
- pass
393
-
394
- def job_state(self, job):
395
- pass
396
-
397
- def service_add(self, service: Service):
398
- """Notify when a new service is added"""
399
- pass
400
-
401
-
402
- class JobError(Exception):
403
- def __init__(self, code):
404
- super().__init__(f"Job exited with code {code}")
405
-
406
-
407
- class SignalHandler:
408
- def __init__(self):
409
- self.experiments: Set["experiment"] = set()
410
- self.original_sigint_handler = None
411
-
412
- def add(self, xp: "experiment"):
413
- if not self.experiments:
414
- self.original_sigint_handler = signal.getsignal(signal.SIGINT)
415
-
416
- signal.signal(signal.SIGINT, self)
417
-
418
- self.experiments.add(xp)
419
-
420
- def remove(self, xp):
421
- self.experiments.remove(xp)
422
- if not self.experiments:
423
- signal.signal(signal.SIGINT, self.original_sigint_handler)
424
-
425
- def __call__(self, signum, frame):
426
- """SIGINT signal handler"""
427
- logger.warning("Signal received")
428
- for xp in self.experiments:
429
- xp.stop()
430
-
431
-
432
- SIGNAL_HANDLER = SignalHandler()
433
-
434
-
435
- class SchedulerCentral(threading.Thread):
436
- loop: asyncio.AbstractEventLoop
437
-
438
- """The event loop thread used by the scheduler"""
146
+ # Get the workspace state provider singleton
147
+ state_provider = WorkspaceStateProvider.get_instance(
148
+ workspace.path, read_only=False, sync_on_start=False
149
+ )
439
150
 
440
- def __init__(self, name: str):
441
- # Daemon thread so it is non blocking
442
- super().__init__(name=f"Scheduler EL ({name})", daemon=True)
151
+ self.server = Server.instance(settings, state_provider)
152
+ self.server.start()
153
+ logger.info("Server started by scheduler")
154
+ else:
155
+ logger.debug("Server already running")
443
156
 
444
- self._ready = threading.Event()
157
+ def stop_server(self):
158
+ """Stop the notification server"""
159
+ if self.server is not None:
160
+ self.server.stop()
161
+ logger.info("Server stopped by scheduler")
445
162
 
446
163
  def run(self):
164
+ """Run the event loop forever"""
447
165
  logger.debug("Starting event loop thread")
166
+ # Ported from SchedulerCentral
448
167
  self.loop = asyncio.new_event_loop()
449
168
  asyncio.set_event_loop(self.loop)
450
-
451
169
  # Set loop-dependent variables
452
170
  self.exitCondition = asyncio.Condition()
453
171
  self.dependencyLock = asyncio.Lock()
454
172
 
455
- # Start the event loop
173
+ # Note: State provider removed - now managed at workspace level
174
+ # Each experiment has its own workspace with database
175
+
456
176
  self._ready.set()
457
177
  self.loop.run_forever()
458
178
 
459
- @staticmethod
460
- def create(name: str):
461
- instance = SchedulerCentral(name)
462
- instance.start()
463
- instance._ready.wait()
464
- return instance
465
-
466
-
467
- class Scheduler:
468
- """A job scheduler
469
-
470
- The scheduler is based on asyncio for easy concurrency handling
471
- """
472
-
473
- def __init__(self, xp: "experiment", name: str):
474
- # Name of the experiment
475
- self.name = name
476
- self.xp = xp
477
-
478
- # Exit mode activated
479
- self.exitmode = False
480
-
481
- # List of all jobs
482
- self.jobs: Dict[str, "Job"] = {}
483
-
484
- # List of jobs
485
- self.waitingjobs: Set[Job] = set()
486
-
487
- # Listeners
488
- self.listeners: Set[Listener] = set()
489
-
490
- @property
491
- def loop(self):
492
- return self.xp.loop
179
+ def start_scheduler(self):
180
+ """Start the scheduler event loop in a thread"""
181
+ if not self.is_alive():
182
+ self.start()
183
+ self._ready.wait()
184
+ else:
185
+ logger.warning("Scheduler already started")
493
186
 
494
187
  def addlistener(self, listener: Listener):
495
- self.listeners.add(listener)
188
+ with self._listeners_lock:
189
+ self._listeners.add(listener)
496
190
 
497
191
  def removelistener(self, listener: Listener):
498
- self.listeners.remove(listener)
192
+ with self._listeners_lock:
193
+ self._listeners.discard(listener)
194
+
195
+ def clear_listeners(self):
196
+ """Clear all listeners (for testing purposes)"""
197
+ with self._listeners_lock:
198
+ self._listeners.clear()
499
199
 
500
200
  def getJobState(self, job: Job) -> "concurrent.futures.Future[JobState]":
201
+ # Check if the job belongs to this scheduler
202
+ if job.identifier not in self.jobs:
203
+ # If job is not in this scheduler, return its current state directly
204
+ future = concurrent.futures.Future()
205
+ future.set_result(job.state)
206
+ return future
207
+
501
208
  return asyncio.run_coroutine_threadsafe(self.aio_getjobstate(job), self.loop)
502
209
 
503
210
  async def aio_getjobstate(self, job: Job):
@@ -505,17 +212,25 @@ class Scheduler:
505
212
 
506
213
  def submit(self, job: Job) -> Optional[Job]:
507
214
  # Wait for the future containing the submitted job
508
- logger.debug("Registering the job %s within the scheduler", job)
215
+ logger.debug("Submit job %s to the scheduler", job)
509
216
  otherFuture = asyncio.run_coroutine_threadsafe(
510
217
  self.aio_registerJob(job), self.loop
511
218
  )
512
219
  other = otherFuture.result()
513
220
  logger.debug("Job already submitted" if other else "First submission")
514
- if other:
515
- return other
221
+
222
+ # Only returns if job was already submitted and doesn't need reprocessing
223
+ if other is not None:
224
+ # If state is WAITING, it was just reset for resubmission and needs processing
225
+ # If state is RUNNING or finished (DONE), no need to reprocess
226
+ if other.state != JobState.WAITING:
227
+ return other
228
+ # Use 'other' for resubmission since it has the correct experiments list
229
+ job = other
516
230
 
517
231
  job._future = asyncio.run_coroutine_threadsafe(self.aio_submit(job), self.loop)
518
- return None
232
+
233
+ return other
519
234
 
520
235
  def prepare(self, job: Job):
521
236
  """Prepares the job for running"""
@@ -530,33 +245,99 @@ class Scheduler:
530
245
 
531
246
  if self.exitmode:
532
247
  logger.warning("Exit mode: not submitting")
248
+ return
533
249
 
534
- elif job.identifier in self.jobs:
250
+ # Job was already submitted
251
+ if job.identifier in self.jobs:
535
252
  other = self.jobs[job.identifier]
536
253
  assert job.type == other.type
537
- if other.state == JobState.ERROR:
254
+
255
+ # Add current experiment to the existing job's experiments list
256
+ xp = experiment.current()
257
+ xp.add_job(other)
258
+
259
+ # Copy watched outputs from new job to existing job
260
+ # This ensures new callbacks are registered even for resubmitted jobs
261
+ other.watched_outputs.extend(job.watched_outputs)
262
+
263
+ if other.state.is_error():
538
264
  logger.info("Re-submitting job")
265
+ # Clean up old process info so it will be re-started
266
+ other._process = None
267
+ if other.pidpath.is_file():
268
+ other.pidpath.unlink()
269
+ # Use set_state to handle experiment statistics updates
270
+ other.set_state(JobState.WAITING)
271
+ self.notify_job_state(other) # Notify listeners of re-submit
539
272
  else:
540
273
  logger.warning("Job %s already submitted", job.identifier)
541
- return other
542
274
 
543
- else:
544
- # Register this job
545
- self.xp.unfinishedJobs += 1
546
- self.jobs[job.identifier] = job
275
+ # Returns the previous job
276
+ return other
277
+
278
+ # Register this job
279
+ xp = experiment.current()
280
+ self.jobs[job.identifier] = job
281
+ # Set submittime now so that add_job can record it in the database
282
+ # (aio_submit may update this later for re-submitted jobs)
283
+ job.submittime = time.time()
284
+ xp.add_job(job)
285
+
286
+ # Set up dependencies
287
+ for dependency in job.dependencies:
288
+ dependency.target = job
289
+ dependency.origin.dependents.add(dependency)
547
290
 
548
291
  return None
549
292
 
550
- async def aio_submit(self, job: Job) -> JobState: # noqa: C901
293
+ def _notify_listeners(self, notification_func, job: Job):
294
+ """Execute notification in thread pool with error isolation.
295
+
296
+ This runs notifications in a dedicated thread pool to avoid blocking
297
+ the scheduler and to isolate errors from affecting other listeners.
298
+ """
299
+
300
+ def _do_notify():
301
+ # Get a snapshot of listeners with the lock
302
+ with self._listeners_lock:
303
+ listeners_snapshot = list(self._listeners)
304
+
305
+ for listener in listeners_snapshot:
306
+ try:
307
+ notification_func(listener, job)
308
+ except Exception:
309
+ logger.exception("Got an error with listener %s", listener)
310
+
311
+ self._notification_executor.submit(_do_notify)
312
+
313
+ def notify_job_submitted(self, job: Job):
314
+ """Notify the listeners that a job has been submitted"""
315
+ self._notify_listeners(lambda lst, j: lst.job_submitted(j), job)
316
+
317
+ def notify_job_state(self, job: Job):
318
+ """Notify the listeners that a job has changed state"""
319
+ self._notify_listeners(lambda lst, j: lst.job_state(j), job)
320
+
321
+ def notify_service_add(self, service: Service):
322
+ """Notify the listeners that a service has been added"""
323
+ self._notify_listeners(lambda lst, s: lst.service_add(s), service)
324
+
325
+ async def aio_submit(self, job: Job) -> JobState:
551
326
  """Main scheduler function: submit a job, run it (if needed), and returns
552
327
  the status code
553
328
  """
329
+ from experimaestro.scheduler.jobs import JobStateError, JobFailureStatus
330
+
554
331
  logger.info("Submitting job %s", job)
555
- job._readyEvent = asyncio.Event()
556
332
  job.submittime = time.time()
557
333
  job.scheduler = self
558
334
  self.waitingjobs.add(job)
559
335
 
336
+ # Register watched outputs now that the job has a scheduler
337
+ job.register_watched_outputs()
338
+
339
+ # Note: Job metadata will be written after directory is created in aio_start
340
+
560
341
  # Check that we don't have a completed job in
561
342
  # alternate directories
562
343
  for jobspath in experiment.current().alt_jobspaths:
@@ -570,560 +351,324 @@ class Scheduler:
570
351
  path.unlink()
571
352
  path.symlink_to(job.path)
572
353
 
573
- job.state = JobState.WAITING
574
- for listener in self.listeners:
575
- try:
576
- listener.job_submitted(job)
577
- except Exception:
578
- logger.exception("Got an error with listener %s", listener)
579
-
580
- # Add dependencies, and add to blocking resources
581
- if job.dependencies:
582
- job.unsatisfied = len(job.dependencies)
583
-
584
- for dependency in job.dependencies:
585
- dependency.target = job
586
- dependency.loop = self.loop
587
- dependency.origin.dependents.add(dependency)
588
- dependency.check()
589
- else:
590
- job._readyEvent.set()
591
- job.state = JobState.READY
354
+ job.set_state(JobState.WAITING)
355
+ self.notify_job_submitted(job)
592
356
 
357
+ # Check if already done
593
358
  if job.donepath.exists():
594
- job.state = JobState.DONE
359
+ job.set_state(JobState.DONE)
360
+ self.notify_job_state(job) # Notify listeners of done state
595
361
 
596
362
  # Check if we have a running process
597
- process = await job.aio_process()
598
- if process is not None:
599
- # Yep! First we notify the listeners
600
- job.state = JobState.RUNNING
601
- for listener in self.listeners:
602
- try:
603
- listener.job_state(job)
604
- except Exception:
605
- logger.exception("Got an error with listener %s", listener)
606
-
607
- # Adds to the listeners
608
- if self.xp.server is not None:
609
- job.add_notification_server(self.xp.server)
610
-
611
- # And now, we wait...
612
- logger.info("Got a process for job %s - waiting to complete", job)
613
- code = await process.aio_code()
614
- logger.info("Job %s completed with code %s", job, code)
615
- job.state = JobState.DONE if code == 0 else JobState.ERROR
616
-
617
- # Check if done
618
- if job.donepath.exists():
619
- job.state = JobState.DONE
620
-
621
- # OK, not done; let's start the job for real
622
- while not job.state.finished():
623
- # Wait that the job is ready
624
- await job._readyEvent.wait()
625
- job._readyEvent.clear()
626
-
627
- if job.state == JobState.READY:
628
- try:
629
- state = await self.aio_start(job)
630
- except Exception:
631
- logger.exception("Got an exception while starting the job")
632
- raise
363
+ if not job.state.finished():
364
+ process = await job.aio_process()
365
+ if process is not None:
366
+ # Notify listeners that job is running
367
+ job.set_state(JobState.RUNNING)
368
+ self.notify_job_state(job)
369
+
370
+ # Adds to the listeners
371
+ if self.server is not None:
372
+ job.add_notification_server(self.server)
373
+
374
+ # And now, we wait...
375
+ logger.info("Got a process for job %s - waiting to complete", job)
376
+ code = await process.aio_code()
377
+ logger.info("Job %s completed with code %s", job, code)
378
+
379
+ # Record exit code if available
380
+ if code is not None:
381
+ job.exit_code = code
382
+
383
+ # Read state from .done/.failed files (contains detailed failure reason)
384
+ state = JobState.from_path(job.path, job.name)
385
+
386
+ # If state is a generic FAILED error, let the process determine
387
+ # the state (it may detect launcher-specific failures like SLURM timeout)
388
+ if (
389
+ state is not None
390
+ and isinstance(state, JobStateError)
391
+ and state.failure_reason == JobFailureStatus.FAILED
392
+ and code is not None
393
+ ):
394
+ process_state = process.get_job_state(code)
395
+ if (
396
+ isinstance(process_state, JobStateError)
397
+ and process_state.failure_reason != JobFailureStatus.FAILED
398
+ ):
399
+ # Process detected a more specific failure reason
400
+ state = process_state
633
401
 
634
402
  if state is None:
635
- # State is None if this is not the main thread
636
- return JobState.ERROR
637
-
638
- job.state = state
639
-
640
- for listener in self.listeners:
403
+ if code is not None:
404
+ # Fall back to process-specific state detection
405
+ state = process.get_job_state(code)
406
+ else:
407
+ logger.error("No .done or .failed file found for job %s", job)
408
+ state = JobState.ERROR
409
+ # Set endtime before set_state so database gets the timestamp
410
+ job.endtime = time.time()
411
+ job.set_state(state)
412
+ self.notify_job_state(job) # Notify listeners of final state
413
+
414
+ # If not done or running, start the job
415
+ if not job.state.finished():
641
416
  try:
642
- listener.job_state(job)
643
- except Exception as e:
644
- logger.exception("Listener %s did raise an exception", e)
645
-
646
- # Job is finished
647
- if job.state != JobState.DONE:
648
- self.xp.failedJobs[job.identifier] = job
417
+ state = await self.aio_start(job)
418
+ # Set endtime before set_state so database gets the timestamp
419
+ job.endtime = time.time()
420
+ job.set_state(state)
421
+ except Exception:
422
+ logger.exception("Got an exception while starting the job")
423
+ raise
649
424
 
650
- # Process all remaining tasks outputs
651
- await asyncThreadcheck("End of job processing", job.done_handler)
425
+ # Job is finished - experiment statistics already updated by set_state
652
426
 
653
- # Decrement the number of unfinished jobs and notify
654
- self.xp.unfinishedJobs -= 1
655
- async with self.xp.central.exitCondition:
656
- logging.debug("Updated number of unfinished jobs")
657
- self.xp.central.exitCondition.notify_all()
427
+ # Write final metadata with end time and final state
428
+ job.write_metadata()
658
429
 
659
- job.endtime = time.time()
660
430
  if job in self.waitingjobs:
661
431
  self.waitingjobs.remove(job)
662
432
 
663
- with job.dependents as dependents:
664
- logger.info("Processing %d dependent jobs", len(dependents))
665
- for dependency in dependents:
666
- logger.debug("Checking dependency %s", dependency)
667
- self.loop.call_soon(dependency.check)
433
+ # Process all remaining task outputs BEFORE notifying exit condition
434
+ # This ensures taskOutputQueueSize is updated before wait() can check it,
435
+ # preventing a race where wait() sees both unfinishedJobs==0 and
436
+ # taskOutputQueueSize==0 before callbacks have been queued.
437
+ await asyncThreadcheck("End of job processing", job.done_handler)
438
+
439
+ # Now notify - wait() will see the correct taskOutputQueueSize
440
+ async with self.exitCondition:
441
+ self.exitCondition.notify_all()
668
442
 
669
443
  return job.state
670
444
 
671
- async def aio_start(self, job: Job) -> Optional[JobState]:
672
- """Start a job
445
async def aio_start(self, job: Job) -> Optional[JobState]:  # noqa: C901
    """Start a job: wait for dependencies, lock, run, and compute its state.

    The following steps are repeated for as long as a resumable job keeps
    ending with a ``TIMEOUT`` failure and has retries left:

    1. Await every static dependency (``is_dynamic()`` is false).
    2. Take the job lock (``job.lockpath``) through the launcher connector.
    3. Acquire all dynamic dependencies (see
       ``_aio_lock_dynamic_dependencies``).
    4. Record the start time, create the job directory, write the job
       metadata and register the notification server (if any).
    5. Run the job (``job.aio_run()``) and wait for its process to end
       while the locks are still held.
    6. Compute the final state (see ``_resolve_final_state``).

    Listeners are notified (``notify_job_state``) only when the final
    ``return state`` is reached, not on the early returns listed below.

    :param job: The job to start; ``job.launcher`` must not be ``None``
    :return:
        - ``JobStateError(JobFailureStatus.DEPENDENCY)`` if awaiting a static
          dependency raised ``RuntimeError``;
        - ``JobState.WAITING`` if anything raised while acquiring dynamic
          dependencies or preparing the job directory;
        - ``JobState.ERROR`` if ``job.aio_run()`` raised, or if waiting for
          the process / resolving its state raised;
        - otherwise the state computed by ``_resolve_final_state``.
    :raises AssertionError: if ``job.launcher`` is ``None``
    """
    from experimaestro.locking import Locks
    from experimaestro.scheduler.jobs import JobFailureStatus, JobStateError

    # Assert preconditions
    assert job.launcher is not None

    # Restart loop for resumable tasks that timeout
    while True:
        logger.debug(
            "Starting job %s with %d dependencies",
            job,
            len(job.dependencies),
        )

        # Separate static and dynamic dependencies
        static_deps = [d for d in job.dependencies if not d.is_dynamic()]
        dynamic_deps = [d for d in job.dependencies if d.is_dynamic()]

        # First, wait for all static dependencies (jobs) to complete.
        # These don't need the dependency lock as they can't change state,
        # and their locks don't need to be added to the locks list.
        logger.debug("Waiting for %d static dependencies", len(static_deps))
        for dependency in static_deps:
            logger.debug("Waiting for static dependency %s", dependency)
            try:
                await dependency.aio_lock()
            except RuntimeError as e:
                # Dependency failed - mark job as failed due to dependency
                logger.warning("Dependency failed: %s", e)
                return JobStateError(JobFailureStatus.DEPENDENCY)

        # We first lock the job before proceeding
        with Locks() as locks:
            logger.debug("[starting] Locking job %s", job)
            async with job.launcher.connector.lock(job.lockpath):
                logger.debug("[starting] Locked job %s", job)

                state = None
                try:
                    # Now handle dynamic dependencies (tokens)
                    if dynamic_deps:
                        await self._aio_lock_dynamic_dependencies(
                            dynamic_deps, locks
                        )

                    # Dependencies have been locked, we can start the job
                    job.starttime = time.time()

                    # Creates the main directory
                    directory = job.path
                    logger.debug("Making directories job %s...", directory)

                    # Warn about directory cleanup for non-resumable tasks
                    # (only once per task type)
                    xpmtype = job.config.__xpmtype__
                    if (
                        directory.is_dir()
                        and not job.resumable
                        and not xpmtype.warned_clean_not_resumable
                    ):
                        xpmtype.warned_clean_not_resumable = True
                        logger.warning(
                            "In a future version, directory will be cleaned up for "
                            "non-resumable tasks (%s). Use ResumableTask if you want "
                            "to preserve the directory contents.",
                            xpmtype.identifier,
                        )

                    if not directory.is_dir():
                        directory.mkdir(parents=True, exist_ok=True)

                    # Write metadata with submit and start time
                    # (after directory creation)
                    job.write_metadata()

                    # Sets up the notification URL
                    if self.server is not None:
                        job.add_notification_server(self.server)

                except Exception:
                    logger.warning("Error while locking job", exc_info=True)
                    return JobState.WAITING

                try:
                    # Runs the job
                    process = await job.aio_run()
                except Exception:
                    logger.warning("Error while starting job", exc_info=True)
                    return JobState.ERROR

                # Wait for job to complete while holding locks
                try:
                    logger.debug("Waiting for job %s process to end", job)

                    code = await process.aio_code()
                    logger.debug("Got return code %s for %s", code, job)

                    # Record exit code if available
                    if code is not None:
                        logger.info("Job %s ended with code %s", job, code)
                        job.exit_code = code
                    else:
                        logger.info("Job %s ended, reading state from files", job)

                    state = self._resolve_final_state(job, process, code)

                except JobError:
                    logger.warning("Error while running job")
                    state = JobState.ERROR

                except Exception:
                    logger.warning(
                        "Error while running job (in experimaestro)", exc_info=True
                    )
                    state = JobState.ERROR

        # Locks are released here after job completes

        # Check if we should restart a resumable task that timed out
        if (
            isinstance(state, JobStateError)
            and state.failure_reason == JobFailureStatus.TIMEOUT
            and job.resumable
        ):
            job.retry_count += 1
            if job.retry_count <= job.max_retries:
                logger.info(
                    "Resumable task %s timed out - restarting (attempt %d/%d)",
                    job,
                    job.retry_count,
                    job.max_retries,
                )
                # Rotate log files to preserve previous run's logs
                job.rotate_logs()
                # Clear cached process so aio_run() will create a new one
                job._process = None
                # Delete PID file so the job will be resubmitted
                if job.pidpath.exists():
                    job.pidpath.unlink()
                # Restart from the top of the loop
                continue

            logger.warning(
                "Resumable task %s exceeded max retries (%d), marking as failed",
                job,
                job.max_retries,
            )
            # Fall through to return the error state

        # Job finished (success or non-recoverable error)
        # Notify scheduler listeners of job state after job completes
        self.notify_job_state(job)
        return state

async def _aio_lock_dynamic_dependencies(self, dynamic_deps, locks) -> None:
    """Acquire all dynamic dependencies (tokens), all-or-nothing, with retries.

    Runs under ``self.dependencyLock`` so that only one task at a time
    acquires dynamic dependencies; this prevents deadlocks such as task A
    holding Token1 while waiting for Token2 and task B holding Token2 while
    waiting for Token1.

    On a ``LockError`` every lock held so far is released, the dependency
    that failed is moved to the front of ``dynamic_deps`` and the whole
    acquisition restarts. The method returns only once every dependency has
    been locked; any other exception propagates to the caller.

    :param dynamic_deps: Dynamic dependencies to lock (reordered in place)
    :param locks: Lock collection; acquired locks are appended to it
    """
    from experimaestro.locking import LockError

    async with self.dependencyLock:
        logger.debug(
            "Locking %d dynamic dependencies (tokens)",
            len(dynamic_deps),
        )
        while True:
            for idx, dependency in enumerate(dynamic_deps):
                try:
                    # Use timeout=0 for first dependency, 0.1s for subsequent
                    timeout = 0 if idx == 0 else 0.1
                    # Acquire the lock (this might block on IPC locks)
                    lock = await dependency.aio_lock(timeout=timeout)
                    locks.append(lock)
                except LockError:
                    logger.info(
                        "Could not lock %s, retrying",
                        dependency,
                    )
                    # Release all locks and restart
                    for held in locks.locks:
                        held.release()
                    locks.locks.clear()
                    # Put failed dependency first
                    dynamic_deps.remove(dependency)
                    dynamic_deps.insert(0, dependency)
                    break
            else:
                # No break: all locks acquired successfully
                return

def _resolve_final_state(self, job, process, code):
    """Compute the final state of a job whose process has ended.

    The state is first read with ``JobState.from_path`` (per the original
    comments, from the job's .done/.failed files). The process is then
    consulted through ``process.get_job_state(code)`` in two cases, both
    requiring an exit code:

    - the state read is a generic ``FAILED`` error: the process state
      replaces it if it carries a more specific failure reason (e.g. a
      launcher-specific failure such as a SLURM timeout);
    - no state could be read: the process state is used as is.

    Without a state and without an exit code, an error is logged and
    ``JobState.ERROR`` is returned.

    :param job: The job that ended
    :param process: The process object that ran the job
    :param code: The exit code, or ``None`` if it is not available
    :return: The resolved job state
    """
    from experimaestro.scheduler.jobs import JobFailureStatus, JobStateError

    # Read state from .done/.failed files (contains detailed failure reason)
    state = JobState.from_path(job.path, job.name)

    # If state is a generic FAILED error, let the process determine the
    # state (it may detect launcher-specific failures like SLURM timeout)
    if (
        isinstance(state, JobStateError)
        and state.failure_reason == JobFailureStatus.FAILED
        and code is not None
    ):
        process_state = process.get_job_state(code)
        if (
            isinstance(process_state, JobStateError)
            and process_state.failure_reason != JobFailureStatus.FAILED
        ):
            # Process detected a more specific failure reason
            state = process_state

    if state is None:
        if code is not None:
            # Fall back to process-specific state detection
            state = process.get_job_state(code)
        else:
            logger.error("No .done or .failed file found for job %s", job)
            state = JobState.ERROR

    return state