experimaestro-2.0.0a8-py3-none-any.whl → experimaestro-2.0.0b8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of experimaestro has been flagged as potentially problematic.

Files changed (122)
  1. experimaestro/__init__.py +10 -11
  2. experimaestro/annotations.py +167 -206
  3. experimaestro/cli/__init__.py +278 -7
  4. experimaestro/cli/filter.py +42 -74
  5. experimaestro/cli/jobs.py +157 -106
  6. experimaestro/cli/refactor.py +249 -0
  7. experimaestro/click.py +0 -1
  8. experimaestro/commandline.py +19 -3
  9. experimaestro/connectors/__init__.py +20 -1
  10. experimaestro/connectors/local.py +12 -0
  11. experimaestro/core/arguments.py +182 -46
  12. experimaestro/core/identifier.py +107 -6
  13. experimaestro/core/objects/__init__.py +6 -0
  14. experimaestro/core/objects/config.py +542 -25
  15. experimaestro/core/objects/config_walk.py +20 -0
  16. experimaestro/core/serialization.py +91 -34
  17. experimaestro/core/subparameters.py +164 -0
  18. experimaestro/core/types.py +175 -38
  19. experimaestro/exceptions.py +26 -0
  20. experimaestro/experiments/cli.py +111 -25
  21. experimaestro/generators.py +50 -9
  22. experimaestro/huggingface.py +3 -1
  23. experimaestro/launcherfinder/parser.py +29 -0
  24. experimaestro/launchers/__init__.py +26 -1
  25. experimaestro/launchers/direct.py +12 -0
  26. experimaestro/launchers/slurm/base.py +154 -2
  27. experimaestro/mkdocs/metaloader.py +0 -1
  28. experimaestro/mypy.py +452 -7
  29. experimaestro/notifications.py +63 -13
  30. experimaestro/progress.py +0 -2
  31. experimaestro/rpyc.py +0 -1
  32. experimaestro/run.py +19 -6
  33. experimaestro/scheduler/base.py +510 -125
  34. experimaestro/scheduler/dependencies.py +43 -28
  35. experimaestro/scheduler/dynamic_outputs.py +259 -130
  36. experimaestro/scheduler/experiment.py +256 -31
  37. experimaestro/scheduler/interfaces.py +501 -0
  38. experimaestro/scheduler/jobs.py +216 -206
  39. experimaestro/scheduler/remote/__init__.py +31 -0
  40. experimaestro/scheduler/remote/client.py +874 -0
  41. experimaestro/scheduler/remote/protocol.py +467 -0
  42. experimaestro/scheduler/remote/server.py +423 -0
  43. experimaestro/scheduler/remote/sync.py +144 -0
  44. experimaestro/scheduler/services.py +323 -23
  45. experimaestro/scheduler/state_db.py +437 -0
  46. experimaestro/scheduler/state_provider.py +2766 -0
  47. experimaestro/scheduler/state_sync.py +891 -0
  48. experimaestro/scheduler/workspace.py +52 -10
  49. experimaestro/scriptbuilder.py +7 -0
  50. experimaestro/server/__init__.py +147 -57
  51. experimaestro/server/data/index.css +0 -125
  52. experimaestro/server/data/index.css.map +1 -1
  53. experimaestro/server/data/index.js +194 -58
  54. experimaestro/server/data/index.js.map +1 -1
  55. experimaestro/settings.py +44 -5
  56. experimaestro/sphinx/__init__.py +3 -3
  57. experimaestro/taskglobals.py +20 -0
  58. experimaestro/tests/conftest.py +80 -0
  59. experimaestro/tests/core/test_generics.py +2 -2
  60. experimaestro/tests/identifier_stability.json +45 -0
  61. experimaestro/tests/launchers/bin/sacct +6 -2
  62. experimaestro/tests/launchers/bin/sbatch +4 -2
  63. experimaestro/tests/launchers/test_slurm.py +80 -0
  64. experimaestro/tests/tasks/test_dynamic.py +231 -0
  65. experimaestro/tests/test_cli_jobs.py +615 -0
  66. experimaestro/tests/test_deprecated.py +630 -0
  67. experimaestro/tests/test_environment.py +200 -0
  68. experimaestro/tests/test_file_progress_integration.py +1 -1
  69. experimaestro/tests/test_forward.py +3 -3
  70. experimaestro/tests/test_identifier.py +372 -41
  71. experimaestro/tests/test_identifier_stability.py +458 -0
  72. experimaestro/tests/test_instance.py +3 -3
  73. experimaestro/tests/test_multitoken.py +442 -0
  74. experimaestro/tests/test_mypy.py +433 -0
  75. experimaestro/tests/test_objects.py +312 -5
  76. experimaestro/tests/test_outputs.py +2 -2
  77. experimaestro/tests/test_param.py +8 -12
  78. experimaestro/tests/test_partial_paths.py +231 -0
  79. experimaestro/tests/test_progress.py +0 -48
  80. experimaestro/tests/test_remote_state.py +671 -0
  81. experimaestro/tests/test_resumable_task.py +480 -0
  82. experimaestro/tests/test_serializers.py +141 -1
  83. experimaestro/tests/test_state_db.py +434 -0
  84. experimaestro/tests/test_subparameters.py +160 -0
  85. experimaestro/tests/test_tags.py +136 -0
  86. experimaestro/tests/test_tasks.py +107 -121
  87. experimaestro/tests/test_token_locking.py +252 -0
  88. experimaestro/tests/test_tokens.py +17 -13
  89. experimaestro/tests/test_types.py +123 -1
  90. experimaestro/tests/test_workspace_triggers.py +158 -0
  91. experimaestro/tests/token_reschedule.py +4 -2
  92. experimaestro/tests/utils.py +2 -2
  93. experimaestro/tokens.py +154 -57
  94. experimaestro/tools/diff.py +1 -1
  95. experimaestro/tui/__init__.py +8 -0
  96. experimaestro/tui/app.py +2395 -0
  97. experimaestro/tui/app.tcss +353 -0
  98. experimaestro/tui/log_viewer.py +228 -0
  99. experimaestro/utils/__init__.py +23 -0
  100. experimaestro/utils/environment.py +148 -0
  101. experimaestro/utils/git.py +129 -0
  102. experimaestro/utils/resources.py +1 -1
  103. experimaestro/version.py +34 -0
  104. {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/METADATA +68 -38
  105. experimaestro-2.0.0b8.dist-info/RECORD +187 -0
  106. {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/WHEEL +1 -1
  107. experimaestro-2.0.0b8.dist-info/entry_points.txt +16 -0
  108. experimaestro/compat.py +0 -6
  109. experimaestro/core/objects.pyi +0 -221
  110. experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
  111. experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
  112. experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
  113. experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
  114. experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
  115. experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
  116. experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
  117. experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
  118. experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
  119. experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
  120. experimaestro-2.0.0a8.dist-info/RECORD +0 -166
  121. experimaestro-2.0.0a8.dist-info/entry_points.txt +0 -17
  122. {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/licenses/LICENSE +0 -0
@@ -1,15 +1,16 @@
-import logging
 import threading
 import time
 from typing import (
     Optional,
     Set,
+    ClassVar,
+    TYPE_CHECKING,
 )
 import asyncio
 from typing import Dict
 
 from experimaestro.scheduler import experiment
-from experimaestro.scheduler.jobs import Job, JobState
+from experimaestro.scheduler.jobs import Job, JobState, JobError
 from experimaestro.scheduler.services import Service
 
 
@@ -17,6 +18,11 @@ from experimaestro.utils import logger
 from experimaestro.utils.asyncio import asyncThreadcheck
 import concurrent.futures
 
+if TYPE_CHECKING:
+    from experimaestro.server import Server
+    from experimaestro.settings import ServerSettings
+    from experimaestro.scheduler.workspace import Workspace
+
 
 class Listener:
     def job_submitted(self, job):
@@ -31,18 +37,24 @@ class Listener:
 
 
 class Scheduler(threading.Thread):
-    """A job scheduler
+    """A job scheduler (singleton)
 
-    The scheduler is based on asyncio for easy concurrency handling
+    The scheduler is based on asyncio for easy concurrency handling.
+    This is a singleton - only one scheduler instance exists per process.
     """
 
-    def __init__(self, xp: "experiment", name: str):
+    _instance: ClassVar[Optional["Scheduler"]] = None
+    _lock: ClassVar[threading.Lock] = threading.Lock()
+
+    def __init__(self, name: str = "Global"):
         super().__init__(name=f"Scheduler ({name})", daemon=True)
         self._ready = threading.Event()
 
-        # Name of the experiment
+        # Name of the scheduler
         self.name = name
-        self.xp = xp
+
+        # Track experiments (simple dict for now)
+        self.experiments: Dict[str, "experiment"] = {}
 
         # Exit mode activated
         self.exitmode = False
@@ -53,16 +65,101 @@ class Scheduler(threading.Thread):
         # List of jobs
         self.waitingjobs: Set[Job] = set()
 
-        # Listeners
-        self.listeners: Set[Listener] = set()
+        # Listeners with thread-safe access
+        self._listeners: Set[Listener] = set()
+        self._listeners_lock = threading.Lock()
+
+        # Notification thread pool (single worker to serialize notifications)
+        self._notification_executor = concurrent.futures.ThreadPoolExecutor(
+            max_workers=1, thread_name_prefix="NotificationWorker"
+        )
+
+        # Server (managed by scheduler)
+        self.server: Optional["Server"] = None
 
     @staticmethod
-    def create(xp: "experiment", name: str):
-        instance = Scheduler(xp, name)
+    def has_instance() -> bool:
+        """Check if a scheduler instance exists without creating one"""
+        return Scheduler._instance is not None
+
+    @staticmethod
+    def instance() -> "Scheduler":
+        """Get or create the global scheduler instance"""
+        if Scheduler._instance is None:
+            with Scheduler._lock:
+                if Scheduler._instance is None:
+                    Scheduler._instance = Scheduler._create()
+        return Scheduler._instance
+
+    @staticmethod
+    def _create(name: str = "Global"):
+        """Internal method to create and start scheduler"""
+        instance = Scheduler(name)
         instance.start()
         instance._ready.wait()
         return instance
 
+    @staticmethod
+    def create(xp: "experiment" = None, name: str = "Global"):
+        """Create or get the scheduler instance
+
+        Args:
+            xp: (Deprecated) Experiment reference, ignored
+            name: Name for the scheduler (only used on first creation)
+
+        Returns:
+            The global scheduler instance
+        """
+        return Scheduler.instance()
+
+    def register_experiment(self, xp: "experiment"):
+        """Register an experiment with the scheduler"""
+        # Use experiment name as key for now
+        key = xp.workdir.name
+        self.experiments[key] = xp
+
+        logger.debug("Registered experiment %s with scheduler", key)
+
+    def unregister_experiment(self, xp: "experiment"):
+        """Unregister an experiment from the scheduler"""
+        key = xp.workdir.name
+        if key in self.experiments:
+            del self.experiments[key]
+            logger.debug("Unregistered experiment %s from scheduler", key)
+
+    def start_server(
+        self, settings: "ServerSettings" = None, workspace: "Workspace" = None
+    ):
+        """Start the notification server (if not already running)
+
+        Args:
+            settings: Server settings
+            workspace: Workspace instance (required to get workspace path)
+        """
+        if self.server is None:
+            from experimaestro.server import Server
+            from experimaestro.scheduler.state_provider import WorkspaceStateProvider
+
+            if workspace is None:
+                raise ValueError("workspace parameter is required to start server")
+
+            # Get the workspace state provider singleton
+            state_provider = WorkspaceStateProvider.get_instance(
+                workspace.path, read_only=False, sync_on_start=False
+            )
+
+            self.server = Server.instance(settings, state_provider)
+            self.server.start()
+            logger.info("Server started by scheduler")
+        else:
+            logger.debug("Server already running")
+
+    def stop_server(self):
+        """Stop the notification server"""
+        if self.server is not None:
+            self.server.stop()
+            logger.info("Server stopped by scheduler")
+
     def run(self):
         """Run the event loop forever"""
         logger.debug("Starting event loop thread")
@@ -72,6 +169,10 @@ class Scheduler(threading.Thread):
         # Set loop-dependent variables
         self.exitCondition = asyncio.Condition()
         self.dependencyLock = asyncio.Lock()
+
+        # Note: State provider removed - now managed at workspace level
+        # Each experiment has its own workspace with database
+
         self._ready.set()
         self.loop.run_forever()
 
@@ -84,10 +185,38 @@ class Scheduler(threading.Thread):
             logger.warning("Scheduler already started")
 
     def addlistener(self, listener: Listener):
-        self.listeners.add(listener)
+        with self._listeners_lock:
+            self._listeners.add(listener)
 
     def removelistener(self, listener: Listener):
-        self.listeners.remove(listener)
+        with self._listeners_lock:
+            self._listeners.discard(listener)
+
+    def clear_listeners(self):
+        """Clear all listeners (for testing purposes)"""
+        with self._listeners_lock:
+            self._listeners.clear()
+
+    def wait_for_notifications(self, timeout: float = 5.0) -> bool:
+        """Wait for all pending notifications to be processed.
+
+        This submits a sentinel task and waits for it to complete,
+        ensuring all previously submitted notifications have been processed.
+
+        Args:
+            timeout: Maximum time to wait in seconds
+
+        Returns:
+            True if all notifications were processed, False if timeout occurred
+        """
+        try:
+            # Submit a no-op and wait for it to complete
+            future = self._notification_executor.submit(lambda: None)
+            future.result(timeout=timeout)
+            return True
+        except concurrent.futures.TimeoutError:
+            logger.warning("Timeout waiting for notification queue to drain")
+            return False
 
     def getJobState(self, job: Job) -> "concurrent.futures.Future[JobState]":
         # Check if the job belongs to this scheduler
@@ -104,17 +233,25 @@ class Scheduler(threading.Thread):
 
     def submit(self, job: Job) -> Optional[Job]:
         # Wait for the future containing the submitted job
-        logger.debug("Registering the job %s within the scheduler", job)
+        logger.debug("Submit job %s to the scheduler", job)
         otherFuture = asyncio.run_coroutine_threadsafe(
             self.aio_registerJob(job), self.loop
         )
         other = otherFuture.result()
         logger.debug("Job already submitted" if other else "First submission")
-        if other:
-            return other
+
+        # Only returns if job was already submitted and doesn't need reprocessing
+        if other is not None:
+            # If state is WAITING, it was just reset for resubmission and needs processing
+            # If state is RUNNING or finished (DONE), no need to reprocess
+            if other.state != JobState.WAITING:
+                return other
+            # Use 'other' for resubmission since it has the correct experiments list
+            job = other
 
         job._future = asyncio.run_coroutine_threadsafe(self.aio_submit(job), self.loop)
-        return None
+
+        return other
 
     def prepare(self, job: Job):
         """Prepares the job for running"""
@@ -129,49 +266,99 @@ class Scheduler(threading.Thread):
 
         if self.exitmode:
             logger.warning("Exit mode: not submitting")
+            return
 
-        elif job.identifier in self.jobs:
+        # Job was already submitted
+        if job.identifier in self.jobs:
             other = self.jobs[job.identifier]
             assert job.type == other.type
-            if other.state == JobState.ERROR:
+
+            # Add current experiment to the existing job's experiments list
+            xp = experiment.current()
+            xp.add_job(other)
+
+            # Copy watched outputs from new job to existing job
+            # This ensures new callbacks are registered even for resubmitted jobs
+            other.watched_outputs.extend(job.watched_outputs)
+
+            if other.state.is_error():
                 logger.info("Re-submitting job")
+                # Clean up old process info so it will be re-started
+                other._process = None
+                if other.pidpath.is_file():
+                    other.pidpath.unlink()
+                # Use set_state to handle experiment statistics updates
+                other.set_state(JobState.WAITING)
+                self.notify_job_state(other)  # Notify listeners of re-submit
             else:
                 logger.warning("Job %s already submitted", job.identifier)
-                return other
 
-        else:
-            # Register this job
-            self.xp.unfinishedJobs += 1
-            self.jobs[job.identifier] = job
+            # Returns the previous job
+            return other
+
+        # Register this job
+        xp = experiment.current()
+        self.jobs[job.identifier] = job
+        # Set submittime now so that add_job can record it in the database
+        # (aio_submit may update this later for re-submitted jobs)
+        job.submittime = time.time()
+        xp.add_job(job)
+
+        # Set up dependencies
+        for dependency in job.dependencies:
+            dependency.target = job
+            dependency.origin.dependents.add(dependency)
 
         return None
 
+    def _notify_listeners(self, notification_func, job: Job):
+        """Execute notification in thread pool with error isolation.
+
+        This runs notifications in a dedicated thread pool to avoid blocking
+        the scheduler and to isolate errors from affecting other listeners.
+        """
+
+        def _do_notify():
+            # Get a snapshot of listeners with the lock
+            with self._listeners_lock:
+                listeners_snapshot = list(self._listeners)
+
+            for listener in listeners_snapshot:
+                try:
+                    notification_func(listener, job)
+                except Exception:
+                    logger.exception("Got an error with listener %s", listener)
+
+        self._notification_executor.submit(_do_notify)
+
     def notify_job_submitted(self, job: Job):
         """Notify the listeners that a job has been submitted"""
-        for listener in self.listeners:
-            try:
-                listener.job_submitted(job)
-            except Exception:
-                logger.exception("Got an error with listener %s", listener)
+        self._notify_listeners(lambda lst, j: lst.job_submitted(j), job)
 
     def notify_job_state(self, job: Job):
         """Notify the listeners that a job has changed state"""
-        for listener in self.listeners:
-            try:
-                listener.job_state(job)
-            except Exception:
-                logger.exception("Got an error with listener %s", listener)
+        self._notify_listeners(lambda lst, j: lst.job_state(j), job)
+
+    def notify_service_add(self, service: Service):
+        """Notify the listeners that a service has been added"""
+        self._notify_listeners(lambda lst, s: lst.service_add(s), service)
 
-    async def aio_submit(self, job: Job) -> JobState:  # noqa: C901
+    async def aio_submit(self, job: Job) -> JobState:
         """Main scheduler function: submit a job, run it (if needed), and returns
         the status code
         """
+        from experimaestro.scheduler.jobs import JobStateError, JobFailureStatus
+
         logger.info("Submitting job %s", job)
-        job._readyEvent = asyncio.Event()
         job.submittime = time.time()
         job.scheduler = self
         self.waitingjobs.add(job)
 
+        # Register watched outputs now that the job has a scheduler
+        job.register_watched_outputs()
+
+        # Note: Job metadata will be written after directory is created in aio_start
+
         # Check that we don't have a completed job in
        # alternate directories
         for jobspath in experiment.current().alt_jobspaths:
@@ -185,126 +372,324 @@ class Scheduler(threading.Thread):
             path.unlink()
         path.symlink_to(job.path)
 
-        job.state = JobState.WAITING
-
+        job.set_state(JobState.WAITING)
         self.notify_job_submitted(job)
 
-        # Add dependencies, and add to blocking resources
-        if job.dependencies:
-            job.unsatisfied = len(job.dependencies)
-
-            for dependency in job.dependencies:
-                dependency.target = job
-                dependency.loop = self.loop
-                dependency.origin.dependents.add(dependency)
-                dependency.check()
-        else:
-            job._readyEvent.set()
-            job.state = JobState.READY
-
+        # Check if already done
         if job.donepath.exists():
-            job.state = JobState.DONE
+            job.set_state(JobState.DONE)
+            self.notify_job_state(job)  # Notify listeners of done state
 
         # Check if we have a running process
-        process = await job.aio_process()
-        if process is not None:
-            # Yep! First we notify the listeners
-            job.state = JobState.RUNNING
-            # Notify the listeners
-            self.notify_job_state(job)
-
-            # Adds to the listeners
-            if self.xp.server is not None:
-                job.add_notification_server(self.xp.server)
-
-            # And now, we wait...
-            logger.info("Got a process for job %s - waiting to complete", job)
-            code = await process.aio_code()
-            logger.info("Job %s completed with code %s", job, code)
-            job.state = JobState.DONE if code == 0 else JobState.ERROR
-
-        # Check if done
-        if job.donepath.exists():
-            job.state = JobState.DONE
-
-        # OK, not done; let's start the job for real
-        while not job.state.finished():
-            # Wait that the job is ready
-            await job._readyEvent.wait()
-            job._readyEvent.clear()
-
-            if job.state == JobState.READY:
-                try:
-                    state = await self.aio_start(job)
-                except Exception:
-                    logger.exception("Got an exception while starting the job")
-                    raise
+        if not job.state.finished():
+            process = await job.aio_process()
+            if process is not None:
+                # Notify listeners that job is running
+                job.set_state(JobState.RUNNING)
+                self.notify_job_state(job)
+
+                # Adds to the listeners
+                if self.server is not None:
+                    job.add_notification_server(self.server)
+
+                # And now, we wait...
+                logger.info("Got a process for job %s - waiting to complete", job)
+                code = await process.aio_code()
+                logger.info("Job %s completed with code %s", job, code)
+
+                # Record exit code if available
+                if code is not None:
+                    job.exit_code = code
+
+                # Read state from .done/.failed files (contains detailed failure reason)
+                state = JobState.from_path(job.path, job.name)
+
+                # If state is a generic FAILED error, let the process determine
+                # the state (it may detect launcher-specific failures like SLURM timeout)
+                if (
+                    state is not None
+                    and isinstance(state, JobStateError)
+                    and state.failure_reason == JobFailureStatus.FAILED
+                    and code is not None
+                ):
+                    process_state = process.get_job_state(code)
+                    if (
+                        isinstance(process_state, JobStateError)
+                        and process_state.failure_reason != JobFailureStatus.FAILED
+                    ):
+                        # Process detected a more specific failure reason
+                        state = process_state
 
                 if state is None:
-                    # State is None if this is not the main thread
-                    return JobState.ERROR
+                    if code is not None:
+                        # Fall back to process-specific state detection
+                        state = process.get_job_state(code)
+                    else:
+                        logger.error("No .done or .failed file found for job %s", job)
+                        state = JobState.ERROR
+                # Set endtime before set_state so database gets the timestamp
+                job.endtime = time.time()
+                job.set_state(state)
+                self.notify_job_state(job)  # Notify listeners of final state
+
+        # If not done or running, start the job
+        if not job.state.finished():
+            try:
+                state = await self.aio_start(job)
+                # Set endtime before set_state so database gets the timestamp
+                job.endtime = time.time()
+                job.set_state(state)
+            except Exception:
+                logger.exception("Got an exception while starting the job")
+                raise
 
-                job.state = state
+        # Job is finished - experiment statistics already updated by set_state
 
-            self.notify_job_state(job)
+        # Write final metadata with end time and final state
+        job.write_metadata()
 
-        # Job is finished
-        if job.state != JobState.DONE:
-            self.xp.failedJobs[job.identifier] = job
+        if job in self.waitingjobs:
+            self.waitingjobs.remove(job)
 
-        # Process all remaining tasks outputs
+        # Process all remaining task outputs BEFORE notifying exit condition
+        # This ensures taskOutputQueueSize is updated before wait() can check it,
+        # preventing a race where wait() sees both unfinishedJobs==0 and
+        # taskOutputQueueSize==0 before callbacks have been queued.
         await asyncThreadcheck("End of job processing", job.done_handler)
 
-        # Decrement the number of unfinished jobs and notify
-        self.xp.unfinishedJobs -= 1
+        # Now notify - wait() will see the correct taskOutputQueueSize
         async with self.exitCondition:
-            logging.debug("Updated number of unfinished jobs")
             self.exitCondition.notify_all()
 
-        job.endtime = time.time()
-        if job in self.waitingjobs:
-            self.waitingjobs.remove(job)
-
-        with job.dependents as dependents:
-            logger.info("Processing %d dependent jobs", len(dependents))
-            for dependency in dependents:
-                logger.debug("Checking dependency %s", dependency)
-                self.loop.call_soon(dependency.check)
-
         return job.state
 
-    async def aio_start(self, job: Job) -> Optional[JobState]:
-        """Start a job (scheduler coordination layer)
+    async def aio_start(self, job: Job) -> Optional[JobState]:  # noqa: C901
+        """Start a job with full job starting logic
 
-        This method serves as a coordination layer that delegates the actual
-        job starting logic to the job itself while handling scheduler-specific
-        concerns like state notifications and providing coordination context.
+        This method handles job locking, dependency acquisition, directory setup,
+        and job execution while using the scheduler's coordination lock to prevent
+        race conditions between multiple jobs.
 
         :param job: The job to start
         :return: JobState.WAITING if dependencies could not be locked, JobState.DONE
         if job completed successfully, JobState.ERROR if job failed during execution,
         or None (should not occur in normal operation)
-        :raises Exception: Various exceptions during scheduler coordination
+        :raises Exception: Various exceptions during job execution, dependency locking,
+        or process creation
         """
+        from experimaestro.scheduler.jobs import JobStateError
+        from experimaestro.locking import Locks, LockError
+        from experimaestro.scheduler.jobs import JobFailureStatus
 
         # Assert preconditions
         assert job.launcher is not None
 
-        try:
-            # Call job's start method with scheduler context
-            state = await job.aio_start(
-                sched_dependency_lock=self.dependencyLock,
-                notification_server=self.xp.server if self.xp else None,
+        # Restart loop for resumable tasks that timeout
+        while True:
+            logger.debug(
+                "Starting job %s with %d dependencies",
+                job,
+                len(job.dependencies),
             )
 
-            if state is None:
-                # Dependencies couldn't be locked, return WAITING state
-                return JobState.WAITING
+            # Separate static and dynamic dependencies
+            static_deps = [d for d in job.dependencies if not d.is_dynamic()]
+            dynamic_deps = [d for d in job.dependencies if d.is_dynamic()]
 
-            # Notify scheduler listeners of job state after successful start
+            # First, wait for all static dependencies (jobs) to complete
+            # These don't need the dependency lock as they can't change state
+            # Static dependency locks don't need to be added to locks list
+            logger.debug("Waiting for %d static dependencies", len(static_deps))
+            for dependency in static_deps:
+                logger.debug("Waiting for static dependency %s", dependency)
+                try:
+                    await dependency.aio_lock()
+                except RuntimeError as e:
+                    # Dependency failed - mark job as failed due to dependency
+                    logger.warning("Dependency failed: %s", e)
+                    return JobStateError(JobFailureStatus.DEPENDENCY)
+
+            # We first lock the job before proceeding
+            with Locks() as locks:
+                logger.debug("[starting] Locking job %s", job)
+                async with job.launcher.connector.lock(job.lockpath):
+                    logger.debug("[starting] Locked job %s", job)
+
+                    state = None
+                    try:
+                        # Now handle dynamic dependencies (tokens) with retry logic
+                        # CRITICAL: Only one task at a time can acquire dynamic dependencies
+                        # to prevent deadlocks (e.g., Task A holds Token1 waiting for Token2,
+                        # Task B holds Token2 waiting for Token1)
+                        if dynamic_deps:
+                            async with self.dependencyLock:
+                                logger.debug(
+                                    "Locking %d dynamic dependencies (tokens)",
+                                    len(dynamic_deps),
+                                )
+                                while True:
+                                    all_locked = True
+                                    for idx, dependency in enumerate(dynamic_deps):
+                                        try:
+                                            # Use timeout=0 for first dependency, 0.1s for subsequent
+                                            timeout = 0 if idx == 0 else 0.1
+                                            # Acquire the lock (this might block on IPC locks)
+                                            lock = await dependency.aio_lock(
+                                                timeout=timeout
+                                            )
+                                            locks.append(lock)
+                                        except LockError:
+                                            logger.info(
+                                                "Could not lock %s, retrying",
+                                                dependency,
+                                            )
+                                            # Release all locks and restart
+                                            for lock in locks.locks:
+                                                lock.release()
+                                            locks.locks.clear()
+                                            # Put failed dependency first
+                                            dynamic_deps.remove(dependency)
+                                            dynamic_deps.insert(0, dependency)
+                                            all_locked = False
+                                            break
+
+                                    if all_locked:
+                                        # All locks acquired successfully
+                                        break
+
+                        # Dependencies have been locked, we can start the job
+                        job.starttime = time.time()
+
+                        # Creates the main directory
+                        directory = job.path
+                        logger.debug("Making directories job %s...", directory)
+
+                        # Warn about directory cleanup for non-resumable tasks
+                        # (only once per task type)
+                        xpmtype = job.config.__xpmtype__
+                        if (
+                            directory.is_dir()
+                            and not job.resumable
+                            and not xpmtype.warned_clean_not_resumable
+                        ):
+                            xpmtype.warned_clean_not_resumable = True
+                            logger.warning(
+                                "In a future version, directory will be cleaned up for "
+                                "non-resumable tasks (%s). Use ResumableTask if you want "
+                                "to preserve the directory contents.",
+                                xpmtype.identifier,
+                            )
+
+                        if not directory.is_dir():
+                            directory.mkdir(parents=True, exist_ok=True)
+
+                        # Write metadata with submit and start time (after directory creation)
+                        job.write_metadata()
+
+                        # Sets up the notification URL
+                        if self.server is not None:
+                            job.add_notification_server(self.server)
+
+                    except Exception:
+                        logger.warning("Error while locking job", exc_info=True)
+                        return JobState.WAITING
+
+                    try:
+                        # Runs the job
+                        process = await job.aio_run()
+                    except Exception:
+                        logger.warning("Error while starting job", exc_info=True)
+                        return JobState.ERROR
+
+                # Wait for job to complete while holding locks
+                try:
+                    logger.debug("Waiting for job %s process to end", job)
+
+                    code = await process.aio_code()
+                    logger.debug("Got return code %s for %s", code, job)
+
+                    # Record exit code if available
+                    if code is not None:
+                        logger.info("Job %s ended with code %s", job, code)
+                        job.exit_code = code
+                    else:
+                        logger.info("Job %s ended, reading state from files", job)
+
+                    # Read state from .done/.failed files (contains detailed failure reason)
+                    state = JobState.from_path(job.path, job.name)
+
+                    # If state is a generic FAILED error, let the process determine
+                    # the state (it may detect launcher-specific failures like SLURM timeout)
+                    if (
+                        state is not None
+                        and isinstance(state, JobStateError)
+                        and state.failure_reason == JobFailureStatus.FAILED
+                        and code is not None
+                    ):
+                        process_state = process.get_job_state(code)
+                        if (
+                            isinstance(process_state, JobStateError)
+                            and process_state.failure_reason != JobFailureStatus.FAILED
+                        ):
+                            # Process detected a more specific failure reason
+                            state = process_state
+
+                    if state is None:
+                        if code is not None:
+                            # Fall back to process-specific state detection
+                            state = process.get_job_state(code)
+                        else:
+                            logger.error(
+                                "No .done or .failed file found for job %s", job
+                            )
+                            state = JobState.ERROR
+
+                except JobError:
+                    logger.warning("Error while running job")
+                    state = JobState.ERROR
+
+                except Exception:
+                    logger.warning(
+                        "Error while running job (in experimaestro)", exc_info=True
+                    )
+                    state = JobState.ERROR
+
+            # Locks are released here after job completes
+
+            # Check if we should restart a resumable task that timed out
+            from experimaestro.scheduler.jobs import JobStateError
+
+            if (
+                isinstance(state, JobStateError)
+                and state.failure_reason == JobFailureStatus.TIMEOUT
+                and job.resumable
+            ):
+                job.retry_count += 1
+                if job.retry_count <= job.max_retries:
+                    logger.info(
+                        "Resumable task %s timed out - restarting (attempt %d/%d)",
+                        job,
+                        job.retry_count,
+                        job.max_retries,
+                    )
+                    # Rotate log files to preserve previous run's logs
+                    job.rotate_logs()
+                    # Clear cached process so aio_run() will create a new one
+                    job._process = None
+                    # Delete PID file so the job will be resubmitted
+                    if job.pidpath.exists():
+                        job.pidpath.unlink()
+                    # Continue the loop to restart
+                    continue
+                else:
+                    logger.warning(
+                        "Resumable task %s exceeded max retries (%d), marking as failed",
+                        job,
+                        job.max_retries,
+                    )
+                    # Fall through to return the error state
+
+            # Job finished (success or non-recoverable error)
+            # Notify scheduler listeners of job state after job completes
             self.notify_job_state(job)
             return state
-
-        except Exception:
-            logger.warning("Error in scheduler job coordination", exc_info=True)
-            return JobState.ERROR
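
The net effect of this diff is that the scheduler is no longer created per experiment but obtained lazily as a process-wide singleton, with listener notifications serialized on a single-worker executor. Below is a minimal usage sketch under that reading; the import path assumes the diff above is experimaestro/scheduler/base.py, and PrintListener is a hypothetical listener written only for illustration. Only Scheduler.instance(), has_instance(), addlistener(), removelistener() and wait_for_notifications() come from the diff itself.

from experimaestro.scheduler.base import Listener, Scheduler


class PrintListener(Listener):
    """Hypothetical listener used only for illustration."""

    def job_submitted(self, job):
        print(f"submitted: {job}")

    def job_state(self, job):
        print(f"state change: {job} -> {job.state}")


# Lazily creates and starts the global scheduler thread
# (double-checked locking around Scheduler._create(), per the diff above).
scheduler = Scheduler.instance()
assert Scheduler.has_instance()

listener = PrintListener()
scheduler.addlistener(listener)

# ... experiments submit jobs through the usual experiment API ...

# Notifications are dispatched on a single-worker ThreadPoolExecutor;
# wait_for_notifications() drains that queue by submitting a sentinel no-op.
scheduler.wait_for_notifications(timeout=5.0)
scheduler.removelistener(listener)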