experimaestro 2.0.0a8__py3-none-any.whl → 2.0.0b4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (116)
  1. experimaestro/__init__.py +10 -11
  2. experimaestro/annotations.py +167 -206
  3. experimaestro/cli/__init__.py +130 -5
  4. experimaestro/cli/filter.py +42 -74
  5. experimaestro/cli/jobs.py +157 -106
  6. experimaestro/cli/refactor.py +249 -0
  7. experimaestro/click.py +0 -1
  8. experimaestro/commandline.py +19 -3
  9. experimaestro/connectors/__init__.py +20 -1
  10. experimaestro/connectors/local.py +12 -0
  11. experimaestro/core/arguments.py +182 -46
  12. experimaestro/core/identifier.py +107 -6
  13. experimaestro/core/objects/__init__.py +6 -0
  14. experimaestro/core/objects/config.py +542 -25
  15. experimaestro/core/objects/config_walk.py +20 -0
  16. experimaestro/core/serialization.py +91 -34
  17. experimaestro/core/subparameters.py +164 -0
  18. experimaestro/core/types.py +175 -38
  19. experimaestro/exceptions.py +26 -0
  20. experimaestro/experiments/cli.py +107 -25
  21. experimaestro/generators.py +50 -9
  22. experimaestro/huggingface.py +3 -1
  23. experimaestro/launcherfinder/parser.py +29 -0
  24. experimaestro/launchers/__init__.py +26 -1
  25. experimaestro/launchers/direct.py +12 -0
  26. experimaestro/launchers/slurm/base.py +154 -2
  27. experimaestro/mkdocs/metaloader.py +0 -1
  28. experimaestro/mypy.py +452 -7
  29. experimaestro/notifications.py +63 -13
  30. experimaestro/progress.py +0 -2
  31. experimaestro/rpyc.py +0 -1
  32. experimaestro/run.py +19 -6
  33. experimaestro/scheduler/base.py +489 -125
  34. experimaestro/scheduler/dependencies.py +43 -28
  35. experimaestro/scheduler/dynamic_outputs.py +259 -130
  36. experimaestro/scheduler/experiment.py +225 -30
  37. experimaestro/scheduler/interfaces.py +474 -0
  38. experimaestro/scheduler/jobs.py +216 -206
  39. experimaestro/scheduler/services.py +186 -12
  40. experimaestro/scheduler/state_db.py +388 -0
  41. experimaestro/scheduler/state_provider.py +2345 -0
  42. experimaestro/scheduler/state_sync.py +834 -0
  43. experimaestro/scheduler/workspace.py +52 -10
  44. experimaestro/scriptbuilder.py +7 -0
  45. experimaestro/server/__init__.py +147 -57
  46. experimaestro/server/data/index.css +0 -125
  47. experimaestro/server/data/index.css.map +1 -1
  48. experimaestro/server/data/index.js +194 -58
  49. experimaestro/server/data/index.js.map +1 -1
  50. experimaestro/settings.py +44 -5
  51. experimaestro/sphinx/__init__.py +3 -3
  52. experimaestro/taskglobals.py +20 -0
  53. experimaestro/tests/conftest.py +80 -0
  54. experimaestro/tests/core/test_generics.py +2 -2
  55. experimaestro/tests/identifier_stability.json +45 -0
  56. experimaestro/tests/launchers/bin/sacct +6 -2
  57. experimaestro/tests/launchers/bin/sbatch +4 -2
  58. experimaestro/tests/launchers/test_slurm.py +80 -0
  59. experimaestro/tests/tasks/test_dynamic.py +231 -0
  60. experimaestro/tests/test_cli_jobs.py +615 -0
  61. experimaestro/tests/test_deprecated.py +630 -0
  62. experimaestro/tests/test_environment.py +200 -0
  63. experimaestro/tests/test_file_progress_integration.py +1 -1
  64. experimaestro/tests/test_forward.py +3 -3
  65. experimaestro/tests/test_identifier.py +372 -41
  66. experimaestro/tests/test_identifier_stability.py +458 -0
  67. experimaestro/tests/test_instance.py +3 -3
  68. experimaestro/tests/test_multitoken.py +442 -0
  69. experimaestro/tests/test_mypy.py +433 -0
  70. experimaestro/tests/test_objects.py +312 -5
  71. experimaestro/tests/test_outputs.py +2 -2
  72. experimaestro/tests/test_param.py +8 -12
  73. experimaestro/tests/test_partial_paths.py +231 -0
  74. experimaestro/tests/test_progress.py +0 -48
  75. experimaestro/tests/test_resumable_task.py +480 -0
  76. experimaestro/tests/test_serializers.py +141 -1
  77. experimaestro/tests/test_state_db.py +434 -0
  78. experimaestro/tests/test_subparameters.py +160 -0
  79. experimaestro/tests/test_tags.py +136 -0
  80. experimaestro/tests/test_tasks.py +107 -121
  81. experimaestro/tests/test_token_locking.py +252 -0
  82. experimaestro/tests/test_tokens.py +17 -13
  83. experimaestro/tests/test_types.py +123 -1
  84. experimaestro/tests/test_workspace_triggers.py +158 -0
  85. experimaestro/tests/token_reschedule.py +4 -2
  86. experimaestro/tests/utils.py +2 -2
  87. experimaestro/tokens.py +154 -57
  88. experimaestro/tools/diff.py +1 -1
  89. experimaestro/tui/__init__.py +8 -0
  90. experimaestro/tui/app.py +2303 -0
  91. experimaestro/tui/app.tcss +353 -0
  92. experimaestro/tui/log_viewer.py +228 -0
  93. experimaestro/utils/__init__.py +23 -0
  94. experimaestro/utils/environment.py +148 -0
  95. experimaestro/utils/git.py +129 -0
  96. experimaestro/utils/resources.py +1 -1
  97. experimaestro/version.py +34 -0
  98. {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b4.dist-info}/METADATA +68 -38
  99. experimaestro-2.0.0b4.dist-info/RECORD +181 -0
  100. {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b4.dist-info}/WHEEL +1 -1
  101. experimaestro-2.0.0b4.dist-info/entry_points.txt +16 -0
  102. experimaestro/compat.py +0 -6
  103. experimaestro/core/objects.pyi +0 -221
  104. experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
  105. experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
  106. experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
  107. experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
  108. experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
  109. experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
  110. experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
  111. experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
  112. experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
  113. experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
  114. experimaestro-2.0.0a8.dist-info/RECORD +0 -166
  115. experimaestro-2.0.0a8.dist-info/entry_points.txt +0 -17
  116. {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b4.dist-info}/licenses/LICENSE +0 -0
@@ -1,15 +1,16 @@
-import logging
 import threading
 import time
 from typing import (
     Optional,
     Set,
+    ClassVar,
+    TYPE_CHECKING,
 )
 import asyncio
 from typing import Dict

 from experimaestro.scheduler import experiment
-from experimaestro.scheduler.jobs import Job, JobState
+from experimaestro.scheduler.jobs import Job, JobState, JobError
 from experimaestro.scheduler.services import Service


@@ -17,6 +18,11 @@ from experimaestro.utils import logger
 from experimaestro.utils.asyncio import asyncThreadcheck
 import concurrent.futures

+if TYPE_CHECKING:
+    from experimaestro.server import Server
+    from experimaestro.settings import ServerSettings
+    from experimaestro.scheduler.workspace import Workspace
+

 class Listener:
     def job_submitted(self, job):
@@ -31,18 +37,24 @@ class Listener:


 class Scheduler(threading.Thread):
-    """A job scheduler
+    """A job scheduler (singleton)

-    The scheduler is based on asyncio for easy concurrency handling
+    The scheduler is based on asyncio for easy concurrency handling.
+    This is a singleton - only one scheduler instance exists per process.
     """

-    def __init__(self, xp: "experiment", name: str):
+    _instance: ClassVar[Optional["Scheduler"]] = None
+    _lock: ClassVar[threading.Lock] = threading.Lock()
+
+    def __init__(self, name: str = "Global"):
         super().__init__(name=f"Scheduler ({name})", daemon=True)
         self._ready = threading.Event()

-        # Name of the experiment
+        # Name of the scheduler
         self.name = name
-        self.xp = xp
+
+        # Track experiments (simple dict for now)
+        self.experiments: Dict[str, "experiment"] = {}

         # Exit mode activated
         self.exitmode = False
@@ -53,16 +65,101 @@ class Scheduler(threading.Thread):
         # List of jobs
         self.waitingjobs: Set[Job] = set()

-        # Listeners
-        self.listeners: Set[Listener] = set()
+        # Listeners with thread-safe access
+        self._listeners: Set[Listener] = set()
+        self._listeners_lock = threading.Lock()
+
+        # Notification thread pool (single worker to serialize notifications)
+        self._notification_executor = concurrent.futures.ThreadPoolExecutor(
+            max_workers=1, thread_name_prefix="NotificationWorker"
+        )
+
+        # Server (managed by scheduler)
+        self.server: Optional["Server"] = None

     @staticmethod
-    def create(xp: "experiment", name: str):
-        instance = Scheduler(xp, name)
+    def has_instance() -> bool:
+        """Check if a scheduler instance exists without creating one"""
+        return Scheduler._instance is not None
+
+    @staticmethod
+    def instance() -> "Scheduler":
+        """Get or create the global scheduler instance"""
+        if Scheduler._instance is None:
+            with Scheduler._lock:
+                if Scheduler._instance is None:
+                    Scheduler._instance = Scheduler._create()
+        return Scheduler._instance
+
+    @staticmethod
+    def _create(name: str = "Global"):
+        """Internal method to create and start scheduler"""
+        instance = Scheduler(name)
         instance.start()
         instance._ready.wait()
         return instance

+    @staticmethod
+    def create(xp: "experiment" = None, name: str = "Global"):
+        """Create or get the scheduler instance
+
+        Args:
+            xp: (Deprecated) Experiment reference, ignored
+            name: Name for the scheduler (only used on first creation)
+
+        Returns:
+            The global scheduler instance
+        """
+        return Scheduler.instance()
+
+    def register_experiment(self, xp: "experiment"):
+        """Register an experiment with the scheduler"""
+        # Use experiment name as key for now
+        key = xp.workdir.name
+        self.experiments[key] = xp
+
+        logger.debug("Registered experiment %s with scheduler", key)
+
+    def unregister_experiment(self, xp: "experiment"):
+        """Unregister an experiment from the scheduler"""
+        key = xp.workdir.name
+        if key in self.experiments:
+            del self.experiments[key]
+            logger.debug("Unregistered experiment %s from scheduler", key)
+
+    def start_server(
+        self, settings: "ServerSettings" = None, workspace: "Workspace" = None
+    ):
+        """Start the notification server (if not already running)
+
+        Args:
+            settings: Server settings
+            workspace: Workspace instance (required to get workspace path)
+        """
+        if self.server is None:
+            from experimaestro.server import Server
+            from experimaestro.scheduler.state_provider import WorkspaceStateProvider
+
+            if workspace is None:
+                raise ValueError("workspace parameter is required to start server")
+
+            # Get the workspace state provider singleton
+            state_provider = WorkspaceStateProvider.get_instance(
+                workspace.path, read_only=False, sync_on_start=False
+            )
+
+            self.server = Server.instance(settings, state_provider)
+            self.server.start()
+            logger.info("Server started by scheduler")
+        else:
+            logger.debug("Server already running")
+
+    def stop_server(self):
+        """Stop the notification server"""
+        if self.server is not None:
+            self.server.stop()
+            logger.info("Server stopped by scheduler")
+
     def run(self):
         """Run the event loop forever"""
         logger.debug("Starting event loop thread")
@@ -72,6 +169,10 @@ class Scheduler(threading.Thread):
         # Set loop-dependent variables
         self.exitCondition = asyncio.Condition()
         self.dependencyLock = asyncio.Lock()
+
+        # Note: State provider removed - now managed at workspace level
+        # Each experiment has its own workspace with database
+
         self._ready.set()
         self.loop.run_forever()

@@ -84,10 +185,17 @@ class Scheduler(threading.Thread):
            logger.warning("Scheduler already started")

     def addlistener(self, listener: Listener):
-        self.listeners.add(listener)
+        with self._listeners_lock:
+            self._listeners.add(listener)

     def removelistener(self, listener: Listener):
-        self.listeners.remove(listener)
+        with self._listeners_lock:
+            self._listeners.discard(listener)
+
+    def clear_listeners(self):
+        """Clear all listeners (for testing purposes)"""
+        with self._listeners_lock:
+            self._listeners.clear()

     def getJobState(self, job: Job) -> "concurrent.futures.Future[JobState]":
         # Check if the job belongs to this scheduler
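Listener registration is now lock-protected, and removal switches from remove() to discard(), so removing an already-removed listener no longer raises KeyError. A minimal sketch of how the listener API is used, assuming only the Listener base class shown earlier:

class PrintListener(Listener):
    def job_submitted(self, job):
        print("submitted:", job)

    def job_state(self, job):
        print("state change:", job)

scheduler = Scheduler.instance()
listener = PrintListener()
scheduler.addlistener(listener)
# ... submit jobs ...
scheduler.removelistener(listener)  # safe even if called twice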
@@ -104,17 +212,25 @@ class Scheduler(threading.Thread):

     def submit(self, job: Job) -> Optional[Job]:
         # Wait for the future containing the submitted job
-        logger.debug("Registering the job %s within the scheduler", job)
+        logger.debug("Submit job %s to the scheduler", job)
         otherFuture = asyncio.run_coroutine_threadsafe(
             self.aio_registerJob(job), self.loop
         )
         other = otherFuture.result()
         logger.debug("Job already submitted" if other else "First submission")
-        if other:
-            return other
+
+        # Only returns if job was already submitted and doesn't need reprocessing
+        if other is not None:
+            # If state is WAITING, it was just reset for resubmission and needs processing
+            # If state is RUNNING or finished (DONE), no need to reprocess
+            if other.state != JobState.WAITING:
+                return other
+            # Use 'other' for resubmission since it has the correct experiments list
+            job = other

         job._future = asyncio.run_coroutine_threadsafe(self.aio_submit(job), self.loop)
-        return None
+
+        return other

     def prepare(self, job: Job):
         """Prepares the job for running"""
@@ -129,49 +245,99 @@ class Scheduler(threading.Thread):

         if self.exitmode:
             logger.warning("Exit mode: not submitting")
+            return

-        elif job.identifier in self.jobs:
+        # Job was already submitted
+        if job.identifier in self.jobs:
             other = self.jobs[job.identifier]
             assert job.type == other.type
-            if other.state == JobState.ERROR:
+
+            # Add current experiment to the existing job's experiments list
+            xp = experiment.current()
+            xp.add_job(other)
+
+            # Copy watched outputs from new job to existing job
+            # This ensures new callbacks are registered even for resubmitted jobs
+            other.watched_outputs.extend(job.watched_outputs)
+
+            if other.state.is_error():
                 logger.info("Re-submitting job")
+                # Clean up old process info so it will be re-started
+                other._process = None
+                if other.pidpath.is_file():
+                    other.pidpath.unlink()
+                # Use set_state to handle experiment statistics updates
+                other.set_state(JobState.WAITING)
+                self.notify_job_state(other)  # Notify listeners of re-submit
             else:
                 logger.warning("Job %s already submitted", job.identifier)
-            return other

-        else:
-            # Register this job
-            self.xp.unfinishedJobs += 1
-            self.jobs[job.identifier] = job
+            # Returns the previous job
+            return other
+
+        # Register this job
+        xp = experiment.current()
+        self.jobs[job.identifier] = job
+        # Set submittime now so that add_job can record it in the database
+        # (aio_submit may update this later for re-submitted jobs)
+        job.submittime = time.time()
+        xp.add_job(job)
+
+        # Set up dependencies
+        for dependency in job.dependencies:
+            dependency.target = job
+            dependency.origin.dependents.add(dependency)

         return None

+    def _notify_listeners(self, notification_func, job: Job):
+        """Execute notification in thread pool with error isolation.
+
+        This runs notifications in a dedicated thread pool to avoid blocking
+        the scheduler and to isolate errors from affecting other listeners.
+        """
+
+        def _do_notify():
+            # Get a snapshot of listeners with the lock
+            with self._listeners_lock:
+                listeners_snapshot = list(self._listeners)
+
+            for listener in listeners_snapshot:
+                try:
+                    notification_func(listener, job)
+                except Exception:
+                    logger.exception("Got an error with listener %s", listener)
+
+        self._notification_executor.submit(_do_notify)
+
     def notify_job_submitted(self, job: Job):
         """Notify the listeners that a job has been submitted"""
-        for listener in self.listeners:
-            try:
-                listener.job_submitted(job)
-            except Exception:
-                logger.exception("Got an error with listener %s", listener)
+        self._notify_listeners(lambda lst, j: lst.job_submitted(j), job)

     def notify_job_state(self, job: Job):
         """Notify the listeners that a job has changed state"""
-        for listener in self.listeners:
-            try:
-                listener.job_state(job)
-            except Exception:
-                logger.exception("Got an error with listener %s", listener)
+        self._notify_listeners(lambda lst, j: lst.job_state(j), job)

-    async def aio_submit(self, job: Job) -> JobState:  # noqa: C901
+    def notify_service_add(self, service: Service):
+        """Notify the listeners that a service has been added"""
+        self._notify_listeners(lambda lst, s: lst.service_add(s), service)
+
+    async def aio_submit(self, job: Job) -> JobState:
         """Main scheduler function: submit a job, run it (if needed), and returns
         the status code
         """
+        from experimaestro.scheduler.jobs import JobStateError, JobFailureStatus
+
         logger.info("Submitting job %s", job)
-        job._readyEvent = asyncio.Event()
         job.submittime = time.time()
         job.scheduler = self
         self.waitingjobs.add(job)

+        # Register watched outputs now that the job has a scheduler
+        job.register_watched_outputs()
+
+        # Note: Job metadata will be written after directory is created in aio_start
+
         # Check that we don't have a completed job in
         # alternate directories
         for jobspath in experiment.current().alt_jobspaths:
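The _notify_listeners helper introduced above combines three ideas: a single-worker executor so notifications are delivered one at a time in submission order, a snapshot of the listener set taken under the lock so callbacks run without holding it, and per-listener exception handling so one faulty listener cannot block the rest. A self-contained sketch of the same pattern, independent of the scheduler classes:

import concurrent.futures
import threading

class Notifier:
    def __init__(self):
        self._listeners = set()
        self._lock = threading.Lock()
        # max_workers=1 serializes delivery, like the scheduler's executor
        self._executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)

    def notify(self, func, payload):
        def _run():
            with self._lock:
                snapshot = list(self._listeners)  # copy, then release the lock
            for listener in snapshot:
                try:
                    func(listener, payload)
                except Exception:
                    pass  # isolate a failing listener from the others

        self._executor.submit(_run)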
@@ -185,126 +351,324 @@ class Scheduler(threading.Thread):
                 path.unlink()
                 path.symlink_to(job.path)

-        job.state = JobState.WAITING
-
+        job.set_state(JobState.WAITING)
         self.notify_job_submitted(job)

-        # Add dependencies, and add to blocking resources
-        if job.dependencies:
-            job.unsatisfied = len(job.dependencies)
-
-            for dependency in job.dependencies:
-                dependency.target = job
-                dependency.loop = self.loop
-                dependency.origin.dependents.add(dependency)
-                dependency.check()
-        else:
-            job._readyEvent.set()
-            job.state = JobState.READY
-
+        # Check if already done
         if job.donepath.exists():
-            job.state = JobState.DONE
+            job.set_state(JobState.DONE)
+            self.notify_job_state(job)  # Notify listeners of done state

         # Check if we have a running process
-        process = await job.aio_process()
-        if process is not None:
-            # Yep! First we notify the listeners
-            job.state = JobState.RUNNING
-            # Notify the listeners
-            self.notify_job_state(job)
-
-            # Adds to the listeners
-            if self.xp.server is not None:
-                job.add_notification_server(self.xp.server)
-
-            # And now, we wait...
-            logger.info("Got a process for job %s - waiting to complete", job)
-            code = await process.aio_code()
-            logger.info("Job %s completed with code %s", job, code)
-            job.state = JobState.DONE if code == 0 else JobState.ERROR
-
-        # Check if done
-        if job.donepath.exists():
-            job.state = JobState.DONE
-
-        # OK, not done; let's start the job for real
-        while not job.state.finished():
-            # Wait that the job is ready
-            await job._readyEvent.wait()
-            job._readyEvent.clear()
-
-            if job.state == JobState.READY:
-                try:
-                    state = await self.aio_start(job)
-                except Exception:
-                    logger.exception("Got an exception while starting the job")
-                    raise
+        if not job.state.finished():
+            process = await job.aio_process()
+            if process is not None:
+                # Notify listeners that job is running
+                job.set_state(JobState.RUNNING)
+                self.notify_job_state(job)
+
+                # Adds to the listeners
+                if self.server is not None:
+                    job.add_notification_server(self.server)
+
+                # And now, we wait...
+                logger.info("Got a process for job %s - waiting to complete", job)
+                code = await process.aio_code()
+                logger.info("Job %s completed with code %s", job, code)
+
+                # Record exit code if available
+                if code is not None:
+                    job.exit_code = code
+
+                # Read state from .done/.failed files (contains detailed failure reason)
+                state = JobState.from_path(job.path, job.name)
+
+                # If state is a generic FAILED error, let the process determine
+                # the state (it may detect launcher-specific failures like SLURM timeout)
+                if (
+                    state is not None
+                    and isinstance(state, JobStateError)
+                    and state.failure_reason == JobFailureStatus.FAILED
+                    and code is not None
+                ):
+                    process_state = process.get_job_state(code)
+                    if (
+                        isinstance(process_state, JobStateError)
+                        and process_state.failure_reason != JobFailureStatus.FAILED
+                    ):
+                        # Process detected a more specific failure reason
+                        state = process_state

                 if state is None:
-                    # State is None if this is not the main thread
-                    return JobState.ERROR
+                    if code is not None:
+                        # Fall back to process-specific state detection
+                        state = process.get_job_state(code)
+                    else:
+                        logger.error("No .done or .failed file found for job %s", job)
+                        state = JobState.ERROR
+                # Set endtime before set_state so database gets the timestamp
+                job.endtime = time.time()
+                job.set_state(state)
+                self.notify_job_state(job)  # Notify listeners of final state
+
+        # If not done or running, start the job
+        if not job.state.finished():
+            try:
+                state = await self.aio_start(job)
+                # Set endtime before set_state so database gets the timestamp
+                job.endtime = time.time()
+                job.set_state(state)
+            except Exception:
+                logger.exception("Got an exception while starting the job")
+                raise

-                job.state = state
+        # Job is finished - experiment statistics already updated by set_state

-            self.notify_job_state(job)
+        # Write final metadata with end time and final state
+        job.write_metadata()

-        # Job is finished
-        if job.state != JobState.DONE:
-            self.xp.failedJobs[job.identifier] = job
+        if job in self.waitingjobs:
+            self.waitingjobs.remove(job)

-        # Process all remaining tasks outputs
+        # Process all remaining task outputs BEFORE notifying exit condition
+        # This ensures taskOutputQueueSize is updated before wait() can check it,
+        # preventing a race where wait() sees both unfinishedJobs==0 and
+        # taskOutputQueueSize==0 before callbacks have been queued.
         await asyncThreadcheck("End of job processing", job.done_handler)

-        # Decrement the number of unfinished jobs and notify
-        self.xp.unfinishedJobs -= 1
+        # Now notify - wait() will see the correct taskOutputQueueSize
         async with self.exitCondition:
-            logging.debug("Updated number of unfinished jobs")
             self.exitCondition.notify_all()

-        job.endtime = time.time()
-        if job in self.waitingjobs:
-            self.waitingjobs.remove(job)
-
-        with job.dependents as dependents:
-            logger.info("Processing %d dependent jobs", len(dependents))
-            for dependency in dependents:
-                logger.debug("Checking dependency %s", dependency)
-                self.loop.call_soon(dependency.check)
-
         return job.state

-    async def aio_start(self, job: Job) -> Optional[JobState]:
-        """Start a job (scheduler coordination layer)
+    async def aio_start(self, job: Job) -> Optional[JobState]:  # noqa: C901
+        """Start a job with full job starting logic

-        This method serves as a coordination layer that delegates the actual
-        job starting logic to the job itself while handling scheduler-specific
-        concerns like state notifications and providing coordination context.
+        This method handles job locking, dependency acquisition, directory setup,
+        and job execution while using the scheduler's coordination lock to prevent
+        race conditions between multiple jobs.

         :param job: The job to start
         :return: JobState.WAITING if dependencies could not be locked, JobState.DONE
         if job completed successfully, JobState.ERROR if job failed during execution,
         or None (should not occur in normal operation)
-        :raises Exception: Various exceptions during scheduler coordination
+        :raises Exception: Various exceptions during job execution, dependency locking,
+        or process creation
         """
+        from experimaestro.scheduler.jobs import JobStateError
+        from experimaestro.locking import Locks, LockError
+        from experimaestro.scheduler.jobs import JobFailureStatus

         # Assert preconditions
         assert job.launcher is not None

-        try:
-            # Call job's start method with scheduler context
-            state = await job.aio_start(
-                sched_dependency_lock=self.dependencyLock,
-                notification_server=self.xp.server if self.xp else None,
+        # Restart loop for resumable tasks that timeout
+        while True:
+            logger.debug(
+                "Starting job %s with %d dependencies",
+                job,
+                len(job.dependencies),
             )

-            if state is None:
-                # Dependencies couldn't be locked, return WAITING state
-                return JobState.WAITING
+            # Separate static and dynamic dependencies
+            static_deps = [d for d in job.dependencies if not d.is_dynamic()]
+            dynamic_deps = [d for d in job.dependencies if d.is_dynamic()]

-            # Notify scheduler listeners of job state after successful start
+            # First, wait for all static dependencies (jobs) to complete
+            # These don't need the dependency lock as they can't change state
+            # Static dependency locks don't need to be added to locks list
+            logger.debug("Waiting for %d static dependencies", len(static_deps))
+            for dependency in static_deps:
+                logger.debug("Waiting for static dependency %s", dependency)
+                try:
+                    await dependency.aio_lock()
+                except RuntimeError as e:
+                    # Dependency failed - mark job as failed due to dependency
+                    logger.warning("Dependency failed: %s", e)
+                    return JobStateError(JobFailureStatus.DEPENDENCY)
+
+            # We first lock the job before proceeding
+            with Locks() as locks:
+                logger.debug("[starting] Locking job %s", job)
+                async with job.launcher.connector.lock(job.lockpath):
+                    logger.debug("[starting] Locked job %s", job)
+
+                    state = None
+                    try:
+                        # Now handle dynamic dependencies (tokens) with retry logic
+                        # CRITICAL: Only one task at a time can acquire dynamic dependencies
+                        # to prevent deadlocks (e.g., Task A holds Token1 waiting for Token2,
+                        # Task B holds Token2 waiting for Token1)
+                        if dynamic_deps:
+                            async with self.dependencyLock:
+                                logger.debug(
+                                    "Locking %d dynamic dependencies (tokens)",
+                                    len(dynamic_deps),
+                                )
+                                while True:
+                                    all_locked = True
+                                    for idx, dependency in enumerate(dynamic_deps):
+                                        try:
+                                            # Use timeout=0 for first dependency, 0.1s for subsequent
+                                            timeout = 0 if idx == 0 else 0.1
+                                            # Acquire the lock (this might block on IPC locks)
+                                            lock = await dependency.aio_lock(
+                                                timeout=timeout
+                                            )
+                                            locks.append(lock)
+                                        except LockError:
+                                            logger.info(
+                                                "Could not lock %s, retrying",
+                                                dependency,
+                                            )
+                                            # Release all locks and restart
+                                            for lock in locks.locks:
+                                                lock.release()
+                                            locks.locks.clear()
+                                            # Put failed dependency first
+                                            dynamic_deps.remove(dependency)
+                                            dynamic_deps.insert(0, dependency)
+                                            all_locked = False
+                                            break
+
+                                    if all_locked:
+                                        # All locks acquired successfully
+                                        break
+
+                        # Dependencies have been locked, we can start the job
+                        job.starttime = time.time()
+
+                        # Creates the main directory
+                        directory = job.path
+                        logger.debug("Making directories job %s...", directory)
+
+                        # Warn about directory cleanup for non-resumable tasks
+                        # (only once per task type)
+                        xpmtype = job.config.__xpmtype__
+                        if (
+                            directory.is_dir()
+                            and not job.resumable
+                            and not xpmtype.warned_clean_not_resumable
+                        ):
+                            xpmtype.warned_clean_not_resumable = True
+                            logger.warning(
+                                "In a future version, directory will be cleaned up for "
+                                "non-resumable tasks (%s). Use ResumableTask if you want "
+                                "to preserve the directory contents.",
+                                xpmtype.identifier,
+                            )
+
+                        if not directory.is_dir():
+                            directory.mkdir(parents=True, exist_ok=True)
+
+                        # Write metadata with submit and start time (after directory creation)
+                        job.write_metadata()
+
+                        # Sets up the notification URL
+                        if self.server is not None:
+                            job.add_notification_server(self.server)
+
+                    except Exception:
+                        logger.warning("Error while locking job", exc_info=True)
+                        return JobState.WAITING
+
+                    try:
+                        # Runs the job
+                        process = await job.aio_run()
+                    except Exception:
+                        logger.warning("Error while starting job", exc_info=True)
+                        return JobState.ERROR
+
+                    # Wait for job to complete while holding locks
+                    try:
+                        logger.debug("Waiting for job %s process to end", job)
+
+                        code = await process.aio_code()
+                        logger.debug("Got return code %s for %s", code, job)
+
+                        # Record exit code if available
+                        if code is not None:
+                            logger.info("Job %s ended with code %s", job, code)
+                            job.exit_code = code
+                        else:
+                            logger.info("Job %s ended, reading state from files", job)
+
+                        # Read state from .done/.failed files (contains detailed failure reason)
+                        state = JobState.from_path(job.path, job.name)
+
+                        # If state is a generic FAILED error, let the process determine
+                        # the state (it may detect launcher-specific failures like SLURM timeout)
+                        if (
+                            state is not None
+                            and isinstance(state, JobStateError)
+                            and state.failure_reason == JobFailureStatus.FAILED
+                            and code is not None
+                        ):
+                            process_state = process.get_job_state(code)
+                            if (
+                                isinstance(process_state, JobStateError)
+                                and process_state.failure_reason != JobFailureStatus.FAILED
+                            ):
+                                # Process detected a more specific failure reason
+                                state = process_state
+
+                        if state is None:
+                            if code is not None:
+                                # Fall back to process-specific state detection
+                                state = process.get_job_state(code)
+                            else:
+                                logger.error(
+                                    "No .done or .failed file found for job %s", job
+                                )
+                                state = JobState.ERROR
+
+                    except JobError:
+                        logger.warning("Error while running job")
+                        state = JobState.ERROR
+
+                    except Exception:
+                        logger.warning(
+                            "Error while running job (in experimaestro)", exc_info=True
+                        )
+                        state = JobState.ERROR
+
+            # Locks are released here after job completes
+
+            # Check if we should restart a resumable task that timed out
+            from experimaestro.scheduler.jobs import JobStateError
+
+            if (
+                isinstance(state, JobStateError)
+                and state.failure_reason == JobFailureStatus.TIMEOUT
+                and job.resumable
+            ):
+                job.retry_count += 1
+                if job.retry_count <= job.max_retries:
+                    logger.info(
+                        "Resumable task %s timed out - restarting (attempt %d/%d)",
+                        job,
+                        job.retry_count,
+                        job.max_retries,
+                    )
+                    # Rotate log files to preserve previous run's logs
+                    job.rotate_logs()
+                    # Clear cached process so aio_run() will create a new one
+                    job._process = None
+                    # Delete PID file so the job will be resubmitted
+                    if job.pidpath.exists():
+                        job.pidpath.unlink()
+                    # Continue the loop to restart
+                    continue
+                else:
+                    logger.warning(
+                        "Resumable task %s exceeded max retries (%d), marking as failed",
+                        job,
+                        job.max_retries,
+                    )
+                    # Fall through to return the error state
+
+            # Job finished (success or non-recoverable error)
+            # Notify scheduler listeners of job state after job completes
             self.notify_job_state(job)
             return state
-
-        except Exception:
-            logger.warning("Error in scheduler job coordination", exc_info=True)
-            return JobState.ERROR
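Two patterns in this last hunk are worth isolating. Token (dynamic dependency) acquisition is all-or-nothing: locks are taken under the global dependencyLock, and on any failure everything acquired so far is released and the contended dependency moves to the front before retrying. A rough sketch of that loop using plain threading locks; blocking on the head lock stands in for the timeout=0 case (the real code uses async locks with timeouts):

import threading

def acquire_all(locks: list) -> list:
    # Returns the held locks; the caller is responsible for releasing them
    while True:
        held = []
        for idx, lock in enumerate(locks):
            # Block on the head lock (guarantees progress), poll the rest briefly
            acquired = lock.acquire(timeout=0.1) if idx else lock.acquire()
            if acquired:
                held.append(lock)
            else:
                for h in held:  # roll back the partial acquisition
                    h.release()
                locks.remove(lock)
                locks.insert(0, lock)  # retry the contended lock first
                break
        else:
            return held  # no break: every lock was acquired

The second pattern is the restart loop for resumable tasks: on a TIMEOUT failure the job's logs are rotated, the cached process and PID file are cleared, and the while True loop runs the job again until max_retries is exhausted, at which point the error state falls through to the caller.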