experimaestro 2.0.0b8__py3-none-any.whl → 2.0.0b17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of experimaestro might be problematic. Click here for more details.

Files changed (152)
  1. experimaestro/__init__.py +12 -5
  2. experimaestro/cli/__init__.py +239 -126
  3. experimaestro/cli/filter.py +48 -23
  4. experimaestro/cli/jobs.py +253 -71
  5. experimaestro/cli/refactor.py +1 -2
  6. experimaestro/commandline.py +7 -4
  7. experimaestro/connectors/__init__.py +9 -1
  8. experimaestro/connectors/local.py +43 -3
  9. experimaestro/core/arguments.py +18 -18
  10. experimaestro/core/identifier.py +11 -11
  11. experimaestro/core/objects/config.py +96 -39
  12. experimaestro/core/objects/config_walk.py +3 -3
  13. experimaestro/core/{subparameters.py → partial.py} +16 -16
  14. experimaestro/core/partial_lock.py +394 -0
  15. experimaestro/core/types.py +12 -15
  16. experimaestro/dynamic.py +290 -0
  17. experimaestro/experiments/__init__.py +6 -2
  18. experimaestro/experiments/cli.py +217 -50
  19. experimaestro/experiments/configuration.py +24 -0
  20. experimaestro/generators.py +5 -5
  21. experimaestro/ipc.py +118 -1
  22. experimaestro/launcherfinder/__init__.py +2 -2
  23. experimaestro/launcherfinder/registry.py +6 -7
  24. experimaestro/launcherfinder/specs.py +2 -9
  25. experimaestro/launchers/slurm/__init__.py +2 -2
  26. experimaestro/launchers/slurm/base.py +62 -0
  27. experimaestro/locking.py +957 -1
  28. experimaestro/notifications.py +89 -201
  29. experimaestro/progress.py +63 -366
  30. experimaestro/rpyc.py +0 -2
  31. experimaestro/run.py +29 -2
  32. experimaestro/scheduler/__init__.py +8 -1
  33. experimaestro/scheduler/base.py +629 -53
  34. experimaestro/scheduler/dependencies.py +20 -16
  35. experimaestro/scheduler/experiment.py +732 -167
  36. experimaestro/scheduler/interfaces.py +316 -101
  37. experimaestro/scheduler/jobs.py +58 -20
  38. experimaestro/scheduler/remote/adaptive_sync.py +265 -0
  39. experimaestro/scheduler/remote/client.py +171 -117
  40. experimaestro/scheduler/remote/protocol.py +8 -193
  41. experimaestro/scheduler/remote/server.py +95 -71
  42. experimaestro/scheduler/services.py +53 -28
  43. experimaestro/scheduler/state_provider.py +663 -2430
  44. experimaestro/scheduler/state_status.py +1247 -0
  45. experimaestro/scheduler/transient.py +31 -0
  46. experimaestro/scheduler/workspace.py +1 -1
  47. experimaestro/scheduler/workspace_state_provider.py +1273 -0
  48. experimaestro/scriptbuilder.py +4 -4
  49. experimaestro/settings.py +36 -0
  50. experimaestro/tests/conftest.py +33 -5
  51. experimaestro/tests/connectors/bin/executable.py +1 -1
  52. experimaestro/tests/fixtures/pre_experiment/experiment_check_env.py +16 -0
  53. experimaestro/tests/fixtures/pre_experiment/experiment_check_mock.py +14 -0
  54. experimaestro/tests/fixtures/pre_experiment/experiment_simple.py +12 -0
  55. experimaestro/tests/fixtures/pre_experiment/pre_setup_env.py +5 -0
  56. experimaestro/tests/fixtures/pre_experiment/pre_setup_error.py +3 -0
  57. experimaestro/tests/fixtures/pre_experiment/pre_setup_mock.py +8 -0
  58. experimaestro/tests/launchers/bin/test.py +1 -0
  59. experimaestro/tests/launchers/test_slurm.py +9 -9
  60. experimaestro/tests/partial_reschedule.py +46 -0
  61. experimaestro/tests/restart.py +3 -3
  62. experimaestro/tests/restart_main.py +1 -0
  63. experimaestro/tests/scripts/notifyandwait.py +1 -0
  64. experimaestro/tests/task_partial.py +38 -0
  65. experimaestro/tests/task_tokens.py +2 -2
  66. experimaestro/tests/tasks/test_dynamic.py +6 -6
  67. experimaestro/tests/test_dependencies.py +3 -3
  68. experimaestro/tests/test_deprecated.py +15 -15
  69. experimaestro/tests/test_dynamic_locking.py +317 -0
  70. experimaestro/tests/test_environment.py +24 -14
  71. experimaestro/tests/test_experiment.py +171 -36
  72. experimaestro/tests/test_identifier.py +25 -25
  73. experimaestro/tests/test_identifier_stability.py +3 -5
  74. experimaestro/tests/test_multitoken.py +2 -4
  75. experimaestro/tests/{test_subparameters.py → test_partial.py} +25 -25
  76. experimaestro/tests/test_partial_paths.py +81 -138
  77. experimaestro/tests/test_pre_experiment.py +219 -0
  78. experimaestro/tests/test_progress.py +2 -8
  79. experimaestro/tests/test_remote_state.py +560 -99
  80. experimaestro/tests/test_stray_jobs.py +261 -0
  81. experimaestro/tests/test_tasks.py +1 -2
  82. experimaestro/tests/test_token_locking.py +52 -67
  83. experimaestro/tests/test_tokens.py +5 -6
  84. experimaestro/tests/test_transient.py +225 -0
  85. experimaestro/tests/test_workspace_state_provider.py +768 -0
  86. experimaestro/tests/token_reschedule.py +1 -3
  87. experimaestro/tests/utils.py +2 -7
  88. experimaestro/tokens.py +227 -372
  89. experimaestro/tools/diff.py +1 -0
  90. experimaestro/tools/documentation.py +4 -5
  91. experimaestro/tools/jobs.py +1 -2
  92. experimaestro/tui/app.py +438 -1966
  93. experimaestro/tui/app.tcss +162 -0
  94. experimaestro/tui/dialogs.py +172 -0
  95. experimaestro/tui/log_viewer.py +253 -3
  96. experimaestro/tui/messages.py +137 -0
  97. experimaestro/tui/utils.py +54 -0
  98. experimaestro/tui/widgets/__init__.py +23 -0
  99. experimaestro/tui/widgets/experiments.py +468 -0
  100. experimaestro/tui/widgets/global_services.py +238 -0
  101. experimaestro/tui/widgets/jobs.py +972 -0
  102. experimaestro/tui/widgets/log.py +156 -0
  103. experimaestro/tui/widgets/orphans.py +363 -0
  104. experimaestro/tui/widgets/runs.py +185 -0
  105. experimaestro/tui/widgets/services.py +314 -0
  106. experimaestro/tui/widgets/stray_jobs.py +528 -0
  107. experimaestro/utils/__init__.py +1 -1
  108. experimaestro/utils/environment.py +105 -22
  109. experimaestro/utils/fswatcher.py +124 -0
  110. experimaestro/utils/jobs.py +1 -2
  111. experimaestro/utils/jupyter.py +1 -2
  112. experimaestro/utils/logging.py +72 -0
  113. experimaestro/version.py +2 -2
  114. experimaestro/webui/__init__.py +9 -0
  115. experimaestro/webui/app.py +117 -0
  116. experimaestro/{server → webui}/data/index.css +66 -11
  117. experimaestro/webui/data/index.css.map +1 -0
  118. experimaestro/{server → webui}/data/index.js +82763 -87217
  119. experimaestro/webui/data/index.js.map +1 -0
  120. experimaestro/webui/routes/__init__.py +5 -0
  121. experimaestro/webui/routes/auth.py +53 -0
  122. experimaestro/webui/routes/proxy.py +117 -0
  123. experimaestro/webui/server.py +200 -0
  124. experimaestro/webui/state_bridge.py +152 -0
  125. experimaestro/webui/websocket.py +413 -0
  126. {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/METADATA +5 -6
  127. experimaestro-2.0.0b17.dist-info/RECORD +219 -0
  128. experimaestro/cli/progress.py +0 -269
  129. experimaestro/scheduler/state.py +0 -75
  130. experimaestro/scheduler/state_db.py +0 -437
  131. experimaestro/scheduler/state_sync.py +0 -891
  132. experimaestro/server/__init__.py +0 -467
  133. experimaestro/server/data/index.css.map +0 -1
  134. experimaestro/server/data/index.js.map +0 -1
  135. experimaestro/tests/test_cli_jobs.py +0 -615
  136. experimaestro/tests/test_file_progress.py +0 -425
  137. experimaestro/tests/test_file_progress_integration.py +0 -477
  138. experimaestro/tests/test_state_db.py +0 -434
  139. experimaestro-2.0.0b8.dist-info/RECORD +0 -187
  140. /experimaestro/{server → webui}/data/1815e00441357e01619e.ttf +0 -0
  141. /experimaestro/{server → webui}/data/2463b90d9a316e4e5294.woff2 +0 -0
  142. /experimaestro/{server → webui}/data/2582b0e4bcf85eceead0.ttf +0 -0
  143. /experimaestro/{server → webui}/data/89999bdf5d835c012025.woff2 +0 -0
  144. /experimaestro/{server → webui}/data/914997e1bdfc990d0897.ttf +0 -0
  145. /experimaestro/{server → webui}/data/c210719e60948b211a12.woff2 +0 -0
  146. /experimaestro/{server → webui}/data/favicon.ico +0 -0
  147. /experimaestro/{server → webui}/data/index.html +0 -0
  148. /experimaestro/{server → webui}/data/login.html +0 -0
  149. /experimaestro/{server → webui}/data/manifest.json +0 -0
  150. {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/WHEEL +0 -0
  151. {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/entry_points.txt +0 -0
  152. {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/licenses/LICENSE +0 -0
@@ -1,17 +1,34 @@
1
+ import json
1
2
  import threading
2
3
  import time
4
+ from datetime import datetime
5
+ from pathlib import Path
3
6
  from typing import (
7
+ Dict,
8
+ List,
4
9
  Optional,
5
10
  Set,
6
11
  ClassVar,
7
12
  TYPE_CHECKING,
8
13
  )
9
14
  import asyncio
10
- from typing import Dict
11
15
 
12
16
  from experimaestro.scheduler import experiment
13
- from experimaestro.scheduler.jobs import Job, JobState, JobError
17
+ from experimaestro.scheduler.jobs import Job, JobState, JobError, JobDependency
14
18
  from experimaestro.scheduler.services import Service
19
+ from experimaestro.scheduler.interfaces import (
20
+ BaseJob,
21
+ BaseExperiment,
22
+ BaseService,
23
+ )
24
+ from experimaestro.scheduler.state_provider import StateProvider
25
+ from experimaestro.scheduler.state_status import (
26
+ EventReader,
27
+ JobProgressEvent,
28
+ JobStateChangedEvent,
29
+ WatchedDirectory,
30
+ job_entity_id_extractor,
31
+ )
15
32
 
16
33
 
17
34
  from experimaestro.utils import logger
@@ -19,7 +36,7 @@ from experimaestro.utils.asyncio import asyncThreadcheck
19
36
  import concurrent.futures
20
37
 
21
38
  if TYPE_CHECKING:
22
- from experimaestro.server import Server
39
+ from experimaestro.webui import WebUIServer
23
40
  from experimaestro.settings import ServerSettings
24
41
  from experimaestro.scheduler.workspace import Workspace
25
42
 
@@ -36,18 +53,25 @@ class Listener:
36
53
  pass
37
54
 
38
55
 
39
- class Scheduler(threading.Thread):
40
- """A job scheduler (singleton)
56
+ class Scheduler(StateProvider, threading.Thread):
57
+ """A job scheduler (singleton) that provides live state
41
58
 
42
59
  The scheduler is based on asyncio for easy concurrency handling.
43
60
  This is a singleton - only one scheduler instance exists per process.
61
+
62
+ Inherits from StateProvider to allow TUI/Web interfaces to access
63
+ live job and experiment state during experiment execution.
44
64
  """
45
65
 
46
66
  _instance: ClassVar[Optional["Scheduler"]] = None
47
67
  _lock: ClassVar[threading.Lock] = threading.Lock()
48
68
 
69
+ #: Scheduler is always live
70
+ is_live: bool = True
71
+
49
72
  def __init__(self, name: str = "Global"):
50
- super().__init__(name=f"Scheduler ({name})", daemon=True)
73
+ StateProvider.__init__(self) # Initialize state listener management
74
+ threading.Thread.__init__(self, name=f"Scheduler ({name})", daemon=True)
51
75
  self._ready = threading.Event()
52
76
 
53
77
  # Name of the scheduler
@@ -62,10 +86,19 @@ class Scheduler(threading.Thread):
62
86
  # List of all jobs
63
87
  self.jobs: Dict[str, "Job"] = {}
64
88
 
89
+ # Services: (experiment_id, run_id) -> {service_id -> Service}
90
+ self.services: Dict[tuple[str, str], Dict[str, Service]] = {}
91
+
92
+ # Tags map: (experiment_id, run_id) -> {job_id -> {tag_key: tag_value}}
93
+ self._tags_map: dict[tuple[str, str], dict[str, dict[str, str]]] = {}
94
+
95
+ # Dependencies map: (experiment_id, run_id) -> {job_id -> [depends_on_job_ids]}
96
+ self._dependencies_map: dict[tuple[str, str], dict[str, list[str]]] = {}
97
+
65
98
  # List of jobs
66
99
  self.waitingjobs: Set[Job] = set()
67
100
 
68
- # Listeners with thread-safe access
101
+ # Legacy listeners with thread-safe access
69
102
  self._listeners: Set[Listener] = set()
70
103
  self._listeners_lock = threading.Lock()
71
104
 
@@ -75,7 +108,12 @@ class Scheduler(threading.Thread):
75
108
  )
76
109
 
77
110
  # Server (managed by scheduler)
78
- self.server: Optional["Server"] = None
111
+ self.server: Optional["WebUIServer"] = None
112
+
113
+ # Job event readers per workspace
114
+ # Uses EventReader to watch .events/jobs/ directory
115
+ self._job_event_readers: Dict[Path, EventReader] = {}
116
+ self._job_event_readers_lock = threading.Lock()
79
117
 
80
118
  @staticmethod
81
119
  def has_instance() -> bool:
@@ -114,51 +152,58 @@ class Scheduler(threading.Thread):
114
152
 
115
153
  def register_experiment(self, xp: "experiment"):
116
154
  """Register an experiment with the scheduler"""
117
- # Use experiment name as key for now
118
- key = xp.workdir.name
155
+ # Use experiment name as key (not workdir.name which is now run_id)
156
+ key = xp.name
119
157
  self.experiments[key] = xp
120
158
 
159
+ # Start watching job events for this workspace
160
+ self._start_job_event_reader(xp.workspace.path)
161
+
121
162
  logger.debug("Registered experiment %s with scheduler", key)
122
163
 
123
164
  def unregister_experiment(self, xp: "experiment"):
124
165
  """Unregister an experiment from the scheduler"""
125
- key = xp.workdir.name
166
+ key = xp.name
126
167
  if key in self.experiments:
127
168
  del self.experiments[key]
128
169
  logger.debug("Unregistered experiment %s from scheduler", key)
129
170
 
130
171
  def start_server(
131
- self, settings: "ServerSettings" = None, workspace: "Workspace" = None
172
+ self,
173
+ settings: "ServerSettings" = None,
174
+ workspace: "Workspace" = None, # noqa: ARG002 - kept for backward compat
175
+ wait_for_quit: bool = False,
132
176
  ):
133
- """Start the notification server (if not already running)
177
+ """Start the web server (if not already running)
134
178
 
135
179
  Args:
136
180
  settings: Server settings
137
- workspace: Workspace instance (required to get workspace path)
181
+ workspace: Workspace instance (deprecated, not used)
182
+ wait_for_quit: If True, server waits for explicit quit from web UI
138
183
  """
139
184
  if self.server is None:
140
- from experimaestro.server import Server
141
- from experimaestro.scheduler.state_provider import WorkspaceStateProvider
185
+ from experimaestro.webui import WebUIServer
142
186
 
143
- if workspace is None:
144
- raise ValueError("workspace parameter is required to start server")
145
-
146
- # Get the workspace state provider singleton
147
- state_provider = WorkspaceStateProvider.get_instance(
148
- workspace.path, read_only=False, sync_on_start=False
149
- )
150
-
151
- self.server = Server.instance(settings, state_provider)
187
+ # Use the Scheduler itself as the StateProvider for live state access
188
+ self.server = WebUIServer.instance(settings, self, wait_for_quit)
152
189
  self.server.start()
153
- logger.info("Server started by scheduler")
190
+ logger.info("Web server started by scheduler")
154
191
  else:
155
- logger.debug("Server already running")
192
+ logger.debug("Web server already running")
156
193
 
157
194
  def stop_server(self):
158
- """Stop the notification server"""
195
+ """Stop the web server"""
159
196
  if self.server is not None:
160
197
  self.server.stop()
161
- logger.info("Server stopped by scheduler")
198
+ logger.info("Web server stopped by scheduler")
199
+
200
+ def wait_for_server_quit(self):
201
+ """Wait for explicit quit from web interface
202
+
203
+ Only blocks if server was started with wait_for_quit=True.
204
+ """
205
+ if self.server is not None:
206
+ self.server.wait()
162
207
 
163
208
  def run(self):
164
209
  """Run the event loop forever"""
@@ -277,12 +322,35 @@ class Scheduler(threading.Thread):
277
322
  xp = experiment.current()
278
323
  xp.add_job(other)
279
324
 
325
+ # Merge transient modes: more conservative mode wins
326
+ # NONE(0) > TRANSIENT(1) > REMOVE(2) - lower value wins
327
+ was_transient = other.transient.is_transient
328
+ if job.transient < other.transient:
329
+ other.transient = job.transient
330
+ # If job was transient and is now non-transient, mark it as needed
331
+ # This flag tells aio_submit not to skip the job
332
+ if was_transient and not other.transient.is_transient:
333
+ other._needed_transient = True
334
+
280
335
  # Copy watched outputs from new job to existing job
281
336
  # This ensures new callbacks are registered even for resubmitted jobs
282
337
  other.watched_outputs.extend(job.watched_outputs)
283
338
 
339
+ # Check if job needs to be re-started
340
+ need_restart = False
284
341
  if other.state.is_error():
285
- logger.info("Re-submitting job")
342
+ logger.info("Re-submitting job (was in error state)")
343
+ need_restart = True
344
+ elif (
345
+ was_transient
346
+ and not other.transient.is_transient
347
+ and other.state == JobState.UNSCHEDULED
348
+ ):
349
+ # Job was transient and skipped, but now is non-transient - restart it
350
+ logger.info("Re-submitting job (was transient, now non-transient)")
351
+ need_restart = True
352
+
353
+ if need_restart:
286
354
  # Clean up old process info so it will be re-started
287
355
  other._process = None
288
356
  if other.pidpath.is_file():
@@ -290,6 +358,7 @@ class Scheduler(threading.Thread):
290
358
  # Use set_state to handle experiment statistics updates
291
359
  other.set_state(JobState.WAITING)
292
360
  self.notify_job_state(other) # Notify listeners of re-submit
361
+ # The calling aio_submit will continue with this job and start it
293
362
  else:
294
363
  logger.warning("Job %s already submitted", job.identifier)
295
364
 
@@ -304,13 +373,145 @@ class Scheduler(threading.Thread):
304
373
  job.submittime = time.time()
305
374
  xp.add_job(job)
306
375
 
376
+ # Update tags map for this experiment/run
377
+ if job.tags:
378
+ exp_run_key = (xp.name, xp.run_id)
379
+ if exp_run_key not in self._tags_map:
380
+ self._tags_map[exp_run_key] = {}
381
+ self._tags_map[exp_run_key][job.identifier] = dict(job.tags)
382
+
383
+ # Update dependencies map for this experiment/run
384
+ exp_run_key = (xp.name, xp.run_id)
385
+ if exp_run_key not in self._dependencies_map:
386
+ self._dependencies_map[exp_run_key] = {}
387
+ depends_on_ids = [
388
+ dep.origin.identifier
389
+ for dep in job.dependencies
390
+ if isinstance(dep, JobDependency)
391
+ ]
392
+ if depends_on_ids:
393
+ self._dependencies_map[exp_run_key][job.identifier] = depends_on_ids
394
+
307
395
  # Set up dependencies
308
396
  for dependency in job.dependencies:
309
397
  dependency.target = job
310
- dependency.origin.dependents.add(dependency)
398
+ # Some dependencies (like PartialDependency) don't have an origin resource
399
+ if dependency.origin is not None:
400
+ dependency.origin.dependents.add(dependency)
311
401
 
312
402
  return None
313
403
 
404
+ def _start_job_event_reader(self, workspace_path: Path) -> None:
405
+ """Start watching job events in a workspace
406
+
407
+ Uses EventReader to watch .events/jobs/ for job progress events.
408
+ Job state events are emitted by the job process itself.
409
+ Only starts one reader per workspace.
410
+
411
+ Args:
412
+ workspace_path: Path to the workspace directory
413
+ """
414
+ with self._job_event_readers_lock:
415
+ # Already watching this workspace
416
+ if workspace_path in self._job_event_readers:
417
+ return
418
+
419
+ jobs_dir = workspace_path / ".events" / "jobs"
420
+
421
+ # Create new reader for this workspace
422
+ reader = EventReader(
423
+ [
424
+ WatchedDirectory(
425
+ path=jobs_dir,
426
+ glob_pattern="*/event-*-*.jsonl",
427
+ entity_id_extractor=job_entity_id_extractor,
428
+ )
429
+ ]
430
+ )
431
+ reader.start_watching(
432
+ on_event=self._on_job_event,
433
+ )
434
+ self._job_event_readers[workspace_path] = reader
435
+ logger.debug("Started job event reader for %s", jobs_dir)
436
+
437
+ def _stop_job_event_reader(self, workspace_path: Optional[Path] = None) -> None:
438
+ """Stop watching job events
439
+
440
+ Args:
441
+ workspace_path: If provided, stop only this workspace's reader.
442
+ If None, stop all readers.
443
+ """
444
+ with self._job_event_readers_lock:
445
+ if workspace_path is not None:
446
+ reader = self._job_event_readers.pop(workspace_path, None)
447
+ if reader is not None:
448
+ reader.stop_watching()
449
+ logger.debug("Stopped job event reader for %s", workspace_path)
450
+ else:
451
+ # Stop all readers
452
+ for path, reader in self._job_event_readers.items():
453
+ reader.stop_watching()
454
+ logger.debug("Stopped job event reader for %s", path)
455
+ self._job_event_readers.clear()
456
+
457
+ def _on_job_event(self, entity_id: str, event) -> None:
458
+ """Handle job events from EventReader
459
+
460
+ Updates job state from file-based events and notifies listeners.
461
+
462
+ Args:
463
+ entity_id: The job ID
464
+ event: The event (JobProgressEvent or JobStateChangedEvent)
465
+ """
466
+ job = self.jobs.get(entity_id)
467
+ if job is None:
468
+ logger.debug(
469
+ "Job event for unknown job %s",
470
+ entity_id,
471
+ )
472
+ return
473
+ logger.debug("Received event for job %s: %s", job, event)
474
+
475
+ if isinstance(event, JobProgressEvent):
476
+ # Update job's in-memory progress and notify legacy listeners
477
+ job.set_progress(event.level, event.progress, event.desc)
478
+ self.notify_job_state(job)
479
+
480
+ # Notify StateProvider-style listeners (TUI/WebUI)
481
+ state_event = JobStateChangedEvent(
482
+ job_id=job.identifier,
483
+ state=job.state.name.lower(),
484
+ )
485
+ self._notify_state_listeners_async(state_event)
486
+
487
+ def _cleanup_job_event_files(self, job: Job) -> None:
488
+ """Clean up old job event files from previous runs
489
+
490
+ Removes event files at .events/jobs/{task_id}/event-{job_id}-*.jsonl
491
+ Called when a job is about to start to ensure clean state.
492
+
493
+ Args:
494
+ job: The job being started
495
+ """
496
+ # Get the workspace path from the job's path
497
+ # job.path is workspace/jobs/task_id/job_id
498
+ workspace_path = job.path.parent.parent.parent
499
+ task_id = str(job.type.identifier)
500
+ job_id = job.identifier
501
+
502
+ events_dir = workspace_path / ".events" / "jobs" / task_id
503
+ if not events_dir.exists():
504
+ return
505
+
506
+ # Find and delete old event files for this job
507
+ pattern = f"event-{job_id}-*.jsonl"
508
+ for event_file in events_dir.glob(pattern):
509
+ try:
510
+ event_file.unlink()
511
+ logger.debug("Removed old job event file: %s", event_file)
512
+ except OSError as e:
513
+ logger.warning("Failed to remove job event file %s: %s", event_file, e)
514
+
314
515
  def _notify_listeners(self, notification_func, job: Job):
315
516
  """Execute notification in thread pool with error isolation.
316
517
 
@@ -331,18 +532,103 @@ class Scheduler(threading.Thread):
331
532
 
332
533
  self._notification_executor.submit(_do_notify)
333
534
 
535
+ def _notify_state_listeners_async(self, event):
536
+ """Notify StateProvider-style listeners asynchronously with error isolation.
537
+
538
+ This runs notifications in the same thread pool as _notify_listeners
539
+ to avoid blocking the scheduler and isolate errors.
540
+ """
541
+
542
+ def _do_notify():
543
+ # Get a snapshot of listeners with the lock
544
+ with self._state_listener_lock:
545
+ listeners_snapshot = list(self._state_listeners)
546
+
547
+ for listener in listeners_snapshot:
548
+ try:
549
+ listener(event)
550
+ except Exception:
551
+ logger.exception("Got an error with state listener %s", listener)
552
+
553
+ self._notification_executor.submit(_do_notify)
554
+
334
555
  def notify_job_submitted(self, job: Job):
335
556
  """Notify the listeners that a job has been submitted"""
336
557
  self._notify_listeners(lambda lst, j: lst.job_submitted(j), job)
337
558
 
559
+ # Also notify StateProvider-style listeners (for TUI etc.)
560
+ from experimaestro.scheduler.state_status import JobSubmittedEvent, JobTag
561
+
562
+ # Get experiment info from job's experiments list
563
+ for exp in job.experiments:
564
+ experiment_id = exp.experiment_id
565
+ run_id = exp.run_id
566
+
567
+ # Get tags and dependencies for this job
568
+ exp_run_key = (experiment_id, run_id)
569
+ tags_dict = self._tags_map.get(exp_run_key, {}).get(job.identifier, {})
570
+ tags = [JobTag(key=k, value=v) for k, v in tags_dict.items()]
571
+ depends_on = self._dependencies_map.get(exp_run_key, {}).get(
572
+ job.identifier, []
573
+ )
574
+
575
+ event = JobSubmittedEvent(
576
+ experiment_id=experiment_id,
577
+ run_id=run_id,
578
+ job_id=job.identifier,
579
+ tags=tags,
580
+ depends_on=depends_on,
581
+ )
582
+ self._notify_state_listeners_async(event)
583
+
338
584
  def notify_job_state(self, job: Job):
339
- """Notify the listeners that a job has changed state"""
585
+ """Notify the listeners that a job has changed state
586
+
587
+ Note: This does NOT write to job event files. Job events are written
588
+ by the job process itself. The scheduler only forwards notifications
589
+ to listeners.
590
+ """
591
+ # Legacy listener notification (per-experiment)
340
592
  self._notify_listeners(lambda lst, j: lst.job_state(j), job)
341
593
 
342
- def notify_service_add(self, service: Service):
594
+ # Notify StateProvider-style listeners with experiment-independent event
595
+ from experimaestro.scheduler.state_status import JobStateChangedEvent
596
+
597
+ event = JobStateChangedEvent(
598
+ job_id=job.identifier,
599
+ state=job.state.name.lower(),
600
+ )
601
+ self._notify_state_listeners_async(event)
602
+
603
+ def notify_service_add(
604
+ self, service: Service, experiment_id: str = "", run_id: str = ""
605
+ ):
343
606
  """Notify the listeners that a service has been added"""
344
607
  self._notify_listeners(lambda lst, s: lst.service_add(s), service)
345
608
 
609
+ # Store experiment info on the service for later retrieval
610
+ if experiment_id:
611
+ service._experiment_id = experiment_id
612
+ service._run_id = run_id or ""
613
+
614
+ # Store service in scheduler's services dict (persists after experiment ends)
615
+ if experiment_id:
616
+ key = (experiment_id, run_id or "")
617
+ if key not in self.services:
618
+ self.services[key] = {}
619
+ self.services[key][service.id] = service
620
+
621
+ # Also notify StateProvider-style listeners (for TUI etc.)
622
+ from experimaestro.scheduler.state_status import ServiceAddedEvent
623
+
624
+ if experiment_id:
625
+ event = ServiceAddedEvent(
626
+ experiment_id=experiment_id,
627
+ run_id=run_id or "",
628
+ service_id=service.id,
629
+ )
630
+ self._notify_state_listeners_async(event)
631
+
346
632
  async def aio_submit(self, job: Job) -> JobState:
347
633
  """Main scheduler function: submit a job, run it (if needed), and returns
348
634
  the status code
@@ -362,7 +648,7 @@ class Scheduler(threading.Thread):
362
648
  # Check that we don't have a completed job in
363
649
  # alternate directories
364
650
  for jobspath in experiment.current().alt_jobspaths:
365
- # FIXME: check if done
651
+ # Future enhancement: check if done
366
652
  pass
367
653
 
368
654
  # Creates a link into the experiment folder
@@ -388,10 +674,6 @@ class Scheduler(threading.Thread):
388
674
  job.set_state(JobState.RUNNING)
389
675
  self.notify_job_state(job)
390
676
 
391
- # Adds to the listeners
392
- if self.server is not None:
393
- job.add_notification_server(self.server)
394
-
395
677
  # And now, we wait...
396
678
  logger.info("Got a process for job %s - waiting to complete", job)
397
679
  code = await process.aio_code()
@@ -434,19 +716,27 @@ class Scheduler(threading.Thread):
434
716
 
435
717
  # If not done or running, start the job
436
718
  if not job.state.finished():
437
- try:
438
- state = await self.aio_start(job)
439
- # Set endtime before set_state so database gets the timestamp
440
- job.endtime = time.time()
441
- job.set_state(state)
442
- except Exception:
443
- logger.exception("Got an exception while starting the job")
444
- raise
719
+ # Check if this is a transient job that is not needed
720
+ if job.transient.is_transient and not job._needed_transient:
721
+ job.set_state(JobState.UNSCHEDULED)
722
+
723
+ # Start the job if not skipped (state is still WAITING)
724
+ if job.state == JobState.WAITING:
725
+ try:
726
+ state = await self.aio_start(job)
727
+ if state is not None:
728
+ job.endtime = time.time()
729
+ job.set_state(state)
730
+ except Exception:
731
+ logger.exception("Got an exception while starting the job")
732
+ raise
445
733
 
446
734
  # Job is finished - experiment statistics already updated by set_state
447
735
 
448
736
  # Write final metadata with end time and final state
449
- job.write_metadata()
737
+ # Only for jobs that actually started (starttime is set in aio_start)
738
+ if job.starttime is not None:
739
+ job.status_path.write_text(json.dumps(job.state_dict()))
450
740
 
451
741
  if job in self.waitingjobs:
452
742
  self.waitingjobs.remove(job)
@@ -478,7 +768,7 @@ class Scheduler(threading.Thread):
478
768
  or process creation
479
769
  """
480
770
  from experimaestro.scheduler.jobs import JobStateError
481
- from experimaestro.locking import Locks, LockError
771
+ from experimaestro.locking import DynamicDependencyLocks, LockError
482
772
  from experimaestro.scheduler.jobs import JobFailureStatus
483
773
 
484
774
  # Assert preconditions
@@ -510,7 +800,7 @@ class Scheduler(threading.Thread):
510
800
  return JobStateError(JobFailureStatus.DEPENDENCY)
511
801
 
512
802
  # We first lock the job before proceeding
513
- with Locks() as locks:
803
+ with DynamicDependencyLocks() as locks:
514
804
  logger.debug("[starting] Locking job %s", job)
515
805
  async with job.launcher.connector.lock(job.lockpath):
516
806
  logger.debug("[starting] Locked job %s", job)
@@ -583,12 +873,15 @@ class Scheduler(threading.Thread):
583
873
  if not directory.is_dir():
584
874
  directory.mkdir(parents=True, exist_ok=True)
585
875
 
876
+ # Clean up old job event files from previous runs
877
+ self._cleanup_job_event_files(job)
878
+
586
879
  # Write metadata with submit and start time (after directory creation)
587
- job.write_metadata()
880
+ job.status_path.parent.mkdir(parents=True, exist_ok=True)
881
+ job.status_path.write_text(json.dumps(job.state_dict()))
588
882
 
589
- # Sets up the notification URL
590
- if self.server is not None:
591
- job.add_notification_server(self.server)
883
+ # Notify locks before job starts (e.g., create symlinks)
884
+ await locks.aio_job_before_start(job)
592
885
 
593
886
  except Exception:
594
887
  logger.warning("Error while locking job", exc_info=True)
@@ -597,6 +890,30 @@ class Scheduler(threading.Thread):
597
890
  try:
598
891
  # Runs the job
599
892
  process = await job.aio_run()
893
+
894
+ # Notify locks that job has started
895
+ await locks.aio_job_started(job, process)
896
+
897
+ # Write locks.json for job process (if there are dynamic locks)
898
+ if locks.locks:
899
+ import tempfile
900
+
901
+ locks_path = job.path / "locks.json"
902
+ locks_data = {"dynamic_locks": locks.to_json()}
903
+ # Atomic write: write to temp file then rename
904
+ with tempfile.NamedTemporaryFile(
905
+ mode="w",
906
+ dir=job.path,
907
+ prefix=".locks.",
908
+ suffix=".json",
909
+ delete=False,
910
+ ) as tmp:
911
+ json.dump(locks_data, tmp)
912
+ tmp_path = tmp.name
913
+ # Rename is atomic on POSIX
914
+ import os
915
+
916
+ os.rename(tmp_path, locks_path)
600
917
  except Exception:
601
918
  logger.warning("Error while starting job", exc_info=True)
602
919
  return JobState.ERROR
@@ -654,6 +971,9 @@ class Scheduler(threading.Thread):
654
971
  )
655
972
  state = JobState.ERROR
656
973
 
974
+ # Notify locks that job has finished (before releasing)
975
+ await locks.aio_job_finished(job)
976
+
657
977
  # Locks are released here after job completes
658
978
 
659
979
  # Check if we should restart a resumable task that timed out
@@ -693,3 +1013,259 @@ class Scheduler(threading.Thread):
693
1013
  # Notify scheduler listeners of job state after job completes
694
1014
  self.notify_job_state(job)
695
1015
  return state
1016
+
1017
+ # =========================================================================
1018
+ # StateProvider abstract method implementations
1019
+ # =========================================================================
1020
+
1021
+ def get_experiments(
1022
+ self,
1023
+ since: Optional[datetime] = None, # noqa: ARG002
1024
+ ) -> List[BaseExperiment]:
1025
+ """Get list of all live experiments"""
1026
+ # Note: 'since' filter not applicable for live scheduler
1027
+ return list(self.experiments.values())
1028
+
1029
+ def get_experiment(self, experiment_id: str) -> Optional[BaseExperiment]:
1030
+ """Get a specific experiment by ID"""
1031
+ return self.experiments.get(experiment_id)
1032
+
1033
+ def get_experiment_runs(self, experiment_id: str) -> List[BaseExperiment]:
1034
+ """Get all runs for an experiment
1035
+
1036
+ For a live scheduler, returns the live experiment directly.
1037
+ """
1038
+ exp = self.experiments.get(experiment_id)
1039
+ if not exp:
1040
+ return []
1041
+
1042
+ # Return the live experiment (it already implements BaseExperiment)
1043
+ return [exp]
1044
+
1045
+ def get_current_run(self, experiment_id: str) -> Optional[str]:
1046
+ """Get the current run ID for an experiment"""
1047
+ exp = self.experiments.get(experiment_id)
1048
+ return exp.run_id if exp else None
1049
+
1050
+ def get_jobs(
1051
+ self,
1052
+ experiment_id: Optional[str] = None,
1053
+ run_id: Optional[str] = None, # noqa: ARG002 - not used in live scheduler
1054
+ task_id: Optional[str] = None,
1055
+ state: Optional[str] = None,
1056
+ tags: Optional[Dict[str, str]] = None,
1057
+ since: Optional[datetime] = None, # noqa: ARG002 - not used in live scheduler
1058
+ ) -> List[BaseJob]:
1059
+ """Query jobs with optional filters"""
1060
+ jobs: List[BaseJob] = list(self.jobs.values())
1061
+
1062
+ # Filter by experiment
1063
+ if experiment_id:
1064
+ exp = self.experiments.get(experiment_id)
1065
+ if exp:
1066
+ jobs = [j for j in jobs if j.experiments and exp in j.experiments]
1067
+ else:
1068
+ jobs = []
1069
+
1070
+ # Filter by task_id
1071
+ if task_id:
1072
+ jobs = [j for j in jobs if j.task_id == task_id]
1073
+
1074
+ # Filter by state
1075
+ if state:
1076
+ jobs = [j for j in jobs if j.state.name.lower() == state.lower()]
1077
+
1078
+ # Filter by tags (all tags must match)
1079
+ if tags:
1080
+ jobs = [j for j in jobs if all(j.tags.get(k) == v for k, v in tags.items())]
1081
+
1082
+ return jobs
1083
+
1084
+ def get_job(
1085
+ self,
1086
+ job_id: str,
1087
+ experiment_id: str, # noqa: ARG002 - job_id is sufficient in live scheduler
1088
+ run_id: Optional[str] = None, # noqa: ARG002 - job_id is sufficient in live scheduler
1089
+ ) -> Optional[BaseJob]:
1090
+ """Get a specific job"""
1091
+ return self.jobs.get(job_id)
1092
+
1093
+ def get_all_jobs(
1094
+ self,
1095
+ state: Optional[str] = None,
1096
+ tags: Optional[Dict[str, str]] = None,
1097
+ since: Optional[datetime] = None, # noqa: ARG002 - not used in live scheduler
1098
+ ) -> List[BaseJob]:
1099
+ """Get all jobs across all experiments"""
1100
+ jobs: List[BaseJob] = list(self.jobs.values())
1101
+
1102
+ if state:
1103
+ jobs = [j for j in jobs if j.state.name.lower() == state.lower()]
1104
+
1105
+ if tags:
1106
+ jobs = [j for j in jobs if all(j.tags.get(k) == v for k, v in tags.items())]
1107
+
1108
+ return jobs
1109
+
1110
+ def get_services(
1111
+ self,
1112
+ experiment_id: Optional[str] = None,
1113
+ run_id: Optional[str] = None,
1114
+ ) -> List[BaseService]:
1115
+ """Get services for an experiment
1116
+
1117
+ Services are stored in the scheduler and persist after experiments finish.
1118
+ """
1119
+ if experiment_id is None:
1120
+ # Return all services from all experiments
1121
+ services = []
1122
+ for services_dict in self.services.values():
1123
+ services.extend(services_dict.values())
1124
+ return services
1125
+
1126
+ # Get services for specific experiment
1127
+ services = []
1128
+ if run_id is not None:
1129
+ # Specific run requested
1130
+ key = (experiment_id, run_id)
1131
+ services_dict = self.services.get(key, {})
1132
+ services = list(services_dict.values())
1133
+ else:
1134
+ # No run_id specified - return services from all runs of this experiment
1135
+ for (exp_id, _run_id), services_dict in self.services.items():
1136
+ if exp_id == experiment_id:
1137
+ services.extend(services_dict.values())
1138
+
1139
+ logger.debug(
1140
+ "get_services(%s, %s): returning %d services",
1141
+ experiment_id,
1142
+ run_id,
1143
+ len(services),
1144
+ )
1145
+ return services
1146
+
1147
+ def get_tags_map(
1148
+ self,
1149
+ experiment_id: str,
1150
+ run_id: Optional[str] = None,
1151
+ ) -> dict[str, dict[str, str]]:
1152
+ """Get tags map for jobs in an experiment/run
1153
+
1154
+ Returns a map from job_id to {tag_key: tag_value}.
1155
+ """
1156
+ exp = self.experiments.get(experiment_id)
1157
+ if not exp:
1158
+ return {}
1159
+
1160
+ # Use current run if not specified
1161
+ if run_id is None:
1162
+ run_id = exp.run_id
1163
+
1164
+ exp_run_key = (experiment_id, run_id)
1165
+ return self._tags_map.get(exp_run_key, {})
1166
+
1167
+ def get_dependencies_map(
1168
+ self,
1169
+ experiment_id: str,
1170
+ run_id: Optional[str] = None,
1171
+ ) -> dict[str, list[str]]:
1172
+ """Get dependencies map for jobs in an experiment/run
1173
+
1174
+ Returns a map from job_id to list of job_ids it depends on.
1175
+ """
1176
+ exp = self.experiments.get(experiment_id)
1177
+ if not exp:
1178
+ return {}
1179
+
1180
+ # Use current run if not specified
1181
+ if run_id is None:
1182
+ run_id = exp.run_id
1183
+
1184
+ exp_run_key = (experiment_id, run_id)
1185
+ return self._dependencies_map.get(exp_run_key, {})
1186
+
1187
+ def kill_job(self, job: BaseJob, perform: bool = False) -> bool:
1188
+ """Kill a running job
1189
+
1190
+ For the scheduler, this is a live operation.
1191
+ """
1192
+ if not perform:
1193
+ # Just check if the job can be killed
1194
+ return job.state == JobState.RUNNING
1195
+
1196
+ if job.state != JobState.RUNNING:
1197
+ return False
1198
+
1199
+ # Get the actual Job from our jobs dict
1200
+ actual_job = self.jobs.get(job.identifier)
1201
+ if actual_job is None:
1202
+ return False
1203
+
1204
+ # Try to kill the process via the process attribute
1205
+ process = getattr(actual_job, "process", None)
1206
+ if process is not None:
1207
+ try:
1208
+ process.kill()
1209
+ return True
1210
+ except Exception:
1211
+ logger.exception("Failed to kill job %s", job.identifier)
1212
+ return False
1213
+
1214
+ def clean_job(
1215
+ self,
1216
+ job: BaseJob, # noqa: ARG002
1217
+ perform: bool = False, # noqa: ARG002
1218
+ ) -> bool:
1219
+ """Clean a finished job
1220
+
1221
+ For the scheduler, jobs are automatically cleaned when they finish.
1222
+ """
1223
+ # Live scheduler doesn't support cleaning jobs
1224
+ return False
1225
+
1226
+ def get_process_info(self, job: BaseJob):
1227
+ """Get process information for a job
1228
+
1229
+ For the scheduler, we can access the actual Job and read its PID file.
1230
+ """
1231
+ from experimaestro.scheduler.state_provider import ProcessInfo
1232
+
1233
+ # Get the actual Job from our jobs dict
1234
+ actual_job = self.jobs.get(job.identifier)
1235
+ if actual_job is None:
1236
+ return None
1237
+
1238
+ # Try to read the PID file
1239
+ try:
1240
+ pidpath = getattr(actual_job, "pidpath", None)
1241
+ if pidpath is None or not pidpath.exists():
1242
+ return None
1243
+
1244
+ pinfo = json.loads(pidpath.read_text())
1245
+ pid = pinfo.get("pid")
1246
+ proc_type = pinfo.get("type", "unknown")
1247
+
1248
+ if pid is None:
1249
+ return None
1250
+
1251
+ # Check if running based on job state
1252
+ running = actual_job.state == JobState.RUNNING
1253
+
1254
+ return ProcessInfo(pid=pid, type=proc_type, running=running)
1255
+ except Exception:
1256
+ return None
1257
+
1258
+ def close(self) -> None:
1259
+ """Close the state provider and clean up resources"""
1260
+ # Stop all job event readers
1261
+ self._stop_job_event_reader()
1262
+
1263
+ @property
1264
+ def read_only(self) -> bool:
1265
+ """Live scheduler is read-write"""
1266
+ return False
1267
+
1268
+ @property
1269
+ def is_remote(self) -> bool:
1270
+ """Live scheduler is local"""
1271
+ return False