experimaestro-2.0.0b8-py3-none-any.whl → experimaestro-2.0.0b17-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of experimaestro might be problematic. Click here for more details.

Files changed (152)
  1. experimaestro/__init__.py +12 -5
  2. experimaestro/cli/__init__.py +239 -126
  3. experimaestro/cli/filter.py +48 -23
  4. experimaestro/cli/jobs.py +253 -71
  5. experimaestro/cli/refactor.py +1 -2
  6. experimaestro/commandline.py +7 -4
  7. experimaestro/connectors/__init__.py +9 -1
  8. experimaestro/connectors/local.py +43 -3
  9. experimaestro/core/arguments.py +18 -18
  10. experimaestro/core/identifier.py +11 -11
  11. experimaestro/core/objects/config.py +96 -39
  12. experimaestro/core/objects/config_walk.py +3 -3
  13. experimaestro/core/{subparameters.py → partial.py} +16 -16
  14. experimaestro/core/partial_lock.py +394 -0
  15. experimaestro/core/types.py +12 -15
  16. experimaestro/dynamic.py +290 -0
  17. experimaestro/experiments/__init__.py +6 -2
  18. experimaestro/experiments/cli.py +217 -50
  19. experimaestro/experiments/configuration.py +24 -0
  20. experimaestro/generators.py +5 -5
  21. experimaestro/ipc.py +118 -1
  22. experimaestro/launcherfinder/__init__.py +2 -2
  23. experimaestro/launcherfinder/registry.py +6 -7
  24. experimaestro/launcherfinder/specs.py +2 -9
  25. experimaestro/launchers/slurm/__init__.py +2 -2
  26. experimaestro/launchers/slurm/base.py +62 -0
  27. experimaestro/locking.py +957 -1
  28. experimaestro/notifications.py +89 -201
  29. experimaestro/progress.py +63 -366
  30. experimaestro/rpyc.py +0 -2
  31. experimaestro/run.py +29 -2
  32. experimaestro/scheduler/__init__.py +8 -1
  33. experimaestro/scheduler/base.py +629 -53
  34. experimaestro/scheduler/dependencies.py +20 -16
  35. experimaestro/scheduler/experiment.py +732 -167
  36. experimaestro/scheduler/interfaces.py +316 -101
  37. experimaestro/scheduler/jobs.py +58 -20
  38. experimaestro/scheduler/remote/adaptive_sync.py +265 -0
  39. experimaestro/scheduler/remote/client.py +171 -117
  40. experimaestro/scheduler/remote/protocol.py +8 -193
  41. experimaestro/scheduler/remote/server.py +95 -71
  42. experimaestro/scheduler/services.py +53 -28
  43. experimaestro/scheduler/state_provider.py +663 -2430
  44. experimaestro/scheduler/state_status.py +1247 -0
  45. experimaestro/scheduler/transient.py +31 -0
  46. experimaestro/scheduler/workspace.py +1 -1
  47. experimaestro/scheduler/workspace_state_provider.py +1273 -0
  48. experimaestro/scriptbuilder.py +4 -4
  49. experimaestro/settings.py +36 -0
  50. experimaestro/tests/conftest.py +33 -5
  51. experimaestro/tests/connectors/bin/executable.py +1 -1
  52. experimaestro/tests/fixtures/pre_experiment/experiment_check_env.py +16 -0
  53. experimaestro/tests/fixtures/pre_experiment/experiment_check_mock.py +14 -0
  54. experimaestro/tests/fixtures/pre_experiment/experiment_simple.py +12 -0
  55. experimaestro/tests/fixtures/pre_experiment/pre_setup_env.py +5 -0
  56. experimaestro/tests/fixtures/pre_experiment/pre_setup_error.py +3 -0
  57. experimaestro/tests/fixtures/pre_experiment/pre_setup_mock.py +8 -0
  58. experimaestro/tests/launchers/bin/test.py +1 -0
  59. experimaestro/tests/launchers/test_slurm.py +9 -9
  60. experimaestro/tests/partial_reschedule.py +46 -0
  61. experimaestro/tests/restart.py +3 -3
  62. experimaestro/tests/restart_main.py +1 -0
  63. experimaestro/tests/scripts/notifyandwait.py +1 -0
  64. experimaestro/tests/task_partial.py +38 -0
  65. experimaestro/tests/task_tokens.py +2 -2
  66. experimaestro/tests/tasks/test_dynamic.py +6 -6
  67. experimaestro/tests/test_dependencies.py +3 -3
  68. experimaestro/tests/test_deprecated.py +15 -15
  69. experimaestro/tests/test_dynamic_locking.py +317 -0
  70. experimaestro/tests/test_environment.py +24 -14
  71. experimaestro/tests/test_experiment.py +171 -36
  72. experimaestro/tests/test_identifier.py +25 -25
  73. experimaestro/tests/test_identifier_stability.py +3 -5
  74. experimaestro/tests/test_multitoken.py +2 -4
  75. experimaestro/tests/{test_subparameters.py → test_partial.py} +25 -25
  76. experimaestro/tests/test_partial_paths.py +81 -138
  77. experimaestro/tests/test_pre_experiment.py +219 -0
  78. experimaestro/tests/test_progress.py +2 -8
  79. experimaestro/tests/test_remote_state.py +560 -99
  80. experimaestro/tests/test_stray_jobs.py +261 -0
  81. experimaestro/tests/test_tasks.py +1 -2
  82. experimaestro/tests/test_token_locking.py +52 -67
  83. experimaestro/tests/test_tokens.py +5 -6
  84. experimaestro/tests/test_transient.py +225 -0
  85. experimaestro/tests/test_workspace_state_provider.py +768 -0
  86. experimaestro/tests/token_reschedule.py +1 -3
  87. experimaestro/tests/utils.py +2 -7
  88. experimaestro/tokens.py +227 -372
  89. experimaestro/tools/diff.py +1 -0
  90. experimaestro/tools/documentation.py +4 -5
  91. experimaestro/tools/jobs.py +1 -2
  92. experimaestro/tui/app.py +438 -1966
  93. experimaestro/tui/app.tcss +162 -0
  94. experimaestro/tui/dialogs.py +172 -0
  95. experimaestro/tui/log_viewer.py +253 -3
  96. experimaestro/tui/messages.py +137 -0
  97. experimaestro/tui/utils.py +54 -0
  98. experimaestro/tui/widgets/__init__.py +23 -0
  99. experimaestro/tui/widgets/experiments.py +468 -0
  100. experimaestro/tui/widgets/global_services.py +238 -0
  101. experimaestro/tui/widgets/jobs.py +972 -0
  102. experimaestro/tui/widgets/log.py +156 -0
  103. experimaestro/tui/widgets/orphans.py +363 -0
  104. experimaestro/tui/widgets/runs.py +185 -0
  105. experimaestro/tui/widgets/services.py +314 -0
  106. experimaestro/tui/widgets/stray_jobs.py +528 -0
  107. experimaestro/utils/__init__.py +1 -1
  108. experimaestro/utils/environment.py +105 -22
  109. experimaestro/utils/fswatcher.py +124 -0
  110. experimaestro/utils/jobs.py +1 -2
  111. experimaestro/utils/jupyter.py +1 -2
  112. experimaestro/utils/logging.py +72 -0
  113. experimaestro/version.py +2 -2
  114. experimaestro/webui/__init__.py +9 -0
  115. experimaestro/webui/app.py +117 -0
  116. experimaestro/{server → webui}/data/index.css +66 -11
  117. experimaestro/webui/data/index.css.map +1 -0
  118. experimaestro/{server → webui}/data/index.js +82763 -87217
  119. experimaestro/webui/data/index.js.map +1 -0
  120. experimaestro/webui/routes/__init__.py +5 -0
  121. experimaestro/webui/routes/auth.py +53 -0
  122. experimaestro/webui/routes/proxy.py +117 -0
  123. experimaestro/webui/server.py +200 -0
  124. experimaestro/webui/state_bridge.py +152 -0
  125. experimaestro/webui/websocket.py +413 -0
  126. {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/METADATA +5 -6
  127. experimaestro-2.0.0b17.dist-info/RECORD +219 -0
  128. experimaestro/cli/progress.py +0 -269
  129. experimaestro/scheduler/state.py +0 -75
  130. experimaestro/scheduler/state_db.py +0 -437
  131. experimaestro/scheduler/state_sync.py +0 -891
  132. experimaestro/server/__init__.py +0 -467
  133. experimaestro/server/data/index.css.map +0 -1
  134. experimaestro/server/data/index.js.map +0 -1
  135. experimaestro/tests/test_cli_jobs.py +0 -615
  136. experimaestro/tests/test_file_progress.py +0 -425
  137. experimaestro/tests/test_file_progress_integration.py +0 -477
  138. experimaestro/tests/test_state_db.py +0 -434
  139. experimaestro-2.0.0b8.dist-info/RECORD +0 -187
  140. /experimaestro/{server → webui}/data/1815e00441357e01619e.ttf +0 -0
  141. /experimaestro/{server → webui}/data/2463b90d9a316e4e5294.woff2 +0 -0
  142. /experimaestro/{server → webui}/data/2582b0e4bcf85eceead0.ttf +0 -0
  143. /experimaestro/{server → webui}/data/89999bdf5d835c012025.woff2 +0 -0
  144. /experimaestro/{server → webui}/data/914997e1bdfc990d0897.ttf +0 -0
  145. /experimaestro/{server → webui}/data/c210719e60948b211a12.woff2 +0 -0
  146. /experimaestro/{server → webui}/data/favicon.ico +0 -0
  147. /experimaestro/{server → webui}/data/index.html +0 -0
  148. /experimaestro/{server → webui}/data/login.html +0 -0
  149. /experimaestro/{server → webui}/data/manifest.json +0 -0
  150. {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/WHEEL +0 -0
  151. {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/entry_points.txt +0 -0
  152. {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/licenses/LICENSE +0 -0
@@ -1,11 +1,12 @@
1
1
  import asyncio
2
+ import inspect
2
3
  import json
3
4
  import logging
4
5
  import os
5
6
  from pathlib import Path
6
7
  import time
7
8
  from shutil import rmtree
8
- from typing import Any, Dict, Optional, TypeVar, Union
9
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeVar, Union
9
10
 
10
11
  from experimaestro.core.objects import WatchedOutput
11
12
  from experimaestro.exceptions import HandledException
@@ -14,9 +15,19 @@ from experimaestro.scheduler.signal_handler import SIGNAL_HANDLER
14
15
  from experimaestro.scheduler.jobs import Job
15
16
  from experimaestro.scheduler.services import Service
16
17
  from experimaestro.scheduler.workspace import RunMode, Workspace
17
- from experimaestro.settings import WorkspaceSettings, get_settings
18
+ from experimaestro.scheduler.interfaces import (
19
+ BaseExperiment,
20
+ BaseService,
21
+ ExperimentJobInformation,
22
+ )
23
+ from experimaestro.settings import WorkspaceSettings, get_settings, HistorySettings
24
+ from experimaestro.experiments.configuration import DirtyGitAction
18
25
  from experimaestro.utils import logger
19
26
 
27
+ if TYPE_CHECKING:
28
+ from experimaestro.scheduler.interfaces import ExperimentStatus
29
+ from experimaestro.scheduler.state_status import ExperimentEventWriter
30
+
20
31
  ServiceClass = TypeVar("ServiceClass", bound=Service)
21
32
 
22
33
 
@@ -26,11 +37,46 @@ class FailedExperiment(HandledException):
26
37
  pass
27
38
 
28
39
 
29
- class DatabaseListener:
30
- """Listener that updates job state in the database"""
40
+ class DirtyGitError(HandledException):
41
+ """Raised when the git repository has uncommitted changes and dirty_git=error"""
42
+
43
+ pass
44
+
45
+
46
+ class GracefulExperimentExit(Exception):
47
+ """Raised to exit an experiment context without waiting for running jobs.
48
+
49
+ This is useful in tests or when you want to detach from an experiment
50
+ while keeping jobs running (e.g., to test stray job detection).
51
+
52
+ Example::
53
+
54
+ with experiment(workdir, "my-experiment") as xp:
55
+ task = MyTask.C(value=1).submit()
56
+ # Wait for task to start...
57
+ raise GracefulExperimentExit() # Exit without waiting for task to finish
58
+ """
31
59
 
32
- def __init__(self, state_provider, experiment_id: str, run_id: str):
33
- self.state_provider = state_provider
60
+ pass
61
+
62
+
63
+ class StateListener:
64
+ """Listener that writes events to filesystem
65
+
66
+ Job state events are written to per-job event files by the scheduler.
67
+ This listener writes experiment-level events (job state, services) to
68
+ the experiment event file.
69
+ """
70
+
71
+ def __init__(
72
+ self,
73
+ event_writer: "ExperimentEventWriter",
74
+ experiment: "experiment",
75
+ experiment_id: str,
76
+ run_id: str,
77
+ ):
78
+ self.event_writer = event_writer
79
+ self.experiment = experiment
34
80
  self.experiment_id = experiment_id
35
81
  self.run_id = run_id
36
82
 
@@ -39,34 +85,65 @@ class DatabaseListener:
39
85
  pass
40
86
 
41
87
  def job_state(self, job):
42
- """Update job state in database"""
43
- self.state_provider.update_job_state(job, self.experiment_id, self.run_id)
88
+ """Write job state change event to experiment event file"""
89
+ from .state_status import JobStateChangedEvent
90
+
91
+ # Get failure reason if error state
92
+ failure_reason = None
93
+ if hasattr(job.state, "failure_reason") and job.state.failure_reason:
94
+ failure_reason = job.state.failure_reason.name
95
+
96
+ # Get progress as list of dicts
97
+ progress = []
98
+ if hasattr(job, "_progress") and job._progress:
99
+ progress = [
100
+ {"level": p.level, "progress": p.progress, "desc": p.desc}
101
+ for p in job._progress
102
+ ]
103
+
104
+ event = JobStateChangedEvent(
105
+ job_id=job.identifier,
106
+ state=job.state.name,
107
+ failure_reason=failure_reason,
108
+ submitted_time=job.submittime,
109
+ started_time=job.starttime,
110
+ ended_time=job.endtime,
111
+ exit_code=getattr(job, "exit_code", None),
112
+ retry_count=getattr(job, "retry_count", 0),
113
+ progress=progress,
114
+ )
115
+ # Write to experiment event file
116
+ self.event_writer.write_event(event)
44
117
 
45
118
  def service_add(self, service):
46
- """Register service in database"""
119
+ """Write service added event to filesystem"""
47
120
  from experimaestro.scheduler.services import Service
48
-
49
- state_dict = Service.serialize_state_dict(service._full_state_dict())
50
- self.state_provider.register_service(
51
- service.id,
52
- self.experiment_id,
53
- self.run_id,
54
- service.description(),
55
- state_dict=json.dumps(state_dict),
121
+ from .state_status import ServiceAddedEvent
122
+
123
+ state_dict = Service.serialize_state_dict(service.state_dict())
124
+ service_class = f"{service.__class__.__module__}.{service.__class__.__name__}"
125
+ event = ServiceAddedEvent(
126
+ service_id=service.id,
127
+ description=service.description(),
128
+ service_class=service_class,
129
+ state_dict=state_dict,
56
130
  )
131
+ self.event_writer.write_event(event)
57
132
 
58
133
  def service_state_changed(self, service):
59
134
  """Called when service state changes (runtime only, not persisted)"""
60
- # Service state is managed at runtime, not persisted to DB
135
+ # Service state is managed at runtime, not persisted
61
136
  pass
62
137
 
63
138
 
64
- class experiment:
139
+ class experiment(BaseExperiment):
65
140
  """Context manager for running experiments.
66
141
 
67
142
  Creates a workspace, manages task submission, and optionally starts
68
143
  a web server for monitoring.
69
144
 
145
+ Implements BaseExperiment interface for use with StateProvider and TUI.
146
+
70
147
  Example::
71
148
 
72
149
  from experimaestro import experiment
@@ -99,6 +176,10 @@ class experiment:
99
176
  run_mode: Optional[RunMode] = None,
100
177
  launcher=None,
101
178
  register_signals: bool = True,
179
+ project_paths: Optional[list[Path]] = None,
180
+ wait_for_quit: bool = False,
181
+ dirty_git: DirtyGitAction = DirtyGitAction.WARN,
182
+ no_db: bool = False,
102
183
  ):
103
184
  """
104
185
  :param env: an environment -- or a working directory for a local
@@ -118,10 +199,60 @@ class experiment:
118
199
 
119
200
  :param register_signals: Whether to register signal handlers (default: True).
120
201
  Set to False when running in a background thread.
202
+
203
+ :param project_paths: Paths to the project files (for git info). If not
204
+ provided, will be inferred from the caller's location.
205
+
206
+ :param wait_for_quit: Deprecated, no longer used. Web server is no longer
207
+ started automatically.
208
+
209
+ :param dirty_git: Action when git repository has uncommitted changes:
210
+ DirtyGitAction.IGNORE (don't check), DirtyGitAction.WARN (log warning,
211
+ default), or DirtyGitAction.ERROR (raise exception).
212
+
213
+ :param no_db: Deprecated, kept for backwards compatibility. This parameter
214
+ is now a no-op as the database has been replaced with filesystem-based
215
+ state tracking.
216
+
217
+ .. deprecated::
218
+ The ``host``, ``port``, ``token``, and ``wait_for_quit`` parameters are
219
+ deprecated. Use ``--web`` flag with ``run-experiment`` CLI or start the
220
+ web server separately.
121
221
  """
222
+ import warnings
122
223
 
123
224
  from experimaestro.scheduler import Listener, Scheduler
124
225
 
226
+ # Warn about deprecated server parameters
227
+ if host is not None:
228
+ warnings.warn(
229
+ "The 'host' parameter is deprecated. Use '--web' flag with "
230
+ "'run-experiment' CLI or start the web server separately.",
231
+ DeprecationWarning,
232
+ stacklevel=2,
233
+ )
234
+ if port is not None:
235
+ warnings.warn(
236
+ "The 'port' parameter is deprecated. Use '--web' flag with "
237
+ "'run-experiment' CLI or start the web server separately.",
238
+ DeprecationWarning,
239
+ stacklevel=2,
240
+ )
241
+ if token is not None:
242
+ warnings.warn(
243
+ "The 'token' parameter is deprecated. Use '--web' flag with "
244
+ "'run-experiment' CLI or start the web server separately.",
245
+ DeprecationWarning,
246
+ stacklevel=2,
247
+ )
248
+ if wait_for_quit:
249
+ warnings.warn(
250
+ "The 'wait_for_quit' parameter is deprecated. Use '--web' flag with "
251
+ "'run-experiment' CLI or start the web server separately.",
252
+ DeprecationWarning,
253
+ stacklevel=2,
254
+ )
255
+
125
256
  settings = get_settings()
126
257
  if not isinstance(env, WorkspaceSettings):
127
258
  env = WorkspaceSettings(id=None, path=Path(env))
@@ -130,36 +261,45 @@ class experiment:
130
261
  run_mode = run_mode or RunMode.NORMAL
131
262
  self.workspace = Workspace(settings, env, launcher=launcher, run_mode=run_mode)
132
263
 
133
- # Mark the directory has an experimaestro folder
134
- self.workdir = self.workspace.experimentspath / name
135
- self.workdir.mkdir(parents=True, exist_ok=True)
136
- self.xplockpath = self.workdir / "lock"
264
+ # Store experiment name for ID references
265
+ self.name = name
266
+
267
+ # Create experiment base directory (run directories will be created inside)
268
+ self._experiment_base = self.workspace.experimentspath / name
269
+ self._experiment_base.mkdir(parents=True, exist_ok=True)
270
+
271
+ # Lock is at experiment level (prevents concurrent runs of same experiment)
272
+ self.xplockpath = self._experiment_base / "lock"
273
+
274
+ # workdir will be set in __enter__ after run_id is generated
275
+ self.workdir = None
137
276
  self.xplock = None
138
277
  self.old_experiment = None
139
- self.services: Dict[str, Service] = {}
278
+ self._services: Dict[str, Service] = {}
140
279
  self._job_listener: Optional[Listener] = None
141
280
  self._register_signals = register_signals
281
+ self._dirty_git = dirty_git
282
+ self._no_db = no_db
142
283
 
143
- # Get configuration settings
144
-
145
- if host is not None:
146
- settings.server.host = host
147
-
148
- if port is not None:
149
- settings.server.port = port
150
-
151
- if token is not None:
152
- settings.server.token = token
284
+ # Capture project paths for git info
285
+ if project_paths is not None:
286
+ self._project_paths = project_paths
287
+ else:
288
+ # Fall back to caller's file path
289
+ self._project_paths = []
290
+ try:
291
+ # Go up the stack to find the first frame outside this module
292
+ for frame_info in inspect.stack():
293
+ frame_file = frame_info.filename
294
+ if "experimaestro" not in frame_file:
295
+ self._project_paths = [Path(frame_file).resolve().parent]
296
+ break
297
+ except Exception:
298
+ pass
153
299
 
154
300
  # Use singleton scheduler
155
301
  self.scheduler = Scheduler.instance()
156
302
 
157
- # Determine if we need a server
158
- self._needs_server = (
159
- settings.server.port is not None and settings.server.port >= 0
160
- ) and self.workspace.run_mode == RunMode.NORMAL
161
- self._server_settings = settings.server if self._needs_server else None
162
-
163
303
  if os.environ.get("XPM_ENABLEFAULTHANDLER", "0") == "1":
164
304
  import faulthandler
165
305
 
@@ -197,50 +337,76 @@ class experiment:
197
337
  """Return the directory in which results can be stored for this experiment"""
198
338
  return self.workdir / "jobs"
199
339
 
340
+ # =========================================================================
341
+ # BaseExperiment interface properties
342
+ # =========================================================================
343
+
200
344
  @property
201
- def alt_jobspaths(self):
202
- """Return potential other directories"""
203
- for alt_workdir in self.workspace.alt_workdirs:
204
- yield alt_workdir / "jobs"
345
+ def experiment_id(self) -> str:
346
+ """Experiment identifier (overrides BaseExperiment.experiment_id)"""
347
+ return self.name
205
348
 
206
349
  @property
207
- def jobsbakpath(self):
208
- """Return the directory in which results can be stored for this experiment"""
209
- return self.workdir / "jobs.bak"
350
+ def status(self) -> "ExperimentStatus":
351
+ """Experiment status - RUNNING for live experiments, updated on finalization"""
352
+ from experimaestro.scheduler.interfaces import ExperimentStatus
353
+
354
+ return getattr(self, "_status", ExperimentStatus.RUNNING)
210
355
 
211
356
  @property
212
- def jobs_jsonl_path(self):
213
- """Return the path to the jobs.jsonl file for this experiment"""
214
- return self.workdir / "jobs.jsonl"
357
+ def jobs(self) -> Dict[str, "Job"]:
358
+ """Jobs in this experiment"""
359
+ return {
360
+ job.identifier: job
361
+ for job in self.scheduler.jobs.values()
362
+ if self in job.experiments
363
+ }
215
364
 
216
365
  @property
217
- def services_json_path(self):
218
- """Return the path to the services.json file for this experiment"""
219
- return self.workdir / "services.json"
366
+ def tags(self) -> Dict[str, Dict[str, str]]:
367
+ """Tags for jobs - tracked directly in experiment"""
368
+ return self._tags
220
369
 
221
- def _write_services_json(self):
222
- """Write all services to services.json file"""
223
- from experimaestro.scheduler.services import Service
370
+ @property
371
+ def dependencies(self) -> Dict[str, List[str]]:
372
+ """Job dependencies - tracked directly in experiment"""
373
+ return self._dependencies
224
374
 
225
- services_data = {}
226
- for service_id, service in self.services.items():
227
- # Get state_dict from service (includes __class__ for recreation)
228
- # and serialize paths to JSON-compatible format
229
- service_state = Service.serialize_state_dict(service._full_state_dict())
230
- # Add runtime state info
231
- service_state.update(
232
- {
233
- "service_id": service_id,
234
- "description": service.description(),
235
- "state": service.state.name,
236
- "url": getattr(service, "url", None),
237
- "timestamp": time.time(),
238
- }
239
- )
240
- services_data[service_id] = service_state
375
+ @property
376
+ def events_count(self) -> int:
377
+ """Number of events processed"""
378
+ return self._events_count
379
+
380
+ @property
381
+ def started_at(self) -> Optional[float]:
382
+ """Timestamp when experiment started"""
383
+ return self._started_at
384
+
385
+ @property
386
+ def ended_at(self) -> Optional[float]:
387
+ """Timestamp when experiment ended (None if still running)"""
388
+ return self._ended_at
241
389
 
242
- with self.services_json_path.open("w") as f:
243
- json.dump(services_data, f, indent=2)
390
+ @property
391
+ def hostname(self) -> Optional[str]:
392
+ """Hostname where experiment is running"""
393
+ return self._hostname
394
+
395
+ @property
396
+ def services(self) -> Dict[str, "BaseService"]:
397
+ """Services in this experiment"""
398
+ return self._services
399
+
400
+ @property
401
+ def alt_jobspaths(self):
402
+ """Return potential other directories"""
403
+ for alt_workdir in self.workspace.alt_workdirs:
404
+ yield alt_workdir / "jobs"
405
+
406
+ @property
407
+ def jobs_jsonl_path(self):
408
+ """Return the path to the jobs.jsonl file for this experiment"""
409
+ return self.workdir / "jobs.jsonl"
244
410
 
245
411
  def add_job(self, job: "Job"):
246
412
  """Register a job and its tags to jobs.jsonl file and database
@@ -266,24 +432,76 @@ class experiment:
266
432
  logging.debug(
267
433
  "Job %s already running, unfinished jobs for %s: %d",
268
434
  job.identifier[:8],
269
- self.workdir.name,
435
+ self.name,
270
436
  self.unfinishedJobs,
271
437
  )
272
438
 
273
- record = {
274
- "job_id": job.identifier,
275
- "task_id": str(job.type.identifier),
276
- "tags": dict(job.tags.items()) if job.tags else {},
277
- "timestamp": time.time(),
278
- }
439
+ job_info = ExperimentJobInformation(
440
+ job_id=job.identifier,
441
+ task_id=str(job.type.identifier),
442
+ tags=dict(job.tags.items()) if job.tags else {},
443
+ timestamp=time.time(),
444
+ )
279
445
 
280
446
  with self.jobs_jsonl_path.open("a") as f:
281
- f.write(json.dumps(record) + "\n")
447
+ f.write(json.dumps(job_info.to_dict()) + "\n")
448
+
449
+ # Write job submitted event to filesystem (only in NORMAL mode)
450
+ if self._event_writer is not None:
451
+ from .state_status import JobSubmittedEvent
452
+
453
+ # Get dependency job IDs
454
+ depends_on = []
455
+ if hasattr(job, "dependencies"):
456
+ for dep in job.dependencies:
457
+ if hasattr(dep, "identifier"):
458
+ depends_on.append(dep.identifier)
459
+
460
+ job_tags = dict(job.tags.items()) if job.tags else {}
461
+ event = JobSubmittedEvent(
462
+ job_id=job.identifier,
463
+ task_id=str(job.type.identifier),
464
+ transient=job.transient.value if hasattr(job, "transient") else 0,
465
+ tags=job_tags,
466
+ depends_on=depends_on,
467
+ )
468
+ self._event_writer.write_event(event)
282
469
 
283
- # Also register in database for TUI/monitoring (only in NORMAL mode)
284
- if self._db_listener is not None:
285
- experiment_id = self.workdir.name
286
- self.state_provider.update_job_submitted(job, experiment_id, self.run_id)
470
+ # Track tags and dependencies directly in experiment
471
+ if job_tags:
472
+ self._tags[job.identifier] = job_tags
473
+ if depends_on:
474
+ self._dependencies[job.identifier] = depends_on
475
+
476
+ def _finalize_run(self, status: str) -> None:
477
+ """Finalize the run: write final status.json and archive event files
478
+
479
+ Args:
480
+ status: Final status ("completed" or "failed")
481
+ """
482
+ from datetime import datetime
483
+ from experimaestro.scheduler.interfaces import ExperimentStatus
484
+ from .state_status import RunCompletedEvent
485
+
486
+ # Update final status in the experiment
487
+ self._ended_at = datetime.now().timestamp()
488
+ if status in ("completed", "done"):
489
+ self._status = ExperimentStatus.DONE
490
+ elif status == "failed":
491
+ self._status = ExperimentStatus.FAILED
492
+
493
+ # Write RunCompletedEvent before closing the event writer
494
+ event = RunCompletedEvent(status=status, ended_at=datetime.now().isoformat())
495
+ self._event_writer.write_event(event)
496
+
497
+ # Close the event writer to flush any buffered events
498
+ self._event_writer.close()
499
+
500
+ # Write final status.json using write_status()
501
+ self.write_status()
502
+
503
+ # Archive event files to permanent storage
504
+ self._event_writer.archive_events()
287
505
 
288
506
  def stop(self):
289
507
  """Stop the experiment as soon as possible"""
@@ -365,69 +583,165 @@ class experiment:
365
583
  return self.workspace.connector.createtoken(name, count)
366
584
 
367
585
  def __enter__(self):
586
+ from datetime import datetime
368
587
  from .dynamic_outputs import TaskOutputsWorker
369
- from experimaestro.utils.environment import save_environment_info
588
+ from experimaestro.utils.environment import (
589
+ ExperimentEnvironment,
590
+ ExperimentRunInfo,
591
+ )
592
+
593
+ # Check for old experiment layout and warn
594
+ old_xp_dir = self.workspace.path / "xp"
595
+ if old_xp_dir.exists() and old_xp_dir.is_dir():
596
+ logger.warning(
597
+ "Experimaestro v2 has a modified experiment file layout. "
598
+ "DO NOT use experimaestro v1 to cleanup orphans. "
599
+ "You can use 'experimaestro migrate v1-to-v2 %s' to migrate old experiment "
600
+ "folders to the new structure.",
601
+ self.workspace.path,
602
+ )
370
603
 
371
- if self.workspace.run_mode != RunMode.DRY_RUN:
604
+ # Only lock and save environment in NORMAL mode
605
+ if self.workspace.run_mode == RunMode.NORMAL:
372
606
  logger.info("Locking experiment %s", self.xplockpath)
373
- self.xplock = self.workspace.connector.lock(self.xplockpath, 0).__enter__()
607
+ lock = self.workspace.connector.lock(self.xplockpath, 0)
608
+
609
+ # Try non-blocking first to check if lock is held
610
+ if not lock.acquire(blocking=False):
611
+ # Lock is held - try to find hostname from latest run's environment.json
612
+ hostname = None
613
+ try:
614
+ # Find the most recent run directory
615
+ run_dirs = sorted(
616
+ [d for d in self._experiment_base.iterdir() if d.is_dir()],
617
+ key=lambda d: d.stat().st_mtime,
618
+ reverse=True,
619
+ )
620
+ if run_dirs:
621
+ env_path = run_dirs[0] / "environment.json"
622
+ if env_path.exists():
623
+ env = ExperimentEnvironment.load(env_path)
624
+ hostname = env.run.hostname if env.run else None
625
+ except Exception:
626
+ pass # Ignore errors when trying to find hostname
627
+ holder_info = f" (held by {hostname})" if hostname else ""
628
+ logger.warning(
629
+ "Experiment is locked%s, waiting for lock to be released...",
630
+ holder_info,
631
+ )
632
+ # Now wait for the lock
633
+ lock.acquire(blocking=True)
634
+
635
+ self.xplock = lock
374
636
  logger.info("Experiment locked")
375
637
 
376
- # Capture and save environment info (git info for editable packages + all package versions)
377
- if self.workspace.run_mode == RunMode.NORMAL:
378
- env_info_path = self.workdir / "environment.json"
379
- save_environment_info(env_info_path)
638
+ # Generate run_id with collision detection
639
+ now = datetime.now()
640
+ base_run_id = now.strftime("%Y%m%d_%H%M%S")
641
+ run_id = base_run_id
642
+ suffix = 1
643
+ while (self._experiment_base / run_id).exists():
644
+ run_id = f"{base_run_id}.{suffix}"
645
+ suffix += 1
646
+ self.run_id = run_id
380
647
 
381
- # Move old jobs into "jobs.bak"
382
- if self.workspace.run_mode == RunMode.NORMAL:
383
- self.jobsbakpath.mkdir(exist_ok=True)
384
- for p in self.jobspath.glob("*/*"):
385
- if p.is_symlink():
386
- target = self.jobsbakpath / p.relative_to(self.jobspath)
387
- if target.is_symlink():
388
- # Remove if duplicate
389
- p.unlink()
390
- else:
391
- # Rename otherwise
392
- target.parent.mkdir(parents=True, exist_ok=True)
393
- p.rename(target)
648
+ # Create the run-specific workdir
649
+ self.workdir = self._experiment_base / self.run_id
650
+ self.workdir.mkdir(parents=True, exist_ok=True)
651
+
652
+ # Capture and save environment info
653
+ from experimaestro.utils.git import get_git_info
654
+ from experimaestro.utils.environment import get_current_environment
655
+
656
+ env_info_path = self.workdir / "environment.json"
657
+ env = get_current_environment()
658
+
659
+ # Capture project git info from project paths
660
+ dirty_repos = []
661
+ for project_path in self._project_paths:
662
+ project_git = get_git_info(project_path)
663
+ if project_git:
664
+ env.projects.append(project_git)
665
+ # Track dirty repositories
666
+ if project_git.get("dirty"):
667
+ dirty_repos.append(project_git.get("path", str(project_path)))
668
+
669
+ # Handle dirty git repositories based on configured action
670
+ if dirty_repos and self._dirty_git != DirtyGitAction.IGNORE:
671
+ for repo_path in dirty_repos:
672
+ if self._dirty_git == DirtyGitAction.WARN:
673
+ logger.warning(
674
+ "Project repository has uncommitted changes: %s",
675
+ repo_path,
676
+ )
677
+ elif self._dirty_git == DirtyGitAction.ERROR:
678
+ # Release the lock before raising the error
679
+ raise DirtyGitError(
680
+ f"Project repository has uncommitted changes: {repo_path}"
681
+ )
682
+
683
+ env.save(env_info_path)
684
+ else:
685
+ # Non-NORMAL mode: use placeholder run_id and workdir
686
+ self.run_id = "dry-run"
687
+ self.workdir = self._experiment_base / self.run_id
688
+ self.workdir.mkdir(parents=True, exist_ok=True)
394
689
 
395
690
  # Register experiment with scheduler
396
691
  self.scheduler.register_experiment(self)
397
692
 
398
- # Start server via scheduler if needed
399
- if self._needs_server:
400
- self.scheduler.start_server(self._server_settings, workspace=self.workspace)
693
+ # Set experiment start time for BaseExperiment interface
694
+ self._started_at = time.time()
695
+ self._ended_at = None
401
696
 
402
697
  self.workspace.__enter__()
403
698
  (self.workspace.path / ".__experimaestro__").touch()
404
699
 
405
- # Initialize workspace state provider (singleton per workspace path)
406
- # Use read_only mode when not in NORMAL run mode to prevent DB changes
407
- from .state_provider import WorkspaceStateProvider
700
+ # Initialize filesystem-based state tracking (only in NORMAL mode)
701
+ from .state_status import ExperimentEventWriter
408
702
 
409
703
  is_normal_mode = self.workspace.run_mode == RunMode.NORMAL
410
- self.state_provider = WorkspaceStateProvider.get_instance(
411
- self.workspace.path,
412
- read_only=not is_normal_mode,
413
- sync_on_start=False, # Experiments don't sync on start
414
- )
704
+ self._event_writer = None
705
+ self._state_listener = None
706
+
707
+ # Track job tags and dependencies directly (no more StatusData)
708
+ self._tags: Dict[str, Dict[str, str]] = {}
709
+ self._dependencies: Dict[str, List[str]] = {}
710
+ self._events_count = 0
711
+ self._hostname: Optional[str] = None
712
+ self._started_at: Optional[float] = None
713
+ self._ended_at: Optional[float] = None
415
714
 
416
- # Register experiment in database and create a run (only in NORMAL mode)
417
- experiment_id = self.workdir.name
418
- self._db_listener = None
419
715
  if is_normal_mode:
420
- self.state_provider.ensure_experiment(experiment_id)
421
- self.run_id = self.state_provider.create_run(experiment_id)
716
+ import socket
717
+
718
+ # Create event writer for this experiment
719
+ # Events are written to experiments/{experiment_id}/events-{count}.jsonl
720
+ # Permanent storage: workdir/events/
721
+ self._event_writer = ExperimentEventWriter(self, self.workspace.path, 0)
722
+
723
+ # Initialize status.json for this run
724
+ self._hostname = socket.gethostname()
725
+ self._started_at = datetime.now().timestamp()
726
+ self._event_writer.init_status()
727
+
728
+ # Create symlink to current run
729
+ self._event_writer.create_symlink()
730
+
731
+ # Add run info to environment.json
732
+ env_path = self.workdir / "environment.json"
733
+ env = ExperimentEnvironment.load(env_path)
734
+ env.run = ExperimentRunInfo(
735
+ hostname=self._hostname,
736
+ started_at=datetime.now().isoformat(),
737
+ )
738
+ env.save(env_path)
422
739
 
423
- # Add database listener to update job state in database
424
- self._db_listener = DatabaseListener(
425
- self.state_provider, experiment_id, self.run_id
740
+ # Add state listener to write events to filesystem
741
+ self._state_listener = StateListener(
742
+ self._event_writer, self, self.name, self.run_id
426
743
  )
427
- self.scheduler.addlistener(self._db_listener)
428
- else:
429
- # In non-NORMAL modes, use a placeholder run_id
430
- self.run_id = None
744
+ self.scheduler.addlistener(self._state_listener)
431
745
 
432
746
  # Number of unfinished jobs
433
747
  self.unfinishedJobs = 0
@@ -452,19 +766,17 @@ class experiment:
452
766
 
453
767
  def __exit__(self, exc_type, exc_value, traceback):
454
768
  logger.debug("Exiting scheduler context")
455
- # If no exception and normal run mode, remove old "jobs"
456
- if self.workspace.run_mode == RunMode.NORMAL:
457
- if exc_type is None and self.jobsbakpath.is_dir():
458
- rmtree(self.jobsbakpath)
459
769
 
460
770
  # Close the different locks
461
771
  try:
462
- if exc_type:
772
+ if exc_type is GracefulExperimentExit:
773
+ # Graceful exit - don't wait for jobs, don't log error
774
+ logger.info("Graceful experiment exit - not waiting for running jobs")
775
+ elif exc_type:
463
776
  # import faulthandler
464
777
  # faulthandler.dump_traceback()
465
- logger.error(
466
- "Not waiting since an exception was thrown"
467
- " (some jobs may be running)"
778
+ logger.exception(
779
+ "Not waiting since an exception was thrown (some jobs may be running)"
468
780
  )
469
781
  else:
470
782
  self.wait()
@@ -481,20 +793,41 @@ class experiment:
481
793
  logger.info("Closing service %s", service.description())
482
794
  service.stop()
483
795
 
796
+ # Set end time for BaseExperiment interface
797
+ self._ended_at = time.time()
798
+
484
799
  # Unregister experiment from scheduler
485
800
  self.scheduler.unregister_experiment(self)
486
801
 
487
- # Remove database listener and mark run as completed (only in NORMAL mode)
488
- if self._db_listener is not None:
489
- self.scheduler.removelistener(self._db_listener)
490
-
491
- # Mark run as completed in database
492
- experiment_id = self.workdir.name
493
- status = "failed" if exc_type else "completed"
494
- self.state_provider.complete_run(experiment_id, self.run_id, status)
802
+ # Remove state listener and finalize run (only in NORMAL mode)
803
+ if exc_type is GracefulExperimentExit:
804
+ status = "detached" # Graceful exit, jobs may still be running
805
+ elif exc_type:
806
+ status = "failed"
807
+ else:
808
+ status = "completed"
809
+
810
+ if self._state_listener is not None:
811
+ self.scheduler.removelistener(self._state_listener)
812
+ self._finalize_run(status)
813
+
814
+ # Update environment.json with run status
815
+ if self.workspace.run_mode == RunMode.NORMAL and self.workdir:
816
+ from datetime import datetime
817
+ from experimaestro.utils.environment import ExperimentEnvironment
818
+
819
+ env_path = self.workdir / "environment.json"
820
+ if env_path.exists():
821
+ try:
822
+ env = ExperimentEnvironment.load(env_path)
823
+ if env.run:
824
+ env.run.ended_at = datetime.now().isoformat()
825
+ env.run.status = status
826
+ env.save(env_path)
827
+ except Exception as e:
828
+ logger.warning("Failed to update environment.json: %s", e)
495
829
 
496
830
  # Note: Don't stop scheduler - it's shared!
497
- # Note: Don't stop server - it runs in daemon mode until program exit
498
831
 
499
832
  if self.taskOutputsWorker is not None:
500
833
  logger.info("Stopping tasks outputs worker")
@@ -508,13 +841,39 @@ class experiment:
508
841
  experiment.CURRENT = self.old_experiment
509
842
 
510
843
  if self.workspace.run_mode == RunMode.NORMAL:
511
- # Write the state
512
- logging.info("Saving the experiment state")
513
- from experimaestro.scheduler.state import ExperimentState
844
+ # Remove job directories for transient jobs with REMOVE mode
845
+ if exc_type is None:
846
+ for job in list(self.scheduler.jobs.values()):
847
+ if (
848
+ self in job.experiments
849
+ and job.transient.should_remove
850
+ and job.state.finished()
851
+ ):
852
+ job_path = job.path
853
+ if job_path.exists():
854
+ logger.info(
855
+ "Removing transient job directory: %s", job_path
856
+ )
857
+ rmtree(job_path)
858
+ # Also remove the symlink in the experiment's jobs folder
859
+ symlink_path = self.jobspath / job.relpath
860
+ if symlink_path.is_symlink():
861
+ symlink_path.unlink()
862
+
863
+ # Cleanup old runs based on history settings
864
+ try:
865
+ cleanup_experiment_history(
866
+ self._experiment_base,
867
+ current_run_id=self.run_id,
868
+ current_status=status,
869
+ history=self._get_history_settings(),
870
+ )
871
+ except Exception as e:
872
+ logger.warning("Failed to cleanup old runs: %s", e)
514
873
 
515
- ExperimentState.save(
516
- self.workdir / "state.json", self.scheduler.jobs.values()
517
- )
874
+ # Suppress GracefulExperimentExit exception
875
+ if exc_type is GracefulExperimentExit:
876
+ return True
518
877
 
519
878
  async def update_task_output_count(self, delta: int):
520
879
  """Change in the number of task outputs to process"""
@@ -555,27 +914,43 @@ class experiment:
555
914
  id(service),
556
915
  )
557
916
 
558
- self.services[service.id] = service
917
+ self._services[service.id] = service
559
918
 
560
919
  # Allow service to access experiment context
561
920
  service.set_experiment(self)
562
921
 
563
- # Register database listener for state changes
564
- service.add_listener(self._db_listener)
922
+ # Register state listener for state changes (writes events)
923
+ if self._state_listener is not None:
924
+ service.add_listener(self._state_listener)
565
925
 
566
- # Register file listener for state changes (writes to services.json)
926
+ # Register listener for state changes
567
927
  service.add_listener(self)
568
928
 
569
- self.scheduler.notify_service_add(service)
570
-
571
- # Write services.json file
572
- self._write_services_json()
929
+ self.scheduler.notify_service_add(service, self.name, self.run_id or "")
573
930
 
574
931
  return service
575
932
 
576
933
def service_state_changed(self, service):
    """Service listener callback: forward a service state change.

    The new state is logged at debug level and, when a scheduler is
    attached, a ``ServiceStateChangedEvent`` is handed to the scheduler's
    state listeners.

    :param service: The service whose state has just changed
    """
    # Not every state object is guaranteed to expose ``name``
    current_state = getattr(service.state, "name", "UNKNOWN")
    logger.debug(
        "Service %s state changed to %s (experiment=%s)",
        service.id,
        current_state,
        self.name,
    )

    # Imported lazily, as in the rest of this method's module
    from experimaestro.scheduler.state_status import ServiceStateChangedEvent

    if self.scheduler is None:
        return

    # run_id may be unset; listeners receive an empty string in that case
    self.scheduler._notify_state_listeners_async(
        ServiceStateChangedEvent(
            experiment_id=self.name,
            run_id=self.run_id or "",
            service_id=service.id,
            state=current_state,
        )
    )
579
954
 
580
955
  def save(self, obj: Any, name: str = "default"):
581
956
  """Serializes configurations.
@@ -594,19 +969,209 @@ class experiment:
594
969
 
595
970
  save(obj, save_dir)
596
971
 
597
def load(self, reference: str, name: str = "default", run_id: Optional[str] = None):
    """Loads configuration objects from an experimental directory.

    :param reference: The name of the experiment
    :param name: The name of the saving directory (default to `default`)
    :param run_id: The run ID to load from (default: latest run)
    :raises FileNotFoundError: if no run directory exists for the experiment
    :return: whatever ``experimaestro.load`` returns for the resolved path
    """
    from experimaestro import load

    exp_base = self.workspace.experimentspath / reference
    if run_id is None:
        # "Latest" means the most recently modified sub-directory.
        # NOTE(review): run directories elsewhere in this module are ordered
        # by name (timestamp); confirm that mtime is the intended criterion.
        run_dir = max(
            (d for d in exp_base.iterdir() if d.is_dir()),
            key=lambda d: d.stat().st_mtime,
            default=None,
        )
        if run_dir is None:
            raise FileNotFoundError(f"No runs found for experiment {reference}")
    else:
        run_dir = exp_base / run_id

    return load(run_dir / "data" / name)
609
997
 
998
def _get_history_settings(self) -> HistorySettings:
    """Resolve the history (run retention) settings for this experiment.

    The workspace settings take precedence when they define a history
    section; otherwise the history section of the global settings is used.
    """
    workspace_settings = self.workspace.settings
    if not workspace_settings or not workspace_settings.history:
        # Nothing workspace-specific: use the global configuration
        return get_settings().history

    return workspace_settings.history
1011
+
1012
+
1013
def get_run_status(run_dir: Path) -> Optional[str]:
    """Get the status of a run from its environment.json or status.json.

    Args:
        run_dir: Path to the run directory

    Returns:
        The status recorded in ``environment.json`` when present (returned
        verbatim, so it may be e.g. 'completed', 'failed' or 'detached');
        otherwise 'completed' or 'failed' as derived from ``status.json``;
        None if the status cannot be determined.
    """
    # Try environment.json first (most reliable - written on exit)
    env_path = run_dir / "environment.json"
    if env_path.exists():
        try:
            from experimaestro.utils.environment import ExperimentEnvironment

            env = ExperimentEnvironment.load(env_path)
            if env.run and env.run.status:
                return env.run.status
        except Exception as e:
            # Best effort: fall through to status.json, but leave a trace
            logger.debug("Could not read run status from %s: %s", env_path, e)

    # Fall back to status.json
    status_path = run_dir / "status.json"
    if status_path.exists():
        try:
            with status_path.open(encoding="utf-8") as f:
                status = json.load(f)

            # Check the experiment status field
            exp_status = status.get("status")
            if exp_status == "done":
                return "completed"
            if exp_status == "failed":
                return "failed"

            # Check job states as fallback
            # NOTE(review): any other experiment status (or a missing one)
            # with no job in error is reported as "completed" - confirm this
            # is intended for runs that were interrupted.
            jobs = status.get("jobs", {})
            if any(j.get("state") == "error" for j in jobs.values()):
                return "failed"
            return "completed"
        except Exception as e:
            # Malformed or unreadable file: the status is simply unknown
            logger.debug("Could not read run status from %s: %s", status_path, e)

    # Cannot determine status
    return None
1056
+
1057
+
1058
def cleanup_experiment_history(
    experiment_base: Path,
    *,
    current_run_id: Optional[str] = None,
    current_status: Optional[str] = None,
    history: Optional[HistorySettings] = None,
) -> list[Path]:
    """Clean up old experiment runs based on history settings.

    This function can be called from the CLI or other contexts.

    Args:
        experiment_base: Path to the experiment directory (containing run subdirs)
        current_run_id: ID of the current run to exclude from cleanup (optional)
        current_status: Status of the current run ('completed' or 'failed'), used
            to determine if failed runs should be removed (optional)
        history: History settings to use (defaults to global settings)

    Returns:
        List of paths that were successfully removed
    """
    if history is None:
        history = get_settings().history

    removed_paths: list[Path] = []

    def remove_run(run_dir: Path, message: str, *args) -> None:
        """Log ``message`` and delete ``run_dir``; a failure is only a warning."""
        logger.info(message, *args, run_dir)
        try:
            rmtree(run_dir)
        except Exception as e:
            logger.warning("Failed to remove run directory %s: %s", run_dir, e)
        else:
            removed_paths.append(run_dir)

    # Directory names are in format YYYYMMDD_HHMMSS or YYYYMMDD_HHMMSS.N (with modifier)
    def run_sort_key(d: Path) -> tuple[str, int]:
        """Parse run_id for sorting, handling modifiers like 20250501_102315.1"""
        stem, sep, modifier = d.name.partition(".")
        if sep:
            try:
                return (stem, int(modifier))
            except ValueError:
                pass
        return (d.name, 0)

    # List all run directories (excluding the current one), oldest first.
    # Symbolic links are skipped: rmtree refuses to delete them, so counting
    # them as runs would only distort the retention limits below.
    run_dirs = sorted(
        (
            d
            for d in experiment_base.iterdir()
            if d.is_dir() and not d.is_symlink() and d.name != current_run_id
        ),
        key=run_sort_key,
    )

    # Categorize runs by status; runs with unknown status are not touched
    completed_runs = []
    failed_runs = []
    for run_dir in run_dirs:
        status = get_run_status(run_dir)
        if status == "completed":
            completed_runs.append(run_dir)
        elif status == "failed":
            failed_runs.append(run_dir)

    # If the current run succeeded, every past failed run is removed
    if current_status == "completed":
        for run_dir in failed_runs:
            remove_run(run_dir, "Removing failed run (experiment succeeded): %s")
        failed_runs = []

    # Remove failed runs that are older than the newest completed run
    # (a later success makes the earlier failure stale)
    if completed_runs:
        newest_completed = run_sort_key(completed_runs[-1])
        remaining_failed = []
        for run_dir in failed_runs:
            if run_sort_key(run_dir) < newest_completed:
                remove_run(run_dir, "Removing failed run (success exists after): %s")
            else:
                remaining_failed.append(run_dir)
        failed_runs = remaining_failed

    # Keep only max_done completed runs (remove oldest ones)
    excess_done = max(len(completed_runs) - history.max_done, 0)
    for run_dir in completed_runs[:excess_done]:
        remove_run(
            run_dir, "Removing old completed run (keeping %d): %s", history.max_done
        )

    # Keep only max_failed failed runs (remove oldest ones)
    excess_failed = max(len(failed_runs) - history.max_failed, 0)
    for run_dir in failed_runs[:excess_failed]:
        remove_run(
            run_dir, "Removing old failed run (keeping %d): %s", history.max_failed
        )

    return removed_paths
1174
+
610
1175
 
611
1176
  # re-export at the module level
612
1177
  current = experiment.current