experimaestro 2.0.0b4__py3-none-any.whl → 2.0.0b17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of experimaestro might be problematic. Click here for more details.

Files changed (154) hide show
  1. experimaestro/__init__.py +12 -5
  2. experimaestro/cli/__init__.py +393 -134
  3. experimaestro/cli/filter.py +48 -23
  4. experimaestro/cli/jobs.py +253 -71
  5. experimaestro/cli/refactor.py +1 -2
  6. experimaestro/commandline.py +7 -4
  7. experimaestro/connectors/__init__.py +9 -1
  8. experimaestro/connectors/local.py +43 -3
  9. experimaestro/core/arguments.py +18 -18
  10. experimaestro/core/identifier.py +11 -11
  11. experimaestro/core/objects/config.py +96 -39
  12. experimaestro/core/objects/config_walk.py +3 -3
  13. experimaestro/core/{subparameters.py → partial.py} +16 -16
  14. experimaestro/core/partial_lock.py +394 -0
  15. experimaestro/core/types.py +12 -15
  16. experimaestro/dynamic.py +290 -0
  17. experimaestro/experiments/__init__.py +6 -2
  18. experimaestro/experiments/cli.py +223 -52
  19. experimaestro/experiments/configuration.py +24 -0
  20. experimaestro/generators.py +5 -5
  21. experimaestro/ipc.py +118 -1
  22. experimaestro/launcherfinder/__init__.py +2 -2
  23. experimaestro/launcherfinder/registry.py +6 -7
  24. experimaestro/launcherfinder/specs.py +2 -9
  25. experimaestro/launchers/slurm/__init__.py +2 -2
  26. experimaestro/launchers/slurm/base.py +62 -0
  27. experimaestro/locking.py +957 -1
  28. experimaestro/notifications.py +89 -201
  29. experimaestro/progress.py +63 -366
  30. experimaestro/rpyc.py +0 -2
  31. experimaestro/run.py +29 -2
  32. experimaestro/scheduler/__init__.py +8 -1
  33. experimaestro/scheduler/base.py +650 -53
  34. experimaestro/scheduler/dependencies.py +20 -16
  35. experimaestro/scheduler/experiment.py +764 -169
  36. experimaestro/scheduler/interfaces.py +338 -96
  37. experimaestro/scheduler/jobs.py +58 -20
  38. experimaestro/scheduler/remote/__init__.py +31 -0
  39. experimaestro/scheduler/remote/adaptive_sync.py +265 -0
  40. experimaestro/scheduler/remote/client.py +928 -0
  41. experimaestro/scheduler/remote/protocol.py +282 -0
  42. experimaestro/scheduler/remote/server.py +447 -0
  43. experimaestro/scheduler/remote/sync.py +144 -0
  44. experimaestro/scheduler/services.py +186 -35
  45. experimaestro/scheduler/state_provider.py +811 -2157
  46. experimaestro/scheduler/state_status.py +1247 -0
  47. experimaestro/scheduler/transient.py +31 -0
  48. experimaestro/scheduler/workspace.py +1 -1
  49. experimaestro/scheduler/workspace_state_provider.py +1273 -0
  50. experimaestro/scriptbuilder.py +4 -4
  51. experimaestro/settings.py +36 -0
  52. experimaestro/tests/conftest.py +33 -5
  53. experimaestro/tests/connectors/bin/executable.py +1 -1
  54. experimaestro/tests/fixtures/pre_experiment/experiment_check_env.py +16 -0
  55. experimaestro/tests/fixtures/pre_experiment/experiment_check_mock.py +14 -0
  56. experimaestro/tests/fixtures/pre_experiment/experiment_simple.py +12 -0
  57. experimaestro/tests/fixtures/pre_experiment/pre_setup_env.py +5 -0
  58. experimaestro/tests/fixtures/pre_experiment/pre_setup_error.py +3 -0
  59. experimaestro/tests/fixtures/pre_experiment/pre_setup_mock.py +8 -0
  60. experimaestro/tests/launchers/bin/test.py +1 -0
  61. experimaestro/tests/launchers/test_slurm.py +9 -9
  62. experimaestro/tests/partial_reschedule.py +46 -0
  63. experimaestro/tests/restart.py +3 -3
  64. experimaestro/tests/restart_main.py +1 -0
  65. experimaestro/tests/scripts/notifyandwait.py +1 -0
  66. experimaestro/tests/task_partial.py +38 -0
  67. experimaestro/tests/task_tokens.py +2 -2
  68. experimaestro/tests/tasks/test_dynamic.py +6 -6
  69. experimaestro/tests/test_dependencies.py +3 -3
  70. experimaestro/tests/test_deprecated.py +15 -15
  71. experimaestro/tests/test_dynamic_locking.py +317 -0
  72. experimaestro/tests/test_environment.py +24 -14
  73. experimaestro/tests/test_experiment.py +171 -36
  74. experimaestro/tests/test_identifier.py +25 -25
  75. experimaestro/tests/test_identifier_stability.py +3 -5
  76. experimaestro/tests/test_multitoken.py +2 -4
  77. experimaestro/tests/{test_subparameters.py → test_partial.py} +25 -25
  78. experimaestro/tests/test_partial_paths.py +81 -138
  79. experimaestro/tests/test_pre_experiment.py +219 -0
  80. experimaestro/tests/test_progress.py +2 -8
  81. experimaestro/tests/test_remote_state.py +1132 -0
  82. experimaestro/tests/test_stray_jobs.py +261 -0
  83. experimaestro/tests/test_tasks.py +1 -2
  84. experimaestro/tests/test_token_locking.py +52 -67
  85. experimaestro/tests/test_tokens.py +5 -6
  86. experimaestro/tests/test_transient.py +225 -0
  87. experimaestro/tests/test_workspace_state_provider.py +768 -0
  88. experimaestro/tests/token_reschedule.py +1 -3
  89. experimaestro/tests/utils.py +2 -7
  90. experimaestro/tokens.py +227 -372
  91. experimaestro/tools/diff.py +1 -0
  92. experimaestro/tools/documentation.py +4 -5
  93. experimaestro/tools/jobs.py +1 -2
  94. experimaestro/tui/app.py +459 -1895
  95. experimaestro/tui/app.tcss +162 -0
  96. experimaestro/tui/dialogs.py +172 -0
  97. experimaestro/tui/log_viewer.py +253 -3
  98. experimaestro/tui/messages.py +137 -0
  99. experimaestro/tui/utils.py +54 -0
  100. experimaestro/tui/widgets/__init__.py +23 -0
  101. experimaestro/tui/widgets/experiments.py +468 -0
  102. experimaestro/tui/widgets/global_services.py +238 -0
  103. experimaestro/tui/widgets/jobs.py +972 -0
  104. experimaestro/tui/widgets/log.py +156 -0
  105. experimaestro/tui/widgets/orphans.py +363 -0
  106. experimaestro/tui/widgets/runs.py +185 -0
  107. experimaestro/tui/widgets/services.py +314 -0
  108. experimaestro/tui/widgets/stray_jobs.py +528 -0
  109. experimaestro/utils/__init__.py +1 -1
  110. experimaestro/utils/environment.py +105 -22
  111. experimaestro/utils/fswatcher.py +124 -0
  112. experimaestro/utils/jobs.py +1 -2
  113. experimaestro/utils/jupyter.py +1 -2
  114. experimaestro/utils/logging.py +72 -0
  115. experimaestro/version.py +2 -2
  116. experimaestro/webui/__init__.py +9 -0
  117. experimaestro/webui/app.py +117 -0
  118. experimaestro/{server → webui}/data/index.css +66 -11
  119. experimaestro/webui/data/index.css.map +1 -0
  120. experimaestro/{server → webui}/data/index.js +82763 -87217
  121. experimaestro/webui/data/index.js.map +1 -0
  122. experimaestro/webui/routes/__init__.py +5 -0
  123. experimaestro/webui/routes/auth.py +53 -0
  124. experimaestro/webui/routes/proxy.py +117 -0
  125. experimaestro/webui/server.py +200 -0
  126. experimaestro/webui/state_bridge.py +152 -0
  127. experimaestro/webui/websocket.py +413 -0
  128. {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/METADATA +8 -9
  129. experimaestro-2.0.0b17.dist-info/RECORD +219 -0
  130. experimaestro/cli/progress.py +0 -269
  131. experimaestro/scheduler/state.py +0 -75
  132. experimaestro/scheduler/state_db.py +0 -388
  133. experimaestro/scheduler/state_sync.py +0 -834
  134. experimaestro/server/__init__.py +0 -467
  135. experimaestro/server/data/index.css.map +0 -1
  136. experimaestro/server/data/index.js.map +0 -1
  137. experimaestro/tests/test_cli_jobs.py +0 -615
  138. experimaestro/tests/test_file_progress.py +0 -425
  139. experimaestro/tests/test_file_progress_integration.py +0 -477
  140. experimaestro/tests/test_state_db.py +0 -434
  141. experimaestro-2.0.0b4.dist-info/RECORD +0 -181
  142. /experimaestro/{server → webui}/data/1815e00441357e01619e.ttf +0 -0
  143. /experimaestro/{server → webui}/data/2463b90d9a316e4e5294.woff2 +0 -0
  144. /experimaestro/{server → webui}/data/2582b0e4bcf85eceead0.ttf +0 -0
  145. /experimaestro/{server → webui}/data/89999bdf5d835c012025.woff2 +0 -0
  146. /experimaestro/{server → webui}/data/914997e1bdfc990d0897.ttf +0 -0
  147. /experimaestro/{server → webui}/data/c210719e60948b211a12.woff2 +0 -0
  148. /experimaestro/{server → webui}/data/favicon.ico +0 -0
  149. /experimaestro/{server → webui}/data/index.html +0 -0
  150. /experimaestro/{server → webui}/data/login.html +0 -0
  151. /experimaestro/{server → webui}/data/manifest.json +0 -0
  152. {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/WHEEL +0 -0
  153. {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/entry_points.txt +0 -0
  154. {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/licenses/LICENSE +0 -0
@@ -1,11 +1,12 @@
1
1
  import asyncio
2
+ import inspect
2
3
  import json
3
4
  import logging
4
5
  import os
5
6
  from pathlib import Path
6
7
  import time
7
8
  from shutil import rmtree
8
- from typing import Any, Dict, Optional, TypeVar, Union
9
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeVar, Union
9
10
 
10
11
  from experimaestro.core.objects import WatchedOutput
11
12
  from experimaestro.exceptions import HandledException
@@ -14,9 +15,19 @@ from experimaestro.scheduler.signal_handler import SIGNAL_HANDLER
14
15
  from experimaestro.scheduler.jobs import Job
15
16
  from experimaestro.scheduler.services import Service
16
17
  from experimaestro.scheduler.workspace import RunMode, Workspace
17
- from experimaestro.settings import WorkspaceSettings, get_settings
18
+ from experimaestro.scheduler.interfaces import (
19
+ BaseExperiment,
20
+ BaseService,
21
+ ExperimentJobInformation,
22
+ )
23
+ from experimaestro.settings import WorkspaceSettings, get_settings, HistorySettings
24
+ from experimaestro.experiments.configuration import DirtyGitAction
18
25
  from experimaestro.utils import logger
19
26
 
27
+ if TYPE_CHECKING:
28
+ from experimaestro.scheduler.interfaces import ExperimentStatus
29
+ from experimaestro.scheduler.state_status import ExperimentEventWriter
30
+
20
31
  ServiceClass = TypeVar("ServiceClass", bound=Service)
21
32
 
22
33
 
@@ -26,11 +37,46 @@ class FailedExperiment(HandledException):
26
37
  pass
27
38
 
28
39
 
29
- class DatabaseListener:
30
- """Listener that updates job state in the database"""
40
+ class DirtyGitError(HandledException):
41
+ """Raised when the git repository has uncommitted changes and dirty_git=error"""
42
+
43
+ pass
44
+
45
+
46
+ class GracefulExperimentExit(Exception):
47
+ """Raised to exit an experiment context without waiting for running jobs.
48
+
49
+ This is useful in tests or when you want to detach from an experiment
50
+ while keeping jobs running (e.g., to test stray job detection).
51
+
52
+ Example::
53
+
54
+ with experiment(workdir, "my-experiment") as xp:
55
+ task = MyTask.C(value=1).submit()
56
+ # Wait for task to start...
57
+ raise GracefulExperimentExit() # Exit without waiting for task to finish
58
+ """
59
+
60
+ pass
61
+
62
+
63
+ class StateListener:
64
+ """Listener that writes events to filesystem
65
+
66
+ Job state events are written to per-job event files by the scheduler.
67
+ This listener writes experiment-level events (job state, services) to
68
+ the experiment event file.
69
+ """
31
70
 
32
- def __init__(self, state_provider, experiment_id: str, run_id: str):
33
- self.state_provider = state_provider
71
+ def __init__(
72
+ self,
73
+ event_writer: "ExperimentEventWriter",
74
+ experiment: "experiment",
75
+ experiment_id: str,
76
+ run_id: str,
77
+ ):
78
+ self.event_writer = event_writer
79
+ self.experiment = experiment
34
80
  self.experiment_id = experiment_id
35
81
  self.run_id = run_id
36
82
 
@@ -39,38 +85,65 @@ class DatabaseListener:
39
85
  pass
40
86
 
41
87
  def job_state(self, job):
42
- """Update job state in database"""
43
- self.state_provider.update_job_state(job, self.experiment_id, self.run_id)
88
+ """Write job state change event to experiment event file"""
89
+ from .state_status import JobStateChangedEvent
90
+
91
+ # Get failure reason if error state
92
+ failure_reason = None
93
+ if hasattr(job.state, "failure_reason") and job.state.failure_reason:
94
+ failure_reason = job.state.failure_reason.name
95
+
96
+ # Get progress as list of dicts
97
+ progress = []
98
+ if hasattr(job, "_progress") and job._progress:
99
+ progress = [
100
+ {"level": p.level, "progress": p.progress, "desc": p.desc}
101
+ for p in job._progress
102
+ ]
103
+
104
+ event = JobStateChangedEvent(
105
+ job_id=job.identifier,
106
+ state=job.state.name,
107
+ failure_reason=failure_reason,
108
+ submitted_time=job.submittime,
109
+ started_time=job.starttime,
110
+ ended_time=job.endtime,
111
+ exit_code=getattr(job, "exit_code", None),
112
+ retry_count=getattr(job, "retry_count", 0),
113
+ progress=progress,
114
+ )
115
+ # Write to experiment event file
116
+ self.event_writer.write_event(event)
44
117
 
45
118
  def service_add(self, service):
46
- """Update service in database"""
47
- self.state_provider.update_service(
48
- service.id,
49
- self.experiment_id,
50
- self.run_id,
51
- service.description(),
52
- service.state.name,
53
- state_dict=json.dumps(service.state_dict()),
119
+ """Write service added event to filesystem"""
120
+ from experimaestro.scheduler.services import Service
121
+ from .state_status import ServiceAddedEvent
122
+
123
+ state_dict = Service.serialize_state_dict(service.state_dict())
124
+ service_class = f"{service.__class__.__module__}.{service.__class__.__name__}"
125
+ event = ServiceAddedEvent(
126
+ service_id=service.id,
127
+ description=service.description(),
128
+ service_class=service_class,
129
+ state_dict=state_dict,
54
130
  )
131
+ self.event_writer.write_event(event)
55
132
 
56
133
  def service_state_changed(self, service):
57
- """Update service state in database (called by Service when state changes)"""
58
- self.state_provider.update_service(
59
- service.id,
60
- self.experiment_id,
61
- self.run_id,
62
- service.description(),
63
- service.state.name,
64
- state_dict=json.dumps(service.state_dict()),
65
- )
134
+ """Called when service state changes (runtime only, not persisted)"""
135
+ # Service state is managed at runtime, not persisted
136
+ pass
66
137
 
67
138
 
68
- class experiment:
139
+ class experiment(BaseExperiment):
69
140
  """Context manager for running experiments.
70
141
 
71
142
  Creates a workspace, manages task submission, and optionally starts
72
143
  a web server for monitoring.
73
144
 
145
+ Implements BaseExperiment interface for use with StateProvider and TUI.
146
+
74
147
  Example::
75
148
 
76
149
  from experimaestro import experiment
@@ -103,6 +176,10 @@ class experiment:
103
176
  run_mode: Optional[RunMode] = None,
104
177
  launcher=None,
105
178
  register_signals: bool = True,
179
+ project_paths: Optional[list[Path]] = None,
180
+ wait_for_quit: bool = False,
181
+ dirty_git: DirtyGitAction = DirtyGitAction.WARN,
182
+ no_db: bool = False,
106
183
  ):
107
184
  """
108
185
  :param env: an environment -- or a working directory for a local
@@ -122,10 +199,60 @@ class experiment:
122
199
 
123
200
  :param register_signals: Whether to register signal handlers (default: True).
124
201
  Set to False when running in a background thread.
202
+
203
+ :param project_paths: Paths to the project files (for git info). If not
204
+ provided, will be inferred from the caller's location.
205
+
206
+ :param wait_for_quit: Deprecated, no longer used. Web server is no longer
207
+ started automatically.
208
+
209
+ :param dirty_git: Action when git repository has uncommitted changes:
210
+ DirtyGitAction.IGNORE (don't check), DirtyGitAction.WARN (log warning,
211
+ default), or DirtyGitAction.ERROR (raise exception).
212
+
213
+ :param no_db: Deprecated, kept for backwards compatibility. This parameter
214
+ is now a no-op as the database has been replaced with filesystem-based
215
+ state tracking.
216
+
217
+ .. deprecated::
218
+ The ``host``, ``port``, ``token``, and ``wait_for_quit`` parameters are
219
+ deprecated. Use ``--web`` flag with ``run-experiment`` CLI or start the
220
+ web server separately.
125
221
  """
222
+ import warnings
126
223
 
127
224
  from experimaestro.scheduler import Listener, Scheduler
128
225
 
226
+ # Warn about deprecated server parameters
227
+ if host is not None:
228
+ warnings.warn(
229
+ "The 'host' parameter is deprecated. Use '--web' flag with "
230
+ "'run-experiment' CLI or start the web server separately.",
231
+ DeprecationWarning,
232
+ stacklevel=2,
233
+ )
234
+ if port is not None:
235
+ warnings.warn(
236
+ "The 'port' parameter is deprecated. Use '--web' flag with "
237
+ "'run-experiment' CLI or start the web server separately.",
238
+ DeprecationWarning,
239
+ stacklevel=2,
240
+ )
241
+ if token is not None:
242
+ warnings.warn(
243
+ "The 'token' parameter is deprecated. Use '--web' flag with "
244
+ "'run-experiment' CLI or start the web server separately.",
245
+ DeprecationWarning,
246
+ stacklevel=2,
247
+ )
248
+ if wait_for_quit:
249
+ warnings.warn(
250
+ "The 'wait_for_quit' parameter is deprecated. Use '--web' flag with "
251
+ "'run-experiment' CLI or start the web server separately.",
252
+ DeprecationWarning,
253
+ stacklevel=2,
254
+ )
255
+
129
256
  settings = get_settings()
130
257
  if not isinstance(env, WorkspaceSettings):
131
258
  env = WorkspaceSettings(id=None, path=Path(env))
@@ -134,36 +261,45 @@ class experiment:
134
261
  run_mode = run_mode or RunMode.NORMAL
135
262
  self.workspace = Workspace(settings, env, launcher=launcher, run_mode=run_mode)
136
263
 
137
- # Mark the directory has an experimaestro folder
138
- self.workdir = self.workspace.experimentspath / name
139
- self.workdir.mkdir(parents=True, exist_ok=True)
140
- self.xplockpath = self.workdir / "lock"
264
+ # Store experiment name for ID references
265
+ self.name = name
266
+
267
+ # Create experiment base directory (run directories will be created inside)
268
+ self._experiment_base = self.workspace.experimentspath / name
269
+ self._experiment_base.mkdir(parents=True, exist_ok=True)
270
+
271
+ # Lock is at experiment level (prevents concurrent runs of same experiment)
272
+ self.xplockpath = self._experiment_base / "lock"
273
+
274
+ # workdir will be set in __enter__ after run_id is generated
275
+ self.workdir = None
141
276
  self.xplock = None
142
277
  self.old_experiment = None
143
- self.services: Dict[str, Service] = {}
278
+ self._services: Dict[str, Service] = {}
144
279
  self._job_listener: Optional[Listener] = None
145
280
  self._register_signals = register_signals
146
-
147
- # Get configuration settings
148
-
149
- if host is not None:
150
- settings.server.host = host
151
-
152
- if port is not None:
153
- settings.server.port = port
154
-
155
- if token is not None:
156
- settings.server.token = token
281
+ self._dirty_git = dirty_git
282
+ self._no_db = no_db
283
+
284
+ # Capture project paths for git info
285
+ if project_paths is not None:
286
+ self._project_paths = project_paths
287
+ else:
288
+ # Fall back to caller's file path
289
+ self._project_paths = []
290
+ try:
291
+ # Go up the stack to find the first frame outside this module
292
+ for frame_info in inspect.stack():
293
+ frame_file = frame_info.filename
294
+ if "experimaestro" not in frame_file:
295
+ self._project_paths = [Path(frame_file).resolve().parent]
296
+ break
297
+ except Exception:
298
+ pass
157
299
 
158
300
  # Use singleton scheduler
159
301
  self.scheduler = Scheduler.instance()
160
302
 
161
- # Determine if we need a server
162
- self._needs_server = (
163
- settings.server.port is not None and settings.server.port >= 0
164
- ) and self.workspace.run_mode == RunMode.NORMAL
165
- self._server_settings = settings.server if self._needs_server else None
166
-
167
303
  if os.environ.get("XPM_ENABLEFAULTHANDLER", "0") == "1":
168
304
  import faulthandler
169
305
 
@@ -201,48 +337,77 @@ class experiment:
201
337
  """Return the directory in which results can be stored for this experiment"""
202
338
  return self.workdir / "jobs"
203
339
 
340
+ # =========================================================================
341
+ # BaseExperiment interface properties
342
+ # =========================================================================
343
+
344
+ @property
345
+ def experiment_id(self) -> str:
346
+ """Experiment identifier (overrides BaseExperiment.experiment_id)"""
347
+ return self.name
348
+
349
+ @property
350
+ def status(self) -> "ExperimentStatus":
351
+ """Experiment status - RUNNING for live experiments, updated on finalization"""
352
+ from experimaestro.scheduler.interfaces import ExperimentStatus
353
+
354
+ return getattr(self, "_status", ExperimentStatus.RUNNING)
355
+
356
+ @property
357
+ def jobs(self) -> Dict[str, "Job"]:
358
+ """Jobs in this experiment"""
359
+ return {
360
+ job.identifier: job
361
+ for job in self.scheduler.jobs.values()
362
+ if self in job.experiments
363
+ }
364
+
365
+ @property
366
+ def tags(self) -> Dict[str, Dict[str, str]]:
367
+ """Tags for jobs - tracked directly in experiment"""
368
+ return self._tags
369
+
370
+ @property
371
+ def dependencies(self) -> Dict[str, List[str]]:
372
+ """Job dependencies - tracked directly in experiment"""
373
+ return self._dependencies
374
+
375
+ @property
376
+ def events_count(self) -> int:
377
+ """Number of events processed"""
378
+ return self._events_count
379
+
380
+ @property
381
+ def started_at(self) -> Optional[float]:
382
+ """Timestamp when experiment started"""
383
+ return self._started_at
384
+
385
+ @property
386
+ def ended_at(self) -> Optional[float]:
387
+ """Timestamp when experiment ended (None if still running)"""
388
+ return self._ended_at
389
+
390
+ @property
391
+ def hostname(self) -> Optional[str]:
392
+ """Hostname where experiment is running"""
393
+ return self._hostname
394
+
395
+ @property
396
+ def services(self) -> Dict[str, "BaseService"]:
397
+ """Services in this experiment"""
398
+ return self._services
399
+
204
400
  @property
205
401
  def alt_jobspaths(self):
206
402
  """Return potential other directories"""
207
403
  for alt_workdir in self.workspace.alt_workdirs:
208
404
  yield alt_workdir / "jobs"
209
405
 
210
- @property
211
- def jobsbakpath(self):
212
- """Return the directory in which results can be stored for this experiment"""
213
- return self.workdir / "jobs.bak"
214
-
215
406
  @property
216
407
  def jobs_jsonl_path(self):
217
408
  """Return the path to the jobs.jsonl file for this experiment"""
218
409
  return self.workdir / "jobs.jsonl"
219
410
 
220
- @property
221
- def services_json_path(self):
222
- """Return the path to the services.json file for this experiment"""
223
- return self.workdir / "services.json"
224
-
225
- def _write_services_json(self):
226
- """Write all services to services.json file"""
227
- services_data = {}
228
- for service_id, service in self.services.items():
229
- # Get state_dict from service (includes __class__ for recreation)
230
- service_state = service.state_dict()
231
- # Add runtime state info
232
- service_state.update(
233
- {
234
- "service_id": service_id,
235
- "description": service.description(),
236
- "state": service.state.name,
237
- "url": getattr(service, "url", None),
238
- "timestamp": time.time(),
239
- }
240
- )
241
- services_data[service_id] = service_state
242
-
243
- with self.services_json_path.open("w") as f:
244
- json.dump(services_data, f, indent=2)
245
-
246
411
  def add_job(self, job: "Job"):
247
412
  """Register a job and its tags to jobs.jsonl file and database
248
413
 
@@ -267,23 +432,76 @@ class experiment:
267
432
  logging.debug(
268
433
  "Job %s already running, unfinished jobs for %s: %d",
269
434
  job.identifier[:8],
270
- self.workdir.name,
435
+ self.name,
271
436
  self.unfinishedJobs,
272
437
  )
273
438
 
274
- record = {
275
- "job_id": job.identifier,
276
- "task_id": str(job.type.identifier),
277
- "tags": dict(job.tags.items()) if job.tags else {},
278
- "timestamp": time.time(),
279
- }
439
+ job_info = ExperimentJobInformation(
440
+ job_id=job.identifier,
441
+ task_id=str(job.type.identifier),
442
+ tags=dict(job.tags.items()) if job.tags else {},
443
+ timestamp=time.time(),
444
+ )
280
445
 
281
446
  with self.jobs_jsonl_path.open("a") as f:
282
- f.write(json.dumps(record) + "\n")
447
+ f.write(json.dumps(job_info.to_dict()) + "\n")
448
+
449
+ # Write job submitted event to filesystem (only in NORMAL mode)
450
+ if self._event_writer is not None:
451
+ from .state_status import JobSubmittedEvent
452
+
453
+ # Get dependency job IDs
454
+ depends_on = []
455
+ if hasattr(job, "dependencies"):
456
+ for dep in job.dependencies:
457
+ if hasattr(dep, "identifier"):
458
+ depends_on.append(dep.identifier)
459
+
460
+ job_tags = dict(job.tags.items()) if job.tags else {}
461
+ event = JobSubmittedEvent(
462
+ job_id=job.identifier,
463
+ task_id=str(job.type.identifier),
464
+ transient=job.transient.value if hasattr(job, "transient") else 0,
465
+ tags=job_tags,
466
+ depends_on=depends_on,
467
+ )
468
+ self._event_writer.write_event(event)
469
+
470
+ # Track tags and dependencies directly in experiment
471
+ if job_tags:
472
+ self._tags[job.identifier] = job_tags
473
+ if depends_on:
474
+ self._dependencies[job.identifier] = depends_on
475
+
476
+ def _finalize_run(self, status: str) -> None:
477
+ """Finalize the run: write final status.json and archive event files
478
+
479
+ Args:
480
+ status: Final status ("completed" or "failed")
481
+ """
482
+ from datetime import datetime
483
+ from experimaestro.scheduler.interfaces import ExperimentStatus
484
+ from .state_status import RunCompletedEvent
283
485
 
284
- # Also register in database for TUI/monitoring
285
- experiment_id = self.workdir.name
286
- self.state_provider.update_job_submitted(job, experiment_id, self.run_id)
486
+ # Update final status in the experiment
487
+ self._ended_at = datetime.now().timestamp()
488
+ if status in ("completed", "done"):
489
+ self._status = ExperimentStatus.DONE
490
+ elif status == "failed":
491
+ self._status = ExperimentStatus.FAILED
492
+
493
+ # Write RunCompletedEvent before closing the event writer
494
+ event = RunCompletedEvent(status=status, ended_at=datetime.now().isoformat())
495
+ self._event_writer.write_event(event)
496
+
497
+ # Close the event writer to flush any buffered events
498
+ self._event_writer.close()
499
+
500
+ # Write final status.json using write_status()
501
+ self.write_status()
502
+
503
+ # Archive event files to permanent storage
504
+ self._event_writer.archive_events()
287
505
 
288
506
  def stop(self):
289
507
  """Stop the experiment as soon as possible"""
@@ -365,62 +583,165 @@ class experiment:
365
583
  return self.workspace.connector.createtoken(name, count)
366
584
 
367
585
  def __enter__(self):
586
+ from datetime import datetime
368
587
  from .dynamic_outputs import TaskOutputsWorker
369
- from experimaestro.utils.environment import save_environment_info
588
+ from experimaestro.utils.environment import (
589
+ ExperimentEnvironment,
590
+ ExperimentRunInfo,
591
+ )
370
592
 
371
- if self.workspace.run_mode != RunMode.DRY_RUN:
593
+ # Check for old experiment layout and warn
594
+ old_xp_dir = self.workspace.path / "xp"
595
+ if old_xp_dir.exists() and old_xp_dir.is_dir():
596
+ logger.warning(
597
+ "Experimaestro v2 has a modified experiment file layout. "
598
+ "DO NOT use experimaestro v1 to cleanup orphans. "
599
+ "You can use 'experimaestro migrate v1-to-v2 %s' to migrate old experiment "
600
+ "folders to the new structure.",
601
+ self.workspace.path,
602
+ )
603
+
604
+ # Only lock and save environment in NORMAL mode
605
+ if self.workspace.run_mode == RunMode.NORMAL:
372
606
  logger.info("Locking experiment %s", self.xplockpath)
373
- self.xplock = self.workspace.connector.lock(self.xplockpath, 0).__enter__()
607
+ lock = self.workspace.connector.lock(self.xplockpath, 0)
608
+
609
+ # Try non-blocking first to check if lock is held
610
+ if not lock.acquire(blocking=False):
611
+ # Lock is held - try to find hostname from latest run's environment.json
612
+ hostname = None
613
+ try:
614
+ # Find the most recent run directory
615
+ run_dirs = sorted(
616
+ [d for d in self._experiment_base.iterdir() if d.is_dir()],
617
+ key=lambda d: d.stat().st_mtime,
618
+ reverse=True,
619
+ )
620
+ if run_dirs:
621
+ env_path = run_dirs[0] / "environment.json"
622
+ if env_path.exists():
623
+ env = ExperimentEnvironment.load(env_path)
624
+ hostname = env.run.hostname if env.run else None
625
+ except Exception:
626
+ pass # Ignore errors when trying to find hostname
627
+ holder_info = f" (held by {hostname})" if hostname else ""
628
+ logger.warning(
629
+ "Experiment is locked%s, waiting for lock to be released...",
630
+ holder_info,
631
+ )
632
+ # Now wait for the lock
633
+ lock.acquire(blocking=True)
634
+
635
+ self.xplock = lock
374
636
  logger.info("Experiment locked")
375
637
 
376
- # Capture and save environment info (git info for editable packages + all package versions)
377
- if self.workspace.run_mode == RunMode.NORMAL:
378
- env_info_path = self.workdir / "environment.json"
379
- save_environment_info(env_info_path)
638
+ # Generate run_id with collision detection
639
+ now = datetime.now()
640
+ base_run_id = now.strftime("%Y%m%d_%H%M%S")
641
+ run_id = base_run_id
642
+ suffix = 1
643
+ while (self._experiment_base / run_id).exists():
644
+ run_id = f"{base_run_id}.{suffix}"
645
+ suffix += 1
646
+ self.run_id = run_id
380
647
 
381
- # Move old jobs into "jobs.bak"
382
- if self.workspace.run_mode == RunMode.NORMAL:
383
- self.jobsbakpath.mkdir(exist_ok=True)
384
- for p in self.jobspath.glob("*/*"):
385
- if p.is_symlink():
386
- target = self.jobsbakpath / p.relative_to(self.jobspath)
387
- if target.is_symlink():
388
- # Remove if duplicate
389
- p.unlink()
390
- else:
391
- # Rename otherwise
392
- target.parent.mkdir(parents=True, exist_ok=True)
393
- p.rename(target)
648
+ # Create the run-specific workdir
649
+ self.workdir = self._experiment_base / self.run_id
650
+ self.workdir.mkdir(parents=True, exist_ok=True)
651
+
652
+ # Capture and save environment info
653
+ from experimaestro.utils.git import get_git_info
654
+ from experimaestro.utils.environment import get_current_environment
655
+
656
+ env_info_path = self.workdir / "environment.json"
657
+ env = get_current_environment()
658
+
659
+ # Capture project git info from project paths
660
+ dirty_repos = []
661
+ for project_path in self._project_paths:
662
+ project_git = get_git_info(project_path)
663
+ if project_git:
664
+ env.projects.append(project_git)
665
+ # Track dirty repositories
666
+ if project_git.get("dirty"):
667
+ dirty_repos.append(project_git.get("path", str(project_path)))
668
+
669
+ # Handle dirty git repositories based on configured action
670
+ if dirty_repos and self._dirty_git != DirtyGitAction.IGNORE:
671
+ for repo_path in dirty_repos:
672
+ if self._dirty_git == DirtyGitAction.WARN:
673
+ logger.warning(
674
+ "Project repository has uncommitted changes: %s",
675
+ repo_path,
676
+ )
677
+ elif self._dirty_git == DirtyGitAction.ERROR:
678
+ # Release the lock before raising the error
679
+ raise DirtyGitError(
680
+ f"Project repository has uncommitted changes: {repo_path}"
681
+ )
682
+
683
+ env.save(env_info_path)
684
+ else:
685
+ # Non-NORMAL mode: use placeholder run_id and workdir
686
+ self.run_id = "dry-run"
687
+ self.workdir = self._experiment_base / self.run_id
688
+ self.workdir.mkdir(parents=True, exist_ok=True)
394
689
 
395
690
  # Register experiment with scheduler
396
691
  self.scheduler.register_experiment(self)
397
692
 
398
- # Start server via scheduler if needed
399
- if self._needs_server:
400
- self.scheduler.start_server(self._server_settings, workspace=self.workspace)
693
+ # Set experiment start time for BaseExperiment interface
694
+ self._started_at = time.time()
695
+ self._ended_at = None
401
696
 
402
697
  self.workspace.__enter__()
403
698
  (self.workspace.path / ".__experimaestro__").touch()
404
699
 
405
- # Initialize workspace state provider (singleton per workspace path)
406
- from .state_provider import WorkspaceStateProvider
407
-
408
- self.state_provider = WorkspaceStateProvider.get_instance(
409
- self.workspace.path,
410
- read_only=False,
411
- sync_on_start=False, # Experiments don't sync on start
412
- )
413
-
414
- # Register experiment in database and create a run
415
- experiment_id = self.workdir.name
416
- self.state_provider.ensure_experiment(experiment_id)
417
- self.run_id = self.state_provider.create_run(experiment_id)
700
+ # Initialize filesystem-based state tracking (only in NORMAL mode)
701
+ from .state_status import ExperimentEventWriter
702
+
703
+ is_normal_mode = self.workspace.run_mode == RunMode.NORMAL
704
+ self._event_writer = None
705
+ self._state_listener = None
706
+
707
+ # Track job tags and dependencies directly (no more StatusData)
708
+ self._tags: Dict[str, Dict[str, str]] = {}
709
+ self._dependencies: Dict[str, List[str]] = {}
710
+ self._events_count = 0
711
+ self._hostname: Optional[str] = None
712
+ self._started_at: Optional[float] = None
713
+ self._ended_at: Optional[float] = None
714
+
715
+ if is_normal_mode:
716
+ import socket
717
+
718
+ # Create event writer for this experiment
719
+ # Events are written to experiments/{experiment_id}/events-{count}.jsonl
720
+ # Permanent storage: workdir/events/
721
+ self._event_writer = ExperimentEventWriter(self, self.workspace.path, 0)
722
+
723
+ # Initialize status.json for this run
724
+ self._hostname = socket.gethostname()
725
+ self._started_at = datetime.now().timestamp()
726
+ self._event_writer.init_status()
727
+
728
+ # Create symlink to current run
729
+ self._event_writer.create_symlink()
730
+
731
+ # Add run info to environment.json
732
+ env_path = self.workdir / "environment.json"
733
+ env = ExperimentEnvironment.load(env_path)
734
+ env.run = ExperimentRunInfo(
735
+ hostname=self._hostname,
736
+ started_at=datetime.now().isoformat(),
737
+ )
738
+ env.save(env_path)
418
739
 
419
- # Add database listener to update job state in database
420
- self._db_listener = DatabaseListener(
421
- self.state_provider, experiment_id, self.run_id
422
- )
423
- self.scheduler.addlistener(self._db_listener)
740
+ # Add state listener to write events to filesystem
741
+ self._state_listener = StateListener(
742
+ self._event_writer, self, self.name, self.run_id
743
+ )
744
+ self.scheduler.addlistener(self._state_listener)
424
745
 
425
746
  # Number of unfinished jobs
426
747
  self.unfinishedJobs = 0
@@ -445,22 +766,24 @@ class experiment:
445
766
 
446
767
  def __exit__(self, exc_type, exc_value, traceback):
447
768
  logger.debug("Exiting scheduler context")
448
- # If no exception and normal run mode, remove old "jobs"
449
- if self.workspace.run_mode == RunMode.NORMAL:
450
- if exc_type is None and self.jobsbakpath.is_dir():
451
- rmtree(self.jobsbakpath)
452
769
 
453
770
  # Close the different locks
454
771
  try:
455
- if exc_type:
772
+ if exc_type is GracefulExperimentExit:
773
+ # Graceful exit - don't wait for jobs, don't log error
774
+ logger.info("Graceful experiment exit - not waiting for running jobs")
775
+ elif exc_type:
456
776
  # import faulthandler
457
777
  # faulthandler.dump_traceback()
458
- logger.error(
459
- "Not waiting since an exception was thrown"
460
- " (some jobs may be running)"
778
+ logger.exception(
779
+ "Not waiting since an exception was thrown (some jobs may be running)"
461
780
  )
462
781
  else:
463
782
  self.wait()
783
+
784
+ # Wait for all pending notifications to be processed
785
+ # before removing listeners
786
+ self.scheduler.wait_for_notifications()
464
787
  finally:
465
788
  if self._register_signals:
466
789
  SIGNAL_HANDLER.remove(self)
@@ -470,19 +793,41 @@ class experiment:
470
793
  logger.info("Closing service %s", service.description())
471
794
  service.stop()
472
795
 
796
+ # Set end time for BaseExperiment interface
797
+ self._ended_at = time.time()
798
+
473
799
  # Unregister experiment from scheduler
474
800
  self.scheduler.unregister_experiment(self)
475
801
 
476
- # Remove database listener
477
- self.scheduler.removelistener(self._db_listener)
478
-
479
- # Mark run as completed in database
480
- experiment_id = self.workdir.name
481
- status = "failed" if exc_type else "completed"
482
- self.state_provider.complete_run(experiment_id, self.run_id, status)
802
+ # Remove state listener and finalize run (only in NORMAL mode)
803
+ if exc_type is GracefulExperimentExit:
804
+ status = "detached" # Graceful exit, jobs may still be running
805
+ elif exc_type:
806
+ status = "failed"
807
+ else:
808
+ status = "completed"
809
+
810
+ if self._state_listener is not None:
811
+ self.scheduler.removelistener(self._state_listener)
812
+ self._finalize_run(status)
813
+
814
+ # Update environment.json with run status
815
+ if self.workspace.run_mode == RunMode.NORMAL and self.workdir:
816
+ from datetime import datetime
817
+ from experimaestro.utils.environment import ExperimentEnvironment
818
+
819
+ env_path = self.workdir / "environment.json"
820
+ if env_path.exists():
821
+ try:
822
+ env = ExperimentEnvironment.load(env_path)
823
+ if env.run:
824
+ env.run.ended_at = datetime.now().isoformat()
825
+ env.run.status = status
826
+ env.save(env_path)
827
+ except Exception as e:
828
+ logger.warning("Failed to update environment.json: %s", e)
483
829
 
484
830
  # Note: Don't stop scheduler - it's shared!
485
- # Note: Don't stop server - it runs in daemon mode until program exit
486
831
 
487
832
  if self.taskOutputsWorker is not None:
488
833
  logger.info("Stopping tasks outputs worker")
@@ -496,13 +841,39 @@ class experiment:
496
841
  experiment.CURRENT = self.old_experiment
497
842
 
498
843
  if self.workspace.run_mode == RunMode.NORMAL:
499
- # Write the state
500
- logging.info("Saving the experiment state")
501
- from experimaestro.scheduler.state import ExperimentState
844
+ # Remove job directories for transient jobs with REMOVE mode
845
+ if exc_type is None:
846
+ for job in list(self.scheduler.jobs.values()):
847
+ if (
848
+ self in job.experiments
849
+ and job.transient.should_remove
850
+ and job.state.finished()
851
+ ):
852
+ job_path = job.path
853
+ if job_path.exists():
854
+ logger.info(
855
+ "Removing transient job directory: %s", job_path
856
+ )
857
+ rmtree(job_path)
858
+ # Also remove the symlink in the experiment's jobs folder
859
+ symlink_path = self.jobspath / job.relpath
860
+ if symlink_path.is_symlink():
861
+ symlink_path.unlink()
862
+
863
+ # Cleanup old runs based on history settings
864
+ try:
865
+ cleanup_experiment_history(
866
+ self._experiment_base,
867
+ current_run_id=self.run_id,
868
+ current_status=status,
869
+ history=self._get_history_settings(),
870
+ )
871
+ except Exception as e:
872
+ logger.warning("Failed to cleanup old runs: %s", e)
502
873
 
503
- ExperimentState.save(
504
- self.workdir / "state.json", self.scheduler.jobs.values()
505
- )
874
+ # Suppress GracefulExperimentExit exception
875
+ if exc_type is GracefulExperimentExit:
876
+ return True
506
877
 
507
878
  async def update_task_output_count(self, delta: int):
508
879
  """Change in the number of task outputs to process"""
@@ -526,26 +897,60 @@ class experiment:
526
897
  """Adds a service (e.g. tensorboard viewer) to the experiment
527
898
 
528
899
  :param service: A service instance
529
- :return: The same service instance
900
+ :return: The same service instance (or existing service if already added)
530
901
  """
531
- self.services[service.id] = service
902
+ existing = self.services.get(service.id)
903
+ if existing is not None:
904
+ if existing is service:
905
+ # Same service instance added twice - just return it
906
+ logger.debug("Service %s already added, ignoring duplicate", service.id)
907
+ return service
908
+ else:
909
+ # Different service with same id - warn and replace
910
+ logger.warning(
911
+ "Replacing service %s (old id=%s, new id=%s)",
912
+ service.id,
913
+ id(existing),
914
+ id(service),
915
+ )
532
916
 
533
- # Register database listener for state changes
534
- service.add_listener(self._db_listener)
917
+ self._services[service.id] = service
535
918
 
536
- # Register file listener for state changes (writes to services.json)
537
- service.add_listener(self)
919
+ # Allow service to access experiment context
920
+ service.set_experiment(self)
538
921
 
539
- self.scheduler.notify_service_add(service)
922
+ # Register state listener for state changes (writes events)
923
+ if self._state_listener is not None:
924
+ service.add_listener(self._state_listener)
540
925
 
541
- # Write services.json file
542
- self._write_services_json()
926
+ # Register listener for state changes
927
+ service.add_listener(self)
928
+
929
+ self.scheduler.notify_service_add(service, self.name, self.run_id or "")
543
930
 
544
931
  return service
545
932
 
546
933
  def service_state_changed(self, service):
547
- """Called when a service state changes - update services.json"""
548
- self._write_services_json()
934
+ """Called when a service state changes - notify listeners"""
935
+ state_name = service.state.name if hasattr(service.state, "name") else "UNKNOWN"
936
+ logger.debug(
937
+ "Service %s state changed to %s (experiment=%s)",
938
+ service.id,
939
+ state_name,
940
+ self.name,
941
+ )
942
+
943
+ # Notify state listeners (for TUI tab title updates etc.)
944
+ from experimaestro.scheduler.state_status import ServiceStateChangedEvent
945
+
946
+ if self.scheduler is not None:
947
+ event = ServiceStateChangedEvent(
948
+ experiment_id=self.name,
949
+ run_id=self.run_id or "",
950
+ service_id=service.id,
951
+ state=state_name,
952
+ )
953
+ self.scheduler._notify_state_listeners_async(event)
549
954
 
550
955
  def save(self, obj: Any, name: str = "default"):
551
956
  """Serializes configurations.
@@ -564,19 +969,209 @@ class experiment:
564
969
 
565
970
  save(obj, save_dir)
566
971
 
567
- def load(self, reference: str, name: str = "default"):
568
- """Serializes configurations.
569
-
570
- Loads configuration objects from an experimental directory
972
+ def load(self, reference: str, name: str = "default", run_id: str = None):
973
+ """Loads configuration objects from an experimental directory.
571
974
 
572
975
  :param reference: The name of the experiment
573
976
  :param name: The name of the saving directory (default to `default`)
977
+ :param run_id: The run ID to load from (default: the most recently modified run directory)
574
978
  """
575
979
  from experimaestro import load
576
980
 
577
- path = self.workspace.experimentspath / reference / "data" / name
981
+ exp_base = self.workspace.experimentspath / reference
982
+ if run_id is None:
983
+ # Find the latest run directory
984
+ run_dirs = sorted(
985
+ [d for d in exp_base.iterdir() if d.is_dir()],
986
+ key=lambda d: d.stat().st_mtime,
987
+ reverse=True,
988
+ )
989
+ if not run_dirs:
990
+ raise FileNotFoundError(f"No runs found for experiment {reference}")
991
+ run_dir = run_dirs[0]
992
+ else:
993
+ run_dir = exp_base / run_id
994
+
995
+ path = run_dir / "data" / name
578
996
  return load(path)
579
997
 
998
+ def _get_history_settings(self) -> HistorySettings:
999
+ """Get the history settings for this experiment.
1000
+
1001
+ Returns workspace-specific settings if available, otherwise global defaults.
1002
+ """
1003
+ # Check if workspace has explicit history settings
1004
+ ws_settings = self.workspace.settings
1005
+ if ws_settings and ws_settings.history:
1006
+ return ws_settings.history
1007
+
1008
+ # Fall back to global settings
1009
+ settings = get_settings()
1010
+ return settings.history
1011
+
1012
+
1013
+ def get_run_status(run_dir: Path) -> Optional[str]:
1014
+ """Get the status of a run from its status.json or environment.json.
1015
+
1016
+ Args:
1017
+ run_dir: Path to the run directory
1018
+
1019
+ Returns:
1020
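+ The status recorded in environment.json when present (e.g. 'completed', 'failed' or 'detached'); otherwise 'completed' or 'failed' derived from status.json, or None if status cannot be determined.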
1021
+ """
1022
+ # Try environment.json first (most reliable - written on exit)
1023
+ env_path = run_dir / "environment.json"
1024
+ if env_path.exists():
1025
+ try:
1026
+ from experimaestro.utils.environment import ExperimentEnvironment
1027
+
1028
+ env = ExperimentEnvironment.load(env_path)
1029
+ if env.run and env.run.status:
1030
+ return env.run.status
1031
+ except Exception:
1032
+ pass
1033
+
1034
+ # Fall back to status.json
1035
+ status_path = run_dir / "status.json"
1036
+ if status_path.exists():
1037
+ try:
1038
+ with status_path.open() as f:
1039
+ status = json.load(f)
1040
+ # Check the experiment status field
1041
+ exp_status = status.get("status")
1042
+ if exp_status == "done":
1043
+ return "completed"
1044
+ elif exp_status == "failed":
1045
+ return "failed"
1046
+ # Check job states as fallback
1047
+ jobs = status.get("jobs", {})
1048
+ if any(j.get("state") == "error" for j in jobs.values()):
1049
+ return "failed"
1050
+ return "completed"
1051
+ except Exception:
1052
+ pass
1053
+
1054
+ # Cannot determine status
1055
+ return None
1056
+
1057
+
1058
+ def cleanup_experiment_history(
1059
+ experiment_base: Path,
1060
+ *,
1061
+ current_run_id: Optional[str] = None,
1062
+ current_status: Optional[str] = None,
1063
+ history: Optional[HistorySettings] = None,
1064
+ ) -> list[Path]:
1065
+ """Clean up old experiment runs based on history settings.
1066
+
1067
+ This function can be called from the CLI or other contexts.
1068
+
1069
+ Args:
1070
+ experiment_base: Path to the experiment directory (containing run subdirs)
1071
+ current_run_id: ID of the current run to exclude from cleanup (optional)
1072
+ current_status: Status of the current run ('completed', 'failed' or 'detached');
1073
+ all past failed runs are removed only when it is 'completed' (optional)
1074
+ history: History settings to use (defaults to global settings)
1075
+
1076
+ Returns:
1077
+ List of paths that were removed
1078
+ """
1079
+ if history is None:
1080
+ settings = get_settings()
1081
+ history = settings.history
1082
+
1083
+ removed_paths = []
1084
+
1085
+ # List all run directories (excluding the current one)
1086
+ run_dirs = []
1087
+ for d in experiment_base.iterdir():
1088
+ if d.is_dir() and d.name != current_run_id:
1089
+ run_dirs.append(d)
1090
+
1091
+ # Sort by directory name (oldest first)
1092
+ # Directory names are in format YYYYMMDD_HHMMSS or YYYYMMDD_HHMMSS.N (with modifier)
1093
+ def run_sort_key(d: Path) -> tuple[str, int]:
1094
+ """Parse run_id for sorting, handling modifiers like 20250501_102315.1"""
1095
+ name = d.name
1096
+ if "." in name:
1097
+ parts = name.split(".", 1)
1098
+ try:
1099
+ return (parts[0], int(parts[1]))
1100
+ except (ValueError, IndexError):
1101
+ return (name, 0)
1102
+ return (name, 0)
1103
+
1104
+ run_dirs.sort(key=run_sort_key)
1105
+
1106
+ # Categorize runs by status
1107
+ completed_runs = []
1108
+ failed_runs = []
1109
+
1110
+ for run_dir in run_dirs:
1111
+ status = get_run_status(run_dir)
1112
+ if status == "completed":
1113
+ completed_runs.append(run_dir)
1114
+ elif status == "failed":
1115
+ failed_runs.append(run_dir)
1116
+ # Runs with unknown status are not touched
1117
+
1118
+ # If current run succeeded, remove all past failed runs (per user requirement)
1119
+ if current_status == "completed":
1120
+ # Remove all past failed runs
1121
+ # Per user requirement: "If an experiment succeed, it remove the past failed"
1122
+ for run_dir in failed_runs:
1123
+ logger.info("Removing failed run (experiment succeeded): %s", run_dir)
1124
+ try:
1125
+ rmtree(run_dir)
1126
+ removed_paths.append(run_dir)
1127
+ except Exception as e:
1128
+ logger.warning("Failed to remove run directory %s: %s", run_dir, e)
1129
+ failed_runs = []
1130
+
1131
+ # Remove failed runs that are older than the newest successful run
1132
+ # (if there's a success after a failure, that failure is stale)
1133
+ if completed_runs:
1134
+ # Find the newest completed run
1135
+ newest_completed = run_sort_key(completed_runs[-1])
1136
+ remaining_failed = []
1137
+ for run_dir in failed_runs:
1138
+ if run_sort_key(run_dir) < newest_completed:
1139
+ logger.info("Removing failed run (success exists after): %s", run_dir)
1140
+ try:
1141
+ rmtree(run_dir)
1142
+ removed_paths.append(run_dir)
1143
+ except Exception as e:
1144
+ logger.warning("Failed to remove run directory %s: %s", run_dir, e)
1145
+ else:
1146
+ remaining_failed.append(run_dir)
1147
+ failed_runs = remaining_failed
1148
+
1149
+ # Keep only max_done completed runs (remove oldest ones)
1150
+ while len(completed_runs) > history.max_done:
1151
+ run_dir = completed_runs.pop(0) # Remove oldest
1152
+ logger.info(
1153
+ "Removing old completed run (keeping %d): %s", history.max_done, run_dir
1154
+ )
1155
+ try:
1156
+ rmtree(run_dir)
1157
+ removed_paths.append(run_dir)
1158
+ except Exception as e:
1159
+ logger.warning("Failed to remove run directory %s: %s", run_dir, e)
1160
+
1161
+ # Keep only max_failed failed runs (remove oldest ones)
1162
+ while len(failed_runs) > history.max_failed:
1163
+ run_dir = failed_runs.pop(0) # Remove oldest
1164
+ logger.info(
1165
+ "Removing old failed run (keeping %d): %s", history.max_failed, run_dir
1166
+ )
1167
+ try:
1168
+ rmtree(run_dir)
1169
+ removed_paths.append(run_dir)
1170
+ except Exception as e:
1171
+ logger.warning("Failed to remove run directory %s: %s", run_dir, e)
1172
+
1173
+ return removed_paths
1174
+
580
1175
 
581
1176
  # re-export at the module level
582
1177
  current = experiment.current