experimaestro 2.0.0b8__py3-none-any.whl → 2.0.0b17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of experimaestro might be problematic. Click here for more details.
- experimaestro/__init__.py +12 -5
- experimaestro/cli/__init__.py +239 -126
- experimaestro/cli/filter.py +48 -23
- experimaestro/cli/jobs.py +253 -71
- experimaestro/cli/refactor.py +1 -2
- experimaestro/commandline.py +7 -4
- experimaestro/connectors/__init__.py +9 -1
- experimaestro/connectors/local.py +43 -3
- experimaestro/core/arguments.py +18 -18
- experimaestro/core/identifier.py +11 -11
- experimaestro/core/objects/config.py +96 -39
- experimaestro/core/objects/config_walk.py +3 -3
- experimaestro/core/{subparameters.py → partial.py} +16 -16
- experimaestro/core/partial_lock.py +394 -0
- experimaestro/core/types.py +12 -15
- experimaestro/dynamic.py +290 -0
- experimaestro/experiments/__init__.py +6 -2
- experimaestro/experiments/cli.py +217 -50
- experimaestro/experiments/configuration.py +24 -0
- experimaestro/generators.py +5 -5
- experimaestro/ipc.py +118 -1
- experimaestro/launcherfinder/__init__.py +2 -2
- experimaestro/launcherfinder/registry.py +6 -7
- experimaestro/launcherfinder/specs.py +2 -9
- experimaestro/launchers/slurm/__init__.py +2 -2
- experimaestro/launchers/slurm/base.py +62 -0
- experimaestro/locking.py +957 -1
- experimaestro/notifications.py +89 -201
- experimaestro/progress.py +63 -366
- experimaestro/rpyc.py +0 -2
- experimaestro/run.py +29 -2
- experimaestro/scheduler/__init__.py +8 -1
- experimaestro/scheduler/base.py +629 -53
- experimaestro/scheduler/dependencies.py +20 -16
- experimaestro/scheduler/experiment.py +732 -167
- experimaestro/scheduler/interfaces.py +316 -101
- experimaestro/scheduler/jobs.py +58 -20
- experimaestro/scheduler/remote/adaptive_sync.py +265 -0
- experimaestro/scheduler/remote/client.py +171 -117
- experimaestro/scheduler/remote/protocol.py +8 -193
- experimaestro/scheduler/remote/server.py +95 -71
- experimaestro/scheduler/services.py +53 -28
- experimaestro/scheduler/state_provider.py +663 -2430
- experimaestro/scheduler/state_status.py +1247 -0
- experimaestro/scheduler/transient.py +31 -0
- experimaestro/scheduler/workspace.py +1 -1
- experimaestro/scheduler/workspace_state_provider.py +1273 -0
- experimaestro/scriptbuilder.py +4 -4
- experimaestro/settings.py +36 -0
- experimaestro/tests/conftest.py +33 -5
- experimaestro/tests/connectors/bin/executable.py +1 -1
- experimaestro/tests/fixtures/pre_experiment/experiment_check_env.py +16 -0
- experimaestro/tests/fixtures/pre_experiment/experiment_check_mock.py +14 -0
- experimaestro/tests/fixtures/pre_experiment/experiment_simple.py +12 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_env.py +5 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_error.py +3 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_mock.py +8 -0
- experimaestro/tests/launchers/bin/test.py +1 -0
- experimaestro/tests/launchers/test_slurm.py +9 -9
- experimaestro/tests/partial_reschedule.py +46 -0
- experimaestro/tests/restart.py +3 -3
- experimaestro/tests/restart_main.py +1 -0
- experimaestro/tests/scripts/notifyandwait.py +1 -0
- experimaestro/tests/task_partial.py +38 -0
- experimaestro/tests/task_tokens.py +2 -2
- experimaestro/tests/tasks/test_dynamic.py +6 -6
- experimaestro/tests/test_dependencies.py +3 -3
- experimaestro/tests/test_deprecated.py +15 -15
- experimaestro/tests/test_dynamic_locking.py +317 -0
- experimaestro/tests/test_environment.py +24 -14
- experimaestro/tests/test_experiment.py +171 -36
- experimaestro/tests/test_identifier.py +25 -25
- experimaestro/tests/test_identifier_stability.py +3 -5
- experimaestro/tests/test_multitoken.py +2 -4
- experimaestro/tests/{test_subparameters.py → test_partial.py} +25 -25
- experimaestro/tests/test_partial_paths.py +81 -138
- experimaestro/tests/test_pre_experiment.py +219 -0
- experimaestro/tests/test_progress.py +2 -8
- experimaestro/tests/test_remote_state.py +560 -99
- experimaestro/tests/test_stray_jobs.py +261 -0
- experimaestro/tests/test_tasks.py +1 -2
- experimaestro/tests/test_token_locking.py +52 -67
- experimaestro/tests/test_tokens.py +5 -6
- experimaestro/tests/test_transient.py +225 -0
- experimaestro/tests/test_workspace_state_provider.py +768 -0
- experimaestro/tests/token_reschedule.py +1 -3
- experimaestro/tests/utils.py +2 -7
- experimaestro/tokens.py +227 -372
- experimaestro/tools/diff.py +1 -0
- experimaestro/tools/documentation.py +4 -5
- experimaestro/tools/jobs.py +1 -2
- experimaestro/tui/app.py +438 -1966
- experimaestro/tui/app.tcss +162 -0
- experimaestro/tui/dialogs.py +172 -0
- experimaestro/tui/log_viewer.py +253 -3
- experimaestro/tui/messages.py +137 -0
- experimaestro/tui/utils.py +54 -0
- experimaestro/tui/widgets/__init__.py +23 -0
- experimaestro/tui/widgets/experiments.py +468 -0
- experimaestro/tui/widgets/global_services.py +238 -0
- experimaestro/tui/widgets/jobs.py +972 -0
- experimaestro/tui/widgets/log.py +156 -0
- experimaestro/tui/widgets/orphans.py +363 -0
- experimaestro/tui/widgets/runs.py +185 -0
- experimaestro/tui/widgets/services.py +314 -0
- experimaestro/tui/widgets/stray_jobs.py +528 -0
- experimaestro/utils/__init__.py +1 -1
- experimaestro/utils/environment.py +105 -22
- experimaestro/utils/fswatcher.py +124 -0
- experimaestro/utils/jobs.py +1 -2
- experimaestro/utils/jupyter.py +1 -2
- experimaestro/utils/logging.py +72 -0
- experimaestro/version.py +2 -2
- experimaestro/webui/__init__.py +9 -0
- experimaestro/webui/app.py +117 -0
- experimaestro/{server → webui}/data/index.css +66 -11
- experimaestro/webui/data/index.css.map +1 -0
- experimaestro/{server → webui}/data/index.js +82763 -87217
- experimaestro/webui/data/index.js.map +1 -0
- experimaestro/webui/routes/__init__.py +5 -0
- experimaestro/webui/routes/auth.py +53 -0
- experimaestro/webui/routes/proxy.py +117 -0
- experimaestro/webui/server.py +200 -0
- experimaestro/webui/state_bridge.py +152 -0
- experimaestro/webui/websocket.py +413 -0
- {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/METADATA +5 -6
- experimaestro-2.0.0b17.dist-info/RECORD +219 -0
- experimaestro/cli/progress.py +0 -269
- experimaestro/scheduler/state.py +0 -75
- experimaestro/scheduler/state_db.py +0 -437
- experimaestro/scheduler/state_sync.py +0 -891
- experimaestro/server/__init__.py +0 -467
- experimaestro/server/data/index.css.map +0 -1
- experimaestro/server/data/index.js.map +0 -1
- experimaestro/tests/test_cli_jobs.py +0 -615
- experimaestro/tests/test_file_progress.py +0 -425
- experimaestro/tests/test_file_progress_integration.py +0 -477
- experimaestro/tests/test_state_db.py +0 -434
- experimaestro-2.0.0b8.dist-info/RECORD +0 -187
- /experimaestro/{server → webui}/data/1815e00441357e01619e.ttf +0 -0
- /experimaestro/{server → webui}/data/2463b90d9a316e4e5294.woff2 +0 -0
- /experimaestro/{server → webui}/data/2582b0e4bcf85eceead0.ttf +0 -0
- /experimaestro/{server → webui}/data/89999bdf5d835c012025.woff2 +0 -0
- /experimaestro/{server → webui}/data/914997e1bdfc990d0897.ttf +0 -0
- /experimaestro/{server → webui}/data/c210719e60948b211a12.woff2 +0 -0
- /experimaestro/{server → webui}/data/favicon.ico +0 -0
- /experimaestro/{server → webui}/data/index.html +0 -0
- /experimaestro/{server → webui}/data/login.html +0 -0
- /experimaestro/{server → webui}/data/manifest.json +0 -0
- {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/WHEEL +0 -0
- {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/entry_points.txt +0 -0
- {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
import asyncio
|
|
2
|
+
import inspect
|
|
2
3
|
import json
|
|
3
4
|
import logging
|
|
4
5
|
import os
|
|
5
6
|
from pathlib import Path
|
|
6
7
|
import time
|
|
7
8
|
from shutil import rmtree
|
|
8
|
-
from typing import Any, Dict, Optional, TypeVar, Union
|
|
9
|
+
from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeVar, Union
|
|
9
10
|
|
|
10
11
|
from experimaestro.core.objects import WatchedOutput
|
|
11
12
|
from experimaestro.exceptions import HandledException
|
|
@@ -14,9 +15,19 @@ from experimaestro.scheduler.signal_handler import SIGNAL_HANDLER
|
|
|
14
15
|
from experimaestro.scheduler.jobs import Job
|
|
15
16
|
from experimaestro.scheduler.services import Service
|
|
16
17
|
from experimaestro.scheduler.workspace import RunMode, Workspace
|
|
17
|
-
from experimaestro.
|
|
18
|
+
from experimaestro.scheduler.interfaces import (
|
|
19
|
+
BaseExperiment,
|
|
20
|
+
BaseService,
|
|
21
|
+
ExperimentJobInformation,
|
|
22
|
+
)
|
|
23
|
+
from experimaestro.settings import WorkspaceSettings, get_settings, HistorySettings
|
|
24
|
+
from experimaestro.experiments.configuration import DirtyGitAction
|
|
18
25
|
from experimaestro.utils import logger
|
|
19
26
|
|
|
27
|
+
if TYPE_CHECKING:
|
|
28
|
+
from experimaestro.scheduler.interfaces import ExperimentStatus
|
|
29
|
+
from experimaestro.scheduler.state_status import ExperimentEventWriter
|
|
30
|
+
|
|
20
31
|
ServiceClass = TypeVar("ServiceClass", bound=Service)
|
|
21
32
|
|
|
22
33
|
|
|
@@ -26,11 +37,46 @@ class FailedExperiment(HandledException):
|
|
|
26
37
|
pass
|
|
27
38
|
|
|
28
39
|
|
|
29
|
-
class
|
|
30
|
-
"""
|
|
40
|
+
class DirtyGitError(HandledException):
|
|
41
|
+
"""Raised when the git repository has uncommitted changes and dirty_git=error"""
|
|
42
|
+
|
|
43
|
+
pass
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class GracefulExperimentExit(Exception):
|
|
47
|
+
"""Raised to exit an experiment context without waiting for running jobs.
|
|
48
|
+
|
|
49
|
+
This is useful in tests or when you want to detach from an experiment
|
|
50
|
+
while keeping jobs running (e.g., to test stray job detection).
|
|
51
|
+
|
|
52
|
+
Example::
|
|
53
|
+
|
|
54
|
+
with experiment(workdir, "my-experiment") as xp:
|
|
55
|
+
task = MyTask.C(value=1).submit()
|
|
56
|
+
# Wait for task to start...
|
|
57
|
+
raise GracefulExperimentExit() # Exit without waiting for task to finish
|
|
58
|
+
"""
|
|
31
59
|
|
|
32
|
-
|
|
33
|
-
|
|
60
|
+
pass
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class StateListener:
|
|
64
|
+
"""Listener that writes events to filesystem
|
|
65
|
+
|
|
66
|
+
Job state events are written to per-job event files by the scheduler.
|
|
67
|
+
This listener writes experiment-level events (job state, services) to
|
|
68
|
+
the experiment event file.
|
|
69
|
+
"""
|
|
70
|
+
|
|
71
|
+
def __init__(
|
|
72
|
+
self,
|
|
73
|
+
event_writer: "ExperimentEventWriter",
|
|
74
|
+
experiment: "experiment",
|
|
75
|
+
experiment_id: str,
|
|
76
|
+
run_id: str,
|
|
77
|
+
):
|
|
78
|
+
self.event_writer = event_writer
|
|
79
|
+
self.experiment = experiment
|
|
34
80
|
self.experiment_id = experiment_id
|
|
35
81
|
self.run_id = run_id
|
|
36
82
|
|
|
@@ -39,34 +85,65 @@ class DatabaseListener:
|
|
|
39
85
|
pass
|
|
40
86
|
|
|
41
87
|
def job_state(self, job):
|
|
42
|
-
"""
|
|
43
|
-
|
|
88
|
+
"""Write job state change event to experiment event file"""
|
|
89
|
+
from .state_status import JobStateChangedEvent
|
|
90
|
+
|
|
91
|
+
# Get failure reason if error state
|
|
92
|
+
failure_reason = None
|
|
93
|
+
if hasattr(job.state, "failure_reason") and job.state.failure_reason:
|
|
94
|
+
failure_reason = job.state.failure_reason.name
|
|
95
|
+
|
|
96
|
+
# Get progress as list of dicts
|
|
97
|
+
progress = []
|
|
98
|
+
if hasattr(job, "_progress") and job._progress:
|
|
99
|
+
progress = [
|
|
100
|
+
{"level": p.level, "progress": p.progress, "desc": p.desc}
|
|
101
|
+
for p in job._progress
|
|
102
|
+
]
|
|
103
|
+
|
|
104
|
+
event = JobStateChangedEvent(
|
|
105
|
+
job_id=job.identifier,
|
|
106
|
+
state=job.state.name,
|
|
107
|
+
failure_reason=failure_reason,
|
|
108
|
+
submitted_time=job.submittime,
|
|
109
|
+
started_time=job.starttime,
|
|
110
|
+
ended_time=job.endtime,
|
|
111
|
+
exit_code=getattr(job, "exit_code", None),
|
|
112
|
+
retry_count=getattr(job, "retry_count", 0),
|
|
113
|
+
progress=progress,
|
|
114
|
+
)
|
|
115
|
+
# Write to experiment event file
|
|
116
|
+
self.event_writer.write_event(event)
|
|
44
117
|
|
|
45
118
|
def service_add(self, service):
|
|
46
|
-
"""
|
|
119
|
+
"""Write service added event to filesystem"""
|
|
47
120
|
from experimaestro.scheduler.services import Service
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
service.description(),
|
|
55
|
-
|
|
121
|
+
from .state_status import ServiceAddedEvent
|
|
122
|
+
|
|
123
|
+
state_dict = Service.serialize_state_dict(service.state_dict())
|
|
124
|
+
service_class = f"{service.__class__.__module__}.{service.__class__.__name__}"
|
|
125
|
+
event = ServiceAddedEvent(
|
|
126
|
+
service_id=service.id,
|
|
127
|
+
description=service.description(),
|
|
128
|
+
service_class=service_class,
|
|
129
|
+
state_dict=state_dict,
|
|
56
130
|
)
|
|
131
|
+
self.event_writer.write_event(event)
|
|
57
132
|
|
|
58
133
|
def service_state_changed(self, service):
|
|
59
134
|
"""Called when service state changes (runtime only, not persisted)"""
|
|
60
|
-
# Service state is managed at runtime, not persisted
|
|
135
|
+
# Service state is managed at runtime, not persisted
|
|
61
136
|
pass
|
|
62
137
|
|
|
63
138
|
|
|
64
|
-
class experiment:
|
|
139
|
+
class experiment(BaseExperiment):
|
|
65
140
|
"""Context manager for running experiments.
|
|
66
141
|
|
|
67
142
|
Creates a workspace, manages task submission, and optionally starts
|
|
68
143
|
a web server for monitoring.
|
|
69
144
|
|
|
145
|
+
Implements BaseExperiment interface for use with StateProvider and TUI.
|
|
146
|
+
|
|
70
147
|
Example::
|
|
71
148
|
|
|
72
149
|
from experimaestro import experiment
|
|
@@ -99,6 +176,10 @@ class experiment:
|
|
|
99
176
|
run_mode: Optional[RunMode] = None,
|
|
100
177
|
launcher=None,
|
|
101
178
|
register_signals: bool = True,
|
|
179
|
+
project_paths: Optional[list[Path]] = None,
|
|
180
|
+
wait_for_quit: bool = False,
|
|
181
|
+
dirty_git: DirtyGitAction = DirtyGitAction.WARN,
|
|
182
|
+
no_db: bool = False,
|
|
102
183
|
):
|
|
103
184
|
"""
|
|
104
185
|
:param env: an environment -- or a working directory for a local
|
|
@@ -118,10 +199,60 @@ class experiment:
|
|
|
118
199
|
|
|
119
200
|
:param register_signals: Whether to register signal handlers (default: True).
|
|
120
201
|
Set to False when running in a background thread.
|
|
202
|
+
|
|
203
|
+
:param project_paths: Paths to the project files (for git info). If not
|
|
204
|
+
provided, will be inferred from the caller's location.
|
|
205
|
+
|
|
206
|
+
:param wait_for_quit: Deprecated, no longer used. Web server is no longer
|
|
207
|
+
started automatically.
|
|
208
|
+
|
|
209
|
+
:param dirty_git: Action when git repository has uncommitted changes:
|
|
210
|
+
DirtyGitAction.IGNORE (don't check), DirtyGitAction.WARN (log warning,
|
|
211
|
+
default), or DirtyGitAction.ERROR (raise exception).
|
|
212
|
+
|
|
213
|
+
:param no_db: Deprecated, kept for backwards compatibility. This parameter
|
|
214
|
+
is now a no-op as the database has been replaced with filesystem-based
|
|
215
|
+
state tracking.
|
|
216
|
+
|
|
217
|
+
.. deprecated::
|
|
218
|
+
The ``host``, ``port``, ``token``, and ``wait_for_quit`` parameters are
|
|
219
|
+
deprecated. Use ``--web`` flag with ``run-experiment`` CLI or start the
|
|
220
|
+
web server separately.
|
|
121
221
|
"""
|
|
222
|
+
import warnings
|
|
122
223
|
|
|
123
224
|
from experimaestro.scheduler import Listener, Scheduler
|
|
124
225
|
|
|
226
|
+
# Warn about deprecated server parameters
|
|
227
|
+
if host is not None:
|
|
228
|
+
warnings.warn(
|
|
229
|
+
"The 'host' parameter is deprecated. Use '--web' flag with "
|
|
230
|
+
"'run-experiment' CLI or start the web server separately.",
|
|
231
|
+
DeprecationWarning,
|
|
232
|
+
stacklevel=2,
|
|
233
|
+
)
|
|
234
|
+
if port is not None:
|
|
235
|
+
warnings.warn(
|
|
236
|
+
"The 'port' parameter is deprecated. Use '--web' flag with "
|
|
237
|
+
"'run-experiment' CLI or start the web server separately.",
|
|
238
|
+
DeprecationWarning,
|
|
239
|
+
stacklevel=2,
|
|
240
|
+
)
|
|
241
|
+
if token is not None:
|
|
242
|
+
warnings.warn(
|
|
243
|
+
"The 'token' parameter is deprecated. Use '--web' flag with "
|
|
244
|
+
"'run-experiment' CLI or start the web server separately.",
|
|
245
|
+
DeprecationWarning,
|
|
246
|
+
stacklevel=2,
|
|
247
|
+
)
|
|
248
|
+
if wait_for_quit:
|
|
249
|
+
warnings.warn(
|
|
250
|
+
"The 'wait_for_quit' parameter is deprecated. Use '--web' flag with "
|
|
251
|
+
"'run-experiment' CLI or start the web server separately.",
|
|
252
|
+
DeprecationWarning,
|
|
253
|
+
stacklevel=2,
|
|
254
|
+
)
|
|
255
|
+
|
|
125
256
|
settings = get_settings()
|
|
126
257
|
if not isinstance(env, WorkspaceSettings):
|
|
127
258
|
env = WorkspaceSettings(id=None, path=Path(env))
|
|
@@ -130,36 +261,45 @@ class experiment:
|
|
|
130
261
|
run_mode = run_mode or RunMode.NORMAL
|
|
131
262
|
self.workspace = Workspace(settings, env, launcher=launcher, run_mode=run_mode)
|
|
132
263
|
|
|
133
|
-
#
|
|
134
|
-
self.
|
|
135
|
-
|
|
136
|
-
|
|
264
|
+
# Store experiment name for ID references
|
|
265
|
+
self.name = name
|
|
266
|
+
|
|
267
|
+
# Create experiment base directory (run directories will be created inside)
|
|
268
|
+
self._experiment_base = self.workspace.experimentspath / name
|
|
269
|
+
self._experiment_base.mkdir(parents=True, exist_ok=True)
|
|
270
|
+
|
|
271
|
+
# Lock is at experiment level (prevents concurrent runs of same experiment)
|
|
272
|
+
self.xplockpath = self._experiment_base / "lock"
|
|
273
|
+
|
|
274
|
+
# workdir will be set in __enter__ after run_id is generated
|
|
275
|
+
self.workdir = None
|
|
137
276
|
self.xplock = None
|
|
138
277
|
self.old_experiment = None
|
|
139
|
-
self.
|
|
278
|
+
self._services: Dict[str, Service] = {}
|
|
140
279
|
self._job_listener: Optional[Listener] = None
|
|
141
280
|
self._register_signals = register_signals
|
|
281
|
+
self._dirty_git = dirty_git
|
|
282
|
+
self._no_db = no_db
|
|
142
283
|
|
|
143
|
-
#
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
284
|
+
# Capture project paths for git info
|
|
285
|
+
if project_paths is not None:
|
|
286
|
+
self._project_paths = project_paths
|
|
287
|
+
else:
|
|
288
|
+
# Fall back to caller's file path
|
|
289
|
+
self._project_paths = []
|
|
290
|
+
try:
|
|
291
|
+
# Go up the stack to find the first frame outside this module
|
|
292
|
+
for frame_info in inspect.stack():
|
|
293
|
+
frame_file = frame_info.filename
|
|
294
|
+
if "experimaestro" not in frame_file:
|
|
295
|
+
self._project_paths = [Path(frame_file).resolve().parent]
|
|
296
|
+
break
|
|
297
|
+
except Exception:
|
|
298
|
+
pass
|
|
153
299
|
|
|
154
300
|
# Use singleton scheduler
|
|
155
301
|
self.scheduler = Scheduler.instance()
|
|
156
302
|
|
|
157
|
-
# Determine if we need a server
|
|
158
|
-
self._needs_server = (
|
|
159
|
-
settings.server.port is not None and settings.server.port >= 0
|
|
160
|
-
) and self.workspace.run_mode == RunMode.NORMAL
|
|
161
|
-
self._server_settings = settings.server if self._needs_server else None
|
|
162
|
-
|
|
163
303
|
if os.environ.get("XPM_ENABLEFAULTHANDLER", "0") == "1":
|
|
164
304
|
import faulthandler
|
|
165
305
|
|
|
@@ -197,50 +337,76 @@ class experiment:
|
|
|
197
337
|
"""Return the directory in which results can be stored for this experiment"""
|
|
198
338
|
return self.workdir / "jobs"
|
|
199
339
|
|
|
340
|
+
# =========================================================================
|
|
341
|
+
# BaseExperiment interface properties
|
|
342
|
+
# =========================================================================
|
|
343
|
+
|
|
200
344
|
@property
|
|
201
|
-
def
|
|
202
|
-
"""
|
|
203
|
-
|
|
204
|
-
yield alt_workdir / "jobs"
|
|
345
|
+
def experiment_id(self) -> str:
|
|
346
|
+
"""Experiment identifier (overrides BaseExperiment.experiment_id)"""
|
|
347
|
+
return self.name
|
|
205
348
|
|
|
206
349
|
@property
|
|
207
|
-
def
|
|
208
|
-
"""
|
|
209
|
-
|
|
350
|
+
def status(self) -> "ExperimentStatus":
|
|
351
|
+
"""Experiment status - RUNNING for live experiments, updated on finalization"""
|
|
352
|
+
from experimaestro.scheduler.interfaces import ExperimentStatus
|
|
353
|
+
|
|
354
|
+
return getattr(self, "_status", ExperimentStatus.RUNNING)
|
|
210
355
|
|
|
211
356
|
@property
|
|
212
|
-
def
|
|
213
|
-
"""
|
|
214
|
-
return
|
|
357
|
+
def jobs(self) -> Dict[str, "Job"]:
|
|
358
|
+
"""Jobs in this experiment"""
|
|
359
|
+
return {
|
|
360
|
+
job.identifier: job
|
|
361
|
+
for job in self.scheduler.jobs.values()
|
|
362
|
+
if self in job.experiments
|
|
363
|
+
}
|
|
215
364
|
|
|
216
365
|
@property
|
|
217
|
-
def
|
|
218
|
-
"""
|
|
219
|
-
return self.
|
|
366
|
+
def tags(self) -> Dict[str, Dict[str, str]]:
|
|
367
|
+
"""Tags for jobs - tracked directly in experiment"""
|
|
368
|
+
return self._tags
|
|
220
369
|
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
370
|
+
@property
|
|
371
|
+
def dependencies(self) -> Dict[str, List[str]]:
|
|
372
|
+
"""Job dependencies - tracked directly in experiment"""
|
|
373
|
+
return self._dependencies
|
|
224
374
|
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
)
|
|
240
|
-
services_data[service_id] = service_state
|
|
375
|
+
@property
|
|
376
|
+
def events_count(self) -> int:
|
|
377
|
+
"""Number of events processed"""
|
|
378
|
+
return self._events_count
|
|
379
|
+
|
|
380
|
+
@property
|
|
381
|
+
def started_at(self) -> Optional[float]:
|
|
382
|
+
"""Timestamp when experiment started"""
|
|
383
|
+
return self._started_at
|
|
384
|
+
|
|
385
|
+
@property
|
|
386
|
+
def ended_at(self) -> Optional[float]:
|
|
387
|
+
"""Timestamp when experiment ended (None if still running)"""
|
|
388
|
+
return self._ended_at
|
|
241
389
|
|
|
242
|
-
|
|
243
|
-
|
|
390
|
+
@property
|
|
391
|
+
def hostname(self) -> Optional[str]:
|
|
392
|
+
"""Hostname where experiment is running"""
|
|
393
|
+
return self._hostname
|
|
394
|
+
|
|
395
|
+
@property
|
|
396
|
+
def services(self) -> Dict[str, "BaseService"]:
|
|
397
|
+
"""Services in this experiment"""
|
|
398
|
+
return self._services
|
|
399
|
+
|
|
400
|
+
@property
|
|
401
|
+
def alt_jobspaths(self):
|
|
402
|
+
"""Return potential other directories"""
|
|
403
|
+
for alt_workdir in self.workspace.alt_workdirs:
|
|
404
|
+
yield alt_workdir / "jobs"
|
|
405
|
+
|
|
406
|
+
@property
|
|
407
|
+
def jobs_jsonl_path(self):
|
|
408
|
+
"""Return the path to the jobs.jsonl file for this experiment"""
|
|
409
|
+
return self.workdir / "jobs.jsonl"
|
|
244
410
|
|
|
245
411
|
def add_job(self, job: "Job"):
|
|
246
412
|
"""Register a job and its tags to jobs.jsonl file and database
|
|
@@ -266,24 +432,76 @@ class experiment:
|
|
|
266
432
|
logging.debug(
|
|
267
433
|
"Job %s already running, unfinished jobs for %s: %d",
|
|
268
434
|
job.identifier[:8],
|
|
269
|
-
self.
|
|
435
|
+
self.name,
|
|
270
436
|
self.unfinishedJobs,
|
|
271
437
|
)
|
|
272
438
|
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
439
|
+
job_info = ExperimentJobInformation(
|
|
440
|
+
job_id=job.identifier,
|
|
441
|
+
task_id=str(job.type.identifier),
|
|
442
|
+
tags=dict(job.tags.items()) if job.tags else {},
|
|
443
|
+
timestamp=time.time(),
|
|
444
|
+
)
|
|
279
445
|
|
|
280
446
|
with self.jobs_jsonl_path.open("a") as f:
|
|
281
|
-
f.write(json.dumps(
|
|
447
|
+
f.write(json.dumps(job_info.to_dict()) + "\n")
|
|
448
|
+
|
|
449
|
+
# Write job submitted event to filesystem (only in NORMAL mode)
|
|
450
|
+
if self._event_writer is not None:
|
|
451
|
+
from .state_status import JobSubmittedEvent
|
|
452
|
+
|
|
453
|
+
# Get dependency job IDs
|
|
454
|
+
depends_on = []
|
|
455
|
+
if hasattr(job, "dependencies"):
|
|
456
|
+
for dep in job.dependencies:
|
|
457
|
+
if hasattr(dep, "identifier"):
|
|
458
|
+
depends_on.append(dep.identifier)
|
|
459
|
+
|
|
460
|
+
job_tags = dict(job.tags.items()) if job.tags else {}
|
|
461
|
+
event = JobSubmittedEvent(
|
|
462
|
+
job_id=job.identifier,
|
|
463
|
+
task_id=str(job.type.identifier),
|
|
464
|
+
transient=job.transient.value if hasattr(job, "transient") else 0,
|
|
465
|
+
tags=job_tags,
|
|
466
|
+
depends_on=depends_on,
|
|
467
|
+
)
|
|
468
|
+
self._event_writer.write_event(event)
|
|
282
469
|
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
470
|
+
# Track tags and dependencies directly in experiment
|
|
471
|
+
if job_tags:
|
|
472
|
+
self._tags[job.identifier] = job_tags
|
|
473
|
+
if depends_on:
|
|
474
|
+
self._dependencies[job.identifier] = depends_on
|
|
475
|
+
|
|
476
|
+
def _finalize_run(self, status: str) -> None:
|
|
477
|
+
"""Finalize the run: write final status.json and archive event files
|
|
478
|
+
|
|
479
|
+
Args:
|
|
480
|
+
status: Final status ("completed" or "failed")
|
|
481
|
+
"""
|
|
482
|
+
from datetime import datetime
|
|
483
|
+
from experimaestro.scheduler.interfaces import ExperimentStatus
|
|
484
|
+
from .state_status import RunCompletedEvent
|
|
485
|
+
|
|
486
|
+
# Update final status in the experiment
|
|
487
|
+
self._ended_at = datetime.now().timestamp()
|
|
488
|
+
if status in ("completed", "done"):
|
|
489
|
+
self._status = ExperimentStatus.DONE
|
|
490
|
+
elif status == "failed":
|
|
491
|
+
self._status = ExperimentStatus.FAILED
|
|
492
|
+
|
|
493
|
+
# Write RunCompletedEvent before closing the event writer
|
|
494
|
+
event = RunCompletedEvent(status=status, ended_at=datetime.now().isoformat())
|
|
495
|
+
self._event_writer.write_event(event)
|
|
496
|
+
|
|
497
|
+
# Close the event writer to flush any buffered events
|
|
498
|
+
self._event_writer.close()
|
|
499
|
+
|
|
500
|
+
# Write final status.json using write_status()
|
|
501
|
+
self.write_status()
|
|
502
|
+
|
|
503
|
+
# Archive event files to permanent storage
|
|
504
|
+
self._event_writer.archive_events()
|
|
287
505
|
|
|
288
506
|
def stop(self):
|
|
289
507
|
"""Stop the experiment as soon as possible"""
|
|
@@ -365,69 +583,165 @@ class experiment:
|
|
|
365
583
|
return self.workspace.connector.createtoken(name, count)
|
|
366
584
|
|
|
367
585
|
def __enter__(self):
|
|
586
|
+
from datetime import datetime
|
|
368
587
|
from .dynamic_outputs import TaskOutputsWorker
|
|
369
|
-
from experimaestro.utils.environment import
|
|
588
|
+
from experimaestro.utils.environment import (
|
|
589
|
+
ExperimentEnvironment,
|
|
590
|
+
ExperimentRunInfo,
|
|
591
|
+
)
|
|
592
|
+
|
|
593
|
+
# Check for old experiment layout and warn
|
|
594
|
+
old_xp_dir = self.workspace.path / "xp"
|
|
595
|
+
if old_xp_dir.exists() and old_xp_dir.is_dir():
|
|
596
|
+
logger.warning(
|
|
597
|
+
"Experimaestro v2 has a modified experiment file layout. "
|
|
598
|
+
"DO NOT use experimaestro v1 to cleanup orphans. "
|
|
599
|
+
"You can use 'experimaestro migrate v1-to-v2 %s' to migrate old experiment "
|
|
600
|
+
"folders to the new structure.",
|
|
601
|
+
self.workspace.path,
|
|
602
|
+
)
|
|
370
603
|
|
|
371
|
-
|
|
604
|
+
# Only lock and save environment in NORMAL mode
|
|
605
|
+
if self.workspace.run_mode == RunMode.NORMAL:
|
|
372
606
|
logger.info("Locking experiment %s", self.xplockpath)
|
|
373
|
-
|
|
607
|
+
lock = self.workspace.connector.lock(self.xplockpath, 0)
|
|
608
|
+
|
|
609
|
+
# Try non-blocking first to check if lock is held
|
|
610
|
+
if not lock.acquire(blocking=False):
|
|
611
|
+
# Lock is held - try to find hostname from latest run's environment.json
|
|
612
|
+
hostname = None
|
|
613
|
+
try:
|
|
614
|
+
# Find the most recent run directory
|
|
615
|
+
run_dirs = sorted(
|
|
616
|
+
[d for d in self._experiment_base.iterdir() if d.is_dir()],
|
|
617
|
+
key=lambda d: d.stat().st_mtime,
|
|
618
|
+
reverse=True,
|
|
619
|
+
)
|
|
620
|
+
if run_dirs:
|
|
621
|
+
env_path = run_dirs[0] / "environment.json"
|
|
622
|
+
if env_path.exists():
|
|
623
|
+
env = ExperimentEnvironment.load(env_path)
|
|
624
|
+
hostname = env.run.hostname if env.run else None
|
|
625
|
+
except Exception:
|
|
626
|
+
pass # Ignore errors when trying to find hostname
|
|
627
|
+
holder_info = f" (held by {hostname})" if hostname else ""
|
|
628
|
+
logger.warning(
|
|
629
|
+
"Experiment is locked%s, waiting for lock to be released...",
|
|
630
|
+
holder_info,
|
|
631
|
+
)
|
|
632
|
+
# Now wait for the lock
|
|
633
|
+
lock.acquire(blocking=True)
|
|
634
|
+
|
|
635
|
+
self.xplock = lock
|
|
374
636
|
logger.info("Experiment locked")
|
|
375
637
|
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
638
|
+
# Generate run_id with collision detection
|
|
639
|
+
now = datetime.now()
|
|
640
|
+
base_run_id = now.strftime("%Y%m%d_%H%M%S")
|
|
641
|
+
run_id = base_run_id
|
|
642
|
+
suffix = 1
|
|
643
|
+
while (self._experiment_base / run_id).exists():
|
|
644
|
+
run_id = f"{base_run_id}.{suffix}"
|
|
645
|
+
suffix += 1
|
|
646
|
+
self.run_id = run_id
|
|
380
647
|
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
self.
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
648
|
+
# Create the run-specific workdir
|
|
649
|
+
self.workdir = self._experiment_base / self.run_id
|
|
650
|
+
self.workdir.mkdir(parents=True, exist_ok=True)
|
|
651
|
+
|
|
652
|
+
# Capture and save environment info
|
|
653
|
+
from experimaestro.utils.git import get_git_info
|
|
654
|
+
from experimaestro.utils.environment import get_current_environment
|
|
655
|
+
|
|
656
|
+
env_info_path = self.workdir / "environment.json"
|
|
657
|
+
env = get_current_environment()
|
|
658
|
+
|
|
659
|
+
# Capture project git info from project paths
|
|
660
|
+
dirty_repos = []
|
|
661
|
+
for project_path in self._project_paths:
|
|
662
|
+
project_git = get_git_info(project_path)
|
|
663
|
+
if project_git:
|
|
664
|
+
env.projects.append(project_git)
|
|
665
|
+
# Track dirty repositories
|
|
666
|
+
if project_git.get("dirty"):
|
|
667
|
+
dirty_repos.append(project_git.get("path", str(project_path)))
|
|
668
|
+
|
|
669
|
+
# Handle dirty git repositories based on configured action
|
|
670
|
+
if dirty_repos and self._dirty_git != DirtyGitAction.IGNORE:
|
|
671
|
+
for repo_path in dirty_repos:
|
|
672
|
+
if self._dirty_git == DirtyGitAction.WARN:
|
|
673
|
+
logger.warning(
|
|
674
|
+
"Project repository has uncommitted changes: %s",
|
|
675
|
+
repo_path,
|
|
676
|
+
)
|
|
677
|
+
elif self._dirty_git == DirtyGitAction.ERROR:
|
|
678
|
+
# Release the lock before raising the error
|
|
679
|
+
raise DirtyGitError(
|
|
680
|
+
f"Project repository has uncommitted changes: {repo_path}"
|
|
681
|
+
)
|
|
682
|
+
|
|
683
|
+
env.save(env_info_path)
|
|
684
|
+
else:
|
|
685
|
+
# Non-NORMAL mode: use placeholder run_id and workdir
|
|
686
|
+
self.run_id = "dry-run"
|
|
687
|
+
self.workdir = self._experiment_base / self.run_id
|
|
688
|
+
self.workdir.mkdir(parents=True, exist_ok=True)
|
|
394
689
|
|
|
395
690
|
# Register experiment with scheduler
|
|
396
691
|
self.scheduler.register_experiment(self)
|
|
397
692
|
|
|
398
|
-
#
|
|
399
|
-
|
|
400
|
-
|
|
693
|
+
# Set experiment start time for BaseExperiment interface
|
|
694
|
+
self._started_at = time.time()
|
|
695
|
+
self._ended_at = None
|
|
401
696
|
|
|
402
697
|
self.workspace.__enter__()
|
|
403
698
|
(self.workspace.path / ".__experimaestro__").touch()
|
|
404
699
|
|
|
405
|
-
# Initialize
|
|
406
|
-
|
|
407
|
-
from .state_provider import WorkspaceStateProvider
|
|
700
|
+
# Initialize filesystem-based state tracking (only in NORMAL mode)
|
|
701
|
+
from .state_status import ExperimentEventWriter
|
|
408
702
|
|
|
409
703
|
is_normal_mode = self.workspace.run_mode == RunMode.NORMAL
|
|
410
|
-
self.
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
704
|
+
self._event_writer = None
|
|
705
|
+
self._state_listener = None
|
|
706
|
+
|
|
707
|
+
# Track job tags and dependencies directly (no more StatusData)
|
|
708
|
+
self._tags: Dict[str, Dict[str, str]] = {}
|
|
709
|
+
self._dependencies: Dict[str, List[str]] = {}
|
|
710
|
+
self._events_count = 0
|
|
711
|
+
self._hostname: Optional[str] = None
|
|
712
|
+
self._started_at: Optional[float] = None
|
|
713
|
+
self._ended_at: Optional[float] = None
|
|
415
714
|
|
|
416
|
-
# Register experiment in database and create a run (only in NORMAL mode)
|
|
417
|
-
experiment_id = self.workdir.name
|
|
418
|
-
self._db_listener = None
|
|
419
715
|
if is_normal_mode:
|
|
420
|
-
|
|
421
|
-
|
|
716
|
+
import socket
|
|
717
|
+
|
|
718
|
+
# Create event writer for this experiment
|
|
719
|
+
# Events are written to experiments/{experiment_id}/events-{count}.jsonl
|
|
720
|
+
# Permanent storage: workdir/events/
|
|
721
|
+
self._event_writer = ExperimentEventWriter(self, self.workspace.path, 0)
|
|
722
|
+
|
|
723
|
+
# Initialize status.json for this run
|
|
724
|
+
self._hostname = socket.gethostname()
|
|
725
|
+
self._started_at = datetime.now().timestamp()
|
|
726
|
+
self._event_writer.init_status()
|
|
727
|
+
|
|
728
|
+
# Create symlink to current run
|
|
729
|
+
self._event_writer.create_symlink()
|
|
730
|
+
|
|
731
|
+
# Add run info to environment.json
|
|
732
|
+
env_path = self.workdir / "environment.json"
|
|
733
|
+
env = ExperimentEnvironment.load(env_path)
|
|
734
|
+
env.run = ExperimentRunInfo(
|
|
735
|
+
hostname=self._hostname,
|
|
736
|
+
started_at=datetime.now().isoformat(),
|
|
737
|
+
)
|
|
738
|
+
env.save(env_path)
|
|
422
739
|
|
|
423
|
-
# Add
|
|
424
|
-
self.
|
|
425
|
-
self.
|
|
740
|
+
# Add state listener to write events to filesystem
|
|
741
|
+
self._state_listener = StateListener(
|
|
742
|
+
self._event_writer, self, self.name, self.run_id
|
|
426
743
|
)
|
|
427
|
-
self.scheduler.addlistener(self.
|
|
428
|
-
else:
|
|
429
|
-
# In non-NORMAL modes, use a placeholder run_id
|
|
430
|
-
self.run_id = None
|
|
744
|
+
self.scheduler.addlistener(self._state_listener)
|
|
431
745
|
|
|
432
746
|
# Number of unfinished jobs
|
|
433
747
|
self.unfinishedJobs = 0
|
|
@@ -452,19 +766,17 @@ class experiment:
|
|
|
452
766
|
|
|
453
767
|
def __exit__(self, exc_type, exc_value, traceback):
|
|
454
768
|
logger.debug("Exiting scheduler context")
|
|
455
|
-
# If no exception and normal run mode, remove old "jobs"
|
|
456
|
-
if self.workspace.run_mode == RunMode.NORMAL:
|
|
457
|
-
if exc_type is None and self.jobsbakpath.is_dir():
|
|
458
|
-
rmtree(self.jobsbakpath)
|
|
459
769
|
|
|
460
770
|
# Close the different locks
|
|
461
771
|
try:
|
|
462
|
-
if exc_type:
|
|
772
|
+
if exc_type is GracefulExperimentExit:
|
|
773
|
+
# Graceful exit - don't wait for jobs, don't log error
|
|
774
|
+
logger.info("Graceful experiment exit - not waiting for running jobs")
|
|
775
|
+
elif exc_type:
|
|
463
776
|
# import faulthandler
|
|
464
777
|
# faulthandler.dump_traceback()
|
|
465
|
-
logger.
|
|
466
|
-
"Not waiting since an exception was thrown"
|
|
467
|
-
" (some jobs may be running)"
|
|
778
|
+
logger.exception(
|
|
779
|
+
"Not waiting since an exception was thrown (some jobs may be running)"
|
|
468
780
|
)
|
|
469
781
|
else:
|
|
470
782
|
self.wait()
|
|
@@ -481,20 +793,41 @@ class experiment:
|
|
|
481
793
|
logger.info("Closing service %s", service.description())
|
|
482
794
|
service.stop()
|
|
483
795
|
|
|
796
|
+
# Set end time for BaseExperiment interface
|
|
797
|
+
self._ended_at = time.time()
|
|
798
|
+
|
|
484
799
|
# Unregister experiment from scheduler
|
|
485
800
|
self.scheduler.unregister_experiment(self)
|
|
486
801
|
|
|
487
|
-
# Remove
|
|
488
|
-
if
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
status = "
|
|
494
|
-
|
|
802
|
+
# Remove state listener and finalize run (only in NORMAL mode)
|
|
803
|
+
if exc_type is GracefulExperimentExit:
|
|
804
|
+
status = "detached" # Graceful exit, jobs may still be running
|
|
805
|
+
elif exc_type:
|
|
806
|
+
status = "failed"
|
|
807
|
+
else:
|
|
808
|
+
status = "completed"
|
|
809
|
+
|
|
810
|
+
if self._state_listener is not None:
|
|
811
|
+
self.scheduler.removelistener(self._state_listener)
|
|
812
|
+
self._finalize_run(status)
|
|
813
|
+
|
|
814
|
+
# Update environment.json with run status
|
|
815
|
+
if self.workspace.run_mode == RunMode.NORMAL and self.workdir:
|
|
816
|
+
from datetime import datetime
|
|
817
|
+
from experimaestro.utils.environment import ExperimentEnvironment
|
|
818
|
+
|
|
819
|
+
env_path = self.workdir / "environment.json"
|
|
820
|
+
if env_path.exists():
|
|
821
|
+
try:
|
|
822
|
+
env = ExperimentEnvironment.load(env_path)
|
|
823
|
+
if env.run:
|
|
824
|
+
env.run.ended_at = datetime.now().isoformat()
|
|
825
|
+
env.run.status = status
|
|
826
|
+
env.save(env_path)
|
|
827
|
+
except Exception as e:
|
|
828
|
+
logger.warning("Failed to update environment.json: %s", e)
|
|
495
829
|
|
|
496
830
|
# Note: Don't stop scheduler - it's shared!
|
|
497
|
-
# Note: Don't stop server - it runs in daemon mode until program exit
|
|
498
831
|
|
|
499
832
|
if self.taskOutputsWorker is not None:
|
|
500
833
|
logger.info("Stopping tasks outputs worker")
|
|
@@ -508,13 +841,39 @@ class experiment:
|
|
|
508
841
|
experiment.CURRENT = self.old_experiment
|
|
509
842
|
|
|
510
843
|
if self.workspace.run_mode == RunMode.NORMAL:
|
|
511
|
-
#
|
|
512
|
-
|
|
513
|
-
|
|
844
|
+
# Remove job directories for transient jobs with REMOVE mode
|
|
845
|
+
if exc_type is None:
|
|
846
|
+
for job in list(self.scheduler.jobs.values()):
|
|
847
|
+
if (
|
|
848
|
+
self in job.experiments
|
|
849
|
+
and job.transient.should_remove
|
|
850
|
+
and job.state.finished()
|
|
851
|
+
):
|
|
852
|
+
job_path = job.path
|
|
853
|
+
if job_path.exists():
|
|
854
|
+
logger.info(
|
|
855
|
+
"Removing transient job directory: %s", job_path
|
|
856
|
+
)
|
|
857
|
+
rmtree(job_path)
|
|
858
|
+
# Also remove the symlink in the experiment's jobs folder
|
|
859
|
+
symlink_path = self.jobspath / job.relpath
|
|
860
|
+
if symlink_path.is_symlink():
|
|
861
|
+
symlink_path.unlink()
|
|
862
|
+
|
|
863
|
+
# Cleanup old runs based on history settings
|
|
864
|
+
try:
|
|
865
|
+
cleanup_experiment_history(
|
|
866
|
+
self._experiment_base,
|
|
867
|
+
current_run_id=self.run_id,
|
|
868
|
+
current_status=status,
|
|
869
|
+
history=self._get_history_settings(),
|
|
870
|
+
)
|
|
871
|
+
except Exception as e:
|
|
872
|
+
logger.warning("Failed to cleanup old runs: %s", e)
|
|
514
873
|
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
874
|
+
# Suppress GracefulExperimentExit exception
|
|
875
|
+
if exc_type is GracefulExperimentExit:
|
|
876
|
+
return True
|
|
518
877
|
|
|
519
878
|
async def update_task_output_count(self, delta: int):
|
|
520
879
|
"""Change in the number of task outputs to process"""
|
|
@@ -555,27 +914,43 @@ class experiment:
|
|
|
555
914
|
id(service),
|
|
556
915
|
)
|
|
557
916
|
|
|
558
|
-
self.
|
|
917
|
+
self._services[service.id] = service
|
|
559
918
|
|
|
560
919
|
# Allow service to access experiment context
|
|
561
920
|
service.set_experiment(self)
|
|
562
921
|
|
|
563
|
-
# Register
|
|
564
|
-
|
|
922
|
+
# Register state listener for state changes (writes events)
|
|
923
|
+
if self._state_listener is not None:
|
|
924
|
+
service.add_listener(self._state_listener)
|
|
565
925
|
|
|
566
|
-
# Register
|
|
926
|
+
# Register listener for state changes
|
|
567
927
|
service.add_listener(self)
|
|
568
928
|
|
|
569
|
-
self.scheduler.notify_service_add(service)
|
|
570
|
-
|
|
571
|
-
# Write services.json file
|
|
572
|
-
self._write_services_json()
|
|
929
|
+
self.scheduler.notify_service_add(service, self.name, self.run_id or "")
|
|
573
930
|
|
|
574
931
|
return service
|
|
575
932
|
|
|
576
933
|
def service_state_changed(self, service):
    """React to a change of state of one of the experiment's services.

    Logs the new state at debug level and, when a scheduler is attached,
    forwards a ``ServiceStateChangedEvent`` to the scheduler's state
    listeners.

    :param service: The service whose state changed; its ``id`` and
        ``state`` attributes are read.
    """
    # Fall back to "UNKNOWN" when the state object exposes no ``name``
    new_state = getattr(service.state, "name", "UNKNOWN")
    logger.debug(
        "Service %s state changed to %s (experiment=%s)",
        service.id,
        new_state,
        self.name,
    )

    # Imported locally, as in the rest of this class
    from experimaestro.scheduler.state_status import ServiceStateChangedEvent

    if self.scheduler is None:
        return

    # An empty string stands in for a missing run id
    self.scheduler._notify_state_listeners_async(
        ServiceStateChangedEvent(
            experiment_id=self.name,
            run_id=self.run_id or "",
            service_id=service.id,
            state=new_state,
        )
    )
|
|
579
954
|
|
|
580
955
|
def save(self, obj: Any, name: str = "default"):
|
|
581
956
|
"""Serializes configurations.
|
|
@@ -594,19 +969,209 @@ class experiment:
|
|
|
594
969
|
|
|
595
970
|
save(obj, save_dir)
|
|
596
971
|
|
|
597
|
-
def load(self, reference: str, name: str = "default"):
|
|
598
|
-
"""
|
|
599
|
-
|
|
600
|
-
Loads configuration objects from an experimental directory
|
|
972
|
+
def load(self, reference: str, name: str = "default", run_id: str = None):
    """Load configuration objects saved by a run of an experiment.

    :param reference: The name of the experiment
    :param name: The name of the saving directory (default to `default`)
    :param run_id: The run ID to load from; when omitted, the run directory
        with the most recent modification time is used
    :raises FileNotFoundError: if no run directory exists for the experiment
    """
    from experimaestro import load

    exp_base = self.workspace.experimentspath / reference

    if run_id is not None:
        run_dir = exp_base / run_id
    else:
        # Pick the most recently modified sub-directory as the latest run
        candidates = [entry for entry in exp_base.iterdir() if entry.is_dir()]
        candidates.sort(key=lambda entry: entry.stat().st_mtime, reverse=True)
        if not candidates:
            raise FileNotFoundError(f"No runs found for experiment {reference}")
        run_dir = candidates[0]

    return load(run_dir / "data" / name)
|
|
609
997
|
|
|
998
|
+
def _get_history_settings(self) -> HistorySettings:
    """Return the history settings that apply to this experiment.

    The workspace settings take precedence when they exist and define a
    (truthy) ``history`` entry; otherwise the ``history`` of the settings
    returned by ``get_settings()`` is used.
    """
    workspace_settings = self.workspace.settings
    if not workspace_settings or not workspace_settings.history:
        # No workspace-level override: use the global settings
        return get_settings().history

    return workspace_settings.history
|
|
1011
|
+
|
|
1012
|
+
|
|
1013
|
+
def get_run_status(run_dir: Path) -> Optional[str]:
    """Determine the status of a run from the files in its directory.

    ``environment.json`` is consulted first: when its run information holds
    a non-empty status, that value is returned as stored. Otherwise
    ``status.json`` is used: an experiment status of ``"done"`` maps to
    ``"completed"`` and ``"failed"`` to ``"failed"``; for any other value
    the run is ``"failed"`` if at least one job is in the ``"error"`` state
    and ``"completed"`` otherwise.

    Args:
        run_dir: Path to the run directory

    Returns:
        The status string, or None when neither file yields one (missing
        files or any error while reading them).
    """
    # environment.json is tried first (it is written on exit)
    environment_file = run_dir / "environment.json"
    if environment_file.exists():
        try:
            from experimaestro.utils.environment import ExperimentEnvironment

            environment = ExperimentEnvironment.load(environment_file)
            if environment.run and environment.run.status:
                return environment.run.status
        except Exception:
            # Best effort: fall through to status.json
            pass

    status_file = run_dir / "status.json"
    if not status_file.exists():
        return None

    try:
        with status_file.open() as stream:
            payload = json.load(stream)

        # Explicit experiment status takes precedence
        recorded = payload.get("status")
        if recorded == "done":
            return "completed"
        if recorded == "failed":
            return "failed"

        # Otherwise infer the outcome from the job states
        for job in payload.get("jobs", {}).values():
            if job.get("state") == "error":
                return "failed"
        return "completed"
    except Exception:
        # Unreadable or malformed status file: status is unknown
        return None
|
|
1056
|
+
|
|
1057
|
+
|
|
1058
|
+
def cleanup_experiment_history(
    experiment_base: Path,
    *,
    current_run_id: Optional[str] = None,
    current_status: Optional[str] = None,
    history: Optional[HistorySettings] = None,
) -> list[Path]:
    """Clean up old experiment runs based on history settings.

    This function can be called from the CLI or other contexts.

    The sub-directories of ``experiment_base`` (except the current run and
    symbolic links) are classified with ``get_run_status``. Runs whose
    status is neither ``"completed"`` nor ``"failed"`` are never touched.
    The following rules are then applied in order:

    1. if ``current_status`` is ``"completed"``, every failed run is removed;
    2. failed runs older than the newest completed run are removed;
    3. only the ``history.max_done`` newest completed runs are kept;
    4. only the ``history.max_failed`` newest failed runs are kept.

    A directory that cannot be removed is logged as a warning and skipped.

    Args:
        experiment_base: Path to the experiment directory (containing run subdirs)
        current_run_id: ID of the current run to exclude from cleanup (optional)
        current_status: Status of the current run ('completed' or 'failed'), used
            to determine if failed runs should be removed (optional)
        history: History settings to use (defaults to the ``history`` of the
            settings returned by ``get_settings()``)

    Returns:
        List of paths that were removed
    """
    if history is None:
        history = get_settings().history

    removed_paths: list[Path] = []

    def run_sort_key(d: Path) -> tuple[str, int]:
        """Parse run_id for sorting, handling modifiers like 20250501_102315.1"""
        name = d.name
        base, separator, modifier = name.partition(".")
        if separator:
            try:
                return (base, int(modifier))
            except ValueError:
                return (name, 0)
        return (name, 0)

    def remove_run(run_dir: Path, message: str, *args) -> None:
        """Log ``message`` and delete ``run_dir``, recording it on success."""
        logger.info(message, *args, run_dir)
        try:
            rmtree(run_dir)
        except Exception as e:
            logger.warning("Failed to remove run directory %s: %s", run_dir, e)
        else:
            removed_paths.append(run_dir)

    # List all run directories (excluding the current one), oldest first.
    # Directory names are in format YYYYMMDD_HHMMSS or YYYYMMDD_HHMMSS.N
    # (with modifier). Symbolic links are skipped: they are aliases rather
    # than runs of their own, would be counted against the limits, and
    # cannot be removed with rmtree anyway.
    run_dirs = sorted(
        (
            d
            for d in experiment_base.iterdir()
            if d.is_dir() and not d.is_symlink() and d.name != current_run_id
        ),
        key=run_sort_key,
    )

    # Categorize runs by status; runs with unknown status are not touched
    completed_runs = []
    failed_runs = []
    for run_dir in run_dirs:
        status = get_run_status(run_dir)
        if status == "completed":
            completed_runs.append(run_dir)
        elif status == "failed":
            failed_runs.append(run_dir)

    # If the current run succeeded, remove all past failed runs
    # Per user requirement: "If an experiment succeed, it remove the past failed"
    if current_status == "completed":
        for run_dir in failed_runs:
            remove_run(run_dir, "Removing failed run (experiment succeeded): %s")
        failed_runs = []

    # Remove failed runs that are older than the newest completed run
    # (a later success makes the earlier failure stale)
    if completed_runs:
        newest_completed = run_sort_key(completed_runs[-1])
        remaining_failed = []
        for run_dir in failed_runs:
            if run_sort_key(run_dir) < newest_completed:
                remove_run(run_dir, "Removing failed run (success exists after): %s")
            else:
                remaining_failed.append(run_dir)
        failed_runs = remaining_failed

    # Keep only max_done completed runs (remove oldest ones)
    while len(completed_runs) > history.max_done:
        remove_run(
            completed_runs.pop(0),
            "Removing old completed run (keeping %d): %s",
            history.max_done,
        )

    # Keep only max_failed failed runs (remove oldest ones)
    while len(failed_runs) > history.max_failed:
        remove_run(
            failed_runs.pop(0),
            "Removing old failed run (keeping %d): %s",
            history.max_failed,
        )

    return removed_paths
|
|
1174
|
+
|
|
610
1175
|
|
|
611
1176
|
# re-export at the module level
|
|
612
1177
|
current = experiment.current
|