experimaestro 2.0.0b4__py3-none-any.whl → 2.0.0b17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of experimaestro might be problematic. Click here for more details.
- experimaestro/__init__.py +12 -5
- experimaestro/cli/__init__.py +393 -134
- experimaestro/cli/filter.py +48 -23
- experimaestro/cli/jobs.py +253 -71
- experimaestro/cli/refactor.py +1 -2
- experimaestro/commandline.py +7 -4
- experimaestro/connectors/__init__.py +9 -1
- experimaestro/connectors/local.py +43 -3
- experimaestro/core/arguments.py +18 -18
- experimaestro/core/identifier.py +11 -11
- experimaestro/core/objects/config.py +96 -39
- experimaestro/core/objects/config_walk.py +3 -3
- experimaestro/core/{subparameters.py → partial.py} +16 -16
- experimaestro/core/partial_lock.py +394 -0
- experimaestro/core/types.py +12 -15
- experimaestro/dynamic.py +290 -0
- experimaestro/experiments/__init__.py +6 -2
- experimaestro/experiments/cli.py +223 -52
- experimaestro/experiments/configuration.py +24 -0
- experimaestro/generators.py +5 -5
- experimaestro/ipc.py +118 -1
- experimaestro/launcherfinder/__init__.py +2 -2
- experimaestro/launcherfinder/registry.py +6 -7
- experimaestro/launcherfinder/specs.py +2 -9
- experimaestro/launchers/slurm/__init__.py +2 -2
- experimaestro/launchers/slurm/base.py +62 -0
- experimaestro/locking.py +957 -1
- experimaestro/notifications.py +89 -201
- experimaestro/progress.py +63 -366
- experimaestro/rpyc.py +0 -2
- experimaestro/run.py +29 -2
- experimaestro/scheduler/__init__.py +8 -1
- experimaestro/scheduler/base.py +650 -53
- experimaestro/scheduler/dependencies.py +20 -16
- experimaestro/scheduler/experiment.py +764 -169
- experimaestro/scheduler/interfaces.py +338 -96
- experimaestro/scheduler/jobs.py +58 -20
- experimaestro/scheduler/remote/__init__.py +31 -0
- experimaestro/scheduler/remote/adaptive_sync.py +265 -0
- experimaestro/scheduler/remote/client.py +928 -0
- experimaestro/scheduler/remote/protocol.py +282 -0
- experimaestro/scheduler/remote/server.py +447 -0
- experimaestro/scheduler/remote/sync.py +144 -0
- experimaestro/scheduler/services.py +186 -35
- experimaestro/scheduler/state_provider.py +811 -2157
- experimaestro/scheduler/state_status.py +1247 -0
- experimaestro/scheduler/transient.py +31 -0
- experimaestro/scheduler/workspace.py +1 -1
- experimaestro/scheduler/workspace_state_provider.py +1273 -0
- experimaestro/scriptbuilder.py +4 -4
- experimaestro/settings.py +36 -0
- experimaestro/tests/conftest.py +33 -5
- experimaestro/tests/connectors/bin/executable.py +1 -1
- experimaestro/tests/fixtures/pre_experiment/experiment_check_env.py +16 -0
- experimaestro/tests/fixtures/pre_experiment/experiment_check_mock.py +14 -0
- experimaestro/tests/fixtures/pre_experiment/experiment_simple.py +12 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_env.py +5 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_error.py +3 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_mock.py +8 -0
- experimaestro/tests/launchers/bin/test.py +1 -0
- experimaestro/tests/launchers/test_slurm.py +9 -9
- experimaestro/tests/partial_reschedule.py +46 -0
- experimaestro/tests/restart.py +3 -3
- experimaestro/tests/restart_main.py +1 -0
- experimaestro/tests/scripts/notifyandwait.py +1 -0
- experimaestro/tests/task_partial.py +38 -0
- experimaestro/tests/task_tokens.py +2 -2
- experimaestro/tests/tasks/test_dynamic.py +6 -6
- experimaestro/tests/test_dependencies.py +3 -3
- experimaestro/tests/test_deprecated.py +15 -15
- experimaestro/tests/test_dynamic_locking.py +317 -0
- experimaestro/tests/test_environment.py +24 -14
- experimaestro/tests/test_experiment.py +171 -36
- experimaestro/tests/test_identifier.py +25 -25
- experimaestro/tests/test_identifier_stability.py +3 -5
- experimaestro/tests/test_multitoken.py +2 -4
- experimaestro/tests/{test_subparameters.py → test_partial.py} +25 -25
- experimaestro/tests/test_partial_paths.py +81 -138
- experimaestro/tests/test_pre_experiment.py +219 -0
- experimaestro/tests/test_progress.py +2 -8
- experimaestro/tests/test_remote_state.py +1132 -0
- experimaestro/tests/test_stray_jobs.py +261 -0
- experimaestro/tests/test_tasks.py +1 -2
- experimaestro/tests/test_token_locking.py +52 -67
- experimaestro/tests/test_tokens.py +5 -6
- experimaestro/tests/test_transient.py +225 -0
- experimaestro/tests/test_workspace_state_provider.py +768 -0
- experimaestro/tests/token_reschedule.py +1 -3
- experimaestro/tests/utils.py +2 -7
- experimaestro/tokens.py +227 -372
- experimaestro/tools/diff.py +1 -0
- experimaestro/tools/documentation.py +4 -5
- experimaestro/tools/jobs.py +1 -2
- experimaestro/tui/app.py +459 -1895
- experimaestro/tui/app.tcss +162 -0
- experimaestro/tui/dialogs.py +172 -0
- experimaestro/tui/log_viewer.py +253 -3
- experimaestro/tui/messages.py +137 -0
- experimaestro/tui/utils.py +54 -0
- experimaestro/tui/widgets/__init__.py +23 -0
- experimaestro/tui/widgets/experiments.py +468 -0
- experimaestro/tui/widgets/global_services.py +238 -0
- experimaestro/tui/widgets/jobs.py +972 -0
- experimaestro/tui/widgets/log.py +156 -0
- experimaestro/tui/widgets/orphans.py +363 -0
- experimaestro/tui/widgets/runs.py +185 -0
- experimaestro/tui/widgets/services.py +314 -0
- experimaestro/tui/widgets/stray_jobs.py +528 -0
- experimaestro/utils/__init__.py +1 -1
- experimaestro/utils/environment.py +105 -22
- experimaestro/utils/fswatcher.py +124 -0
- experimaestro/utils/jobs.py +1 -2
- experimaestro/utils/jupyter.py +1 -2
- experimaestro/utils/logging.py +72 -0
- experimaestro/version.py +2 -2
- experimaestro/webui/__init__.py +9 -0
- experimaestro/webui/app.py +117 -0
- experimaestro/{server → webui}/data/index.css +66 -11
- experimaestro/webui/data/index.css.map +1 -0
- experimaestro/{server → webui}/data/index.js +82763 -87217
- experimaestro/webui/data/index.js.map +1 -0
- experimaestro/webui/routes/__init__.py +5 -0
- experimaestro/webui/routes/auth.py +53 -0
- experimaestro/webui/routes/proxy.py +117 -0
- experimaestro/webui/server.py +200 -0
- experimaestro/webui/state_bridge.py +152 -0
- experimaestro/webui/websocket.py +413 -0
- {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/METADATA +8 -9
- experimaestro-2.0.0b17.dist-info/RECORD +219 -0
- experimaestro/cli/progress.py +0 -269
- experimaestro/scheduler/state.py +0 -75
- experimaestro/scheduler/state_db.py +0 -388
- experimaestro/scheduler/state_sync.py +0 -834
- experimaestro/server/__init__.py +0 -467
- experimaestro/server/data/index.css.map +0 -1
- experimaestro/server/data/index.js.map +0 -1
- experimaestro/tests/test_cli_jobs.py +0 -615
- experimaestro/tests/test_file_progress.py +0 -425
- experimaestro/tests/test_file_progress_integration.py +0 -477
- experimaestro/tests/test_state_db.py +0 -434
- experimaestro-2.0.0b4.dist-info/RECORD +0 -181
- /experimaestro/{server → webui}/data/1815e00441357e01619e.ttf +0 -0
- /experimaestro/{server → webui}/data/2463b90d9a316e4e5294.woff2 +0 -0
- /experimaestro/{server → webui}/data/2582b0e4bcf85eceead0.ttf +0 -0
- /experimaestro/{server → webui}/data/89999bdf5d835c012025.woff2 +0 -0
- /experimaestro/{server → webui}/data/914997e1bdfc990d0897.ttf +0 -0
- /experimaestro/{server → webui}/data/c210719e60948b211a12.woff2 +0 -0
- /experimaestro/{server → webui}/data/favicon.ico +0 -0
- /experimaestro/{server → webui}/data/index.html +0 -0
- /experimaestro/{server → webui}/data/login.html +0 -0
- /experimaestro/{server → webui}/data/manifest.json +0 -0
- {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/WHEEL +0 -0
- {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/entry_points.txt +0 -0
- {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
import asyncio
|
|
2
|
+
import inspect
|
|
2
3
|
import json
|
|
3
4
|
import logging
|
|
4
5
|
import os
|
|
5
6
|
from pathlib import Path
|
|
6
7
|
import time
|
|
7
8
|
from shutil import rmtree
|
|
8
|
-
from typing import Any, Dict, Optional, TypeVar, Union
|
|
9
|
+
from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeVar, Union
|
|
9
10
|
|
|
10
11
|
from experimaestro.core.objects import WatchedOutput
|
|
11
12
|
from experimaestro.exceptions import HandledException
|
|
@@ -14,9 +15,19 @@ from experimaestro.scheduler.signal_handler import SIGNAL_HANDLER
|
|
|
14
15
|
from experimaestro.scheduler.jobs import Job
|
|
15
16
|
from experimaestro.scheduler.services import Service
|
|
16
17
|
from experimaestro.scheduler.workspace import RunMode, Workspace
|
|
17
|
-
from experimaestro.
|
|
18
|
+
from experimaestro.scheduler.interfaces import (
|
|
19
|
+
BaseExperiment,
|
|
20
|
+
BaseService,
|
|
21
|
+
ExperimentJobInformation,
|
|
22
|
+
)
|
|
23
|
+
from experimaestro.settings import WorkspaceSettings, get_settings, HistorySettings
|
|
24
|
+
from experimaestro.experiments.configuration import DirtyGitAction
|
|
18
25
|
from experimaestro.utils import logger
|
|
19
26
|
|
|
27
|
+
if TYPE_CHECKING:
|
|
28
|
+
from experimaestro.scheduler.interfaces import ExperimentStatus
|
|
29
|
+
from experimaestro.scheduler.state_status import ExperimentEventWriter
|
|
30
|
+
|
|
20
31
|
ServiceClass = TypeVar("ServiceClass", bound=Service)
|
|
21
32
|
|
|
22
33
|
|
|
@@ -26,11 +37,46 @@ class FailedExperiment(HandledException):
|
|
|
26
37
|
pass
|
|
27
38
|
|
|
28
39
|
|
|
29
|
-
class
|
|
30
|
-
"""
|
|
40
|
+
class DirtyGitError(HandledException):
|
|
41
|
+
"""Raised when the git repository has uncommitted changes and dirty_git=error"""
|
|
42
|
+
|
|
43
|
+
pass
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class GracefulExperimentExit(Exception):
|
|
47
|
+
"""Raised to exit an experiment context without waiting for running jobs.
|
|
48
|
+
|
|
49
|
+
This is useful in tests or when you want to detach from an experiment
|
|
50
|
+
while keeping jobs running (e.g., to test stray job detection).
|
|
51
|
+
|
|
52
|
+
Example::
|
|
53
|
+
|
|
54
|
+
with experiment(workdir, "my-experiment") as xp:
|
|
55
|
+
task = MyTask.C(value=1).submit()
|
|
56
|
+
# Wait for task to start...
|
|
57
|
+
raise GracefulExperimentExit() # Exit without waiting for task to finish
|
|
58
|
+
"""
|
|
59
|
+
|
|
60
|
+
pass
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class StateListener:
|
|
64
|
+
"""Listener that writes events to filesystem
|
|
65
|
+
|
|
66
|
+
Job state events are written to per-job event files by the scheduler.
|
|
67
|
+
This listener writes experiment-level events (job state, services) to
|
|
68
|
+
the experiment event file.
|
|
69
|
+
"""
|
|
31
70
|
|
|
32
|
-
def __init__(
|
|
33
|
-
self
|
|
71
|
+
def __init__(
|
|
72
|
+
self,
|
|
73
|
+
event_writer: "ExperimentEventWriter",
|
|
74
|
+
experiment: "experiment",
|
|
75
|
+
experiment_id: str,
|
|
76
|
+
run_id: str,
|
|
77
|
+
):
|
|
78
|
+
self.event_writer = event_writer
|
|
79
|
+
self.experiment = experiment
|
|
34
80
|
self.experiment_id = experiment_id
|
|
35
81
|
self.run_id = run_id
|
|
36
82
|
|
|
@@ -39,38 +85,65 @@ class DatabaseListener:
|
|
|
39
85
|
pass
|
|
40
86
|
|
|
41
87
|
def job_state(self, job):
|
|
42
|
-
"""
|
|
43
|
-
|
|
88
|
+
"""Write job state change event to experiment event file"""
|
|
89
|
+
from .state_status import JobStateChangedEvent
|
|
90
|
+
|
|
91
|
+
# Get failure reason if error state
|
|
92
|
+
failure_reason = None
|
|
93
|
+
if hasattr(job.state, "failure_reason") and job.state.failure_reason:
|
|
94
|
+
failure_reason = job.state.failure_reason.name
|
|
95
|
+
|
|
96
|
+
# Get progress as list of dicts
|
|
97
|
+
progress = []
|
|
98
|
+
if hasattr(job, "_progress") and job._progress:
|
|
99
|
+
progress = [
|
|
100
|
+
{"level": p.level, "progress": p.progress, "desc": p.desc}
|
|
101
|
+
for p in job._progress
|
|
102
|
+
]
|
|
103
|
+
|
|
104
|
+
event = JobStateChangedEvent(
|
|
105
|
+
job_id=job.identifier,
|
|
106
|
+
state=job.state.name,
|
|
107
|
+
failure_reason=failure_reason,
|
|
108
|
+
submitted_time=job.submittime,
|
|
109
|
+
started_time=job.starttime,
|
|
110
|
+
ended_time=job.endtime,
|
|
111
|
+
exit_code=getattr(job, "exit_code", None),
|
|
112
|
+
retry_count=getattr(job, "retry_count", 0),
|
|
113
|
+
progress=progress,
|
|
114
|
+
)
|
|
115
|
+
# Write to experiment event file
|
|
116
|
+
self.event_writer.write_event(event)
|
|
44
117
|
|
|
45
118
|
def service_add(self, service):
|
|
46
|
-
"""
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
119
|
+
"""Write service added event to filesystem"""
|
|
120
|
+
from experimaestro.scheduler.services import Service
|
|
121
|
+
from .state_status import ServiceAddedEvent
|
|
122
|
+
|
|
123
|
+
state_dict = Service.serialize_state_dict(service.state_dict())
|
|
124
|
+
service_class = f"{service.__class__.__module__}.{service.__class__.__name__}"
|
|
125
|
+
event = ServiceAddedEvent(
|
|
126
|
+
service_id=service.id,
|
|
127
|
+
description=service.description(),
|
|
128
|
+
service_class=service_class,
|
|
129
|
+
state_dict=state_dict,
|
|
54
130
|
)
|
|
131
|
+
self.event_writer.write_event(event)
|
|
55
132
|
|
|
56
133
|
def service_state_changed(self, service):
|
|
57
|
-
"""
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
self.experiment_id,
|
|
61
|
-
self.run_id,
|
|
62
|
-
service.description(),
|
|
63
|
-
service.state.name,
|
|
64
|
-
state_dict=json.dumps(service.state_dict()),
|
|
65
|
-
)
|
|
134
|
+
"""Called when service state changes (runtime only, not persisted)"""
|
|
135
|
+
# Service state is managed at runtime, not persisted
|
|
136
|
+
pass
|
|
66
137
|
|
|
67
138
|
|
|
68
|
-
class experiment:
|
|
139
|
+
class experiment(BaseExperiment):
|
|
69
140
|
"""Context manager for running experiments.
|
|
70
141
|
|
|
71
142
|
Creates a workspace, manages task submission, and optionally starts
|
|
72
143
|
a web server for monitoring.
|
|
73
144
|
|
|
145
|
+
Implements BaseExperiment interface for use with StateProvider and TUI.
|
|
146
|
+
|
|
74
147
|
Example::
|
|
75
148
|
|
|
76
149
|
from experimaestro import experiment
|
|
@@ -103,6 +176,10 @@ class experiment:
|
|
|
103
176
|
run_mode: Optional[RunMode] = None,
|
|
104
177
|
launcher=None,
|
|
105
178
|
register_signals: bool = True,
|
|
179
|
+
project_paths: Optional[list[Path]] = None,
|
|
180
|
+
wait_for_quit: bool = False,
|
|
181
|
+
dirty_git: DirtyGitAction = DirtyGitAction.WARN,
|
|
182
|
+
no_db: bool = False,
|
|
106
183
|
):
|
|
107
184
|
"""
|
|
108
185
|
:param env: an environment -- or a working directory for a local
|
|
@@ -122,10 +199,60 @@ class experiment:
|
|
|
122
199
|
|
|
123
200
|
:param register_signals: Whether to register signal handlers (default: True).
|
|
124
201
|
Set to False when running in a background thread.
|
|
202
|
+
|
|
203
|
+
:param project_paths: Paths to the project files (for git info). If not
|
|
204
|
+
provided, will be inferred from the caller's location.
|
|
205
|
+
|
|
206
|
+
:param wait_for_quit: Deprecated, no longer used. Web server is no longer
|
|
207
|
+
started automatically.
|
|
208
|
+
|
|
209
|
+
:param dirty_git: Action when git repository has uncommitted changes:
|
|
210
|
+
DirtyGitAction.IGNORE (don't check), DirtyGitAction.WARN (log warning,
|
|
211
|
+
default), or DirtyGitAction.ERROR (raise exception).
|
|
212
|
+
|
|
213
|
+
:param no_db: Deprecated, kept for backwards compatibility. This parameter
|
|
214
|
+
is now a no-op as the database has been replaced with filesystem-based
|
|
215
|
+
state tracking.
|
|
216
|
+
|
|
217
|
+
.. deprecated::
|
|
218
|
+
The ``host``, ``port``, ``token``, and ``wait_for_quit`` parameters are
|
|
219
|
+
deprecated. Use ``--web`` flag with ``run-experiment`` CLI or start the
|
|
220
|
+
web server separately.
|
|
125
221
|
"""
|
|
222
|
+
import warnings
|
|
126
223
|
|
|
127
224
|
from experimaestro.scheduler import Listener, Scheduler
|
|
128
225
|
|
|
226
|
+
# Warn about deprecated server parameters
|
|
227
|
+
if host is not None:
|
|
228
|
+
warnings.warn(
|
|
229
|
+
"The 'host' parameter is deprecated. Use '--web' flag with "
|
|
230
|
+
"'run-experiment' CLI or start the web server separately.",
|
|
231
|
+
DeprecationWarning,
|
|
232
|
+
stacklevel=2,
|
|
233
|
+
)
|
|
234
|
+
if port is not None:
|
|
235
|
+
warnings.warn(
|
|
236
|
+
"The 'port' parameter is deprecated. Use '--web' flag with "
|
|
237
|
+
"'run-experiment' CLI or start the web server separately.",
|
|
238
|
+
DeprecationWarning,
|
|
239
|
+
stacklevel=2,
|
|
240
|
+
)
|
|
241
|
+
if token is not None:
|
|
242
|
+
warnings.warn(
|
|
243
|
+
"The 'token' parameter is deprecated. Use '--web' flag with "
|
|
244
|
+
"'run-experiment' CLI or start the web server separately.",
|
|
245
|
+
DeprecationWarning,
|
|
246
|
+
stacklevel=2,
|
|
247
|
+
)
|
|
248
|
+
if wait_for_quit:
|
|
249
|
+
warnings.warn(
|
|
250
|
+
"The 'wait_for_quit' parameter is deprecated. Use '--web' flag with "
|
|
251
|
+
"'run-experiment' CLI or start the web server separately.",
|
|
252
|
+
DeprecationWarning,
|
|
253
|
+
stacklevel=2,
|
|
254
|
+
)
|
|
255
|
+
|
|
129
256
|
settings = get_settings()
|
|
130
257
|
if not isinstance(env, WorkspaceSettings):
|
|
131
258
|
env = WorkspaceSettings(id=None, path=Path(env))
|
|
@@ -134,36 +261,45 @@ class experiment:
|
|
|
134
261
|
run_mode = run_mode or RunMode.NORMAL
|
|
135
262
|
self.workspace = Workspace(settings, env, launcher=launcher, run_mode=run_mode)
|
|
136
263
|
|
|
137
|
-
#
|
|
138
|
-
self.
|
|
139
|
-
|
|
140
|
-
|
|
264
|
+
# Store experiment name for ID references
|
|
265
|
+
self.name = name
|
|
266
|
+
|
|
267
|
+
# Create experiment base directory (run directories will be created inside)
|
|
268
|
+
self._experiment_base = self.workspace.experimentspath / name
|
|
269
|
+
self._experiment_base.mkdir(parents=True, exist_ok=True)
|
|
270
|
+
|
|
271
|
+
# Lock is at experiment level (prevents concurrent runs of same experiment)
|
|
272
|
+
self.xplockpath = self._experiment_base / "lock"
|
|
273
|
+
|
|
274
|
+
# workdir will be set in __enter__ after run_id is generated
|
|
275
|
+
self.workdir = None
|
|
141
276
|
self.xplock = None
|
|
142
277
|
self.old_experiment = None
|
|
143
|
-
self.
|
|
278
|
+
self._services: Dict[str, Service] = {}
|
|
144
279
|
self._job_listener: Optional[Listener] = None
|
|
145
280
|
self._register_signals = register_signals
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
281
|
+
self._dirty_git = dirty_git
|
|
282
|
+
self._no_db = no_db
|
|
283
|
+
|
|
284
|
+
# Capture project paths for git info
|
|
285
|
+
if project_paths is not None:
|
|
286
|
+
self._project_paths = project_paths
|
|
287
|
+
else:
|
|
288
|
+
# Fall back to caller's file path
|
|
289
|
+
self._project_paths = []
|
|
290
|
+
try:
|
|
291
|
+
# Go up the stack to find the first frame outside this module
|
|
292
|
+
for frame_info in inspect.stack():
|
|
293
|
+
frame_file = frame_info.filename
|
|
294
|
+
if "experimaestro" not in frame_file:
|
|
295
|
+
self._project_paths = [Path(frame_file).resolve().parent]
|
|
296
|
+
break
|
|
297
|
+
except Exception:
|
|
298
|
+
pass
|
|
157
299
|
|
|
158
300
|
# Use singleton scheduler
|
|
159
301
|
self.scheduler = Scheduler.instance()
|
|
160
302
|
|
|
161
|
-
# Determine if we need a server
|
|
162
|
-
self._needs_server = (
|
|
163
|
-
settings.server.port is not None and settings.server.port >= 0
|
|
164
|
-
) and self.workspace.run_mode == RunMode.NORMAL
|
|
165
|
-
self._server_settings = settings.server if self._needs_server else None
|
|
166
|
-
|
|
167
303
|
if os.environ.get("XPM_ENABLEFAULTHANDLER", "0") == "1":
|
|
168
304
|
import faulthandler
|
|
169
305
|
|
|
@@ -201,48 +337,77 @@ class experiment:
|
|
|
201
337
|
"""Return the directory in which results can be stored for this experiment"""
|
|
202
338
|
return self.workdir / "jobs"
|
|
203
339
|
|
|
340
|
+
# =========================================================================
|
|
341
|
+
# BaseExperiment interface properties
|
|
342
|
+
# =========================================================================
|
|
343
|
+
|
|
344
|
+
@property
|
|
345
|
+
def experiment_id(self) -> str:
|
|
346
|
+
"""Experiment identifier (overrides BaseExperiment.experiment_id)"""
|
|
347
|
+
return self.name
|
|
348
|
+
|
|
349
|
+
@property
|
|
350
|
+
def status(self) -> "ExperimentStatus":
|
|
351
|
+
"""Experiment status - RUNNING for live experiments, updated on finalization"""
|
|
352
|
+
from experimaestro.scheduler.interfaces import ExperimentStatus
|
|
353
|
+
|
|
354
|
+
return getattr(self, "_status", ExperimentStatus.RUNNING)
|
|
355
|
+
|
|
356
|
+
@property
|
|
357
|
+
def jobs(self) -> Dict[str, "Job"]:
|
|
358
|
+
"""Jobs in this experiment"""
|
|
359
|
+
return {
|
|
360
|
+
job.identifier: job
|
|
361
|
+
for job in self.scheduler.jobs.values()
|
|
362
|
+
if self in job.experiments
|
|
363
|
+
}
|
|
364
|
+
|
|
365
|
+
@property
|
|
366
|
+
def tags(self) -> Dict[str, Dict[str, str]]:
|
|
367
|
+
"""Tags for jobs - tracked directly in experiment"""
|
|
368
|
+
return self._tags
|
|
369
|
+
|
|
370
|
+
@property
|
|
371
|
+
def dependencies(self) -> Dict[str, List[str]]:
|
|
372
|
+
"""Job dependencies - tracked directly in experiment"""
|
|
373
|
+
return self._dependencies
|
|
374
|
+
|
|
375
|
+
@property
|
|
376
|
+
def events_count(self) -> int:
|
|
377
|
+
"""Number of events processed"""
|
|
378
|
+
return self._events_count
|
|
379
|
+
|
|
380
|
+
@property
|
|
381
|
+
def started_at(self) -> Optional[float]:
|
|
382
|
+
"""Timestamp when experiment started"""
|
|
383
|
+
return self._started_at
|
|
384
|
+
|
|
385
|
+
@property
|
|
386
|
+
def ended_at(self) -> Optional[float]:
|
|
387
|
+
"""Timestamp when experiment ended (None if still running)"""
|
|
388
|
+
return self._ended_at
|
|
389
|
+
|
|
390
|
+
@property
|
|
391
|
+
def hostname(self) -> Optional[str]:
|
|
392
|
+
"""Hostname where experiment is running"""
|
|
393
|
+
return self._hostname
|
|
394
|
+
|
|
395
|
+
@property
|
|
396
|
+
def services(self) -> Dict[str, "BaseService"]:
|
|
397
|
+
"""Services in this experiment"""
|
|
398
|
+
return self._services
|
|
399
|
+
|
|
204
400
|
@property
|
|
205
401
|
def alt_jobspaths(self):
|
|
206
402
|
"""Return potential other directories"""
|
|
207
403
|
for alt_workdir in self.workspace.alt_workdirs:
|
|
208
404
|
yield alt_workdir / "jobs"
|
|
209
405
|
|
|
210
|
-
@property
|
|
211
|
-
def jobsbakpath(self):
|
|
212
|
-
"""Return the directory in which results can be stored for this experiment"""
|
|
213
|
-
return self.workdir / "jobs.bak"
|
|
214
|
-
|
|
215
406
|
@property
|
|
216
407
|
def jobs_jsonl_path(self):
|
|
217
408
|
"""Return the path to the jobs.jsonl file for this experiment"""
|
|
218
409
|
return self.workdir / "jobs.jsonl"
|
|
219
410
|
|
|
220
|
-
@property
|
|
221
|
-
def services_json_path(self):
|
|
222
|
-
"""Return the path to the services.json file for this experiment"""
|
|
223
|
-
return self.workdir / "services.json"
|
|
224
|
-
|
|
225
|
-
def _write_services_json(self):
|
|
226
|
-
"""Write all services to services.json file"""
|
|
227
|
-
services_data = {}
|
|
228
|
-
for service_id, service in self.services.items():
|
|
229
|
-
# Get state_dict from service (includes __class__ for recreation)
|
|
230
|
-
service_state = service.state_dict()
|
|
231
|
-
# Add runtime state info
|
|
232
|
-
service_state.update(
|
|
233
|
-
{
|
|
234
|
-
"service_id": service_id,
|
|
235
|
-
"description": service.description(),
|
|
236
|
-
"state": service.state.name,
|
|
237
|
-
"url": getattr(service, "url", None),
|
|
238
|
-
"timestamp": time.time(),
|
|
239
|
-
}
|
|
240
|
-
)
|
|
241
|
-
services_data[service_id] = service_state
|
|
242
|
-
|
|
243
|
-
with self.services_json_path.open("w") as f:
|
|
244
|
-
json.dump(services_data, f, indent=2)
|
|
245
|
-
|
|
246
411
|
def add_job(self, job: "Job"):
|
|
247
412
|
"""Register a job and its tags to jobs.jsonl file and database
|
|
248
413
|
|
|
@@ -267,23 +432,76 @@ class experiment:
|
|
|
267
432
|
logging.debug(
|
|
268
433
|
"Job %s already running, unfinished jobs for %s: %d",
|
|
269
434
|
job.identifier[:8],
|
|
270
|
-
self.
|
|
435
|
+
self.name,
|
|
271
436
|
self.unfinishedJobs,
|
|
272
437
|
)
|
|
273
438
|
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
439
|
+
job_info = ExperimentJobInformation(
|
|
440
|
+
job_id=job.identifier,
|
|
441
|
+
task_id=str(job.type.identifier),
|
|
442
|
+
tags=dict(job.tags.items()) if job.tags else {},
|
|
443
|
+
timestamp=time.time(),
|
|
444
|
+
)
|
|
280
445
|
|
|
281
446
|
with self.jobs_jsonl_path.open("a") as f:
|
|
282
|
-
f.write(json.dumps(
|
|
447
|
+
f.write(json.dumps(job_info.to_dict()) + "\n")
|
|
448
|
+
|
|
449
|
+
# Write job submitted event to filesystem (only in NORMAL mode)
|
|
450
|
+
if self._event_writer is not None:
|
|
451
|
+
from .state_status import JobSubmittedEvent
|
|
452
|
+
|
|
453
|
+
# Get dependency job IDs
|
|
454
|
+
depends_on = []
|
|
455
|
+
if hasattr(job, "dependencies"):
|
|
456
|
+
for dep in job.dependencies:
|
|
457
|
+
if hasattr(dep, "identifier"):
|
|
458
|
+
depends_on.append(dep.identifier)
|
|
459
|
+
|
|
460
|
+
job_tags = dict(job.tags.items()) if job.tags else {}
|
|
461
|
+
event = JobSubmittedEvent(
|
|
462
|
+
job_id=job.identifier,
|
|
463
|
+
task_id=str(job.type.identifier),
|
|
464
|
+
transient=job.transient.value if hasattr(job, "transient") else 0,
|
|
465
|
+
tags=job_tags,
|
|
466
|
+
depends_on=depends_on,
|
|
467
|
+
)
|
|
468
|
+
self._event_writer.write_event(event)
|
|
469
|
+
|
|
470
|
+
# Track tags and dependencies directly in experiment
|
|
471
|
+
if job_tags:
|
|
472
|
+
self._tags[job.identifier] = job_tags
|
|
473
|
+
if depends_on:
|
|
474
|
+
self._dependencies[job.identifier] = depends_on
|
|
475
|
+
|
|
476
|
+
def _finalize_run(self, status: str) -> None:
|
|
477
|
+
"""Finalize the run: write final status.json and archive event files
|
|
478
|
+
|
|
479
|
+
Args:
|
|
480
|
+
status: Final status ("completed" or "failed")
|
|
481
|
+
"""
|
|
482
|
+
from datetime import datetime
|
|
483
|
+
from experimaestro.scheduler.interfaces import ExperimentStatus
|
|
484
|
+
from .state_status import RunCompletedEvent
|
|
283
485
|
|
|
284
|
-
#
|
|
285
|
-
|
|
286
|
-
|
|
486
|
+
# Update final status in the experiment
|
|
487
|
+
self._ended_at = datetime.now().timestamp()
|
|
488
|
+
if status in ("completed", "done"):
|
|
489
|
+
self._status = ExperimentStatus.DONE
|
|
490
|
+
elif status == "failed":
|
|
491
|
+
self._status = ExperimentStatus.FAILED
|
|
492
|
+
|
|
493
|
+
# Write RunCompletedEvent before closing the event writer
|
|
494
|
+
event = RunCompletedEvent(status=status, ended_at=datetime.now().isoformat())
|
|
495
|
+
self._event_writer.write_event(event)
|
|
496
|
+
|
|
497
|
+
# Close the event writer to flush any buffered events
|
|
498
|
+
self._event_writer.close()
|
|
499
|
+
|
|
500
|
+
# Write final status.json using write_status()
|
|
501
|
+
self.write_status()
|
|
502
|
+
|
|
503
|
+
# Archive event files to permanent storage
|
|
504
|
+
self._event_writer.archive_events()
|
|
287
505
|
|
|
288
506
|
def stop(self):
|
|
289
507
|
"""Stop the experiment as soon as possible"""
|
|
@@ -365,62 +583,165 @@ class experiment:
|
|
|
365
583
|
return self.workspace.connector.createtoken(name, count)
|
|
366
584
|
|
|
367
585
|
def __enter__(self):
|
|
586
|
+
from datetime import datetime
|
|
368
587
|
from .dynamic_outputs import TaskOutputsWorker
|
|
369
|
-
from experimaestro.utils.environment import
|
|
588
|
+
from experimaestro.utils.environment import (
|
|
589
|
+
ExperimentEnvironment,
|
|
590
|
+
ExperimentRunInfo,
|
|
591
|
+
)
|
|
370
592
|
|
|
371
|
-
|
|
593
|
+
# Check for old experiment layout and warn
|
|
594
|
+
old_xp_dir = self.workspace.path / "xp"
|
|
595
|
+
if old_xp_dir.exists() and old_xp_dir.is_dir():
|
|
596
|
+
logger.warning(
|
|
597
|
+
"Experimaestro v2 has a modified experiment file layout. "
|
|
598
|
+
"DO NOT use experimaestro v1 to cleanup orphans. "
|
|
599
|
+
"You can use 'experimaestro migrate v1-to-v2 %s' to migrate old experiment "
|
|
600
|
+
"folders to the new structure.",
|
|
601
|
+
self.workspace.path,
|
|
602
|
+
)
|
|
603
|
+
|
|
604
|
+
# Only lock and save environment in NORMAL mode
|
|
605
|
+
if self.workspace.run_mode == RunMode.NORMAL:
|
|
372
606
|
logger.info("Locking experiment %s", self.xplockpath)
|
|
373
|
-
|
|
607
|
+
lock = self.workspace.connector.lock(self.xplockpath, 0)
|
|
608
|
+
|
|
609
|
+
# Try non-blocking first to check if lock is held
|
|
610
|
+
if not lock.acquire(blocking=False):
|
|
611
|
+
# Lock is held - try to find hostname from latest run's environment.json
|
|
612
|
+
hostname = None
|
|
613
|
+
try:
|
|
614
|
+
# Find the most recent run directory
|
|
615
|
+
run_dirs = sorted(
|
|
616
|
+
[d for d in self._experiment_base.iterdir() if d.is_dir()],
|
|
617
|
+
key=lambda d: d.stat().st_mtime,
|
|
618
|
+
reverse=True,
|
|
619
|
+
)
|
|
620
|
+
if run_dirs:
|
|
621
|
+
env_path = run_dirs[0] / "environment.json"
|
|
622
|
+
if env_path.exists():
|
|
623
|
+
env = ExperimentEnvironment.load(env_path)
|
|
624
|
+
hostname = env.run.hostname if env.run else None
|
|
625
|
+
except Exception:
|
|
626
|
+
pass # Ignore errors when trying to find hostname
|
|
627
|
+
holder_info = f" (held by {hostname})" if hostname else ""
|
|
628
|
+
logger.warning(
|
|
629
|
+
"Experiment is locked%s, waiting for lock to be released...",
|
|
630
|
+
holder_info,
|
|
631
|
+
)
|
|
632
|
+
# Now wait for the lock
|
|
633
|
+
lock.acquire(blocking=True)
|
|
634
|
+
|
|
635
|
+
self.xplock = lock
|
|
374
636
|
logger.info("Experiment locked")
|
|
375
637
|
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
638
|
+
# Generate run_id with collision detection
|
|
639
|
+
now = datetime.now()
|
|
640
|
+
base_run_id = now.strftime("%Y%m%d_%H%M%S")
|
|
641
|
+
run_id = base_run_id
|
|
642
|
+
suffix = 1
|
|
643
|
+
while (self._experiment_base / run_id).exists():
|
|
644
|
+
run_id = f"{base_run_id}.{suffix}"
|
|
645
|
+
suffix += 1
|
|
646
|
+
self.run_id = run_id
|
|
380
647
|
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
self.
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
648
|
+
# Create the run-specific workdir
|
|
649
|
+
self.workdir = self._experiment_base / self.run_id
|
|
650
|
+
self.workdir.mkdir(parents=True, exist_ok=True)
|
|
651
|
+
|
|
652
|
+
# Capture and save environment info
|
|
653
|
+
from experimaestro.utils.git import get_git_info
|
|
654
|
+
from experimaestro.utils.environment import get_current_environment
|
|
655
|
+
|
|
656
|
+
env_info_path = self.workdir / "environment.json"
|
|
657
|
+
env = get_current_environment()
|
|
658
|
+
|
|
659
|
+
# Capture project git info from project paths
|
|
660
|
+
dirty_repos = []
|
|
661
|
+
for project_path in self._project_paths:
|
|
662
|
+
project_git = get_git_info(project_path)
|
|
663
|
+
if project_git:
|
|
664
|
+
env.projects.append(project_git)
|
|
665
|
+
# Track dirty repositories
|
|
666
|
+
if project_git.get("dirty"):
|
|
667
|
+
dirty_repos.append(project_git.get("path", str(project_path)))
|
|
668
|
+
|
|
669
|
+
# Handle dirty git repositories based on configured action
|
|
670
|
+
if dirty_repos and self._dirty_git != DirtyGitAction.IGNORE:
|
|
671
|
+
for repo_path in dirty_repos:
|
|
672
|
+
if self._dirty_git == DirtyGitAction.WARN:
|
|
673
|
+
logger.warning(
|
|
674
|
+
"Project repository has uncommitted changes: %s",
|
|
675
|
+
repo_path,
|
|
676
|
+
)
|
|
677
|
+
elif self._dirty_git == DirtyGitAction.ERROR:
|
|
678
|
+
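# NOTE(review): no lock release is visible before this raise, so the experiment lock acquired above appears to stay held - confirm it is released elsewhere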
|
|
679
|
+
raise DirtyGitError(
|
|
680
|
+
f"Project repository has uncommitted changes: {repo_path}"
|
|
681
|
+
)
|
|
682
|
+
|
|
683
|
+
env.save(env_info_path)
|
|
684
|
+
else:
|
|
685
|
+
# Non-NORMAL mode: use placeholder run_id and workdir
|
|
686
|
+
self.run_id = "dry-run"
|
|
687
|
+
self.workdir = self._experiment_base / self.run_id
|
|
688
|
+
self.workdir.mkdir(parents=True, exist_ok=True)
|
|
394
689
|
|
|
395
690
|
# Register experiment with scheduler
|
|
396
691
|
self.scheduler.register_experiment(self)
|
|
397
692
|
|
|
398
|
-
#
|
|
399
|
-
|
|
400
|
-
|
|
693
|
+
# Set experiment start time for BaseExperiment interface
|
|
694
|
+
self._started_at = time.time()
|
|
695
|
+
self._ended_at = None
|
|
401
696
|
|
|
402
697
|
self.workspace.__enter__()
|
|
403
698
|
(self.workspace.path / ".__experimaestro__").touch()
|
|
404
699
|
|
|
405
|
-
# Initialize
|
|
406
|
-
from .
|
|
407
|
-
|
|
408
|
-
self.
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
)
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
self.
|
|
417
|
-
self.
|
|
700
|
+
# Initialize filesystem-based state tracking (only in NORMAL mode)
|
|
701
|
+
from .state_status import ExperimentEventWriter
|
|
702
|
+
|
|
703
|
+
is_normal_mode = self.workspace.run_mode == RunMode.NORMAL
|
|
704
|
+
self._event_writer = None
|
|
705
|
+
self._state_listener = None
|
|
706
|
+
|
|
707
|
+
# Track job tags and dependencies directly (no more StatusData)
|
|
708
|
+
self._tags: Dict[str, Dict[str, str]] = {}
|
|
709
|
+
self._dependencies: Dict[str, List[str]] = {}
|
|
710
|
+
self._events_count = 0
|
|
711
|
+
self._hostname: Optional[str] = None
|
|
712
|
+
self._started_at: Optional[float] = None
|
|
713
|
+
self._ended_at: Optional[float] = None
|
|
714
|
+
|
|
715
|
+
if is_normal_mode:
|
|
716
|
+
import socket
|
|
717
|
+
|
|
718
|
+
# Create event writer for this experiment
|
|
719
|
+
# Events are written to experiments/{experiment_id}/events-{count}.jsonl
|
|
720
|
+
# Permanent storage: workdir/events/
|
|
721
|
+
self._event_writer = ExperimentEventWriter(self, self.workspace.path, 0)
|
|
722
|
+
|
|
723
|
+
# Initialize status.json for this run
|
|
724
|
+
self._hostname = socket.gethostname()
|
|
725
|
+
self._started_at = datetime.now().timestamp()
|
|
726
|
+
self._event_writer.init_status()
|
|
727
|
+
|
|
728
|
+
# Create symlink to current run
|
|
729
|
+
self._event_writer.create_symlink()
|
|
730
|
+
|
|
731
|
+
# Add run info to environment.json
|
|
732
|
+
env_path = self.workdir / "environment.json"
|
|
733
|
+
env = ExperimentEnvironment.load(env_path)
|
|
734
|
+
env.run = ExperimentRunInfo(
|
|
735
|
+
hostname=self._hostname,
|
|
736
|
+
started_at=datetime.now().isoformat(),
|
|
737
|
+
)
|
|
738
|
+
env.save(env_path)
|
|
418
739
|
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
740
|
+
# Add state listener to write events to filesystem
|
|
741
|
+
self._state_listener = StateListener(
|
|
742
|
+
self._event_writer, self, self.name, self.run_id
|
|
743
|
+
)
|
|
744
|
+
self.scheduler.addlistener(self._state_listener)
|
|
424
745
|
|
|
425
746
|
# Number of unfinished jobs
|
|
426
747
|
self.unfinishedJobs = 0
|
|
@@ -445,22 +766,24 @@ class experiment:
|
|
|
445
766
|
|
|
446
767
|
def __exit__(self, exc_type, exc_value, traceback):
|
|
447
768
|
logger.debug("Exiting scheduler context")
|
|
448
|
-
# If no exception and normal run mode, remove old "jobs"
|
|
449
|
-
if self.workspace.run_mode == RunMode.NORMAL:
|
|
450
|
-
if exc_type is None and self.jobsbakpath.is_dir():
|
|
451
|
-
rmtree(self.jobsbakpath)
|
|
452
769
|
|
|
453
770
|
# Close the different locks
|
|
454
771
|
try:
|
|
455
|
-
if exc_type:
|
|
772
|
+
if exc_type is GracefulExperimentExit:
|
|
773
|
+
# Graceful exit - don't wait for jobs, don't log error
|
|
774
|
+
logger.info("Graceful experiment exit - not waiting for running jobs")
|
|
775
|
+
elif exc_type:
|
|
456
776
|
# import faulthandler
|
|
457
777
|
# faulthandler.dump_traceback()
|
|
458
|
-
logger.
|
|
459
|
-
"Not waiting since an exception was thrown"
|
|
460
|
-
" (some jobs may be running)"
|
|
778
|
+
logger.exception(
|
|
779
|
+
"Not waiting since an exception was thrown (some jobs may be running)"
|
|
461
780
|
)
|
|
462
781
|
else:
|
|
463
782
|
self.wait()
|
|
783
|
+
|
|
784
|
+
# Wait for all pending notifications to be processed
|
|
785
|
+
# before removing listeners
|
|
786
|
+
self.scheduler.wait_for_notifications()
|
|
464
787
|
finally:
|
|
465
788
|
if self._register_signals:
|
|
466
789
|
SIGNAL_HANDLER.remove(self)
|
|
@@ -470,19 +793,41 @@ class experiment:
|
|
|
470
793
|
logger.info("Closing service %s", service.description())
|
|
471
794
|
service.stop()
|
|
472
795
|
|
|
796
|
+
# Set end time for BaseExperiment interface
|
|
797
|
+
self._ended_at = time.time()
|
|
798
|
+
|
|
473
799
|
# Unregister experiment from scheduler
|
|
474
800
|
self.scheduler.unregister_experiment(self)
|
|
475
801
|
|
|
476
|
-
# Remove
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
802
|
+
# Remove state listener and finalize run (only in NORMAL mode)
|
|
803
|
+
if exc_type is GracefulExperimentExit:
|
|
804
|
+
status = "detached" # Graceful exit, jobs may still be running
|
|
805
|
+
elif exc_type:
|
|
806
|
+
status = "failed"
|
|
807
|
+
else:
|
|
808
|
+
status = "completed"
|
|
809
|
+
|
|
810
|
+
if self._state_listener is not None:
|
|
811
|
+
self.scheduler.removelistener(self._state_listener)
|
|
812
|
+
self._finalize_run(status)
|
|
813
|
+
|
|
814
|
+
# Update environment.json with run status
|
|
815
|
+
if self.workspace.run_mode == RunMode.NORMAL and self.workdir:
|
|
816
|
+
from datetime import datetime
|
|
817
|
+
from experimaestro.utils.environment import ExperimentEnvironment
|
|
818
|
+
|
|
819
|
+
env_path = self.workdir / "environment.json"
|
|
820
|
+
if env_path.exists():
|
|
821
|
+
try:
|
|
822
|
+
env = ExperimentEnvironment.load(env_path)
|
|
823
|
+
if env.run:
|
|
824
|
+
env.run.ended_at = datetime.now().isoformat()
|
|
825
|
+
env.run.status = status
|
|
826
|
+
env.save(env_path)
|
|
827
|
+
except Exception as e:
|
|
828
|
+
logger.warning("Failed to update environment.json: %s", e)
|
|
483
829
|
|
|
484
830
|
# Note: Don't stop scheduler - it's shared!
|
|
485
|
-
# Note: Don't stop server - it runs in daemon mode until program exit
|
|
486
831
|
|
|
487
832
|
if self.taskOutputsWorker is not None:
|
|
488
833
|
logger.info("Stopping tasks outputs worker")
|
|
@@ -496,13 +841,39 @@ class experiment:
|
|
|
496
841
|
experiment.CURRENT = self.old_experiment
|
|
497
842
|
|
|
498
843
|
if self.workspace.run_mode == RunMode.NORMAL:
|
|
499
|
-
#
|
|
500
|
-
|
|
501
|
-
|
|
844
|
+
# Remove job directories for transient jobs with REMOVE mode
|
|
845
|
+
if exc_type is None:
|
|
846
|
+
for job in list(self.scheduler.jobs.values()):
|
|
847
|
+
if (
|
|
848
|
+
self in job.experiments
|
|
849
|
+
and job.transient.should_remove
|
|
850
|
+
and job.state.finished()
|
|
851
|
+
):
|
|
852
|
+
job_path = job.path
|
|
853
|
+
if job_path.exists():
|
|
854
|
+
logger.info(
|
|
855
|
+
"Removing transient job directory: %s", job_path
|
|
856
|
+
)
|
|
857
|
+
rmtree(job_path)
|
|
858
|
+
# Also remove the symlink in the experiment's jobs folder
|
|
859
|
+
symlink_path = self.jobspath / job.relpath
|
|
860
|
+
if symlink_path.is_symlink():
|
|
861
|
+
symlink_path.unlink()
|
|
862
|
+
|
|
863
|
+
# Cleanup old runs based on history settings
|
|
864
|
+
try:
|
|
865
|
+
cleanup_experiment_history(
|
|
866
|
+
self._experiment_base,
|
|
867
|
+
current_run_id=self.run_id,
|
|
868
|
+
current_status=status,
|
|
869
|
+
history=self._get_history_settings(),
|
|
870
|
+
)
|
|
871
|
+
except Exception as e:
|
|
872
|
+
logger.warning("Failed to cleanup old runs: %s", e)
|
|
502
873
|
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
874
|
+
# Suppress GracefulExperimentExit exception
|
|
875
|
+
if exc_type is GracefulExperimentExit:
|
|
876
|
+
return True
|
|
506
877
|
|
|
507
878
|
async def update_task_output_count(self, delta: int):
|
|
508
879
|
"""Change in the number of task outputs to process"""
|
|
@@ -526,26 +897,60 @@ class experiment:
|
|
|
526
897
|
"""Adds a service (e.g. tensorboard viewer) to the experiment
|
|
527
898
|
|
|
528
899
|
:param service: A service instance
|
|
529
|
-
:return: The same service instance
|
|
900
|
+
:return: The same service instance (or existing service if already added)
|
|
530
901
|
"""
|
|
531
|
-
self.services
|
|
902
|
+
existing = self.services.get(service.id)
|
|
903
|
+
if existing is not None:
|
|
904
|
+
if existing is service:
|
|
905
|
+
# Same service instance added twice - just return it
|
|
906
|
+
logger.debug("Service %s already added, ignoring duplicate", service.id)
|
|
907
|
+
return service
|
|
908
|
+
else:
|
|
909
|
+
# Different service with same id - warn and replace
|
|
910
|
+
logger.warning(
|
|
911
|
+
"Replacing service %s (old id=%s, new id=%s)",
|
|
912
|
+
service.id,
|
|
913
|
+
id(existing),
|
|
914
|
+
id(service),
|
|
915
|
+
)
|
|
532
916
|
|
|
533
|
-
|
|
534
|
-
service.add_listener(self._db_listener)
|
|
917
|
+
self._services[service.id] = service
|
|
535
918
|
|
|
536
|
-
#
|
|
537
|
-
service.
|
|
919
|
+
# Allow service to access experiment context
|
|
920
|
+
service.set_experiment(self)
|
|
538
921
|
|
|
539
|
-
|
|
922
|
+
# Register state listener for state changes (writes events)
|
|
923
|
+
if self._state_listener is not None:
|
|
924
|
+
service.add_listener(self._state_listener)
|
|
540
925
|
|
|
541
|
-
#
|
|
542
|
-
|
|
926
|
+
# Register listener for state changes
|
|
927
|
+
service.add_listener(self)
|
|
928
|
+
|
|
929
|
+
self.scheduler.notify_service_add(service, self.name, self.run_id or "")
|
|
543
930
|
|
|
544
931
|
return service
|
|
545
932
|
|
|
546
933
|
def service_state_changed(self, service):
    """React to a service state change by informing the scheduler's listeners.

    :param service: the service whose state changed; its ``id`` and ``state``
        attributes are read
    """
    # Services whose state object has no ``name`` are reported as UNKNOWN
    if hasattr(service.state, "name"):
        state_name = service.state.name
    else:
        state_name = "UNKNOWN"

    logger.debug(
        "Service %s state changed to %s (experiment=%s)",
        service.id,
        state_name,
        self.name,
    )

    # Imported locally, before the scheduler check, exactly as it always was
    from experimaestro.scheduler.state_status import ServiceStateChangedEvent

    if self.scheduler is None:
        return

    # Notify state listeners (for TUI tab title updates etc.)
    self.scheduler._notify_state_listeners_async(
        ServiceStateChangedEvent(
            experiment_id=self.name,
            run_id=self.run_id or "",
            service_id=service.id,
            state=state_name,
        )
    )
|
|
549
954
|
|
|
550
955
|
def save(self, obj: Any, name: str = "default"):
|
|
551
956
|
"""Serializes configurations.
|
|
@@ -564,19 +969,209 @@ class experiment:
|
|
|
564
969
|
|
|
565
970
|
save(obj, save_dir)
|
|
566
971
|
|
|
567
|
-
def load(self, reference: str, name: str = "default"):
|
|
568
|
-
"""
|
|
569
|
-
|
|
570
|
-
Loads configuration objects from an experimental directory
|
|
972
|
+
def load(
    self, reference: str, name: str = "default", run_id: Optional[str] = None
):
    """Loads configuration objects from an experimental directory.

    :param reference: The name of the experiment
    :param name: The name of the saving directory (default to `default`)
    :param run_id: The run ID to load from (default: latest run, i.e. the run
        directory with the most recent modification time)
    :return: the result of ``experimaestro.load`` applied to
        ``<experiment>/<run>/data/<name>``
    :raises FileNotFoundError: if ``run_id`` is not given and the experiment
        directory contains no run directory
    """
    from experimaestro import load

    exp_base = self.workspace.experimentspath / reference
    if run_id is None:
        # Only the newest directory is needed: take the maximum instead of
        # sorting every run directory
        run_dir = max(
            (d for d in exp_base.iterdir() if d.is_dir()),
            key=lambda d: d.stat().st_mtime,
            default=None,
        )
        if run_dir is None:
            raise FileNotFoundError(f"No runs found for experiment {reference}")
    else:
        run_dir = exp_base / run_id

    return load(run_dir / "data" / name)
|
|
579
997
|
|
|
998
|
+
def _get_history_settings(self) -> HistorySettings:
    """Return the history settings that apply to this experiment.

    The workspace's own history settings take precedence when the workspace
    has settings and they define a history; otherwise the history of the
    global settings (``get_settings()``) is returned.
    """
    workspace_settings = self.workspace.settings
    if not workspace_settings or not workspace_settings.history:
        # Nothing workspace-specific: fall back to global settings
        return get_settings().history

    return workspace_settings.history
|
|
1011
|
+
|
|
1012
|
+
|
|
1013
|
+
def get_run_status(run_dir: Path) -> Optional[str]:
    """Get the status of a run from its environment.json or status.json.

    ``environment.json`` is consulted first; ``status.json`` is only used when
    the former is missing, unreadable, or records no run status.

    Args:
        run_dir: Path to the run directory

    Returns:
        The run status stored in ``environment.json``, returned verbatim, when
        there is one (the experiment writes 'completed', 'failed' or
        'detached' there when it exits). Otherwise 'completed' or 'failed' as
        derived from ``status.json``. None if status cannot be determined.
    """
    # Try environment.json first (most reliable - written on exit)
    env_path = run_dir / "environment.json"
    if env_path.exists():
        try:
            from experimaestro.utils.environment import ExperimentEnvironment

            env = ExperimentEnvironment.load(env_path)
            if env.run and env.run.status:
                return env.run.status
        except Exception:
            # Best effort: any failure here falls through to status.json
            pass

    # Fall back to status.json
    status_path = run_dir / "status.json"
    if not status_path.exists():
        return None

    try:
        # Decode explicitly as UTF-8 instead of relying on the locale default
        with status_path.open(encoding="utf-8") as f:
            status = json.load(f)

        # Check the experiment status field
        exp_status = status.get("status")
        if exp_status == "done":
            return "completed"
        if exp_status == "failed":
            return "failed"

        # Check job states as fallback
        # NOTE(review): any other status value (or none at all) is classified
        # from the job states and reported 'completed' when no job is in
        # error - confirm this is intended for runs that never finished
        jobs = status.get("jobs", {})
        if any(job.get("state") == "error" for job in jobs.values()):
            return "failed"
        return "completed"
    except Exception:
        # Unreadable or unexpectedly shaped status.json: cannot determine status
        return None
|
|
1056
|
+
|
|
1057
|
+
|
|
1058
|
+
def cleanup_experiment_history(
    experiment_base: Path,
    *,
    current_run_id: Optional[str] = None,
    current_status: Optional[str] = None,
    history: Optional[HistorySettings] = None,
) -> list[Path]:
    """Clean up old experiment runs based on history settings.

    This function can be called from the CLI or other contexts.

    Runs are ordered by directory name (oldest first) and classified with
    ``get_run_status``; runs whose status is neither 'completed' nor 'failed'
    are never removed. Removal then proceeds in this order:

    1. if ``current_status`` is 'completed', every past failed run is removed;
    2. failed runs older than the newest completed run are removed;
    3. the oldest completed runs beyond ``history.max_done`` are removed;
    4. the oldest failed runs beyond ``history.max_failed`` are removed.

    Args:
        experiment_base: Path to the experiment directory (containing run subdirs)
        current_run_id: ID of the current run to exclude from cleanup (optional)
        current_status: Status of the current run; only the value 'completed'
            has an effect (it triggers step 1 above) (optional)
        history: History settings to use (defaults to global settings)

    Returns:
        List of paths that were removed
    """
    if history is None:
        history = get_settings().history

    removed_paths: list[Path] = []

    def remove_run(run_dir: Path, message: str, *args) -> None:
        """Log ``message``, delete ``run_dir`` and record it if that succeeded."""
        logger.info(message, *args)
        try:
            rmtree(run_dir)
        except Exception as e:
            # Best effort: a directory that cannot be removed is only reported
            logger.warning("Failed to remove run directory %s: %s", run_dir, e)
        else:
            removed_paths.append(run_dir)

    # Directory names are in format YYYYMMDD_HHMMSS or YYYYMMDD_HHMMSS.N (with modifier)
    def run_sort_key(d: Path) -> tuple[str, int]:
        """Parse run_id for sorting, handling modifiers like 20250501_102315.1"""
        base, dot, modifier = d.name.partition(".")
        if dot:
            try:
                return (base, int(modifier))
            except ValueError:
                pass  # Not a numeric modifier: sort on the full name
        return (d.name, 0)

    # List all run directories (excluding the current one), oldest first
    run_dirs = sorted(
        (
            d
            for d in experiment_base.iterdir()
            if d.is_dir() and d.name != current_run_id
        ),
        key=run_sort_key,
    )

    # Categorize runs by status
    completed_runs = []
    failed_runs = []
    for run_dir in run_dirs:
        status = get_run_status(run_dir)
        if status == "completed":
            completed_runs.append(run_dir)
        elif status == "failed":
            failed_runs.append(run_dir)
        # Runs with unknown status are not touched

    # If current run succeeded, remove all past failed runs
    if current_status == "completed":
        for run_dir in failed_runs:
            remove_run(
                run_dir, "Removing failed run (experiment succeeded): %s", run_dir
            )
        failed_runs = []

    # Remove failed runs that are older than the newest completed run
    # (a success came after them, so these failures are stale)
    if completed_runs:
        newest_completed = run_sort_key(completed_runs[-1])
        remaining_failed = []
        for run_dir in failed_runs:
            if run_sort_key(run_dir) < newest_completed:
                remove_run(
                    run_dir, "Removing failed run (success exists after): %s", run_dir
                )
            else:
                remaining_failed.append(run_dir)
        failed_runs = remaining_failed

    # Keep only max_done completed runs, then only max_failed failed runs,
    # removing the oldest ones first
    for runs, limit, message in (
        (completed_runs, history.max_done, "Removing old completed run (keeping %d): %s"),
        (failed_runs, history.max_failed, "Removing old failed run (keeping %d): %s"),
    ):
        # Clamp so that a negative limit simply removes every run of that kind
        excess = max(len(runs) - max(limit, 0), 0)
        for run_dir in runs[:excess]:
            remove_run(run_dir, message, limit, run_dir)

    return removed_paths
|
|
1174
|
+
|
|
580
1175
|
|
|
581
1176
|
# re-export at the module level
|
|
582
1177
|
current = experiment.current
|