experimaestro 1.11.1__py3-none-any.whl → 2.0.0b4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of experimaestro might be problematic. Click here for more details.
- experimaestro/__init__.py +10 -11
- experimaestro/annotations.py +167 -206
- experimaestro/cli/__init__.py +140 -16
- experimaestro/cli/filter.py +42 -74
- experimaestro/cli/jobs.py +157 -106
- experimaestro/cli/progress.py +269 -0
- experimaestro/cli/refactor.py +249 -0
- experimaestro/click.py +0 -1
- experimaestro/commandline.py +19 -3
- experimaestro/connectors/__init__.py +22 -3
- experimaestro/connectors/local.py +12 -0
- experimaestro/core/arguments.py +192 -37
- experimaestro/core/identifier.py +127 -12
- experimaestro/core/objects/__init__.py +6 -0
- experimaestro/core/objects/config.py +702 -285
- experimaestro/core/objects/config_walk.py +24 -6
- experimaestro/core/serialization.py +91 -34
- experimaestro/core/serializers.py +1 -8
- experimaestro/core/subparameters.py +164 -0
- experimaestro/core/types.py +198 -83
- experimaestro/exceptions.py +26 -0
- experimaestro/experiments/cli.py +107 -25
- experimaestro/generators.py +50 -9
- experimaestro/huggingface.py +3 -1
- experimaestro/launcherfinder/parser.py +29 -0
- experimaestro/launcherfinder/registry.py +3 -3
- experimaestro/launchers/__init__.py +26 -1
- experimaestro/launchers/direct.py +12 -0
- experimaestro/launchers/slurm/base.py +154 -2
- experimaestro/mkdocs/base.py +6 -8
- experimaestro/mkdocs/metaloader.py +0 -1
- experimaestro/mypy.py +452 -7
- experimaestro/notifications.py +75 -16
- experimaestro/progress.py +404 -0
- experimaestro/rpyc.py +0 -1
- experimaestro/run.py +19 -6
- experimaestro/scheduler/__init__.py +18 -1
- experimaestro/scheduler/base.py +504 -959
- experimaestro/scheduler/dependencies.py +43 -28
- experimaestro/scheduler/dynamic_outputs.py +259 -130
- experimaestro/scheduler/experiment.py +582 -0
- experimaestro/scheduler/interfaces.py +474 -0
- experimaestro/scheduler/jobs.py +485 -0
- experimaestro/scheduler/services.py +186 -12
- experimaestro/scheduler/signal_handler.py +32 -0
- experimaestro/scheduler/state.py +1 -1
- experimaestro/scheduler/state_db.py +388 -0
- experimaestro/scheduler/state_provider.py +2345 -0
- experimaestro/scheduler/state_sync.py +834 -0
- experimaestro/scheduler/workspace.py +52 -10
- experimaestro/scriptbuilder.py +7 -0
- experimaestro/server/__init__.py +153 -32
- experimaestro/server/data/index.css +0 -125
- experimaestro/server/data/index.css.map +1 -1
- experimaestro/server/data/index.js +194 -58
- experimaestro/server/data/index.js.map +1 -1
- experimaestro/settings.py +47 -6
- experimaestro/sphinx/__init__.py +3 -3
- experimaestro/taskglobals.py +20 -0
- experimaestro/tests/conftest.py +80 -0
- experimaestro/tests/core/test_generics.py +2 -2
- experimaestro/tests/identifier_stability.json +45 -0
- experimaestro/tests/launchers/bin/sacct +6 -2
- experimaestro/tests/launchers/bin/sbatch +4 -2
- experimaestro/tests/launchers/common.py +2 -2
- experimaestro/tests/launchers/test_slurm.py +80 -0
- experimaestro/tests/restart.py +1 -1
- experimaestro/tests/tasks/all.py +7 -0
- experimaestro/tests/tasks/test_dynamic.py +231 -0
- experimaestro/tests/test_checkers.py +2 -2
- experimaestro/tests/test_cli_jobs.py +615 -0
- experimaestro/tests/test_dependencies.py +11 -17
- experimaestro/tests/test_deprecated.py +630 -0
- experimaestro/tests/test_environment.py +200 -0
- experimaestro/tests/test_experiment.py +3 -3
- experimaestro/tests/test_file_progress.py +425 -0
- experimaestro/tests/test_file_progress_integration.py +477 -0
- experimaestro/tests/test_forward.py +3 -3
- experimaestro/tests/test_generators.py +93 -0
- experimaestro/tests/test_identifier.py +520 -169
- experimaestro/tests/test_identifier_stability.py +458 -0
- experimaestro/tests/test_instance.py +16 -21
- experimaestro/tests/test_multitoken.py +442 -0
- experimaestro/tests/test_mypy.py +433 -0
- experimaestro/tests/test_objects.py +314 -30
- experimaestro/tests/test_outputs.py +8 -8
- experimaestro/tests/test_param.py +22 -26
- experimaestro/tests/test_partial_paths.py +231 -0
- experimaestro/tests/test_progress.py +2 -50
- experimaestro/tests/test_resumable_task.py +480 -0
- experimaestro/tests/test_serializers.py +141 -60
- experimaestro/tests/test_state_db.py +434 -0
- experimaestro/tests/test_subparameters.py +160 -0
- experimaestro/tests/test_tags.py +151 -15
- experimaestro/tests/test_tasks.py +137 -160
- experimaestro/tests/test_token_locking.py +252 -0
- experimaestro/tests/test_tokens.py +25 -19
- experimaestro/tests/test_types.py +133 -11
- experimaestro/tests/test_validation.py +19 -19
- experimaestro/tests/test_workspace_triggers.py +158 -0
- experimaestro/tests/token_reschedule.py +5 -3
- experimaestro/tests/utils.py +2 -2
- experimaestro/tokens.py +154 -57
- experimaestro/tools/diff.py +8 -1
- experimaestro/tui/__init__.py +8 -0
- experimaestro/tui/app.py +2303 -0
- experimaestro/tui/app.tcss +353 -0
- experimaestro/tui/log_viewer.py +228 -0
- experimaestro/typingutils.py +11 -2
- experimaestro/utils/__init__.py +23 -0
- experimaestro/utils/environment.py +148 -0
- experimaestro/utils/git.py +129 -0
- experimaestro/utils/resources.py +1 -1
- experimaestro/version.py +34 -0
- {experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info}/METADATA +70 -39
- experimaestro-2.0.0b4.dist-info/RECORD +181 -0
- {experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info}/WHEEL +1 -1
- experimaestro-2.0.0b4.dist-info/entry_points.txt +16 -0
- experimaestro/compat.py +0 -6
- experimaestro/core/objects.pyi +0 -225
- experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
- experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
- experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
- experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
- experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
- experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
- experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
- experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
- experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
- experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
- experimaestro-1.11.1.dist-info/RECORD +0 -158
- experimaestro-1.11.1.dist-info/entry_points.txt +0 -17
- {experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info/licenses}/LICENSE +0 -0
|
@@ -0,0 +1,582 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import json
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
import time
|
|
7
|
+
from shutil import rmtree
|
|
8
|
+
from typing import Any, Dict, Optional, TypeVar, Union
|
|
9
|
+
|
|
10
|
+
from experimaestro.core.objects import WatchedOutput
|
|
11
|
+
from experimaestro.exceptions import HandledException
|
|
12
|
+
|
|
13
|
+
from experimaestro.scheduler.signal_handler import SIGNAL_HANDLER
|
|
14
|
+
from experimaestro.scheduler.jobs import Job
|
|
15
|
+
from experimaestro.scheduler.services import Service
|
|
16
|
+
from experimaestro.scheduler.workspace import RunMode, Workspace
|
|
17
|
+
from experimaestro.settings import WorkspaceSettings, get_settings
|
|
18
|
+
from experimaestro.utils import logger
|
|
19
|
+
|
|
20
|
+
# Type variable bound to Service: lets experiment.add_service() be annotated as
# returning the same concrete service type it was given
ServiceClass = TypeVar("ServiceClass", bound=Service)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class FailedExperiment(HandledException):
    """Error signalling that an experiment ended with failed jobs.

    Raised by ``experiment.wait()`` once the wait loop exits and at least
    one job has been recorded as failed.
    """
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class DatabaseListener:
    """Listener that mirrors job and service state into the database.

    An instance is bound to one experiment run: every callback forwards the
    event to ``state_provider`` together with the experiment and run
    identifiers given at construction time.
    """

    def __init__(self, state_provider, experiment_id: str, run_id: str):
        """
        :param state_provider: object exposing ``update_job_state`` and
            ``update_service`` (the workspace state provider)
        :param experiment_id: identifier of the experiment being run
        :param run_id: identifier of the current run of that experiment
        """
        self.state_provider = state_provider
        self.experiment_id = experiment_id
        self.run_id = run_id

    def job_submitted(self, job):
        """Do nothing: submission is already handled in experiment.add_job()"""

    def job_state(self, job):
        """Update job state in database"""
        self.state_provider.update_job_state(job, self.experiment_id, self.run_id)

    def service_add(self, service):
        """Update service in database"""
        self._update_service(service)

    def service_state_changed(self, service):
        """Update service state in database (called by Service when state changes)"""
        self._update_service(service)

    def _update_service(self, service):
        """Push the current description and state of a service to the database

        Shared by :meth:`service_add` and :meth:`service_state_changed`, which
        record exactly the same information.
        """
        self.state_provider.update_service(
            service.id,
            self.experiment_id,
            self.run_id,
            service.description(),
            service.state.name,
            # The state dictionary is stored as a JSON string
            state_dict=json.dumps(service.state_dict()),
        )
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class experiment:
    """Context manager for running experiments.

    Creates a workspace, manages task submission, and optionally starts
    a web server for monitoring.

    Example::

        from experimaestro import experiment

        with experiment("./workdir", "my-experiment", port=12345) as xp:
            task = MyTask.C(param=42).submit()
            result = task.wait()
    """

    #: Current experiment
    CURRENT: Optional["experiment"] = None

    @staticmethod
    def current() -> "experiment":
        """Returns the current experiment, checking first that one is set

        :raises AssertionError: if there is no current experiment
        """
        assert experiment.CURRENT is not None, "No current experiment defined"
        return experiment.CURRENT

    def __init__(
        self,
        env: Union[Path, str, WorkspaceSettings],
        name: str,
        *,
        host: Optional[str] = None,
        port: Optional[int] = None,
        token: Optional[str] = None,
        run_mode: Optional[RunMode] = None,
        launcher=None,
        register_signals: bool = True,
    ):
        """
        :param env: an environment -- or a working directory for a local
            environment

        :param name: the identifier of the experiment

        :param launcher: The launcher (if not provided, inferred from path)

        :param host: The host for the web server (overrides the environment if
            set)
        :param port: the port for the web server (overrides the environment if
            set). Use negative number to avoid running a web server (default when dry run).

        :param token: The token stored in the server settings (overrides the
            environment if set)

        :param run_mode: The run mode for the experiment (normal, generate run
            files, dry run)

        :param register_signals: Whether to register signal handlers (default: True).
            Set to False when running in a background thread.
        """

        from experimaestro.scheduler import Listener, Scheduler

        settings = get_settings()
        if not isinstance(env, WorkspaceSettings):
            env = WorkspaceSettings(id=None, path=Path(env))

        # Creates the workspace
        run_mode = run_mode or RunMode.NORMAL
        self.workspace = Workspace(settings, env, launcher=launcher, run_mode=run_mode)

        # Create the experiment folder inside the workspace
        self.workdir = self.workspace.experimentspath / name
        self.workdir.mkdir(parents=True, exist_ok=True)
        self.xplockpath = self.workdir / "lock"
        self.xplock = None
        self.old_experiment = None
        self.services: Dict[str, Service] = {}
        self._job_listener: Optional[Listener] = None
        self._register_signals = register_signals

        # Explicit arguments override the server configuration settings
        if host is not None:
            settings.server.host = host

        if port is not None:
            settings.server.port = port

        if token is not None:
            settings.server.token = token

        # Use singleton scheduler
        self.scheduler = Scheduler.instance()

        # A server is only needed for a normal run with a non-negative port
        self._needs_server = (
            settings.server.port is not None and settings.server.port >= 0
        ) and self.workspace.run_mode == RunMode.NORMAL
        self._server_settings = settings.server if self._needs_server else None

        if os.environ.get("XPM_ENABLEFAULTHANDLER", "0") == "1":
            import faulthandler

            logger.info("Enabling fault handler")
            faulthandler.enable(all_threads=True)

    def submit(self, job: Job):
        """Submit a job through the scheduler and return its result"""
        return self.scheduler.submit(job)

    def prepare(self, job: Job):
        """Generate the file"""
        return self.scheduler.prepare(job)

    @property
    def run_mode(self):
        """Run mode of the underlying workspace"""
        return self.workspace.run_mode

    @property
    def loop(self):
        """Event loop of the scheduler"""
        assert self.scheduler is not None, "No scheduler defined"
        return self.scheduler.loop

    @property
    def server(self):
        """Access the server via the scheduler"""
        return self.scheduler.server if self.scheduler else None

    @property
    def resultspath(self):
        """Return the directory in which results can be stored for this experiment"""
        return self.workdir / "results"

    @property
    def jobspath(self):
        """Return the ``jobs`` directory of this experiment"""
        return self.workdir / "jobs"

    @property
    def alt_jobspaths(self):
        """Yield the ``jobs`` directory of each alternative working directory"""
        for alt_workdir in self.workspace.alt_workdirs:
            yield alt_workdir / "jobs"

    @property
    def jobsbakpath(self):
        """Return the ``jobs.bak`` directory (job links of the previous run)"""
        return self.workdir / "jobs.bak"

    @property
    def jobs_jsonl_path(self):
        """Return the path to the jobs.jsonl file for this experiment"""
        return self.workdir / "jobs.jsonl"

    @property
    def services_json_path(self):
        """Return the path to the services.json file for this experiment"""
        return self.workdir / "services.json"

    def _write_services_json(self):
        """Write all services to services.json file"""
        services_data = {}
        for service_id, service in self.services.items():
            # Get state_dict from service (includes __class__ for recreation)
            service_state = service.state_dict()
            # Add runtime state info
            service_state.update(
                {
                    "service_id": service_id,
                    "description": service.description(),
                    "state": service.state.name,
                    "url": getattr(service, "url", None),
                    "timestamp": time.time(),
                }
            )
            services_data[service_id] = service_state

        with self.services_json_path.open("w") as f:
            json.dump(services_data, f, indent=2)

    def add_job(self, job: "Job"):
        """Register a job and its tags to jobs.jsonl file and database

        Note: For NEW jobs, the unfinishedJobs counter is updated by
        job.set_state() when the state transitions from UNSCHEDULED.
        For jobs already running, we increment here since no state
        transition will occur.
        """
        from experimaestro.scheduler.interfaces import JobState

        if self in job.experiments:
            # Do not double register
            return

        # Track which experiments this job belongs to
        job.experiments.append(self)

        # If job is already being tracked (not UNSCHEDULED and not finished),
        # increment unfinishedJobs since no state transition will trigger it
        if job.state != JobState.UNSCHEDULED and not job.state.finished():
            self.unfinishedJobs += 1
            logging.debug(
                "Job %s already running, unfinished jobs for %s: %d",
                job.identifier[:8],
                self.workdir.name,
                self.unfinishedJobs,
            )

        record = {
            "job_id": job.identifier,
            "task_id": str(job.type.identifier),
            "tags": dict(job.tags.items()) if job.tags else {},
            "timestamp": time.time(),
        }

        # One JSON record per line, appended to the experiment's jobs.jsonl
        with self.jobs_jsonl_path.open("a") as f:
            f.write(json.dumps(record) + "\n")

        # Also register in database for TUI/monitoring
        experiment_id = self.workdir.name
        self.state_provider.update_job_submitted(job, experiment_id, self.run_id)

    def stop(self):
        """Stop the experiment as soon as possible"""

        async def doStop():
            assert self.scheduler is not None
            async with self.scheduler.exitCondition:
                self.exitMode = True
                logging.debug("Setting exit mode to true")
                # Wake up wait() so that it can see the exit flag
                self.scheduler.exitCondition.notify_all()

        assert self.scheduler is not None and self.scheduler.loop is not None
        asyncio.run_coroutine_threadsafe(doStop(), self.scheduler.loop)

    def wait(self):
        """Wait until the running processes have finished

        Blocks until there are no unfinished jobs and no pending task
        outputs, or until :meth:`stop` has been called.

        :raises FailedExperiment: if some jobs were recorded as failed
        """

        async def awaitcompletion():
            assert self.scheduler is not None, "No scheduler defined"
            logger.debug("Waiting to exit scheduler...")
            async with self.scheduler.exitCondition:
                while not self.exitMode:
                    # If we have still unfinished jobs or possible new tasks, wait
                    logger.debug(
                        "Checking exit condition: unfinished jobs=%d, task output queue size=%d",
                        self.unfinishedJobs,
                        self.taskOutputQueueSize,
                    )
                    if self.unfinishedJobs == 0 and self.taskOutputQueueSize == 0:
                        break

                    # Wait for more news...
                    await self.scheduler.exitCondition.wait()

                if self.failedJobs:
                    # Show some more information
                    from experimaestro.scheduler.jobs import (
                        JobStateError,
                        JobFailureStatus,
                    )

                    count = 0
                    for job in self.failedJobs.values():
                        # Skip dependency failures - only log direct failures.
                        # A failed job whose state is not a JobStateError
                        # should not happen, but is counted anyway.
                        is_dependency_failure = (
                            isinstance(job.state, JobStateError)
                            and job.state.failure_reason
                            == JobFailureStatus.DEPENDENCY
                        )
                        if is_dependency_failure:
                            continue

                        count += 1
                        logger.error(
                            "Job %s failed, check the log file %s",
                            job.relpath,
                            job.stderr,
                        )
                    raise FailedExperiment(f"{count} failed jobs")

        # The coroutine runs in the scheduler loop; block on its result here
        future = asyncio.run_coroutine_threadsafe(awaitcompletion(), self.loop)
        return future.result()

    def setenv(self, name, value, override=True):
        """Shortcut to set the environment value

        :param name: name of the environment variable
        :param value: value to set
        :param override: when False, an already defined variable is left untouched
        """
        if override or name not in self.workspace.env:
            logging.info("Setting environment: %s=%s", name, value)
            self.workspace.env[name] = value

    def token(self, name: str, count: int):
        """Returns a token for this experiment

        The token is the default token of the workspace connector"""
        return self.workspace.connector.createtoken(name, count)

    def __enter__(self):
        """Lock the experiment, register it and make it the current one

        :return: this experiment
        """
        from .dynamic_outputs import TaskOutputsWorker
        from experimaestro.utils.environment import save_environment_info

        if self.workspace.run_mode != RunMode.DRY_RUN:
            logger.info("Locking experiment %s", self.xplockpath)
            self.xplock = self.workspace.connector.lock(self.xplockpath, 0).__enter__()
            logger.info("Experiment locked")

        if self.workspace.run_mode == RunMode.NORMAL:
            # Capture and save environment info (git info for editable packages + all package versions)
            env_info_path = self.workdir / "environment.json"
            save_environment_info(env_info_path)

            # Move old jobs into "jobs.bak"
            self.jobsbakpath.mkdir(exist_ok=True)
            for p in self.jobspath.glob("*/*"):
                if p.is_symlink():
                    target = self.jobsbakpath / p.relative_to(self.jobspath)
                    if target.is_symlink():
                        # Remove if duplicate
                        p.unlink()
                    else:
                        # Rename otherwise
                        target.parent.mkdir(parents=True, exist_ok=True)
                        p.rename(target)

        # Register experiment with scheduler
        self.scheduler.register_experiment(self)

        # Start server via scheduler if needed
        if self._needs_server:
            self.scheduler.start_server(self._server_settings, workspace=self.workspace)

        self.workspace.__enter__()
        (self.workspace.path / ".__experimaestro__").touch()

        # Initialize workspace state provider (singleton per workspace path)
        from .state_provider import WorkspaceStateProvider

        self.state_provider = WorkspaceStateProvider.get_instance(
            self.workspace.path,
            read_only=False,
            sync_on_start=False,  # Experiments don't sync on start
        )

        # Register experiment in database and create a run
        experiment_id = self.workdir.name
        self.state_provider.ensure_experiment(experiment_id)
        self.run_id = self.state_provider.create_run(experiment_id)

        # Add database listener to update job state in database
        self._db_listener = DatabaseListener(
            self.state_provider, experiment_id, self.run_id
        )
        self.scheduler.addlistener(self._db_listener)

        # Number of unfinished jobs
        self.unfinishedJobs = 0
        self.taskOutputQueueSize = 0

        # List of failed jobs
        self.failedJobs: Dict[str, Job] = {}

        # Exit mode when catching signals
        self.exitMode = False

        # Note: scheduler is already running as singleton
        self.taskOutputsWorker = TaskOutputsWorker(self)
        self.taskOutputsWorker.start()

        if self._register_signals:
            SIGNAL_HANDLER.add(self)

        # Remember the previous experiment so that __exit__ can restore it
        self.old_experiment = experiment.CURRENT
        experiment.CURRENT = self
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Wait for the jobs (unless an exception was thrown) and clean up

        The run is recorded as ``failed`` in the database when the ``with``
        body raised *or* when waiting for the jobs raised (e.g.
        :class:`FailedExperiment`); it is recorded as ``completed`` otherwise.
        """
        logger.debug("Exiting scheduler context")
        # If no exception and normal run mode, remove old "jobs"
        if self.workspace.run_mode == RunMode.NORMAL:
            if exc_type is None and self.jobsbakpath.is_dir():
                rmtree(self.jobsbakpath)

        # `exc_type` only describes the `with` body: a failure of wait() must
        # be tracked separately so that the run is not marked as completed
        failed = exc_type is not None

        # Close the different locks
        try:
            if exc_type:
                logger.error(
                    "Not waiting since an exception was thrown"
                    " (some jobs may be running)"
                )
            else:
                try:
                    self.wait()
                except BaseException:
                    failed = True
                    raise
        finally:
            if self._register_signals:
                SIGNAL_HANDLER.remove(self)

            # Stop services
            for service in self.services.values():
                logger.info("Closing service %s", service.description())
                service.stop()

            # Unregister experiment from scheduler
            self.scheduler.unregister_experiment(self)

            # Remove database listener
            self.scheduler.removelistener(self._db_listener)

            # Mark run as completed in database
            experiment_id = self.workdir.name
            status = "failed" if failed else "completed"
            self.state_provider.complete_run(experiment_id, self.run_id, status)

            # Note: Don't stop scheduler - it's shared!
            # Note: Don't stop server - it runs in daemon mode until program exit

            if self.taskOutputsWorker is not None:
                logger.info("Stopping tasks outputs worker")
                # `None` is the item put on the worker queue to stop it
                self.taskOutputsWorker.queue.put(None)

            self.workspace.__exit__(exc_type, exc_value, traceback)
            if self.xplock:
                self.xplock.__exit__(exc_type, exc_value, traceback)

            # Put back old experiment as current one
            experiment.CURRENT = self.old_experiment

            if self.workspace.run_mode == RunMode.NORMAL:
                # Write the state
                logging.info("Saving the experiment state")
                from experimaestro.scheduler.state import ExperimentState

                ExperimentState.save(
                    self.workdir / "state.json", self.scheduler.jobs.values()
                )

    async def update_task_output_count(self, delta: int):
        """Change in the number of task outputs to process

        :param delta: value added to the number of pending task outputs
        """
        async with self.scheduler.exitCondition:
            self.taskOutputQueueSize += delta
            logging.debug(
                "Updating queue size with %d => %d", delta, self.taskOutputQueueSize
            )
            # wait() can only exit when the queue is empty: notify it
            if self.taskOutputQueueSize == 0:
                self.scheduler.exitCondition.notify_all()

    def watch_output(self, watched: "WatchedOutput"):
        """Watch an output

        :param watched: The watched output specification
        """

        self.taskOutputsWorker.watch_output(watched)

    def add_service(self, service: ServiceClass) -> ServiceClass:
        """Adds a service (e.g. tensorboard viewer) to the experiment

        :param service: A service instance
        :return: The same service instance
        """
        self.services[service.id] = service

        # Register database listener for state changes
        service.add_listener(self._db_listener)

        # Register file listener for state changes (writes to services.json)
        service.add_listener(self)

        self.scheduler.notify_service_add(service)

        # Write services.json file
        self._write_services_json()

        return service

    def service_state_changed(self, service):
        """Called when a service state changes - update services.json"""
        self._write_services_json()

    def save(self, obj: Any, name: str = "default"):
        """Serializes configurations.

        Saves configuration objects within the experimental directory
        (only when the run mode is normal)

        :param obj: The object to save
        :param name: The name of the saving directory (default to `default`)
        """

        if self.workspace.run_mode == RunMode.NORMAL:
            from experimaestro import save

            save_dir = self.workdir / "data" / name
            save_dir.mkdir(exist_ok=True, parents=True)

            save(obj, save_dir)

    def load(self, reference: str, name: str = "default"):
        """Deserializes configurations.

        Loads configuration objects from an experimental directory

        :param reference: The name of the experiment
        :param name: The name of the saving directory (default to `default`)
        :return: The loaded object(s)
        """
        from experimaestro import load

        path = self.workspace.experimentspath / reference / "data" / name
        return load(path)
|
|
579
|
+
|
|
580
|
+
|
|
581
|
+
# re-export at the module level
|
|
582
|
+
# Module-level alias: `current()` returns experiment.CURRENT (asserting it is set)
current = experiment.current
|