experimaestro 2.0.0a8__py3-none-any.whl → 2.0.0b8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of experimaestro might be problematic. Click here for more details.
- experimaestro/__init__.py +10 -11
- experimaestro/annotations.py +167 -206
- experimaestro/cli/__init__.py +278 -7
- experimaestro/cli/filter.py +42 -74
- experimaestro/cli/jobs.py +157 -106
- experimaestro/cli/refactor.py +249 -0
- experimaestro/click.py +0 -1
- experimaestro/commandline.py +19 -3
- experimaestro/connectors/__init__.py +20 -1
- experimaestro/connectors/local.py +12 -0
- experimaestro/core/arguments.py +182 -46
- experimaestro/core/identifier.py +107 -6
- experimaestro/core/objects/__init__.py +6 -0
- experimaestro/core/objects/config.py +542 -25
- experimaestro/core/objects/config_walk.py +20 -0
- experimaestro/core/serialization.py +91 -34
- experimaestro/core/subparameters.py +164 -0
- experimaestro/core/types.py +175 -38
- experimaestro/exceptions.py +26 -0
- experimaestro/experiments/cli.py +111 -25
- experimaestro/generators.py +50 -9
- experimaestro/huggingface.py +3 -1
- experimaestro/launcherfinder/parser.py +29 -0
- experimaestro/launchers/__init__.py +26 -1
- experimaestro/launchers/direct.py +12 -0
- experimaestro/launchers/slurm/base.py +154 -2
- experimaestro/mkdocs/metaloader.py +0 -1
- experimaestro/mypy.py +452 -7
- experimaestro/notifications.py +63 -13
- experimaestro/progress.py +0 -2
- experimaestro/rpyc.py +0 -1
- experimaestro/run.py +19 -6
- experimaestro/scheduler/base.py +510 -125
- experimaestro/scheduler/dependencies.py +43 -28
- experimaestro/scheduler/dynamic_outputs.py +259 -130
- experimaestro/scheduler/experiment.py +256 -31
- experimaestro/scheduler/interfaces.py +501 -0
- experimaestro/scheduler/jobs.py +216 -206
- experimaestro/scheduler/remote/__init__.py +31 -0
- experimaestro/scheduler/remote/client.py +874 -0
- experimaestro/scheduler/remote/protocol.py +467 -0
- experimaestro/scheduler/remote/server.py +423 -0
- experimaestro/scheduler/remote/sync.py +144 -0
- experimaestro/scheduler/services.py +323 -23
- experimaestro/scheduler/state_db.py +437 -0
- experimaestro/scheduler/state_provider.py +2766 -0
- experimaestro/scheduler/state_sync.py +891 -0
- experimaestro/scheduler/workspace.py +52 -10
- experimaestro/scriptbuilder.py +7 -0
- experimaestro/server/__init__.py +147 -57
- experimaestro/server/data/index.css +0 -125
- experimaestro/server/data/index.css.map +1 -1
- experimaestro/server/data/index.js +194 -58
- experimaestro/server/data/index.js.map +1 -1
- experimaestro/settings.py +44 -5
- experimaestro/sphinx/__init__.py +3 -3
- experimaestro/taskglobals.py +20 -0
- experimaestro/tests/conftest.py +80 -0
- experimaestro/tests/core/test_generics.py +2 -2
- experimaestro/tests/identifier_stability.json +45 -0
- experimaestro/tests/launchers/bin/sacct +6 -2
- experimaestro/tests/launchers/bin/sbatch +4 -2
- experimaestro/tests/launchers/test_slurm.py +80 -0
- experimaestro/tests/tasks/test_dynamic.py +231 -0
- experimaestro/tests/test_cli_jobs.py +615 -0
- experimaestro/tests/test_deprecated.py +630 -0
- experimaestro/tests/test_environment.py +200 -0
- experimaestro/tests/test_file_progress_integration.py +1 -1
- experimaestro/tests/test_forward.py +3 -3
- experimaestro/tests/test_identifier.py +372 -41
- experimaestro/tests/test_identifier_stability.py +458 -0
- experimaestro/tests/test_instance.py +3 -3
- experimaestro/tests/test_multitoken.py +442 -0
- experimaestro/tests/test_mypy.py +433 -0
- experimaestro/tests/test_objects.py +312 -5
- experimaestro/tests/test_outputs.py +2 -2
- experimaestro/tests/test_param.py +8 -12
- experimaestro/tests/test_partial_paths.py +231 -0
- experimaestro/tests/test_progress.py +0 -48
- experimaestro/tests/test_remote_state.py +671 -0
- experimaestro/tests/test_resumable_task.py +480 -0
- experimaestro/tests/test_serializers.py +141 -1
- experimaestro/tests/test_state_db.py +434 -0
- experimaestro/tests/test_subparameters.py +160 -0
- experimaestro/tests/test_tags.py +136 -0
- experimaestro/tests/test_tasks.py +107 -121
- experimaestro/tests/test_token_locking.py +252 -0
- experimaestro/tests/test_tokens.py +17 -13
- experimaestro/tests/test_types.py +123 -1
- experimaestro/tests/test_workspace_triggers.py +158 -0
- experimaestro/tests/token_reschedule.py +4 -2
- experimaestro/tests/utils.py +2 -2
- experimaestro/tokens.py +154 -57
- experimaestro/tools/diff.py +1 -1
- experimaestro/tui/__init__.py +8 -0
- experimaestro/tui/app.py +2395 -0
- experimaestro/tui/app.tcss +353 -0
- experimaestro/tui/log_viewer.py +228 -0
- experimaestro/utils/__init__.py +23 -0
- experimaestro/utils/environment.py +148 -0
- experimaestro/utils/git.py +129 -0
- experimaestro/utils/resources.py +1 -1
- experimaestro/version.py +34 -0
- {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/METADATA +68 -38
- experimaestro-2.0.0b8.dist-info/RECORD +187 -0
- {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/WHEEL +1 -1
- experimaestro-2.0.0b8.dist-info/entry_points.txt +16 -0
- experimaestro/compat.py +0 -6
- experimaestro/core/objects.pyi +0 -221
- experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
- experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
- experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
- experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
- experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
- experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
- experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
- experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
- experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
- experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
- experimaestro-2.0.0a8.dist-info/RECORD +0 -166
- experimaestro-2.0.0a8.dist-info/entry_points.txt +0 -17
- {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
import asyncio
|
|
2
|
+
import json
|
|
2
3
|
import logging
|
|
3
4
|
import os
|
|
4
5
|
from pathlib import Path
|
|
6
|
+
import time
|
|
5
7
|
from shutil import rmtree
|
|
6
8
|
from typing import Any, Dict, Optional, TypeVar, Union
|
|
7
9
|
|
|
@@ -9,7 +11,7 @@ from experimaestro.core.objects import WatchedOutput
|
|
|
9
11
|
from experimaestro.exceptions import HandledException
|
|
10
12
|
|
|
11
13
|
from experimaestro.scheduler.signal_handler import SIGNAL_HANDLER
|
|
12
|
-
from experimaestro.scheduler.jobs import Job
|
|
14
|
+
from experimaestro.scheduler.jobs import Job
|
|
13
15
|
from experimaestro.scheduler.services import Service
|
|
14
16
|
from experimaestro.scheduler.workspace import RunMode, Workspace
|
|
15
17
|
from experimaestro.settings import WorkspaceSettings, get_settings
|
|
@@ -24,15 +26,54 @@ class FailedExperiment(HandledException):
|
|
|
24
26
|
pass
|
|
25
27
|
|
|
26
28
|
|
|
29
|
+
class DatabaseListener:
|
|
30
|
+
"""Listener that updates job state in the database"""
|
|
31
|
+
|
|
32
|
+
def __init__(self, state_provider, experiment_id: str, run_id: str):
|
|
33
|
+
self.state_provider = state_provider
|
|
34
|
+
self.experiment_id = experiment_id
|
|
35
|
+
self.run_id = run_id
|
|
36
|
+
|
|
37
|
+
def job_submitted(self, job):
|
|
38
|
+
# Already handled in experiment.add_job()
|
|
39
|
+
pass
|
|
40
|
+
|
|
41
|
+
def job_state(self, job):
|
|
42
|
+
"""Update job state in database"""
|
|
43
|
+
self.state_provider.update_job_state(job, self.experiment_id, self.run_id)
|
|
44
|
+
|
|
45
|
+
def service_add(self, service):
|
|
46
|
+
"""Register service in database"""
|
|
47
|
+
from experimaestro.scheduler.services import Service
|
|
48
|
+
|
|
49
|
+
state_dict = Service.serialize_state_dict(service._full_state_dict())
|
|
50
|
+
self.state_provider.register_service(
|
|
51
|
+
service.id,
|
|
52
|
+
self.experiment_id,
|
|
53
|
+
self.run_id,
|
|
54
|
+
service.description(),
|
|
55
|
+
state_dict=json.dumps(state_dict),
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
def service_state_changed(self, service):
|
|
59
|
+
"""Called when service state changes (runtime only, not persisted)"""
|
|
60
|
+
# Service state is managed at runtime, not persisted to DB
|
|
61
|
+
pass
|
|
62
|
+
|
|
63
|
+
|
|
27
64
|
class experiment:
|
|
28
|
-
"""
|
|
65
|
+
"""Context manager for running experiments.
|
|
66
|
+
|
|
67
|
+
Creates a workspace, manages task submission, and optionally starts
|
|
68
|
+
a web server for monitoring.
|
|
29
69
|
|
|
30
|
-
|
|
70
|
+
Example::
|
|
31
71
|
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
72
|
+
from experimaestro import experiment
|
|
73
|
+
|
|
74
|
+
with experiment("./workdir", "my-experiment", port=12345) as xp:
|
|
75
|
+
task = MyTask.C(param=42).submit()
|
|
76
|
+
result = task.wait()
|
|
36
77
|
"""
|
|
37
78
|
|
|
38
79
|
#: Current experiment
|
|
@@ -57,6 +98,7 @@ class experiment:
|
|
|
57
98
|
token: Optional[str] = None,
|
|
58
99
|
run_mode: Optional[RunMode] = None,
|
|
59
100
|
launcher=None,
|
|
101
|
+
register_signals: bool = True,
|
|
60
102
|
):
|
|
61
103
|
"""
|
|
62
104
|
:param env: an environment -- or a working directory for a local
|
|
@@ -73,9 +115,11 @@ class experiment:
|
|
|
73
115
|
|
|
74
116
|
:param run_mode: The run mode for the experiment (normal, generate run
|
|
75
117
|
files, dry run)
|
|
118
|
+
|
|
119
|
+
:param register_signals: Whether to register signal handlers (default: True).
|
|
120
|
+
Set to False when running in a background thread.
|
|
76
121
|
"""
|
|
77
122
|
|
|
78
|
-
from experimaestro.server import Server
|
|
79
123
|
from experimaestro.scheduler import Listener, Scheduler
|
|
80
124
|
|
|
81
125
|
settings = get_settings()
|
|
@@ -94,6 +138,7 @@ class experiment:
|
|
|
94
138
|
self.old_experiment = None
|
|
95
139
|
self.services: Dict[str, Service] = {}
|
|
96
140
|
self._job_listener: Optional[Listener] = None
|
|
141
|
+
self._register_signals = register_signals
|
|
97
142
|
|
|
98
143
|
# Get configuration settings
|
|
99
144
|
|
|
@@ -106,14 +151,14 @@ class experiment:
|
|
|
106
151
|
if token is not None:
|
|
107
152
|
settings.server.token = token
|
|
108
153
|
|
|
109
|
-
#
|
|
110
|
-
self.scheduler = Scheduler.
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
and
|
|
115
|
-
|
|
116
|
-
|
|
154
|
+
# Use singleton scheduler
|
|
155
|
+
self.scheduler = Scheduler.instance()
|
|
156
|
+
|
|
157
|
+
# Determine if we need a server
|
|
158
|
+
self._needs_server = (
|
|
159
|
+
settings.server.port is not None and settings.server.port >= 0
|
|
160
|
+
) and self.workspace.run_mode == RunMode.NORMAL
|
|
161
|
+
self._server_settings = settings.server if self._needs_server else None
|
|
117
162
|
|
|
118
163
|
if os.environ.get("XPM_ENABLEFAULTHANDLER", "0") == "1":
|
|
119
164
|
import faulthandler
|
|
@@ -137,6 +182,11 @@ class experiment:
|
|
|
137
182
|
assert self.scheduler is not None, "No scheduler defined"
|
|
138
183
|
return self.scheduler.loop
|
|
139
184
|
|
|
185
|
+
@property
|
|
186
|
+
def server(self):
|
|
187
|
+
"""Access the server via the scheduler"""
|
|
188
|
+
return self.scheduler.server if self.scheduler else None
|
|
189
|
+
|
|
140
190
|
@property
|
|
141
191
|
def resultspath(self):
|
|
142
192
|
"""Return the directory in which results can be stored for this experiment"""
|
|
@@ -158,6 +208,83 @@ class experiment:
|
|
|
158
208
|
"""Return the directory in which results can be stored for this experiment"""
|
|
159
209
|
return self.workdir / "jobs.bak"
|
|
160
210
|
|
|
211
|
+
@property
|
|
212
|
+
def jobs_jsonl_path(self):
|
|
213
|
+
"""Return the path to the jobs.jsonl file for this experiment"""
|
|
214
|
+
return self.workdir / "jobs.jsonl"
|
|
215
|
+
|
|
216
|
+
@property
|
|
217
|
+
def services_json_path(self):
|
|
218
|
+
"""Return the path to the services.json file for this experiment"""
|
|
219
|
+
return self.workdir / "services.json"
|
|
220
|
+
|
|
221
|
+
def _write_services_json(self):
|
|
222
|
+
"""Write all services to services.json file"""
|
|
223
|
+
from experimaestro.scheduler.services import Service
|
|
224
|
+
|
|
225
|
+
services_data = {}
|
|
226
|
+
for service_id, service in self.services.items():
|
|
227
|
+
# Get state_dict from service (includes __class__ for recreation)
|
|
228
|
+
# and serialize paths to JSON-compatible format
|
|
229
|
+
service_state = Service.serialize_state_dict(service._full_state_dict())
|
|
230
|
+
# Add runtime state info
|
|
231
|
+
service_state.update(
|
|
232
|
+
{
|
|
233
|
+
"service_id": service_id,
|
|
234
|
+
"description": service.description(),
|
|
235
|
+
"state": service.state.name,
|
|
236
|
+
"url": getattr(service, "url", None),
|
|
237
|
+
"timestamp": time.time(),
|
|
238
|
+
}
|
|
239
|
+
)
|
|
240
|
+
services_data[service_id] = service_state
|
|
241
|
+
|
|
242
|
+
with self.services_json_path.open("w") as f:
|
|
243
|
+
json.dump(services_data, f, indent=2)
|
|
244
|
+
|
|
245
|
+
def add_job(self, job: "Job"):
|
|
246
|
+
"""Register a job and its tags to jobs.jsonl file and database
|
|
247
|
+
|
|
248
|
+
Note: For NEW jobs, the unfinishedJobs counter is updated by
|
|
249
|
+
job.set_state() when the state transitions from UNSCHEDULED.
|
|
250
|
+
For jobs already running, we increment here since no state
|
|
251
|
+
transition will occur.
|
|
252
|
+
"""
|
|
253
|
+
from experimaestro.scheduler.interfaces import JobState
|
|
254
|
+
|
|
255
|
+
if self in job.experiments:
|
|
256
|
+
# Do not double register
|
|
257
|
+
return
|
|
258
|
+
|
|
259
|
+
# Track which experiments this job belongs to
|
|
260
|
+
job.experiments.append(self)
|
|
261
|
+
|
|
262
|
+
# If job is already being tracked (not UNSCHEDULED and not finished),
|
|
263
|
+
# increment unfinishedJobs since no state transition will trigger it
|
|
264
|
+
if job.state != JobState.UNSCHEDULED and not job.state.finished():
|
|
265
|
+
self.unfinishedJobs += 1
|
|
266
|
+
logging.debug(
|
|
267
|
+
"Job %s already running, unfinished jobs for %s: %d",
|
|
268
|
+
job.identifier[:8],
|
|
269
|
+
self.workdir.name,
|
|
270
|
+
self.unfinishedJobs,
|
|
271
|
+
)
|
|
272
|
+
|
|
273
|
+
record = {
|
|
274
|
+
"job_id": job.identifier,
|
|
275
|
+
"task_id": str(job.type.identifier),
|
|
276
|
+
"tags": dict(job.tags.items()) if job.tags else {},
|
|
277
|
+
"timestamp": time.time(),
|
|
278
|
+
}
|
|
279
|
+
|
|
280
|
+
with self.jobs_jsonl_path.open("a") as f:
|
|
281
|
+
f.write(json.dumps(record) + "\n")
|
|
282
|
+
|
|
283
|
+
# Also register in database for TUI/monitoring (only in NORMAL mode)
|
|
284
|
+
if self._db_listener is not None:
|
|
285
|
+
experiment_id = self.workdir.name
|
|
286
|
+
self.state_provider.update_job_submitted(job, experiment_id, self.run_id)
|
|
287
|
+
|
|
161
288
|
def stop(self):
|
|
162
289
|
"""Stop the experiment as soon as possible"""
|
|
163
290
|
|
|
@@ -196,9 +323,24 @@ class experiment:
|
|
|
196
323
|
|
|
197
324
|
if self.failedJobs:
|
|
198
325
|
# Show some more information
|
|
326
|
+
from experimaestro.scheduler.jobs import (
|
|
327
|
+
JobStateError,
|
|
328
|
+
JobFailureStatus,
|
|
329
|
+
)
|
|
330
|
+
|
|
199
331
|
count = 0
|
|
200
332
|
for job in self.failedJobs.values():
|
|
201
|
-
|
|
333
|
+
# Skip dependency failures - only log direct failures
|
|
334
|
+
if isinstance(job.state, JobStateError):
|
|
335
|
+
if job.state.failure_reason != JobFailureStatus.DEPENDENCY:
|
|
336
|
+
count += 1
|
|
337
|
+
logger.error(
|
|
338
|
+
"Job %s failed, check the log file %s",
|
|
339
|
+
job.relpath,
|
|
340
|
+
job.stderr,
|
|
341
|
+
)
|
|
342
|
+
else:
|
|
343
|
+
# Should not happen, but count it anyway
|
|
202
344
|
count += 1
|
|
203
345
|
logger.error(
|
|
204
346
|
"Job %s failed, check the log file %s",
|
|
@@ -224,12 +366,18 @@ class experiment:
|
|
|
224
366
|
|
|
225
367
|
def __enter__(self):
|
|
226
368
|
from .dynamic_outputs import TaskOutputsWorker
|
|
369
|
+
from experimaestro.utils.environment import save_environment_info
|
|
227
370
|
|
|
228
371
|
if self.workspace.run_mode != RunMode.DRY_RUN:
|
|
229
372
|
logger.info("Locking experiment %s", self.xplockpath)
|
|
230
373
|
self.xplock = self.workspace.connector.lock(self.xplockpath, 0).__enter__()
|
|
231
374
|
logger.info("Experiment locked")
|
|
232
375
|
|
|
376
|
+
# Capture and save environment info (git info for editable packages + all package versions)
|
|
377
|
+
if self.workspace.run_mode == RunMode.NORMAL:
|
|
378
|
+
env_info_path = self.workdir / "environment.json"
|
|
379
|
+
save_environment_info(env_info_path)
|
|
380
|
+
|
|
233
381
|
# Move old jobs into "jobs.bak"
|
|
234
382
|
if self.workspace.run_mode == RunMode.NORMAL:
|
|
235
383
|
self.jobsbakpath.mkdir(exist_ok=True)
|
|
@@ -244,12 +392,43 @@ class experiment:
|
|
|
244
392
|
target.parent.mkdir(parents=True, exist_ok=True)
|
|
245
393
|
p.rename(target)
|
|
246
394
|
|
|
247
|
-
|
|
248
|
-
|
|
395
|
+
# Register experiment with scheduler
|
|
396
|
+
self.scheduler.register_experiment(self)
|
|
397
|
+
|
|
398
|
+
# Start server via scheduler if needed
|
|
399
|
+
if self._needs_server:
|
|
400
|
+
self.scheduler.start_server(self._server_settings, workspace=self.workspace)
|
|
249
401
|
|
|
250
402
|
self.workspace.__enter__()
|
|
251
403
|
(self.workspace.path / ".__experimaestro__").touch()
|
|
252
404
|
|
|
405
|
+
# Initialize workspace state provider (singleton per workspace path)
|
|
406
|
+
# Use read_only mode when not in NORMAL run mode to prevent DB changes
|
|
407
|
+
from .state_provider import WorkspaceStateProvider
|
|
408
|
+
|
|
409
|
+
is_normal_mode = self.workspace.run_mode == RunMode.NORMAL
|
|
410
|
+
self.state_provider = WorkspaceStateProvider.get_instance(
|
|
411
|
+
self.workspace.path,
|
|
412
|
+
read_only=not is_normal_mode,
|
|
413
|
+
sync_on_start=False, # Experiments don't sync on start
|
|
414
|
+
)
|
|
415
|
+
|
|
416
|
+
# Register experiment in database and create a run (only in NORMAL mode)
|
|
417
|
+
experiment_id = self.workdir.name
|
|
418
|
+
self._db_listener = None
|
|
419
|
+
if is_normal_mode:
|
|
420
|
+
self.state_provider.ensure_experiment(experiment_id)
|
|
421
|
+
self.run_id = self.state_provider.create_run(experiment_id)
|
|
422
|
+
|
|
423
|
+
# Add database listener to update job state in database
|
|
424
|
+
self._db_listener = DatabaseListener(
|
|
425
|
+
self.state_provider, experiment_id, self.run_id
|
|
426
|
+
)
|
|
427
|
+
self.scheduler.addlistener(self._db_listener)
|
|
428
|
+
else:
|
|
429
|
+
# In non-NORMAL modes, use a placeholder run_id
|
|
430
|
+
self.run_id = None
|
|
431
|
+
|
|
253
432
|
# Number of unfinished jobs
|
|
254
433
|
self.unfinishedJobs = 0
|
|
255
434
|
self.taskOutputQueueSize = 0
|
|
@@ -260,11 +439,12 @@ class experiment:
|
|
|
260
439
|
# Exit mode when catching signals
|
|
261
440
|
self.exitMode = False
|
|
262
441
|
|
|
263
|
-
|
|
442
|
+
# Note: scheduler is already running as singleton
|
|
264
443
|
self.taskOutputsWorker = TaskOutputsWorker(self)
|
|
265
444
|
self.taskOutputsWorker.start()
|
|
266
445
|
|
|
267
|
-
|
|
446
|
+
if self._register_signals:
|
|
447
|
+
SIGNAL_HANDLER.add(self)
|
|
268
448
|
|
|
269
449
|
self.old_experiment = experiment.CURRENT
|
|
270
450
|
experiment.CURRENT = self
|
|
@@ -288,17 +468,33 @@ class experiment:
|
|
|
288
468
|
)
|
|
289
469
|
else:
|
|
290
470
|
self.wait()
|
|
471
|
+
|
|
472
|
+
# Wait for all pending notifications to be processed
|
|
473
|
+
# before removing listeners
|
|
474
|
+
self.scheduler.wait_for_notifications()
|
|
291
475
|
finally:
|
|
292
|
-
|
|
476
|
+
if self._register_signals:
|
|
477
|
+
SIGNAL_HANDLER.remove(self)
|
|
293
478
|
|
|
294
479
|
# Stop services
|
|
295
480
|
for service in self.services.values():
|
|
296
481
|
logger.info("Closing service %s", service.description())
|
|
297
482
|
service.stop()
|
|
298
483
|
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
484
|
+
# Unregister experiment from scheduler
|
|
485
|
+
self.scheduler.unregister_experiment(self)
|
|
486
|
+
|
|
487
|
+
# Remove database listener and mark run as completed (only in NORMAL mode)
|
|
488
|
+
if self._db_listener is not None:
|
|
489
|
+
self.scheduler.removelistener(self._db_listener)
|
|
490
|
+
|
|
491
|
+
# Mark run as completed in database
|
|
492
|
+
experiment_id = self.workdir.name
|
|
493
|
+
status = "failed" if exc_type else "completed"
|
|
494
|
+
self.state_provider.complete_run(experiment_id, self.run_id, status)
|
|
495
|
+
|
|
496
|
+
# Note: Don't stop scheduler - it's shared!
|
|
497
|
+
# Note: Don't stop server - it runs in daemon mode until program exit
|
|
302
498
|
|
|
303
499
|
if self.taskOutputsWorker is not None:
|
|
304
500
|
logger.info("Stopping tasks outputs worker")
|
|
@@ -310,9 +506,6 @@ class experiment:
|
|
|
310
506
|
|
|
311
507
|
# Put back old experiment as current one
|
|
312
508
|
experiment.CURRENT = self.old_experiment
|
|
313
|
-
if self.server:
|
|
314
|
-
logger.info("Stopping web server")
|
|
315
|
-
self.server.stop()
|
|
316
509
|
|
|
317
510
|
if self.workspace.run_mode == RunMode.NORMAL:
|
|
318
511
|
# Write the state
|
|
@@ -345,13 +538,45 @@ class experiment:
|
|
|
345
538
|
"""Adds a service (e.g. tensorboard viewer) to the experiment
|
|
346
539
|
|
|
347
540
|
:param service: A service instance
|
|
348
|
-
:return: The same service instance
|
|
541
|
+
:return: The same service instance (or existing service if already added)
|
|
349
542
|
"""
|
|
543
|
+
existing = self.services.get(service.id)
|
|
544
|
+
if existing is not None:
|
|
545
|
+
if existing is service:
|
|
546
|
+
# Same service instance added twice - just return it
|
|
547
|
+
logger.debug("Service %s already added, ignoring duplicate", service.id)
|
|
548
|
+
return service
|
|
549
|
+
else:
|
|
550
|
+
# Different service with same id - warn and replace
|
|
551
|
+
logger.warning(
|
|
552
|
+
"Replacing service %s (old id=%s, new id=%s)",
|
|
553
|
+
service.id,
|
|
554
|
+
id(existing),
|
|
555
|
+
id(service),
|
|
556
|
+
)
|
|
557
|
+
|
|
350
558
|
self.services[service.id] = service
|
|
351
|
-
|
|
352
|
-
|
|
559
|
+
|
|
560
|
+
# Allow service to access experiment context
|
|
561
|
+
service.set_experiment(self)
|
|
562
|
+
|
|
563
|
+
# Register database listener for state changes
|
|
564
|
+
service.add_listener(self._db_listener)
|
|
565
|
+
|
|
566
|
+
# Register file listener for state changes (writes to services.json)
|
|
567
|
+
service.add_listener(self)
|
|
568
|
+
|
|
569
|
+
self.scheduler.notify_service_add(service)
|
|
570
|
+
|
|
571
|
+
# Write services.json file
|
|
572
|
+
self._write_services_json()
|
|
573
|
+
|
|
353
574
|
return service
|
|
354
575
|
|
|
576
|
+
def service_state_changed(self, service):
|
|
577
|
+
"""Called when a service state changes - update services.json"""
|
|
578
|
+
self._write_services_json()
|
|
579
|
+
|
|
355
580
|
def save(self, obj: Any, name: str = "default"):
|
|
356
581
|
"""Serializes configurations.
|
|
357
582
|
|