experimaestro 1.11.1__py3-none-any.whl → 2.0.0b4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of experimaestro might be problematic. Click here for more details.
- experimaestro/__init__.py +10 -11
- experimaestro/annotations.py +167 -206
- experimaestro/cli/__init__.py +140 -16
- experimaestro/cli/filter.py +42 -74
- experimaestro/cli/jobs.py +157 -106
- experimaestro/cli/progress.py +269 -0
- experimaestro/cli/refactor.py +249 -0
- experimaestro/click.py +0 -1
- experimaestro/commandline.py +19 -3
- experimaestro/connectors/__init__.py +22 -3
- experimaestro/connectors/local.py +12 -0
- experimaestro/core/arguments.py +192 -37
- experimaestro/core/identifier.py +127 -12
- experimaestro/core/objects/__init__.py +6 -0
- experimaestro/core/objects/config.py +702 -285
- experimaestro/core/objects/config_walk.py +24 -6
- experimaestro/core/serialization.py +91 -34
- experimaestro/core/serializers.py +1 -8
- experimaestro/core/subparameters.py +164 -0
- experimaestro/core/types.py +198 -83
- experimaestro/exceptions.py +26 -0
- experimaestro/experiments/cli.py +107 -25
- experimaestro/generators.py +50 -9
- experimaestro/huggingface.py +3 -1
- experimaestro/launcherfinder/parser.py +29 -0
- experimaestro/launcherfinder/registry.py +3 -3
- experimaestro/launchers/__init__.py +26 -1
- experimaestro/launchers/direct.py +12 -0
- experimaestro/launchers/slurm/base.py +154 -2
- experimaestro/mkdocs/base.py +6 -8
- experimaestro/mkdocs/metaloader.py +0 -1
- experimaestro/mypy.py +452 -7
- experimaestro/notifications.py +75 -16
- experimaestro/progress.py +404 -0
- experimaestro/rpyc.py +0 -1
- experimaestro/run.py +19 -6
- experimaestro/scheduler/__init__.py +18 -1
- experimaestro/scheduler/base.py +504 -959
- experimaestro/scheduler/dependencies.py +43 -28
- experimaestro/scheduler/dynamic_outputs.py +259 -130
- experimaestro/scheduler/experiment.py +582 -0
- experimaestro/scheduler/interfaces.py +474 -0
- experimaestro/scheduler/jobs.py +485 -0
- experimaestro/scheduler/services.py +186 -12
- experimaestro/scheduler/signal_handler.py +32 -0
- experimaestro/scheduler/state.py +1 -1
- experimaestro/scheduler/state_db.py +388 -0
- experimaestro/scheduler/state_provider.py +2345 -0
- experimaestro/scheduler/state_sync.py +834 -0
- experimaestro/scheduler/workspace.py +52 -10
- experimaestro/scriptbuilder.py +7 -0
- experimaestro/server/__init__.py +153 -32
- experimaestro/server/data/index.css +0 -125
- experimaestro/server/data/index.css.map +1 -1
- experimaestro/server/data/index.js +194 -58
- experimaestro/server/data/index.js.map +1 -1
- experimaestro/settings.py +47 -6
- experimaestro/sphinx/__init__.py +3 -3
- experimaestro/taskglobals.py +20 -0
- experimaestro/tests/conftest.py +80 -0
- experimaestro/tests/core/test_generics.py +2 -2
- experimaestro/tests/identifier_stability.json +45 -0
- experimaestro/tests/launchers/bin/sacct +6 -2
- experimaestro/tests/launchers/bin/sbatch +4 -2
- experimaestro/tests/launchers/common.py +2 -2
- experimaestro/tests/launchers/test_slurm.py +80 -0
- experimaestro/tests/restart.py +1 -1
- experimaestro/tests/tasks/all.py +7 -0
- experimaestro/tests/tasks/test_dynamic.py +231 -0
- experimaestro/tests/test_checkers.py +2 -2
- experimaestro/tests/test_cli_jobs.py +615 -0
- experimaestro/tests/test_dependencies.py +11 -17
- experimaestro/tests/test_deprecated.py +630 -0
- experimaestro/tests/test_environment.py +200 -0
- experimaestro/tests/test_experiment.py +3 -3
- experimaestro/tests/test_file_progress.py +425 -0
- experimaestro/tests/test_file_progress_integration.py +477 -0
- experimaestro/tests/test_forward.py +3 -3
- experimaestro/tests/test_generators.py +93 -0
- experimaestro/tests/test_identifier.py +520 -169
- experimaestro/tests/test_identifier_stability.py +458 -0
- experimaestro/tests/test_instance.py +16 -21
- experimaestro/tests/test_multitoken.py +442 -0
- experimaestro/tests/test_mypy.py +433 -0
- experimaestro/tests/test_objects.py +314 -30
- experimaestro/tests/test_outputs.py +8 -8
- experimaestro/tests/test_param.py +22 -26
- experimaestro/tests/test_partial_paths.py +231 -0
- experimaestro/tests/test_progress.py +2 -50
- experimaestro/tests/test_resumable_task.py +480 -0
- experimaestro/tests/test_serializers.py +141 -60
- experimaestro/tests/test_state_db.py +434 -0
- experimaestro/tests/test_subparameters.py +160 -0
- experimaestro/tests/test_tags.py +151 -15
- experimaestro/tests/test_tasks.py +137 -160
- experimaestro/tests/test_token_locking.py +252 -0
- experimaestro/tests/test_tokens.py +25 -19
- experimaestro/tests/test_types.py +133 -11
- experimaestro/tests/test_validation.py +19 -19
- experimaestro/tests/test_workspace_triggers.py +158 -0
- experimaestro/tests/token_reschedule.py +5 -3
- experimaestro/tests/utils.py +2 -2
- experimaestro/tokens.py +154 -57
- experimaestro/tools/diff.py +8 -1
- experimaestro/tui/__init__.py +8 -0
- experimaestro/tui/app.py +2303 -0
- experimaestro/tui/app.tcss +353 -0
- experimaestro/tui/log_viewer.py +228 -0
- experimaestro/typingutils.py +11 -2
- experimaestro/utils/__init__.py +23 -0
- experimaestro/utils/environment.py +148 -0
- experimaestro/utils/git.py +129 -0
- experimaestro/utils/resources.py +1 -1
- experimaestro/version.py +34 -0
- {experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info}/METADATA +70 -39
- experimaestro-2.0.0b4.dist-info/RECORD +181 -0
- {experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info}/WHEEL +1 -1
- experimaestro-2.0.0b4.dist-info/entry_points.txt +16 -0
- experimaestro/compat.py +0 -6
- experimaestro/core/objects.pyi +0 -225
- experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
- experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
- experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
- experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
- experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
- experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
- experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
- experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
- experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
- experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
- experimaestro-1.11.1.dist-info/RECORD +0 -158
- experimaestro-1.11.1.dist-info/entry_points.txt +0 -17
- {experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info/licenses}/LICENSE +0 -0
experimaestro/scheduler/base.py
CHANGED
|
@@ -1,503 +1,210 @@
|
|
|
1
|
-
from collections import ChainMap
|
|
2
|
-
from functools import cached_property
|
|
3
|
-
import itertools
|
|
4
|
-
import logging
|
|
5
|
-
import os
|
|
6
|
-
from pathlib import Path
|
|
7
|
-
from shutil import rmtree
|
|
8
1
|
import threading
|
|
9
2
|
import time
|
|
10
3
|
from typing import (
|
|
11
|
-
Any,
|
|
12
|
-
Iterator,
|
|
13
|
-
List,
|
|
14
4
|
Optional,
|
|
15
5
|
Set,
|
|
16
|
-
|
|
17
|
-
Union,
|
|
6
|
+
ClassVar,
|
|
18
7
|
TYPE_CHECKING,
|
|
19
8
|
)
|
|
20
|
-
import enum
|
|
21
|
-
import signal
|
|
22
9
|
import asyncio
|
|
23
|
-
from experimaestro.exceptions import HandledException
|
|
24
|
-
from experimaestro.notifications import LevelInformation, Reporter
|
|
25
10
|
from typing import Dict
|
|
11
|
+
|
|
12
|
+
from experimaestro.scheduler import experiment
|
|
13
|
+
from experimaestro.scheduler.jobs import Job, JobState, JobError
|
|
26
14
|
from experimaestro.scheduler.services import Service
|
|
27
|
-
from experimaestro.settings import WorkspaceSettings, get_settings
|
|
28
15
|
|
|
29
16
|
|
|
30
|
-
from experimaestro.core.objects import Config, ConfigWalkContext, WatchedOutput
|
|
31
17
|
from experimaestro.utils import logger
|
|
32
|
-
from experimaestro.locking import Locks, LockError, Lock
|
|
33
18
|
from experimaestro.utils.asyncio import asyncThreadcheck
|
|
34
|
-
from .workspace import RunMode, Workspace
|
|
35
|
-
from .dependencies import Dependency, DependencyStatus, Resource
|
|
36
19
|
import concurrent.futures
|
|
37
20
|
|
|
38
|
-
|
|
39
21
|
if TYPE_CHECKING:
|
|
40
|
-
from experimaestro.
|
|
41
|
-
from experimaestro.
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
class FailedExperiment(HandledException):
|
|
45
|
-
"""Raised when an experiment failed"""
|
|
46
|
-
|
|
47
|
-
pass
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
class JobState(enum.Enum):
|
|
51
|
-
# Job is not yet scheduled
|
|
52
|
-
UNSCHEDULED = 0
|
|
53
|
-
|
|
54
|
-
# Job is waiting for dependencies to be done
|
|
55
|
-
WAITING = 1
|
|
56
|
-
|
|
57
|
-
# Job is ready to run
|
|
58
|
-
READY = 2
|
|
59
|
-
|
|
60
|
-
# Job is scheduled (e.g. slurm)
|
|
61
|
-
SCHEDULED = 3
|
|
62
|
-
|
|
63
|
-
# Job is running
|
|
64
|
-
RUNNING = 4
|
|
65
|
-
|
|
66
|
-
# Job is done (finished)
|
|
67
|
-
DONE = 5
|
|
68
|
-
|
|
69
|
-
# Job failed (finished)
|
|
70
|
-
ERROR = 6
|
|
71
|
-
|
|
72
|
-
def notstarted(self):
|
|
73
|
-
return self.value <= JobState.READY.value
|
|
74
|
-
|
|
75
|
-
def running(self):
|
|
76
|
-
return (
|
|
77
|
-
self.value == JobState.RUNNING.value
|
|
78
|
-
or self.value == JobState.SCHEDULED.value
|
|
79
|
-
)
|
|
80
|
-
|
|
81
|
-
def finished(self):
|
|
82
|
-
return self.value >= JobState.DONE.value
|
|
22
|
+
from experimaestro.server import Server
|
|
23
|
+
from experimaestro.settings import ServerSettings
|
|
24
|
+
from experimaestro.scheduler.workspace import Workspace
|
|
83
25
|
|
|
84
26
|
|
|
85
|
-
class
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
#: Job dependency failed
|
|
90
|
-
FAILED = 1
|
|
91
|
-
|
|
92
|
-
#: Memory
|
|
93
|
-
MEMORY = 2
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
class JobLock(Lock):
|
|
97
|
-
def __init__(self, job):
|
|
98
|
-
super().__init__()
|
|
99
|
-
self.job = job
|
|
27
|
+
class Listener:
|
|
28
|
+
def job_submitted(self, job):
|
|
29
|
+
pass
|
|
100
30
|
|
|
101
|
-
def
|
|
102
|
-
|
|
31
|
+
def job_state(self, job):
|
|
32
|
+
pass
|
|
103
33
|
|
|
104
|
-
def
|
|
105
|
-
|
|
34
|
+
def service_add(self, service: Service):
|
|
35
|
+
"""Notify when a new service is added"""
|
|
36
|
+
pass
|
|
106
37
|
|
|
107
38
|
|
|
108
|
-
class
|
|
109
|
-
|
|
110
|
-
super().__init__(job)
|
|
39
|
+
class Scheduler(threading.Thread):
|
|
40
|
+
"""A job scheduler (singleton)
|
|
111
41
|
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
elif self.origin.state == JobState.ERROR:
|
|
116
|
-
return DependencyStatus.FAIL
|
|
117
|
-
return DependencyStatus.WAIT
|
|
42
|
+
The scheduler is based on asyncio for easy concurrency handling.
|
|
43
|
+
This is a singleton - only one scheduler instance exists per process.
|
|
44
|
+
"""
|
|
118
45
|
|
|
119
|
-
|
|
120
|
-
|
|
46
|
+
_instance: ClassVar[Optional["Scheduler"]] = None
|
|
47
|
+
_lock: ClassVar[threading.Lock] = threading.Lock()
|
|
121
48
|
|
|
49
|
+
def __init__(self, name: str = "Global"):
|
|
50
|
+
super().__init__(name=f"Scheduler ({name})", daemon=True)
|
|
51
|
+
self._ready = threading.Event()
|
|
122
52
|
|
|
123
|
-
|
|
124
|
-
|
|
53
|
+
# Name of the scheduler
|
|
54
|
+
self.name = name
|
|
125
55
|
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
_future: Optional["concurrent.futures.Future"]
|
|
56
|
+
# Track experiments (simple dict for now)
|
|
57
|
+
self.experiments: Dict[str, "experiment"] = {}
|
|
129
58
|
|
|
130
|
-
|
|
131
|
-
self
|
|
132
|
-
config: Config,
|
|
133
|
-
*,
|
|
134
|
-
workspace: Workspace = None,
|
|
135
|
-
launcher: "Launcher" = None,
|
|
136
|
-
run_mode: RunMode = RunMode.NORMAL,
|
|
137
|
-
):
|
|
138
|
-
super().__init__()
|
|
59
|
+
# Exit mode activated
|
|
60
|
+
self.exitmode = False
|
|
139
61
|
|
|
140
|
-
|
|
141
|
-
self.
|
|
62
|
+
# List of all jobs
|
|
63
|
+
self.jobs: Dict[str, "Job"] = {}
|
|
142
64
|
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
assert self.launcher is not None, (
|
|
146
|
-
"No launcher, and no default defined for the workspace %s" % workspace
|
|
147
|
-
)
|
|
65
|
+
# List of jobs
|
|
66
|
+
self.waitingjobs: Set[Job] = set()
|
|
148
67
|
|
|
149
|
-
|
|
150
|
-
self.
|
|
68
|
+
# Listeners with thread-safe access
|
|
69
|
+
self._listeners: Set[Listener] = set()
|
|
70
|
+
self._listeners_lock = threading.Lock()
|
|
151
71
|
|
|
152
|
-
|
|
153
|
-
self.
|
|
154
|
-
|
|
72
|
+
# Notification thread pool (single worker to serialize notifications)
|
|
73
|
+
self._notification_executor = concurrent.futures.ThreadPoolExecutor(
|
|
74
|
+
max_workers=1, thread_name_prefix="NotificationWorker"
|
|
75
|
+
)
|
|
155
76
|
|
|
156
|
-
|
|
157
|
-
self.
|
|
77
|
+
# Server (managed by scheduler)
|
|
78
|
+
self.server: Optional["Server"] = None
|
|
158
79
|
|
|
159
|
-
|
|
160
|
-
|
|
80
|
+
@staticmethod
|
|
81
|
+
def has_instance() -> bool:
|
|
82
|
+
"""Check if a scheduler instance exists without creating one"""
|
|
83
|
+
return Scheduler._instance is not None
|
|
161
84
|
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
85
|
+
@staticmethod
|
|
86
|
+
def instance() -> "Scheduler":
|
|
87
|
+
"""Get or create the global scheduler instance"""
|
|
88
|
+
if Scheduler._instance is None:
|
|
89
|
+
with Scheduler._lock:
|
|
90
|
+
if Scheduler._instance is None:
|
|
91
|
+
Scheduler._instance = Scheduler._create()
|
|
92
|
+
return Scheduler._instance
|
|
166
93
|
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
94
|
+
@staticmethod
|
|
95
|
+
def _create(name: str = "Global"):
|
|
96
|
+
"""Internal method to create and start scheduler"""
|
|
97
|
+
instance = Scheduler(name)
|
|
98
|
+
instance.start()
|
|
99
|
+
instance._ready.wait()
|
|
100
|
+
return instance
|
|
170
101
|
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
self.endtime: Optional[float] = None
|
|
175
|
-
self._progress: List[LevelInformation] = []
|
|
176
|
-
self.tags = config.tags()
|
|
102
|
+
@staticmethod
|
|
103
|
+
def create(xp: "experiment" = None, name: str = "Global"):
|
|
104
|
+
"""Create or get the scheduler instance
|
|
177
105
|
|
|
178
|
-
|
|
179
|
-
|
|
106
|
+
Args:
|
|
107
|
+
xp: (Deprecated) Experiment reference, ignored
|
|
108
|
+
name: Name for the scheduler (only used on first creation)
|
|
180
109
|
|
|
181
|
-
:
|
|
110
|
+
Returns:
|
|
111
|
+
The global scheduler instance
|
|
182
112
|
"""
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
def task_output_update(self, subpath: Path):
|
|
186
|
-
"""Notification of an updated task output"""
|
|
187
|
-
if watcher := self.watched_outputs.get(subpath, None):
|
|
188
|
-
watcher.update()
|
|
189
|
-
|
|
190
|
-
def done_handler(self):
|
|
191
|
-
"""The task has been completed"""
|
|
192
|
-
for watcher in self.watched_outputs.values():
|
|
193
|
-
watcher.update()
|
|
194
|
-
|
|
195
|
-
def __str__(self):
|
|
196
|
-
return "Job[{}]".format(self.identifier)
|
|
113
|
+
return Scheduler.instance()
|
|
197
114
|
|
|
198
|
-
def
|
|
199
|
-
|
|
200
|
-
|
|
115
|
+
def register_experiment(self, xp: "experiment"):
|
|
116
|
+
"""Register an experiment with the scheduler"""
|
|
117
|
+
# Use experiment name as key for now
|
|
118
|
+
key = xp.workdir.name
|
|
119
|
+
self.experiments[key] = xp
|
|
201
120
|
|
|
202
|
-
|
|
203
|
-
def python_path(self) -> Iterator[str]:
|
|
204
|
-
"""Returns an iterator over python path"""
|
|
205
|
-
return itertools.chain(self.workspace.python_path)
|
|
121
|
+
logger.debug("Registered experiment %s with scheduler", key)
|
|
206
122
|
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
123
|
+
def unregister_experiment(self, xp: "experiment"):
|
|
124
|
+
"""Unregister an experiment from the scheduler"""
|
|
125
|
+
key = xp.workdir.name
|
|
126
|
+
if key in self.experiments:
|
|
127
|
+
del self.experiments[key]
|
|
128
|
+
logger.debug("Unregistered experiment %s from scheduler", key)
|
|
210
129
|
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
1. The workspace environment
|
|
216
|
-
|
|
217
|
-
"""
|
|
218
|
-
return ChainMap(
|
|
219
|
-
{},
|
|
220
|
-
self.launcher.environ if self.launcher else {},
|
|
221
|
-
self.workspace.env if self.workspace else {},
|
|
222
|
-
)
|
|
130
|
+
def start_server(
|
|
131
|
+
self, settings: "ServerSettings" = None, workspace: "Workspace" = None
|
|
132
|
+
):
|
|
133
|
+
"""Start the notification server (if not already running)
|
|
223
134
|
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
def set_progress(self, level: int, value: float, desc: Optional[str]):
|
|
229
|
-
if value < 0:
|
|
230
|
-
logger.warning(f"Progress value out of bounds ({value})")
|
|
231
|
-
value = 0
|
|
232
|
-
elif value > 1:
|
|
233
|
-
logger.warning(f"Progress value out of bounds ({value})")
|
|
234
|
-
value = 1
|
|
235
|
-
|
|
236
|
-
# Adjust the length of the array
|
|
237
|
-
self._progress = self._progress[: (level + 1)]
|
|
238
|
-
while len(self._progress) <= level:
|
|
239
|
-
self._progress.append(LevelInformation(len(self._progress), None, 0.0))
|
|
240
|
-
|
|
241
|
-
if desc:
|
|
242
|
-
self._progress[-1].desc = desc
|
|
243
|
-
self._progress[-1].progress = value
|
|
244
|
-
|
|
245
|
-
for listener in self.scheduler.listeners:
|
|
246
|
-
listener.job_state(self)
|
|
247
|
-
|
|
248
|
-
def add_notification_server(self, server):
|
|
249
|
-
"""Adds a notification server"""
|
|
250
|
-
key, baseurl = server.getNotificationSpec()
|
|
251
|
-
dirpath = self.path / Reporter.NOTIFICATION_FOLDER
|
|
252
|
-
dirpath.mkdir(exist_ok=True)
|
|
253
|
-
(dirpath / key).write_text(f"{baseurl}/{self.identifier}")
|
|
254
|
-
|
|
255
|
-
@property
|
|
256
|
-
def ready(self):
|
|
257
|
-
return self.state == JobState.READY
|
|
258
|
-
|
|
259
|
-
@property
|
|
260
|
-
def jobpath(self) -> Path:
|
|
261
|
-
"""Deprecated, use `path`"""
|
|
262
|
-
return self.workspace.jobspath / self.relpath
|
|
263
|
-
|
|
264
|
-
@property
|
|
265
|
-
def path(self) -> Path:
|
|
266
|
-
return self.workspace.jobspath / self.relpath
|
|
267
|
-
|
|
268
|
-
@property
|
|
269
|
-
def experimaestro_path(self) -> Path:
|
|
270
|
-
return (self.path / ".experimaestro").resolve()
|
|
271
|
-
|
|
272
|
-
@cached_property
|
|
273
|
-
def task_outputs_path(self) -> Path:
|
|
274
|
-
return self.experimaestro_path / "task-outputs.jsonl"
|
|
275
|
-
|
|
276
|
-
@property
|
|
277
|
-
def relpath(self):
|
|
278
|
-
identifier = self.config.__xpm__.identifier
|
|
279
|
-
base = Path(str(self.type.identifier))
|
|
280
|
-
return base / identifier.all.hex()
|
|
281
|
-
|
|
282
|
-
@property
|
|
283
|
-
def relmainpath(self):
|
|
284
|
-
identifier = self.config.__xpm__.identifier
|
|
285
|
-
base = Path(str(self.type.identifier))
|
|
286
|
-
return base / identifier.main.hex()
|
|
287
|
-
|
|
288
|
-
@property
|
|
289
|
-
def hashidentifier(self):
|
|
290
|
-
return self.config.__xpm__.identifier
|
|
291
|
-
|
|
292
|
-
@property
|
|
293
|
-
def identifier(self):
|
|
294
|
-
return self.config.__xpm__.identifier.all.hex()
|
|
295
|
-
|
|
296
|
-
def prepare(self, overwrite=False):
|
|
297
|
-
"""Prepare all files before starting a task
|
|
298
|
-
|
|
299
|
-
:param overwrite: if True, overwrite files even if the task has been run
|
|
135
|
+
Args:
|
|
136
|
+
settings: Server settings
|
|
137
|
+
workspace: Workspace instance (required to get workspace path)
|
|
300
138
|
"""
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
"""Actually run the code"""
|
|
305
|
-
raise NotImplementedError(f"Method aio_run not implemented in {self.__class__}")
|
|
306
|
-
|
|
307
|
-
async def aio_process(self) -> Optional["Process"]:
|
|
308
|
-
"""Returns the process if it exists"""
|
|
309
|
-
raise NotImplementedError("Not implemented")
|
|
310
|
-
|
|
311
|
-
@property
|
|
312
|
-
def pidpath(self):
|
|
313
|
-
"""This file contains the file PID"""
|
|
314
|
-
return self.jobpath / ("%s.pid" % self.name)
|
|
315
|
-
|
|
316
|
-
@property
|
|
317
|
-
def lockpath(self):
|
|
318
|
-
"""This file is used as a lock for running the job"""
|
|
319
|
-
return self.workspace.jobspath / self.relmainpath / ("%s.lock" % self.name)
|
|
320
|
-
|
|
321
|
-
@property
|
|
322
|
-
def donepath(self) -> Path:
|
|
323
|
-
"""When a job has been successful, this file is written"""
|
|
324
|
-
return self.jobpath / ("%s.done" % self.name)
|
|
325
|
-
|
|
326
|
-
@property
|
|
327
|
-
def failedpath(self):
|
|
328
|
-
"""When a job has been unsuccessful, this file is written with an error
|
|
329
|
-
code inside"""
|
|
330
|
-
return self.jobpath / ("%s.failed" % self.name)
|
|
331
|
-
|
|
332
|
-
@property
|
|
333
|
-
def stdout(self) -> Path:
|
|
334
|
-
return self.jobpath / ("%s.out" % self.name)
|
|
335
|
-
|
|
336
|
-
@property
|
|
337
|
-
def stderr(self) -> Path:
|
|
338
|
-
return self.jobpath / ("%s.err" % self.name)
|
|
339
|
-
|
|
340
|
-
@property
|
|
341
|
-
def basepath(self) -> Path:
|
|
342
|
-
return self.jobpath / self.name
|
|
343
|
-
|
|
344
|
-
def dependencychanged(self, dependency, oldstatus, status):
|
|
345
|
-
"""Called when a dependency has changed"""
|
|
346
|
-
|
|
347
|
-
def value(s):
|
|
348
|
-
return 1 if s == DependencyStatus.OK else 0
|
|
139
|
+
if self.server is None:
|
|
140
|
+
from experimaestro.server import Server
|
|
141
|
+
from experimaestro.scheduler.state_provider import WorkspaceStateProvider
|
|
349
142
|
|
|
350
|
-
|
|
143
|
+
if workspace is None:
|
|
144
|
+
raise ValueError("workspace parameter is required to start server")
|
|
351
145
|
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
if not self.state.finished():
|
|
357
|
-
self.state = JobState.ERROR
|
|
358
|
-
self.failure_status = JobFailureStatus.DEPENDENCY
|
|
359
|
-
self._readyEvent.set()
|
|
360
|
-
|
|
361
|
-
if self.unsatisfied == 0:
|
|
362
|
-
logger.info("Job %s is ready to run", self)
|
|
363
|
-
# We are ready
|
|
364
|
-
self.state = JobState.READY
|
|
365
|
-
self._readyEvent.set()
|
|
366
|
-
|
|
367
|
-
def finalState(self) -> "concurrent.futures.Future[JobState]":
|
|
368
|
-
assert self._future is not None
|
|
369
|
-
return self._future
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
class JobContext(ConfigWalkContext):
|
|
373
|
-
def __init__(self, job: Job):
|
|
374
|
-
super().__init__()
|
|
375
|
-
self.job = job
|
|
376
|
-
|
|
377
|
-
@property
|
|
378
|
-
def name(self):
|
|
379
|
-
return self.job.name
|
|
380
|
-
|
|
381
|
-
@property
|
|
382
|
-
def path(self):
|
|
383
|
-
return self.job.path
|
|
384
|
-
|
|
385
|
-
@property
|
|
386
|
-
def task(self):
|
|
387
|
-
return self.job.config
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
class Listener:
|
|
391
|
-
def job_submitted(self, job):
|
|
392
|
-
pass
|
|
393
|
-
|
|
394
|
-
def job_state(self, job):
|
|
395
|
-
pass
|
|
396
|
-
|
|
397
|
-
def service_add(self, service: Service):
|
|
398
|
-
"""Notify when a new service is added"""
|
|
399
|
-
pass
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
class JobError(Exception):
|
|
403
|
-
def __init__(self, code):
|
|
404
|
-
super().__init__(f"Job exited with code {code}")
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
class SignalHandler:
|
|
408
|
-
def __init__(self):
|
|
409
|
-
self.experiments: Set["experiment"] = set()
|
|
410
|
-
self.original_sigint_handler = None
|
|
411
|
-
|
|
412
|
-
def add(self, xp: "experiment"):
|
|
413
|
-
if not self.experiments:
|
|
414
|
-
self.original_sigint_handler = signal.getsignal(signal.SIGINT)
|
|
415
|
-
|
|
416
|
-
signal.signal(signal.SIGINT, self)
|
|
417
|
-
|
|
418
|
-
self.experiments.add(xp)
|
|
419
|
-
|
|
420
|
-
def remove(self, xp):
|
|
421
|
-
self.experiments.remove(xp)
|
|
422
|
-
if not self.experiments:
|
|
423
|
-
signal.signal(signal.SIGINT, self.original_sigint_handler)
|
|
424
|
-
|
|
425
|
-
def __call__(self, signum, frame):
|
|
426
|
-
"""SIGINT signal handler"""
|
|
427
|
-
logger.warning("Signal received")
|
|
428
|
-
for xp in self.experiments:
|
|
429
|
-
xp.stop()
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
SIGNAL_HANDLER = SignalHandler()
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
class SchedulerCentral(threading.Thread):
|
|
436
|
-
loop: asyncio.AbstractEventLoop
|
|
437
|
-
|
|
438
|
-
"""The event loop thread used by the scheduler"""
|
|
146
|
+
# Get the workspace state provider singleton
|
|
147
|
+
state_provider = WorkspaceStateProvider.get_instance(
|
|
148
|
+
workspace.path, read_only=False, sync_on_start=False
|
|
149
|
+
)
|
|
439
150
|
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
151
|
+
self.server = Server.instance(settings, state_provider)
|
|
152
|
+
self.server.start()
|
|
153
|
+
logger.info("Server started by scheduler")
|
|
154
|
+
else:
|
|
155
|
+
logger.debug("Server already running")
|
|
443
156
|
|
|
444
|
-
|
|
157
|
+
def stop_server(self):
|
|
158
|
+
"""Stop the notification server"""
|
|
159
|
+
if self.server is not None:
|
|
160
|
+
self.server.stop()
|
|
161
|
+
logger.info("Server stopped by scheduler")
|
|
445
162
|
|
|
446
163
|
def run(self):
|
|
164
|
+
"""Run the event loop forever"""
|
|
447
165
|
logger.debug("Starting event loop thread")
|
|
166
|
+
# Ported from SchedulerCentral
|
|
448
167
|
self.loop = asyncio.new_event_loop()
|
|
449
168
|
asyncio.set_event_loop(self.loop)
|
|
450
|
-
|
|
451
169
|
# Set loop-dependent variables
|
|
452
170
|
self.exitCondition = asyncio.Condition()
|
|
453
171
|
self.dependencyLock = asyncio.Lock()
|
|
454
172
|
|
|
455
|
-
#
|
|
173
|
+
# Note: State provider removed - now managed at workspace level
|
|
174
|
+
# Each experiment has its own workspace with database
|
|
175
|
+
|
|
456
176
|
self._ready.set()
|
|
457
177
|
self.loop.run_forever()
|
|
458
178
|
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
class Scheduler:
|
|
468
|
-
"""A job scheduler
|
|
469
|
-
|
|
470
|
-
The scheduler is based on asyncio for easy concurrency handling
|
|
471
|
-
"""
|
|
472
|
-
|
|
473
|
-
def __init__(self, xp: "experiment", name: str):
|
|
474
|
-
# Name of the experiment
|
|
475
|
-
self.name = name
|
|
476
|
-
self.xp = xp
|
|
477
|
-
|
|
478
|
-
# Exit mode activated
|
|
479
|
-
self.exitmode = False
|
|
480
|
-
|
|
481
|
-
# List of all jobs
|
|
482
|
-
self.jobs: Dict[str, "Job"] = {}
|
|
483
|
-
|
|
484
|
-
# List of jobs
|
|
485
|
-
self.waitingjobs: Set[Job] = set()
|
|
486
|
-
|
|
487
|
-
# Listeners
|
|
488
|
-
self.listeners: Set[Listener] = set()
|
|
489
|
-
|
|
490
|
-
@property
|
|
491
|
-
def loop(self):
|
|
492
|
-
return self.xp.loop
|
|
179
|
+
def start_scheduler(self):
|
|
180
|
+
"""Start the scheduler event loop in a thread"""
|
|
181
|
+
if not self.is_alive():
|
|
182
|
+
self.start()
|
|
183
|
+
self._ready.wait()
|
|
184
|
+
else:
|
|
185
|
+
logger.warning("Scheduler already started")
|
|
493
186
|
|
|
494
187
|
def addlistener(self, listener: Listener):
|
|
495
|
-
self.
|
|
188
|
+
with self._listeners_lock:
|
|
189
|
+
self._listeners.add(listener)
|
|
496
190
|
|
|
497
191
|
def removelistener(self, listener: Listener):
|
|
498
|
-
self.
|
|
192
|
+
with self._listeners_lock:
|
|
193
|
+
self._listeners.discard(listener)
|
|
194
|
+
|
|
195
|
+
def clear_listeners(self):
|
|
196
|
+
"""Clear all listeners (for testing purposes)"""
|
|
197
|
+
with self._listeners_lock:
|
|
198
|
+
self._listeners.clear()
|
|
499
199
|
|
|
500
200
|
def getJobState(self, job: Job) -> "concurrent.futures.Future[JobState]":
|
|
201
|
+
# Check if the job belongs to this scheduler
|
|
202
|
+
if job.identifier not in self.jobs:
|
|
203
|
+
# If job is not in this scheduler, return its current state directly
|
|
204
|
+
future = concurrent.futures.Future()
|
|
205
|
+
future.set_result(job.state)
|
|
206
|
+
return future
|
|
207
|
+
|
|
501
208
|
return asyncio.run_coroutine_threadsafe(self.aio_getjobstate(job), self.loop)
|
|
502
209
|
|
|
503
210
|
async def aio_getjobstate(self, job: Job):
|
|
@@ -505,17 +212,25 @@ class Scheduler:
|
|
|
505
212
|
|
|
506
213
|
def submit(self, job: Job) -> Optional[Job]:
|
|
507
214
|
# Wait for the future containing the submitted job
|
|
508
|
-
logger.debug("
|
|
215
|
+
logger.debug("Submit job %s to the scheduler", job)
|
|
509
216
|
otherFuture = asyncio.run_coroutine_threadsafe(
|
|
510
217
|
self.aio_registerJob(job), self.loop
|
|
511
218
|
)
|
|
512
219
|
other = otherFuture.result()
|
|
513
220
|
logger.debug("Job already submitted" if other else "First submission")
|
|
514
|
-
|
|
515
|
-
|
|
221
|
+
|
|
222
|
+
# Only returns if job was already submitted and doesn't need reprocessing
|
|
223
|
+
if other is not None:
|
|
224
|
+
# If state is WAITING, it was just reset for resubmission and needs processing
|
|
225
|
+
# If state is RUNNING or finished (DONE), no need to reprocess
|
|
226
|
+
if other.state != JobState.WAITING:
|
|
227
|
+
return other
|
|
228
|
+
# Use 'other' for resubmission since it has the correct experiments list
|
|
229
|
+
job = other
|
|
516
230
|
|
|
517
231
|
job._future = asyncio.run_coroutine_threadsafe(self.aio_submit(job), self.loop)
|
|
518
|
-
|
|
232
|
+
|
|
233
|
+
return other
|
|
519
234
|
|
|
520
235
|
def prepare(self, job: Job):
|
|
521
236
|
"""Prepares the job for running"""
|
|
@@ -530,33 +245,99 @@ class Scheduler:
|
|
|
530
245
|
|
|
531
246
|
if self.exitmode:
|
|
532
247
|
logger.warning("Exit mode: not submitting")
|
|
248
|
+
return
|
|
533
249
|
|
|
534
|
-
|
|
250
|
+
# Job was already submitted
|
|
251
|
+
if job.identifier in self.jobs:
|
|
535
252
|
other = self.jobs[job.identifier]
|
|
536
253
|
assert job.type == other.type
|
|
537
|
-
|
|
254
|
+
|
|
255
|
+
# Add current experiment to the existing job's experiments list
|
|
256
|
+
xp = experiment.current()
|
|
257
|
+
xp.add_job(other)
|
|
258
|
+
|
|
259
|
+
# Copy watched outputs from new job to existing job
|
|
260
|
+
# This ensures new callbacks are registered even for resubmitted jobs
|
|
261
|
+
other.watched_outputs.extend(job.watched_outputs)
|
|
262
|
+
|
|
263
|
+
if other.state.is_error():
|
|
538
264
|
logger.info("Re-submitting job")
|
|
265
|
+
# Clean up old process info so it will be re-started
|
|
266
|
+
other._process = None
|
|
267
|
+
if other.pidpath.is_file():
|
|
268
|
+
other.pidpath.unlink()
|
|
269
|
+
# Use set_state to handle experiment statistics updates
|
|
270
|
+
other.set_state(JobState.WAITING)
|
|
271
|
+
self.notify_job_state(other) # Notify listeners of re-submit
|
|
539
272
|
else:
|
|
540
273
|
logger.warning("Job %s already submitted", job.identifier)
|
|
541
|
-
return other
|
|
542
274
|
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
275
|
+
# Returns the previous job
|
|
276
|
+
return other
|
|
277
|
+
|
|
278
|
+
# Register this job
|
|
279
|
+
xp = experiment.current()
|
|
280
|
+
self.jobs[job.identifier] = job
|
|
281
|
+
# Set submittime now so that add_job can record it in the database
|
|
282
|
+
# (aio_submit may update this later for re-submitted jobs)
|
|
283
|
+
job.submittime = time.time()
|
|
284
|
+
xp.add_job(job)
|
|
285
|
+
|
|
286
|
+
# Set up dependencies
|
|
287
|
+
for dependency in job.dependencies:
|
|
288
|
+
dependency.target = job
|
|
289
|
+
dependency.origin.dependents.add(dependency)
|
|
547
290
|
|
|
548
291
|
return None
|
|
549
292
|
|
|
550
|
-
|
|
293
|
+
def _notify_listeners(self, notification_func, job: Job):
|
|
294
|
+
"""Execute notification in thread pool with error isolation.
|
|
295
|
+
|
|
296
|
+
This runs notifications in a dedicated thread pool to avoid blocking
|
|
297
|
+
the scheduler and to isolate errors from affecting other listeners.
|
|
298
|
+
"""
|
|
299
|
+
|
|
300
|
+
def _do_notify():
|
|
301
|
+
# Get a snapshot of listeners with the lock
|
|
302
|
+
with self._listeners_lock:
|
|
303
|
+
listeners_snapshot = list(self._listeners)
|
|
304
|
+
|
|
305
|
+
for listener in listeners_snapshot:
|
|
306
|
+
try:
|
|
307
|
+
notification_func(listener, job)
|
|
308
|
+
except Exception:
|
|
309
|
+
logger.exception("Got an error with listener %s", listener)
|
|
310
|
+
|
|
311
|
+
self._notification_executor.submit(_do_notify)
|
|
312
|
+
|
|
313
|
+
def notify_job_submitted(self, job: Job):
|
|
314
|
+
"""Notify the listeners that a job has been submitted"""
|
|
315
|
+
self._notify_listeners(lambda lst, j: lst.job_submitted(j), job)
|
|
316
|
+
|
|
317
|
+
def notify_job_state(self, job: Job):
|
|
318
|
+
"""Notify the listeners that a job has changed state"""
|
|
319
|
+
self._notify_listeners(lambda lst, j: lst.job_state(j), job)
|
|
320
|
+
|
|
321
|
+
def notify_service_add(self, service: Service):
|
|
322
|
+
"""Notify the listeners that a service has been added"""
|
|
323
|
+
self._notify_listeners(lambda lst, s: lst.service_add(s), service)
|
|
324
|
+
|
|
325
|
+
async def aio_submit(self, job: Job) -> JobState:
|
|
551
326
|
"""Main scheduler function: submit a job, run it (if needed), and returns
|
|
552
327
|
the status code
|
|
553
328
|
"""
|
|
329
|
+
from experimaestro.scheduler.jobs import JobStateError, JobFailureStatus
|
|
330
|
+
|
|
554
331
|
logger.info("Submitting job %s", job)
|
|
555
|
-
job._readyEvent = asyncio.Event()
|
|
556
332
|
job.submittime = time.time()
|
|
557
333
|
job.scheduler = self
|
|
558
334
|
self.waitingjobs.add(job)
|
|
559
335
|
|
|
336
|
+
# Register watched outputs now that the job has a scheduler
|
|
337
|
+
job.register_watched_outputs()
|
|
338
|
+
|
|
339
|
+
# Note: Job metadata will be written after directory is created in aio_start
|
|
340
|
+
|
|
560
341
|
# Check that we don't have a completed job in
|
|
561
342
|
# alternate directories
|
|
562
343
|
for jobspath in experiment.current().alt_jobspaths:
|
|
@@ -570,560 +351,324 @@ class Scheduler:
|
|
|
570
351
|
path.unlink()
|
|
571
352
|
path.symlink_to(job.path)
|
|
572
353
|
|
|
573
|
-
job.
|
|
574
|
-
|
|
575
|
-
try:
|
|
576
|
-
listener.job_submitted(job)
|
|
577
|
-
except Exception:
|
|
578
|
-
logger.exception("Got an error with listener %s", listener)
|
|
579
|
-
|
|
580
|
-
# Add dependencies, and add to blocking resources
|
|
581
|
-
if job.dependencies:
|
|
582
|
-
job.unsatisfied = len(job.dependencies)
|
|
583
|
-
|
|
584
|
-
for dependency in job.dependencies:
|
|
585
|
-
dependency.target = job
|
|
586
|
-
dependency.loop = self.loop
|
|
587
|
-
dependency.origin.dependents.add(dependency)
|
|
588
|
-
dependency.check()
|
|
589
|
-
else:
|
|
590
|
-
job._readyEvent.set()
|
|
591
|
-
job.state = JobState.READY
|
|
354
|
+
job.set_state(JobState.WAITING)
|
|
355
|
+
self.notify_job_submitted(job)
|
|
592
356
|
|
|
357
|
+
# Check if already done
|
|
593
358
|
if job.donepath.exists():
|
|
594
|
-
job.
|
|
359
|
+
job.set_state(JobState.DONE)
|
|
360
|
+
self.notify_job_state(job) # Notify listeners of done state
|
|
595
361
|
|
|
596
362
|
# Check if we have a running process
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
363
|
+
if not job.state.finished():
|
|
364
|
+
process = await job.aio_process()
|
|
365
|
+
if process is not None:
|
|
366
|
+
# Notify listeners that job is running
|
|
367
|
+
job.set_state(JobState.RUNNING)
|
|
368
|
+
self.notify_job_state(job)
|
|
369
|
+
|
|
370
|
+
# Adds to the listeners
|
|
371
|
+
if self.server is not None:
|
|
372
|
+
job.add_notification_server(self.server)
|
|
373
|
+
|
|
374
|
+
# And now, we wait...
|
|
375
|
+
logger.info("Got a process for job %s - waiting to complete", job)
|
|
376
|
+
code = await process.aio_code()
|
|
377
|
+
logger.info("Job %s completed with code %s", job, code)
|
|
378
|
+
|
|
379
|
+
# Record exit code if available
|
|
380
|
+
if code is not None:
|
|
381
|
+
job.exit_code = code
|
|
382
|
+
|
|
383
|
+
# Read state from .done/.failed files (contains detailed failure reason)
|
|
384
|
+
state = JobState.from_path(job.path, job.name)
|
|
385
|
+
|
|
386
|
+
# If state is a generic FAILED error, let the process determine
|
|
387
|
+
# the state (it may detect launcher-specific failures like SLURM timeout)
|
|
388
|
+
if (
|
|
389
|
+
state is not None
|
|
390
|
+
and isinstance(state, JobStateError)
|
|
391
|
+
and state.failure_reason == JobFailureStatus.FAILED
|
|
392
|
+
and code is not None
|
|
393
|
+
):
|
|
394
|
+
process_state = process.get_job_state(code)
|
|
395
|
+
if (
|
|
396
|
+
isinstance(process_state, JobStateError)
|
|
397
|
+
and process_state.failure_reason != JobFailureStatus.FAILED
|
|
398
|
+
):
|
|
399
|
+
# Process detected a more specific failure reason
|
|
400
|
+
state = process_state
|
|
633
401
|
|
|
634
402
|
if state is None:
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
403
|
+
if code is not None:
|
|
404
|
+
# Fall back to process-specific state detection
|
|
405
|
+
state = process.get_job_state(code)
|
|
406
|
+
else:
|
|
407
|
+
logger.error("No .done or .failed file found for job %s", job)
|
|
408
|
+
state = JobState.ERROR
|
|
409
|
+
# Set endtime before set_state so database gets the timestamp
|
|
410
|
+
job.endtime = time.time()
|
|
411
|
+
job.set_state(state)
|
|
412
|
+
self.notify_job_state(job) # Notify listeners of final state
|
|
413
|
+
|
|
414
|
+
# If not done or running, start the job
|
|
415
|
+
if not job.state.finished():
|
|
641
416
|
try:
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
417
|
+
state = await self.aio_start(job)
|
|
418
|
+
# Set endtime before set_state so database gets the timestamp
|
|
419
|
+
job.endtime = time.time()
|
|
420
|
+
job.set_state(state)
|
|
421
|
+
except Exception:
|
|
422
|
+
logger.exception("Got an exception while starting the job")
|
|
423
|
+
raise
|
|
649
424
|
|
|
650
|
-
#
|
|
651
|
-
await asyncThreadcheck("End of job processing", job.done_handler)
|
|
425
|
+
# Job is finished - experiment statistics already updated by set_state
|
|
652
426
|
|
|
653
|
-
#
|
|
654
|
-
|
|
655
|
-
async with self.xp.central.exitCondition:
|
|
656
|
-
logging.debug("Updated number of unfinished jobs")
|
|
657
|
-
self.xp.central.exitCondition.notify_all()
|
|
427
|
+
# Write final metadata with end time and final state
|
|
428
|
+
job.write_metadata()
|
|
658
429
|
|
|
659
|
-
job.endtime = time.time()
|
|
660
430
|
if job in self.waitingjobs:
|
|
661
431
|
self.waitingjobs.remove(job)
|
|
662
432
|
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
433
|
+
# Process all remaining task outputs BEFORE notifying exit condition
|
|
434
|
+
# This ensures taskOutputQueueSize is updated before wait() can check it,
|
|
435
|
+
# preventing a race where wait() sees both unfinishedJobs==0 and
|
|
436
|
+
# taskOutputQueueSize==0 before callbacks have been queued.
|
|
437
|
+
await asyncThreadcheck("End of job processing", job.done_handler)
|
|
438
|
+
|
|
439
|
+
# Now notify - wait() will see the correct taskOutputQueueSize
|
|
440
|
+
async with self.exitCondition:
|
|
441
|
+
self.exitCondition.notify_all()
|
|
668
442
|
|
|
669
443
|
return job.state
|
|
670
444
|
|
|
671
|
-
async def aio_start(self, job: Job) -> Optional[JobState]:
|
|
672
|
-
"""Start a job
|
|
445
|
+
async def aio_start(self, job: Job) -> Optional[JobState]: # noqa: C901
|
|
446
|
+
"""Start a job with full job starting logic
|
|
447
|
+
|
|
448
|
+
This method handles job locking, dependency acquisition, directory setup,
|
|
449
|
+
and job execution while using the scheduler's coordination lock to prevent
|
|
450
|
+
race conditions between multiple jobs.
|
|
673
451
|
|
|
674
|
-
|
|
675
|
-
|
|
452
|
+
:param job: The job to start
|
|
453
|
+
:return: JobState.WAITING if dependencies could not be locked, JobState.DONE
|
|
454
|
+
if job completed successfully, JobState.ERROR if job failed during execution,
|
|
455
|
+
or None (should not occur in normal operation)
|
|
456
|
+
:raises Exception: Various exceptions during job execution, dependency locking,
|
|
457
|
+
or process creation
|
|
676
458
|
"""
|
|
459
|
+
from experimaestro.scheduler.jobs import JobStateError
|
|
460
|
+
from experimaestro.locking import Locks, LockError
|
|
461
|
+
from experimaestro.scheduler.jobs import JobFailureStatus
|
|
677
462
|
|
|
678
|
-
#
|
|
463
|
+
# Assert preconditions
|
|
679
464
|
assert job.launcher is not None
|
|
680
|
-
assert self.xp.central is not None
|
|
681
465
|
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
466
|
+
# Restart loop for resumable tasks that timeout
|
|
467
|
+
while True:
|
|
468
|
+
logger.debug(
|
|
469
|
+
"Starting job %s with %d dependencies",
|
|
470
|
+
job,
|
|
471
|
+
len(job.dependencies),
|
|
472
|
+
)
|
|
686
473
|
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
"Starting job %s with %d dependencies",
|
|
691
|
-
job,
|
|
692
|
-
len(job.dependencies),
|
|
693
|
-
)
|
|
474
|
+
# Separate static and dynamic dependencies
|
|
475
|
+
static_deps = [d for d in job.dependencies if not d.is_dynamic()]
|
|
476
|
+
dynamic_deps = [d for d in job.dependencies if d.is_dynamic()]
|
|
694
477
|
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
478
|
+
# First, wait for all static dependencies (jobs) to complete
|
|
479
|
+
# These don't need the dependency lock as they can't change state
|
|
480
|
+
# Static dependency locks don't need to be added to locks list
|
|
481
|
+
logger.debug("Waiting for %d static dependencies", len(static_deps))
|
|
482
|
+
for dependency in static_deps:
|
|
483
|
+
logger.debug("Waiting for static dependency %s", dependency)
|
|
484
|
+
try:
|
|
485
|
+
await dependency.aio_lock()
|
|
486
|
+
except RuntimeError as e:
|
|
487
|
+
# Dependency failed - mark job as failed due to dependency
|
|
488
|
+
logger.warning("Dependency failed: %s", e)
|
|
489
|
+
return JobStateError(JobFailureStatus.DEPENDENCY)
|
|
490
|
+
|
|
491
|
+
# We first lock the job before proceeding
|
|
492
|
+
with Locks() as locks:
|
|
493
|
+
logger.debug("[starting] Locking job %s", job)
|
|
494
|
+
async with job.launcher.connector.lock(job.lockpath):
|
|
495
|
+
logger.debug("[starting] Locked job %s", job)
|
|
496
|
+
|
|
497
|
+
state = None
|
|
498
|
+
try:
|
|
499
|
+
# Now handle dynamic dependencies (tokens) with retry logic
|
|
500
|
+
# CRITICAL: Only one task at a time can acquire dynamic dependencies
|
|
501
|
+
# to prevent deadlocks (e.g., Task A holds Token1 waiting for Token2,
|
|
502
|
+
# Task B holds Token2 waiting for Token1)
|
|
503
|
+
if dynamic_deps:
|
|
504
|
+
async with self.dependencyLock:
|
|
505
|
+
logger.debug(
|
|
506
|
+
"Locking %d dynamic dependencies (tokens)",
|
|
507
|
+
len(dynamic_deps),
|
|
704
508
|
)
|
|
705
|
-
|
|
706
|
-
|
|
509
|
+
while True:
|
|
510
|
+
all_locked = True
|
|
511
|
+
for idx, dependency in enumerate(dynamic_deps):
|
|
512
|
+
try:
|
|
513
|
+
# Use timeout=0 for first dependency, 0.1s for subsequent
|
|
514
|
+
timeout = 0 if idx == 0 else 0.1
|
|
515
|
+
# Acquire the lock (this might block on IPC locks)
|
|
516
|
+
lock = await dependency.aio_lock(
|
|
517
|
+
timeout=timeout
|
|
518
|
+
)
|
|
519
|
+
locks.append(lock)
|
|
520
|
+
except LockError:
|
|
521
|
+
logger.info(
|
|
522
|
+
"Could not lock %s, retrying",
|
|
523
|
+
dependency,
|
|
524
|
+
)
|
|
525
|
+
# Release all locks and restart
|
|
526
|
+
for lock in locks.locks:
|
|
527
|
+
lock.release()
|
|
528
|
+
locks.locks.clear()
|
|
529
|
+
# Put failed dependency first
|
|
530
|
+
dynamic_deps.remove(dependency)
|
|
531
|
+
dynamic_deps.insert(0, dependency)
|
|
532
|
+
all_locked = False
|
|
533
|
+
break
|
|
534
|
+
|
|
535
|
+
if all_locked:
|
|
536
|
+
# All locks acquired successfully
|
|
537
|
+
break
|
|
538
|
+
|
|
539
|
+
# Dependencies have been locked, we can start the job
|
|
540
|
+
job.starttime = time.time()
|
|
541
|
+
|
|
542
|
+
# Creates the main directory
|
|
543
|
+
directory = job.path
|
|
544
|
+
logger.debug("Making directories job %s...", directory)
|
|
545
|
+
|
|
546
|
+
# Warn about directory cleanup for non-resumable tasks
|
|
547
|
+
# (only once per task type)
|
|
548
|
+
xpmtype = job.config.__xpmtype__
|
|
549
|
+
if (
|
|
550
|
+
directory.is_dir()
|
|
551
|
+
and not job.resumable
|
|
552
|
+
and not xpmtype.warned_clean_not_resumable
|
|
553
|
+
):
|
|
554
|
+
xpmtype.warned_clean_not_resumable = True
|
|
555
|
+
logger.warning(
|
|
556
|
+
"In a future version, directory will be cleaned up for "
|
|
557
|
+
"non-resumable tasks (%s). Use ResumableTask if you want "
|
|
558
|
+
"to preserve the directory contents.",
|
|
559
|
+
xpmtype.identifier,
|
|
560
|
+
)
|
|
707
561
|
|
|
708
|
-
|
|
709
|
-
|
|
562
|
+
if not directory.is_dir():
|
|
563
|
+
directory.mkdir(parents=True, exist_ok=True)
|
|
710
564
|
|
|
711
|
-
|
|
565
|
+
# Write metadata with submit and start time (after directory creation)
|
|
566
|
+
job.write_metadata()
|
|
712
567
|
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
if not directory.is_dir():
|
|
717
|
-
directory.mkdir(parents=True, exist_ok=True)
|
|
568
|
+
# Sets up the notification URL
|
|
569
|
+
if self.server is not None:
|
|
570
|
+
job.add_notification_server(self.server)
|
|
718
571
|
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
572
|
+
except Exception:
|
|
573
|
+
logger.warning("Error while locking job", exc_info=True)
|
|
574
|
+
return JobState.WAITING
|
|
722
575
|
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
576
|
+
try:
|
|
577
|
+
# Runs the job
|
|
578
|
+
process = await job.aio_run()
|
|
579
|
+
except Exception:
|
|
580
|
+
logger.warning("Error while starting job", exc_info=True)
|
|
581
|
+
return JobState.ERROR
|
|
726
582
|
|
|
583
|
+
# Wait for job to complete while holding locks
|
|
727
584
|
try:
|
|
728
|
-
# Runs the job
|
|
729
|
-
process = await job.aio_run()
|
|
730
|
-
except Exception:
|
|
731
|
-
logger.warning("Error while starting job", exc_info=True)
|
|
732
|
-
return JobState.ERROR
|
|
733
|
-
|
|
734
|
-
try:
|
|
735
|
-
if isinstance(process, JobState):
|
|
736
|
-
state = process
|
|
737
|
-
logger.debug("Job %s ended (state %s)", job, state)
|
|
738
|
-
else:
|
|
739
585
|
logger.debug("Waiting for job %s process to end", job)
|
|
740
586
|
|
|
741
587
|
code = await process.aio_code()
|
|
742
588
|
logger.debug("Got return code %s for %s", code, job)
|
|
743
589
|
|
|
744
|
-
#
|
|
745
|
-
if code is None:
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
590
|
+
# Record exit code if available
|
|
591
|
+
if code is not None:
|
|
592
|
+
logger.info("Job %s ended with code %s", job, code)
|
|
593
|
+
job.exit_code = code
|
|
594
|
+
else:
|
|
595
|
+
logger.info("Job %s ended, reading state from files", job)
|
|
596
|
+
|
|
597
|
+
# Read state from .done/.failed files (contains detailed failure reason)
|
|
598
|
+
state = JobState.from_path(job.path, job.name)
|
|
599
|
+
|
|
600
|
+
# If state is a generic FAILED error, let the process determine
|
|
601
|
+
# the state (it may detect launcher-specific failures like SLURM timeout)
|
|
602
|
+
if (
|
|
603
|
+
state is not None
|
|
604
|
+
and isinstance(state, JobStateError)
|
|
605
|
+
and state.failure_reason == JobFailureStatus.FAILED
|
|
606
|
+
and code is not None
|
|
607
|
+
):
|
|
608
|
+
process_state = process.get_job_state(code)
|
|
609
|
+
if (
|
|
610
|
+
isinstance(process_state, JobStateError)
|
|
611
|
+
and process_state.failure_reason != JobFailureStatus.FAILED
|
|
612
|
+
):
|
|
613
|
+
# Process detected a more specific failure reason
|
|
614
|
+
state = process_state
|
|
615
|
+
|
|
616
|
+
if state is None:
|
|
617
|
+
if code is not None:
|
|
618
|
+
# Fall back to process-specific state detection
|
|
619
|
+
state = process.get_job_state(code)
|
|
749
620
|
else:
|
|
750
|
-
code = int(job.failedpath.read_text())
|
|
751
|
-
|
|
752
|
-
logger.debug("Job %s ended with code %s", job, code)
|
|
753
|
-
state = JobState.DONE if code == 0 else JobState.ERROR
|
|
754
|
-
|
|
755
|
-
except JobError:
|
|
756
|
-
logger.warning("Error while running job")
|
|
757
|
-
state = JobState.ERROR
|
|
758
|
-
|
|
759
|
-
except Exception:
|
|
760
|
-
logger.warning(
|
|
761
|
-
"Error while running job (in experimaestro)", exc_info=True
|
|
762
|
-
)
|
|
763
|
-
state = JobState.ERROR
|
|
764
|
-
|
|
765
|
-
return state
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
ServiceClass = TypeVar("ServiceClass", bound=Service)
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
class experiment:
|
|
772
|
-
"""Main experiment object
|
|
773
|
-
|
|
774
|
-
It is a context object, i.e. experiments is run with
|
|
775
|
-
|
|
776
|
-
```py
|
|
777
|
-
with experiment(...) as xp:
|
|
778
|
-
...
|
|
779
|
-
```
|
|
780
|
-
"""
|
|
781
|
-
|
|
782
|
-
#: Current experiment
|
|
783
|
-
CURRENT: Optional["experiment"] = None
|
|
784
|
-
|
|
785
|
-
@staticmethod
|
|
786
|
-
def current() -> "experiment":
|
|
787
|
-
"""Returns the current experiment, but checking first if set
|
|
788
|
-
|
|
789
|
-
If there is no current experiment, raises an AssertError
|
|
790
|
-
"""
|
|
791
|
-
assert experiment.CURRENT is not None, "No current experiment defined"
|
|
792
|
-
return experiment.CURRENT
|
|
793
|
-
|
|
794
|
-
def __init__(
|
|
795
|
-
self,
|
|
796
|
-
env: Union[Path, str, WorkspaceSettings],
|
|
797
|
-
name: str,
|
|
798
|
-
*,
|
|
799
|
-
host: Optional[str] = None,
|
|
800
|
-
port: Optional[int] = None,
|
|
801
|
-
token: Optional[str] = None,
|
|
802
|
-
run_mode: Optional[RunMode] = None,
|
|
803
|
-
launcher=None,
|
|
804
|
-
):
|
|
805
|
-
"""
|
|
806
|
-
:param env: an environment -- or a working directory for a local
|
|
807
|
-
environment
|
|
808
|
-
|
|
809
|
-
:param name: the identifier of the experiment
|
|
810
|
-
|
|
811
|
-
:param launcher: The launcher (if not provided, inferred from path)
|
|
812
|
-
|
|
813
|
-
:param host: The host for the web server (overrides the environment if
|
|
814
|
-
set)
|
|
815
|
-
:param port: the port for the web server (overrides the environment if
|
|
816
|
-
set). Use negative number to avoid running a web server (default when dry run).
|
|
817
|
-
|
|
818
|
-
:param run_mode: The run mode for the experiment (normal, generate run
|
|
819
|
-
files, dry run)
|
|
820
|
-
"""
|
|
821
|
-
|
|
822
|
-
from experimaestro.server import Server
|
|
823
|
-
from experimaestro.scheduler import Listener
|
|
824
|
-
|
|
825
|
-
settings = get_settings()
|
|
826
|
-
if not isinstance(env, WorkspaceSettings):
|
|
827
|
-
env = WorkspaceSettings(id=None, path=Path(env))
|
|
828
|
-
|
|
829
|
-
# Creates the workspace
|
|
830
|
-
run_mode = run_mode or RunMode.NORMAL
|
|
831
|
-
self.workspace = Workspace(settings, env, launcher=launcher, run_mode=run_mode)
|
|
832
|
-
|
|
833
|
-
# Mark the directory has an experimaestro folder
|
|
834
|
-
self.workdir = self.workspace.experimentspath / name
|
|
835
|
-
self.workdir.mkdir(parents=True, exist_ok=True)
|
|
836
|
-
self.xplockpath = self.workdir / "lock"
|
|
837
|
-
self.xplock = None
|
|
838
|
-
self.old_experiment = None
|
|
839
|
-
self.services: Dict[str, Service] = {}
|
|
840
|
-
self._job_listener: Optional[Listener] = None
|
|
841
|
-
|
|
842
|
-
# Get configuration settings
|
|
843
|
-
|
|
844
|
-
if host is not None:
|
|
845
|
-
settings.server.host = host
|
|
846
|
-
|
|
847
|
-
if port is not None:
|
|
848
|
-
settings.server.port = port
|
|
849
|
-
|
|
850
|
-
if token is not None:
|
|
851
|
-
settings.server.token = token
|
|
852
|
-
|
|
853
|
-
# Create the scheduler
|
|
854
|
-
self.scheduler = Scheduler(self, name)
|
|
855
|
-
self.server = (
|
|
856
|
-
Server(self.scheduler, settings.server)
|
|
857
|
-
if (settings.server.port is not None and settings.server.port >= 0)
|
|
858
|
-
and self.workspace.run_mode == RunMode.NORMAL
|
|
859
|
-
else None
|
|
860
|
-
)
|
|
861
|
-
|
|
862
|
-
if os.environ.get("XPM_ENABLEFAULTHANDLER", "0") == "1":
|
|
863
|
-
import faulthandler
|
|
864
|
-
|
|
865
|
-
logger.info("Enabling fault handler")
|
|
866
|
-
faulthandler.enable(all_threads=True)
|
|
867
|
-
|
|
868
|
-
def submit(self, job: Job):
|
|
869
|
-
return self.scheduler.submit(job)
|
|
870
|
-
|
|
871
|
-
def prepare(self, job: Job):
|
|
872
|
-
"""Generate the file"""
|
|
873
|
-
return self.scheduler.prepare(job)
|
|
874
|
-
|
|
875
|
-
@property
|
|
876
|
-
def run_mode(self):
|
|
877
|
-
return self.workspace.run_mode
|
|
878
|
-
|
|
879
|
-
@property
|
|
880
|
-
def loop(self):
|
|
881
|
-
assert self.central is not None
|
|
882
|
-
return self.central.loop
|
|
883
|
-
|
|
884
|
-
@property
|
|
885
|
-
def resultspath(self):
|
|
886
|
-
"""Return the directory in which results can be stored for this experiment"""
|
|
887
|
-
return self.workdir / "results"
|
|
888
|
-
|
|
889
|
-
@property
|
|
890
|
-
def jobspath(self):
|
|
891
|
-
"""Return the directory in which results can be stored for this experiment"""
|
|
892
|
-
return self.workdir / "jobs"
|
|
893
|
-
|
|
894
|
-
@property
|
|
895
|
-
def alt_jobspaths(self):
|
|
896
|
-
"""Return potential other directories"""
|
|
897
|
-
for alt_workdir in self.workspace.alt_workdirs:
|
|
898
|
-
yield alt_workdir / "jobs"
|
|
899
|
-
|
|
900
|
-
@property
|
|
901
|
-
def jobsbakpath(self):
|
|
902
|
-
"""Return the directory in which results can be stored for this experiment"""
|
|
903
|
-
return self.workdir / "jobs.bak"
|
|
904
|
-
|
|
905
|
-
def stop(self):
|
|
906
|
-
"""Stop the experiment as soon as possible"""
|
|
907
|
-
|
|
908
|
-
async def doStop():
|
|
909
|
-
assert self.central is not None
|
|
910
|
-
async with self.central.exitCondition:
|
|
911
|
-
self.exitMode = True
|
|
912
|
-
logging.debug("Setting exit mode to true")
|
|
913
|
-
self.central.exitCondition.notify_all()
|
|
914
|
-
|
|
915
|
-
assert self.central is not None and self.central.loop is not None
|
|
916
|
-
asyncio.run_coroutine_threadsafe(doStop(), self.central.loop)
|
|
917
|
-
|
|
918
|
-
def wait(self):
|
|
919
|
-
"""Wait until the running processes have finished"""
|
|
920
|
-
|
|
921
|
-
async def awaitcompletion():
|
|
922
|
-
assert self.central is not None
|
|
923
|
-
logger.debug("Waiting to exit scheduler...")
|
|
924
|
-
async with self.central.exitCondition:
|
|
925
|
-
while True:
|
|
926
|
-
if self.exitMode:
|
|
927
|
-
break
|
|
928
|
-
|
|
929
|
-
# If we have still unfinished jobs or possible new tasks, wait
|
|
930
|
-
logger.debug(
|
|
931
|
-
"Checking exit condition: unfinished jobs=%d, task output queue size=%d",
|
|
932
|
-
self.unfinishedJobs,
|
|
933
|
-
self.taskOutputQueueSize,
|
|
934
|
-
)
|
|
935
|
-
if self.unfinishedJobs == 0 and self.taskOutputQueueSize == 0:
|
|
936
|
-
break
|
|
937
|
-
|
|
938
|
-
# Wait for more news...
|
|
939
|
-
await self.central.exitCondition.wait()
|
|
940
|
-
|
|
941
|
-
if self.failedJobs:
|
|
942
|
-
# Show some more information
|
|
943
|
-
count = 0
|
|
944
|
-
for job in self.failedJobs.values():
|
|
945
|
-
if job.failure_status != JobFailureStatus.DEPENDENCY:
|
|
946
|
-
count += 1
|
|
947
621
|
logger.error(
|
|
948
|
-
"
|
|
949
|
-
job.relpath,
|
|
950
|
-
job.stderr,
|
|
622
|
+
"No .done or .failed file found for job %s", job
|
|
951
623
|
)
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
future = asyncio.run_coroutine_threadsafe(awaitcompletion(), self.loop)
|
|
955
|
-
return future.result()
|
|
956
|
-
|
|
957
|
-
def setenv(self, name, value, override=True):
|
|
958
|
-
"""Shortcut to set the environment value"""
|
|
959
|
-
if override or name not in self.workspace.env:
|
|
960
|
-
logging.info("Setting environment: %s=%s", name, value)
|
|
961
|
-
self.workspace.env[name] = value
|
|
962
|
-
|
|
963
|
-
def token(self, name: str, count: int):
|
|
964
|
-
"""Returns a token for this experiment
|
|
965
|
-
|
|
966
|
-
The token is the default token of the workspace connector"""
|
|
967
|
-
return self.workspace.connector.createtoken(name, count)
|
|
968
|
-
|
|
969
|
-
def __enter__(self):
|
|
970
|
-
from .dynamic_outputs import TaskOutputsWorker
|
|
971
|
-
|
|
972
|
-
if self.workspace.run_mode != RunMode.DRY_RUN:
|
|
973
|
-
logger.info("Locking experiment %s", self.xplockpath)
|
|
974
|
-
self.xplock = self.workspace.connector.lock(self.xplockpath, 0).__enter__()
|
|
975
|
-
logger.info("Experiment locked")
|
|
976
|
-
|
|
977
|
-
# Move old jobs into "jobs.bak"
|
|
978
|
-
if self.workspace.run_mode == RunMode.NORMAL:
|
|
979
|
-
self.jobsbakpath.mkdir(exist_ok=True)
|
|
980
|
-
for p in self.jobspath.glob("*/*"):
|
|
981
|
-
if p.is_symlink():
|
|
982
|
-
target = self.jobsbakpath / p.relative_to(self.jobspath)
|
|
983
|
-
if target.is_symlink():
|
|
984
|
-
# Remove if duplicate
|
|
985
|
-
p.unlink()
|
|
986
|
-
else:
|
|
987
|
-
# Rename otherwise
|
|
988
|
-
target.parent.mkdir(parents=True, exist_ok=True)
|
|
989
|
-
p.rename(target)
|
|
990
|
-
|
|
991
|
-
if self.server:
|
|
992
|
-
self.server.start()
|
|
993
|
-
|
|
994
|
-
self.workspace.__enter__()
|
|
995
|
-
(self.workspace.path / ".__experimaestro__").touch()
|
|
996
|
-
|
|
997
|
-
global SIGNAL_HANDLER
|
|
998
|
-
# Number of unfinished jobs
|
|
999
|
-
self.unfinishedJobs = 0
|
|
1000
|
-
self.taskOutputQueueSize = 0
|
|
1001
|
-
|
|
1002
|
-
# List of failed jobs
|
|
1003
|
-
self.failedJobs: Dict[str, Job] = {}
|
|
1004
|
-
|
|
1005
|
-
# Exit mode when catching signals
|
|
1006
|
-
self.exitMode = False
|
|
1007
|
-
|
|
1008
|
-
self.central = SchedulerCentral.create(self.scheduler.name)
|
|
1009
|
-
self.taskOutputsWorker = TaskOutputsWorker(self)
|
|
1010
|
-
self.taskOutputsWorker.start()
|
|
1011
|
-
|
|
1012
|
-
SIGNAL_HANDLER.add(self)
|
|
1013
|
-
|
|
1014
|
-
self.old_experiment = experiment.CURRENT
|
|
1015
|
-
experiment.CURRENT = self
|
|
1016
|
-
return self
|
|
1017
|
-
|
|
1018
|
-
def __exit__(self, exc_type, exc_value, traceback):
|
|
1019
|
-
logger.debug("Exiting scheduler context")
|
|
1020
|
-
# If no exception and normal run mode, remove old "jobs"
|
|
1021
|
-
if self.workspace.run_mode == RunMode.NORMAL:
|
|
1022
|
-
if exc_type is None and self.jobsbakpath.is_dir():
|
|
1023
|
-
rmtree(self.jobsbakpath)
|
|
1024
|
-
|
|
1025
|
-
# Close the different locks
|
|
1026
|
-
try:
|
|
1027
|
-
if exc_type:
|
|
1028
|
-
# import faulthandler
|
|
1029
|
-
# faulthandler.dump_traceback()
|
|
1030
|
-
logger.error(
|
|
1031
|
-
"Not waiting since an exception was thrown"
|
|
1032
|
-
" (some jobs may be running)"
|
|
1033
|
-
)
|
|
1034
|
-
else:
|
|
1035
|
-
self.wait()
|
|
1036
|
-
finally:
|
|
1037
|
-
SIGNAL_HANDLER.remove(self)
|
|
1038
|
-
|
|
1039
|
-
# Stop services
|
|
1040
|
-
for service in self.services.values():
|
|
1041
|
-
logger.info("Closing service %s", service.description())
|
|
1042
|
-
service.stop()
|
|
1043
|
-
|
|
1044
|
-
if self.central is not None:
|
|
1045
|
-
logger.info("Stopping scheduler event loop")
|
|
1046
|
-
self.central.loop.stop()
|
|
1047
|
-
|
|
1048
|
-
if self.taskOutputsWorker is not None:
|
|
1049
|
-
logger.info("Stopping tasks outputs worker")
|
|
1050
|
-
self.taskOutputsWorker.queue.put(None)
|
|
1051
|
-
|
|
1052
|
-
self.central = None
|
|
1053
|
-
self.workspace.__exit__(exc_type, exc_value, traceback)
|
|
1054
|
-
if self.xplock:
|
|
1055
|
-
self.xplock.__exit__(exc_type, exc_value, traceback)
|
|
1056
|
-
|
|
1057
|
-
# Put back old experiment as current one
|
|
1058
|
-
experiment.CURRENT = self.old_experiment
|
|
1059
|
-
if self.server:
|
|
1060
|
-
logger.info("Stopping web server")
|
|
1061
|
-
self.server.stop()
|
|
1062
|
-
|
|
1063
|
-
if self.workspace.run_mode == RunMode.NORMAL:
|
|
1064
|
-
# Write the state
|
|
1065
|
-
logging.info("Saving the experiment state")
|
|
1066
|
-
from experimaestro.scheduler.state import ExperimentState
|
|
1067
|
-
|
|
1068
|
-
ExperimentState.save(
|
|
1069
|
-
self.workdir / "state.json", self.scheduler.jobs.values()
|
|
1070
|
-
)
|
|
1071
|
-
|
|
1072
|
-
async def update_task_output_count(self, delta: int):
|
|
1073
|
-
"""Change in the number of task outputs to process"""
|
|
1074
|
-
async with self.central.exitCondition:
|
|
1075
|
-
self.taskOutputQueueSize += delta
|
|
1076
|
-
logging.debug(
|
|
1077
|
-
"Updating queue size with %d => %d", delta, self.taskOutputQueueSize
|
|
1078
|
-
)
|
|
1079
|
-
if self.taskOutputQueueSize == 0:
|
|
1080
|
-
self.central.exitCondition.notify_all()
|
|
1081
|
-
|
|
1082
|
-
def watch_output(self, watched: "WatchedOutput"):
|
|
1083
|
-
"""Watch an output
|
|
1084
|
-
|
|
1085
|
-
:param watched: The watched output specification
|
|
1086
|
-
"""
|
|
1087
|
-
|
|
1088
|
-
self.taskOutputsWorker.watch_output(watched)
|
|
1089
|
-
|
|
1090
|
-
def add_service(self, service: ServiceClass) -> ServiceClass:
|
|
1091
|
-
"""Adds a service (e.g. tensorboard viewer) to the experiment
|
|
1092
|
-
|
|
1093
|
-
:param service: A service instance
|
|
1094
|
-
:return: The same service instance
|
|
1095
|
-
"""
|
|
1096
|
-
self.services[service.id] = service
|
|
1097
|
-
for listener in self.scheduler.listeners:
|
|
1098
|
-
listener.service_add(service)
|
|
1099
|
-
return service
|
|
624
|
+
state = JobState.ERROR
|
|
1100
625
|
|
|
1101
|
-
|
|
1102
|
-
|
|
626
|
+
except JobError:
|
|
627
|
+
logger.warning("Error while running job")
|
|
628
|
+
state = JobState.ERROR
|
|
1103
629
|
|
|
1104
|
-
|
|
1105
|
-
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
|
|
1109
|
-
|
|
1110
|
-
|
|
1111
|
-
|
|
1112
|
-
|
|
1113
|
-
|
|
1114
|
-
|
|
1115
|
-
|
|
1116
|
-
|
|
1117
|
-
|
|
1118
|
-
|
|
1119
|
-
|
|
1120
|
-
|
|
1121
|
-
|
|
1122
|
-
|
|
1123
|
-
|
|
1124
|
-
|
|
1125
|
-
|
|
1126
|
-
|
|
630
|
+
except Exception:
|
|
631
|
+
logger.warning(
|
|
632
|
+
"Error while running job (in experimaestro)", exc_info=True
|
|
633
|
+
)
|
|
634
|
+
state = JobState.ERROR
|
|
635
|
+
|
|
636
|
+
# Locks are released here after job completes
|
|
637
|
+
|
|
638
|
+
# Check if we should restart a resumable task that timed out
|
|
639
|
+
from experimaestro.scheduler.jobs import JobStateError
|
|
640
|
+
|
|
641
|
+
if (
|
|
642
|
+
isinstance(state, JobStateError)
|
|
643
|
+
and state.failure_reason == JobFailureStatus.TIMEOUT
|
|
644
|
+
and job.resumable
|
|
645
|
+
):
|
|
646
|
+
job.retry_count += 1
|
|
647
|
+
if job.retry_count <= job.max_retries:
|
|
648
|
+
logger.info(
|
|
649
|
+
"Resumable task %s timed out - restarting (attempt %d/%d)",
|
|
650
|
+
job,
|
|
651
|
+
job.retry_count,
|
|
652
|
+
job.max_retries,
|
|
653
|
+
)
|
|
654
|
+
# Rotate log files to preserve previous run's logs
|
|
655
|
+
job.rotate_logs()
|
|
656
|
+
# Clear cached process so aio_run() will create a new one
|
|
657
|
+
job._process = None
|
|
658
|
+
# Delete PID file so the job will be resubmitted
|
|
659
|
+
if job.pidpath.exists():
|
|
660
|
+
job.pidpath.unlink()
|
|
661
|
+
# Continue the loop to restart
|
|
662
|
+
continue
|
|
663
|
+
else:
|
|
664
|
+
logger.warning(
|
|
665
|
+
"Resumable task %s exceeded max retries (%d), marking as failed",
|
|
666
|
+
job,
|
|
667
|
+
job.max_retries,
|
|
668
|
+
)
|
|
669
|
+
# Fall through to return the error state
|
|
1127
670
|
|
|
1128
|
-
|
|
1129
|
-
|
|
671
|
+
# Job finished (success or non-recoverable error)
|
|
672
|
+
# Notify scheduler listeners of job state after job completes
|
|
673
|
+
self.notify_job_state(job)
|
|
674
|
+
return state
|