experimaestro 2.0.0a8__py3-none-any.whl → 2.0.0b8__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of experimaestro might be problematic.
- experimaestro/__init__.py +10 -11
- experimaestro/annotations.py +167 -206
- experimaestro/cli/__init__.py +278 -7
- experimaestro/cli/filter.py +42 -74
- experimaestro/cli/jobs.py +157 -106
- experimaestro/cli/refactor.py +249 -0
- experimaestro/click.py +0 -1
- experimaestro/commandline.py +19 -3
- experimaestro/connectors/__init__.py +20 -1
- experimaestro/connectors/local.py +12 -0
- experimaestro/core/arguments.py +182 -46
- experimaestro/core/identifier.py +107 -6
- experimaestro/core/objects/__init__.py +6 -0
- experimaestro/core/objects/config.py +542 -25
- experimaestro/core/objects/config_walk.py +20 -0
- experimaestro/core/serialization.py +91 -34
- experimaestro/core/subparameters.py +164 -0
- experimaestro/core/types.py +175 -38
- experimaestro/exceptions.py +26 -0
- experimaestro/experiments/cli.py +111 -25
- experimaestro/generators.py +50 -9
- experimaestro/huggingface.py +3 -1
- experimaestro/launcherfinder/parser.py +29 -0
- experimaestro/launchers/__init__.py +26 -1
- experimaestro/launchers/direct.py +12 -0
- experimaestro/launchers/slurm/base.py +154 -2
- experimaestro/mkdocs/metaloader.py +0 -1
- experimaestro/mypy.py +452 -7
- experimaestro/notifications.py +63 -13
- experimaestro/progress.py +0 -2
- experimaestro/rpyc.py +0 -1
- experimaestro/run.py +19 -6
- experimaestro/scheduler/base.py +510 -125
- experimaestro/scheduler/dependencies.py +43 -28
- experimaestro/scheduler/dynamic_outputs.py +259 -130
- experimaestro/scheduler/experiment.py +256 -31
- experimaestro/scheduler/interfaces.py +501 -0
- experimaestro/scheduler/jobs.py +216 -206
- experimaestro/scheduler/remote/__init__.py +31 -0
- experimaestro/scheduler/remote/client.py +874 -0
- experimaestro/scheduler/remote/protocol.py +467 -0
- experimaestro/scheduler/remote/server.py +423 -0
- experimaestro/scheduler/remote/sync.py +144 -0
- experimaestro/scheduler/services.py +323 -23
- experimaestro/scheduler/state_db.py +437 -0
- experimaestro/scheduler/state_provider.py +2766 -0
- experimaestro/scheduler/state_sync.py +891 -0
- experimaestro/scheduler/workspace.py +52 -10
- experimaestro/scriptbuilder.py +7 -0
- experimaestro/server/__init__.py +147 -57
- experimaestro/server/data/index.css +0 -125
- experimaestro/server/data/index.css.map +1 -1
- experimaestro/server/data/index.js +194 -58
- experimaestro/server/data/index.js.map +1 -1
- experimaestro/settings.py +44 -5
- experimaestro/sphinx/__init__.py +3 -3
- experimaestro/taskglobals.py +20 -0
- experimaestro/tests/conftest.py +80 -0
- experimaestro/tests/core/test_generics.py +2 -2
- experimaestro/tests/identifier_stability.json +45 -0
- experimaestro/tests/launchers/bin/sacct +6 -2
- experimaestro/tests/launchers/bin/sbatch +4 -2
- experimaestro/tests/launchers/test_slurm.py +80 -0
- experimaestro/tests/tasks/test_dynamic.py +231 -0
- experimaestro/tests/test_cli_jobs.py +615 -0
- experimaestro/tests/test_deprecated.py +630 -0
- experimaestro/tests/test_environment.py +200 -0
- experimaestro/tests/test_file_progress_integration.py +1 -1
- experimaestro/tests/test_forward.py +3 -3
- experimaestro/tests/test_identifier.py +372 -41
- experimaestro/tests/test_identifier_stability.py +458 -0
- experimaestro/tests/test_instance.py +3 -3
- experimaestro/tests/test_multitoken.py +442 -0
- experimaestro/tests/test_mypy.py +433 -0
- experimaestro/tests/test_objects.py +312 -5
- experimaestro/tests/test_outputs.py +2 -2
- experimaestro/tests/test_param.py +8 -12
- experimaestro/tests/test_partial_paths.py +231 -0
- experimaestro/tests/test_progress.py +0 -48
- experimaestro/tests/test_remote_state.py +671 -0
- experimaestro/tests/test_resumable_task.py +480 -0
- experimaestro/tests/test_serializers.py +141 -1
- experimaestro/tests/test_state_db.py +434 -0
- experimaestro/tests/test_subparameters.py +160 -0
- experimaestro/tests/test_tags.py +136 -0
- experimaestro/tests/test_tasks.py +107 -121
- experimaestro/tests/test_token_locking.py +252 -0
- experimaestro/tests/test_tokens.py +17 -13
- experimaestro/tests/test_types.py +123 -1
- experimaestro/tests/test_workspace_triggers.py +158 -0
- experimaestro/tests/token_reschedule.py +4 -2
- experimaestro/tests/utils.py +2 -2
- experimaestro/tokens.py +154 -57
- experimaestro/tools/diff.py +1 -1
- experimaestro/tui/__init__.py +8 -0
- experimaestro/tui/app.py +2395 -0
- experimaestro/tui/app.tcss +353 -0
- experimaestro/tui/log_viewer.py +228 -0
- experimaestro/utils/__init__.py +23 -0
- experimaestro/utils/environment.py +148 -0
- experimaestro/utils/git.py +129 -0
- experimaestro/utils/resources.py +1 -1
- experimaestro/version.py +34 -0
- {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/METADATA +68 -38
- experimaestro-2.0.0b8.dist-info/RECORD +187 -0
- {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/WHEEL +1 -1
- experimaestro-2.0.0b8.dist-info/entry_points.txt +16 -0
- experimaestro/compat.py +0 -6
- experimaestro/core/objects.pyi +0 -221
- experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
- experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
- experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
- experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
- experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
- experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
- experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
- experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
- experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
- experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
- experimaestro-2.0.0a8.dist-info/RECORD +0 -166
- experimaestro-2.0.0a8.dist-info/entry_points.txt +0 -17
- {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/licenses/LICENSE +0 -0
experimaestro/scheduler/jobs.py
CHANGED
@@ -1,7 +1,6 @@
 import asyncio
-import time
 from collections import ChainMap
-import
+from datetime import datetime
 from functools import cached_property
 import itertools
 from pathlib import Path
@@ -13,60 +12,42 @@ from experimaestro.core.objects import Config, ConfigWalkContext, WatchedOutput
 from experimaestro.notifications import LevelInformation, Reporter

 # from experimaestro.scheduler.base import Scheduler
-from experimaestro.scheduler.dependencies import Dependency,
+from experimaestro.scheduler.dependencies import Dependency, Resource
 from experimaestro.scheduler.workspace import RunMode, Workspace
-from experimaestro.
+from experimaestro.scheduler.interfaces import (
+    BaseJob,
+    JobState,
+    JobStateUnscheduled,
+    JobStateWaiting,
+    JobStateReady,
+    JobStateScheduled,
+    JobStateRunning,
+    JobStateDone,
+    JobStateError,
+    JobFailureStatus,
+)
+from experimaestro.locking import Lock
 from experimaestro.utils import logger

 if TYPE_CHECKING:
     from experimaestro.connectors import Process
     from experimaestro.launchers import Launcher
+    from experimaestro.scheduler.experiment import experiment


-
-
-
-
-
-
-
-
-
-
-
-
-    # Job is running
-    RUNNING = 4
-
-    # Job is done (finished)
-    DONE = 5
-
-    # Job failed (finished)
-    ERROR = 6
-
-    def notstarted(self):
-        return self.value <= JobState.READY.value
-
-    def running(self):
-        return (
-            self.value == JobState.RUNNING.value
-            or self.value == JobState.SCHEDULED.value
-        )
-
-    def finished(self):
-        return self.value >= JobState.DONE.value
-
-
-class JobFailureStatus(enum.Enum):
-    #: Job failed
-    DEPENDENCY = 0
-
-    #: Job dependency failed
-    FAILED = 1
-
-    #: Memory
-    MEMORY = 2
+# Re-export JobState for backward compatibility
+__all__ = [
+    "JobState",
+    "JobStateUnscheduled",
+    "JobStateWaiting",
+    "JobStateReady",
+    "JobStateScheduled",
+    "JobStateRunning",
+    "JobStateDone",
+    "JobStateError",
+    "JobFailureStatus",
+    "Job",
+]


 class JobLock(Lock):
@@ -85,22 +66,43 @@ class JobDependency(Dependency):
     def __init__(self, job):
         super().__init__(job)

-    def
-
-            return DependencyStatus.OK
-        elif self.origin.state == JobState.ERROR:
-            return DependencyStatus.FAIL
-        return DependencyStatus.WAIT
+    async def aio_lock(self, timeout: float = 0):
+        """Acquire lock on job dependency by waiting for job to complete

-
-
+        Args:
+            timeout: Must be 0 (wait indefinitely) for job dependencies

+        Raises:
+            ValueError: If timeout is not 0
+            RuntimeError: If the job has not been submitted or if it failed
+        """
+        if timeout != 0:
+            raise ValueError(
+                "Job dependencies only support timeout=0 (wait indefinitely)"
+            )

-class Job(Resource):
+        # Wait for the job to finish
+        if self.origin._future is None:
+            raise RuntimeError(f"Job {self.origin} has no future - not submitted")
+        await asyncio.wrap_future(self.origin._future)
+
+        # Check if the job succeeded
+        if self.origin.state != JobState.DONE:
+            raise RuntimeError(
+                f"Dependency job {self.origin.identifier} failed with state "
+                f"{self.origin.state} for {self.target.identifier}"
+            )
+
+        # Job succeeded, acquire and return the lock
+        lock = JobLock(self.origin)
+        lock.acquire()
+        return lock
+
+
+class Job(BaseJob, Resource):
     """A job is a resource that is produced by the execution of some code"""

     # Set by the scheduler
-    _readyEvent: Optional[asyncio.Event]
     _future: Optional["concurrent.futures.Future"]

     def __init__(
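The new JobDependency.aio_lock bridges the scheduler's concurrent.futures.Future into the asyncio event loop with asyncio.wrap_future. Below is a minimal standalone sketch of that bridging pattern; the thread-pool job is an illustrative stand-in, not experimaestro code.

import asyncio
import concurrent.futures


def blocking_job() -> int:
    # Stand-in for work that completes on another thread (e.g. a scheduled job)
    return 42


async def wait_for_job() -> None:
    with concurrent.futures.ThreadPoolExecutor() as pool:
        future = pool.submit(blocking_job)          # a concurrent.futures.Future
        result = await asyncio.wrap_future(future)  # awaitable without blocking the loop
        print("job finished with", result)


asyncio.run(wait_for_job())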
@@ -110,6 +112,7 @@ class Job(Resource):
         workspace: Workspace = None,
         launcher: "Launcher" = None,
         run_mode: RunMode = RunMode.NORMAL,
+        max_retries: Optional[int] = None,
     ):
         from experimaestro.scheduler.base import Scheduler

@@ -128,47 +131,74 @@ class Job(Resource):
         self.name = str(self.type.identifier).rsplit(".", 1)[-1]

         self.scheduler: Optional["Scheduler"] = None
+        self.experiments: List["experiment"] = []  # Experiments this job belongs to
         self.config = config
         self.state: JobState = JobState.UNSCHEDULED

-        #: If a job has failed, indicates the failure status
-        self.failure_status: JobFailureStatus = None
-
         # Dependencies
         self.dependencies: Set[Dependency] = set()  # as target

-        #
-
-
-
+        # Check if this is a resumable task
+        from experimaestro.core.objects import ResumableTask
+
+        self.resumable = isinstance(config, ResumableTask)
+
+        # Retry configuration for resumable tasks
+        # Use workspace setting if max_retries is not specified
+        if max_retries is None and self.workspace:
+            max_retries = self.workspace.workspace_settings.max_retries
+        self.max_retries = max_retries if max_retries is not None else 3
+        self.retry_count = 0
+
+        # Watched outputs (stored for deferred registration with scheduler)
+        self.watched_outputs: List["WatchedOutput"] = list(
+            config.__xpm__.watched_outputs
+        )

         # Process
         self._process = None
-        self.unsatisfied = 0

         # Meta-information
         self.starttime: Optional[float] = None
         self.submittime: Optional[float] = None
         self.endtime: Optional[float] = None
+        self.exit_code: Optional[int] = None
         self._progress: List[LevelInformation] = []
         self.tags = config.tags()

     def watch_output(self, watched: "WatchedOutput"):
-        """
+        """Add a watched output to this job.

         :param watched: A description of the watched output
         """
-        self.
+        self.watched_outputs.append(watched)
+
+    def register_watched_outputs(self):
+        """Register all watched outputs with the scheduler.
+
+        This should be called after the job is submitted and has a scheduler.
+        """
+        from experimaestro.scheduler.experiment import experiment

-
-
-
-
+        xp = experiment.current()
+        for watched in self.watched_outputs:
+            # Set the job reference so the watcher knows where to look
+            watched.job = self
+            xp.watch_output(watched)

     def done_handler(self):
-        """The task has been completed
-
-
+        """The task has been completed.
+
+        Ensures all remaining task output events are processed by explicitly
+        reading the task outputs file. This is necessary because file system
+        watchers may have latency, and we need to process all outputs before
+        the experiment can exit.
+        """
+        if not self.watched_outputs:
+            return
+
+        for xp in self.experiments:
+            xp.taskOutputsWorker.process_job_outputs(self)

     def __str__(self):
         return "Job[{}]".format(self.identifier)
@@ -177,6 +207,57 @@ class Job(Resource):
         assert self._future, "Cannot wait a not submitted job"
         return self._future.result()

+    def set_state(self, new_state: JobState):
+        """Set the job state and update experiment statistics
+
+        This method should be called instead of direct state assignment
+        to ensure experiment statistics (unfinishedJobs, failedJobs) are
+        properly updated.
+
+        :param new_state: The new job state
+        """
+        old_state = self.state
+        self.state = new_state
+
+        # Helper to determine if a state should be "counted" in unfinishedJobs
+        # A job is counted when it's been submitted and hasn't finished yet
+        def is_counted(state):
+            return state != JobState.UNSCHEDULED and not state.finished()
+
+        # Update experiment statistics based on state transition
+        for xp in self.experiments:
+            # Handle transitions in/out of "counted" state
+            if is_counted(new_state) and not is_counted(old_state):
+                # Job is now being tracked (new submission or resubmit)
+                xp.unfinishedJobs += 1
+                logger.debug(
+                    "Job %s submitted, unfinished jobs for %s: %d",
+                    self.identifier[:8],
+                    xp.workdir.name,
+                    xp.unfinishedJobs,
+                )
+            elif not is_counted(new_state) and is_counted(old_state):
+                # Job is no longer being tracked (finished)
+                xp.unfinishedJobs -= 1
+                logger.debug(
+                    "Job %s finished, unfinished jobs for %s: %d",
+                    self.identifier[:8],
+                    xp.workdir.name,
+                    xp.unfinishedJobs,
+                )
+
+            # Handle error state
+            if new_state.is_error() and not old_state.is_error():
+                xp.failedJobs[self.identifier] = self
+
+            # Handle recovery from error (e.g., resubmit)
+            if old_state.is_error() and not new_state.is_error():
+                xp.failedJobs.pop(self.identifier, None)
+
+        # Notify listeners via scheduler's thread-safe mechanism
+        if self.scheduler:
+            self.scheduler.notify_job_state(self)
+
     @cached_property
     def python_path(self) -> Iterator[str]:
         """Returns an iterator over python path"""
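The transition rules in set_state are easy to check in isolation. The sketch below mirrors the same bookkeeping with stand-in classes; State and ExperimentStats are illustrative only and are not experimaestro's JobState or experiment objects.

from enum import Enum


class State(Enum):
    # Simplified stand-in for experimaestro's JobState hierarchy
    UNSCHEDULED = 0
    READY = 2
    RUNNING = 4
    DONE = 5
    ERROR = 6

    def finished(self) -> bool:
        return self.value >= State.DONE.value

    def is_error(self) -> bool:
        return self is State.ERROR


def is_counted(state: State) -> bool:
    # A job counts as "unfinished" once submitted and until it finishes
    return state is not State.UNSCHEDULED and not state.finished()


class ExperimentStats:
    def __init__(self) -> None:
        self.unfinished_jobs = 0
        self.failed_jobs: dict[str, State] = {}

    def on_transition(self, job_id: str, old: State, new: State) -> None:
        # Mirror of the transition rules in Job.set_state above
        if is_counted(new) and not is_counted(old):
            self.unfinished_jobs += 1  # newly submitted (or resubmitted)
        elif not is_counted(new) and is_counted(old):
            self.unfinished_jobs -= 1  # just finished
        if new.is_error() and not old.is_error():
            self.failed_jobs[job_id] = new  # record the failure
        if old.is_error() and not new.is_error():
            self.failed_jobs.pop(job_id, None)  # recovered (e.g. resubmitted)


stats = ExperimentStats()
stats.on_transition("job-a", State.UNSCHEDULED, State.RUNNING)
stats.on_transition("job-a", State.RUNNING, State.ERROR)
stats.on_transition("job-a", State.ERROR, State.READY)  # resubmit clears the failure
stats.on_transition("job-a", State.READY, State.DONE)
assert stats.unfinished_jobs == 0 and not stats.failed_jobs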
@@ -220,8 +301,8 @@ class Job(Resource):
         self._progress[-1].desc = desc
         self._progress[-1].progress = value

-
-
+        # Notify listeners via scheduler's thread-safe mechanism
+        self.scheduler.notify_job_state(self)

     def add_notification_server(self, server):
         """Adds a notification server"""
@@ -271,6 +352,16 @@ class Job(Resource):
     def identifier(self):
         return self.config.__xpm__.identifier.all.hex()

+    @property
+    def task_id(self) -> str:
+        """Task class identifier (for BaseJob interface)"""
+        return str(self.type.identifier)
+
+    @property
+    def locator(self) -> str:
+        """Full task locator (for BaseJob interface)"""
+        return self.identifier
+
     def prepare(self, overwrite=False):
         """Prepare all files before starting a task

@@ -278,113 +369,12 @@ class Job(Resource):
         """
         pass

-    async def
-        """
-
-        This method contains the core logic for starting a job that was previously
-        located in Scheduler.aio_start(). It handles job locking, dependency
-        acquisition, directory setup, and job execution while using the scheduler's
-        coordination lock to prevent race conditions between multiple jobs.
-
-        :param sched_dependency_lock: The scheduler's dependency lock for coordination
-            between jobs to prevent race conditions during dependency acquisition
-        :param notification_server: Optional notification server from the experiment
-            for job progress reporting
-        :return: JobState.DONE if job completed successfully, JobState.ERROR if job
-            failed during execution, or None if dependencies couldn't be locked
-            (signals WAITING state to scheduler)
-        :raises Exception: Various exceptions during job execution, dependency locking,
-            or process creation
-        """
-        # We first lock the job before proceeding
-        assert self.launcher is not None
-
-        with Locks() as locks:
-            logger.debug("[starting] Locking job %s", self)
-            async with self.launcher.connector.lock(self.lockpath):
-                logger.debug("[starting] Locked job %s", self)
-
-                state = None
-                try:
-                    logger.debug(
-                        "Starting job %s with %d dependencies",
-                        self,
-                        len(self.dependencies),
-                    )
-
-                    # Individual dependency lock acquisition
-                    # We use the scheduler-wide lock to avoid cross-jobs race conditions
-                    async with sched_dependency_lock:
-                        for dependency in self.dependencies:
-                            try:
-                                locks.append(dependency.lock().acquire())
-                            except LockError:
-                                logger.warning(
-                                    "Could not lock %s, aborting start for job %s",
-                                    dependency,
-                                    self,
-                                )
-                                dependency.check()
-                                return None  # Signal to scheduler that dependencies couldn't be locked
-
-                    # Dependencies have been locked, we can start the job
-                    self.starttime = time.time()
-
-                    # Creates the main directory
-                    directory = self.path
-                    logger.debug("Making directories job %s...", directory)
-                    if not directory.is_dir():
-                        directory.mkdir(parents=True, exist_ok=True)
-
-                    # Sets up the notification URL
-                    if notification_server is not None:
-                        self.add_notification_server(notification_server)
-
-                except Exception:
-                    logger.warning("Error while locking job", exc_info=True)
-                    return None  # Signal waiting state to scheduler
-
-                try:
-                    # Runs the job
-                    process = await self.aio_run()
-                except Exception:
-                    logger.warning("Error while starting job", exc_info=True)
-                    return JobState.ERROR
-
-                try:
-                    if isinstance(process, JobState):
-                        state = process
-                        logger.debug("Job %s ended (state %s)", self, state)
-                    else:
-                        logger.debug("Waiting for job %s process to end", self)
-
-                        code = await process.aio_code()
-                        logger.debug("Got return code %s for %s", code, self)
-
-                        # Check the file if there is no return code
-                        if code is None:
-                            # Case where we cannot retrieve the code right away
-                            if self.donepath.is_file():
-                                code = 0
-                            else:
-                                code = int(self.failedpath.read_text())
-
-                        logger.debug("Job %s ended with code %s", self, code)
-                        state = JobState.DONE if code == 0 else JobState.ERROR
-
-                except JobError:
-                    logger.warning("Error while running job")
-                    state = JobState.ERROR
-
-                except Exception:
-                    logger.warning(
-                        "Error while running job (in experimaestro)", exc_info=True
-                    )
-                    state = JobState.ERROR
-                return state
+    async def aio_run(self) -> "Process":
+        """Actually run the code

-
-
+        Returns:
+            A Process instance representing the running job
+        """
         raise NotImplementedError(f"Method aio_run not implemented in {self.__class__}")

     async def aio_process(self) -> Optional["Process"]:
@@ -420,33 +410,28 @@ class Job(Resource):
     def stderr(self) -> Path:
         return self.jobpath / ("%s.err" % self.name)

+    def rotate_logs(self) -> None:
+        """Rotate log files before restarting a task.
+
+        Renames non-empty stdout and stderr files with a timestamp suffix
+        (e.g., job.20231215143022.out) to preserve logs from previous runs.
+        """
+        timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
+
+        for log_path in [self.stdout, self.stderr]:
+            if log_path.exists() and log_path.stat().st_size > 0:
+                # Extract extension (.out or .err)
+                ext = log_path.suffix
+                # Create new name with timestamp before extension
+                new_name = f"{log_path.stem}.{timestamp}{ext}"
+                new_path = log_path.parent / new_name
+                logger.info("Rotating log file %s -> %s", log_path.name, new_name)
+                log_path.rename(new_path)
+
     @property
     def basepath(self) -> Path:
         return self.jobpath / self.name

-    def dependencychanged(self, dependency, oldstatus, status):
-        """Called when a dependency has changed"""
-
-        def value(s):
-            return 1 if s == DependencyStatus.OK else 0
-
-        self.unsatisfied -= value(status) - value(oldstatus)
-
-        logger.debug("Job %s: unsatisfied %d", self, self.unsatisfied)
-
-        if status == DependencyStatus.FAIL:
-            # Job completed
-            if not self.state.finished():
-                self.state = JobState.ERROR
-                self.failure_status = JobFailureStatus.DEPENDENCY
-                self._readyEvent.set()
-
-        if self.unsatisfied == 0:
-            logger.info("Job %s is ready to run", self)
-            # We are ready
-            self.state = JobState.READY
-            self._readyEvent.set()
-
     def finalState(self) -> "concurrent.futures.Future[JobState]":
         assert self._future is not None
         return self._future
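The rotated name produced by rotate_logs keeps the original stem and extension and places the timestamp between them, as in the job.20231215143022.out example from its docstring. A small sketch of that naming rule, using a hypothetical task.out path:

from datetime import datetime
from pathlib import Path

log = Path("task.out")  # hypothetical log file
stamp = datetime(2023, 12, 15, 14, 30, 22).strftime("%Y%m%d%H%M%S")
rotated = log.with_name(f"{log.stem}.{stamp}{log.suffix}")
assert rotated.name == "task.20231215143022.out"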
@@ -469,6 +454,31 @@ class JobContext(ConfigWalkContext):
     def task(self):
         return self.job.config

+    def partial_path(self, subparameters, config) -> Path:
+        """Returns the partial directory path for a given subparameters instance.
+
+        The partial path structure is:
+            WORKSPACE/partials/TASK_ID/SUBPARAM_NAME/PARTIAL_ID/
+
+        Args:
+            subparameters: The Subparameters instance defining which groups to exclude
+            config: The configuration to compute the partial identifier for
+
+        Returns:
+            The partial directory path.
+        """
+        # Compute partial identifier
+        partial_id = config.__xpm__.get_partial_identifier(subparameters)
+
+        # Build partial directory path
+        task_id = str(config.__xpmtype__.identifier)
+        return (
+            self.job.workspace.partialspath
+            / task_id
+            / subparameters.name
+            / partial_id.all.hex()
+        )
+

 class JobError(Exception):
     def __init__(self, code):
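partial_path composes the directory from the workspace partials root, the task identifier, the subparameters group name, and the hex form of the partial identifier. The sketch below shows the resulting layout with made-up values; none of these identifiers come from a real experimaestro run.

from pathlib import Path

# All values below are illustrative; real ones come from the workspace and config
partials_root = Path("/data/workspace/partials")
task_id = "my.project.TrainModel"  # str(config.__xpmtype__.identifier)
subparam_name = "optimizer"        # subparameters.name
partial_id_hex = "9f2c41ab"        # partial_id.all.hex() (shortened here)

partial_dir = partials_root / task_id / subparam_name / partial_id_hex
print(partial_dir)  # /data/workspace/partials/my.project.TrainModel/optimizer/9f2c41ab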
experimaestro/scheduler/remote/__init__.py
ADDED
@@ -0,0 +1,31 @@
+"""Remote monitoring support for experimaestro
+
+This package provides SSH-based remote monitoring capabilities for experiments.
+
+Main components:
+- SSHStateProviderServer: JSON-RPC server that wraps WorkspaceStateProvider
+- SSHStateProviderClient: Client that connects via SSH and implements StateProvider interface
+- RemoteFileSynchronizer: Rsync-based file synchronization
+
+Usage:
+    # On remote host (run via SSH):
+    from experimaestro.scheduler.remote.server import SSHStateProviderServer
+    server = SSHStateProviderServer(workspace_path)
+    server.start()
+
+    # On local host:
+    from experimaestro.scheduler.remote.client import SSHStateProviderClient
+    client = SSHStateProviderClient(host="server", remote_workspace="/path")
+    client.connect()
+    experiments = client.get_experiments()
+"""
+
+from experimaestro.scheduler.remote.server import SSHStateProviderServer
+from experimaestro.scheduler.remote.client import SSHStateProviderClient
+from experimaestro.scheduler.remote.sync import RemoteFileSynchronizer
+
+__all__ = [
+    "SSHStateProviderServer",
+    "SSHStateProviderClient",
+    "RemoteFileSynchronizer",
+]
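The module docstring above already shows the client-side flow; the sketch below only expands it into a runnable form. The host name and workspace path are placeholders, and nothing beyond connect() and get_experiments() should be read as confirmed API.

from experimaestro.scheduler.remote.client import SSHStateProviderClient

client = SSHStateProviderClient(
    host="compute.example.org",               # placeholder SSH host
    remote_workspace="/scratch/me/xpm-workspace",  # placeholder remote workspace path
)
client.connect()

for xp in client.get_experiments():
    # Inspect whatever experiment records the remote state provider returns
    print(xp)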