experimaestro 1.11.1__py3-none-any.whl → 2.0.0b4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of experimaestro might be problematic. Click here for more details.
- experimaestro/__init__.py +10 -11
- experimaestro/annotations.py +167 -206
- experimaestro/cli/__init__.py +140 -16
- experimaestro/cli/filter.py +42 -74
- experimaestro/cli/jobs.py +157 -106
- experimaestro/cli/progress.py +269 -0
- experimaestro/cli/refactor.py +249 -0
- experimaestro/click.py +0 -1
- experimaestro/commandline.py +19 -3
- experimaestro/connectors/__init__.py +22 -3
- experimaestro/connectors/local.py +12 -0
- experimaestro/core/arguments.py +192 -37
- experimaestro/core/identifier.py +127 -12
- experimaestro/core/objects/__init__.py +6 -0
- experimaestro/core/objects/config.py +702 -285
- experimaestro/core/objects/config_walk.py +24 -6
- experimaestro/core/serialization.py +91 -34
- experimaestro/core/serializers.py +1 -8
- experimaestro/core/subparameters.py +164 -0
- experimaestro/core/types.py +198 -83
- experimaestro/exceptions.py +26 -0
- experimaestro/experiments/cli.py +107 -25
- experimaestro/generators.py +50 -9
- experimaestro/huggingface.py +3 -1
- experimaestro/launcherfinder/parser.py +29 -0
- experimaestro/launcherfinder/registry.py +3 -3
- experimaestro/launchers/__init__.py +26 -1
- experimaestro/launchers/direct.py +12 -0
- experimaestro/launchers/slurm/base.py +154 -2
- experimaestro/mkdocs/base.py +6 -8
- experimaestro/mkdocs/metaloader.py +0 -1
- experimaestro/mypy.py +452 -7
- experimaestro/notifications.py +75 -16
- experimaestro/progress.py +404 -0
- experimaestro/rpyc.py +0 -1
- experimaestro/run.py +19 -6
- experimaestro/scheduler/__init__.py +18 -1
- experimaestro/scheduler/base.py +504 -959
- experimaestro/scheduler/dependencies.py +43 -28
- experimaestro/scheduler/dynamic_outputs.py +259 -130
- experimaestro/scheduler/experiment.py +582 -0
- experimaestro/scheduler/interfaces.py +474 -0
- experimaestro/scheduler/jobs.py +485 -0
- experimaestro/scheduler/services.py +186 -12
- experimaestro/scheduler/signal_handler.py +32 -0
- experimaestro/scheduler/state.py +1 -1
- experimaestro/scheduler/state_db.py +388 -0
- experimaestro/scheduler/state_provider.py +2345 -0
- experimaestro/scheduler/state_sync.py +834 -0
- experimaestro/scheduler/workspace.py +52 -10
- experimaestro/scriptbuilder.py +7 -0
- experimaestro/server/__init__.py +153 -32
- experimaestro/server/data/index.css +0 -125
- experimaestro/server/data/index.css.map +1 -1
- experimaestro/server/data/index.js +194 -58
- experimaestro/server/data/index.js.map +1 -1
- experimaestro/settings.py +47 -6
- experimaestro/sphinx/__init__.py +3 -3
- experimaestro/taskglobals.py +20 -0
- experimaestro/tests/conftest.py +80 -0
- experimaestro/tests/core/test_generics.py +2 -2
- experimaestro/tests/identifier_stability.json +45 -0
- experimaestro/tests/launchers/bin/sacct +6 -2
- experimaestro/tests/launchers/bin/sbatch +4 -2
- experimaestro/tests/launchers/common.py +2 -2
- experimaestro/tests/launchers/test_slurm.py +80 -0
- experimaestro/tests/restart.py +1 -1
- experimaestro/tests/tasks/all.py +7 -0
- experimaestro/tests/tasks/test_dynamic.py +231 -0
- experimaestro/tests/test_checkers.py +2 -2
- experimaestro/tests/test_cli_jobs.py +615 -0
- experimaestro/tests/test_dependencies.py +11 -17
- experimaestro/tests/test_deprecated.py +630 -0
- experimaestro/tests/test_environment.py +200 -0
- experimaestro/tests/test_experiment.py +3 -3
- experimaestro/tests/test_file_progress.py +425 -0
- experimaestro/tests/test_file_progress_integration.py +477 -0
- experimaestro/tests/test_forward.py +3 -3
- experimaestro/tests/test_generators.py +93 -0
- experimaestro/tests/test_identifier.py +520 -169
- experimaestro/tests/test_identifier_stability.py +458 -0
- experimaestro/tests/test_instance.py +16 -21
- experimaestro/tests/test_multitoken.py +442 -0
- experimaestro/tests/test_mypy.py +433 -0
- experimaestro/tests/test_objects.py +314 -30
- experimaestro/tests/test_outputs.py +8 -8
- experimaestro/tests/test_param.py +22 -26
- experimaestro/tests/test_partial_paths.py +231 -0
- experimaestro/tests/test_progress.py +2 -50
- experimaestro/tests/test_resumable_task.py +480 -0
- experimaestro/tests/test_serializers.py +141 -60
- experimaestro/tests/test_state_db.py +434 -0
- experimaestro/tests/test_subparameters.py +160 -0
- experimaestro/tests/test_tags.py +151 -15
- experimaestro/tests/test_tasks.py +137 -160
- experimaestro/tests/test_token_locking.py +252 -0
- experimaestro/tests/test_tokens.py +25 -19
- experimaestro/tests/test_types.py +133 -11
- experimaestro/tests/test_validation.py +19 -19
- experimaestro/tests/test_workspace_triggers.py +158 -0
- experimaestro/tests/token_reschedule.py +5 -3
- experimaestro/tests/utils.py +2 -2
- experimaestro/tokens.py +154 -57
- experimaestro/tools/diff.py +8 -1
- experimaestro/tui/__init__.py +8 -0
- experimaestro/tui/app.py +2303 -0
- experimaestro/tui/app.tcss +353 -0
- experimaestro/tui/log_viewer.py +228 -0
- experimaestro/typingutils.py +11 -2
- experimaestro/utils/__init__.py +23 -0
- experimaestro/utils/environment.py +148 -0
- experimaestro/utils/git.py +129 -0
- experimaestro/utils/resources.py +1 -1
- experimaestro/version.py +34 -0
- {experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info}/METADATA +70 -39
- experimaestro-2.0.0b4.dist-info/RECORD +181 -0
- {experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info}/WHEEL +1 -1
- experimaestro-2.0.0b4.dist-info/entry_points.txt +16 -0
- experimaestro/compat.py +0 -6
- experimaestro/core/objects.pyi +0 -225
- experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
- experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
- experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
- experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
- experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
- experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
- experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
- experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
- experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
- experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
- experimaestro-1.11.1.dist-info/RECORD +0 -158
- experimaestro-1.11.1.dist-info/entry_points.txt +0 -17
- {experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info/licenses}/LICENSE +0 -0
|
@@ -0,0 +1,485 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from collections import ChainMap
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from functools import cached_property
|
|
5
|
+
import itertools
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import TYPE_CHECKING, Iterator, List, Optional, Set
|
|
8
|
+
|
|
9
|
+
import concurrent
|
|
10
|
+
|
|
11
|
+
from experimaestro.core.objects import Config, ConfigWalkContext, WatchedOutput
|
|
12
|
+
from experimaestro.notifications import LevelInformation, Reporter
|
|
13
|
+
|
|
14
|
+
# from experimaestro.scheduler.base import Scheduler
|
|
15
|
+
from experimaestro.scheduler.dependencies import Dependency, Resource
|
|
16
|
+
from experimaestro.scheduler.workspace import RunMode, Workspace
|
|
17
|
+
from experimaestro.scheduler.interfaces import (
|
|
18
|
+
BaseJob,
|
|
19
|
+
JobState,
|
|
20
|
+
JobStateUnscheduled,
|
|
21
|
+
JobStateWaiting,
|
|
22
|
+
JobStateReady,
|
|
23
|
+
JobStateScheduled,
|
|
24
|
+
JobStateRunning,
|
|
25
|
+
JobStateDone,
|
|
26
|
+
JobStateError,
|
|
27
|
+
JobFailureStatus,
|
|
28
|
+
)
|
|
29
|
+
from experimaestro.locking import Lock
|
|
30
|
+
from experimaestro.utils import logger
|
|
31
|
+
|
|
32
|
+
if TYPE_CHECKING:
|
|
33
|
+
from experimaestro.connectors import Process
|
|
34
|
+
from experimaestro.launchers import Launcher
|
|
35
|
+
from experimaestro.scheduler.experiment import experiment
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# Re-export JobState for backward compatibility
# The job state classes listed here are imported above from
# `experimaestro.scheduler.interfaces`; `Job` is defined in this module.
__all__ = [
    "JobState",
    "JobStateUnscheduled",
    "JobStateWaiting",
    "JobStateReady",
    "JobStateScheduled",
    "JobStateRunning",
    "JobStateDone",
    "JobStateError",
    "JobFailureStatus",
    "Job",
]
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class JobLock(Lock):
    """Lock attached to a job that is used as a dependency.

    Acquisition succeeds only when the wrapped job is in the `DONE` state;
    releasing always reports `False`.
    """

    def __init__(self, job):
        """Create a lock over `job`

        :param job: The job whose state is inspected when acquiring
        """
        super().__init__()
        self.job = job

    def _acquire(self):
        """Returns whether the job state equals `JobState.DONE`"""
        job_is_done = self.job.state == JobState.DONE
        return job_is_done

    def _release(self):
        """Nothing is held by this lock: always returns `False`"""
        return False
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class JobDependency(Dependency):
    """Dependency whose origin is a job that must complete successfully"""

    def __init__(self, job):
        """Create a dependency on `job` (forwarded to `Dependency`)"""
        super().__init__(job)

    async def aio_lock(self, timeout: float = 0):
        """Wait for the origin job to complete, then lock it

        Args:
            timeout: Must be 0 (wait indefinitely) for job dependencies

        Returns:
            An acquired `JobLock` over the origin job

        Raises:
            ValueError: If timeout is not 0
            RuntimeError: If the origin job has no future (not submitted), or
                if its state is not `DONE` once its future has completed
        """
        # Only the "wait forever" mode is supported
        if timeout != 0:
            raise ValueError(
                "Job dependencies only support timeout=0 (wait indefinitely)"
            )

        # A job without future cannot be awaited
        pending = self.origin._future
        if pending is None:
            raise RuntimeError(f"Job {self.origin} has no future - not submitted")

        # Bridge the concurrent future into the running event loop
        await asyncio.wrap_future(pending)

        # Anything but DONE is reported as a failed dependency
        if self.origin.state != JobState.DONE:
            raise RuntimeError(
                f"Dependency job {self.origin.identifier} failed with state "
                f"{self.origin.state} for {self.target.identifier}"
            )

        # Success: hand back an acquired lock
        acquired = JobLock(self.origin)
        acquired.acquire()
        return acquired
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class Job(BaseJob, Resource):
    """A job is a resource that is produced by the execution of some code

    A job wraps a task configuration and keeps track of its scheduling state,
    its dependencies, its retry counters, its progress and the location of
    its files (derived from the workspace jobs path, the task type identifier
    and the configuration identifier).
    """

    # Set by the scheduler. Defaults to None so that code testing
    # `_future is None` (not submitted yet) does not fail on a missing attribute
    _future: Optional["concurrent.futures.Future"] = None

    def __init__(
        self,
        config: Config,
        *,
        workspace: Workspace = None,
        launcher: "Launcher" = None,
        run_mode: RunMode = RunMode.NORMAL,
        max_retries: Optional[int] = None,
    ):
        """Creates a new job

        :param config: The task configuration to run
        :param workspace: The workspace (defaults to `Workspace.CURRENT`)
        :param launcher: The launcher (defaults to the workspace launcher)
        :param run_mode: The run mode; in `RunMode.NORMAL`, a workspace and
            a launcher are required
        :param max_retries: Maximum number of retries; when `None`, the
            workspace setting is used if there is a workspace, and 3 otherwise
        """
        from experimaestro.scheduler.base import Scheduler

        super().__init__()

        self.workspace = workspace or Workspace.CURRENT

        # An explicit launcher always wins; the workspace launcher is only a
        # fallback (and is only looked up when there is a workspace)
        self.launcher = launcher or (
            self.workspace.launcher if self.workspace else None
        )

        if run_mode == RunMode.NORMAL:
            assert self.workspace is not None, "No experiment has been defined"
            assert self.launcher is not None, (
                "No launcher, and no default defined for the workspace %s"
                % self.workspace
            )

        self.type = config.__xpmtype__
        # Short name: last component of the dotted type identifier
        self.name = str(self.type.identifier).rsplit(".", 1)[-1]

        self.scheduler: Optional["Scheduler"] = None
        self.experiments: List["experiment"] = []  # Experiments this job belongs to
        self.config = config
        self.state: JobState = JobState.UNSCHEDULED

        # Dependencies
        self.dependencies: Set[Dependency] = set()  # as target

        # Check if this is a resumable task
        from experimaestro.core.objects import ResumableTask

        self.resumable = isinstance(config, ResumableTask)

        # Retry configuration for resumable tasks
        # Use workspace setting if max_retries is not specified
        if max_retries is None and self.workspace:
            max_retries = self.workspace.workspace_settings.max_retries
        self.max_retries = max_retries if max_retries is not None else 3
        self.retry_count = 0

        # Watched outputs (stored for deferred registration with scheduler)
        self.watched_outputs: List["WatchedOutput"] = list(
            config.__xpm__.watched_outputs
        )

        # Process
        self._process = None

        # Meta-information
        self.starttime: Optional[float] = None
        self.submittime: Optional[float] = None
        self.endtime: Optional[float] = None
        self.exit_code: Optional[int] = None
        self._progress: List[LevelInformation] = []
        self.tags = config.tags()

    def watch_output(self, watched: "WatchedOutput"):
        """Add a watched output to this job.

        :param watched: A description of the watched output
        """
        self.watched_outputs.append(watched)

    def register_watched_outputs(self):
        """Register all watched outputs with the current experiment.

        This should be called after the job is submitted and has a scheduler.
        """
        from experimaestro.scheduler.experiment import experiment

        xp = experiment.current()
        for watched in self.watched_outputs:
            # Set the job reference so the watcher knows where to look
            watched.job = self
            xp.watch_output(watched)

    def done_handler(self):
        """The task has been completed.

        Ensures all remaining task output events are processed by explicitly
        reading the task outputs file. This is necessary because file system
        watchers may have latency, and we need to process all outputs before
        the experiment can exit.
        """
        # Nothing to flush when no output is watched
        if not self.watched_outputs:
            return

        for xp in self.experiments:
            xp.taskOutputsWorker.process_job_outputs(self)

    def __str__(self):
        return "Job[{}]".format(self.identifier)

    def wait(self) -> JobState:
        """Blocks until the job future completes and returns its result

        The job must have been submitted (i.e. have a future).
        """
        assert self._future, "Cannot wait a not submitted job"
        return self._future.result()

    def set_state(self, new_state: JobState):
        """Set the job state and update experiment statistics

        This method should be called instead of direct state assignment
        to ensure experiment statistics (unfinishedJobs, failedJobs) are
        properly updated.

        :param new_state: The new job state
        """
        old_state = self.state
        self.state = new_state

        # Helper to determine if a state should be "counted" in unfinishedJobs
        # A job is counted when it's been submitted and hasn't finished yet
        def is_counted(state):
            return state != JobState.UNSCHEDULED and not state.finished()

        # Update experiment statistics based on state transition
        for xp in self.experiments:
            # Handle transitions in/out of "counted" state
            if is_counted(new_state) and not is_counted(old_state):
                # Job is now being tracked (new submission or resubmit)
                xp.unfinishedJobs += 1
                logger.debug(
                    "Job %s submitted, unfinished jobs for %s: %d",
                    self.identifier[:8],
                    xp.workdir.name,
                    xp.unfinishedJobs,
                )
            elif not is_counted(new_state) and is_counted(old_state):
                # Job is no longer being tracked (finished)
                xp.unfinishedJobs -= 1
                logger.debug(
                    "Job %s finished, unfinished jobs for %s: %d",
                    self.identifier[:8],
                    xp.workdir.name,
                    xp.unfinishedJobs,
                )

            # Handle error state
            if new_state.is_error() and not old_state.is_error():
                xp.failedJobs[self.identifier] = self

            # Handle recovery from error (e.g., resubmit)
            if old_state.is_error() and not new_state.is_error():
                xp.failedJobs.pop(self.identifier, None)

        # Notify listeners via scheduler's thread-safe mechanism
        if self.scheduler:
            self.scheduler.notify_job_state(self)

    @property
    def python_path(self) -> Iterator[str]:
        """Returns an iterator over python path

        A new iterator is built on each access: caching the iterator object
        itself would hand out an already exhausted iterator after the first
        traversal.
        """
        return itertools.chain(self.workspace.python_path)

    @cached_property
    def environ(self):
        """Returns the job environment

        It is made of (by order of priority):

        1. The job environment
        1. The launcher environment
        1. The workspace environment

        """
        # The first (initially empty) mapping is the job-specific layer
        return ChainMap(
            {},
            self.launcher.environ if self.launcher else {},
            self.workspace.env if self.workspace else {},
        )

    @property
    def progress(self):
        """The list of progress information (one entry per level)"""
        return self._progress

    def set_progress(self, level: int, value: float, desc: Optional[str]):
        """Sets the progress for a given level

        Levels deeper than `level` are discarded, and missing levels up to
        `level` are created with a progress of 0.

        :param level: The progress level to update
        :param value: The progress value; values outside [0, 1] are clamped
            (with a warning)
        :param desc: The new description for this level (left unchanged if
            empty or `None`)
        """
        if value < 0:
            logger.warning(f"Progress value out of bounds ({value})")
            value = 0
        elif value > 1:
            logger.warning(f"Progress value out of bounds ({value})")
            value = 1

        # Adjust the length of the array
        self._progress = self._progress[: (level + 1)]
        while len(self._progress) <= level:
            self._progress.append(LevelInformation(len(self._progress), None, 0.0))

        if desc:
            self._progress[-1].desc = desc
        self._progress[-1].progress = value

        # Notify listeners via scheduler's thread-safe mechanism
        # (same guard as in `set_state`: the job might have no scheduler yet)
        if self.scheduler:
            self.scheduler.notify_job_state(self)

    def add_notification_server(self, server):
        """Adds a notification server

        Writes a file named after the server key in the notification folder
        of the job; the file contains the server base URL followed by the job
        identifier.
        """
        key, baseurl = server.getNotificationSpec()
        dirpath = self.path / Reporter.NOTIFICATION_FOLDER
        dirpath.mkdir(exist_ok=True)
        (dirpath / key).write_text(f"{baseurl}/{self.identifier}")

    @property
    def ready(self):
        """True if the job state is `JobState.READY`"""
        return self.state == JobState.READY

    @property
    def jobpath(self) -> Path:
        """Deprecated, use `path`"""
        return self.workspace.jobspath / self.relpath

    @property
    def path(self) -> Path:
        """The job directory (workspace jobs path followed by `relpath`)"""
        return self.workspace.jobspath / self.relpath

    @property
    def experimaestro_path(self) -> Path:
        """The resolved `.experimaestro` folder within the job directory"""
        return (self.path / ".experimaestro").resolve()

    @cached_property
    def task_outputs_path(self) -> Path:
        """Path of the `task-outputs.jsonl` file of this job"""
        return self.experimaestro_path / "task-outputs.jsonl"

    @property
    def relpath(self):
        """Relative job path: task type identifier / full (`all`) identifier"""
        identifier = self.config.__xpm__.identifier
        base = Path(str(self.type.identifier))
        return base / identifier.all.hex()

    @property
    def relmainpath(self):
        """Relative path: task type identifier / `main` identifier"""
        identifier = self.config.__xpm__.identifier
        base = Path(str(self.type.identifier))
        return base / identifier.main.hex()

    @property
    def hashidentifier(self):
        """The identifier object of the configuration"""
        return self.config.__xpm__.identifier

    @property
    def identifier(self):
        """Hexadecimal string of the full (`all`) configuration identifier"""
        return self.config.__xpm__.identifier.all.hex()

    @property
    def task_id(self) -> str:
        """Task class identifier (for BaseJob interface)"""
        return str(self.type.identifier)

    @property
    def locator(self) -> str:
        """Full task locator (for BaseJob interface)"""
        return self.identifier

    def prepare(self, overwrite=False):
        """Prepare all files before starting a task

        :param overwrite: if True, overwrite files even if the task has been run
        """
        pass

    async def aio_run(self) -> "Process":
        """Actually run the code

        Returns:
            A Process instance representing the running job
        """
        raise NotImplementedError(f"Method aio_run not implemented in {self.__class__}")

    async def aio_process(self) -> Optional["Process"]:
        """Returns the process if it exists"""
        raise NotImplementedError("Not implemented")

    @property
    def pidpath(self):
        """This file contains the file PID"""
        return self.jobpath / ("%s.pid" % self.name)

    @property
    def lockpath(self):
        """This file is used as a lock for running the job

        Unlike the other job files, it is located under `relmainpath`.
        """
        return self.workspace.jobspath / self.relmainpath / ("%s.lock" % self.name)

    @property
    def donepath(self) -> Path:
        """When a job has been successful, this file is written"""
        return self.jobpath / ("%s.done" % self.name)

    @property
    def failedpath(self):
        """When a job has been unsuccessful, this file is written with an error
        code inside"""
        return self.jobpath / ("%s.failed" % self.name)

    @property
    def stdout(self) -> Path:
        """Path of the standard output log file"""
        return self.jobpath / ("%s.out" % self.name)

    @property
    def stderr(self) -> Path:
        """Path of the standard error log file"""
        return self.jobpath / ("%s.err" % self.name)

    def rotate_logs(self) -> None:
        """Rotate log files before restarting a task.

        Renames non-empty stdout and stderr files with a timestamp suffix
        (e.g., job.20231215143022.out) to preserve logs from previous runs.
        """
        # One timestamp for both files so that they share the same suffix
        timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

        for log_path in [self.stdout, self.stderr]:
            if log_path.exists() and log_path.stat().st_size > 0:
                # Extract extension (.out or .err)
                ext = log_path.suffix
                # Create new name with timestamp before extension
                new_name = f"{log_path.stem}.{timestamp}{ext}"
                new_path = log_path.parent / new_name
                logger.info("Rotating log file %s -> %s", log_path.name, new_name)
                log_path.rename(new_path)

    @property
    def basepath(self) -> Path:
        """Job directory followed by the job name (no extension)"""
        return self.jobpath / self.name

    def finalState(self) -> "concurrent.futures.Future[JobState]":
        """Returns the future set by the scheduler (the job must have one)"""
        assert self._future is not None
        return self._future
|
|
438
|
+
|
|
439
|
+
|
|
440
|
+
class JobContext(ConfigWalkContext):
    """Configuration walk context bound to a job

    Exposes the job name, path and configuration, and computes the location
    of partial directories within the job workspace.
    """

    def __init__(self, job: Job):
        """Create a context for `job`"""
        super().__init__()
        self.job = job

    @property
    def name(self):
        """Name of the underlying job"""
        return self.job.name

    @property
    def path(self):
        """Path of the underlying job"""
        return self.job.path

    @property
    def task(self):
        """Configuration of the underlying job"""
        return self.job.config

    def partial_path(self, subparameters, config) -> Path:
        """Returns the partial directory path for a given subparameters instance.

        The partial path structure is:
        WORKSPACE/partials/TASK_ID/SUBPARAM_NAME/PARTIAL_ID/

        Args:
            subparameters: The Subparameters instance defining which groups to exclude
            config: The configuration to compute the partial identifier for

        Returns:
            The partial directory path.
        """
        # Identifier restricted by the subparameters, then the task type id
        restricted_id = config.__xpm__.get_partial_identifier(subparameters)
        type_name = str(config.__xpmtype__.identifier)

        # Append the path components one after the other
        location = self.job.workspace.partialspath
        location = location / type_name
        location = location / subparameters.name
        location = location / restricted_id.all.hex()
        return location
|
|
481
|
+
|
|
482
|
+
|
|
483
|
+
class JobError(Exception):
    """Exception reporting the exit code of a job

    The exit code is part of the message and is also available through the
    `code` attribute, so that handlers do not have to parse the message.
    """

    def __init__(self, code):
        """
        :param code: The exit code of the job
        """
        super().__init__(f"Job exited with code {code}")
        # Keep the raw exit code for programmatic access
        self.code = code
|