experimaestro 2.0.0b8__py3-none-any.whl → 2.0.0b17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of experimaestro might be problematic. Click here for more details.
- experimaestro/__init__.py +12 -5
- experimaestro/cli/__init__.py +239 -126
- experimaestro/cli/filter.py +48 -23
- experimaestro/cli/jobs.py +253 -71
- experimaestro/cli/refactor.py +1 -2
- experimaestro/commandline.py +7 -4
- experimaestro/connectors/__init__.py +9 -1
- experimaestro/connectors/local.py +43 -3
- experimaestro/core/arguments.py +18 -18
- experimaestro/core/identifier.py +11 -11
- experimaestro/core/objects/config.py +96 -39
- experimaestro/core/objects/config_walk.py +3 -3
- experimaestro/core/{subparameters.py → partial.py} +16 -16
- experimaestro/core/partial_lock.py +394 -0
- experimaestro/core/types.py +12 -15
- experimaestro/dynamic.py +290 -0
- experimaestro/experiments/__init__.py +6 -2
- experimaestro/experiments/cli.py +217 -50
- experimaestro/experiments/configuration.py +24 -0
- experimaestro/generators.py +5 -5
- experimaestro/ipc.py +118 -1
- experimaestro/launcherfinder/__init__.py +2 -2
- experimaestro/launcherfinder/registry.py +6 -7
- experimaestro/launcherfinder/specs.py +2 -9
- experimaestro/launchers/slurm/__init__.py +2 -2
- experimaestro/launchers/slurm/base.py +62 -0
- experimaestro/locking.py +957 -1
- experimaestro/notifications.py +89 -201
- experimaestro/progress.py +63 -366
- experimaestro/rpyc.py +0 -2
- experimaestro/run.py +29 -2
- experimaestro/scheduler/__init__.py +8 -1
- experimaestro/scheduler/base.py +629 -53
- experimaestro/scheduler/dependencies.py +20 -16
- experimaestro/scheduler/experiment.py +732 -167
- experimaestro/scheduler/interfaces.py +316 -101
- experimaestro/scheduler/jobs.py +58 -20
- experimaestro/scheduler/remote/adaptive_sync.py +265 -0
- experimaestro/scheduler/remote/client.py +171 -117
- experimaestro/scheduler/remote/protocol.py +8 -193
- experimaestro/scheduler/remote/server.py +95 -71
- experimaestro/scheduler/services.py +53 -28
- experimaestro/scheduler/state_provider.py +663 -2430
- experimaestro/scheduler/state_status.py +1247 -0
- experimaestro/scheduler/transient.py +31 -0
- experimaestro/scheduler/workspace.py +1 -1
- experimaestro/scheduler/workspace_state_provider.py +1273 -0
- experimaestro/scriptbuilder.py +4 -4
- experimaestro/settings.py +36 -0
- experimaestro/tests/conftest.py +33 -5
- experimaestro/tests/connectors/bin/executable.py +1 -1
- experimaestro/tests/fixtures/pre_experiment/experiment_check_env.py +16 -0
- experimaestro/tests/fixtures/pre_experiment/experiment_check_mock.py +14 -0
- experimaestro/tests/fixtures/pre_experiment/experiment_simple.py +12 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_env.py +5 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_error.py +3 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_mock.py +8 -0
- experimaestro/tests/launchers/bin/test.py +1 -0
- experimaestro/tests/launchers/test_slurm.py +9 -9
- experimaestro/tests/partial_reschedule.py +46 -0
- experimaestro/tests/restart.py +3 -3
- experimaestro/tests/restart_main.py +1 -0
- experimaestro/tests/scripts/notifyandwait.py +1 -0
- experimaestro/tests/task_partial.py +38 -0
- experimaestro/tests/task_tokens.py +2 -2
- experimaestro/tests/tasks/test_dynamic.py +6 -6
- experimaestro/tests/test_dependencies.py +3 -3
- experimaestro/tests/test_deprecated.py +15 -15
- experimaestro/tests/test_dynamic_locking.py +317 -0
- experimaestro/tests/test_environment.py +24 -14
- experimaestro/tests/test_experiment.py +171 -36
- experimaestro/tests/test_identifier.py +25 -25
- experimaestro/tests/test_identifier_stability.py +3 -5
- experimaestro/tests/test_multitoken.py +2 -4
- experimaestro/tests/{test_subparameters.py → test_partial.py} +25 -25
- experimaestro/tests/test_partial_paths.py +81 -138
- experimaestro/tests/test_pre_experiment.py +219 -0
- experimaestro/tests/test_progress.py +2 -8
- experimaestro/tests/test_remote_state.py +560 -99
- experimaestro/tests/test_stray_jobs.py +261 -0
- experimaestro/tests/test_tasks.py +1 -2
- experimaestro/tests/test_token_locking.py +52 -67
- experimaestro/tests/test_tokens.py +5 -6
- experimaestro/tests/test_transient.py +225 -0
- experimaestro/tests/test_workspace_state_provider.py +768 -0
- experimaestro/tests/token_reschedule.py +1 -3
- experimaestro/tests/utils.py +2 -7
- experimaestro/tokens.py +227 -372
- experimaestro/tools/diff.py +1 -0
- experimaestro/tools/documentation.py +4 -5
- experimaestro/tools/jobs.py +1 -2
- experimaestro/tui/app.py +438 -1966
- experimaestro/tui/app.tcss +162 -0
- experimaestro/tui/dialogs.py +172 -0
- experimaestro/tui/log_viewer.py +253 -3
- experimaestro/tui/messages.py +137 -0
- experimaestro/tui/utils.py +54 -0
- experimaestro/tui/widgets/__init__.py +23 -0
- experimaestro/tui/widgets/experiments.py +468 -0
- experimaestro/tui/widgets/global_services.py +238 -0
- experimaestro/tui/widgets/jobs.py +972 -0
- experimaestro/tui/widgets/log.py +156 -0
- experimaestro/tui/widgets/orphans.py +363 -0
- experimaestro/tui/widgets/runs.py +185 -0
- experimaestro/tui/widgets/services.py +314 -0
- experimaestro/tui/widgets/stray_jobs.py +528 -0
- experimaestro/utils/__init__.py +1 -1
- experimaestro/utils/environment.py +105 -22
- experimaestro/utils/fswatcher.py +124 -0
- experimaestro/utils/jobs.py +1 -2
- experimaestro/utils/jupyter.py +1 -2
- experimaestro/utils/logging.py +72 -0
- experimaestro/version.py +2 -2
- experimaestro/webui/__init__.py +9 -0
- experimaestro/webui/app.py +117 -0
- experimaestro/{server → webui}/data/index.css +66 -11
- experimaestro/webui/data/index.css.map +1 -0
- experimaestro/{server → webui}/data/index.js +82763 -87217
- experimaestro/webui/data/index.js.map +1 -0
- experimaestro/webui/routes/__init__.py +5 -0
- experimaestro/webui/routes/auth.py +53 -0
- experimaestro/webui/routes/proxy.py +117 -0
- experimaestro/webui/server.py +200 -0
- experimaestro/webui/state_bridge.py +152 -0
- experimaestro/webui/websocket.py +413 -0
- {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/METADATA +5 -6
- experimaestro-2.0.0b17.dist-info/RECORD +219 -0
- experimaestro/cli/progress.py +0 -269
- experimaestro/scheduler/state.py +0 -75
- experimaestro/scheduler/state_db.py +0 -437
- experimaestro/scheduler/state_sync.py +0 -891
- experimaestro/server/__init__.py +0 -467
- experimaestro/server/data/index.css.map +0 -1
- experimaestro/server/data/index.js.map +0 -1
- experimaestro/tests/test_cli_jobs.py +0 -615
- experimaestro/tests/test_file_progress.py +0 -425
- experimaestro/tests/test_file_progress_integration.py +0 -477
- experimaestro/tests/test_state_db.py +0 -434
- experimaestro-2.0.0b8.dist-info/RECORD +0 -187
- /experimaestro/{server → webui}/data/1815e00441357e01619e.ttf +0 -0
- /experimaestro/{server → webui}/data/2463b90d9a316e4e5294.woff2 +0 -0
- /experimaestro/{server → webui}/data/2582b0e4bcf85eceead0.ttf +0 -0
- /experimaestro/{server → webui}/data/89999bdf5d835c012025.woff2 +0 -0
- /experimaestro/{server → webui}/data/914997e1bdfc990d0897.ttf +0 -0
- /experimaestro/{server → webui}/data/c210719e60948b211a12.woff2 +0 -0
- /experimaestro/{server → webui}/data/favicon.ico +0 -0
- /experimaestro/{server → webui}/data/index.html +0 -0
- /experimaestro/{server → webui}/data/login.html +0 -0
- /experimaestro/{server → webui}/data/manifest.json +0 -0
- {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/WHEEL +0 -0
- {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/entry_points.txt +0 -0
- {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/licenses/LICENSE +0 -0
experimaestro/locking.py
CHANGED
|
@@ -1,5 +1,44 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
from abc import ABC, abstractmethod
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
import os.path
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
import threading
|
|
10
|
+
import time
|
|
11
|
+
from typing import TYPE_CHECKING, Callable, Optional
|
|
12
|
+
import weakref
|
|
13
|
+
|
|
14
|
+
import fasteners
|
|
15
|
+
from watchdog.events import FileSystemEventHandler
|
|
16
|
+
|
|
1
17
|
from experimaestro.utils.asyncio import asyncThreadcheck
|
|
2
|
-
from .
|
|
18
|
+
from experimaestro.dynamic import DynamicResource
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger("xpm.locking")
|
|
21
|
+
|
|
22
|
+
if TYPE_CHECKING:
|
|
23
|
+
from experimaestro.scheduler.jobs import Job
|
|
24
|
+
from experimaestro.connectors import Process
|
|
25
|
+
from experimaestro.dynamic import DynamicDependency
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def get_job_lock_relpath(task_id: str, identifier: str) -> Path:
    """Get the lock relative path for a job.

    The file name is ``"{task_id}@{identifier}.json"``. The stem (everything
    before ``.json``) is limited to 256 characters to avoid filesystem issues;
    the ``.json`` suffix is added on top of that limit.

    When the stem is too long, the task id is shortened and the identifier is
    kept whole, so that jobs with different identifiers never end up sharing
    the same lock file. Only when the identifier alone does not fit is the
    whole stem cut at 256 characters.

    Args:
        task_id: The task identifier
        identifier: The job identifier (hash)

    Returns:
        Relative path in format "{task_id}@{identifier}.json"
    """
    max_stem_length = 256
    stem = f"{task_id}@{identifier}"

    if len(stem) > max_stem_length:
        # Room left for the task id once "@" and the identifier are reserved
        task_id_budget = max_stem_length - len(identifier) - 1
        if task_id_budget >= 1:
            stem = f"{task_id[:task_id_budget]}@{identifier}"
        else:
            # The identifier itself is too long: plain truncation is all we
            # can do
            stem = stem[:max_stem_length]

    return Path(stem + ".json")
|
|
3
42
|
|
|
4
43
|
|
|
5
44
|
class Lock:
|
|
@@ -66,3 +105,920 @@ class Locks(Lock):
|
|
|
66
105
|
for lock in self.locks:
|
|
67
106
|
logger.debug("[locks] Releasing %s", lock)
|
|
68
107
|
lock.release()
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class DynamicDependencyLock(Lock, ABC):
    """Lock produced by a dynamic dependency, with job lifecycle hooks.

    On top of the plain lock behavior, the scheduler calls the ``aio_job_*``
    hooks around the life of the job. This lets a lock:

    - Persist state for recovery (e.g., write lock info to disk)
    - Clean up resources race-safely when the job finishes
    - Serialize lock info to pass to the job process

    File structure (standardized):

    - {lock_folder}/informations.json: Resource-level info (e.g., token counts)
    - {lock_folder}/ipc.lock: IPC lock for inter-process coordination
    - {lock_folder}/jobs/{task_specific_path}.json: Per-job lock file

    Subclasses must implement:

    - lock_folder: Path to the lock folder

    The dict returned by to_json() must carry the 'module' and 'class' keys,
    which are used to re-create the lock dynamically in the job process.
    """

    dependency: "DynamicDependency"

    def __init__(self, dependency: DynamicDependency):
        super().__init__()
        self.dependency = dependency

    @property
    @abstractmethod
    def lock_folder(self) -> Path:
        """Folder holding the lock files (provided by subclasses)."""

    @property
    def ipc_lock_path(self) -> Path:
        """Location of the inter-process lock file inside the lock folder."""
        return self.lock_folder / "ipc.lock"

    @property
    def lock_file_path(self) -> Path:
        """Location of the per-job lock file for the dependency's target job."""
        target_job = self.dependency.target
        relative = get_job_lock_relpath(target_job.task_id, target_job.identifier)
        jobs_dir = self.lock_folder / "jobs"
        return jobs_dir / relative

    async def aio_job_before_start(self, job: Job) -> None:
        """Hook invoked before the job is started.

        Runs AFTER the job directory has been created and BEFORE the job
        process is spawned; meant for setting up resources the job needs.
        The default implementation does nothing.

        :param job: The job about to start
        """

    async def aio_job_started(self, job: Job, process: Process) -> None:
        """Hook invoked once the job has started successfully.

        Runs AFTER the process has been spawned and BEFORE the scheduler
        releases the connector lock; meant for persisting lock state for
        recovery purposes. The default implementation does nothing.

        :param job: The job that started
        :param process: The process running the job
        """

    async def aio_job_finished(self, job: Job) -> None:
        """Hook invoked when the job has finished (success or failure).

        Runs BEFORE the lock is released; meant for pre-release cleanup that
        needs to know the job state. The default implementation does nothing.

        :param job: The job that finished
        """

    def to_json(self) -> dict:
        """Serialize the lock so it can be handed over to the job process.

        The base implementation only records where the lock class lives
        ('module' and 'class' keys, used for the dynamic import). Subclasses
        should call super().to_json() and add their own data to the result.

        :return: JSON-serializable dict with lock information
        """
        lock_type = self.__class__
        return {"module": lock_type.__module__, "class": lock_type.__name__}

    @classmethod
    def from_json(cls, data: dict) -> JobDependencyLock:
        """Rebuild, inside the job process, the lock described by ``data``.

        The result is the JobDependencyLock variant: a lock that is already
        held and only needs to be released on exit. The base class provides
        no implementation.

        :param data: Dict from to_json()
        :return: JobDependencyLock instance
        :raises NotImplementedError: always, unless overridden by a subclass
        """
        raise NotImplementedError(f"from_json not implemented for {cls.__name__}")
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
class DynamicDependencyLocks(Lock):
    """Group of DynamicDependencyLock instances handled as a single lock.

    Acquire/release, the job lifecycle notifications and the serialization
    are all applied to every member, in insertion order.
    """

    def __init__(self):
        super().__init__()
        self.locks: list[DynamicDependencyLock] = []

    def append(self, lock: DynamicDependencyLock) -> None:
        """Register one more lock in the group."""
        self.locks.append(lock)

    def clear(self) -> None:
        """Forget every registered lock (none of them is released)."""
        self.locks.clear()

    def _acquire(self) -> None:
        """Acquire each member lock in turn."""
        for member in self.locks:
            member.acquire()

    def _release(self) -> None:
        """Release each member lock in turn."""
        logger.debug("Releasing %d dynamic dependency locks", len(self.locks))
        for member in self.locks:
            logger.debug("[locks] Releasing %s", member)
            member.release()

    async def _notify(self, hook_name: str, *hook_args) -> None:
        """Await the named lifecycle hook on each member, one after another."""
        for member in self.locks:
            await getattr(member, hook_name)(*hook_args)

    async def aio_job_before_start(self, job: Job) -> None:
        """Tell every member lock that the job is about to start."""
        await self._notify("aio_job_before_start", job)

    async def aio_job_started(self, job: Job, process: Process) -> None:
        """Tell every member lock that the job has started."""
        await self._notify("aio_job_started", job, process)

    async def aio_job_finished(self, job: Job) -> None:
        """Tell every member lock that the job has finished."""
        await self._notify("aio_job_finished", job)

    def to_json(self) -> list[dict]:
        """Serialize every member lock, for the job process."""
        serialized = []
        for member in self.locks:
            serialized.append(member.to_json())
        return serialized
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
class JobDependencyLock:
    """Lock held by job process.

    This is the job-process-side counterpart of DynamicDependencyLock.
    Created via from_json(), then acquire() is called when entering context.

    The scheduler creates the lock file before starting the job. The job process
    verifies the lock file exists on acquire() and deletes it on release().

    Subclasses should set lock_file_path in __init__ from JSON data.
    """

    #: Path to the lock file to delete on release (set from JSON data)
    lock_file_path: Optional[Path] = None

    def verify_lock_file(self) -> None:
        """Verify the lock file exists.

        If lock_file_path is None, this is a no-op.

        Raises:
            LockError: If lock file is missing
        """
        if self.lock_file_path is not None and not self.lock_file_path.is_file():
            raise LockError(f"Lock file missing: {self.lock_file_path}")

    def acquire(self) -> None:
        """Acquire the lock. Called when entering context.

        Verifies that the scheduler created the lock file.
        """
        self.verify_lock_file()

    def release(self) -> None:
        """Release the lock and delete the lock file.

        Called when exiting context. Does nothing when lock_file_path is None
        or when the lock file is already gone, including the case where it
        disappears between the existence check and the deletion.
        """
        path = self.lock_file_path
        if path is None or not path.is_file():
            return

        logger.debug("Deleting lock file: %s", path)
        try:
            path.unlink()
        except FileNotFoundError:
            # Removed by someone else after the is_file() check: the lock is
            # released either way, so this is not an error
            logger.debug("Lock file already deleted: %s", path)

    def __enter__(self):
        self.acquire()
        return self

    def __exit__(self, *args):
        self.release()
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
class _JobDependencyLocksContext:
|
|
318
|
+
"""Context manager for acquiring/releasing job dependency locks."""
|
|
319
|
+
|
|
320
|
+
def __init__(self, locks: list[JobDependencyLock]):
|
|
321
|
+
self._locks = locks
|
|
322
|
+
self._acquired: list[JobDependencyLock] = []
|
|
323
|
+
|
|
324
|
+
def __enter__(self):
|
|
325
|
+
for lock in self._locks:
|
|
326
|
+
lock.acquire()
|
|
327
|
+
self._acquired.append(lock)
|
|
328
|
+
return self
|
|
329
|
+
|
|
330
|
+
def __exit__(self, *args):
|
|
331
|
+
for lock in reversed(self._acquired):
|
|
332
|
+
try:
|
|
333
|
+
lock.release()
|
|
334
|
+
except Exception:
|
|
335
|
+
logger.exception("Error releasing lock %s", lock)
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
class JobDependencyLocks:
    """Job-process-side collection of JobDependencyLock instances.

    Call dependency_locks() to obtain the context manager that acquires the
    locks on enter and releases them on exit.
    """

    def __init__(self):
        self.locks: list[JobDependencyLock] = []

    def dependency_locks(self) -> _JobDependencyLocksContext:
        """Build the acquire-on-enter / release-on-exit context manager."""
        return _JobDependencyLocksContext(self.locks)

    @classmethod
    def from_json(cls, locks_data: list[dict]) -> JobDependencyLocks:
        """Build the collection from serialized lock entries.

        Every entry must provide the 'module' and 'class' keys naming the
        DynamicDependencyLock subclass whose from_json() rebuilds the lock.
        Entries without these keys, or whose class cannot be loaded, are
        skipped with a warning.
        """
        import importlib

        container = cls()
        for entry in locks_data:
            module_name, class_name = entry.get("module"), entry.get("class")

            if module_name is not None and class_name is not None:
                try:
                    factory = getattr(importlib.import_module(module_name), class_name)
                    container.locks.append(factory.from_json(entry))
                except (ImportError, AttributeError) as error:
                    logger.warning(
                        "Failed to load lock class %s.%s: %s",
                        module_name,
                        class_name,
                        error,
                    )
            else:
                logger.warning("Lock data missing 'module' or 'class': %s", entry)

        return container
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
# --- Generalized dynamic lock file and resource tracking ---
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
class DynamicLockFile(ABC):
    """Base class for files that track who holds a dynamic lock.

    Each lock file stores JSON with:
    - job_uri: Reference to the job holding the lock
    - information: Type-specific data

    Subclasses override from_information() and to_information() to
    handle type-specific data in the "information" field.
    """

    path: Path
    job_uri: Optional[str]

    def __init__(self, path: Path):
        """Load lock file from disk.

        A missing file is not an error: the instance is left with
        ``job_uri`` set to None. Any other failure (e.g. a file that is
        still being written and is not valid JSON yet) is retried a few
        times before the last error is re-raised.

        Args:
            path: Path to the lock file
        """
        self.path = path
        self.job_uri = None

        last_error = None
        for _ in range(5):
            try:
                with path.open("rt") as fp:
                    data = json.load(fp)
                    self.job_uri = data.get("job_uri")
                    self.from_information(data.get("information"))
                return  # Success
            except FileNotFoundError:
                # File was deleted between check and read
                return
            except Exception as e:
                last_error = e
                logger.exception("Error while reading %s", self.path)
                time.sleep(0.1)

        # Exhausted retries - re-raise the last error
        if last_error is not None:
            raise last_error

    @classmethod
    def create(cls, path: Path, job_uri: str, information=None) -> "DynamicLockFile":
        """Create a new lock file on disk.

        The instance is built without calling ``__init__`` (which would try
        to read the file that does not exist yet).

        Args:
            path: Path where to create the file
            job_uri: URI of the job holding the lock
            information: Type-specific data for the lock file

        Returns:
            New lock file instance
        """
        self = object.__new__(cls)
        self.path = path
        self.job_uri = job_uri
        self.from_information(information)

        logger.debug("Writing lock file %s", path)
        data = {"job_uri": job_uri, "information": self.to_information()}
        with path.open("wt") as fp:
            json.dump(data, fp)
        return self

    def delete(self) -> None:
        """Delete the lock file from disk.

        Does nothing if the file is already gone, including when it
        disappears between the existence check and the deletion.
        """
        if not self.path.is_file():
            return

        logger.debug("Deleting lock file %s", self.path)
        try:
            self.path.unlink()
        except FileNotFoundError:
            # Deleted concurrently: the end state is the one we wanted.
            # Raising here would abort the watcher thread before it calls
            # its on_released callback.
            logger.debug("Lock file %s was already deleted", self.path)

    def watch(self, on_released: Optional[Callable[[], None]] = None) -> None:
        """Watch the job process and call callback when it finishes.

        This starts a background thread that:
        1. Waits for the job lock to be available (job started)
        2. Waits for the process to finish
        3. Deletes the lock file
        4. Calls the callback (if provided)

        Nothing is started when ``job_uri`` is None.

        Args:
            on_released: Optional callback to invoke when lock is released
        """
        if self.job_uri is None:
            return

        logger.debug("Watching process for %s (%s)", self.path, self.job_uri)
        job_path = Path(self.job_uri)
        lockpath = job_path.with_suffix(".lock")
        pidpath = job_path.with_suffix(".pid")

        def run():
            logger.debug("Locking job lock path %s", lockpath)
            process = None

            # Acquire the job lock - blocks if scheduler is still starting the job
            # Once we get the lock, the job has either started or finished
            with fasteners.InterProcessLock(lockpath):
                definition = ""
                if pidpath.is_file():
                    try:
                        definition = pidpath.read_text()
                        while definition == "":
                            # PID file exists but is still empty: pause
                            # briefly instead of spinning on the disk
                            time.sleep(0.01)
                            definition = pidpath.read_text()
                    except FileNotFoundError:
                        # PID file removed while we were reading it: handled
                        # like a PID file that was never there
                        definition = ""

                if definition == "":
                    logger.debug("Job already finished (no PID file %s)", pidpath)
                else:
                    logger.info("Loading job watcher from definition")
                    from experimaestro.connectors import Process
                    from experimaestro.connectors.local import LocalConnector

                    connector = LocalConnector.instance()
                    process = Process.fromDefinition(connector, json.loads(definition))

            # Wait out of the lock
            if process is not None:
                process.wait()

            self.delete()
            if on_released is not None:
                on_released()

        threading.Thread(target=run).start()

    def from_information(self, info) -> None:
        """Set type-specific data from the "information" field.

        Override in subclasses to handle extra data.

        Args:
            info: The "information" value from the JSON file
        """
        pass

    def to_information(self):
        """Get type-specific data for the "information" field.

        Override in subclasses to include extra data.

        Returns:
            Value to store in the "information" field (JSON-serializable)
        """
        return None
|
|
535
|
+
|
|
536
|
+
|
|
537
|
+
class _TrackedResourceProxy(FileSystemEventHandler):
    """File system event handler that relays events through a weak reference.

    Only a weak reference to the resource is stored, so registering this
    proxy with a watcher does not keep the resource alive. Once the resource
    has been garbage collected, events are silently dropped.
    """

    def __init__(self, resource: "TrackedDynamicResource"):
        self._resource_ref = weakref.ref(resource)

    def _forward(self, handler_name: str, event):
        """Call the resource's handler of that name, if the resource is alive."""
        target = self._resource_ref()
        if target is None:
            return None
        return getattr(target, handler_name)(event)

    def on_modified(self, event):
        return self._forward("on_modified", event)

    def on_deleted(self, event):
        return self._forward("on_deleted", event)

    def on_created(self, event):
        return self._forward("on_created", event)
|
|
560
|
+
|
|
561
|
+
|
|
562
|
+
class TrackedDynamicResource(DynamicResource, ABC):
|
|
563
|
+
"""Base class for resources with file-based lock tracking.
|
|
564
|
+
|
|
565
|
+
Inherits from DynamicResource to provide async_wait() via ResourcePoller.
|
|
566
|
+
|
|
567
|
+
This provides:
|
|
568
|
+
- File system watching for lock files
|
|
569
|
+
- IPC and thread locking
|
|
570
|
+
- Condition variable for waiting on availability
|
|
571
|
+
- Cache of lock files
|
|
572
|
+
- Async waiting via ResourcePoller
|
|
573
|
+
|
|
574
|
+
File structure:
|
|
575
|
+
- {lock_folder}/informations.json: Resource-level info (e.g., token counts)
|
|
576
|
+
- {lock_folder}/ipc.lock: IPC lock for inter-process coordination
|
|
577
|
+
- {lock_folder}/jobs/{task_specific_path}.json: Per-job lock files
|
|
578
|
+
|
|
579
|
+
Subclasses must implement:
|
|
580
|
+
- lock_folder: Path to the lock folder (abstract property)
|
|
581
|
+
- lock_file_class: The DynamicLockFile subclass to use
|
|
582
|
+
- is_available(): Check if resource is available for a dependency
|
|
583
|
+
- _do_acquire(): Perform acquire logic
|
|
584
|
+
- _do_release(): Perform release logic
|
|
585
|
+
"""
|
|
586
|
+
|
|
587
|
+
#: Subclass of DynamicLockFile to use for lock files
|
|
588
|
+
lock_file_class: type[DynamicLockFile]
|
|
589
|
+
|
|
590
|
+
@property
|
|
591
|
+
@abstractmethod
|
|
592
|
+
def lock_folder(self) -> Path:
|
|
593
|
+
"""Path to the lock folder. Must be implemented by subclasses."""
|
|
594
|
+
...
|
|
595
|
+
|
|
596
|
+
@property
|
|
597
|
+
def informations_path(self) -> Path:
|
|
598
|
+
"""Path to the informations.json file."""
|
|
599
|
+
return self.lock_folder / "informations.json"
|
|
600
|
+
|
|
601
|
+
@property
|
|
602
|
+
def ipc_lock_path(self) -> Path:
|
|
603
|
+
"""Path to the IPC lock file."""
|
|
604
|
+
return self.lock_folder / "ipc.lock"
|
|
605
|
+
|
|
606
|
+
@property
|
|
607
|
+
def jobs_folder(self) -> Path:
|
|
608
|
+
"""Path to the jobs folder containing per-job lock files."""
|
|
609
|
+
return self.lock_folder / "jobs"
|
|
610
|
+
|
|
611
|
+
def __init__(self, name: str):
|
|
612
|
+
"""Initialize the resource.
|
|
613
|
+
|
|
614
|
+
Args:
|
|
615
|
+
name: Human-readable name for the resource
|
|
616
|
+
"""
|
|
617
|
+
self.name = name
|
|
618
|
+
self.lock_folder.mkdir(exist_ok=True, parents=True)
|
|
619
|
+
|
|
620
|
+
self.cache: dict[str, DynamicLockFile] = {}
|
|
621
|
+
|
|
622
|
+
self.ipc_lock = fasteners.InterProcessLock(self.ipc_lock_path)
|
|
623
|
+
self.lock = threading.Lock()
|
|
624
|
+
self.available_condition = threading.Condition(self.lock)
|
|
625
|
+
|
|
626
|
+
self.timestamp = os.path.getmtime(self.lock_folder)
|
|
627
|
+
|
|
628
|
+
# Initial state update
|
|
629
|
+
with self.lock, self.ipc_lock:
|
|
630
|
+
self._update()
|
|
631
|
+
|
|
632
|
+
# Set up file system watching
|
|
633
|
+
from .ipc import ipcom
|
|
634
|
+
|
|
635
|
+
self.watchedpath = str(self.lock_folder.absolute())
|
|
636
|
+
self.proxy = _TrackedResourceProxy(self)
|
|
637
|
+
self.watcher = ipcom().fswatch(self.proxy, self.lock_folder, recursive=True)
|
|
638
|
+
logger.debug("Watching %s", self.watchedpath)
|
|
639
|
+
|
|
640
|
+
def __del__(self):
|
|
641
|
+
if self.watcher is not None:
|
|
642
|
+
logging.debug("Removing watcher on %s", self.watchedpath)
|
|
643
|
+
from .ipc import ipcom
|
|
644
|
+
|
|
645
|
+
ipcom().fsunwatch(self.watcher)
|
|
646
|
+
self.watcher = None
|
|
647
|
+
|
|
648
|
+
def refresh_state(self) -> None:
|
|
649
|
+
"""Refresh state from disk.
|
|
650
|
+
|
|
651
|
+
This is a fallback for when file system notifications are missed.
|
|
652
|
+
Called by ResourcePoller periodically.
|
|
653
|
+
"""
|
|
654
|
+
with self.lock, self.ipc_lock:
|
|
655
|
+
self._update()
|
|
656
|
+
self.available_condition.notify_all()
|
|
657
|
+
|
|
658
|
+
async def async_wait(self, timeout: float = 0) -> bool:
|
|
659
|
+
"""Wait asynchronously until the resource state may have changed.
|
|
660
|
+
|
|
661
|
+
Uses ResourcePoller for efficient polling across all resources.
|
|
662
|
+
|
|
663
|
+
Args:
|
|
664
|
+
timeout: Maximum time to wait in seconds (0 = wait indefinitely)
|
|
665
|
+
|
|
666
|
+
Returns:
|
|
667
|
+
True if notified of a change, False if timed out
|
|
668
|
+
"""
|
|
669
|
+
from experimaestro.dynamic import ResourcePoller
|
|
670
|
+
|
|
671
|
+
loop = asyncio.get_running_loop()
|
|
672
|
+
poller = ResourcePoller.instance()
|
|
673
|
+
|
|
674
|
+
event = poller.register(self, loop, timeout)
|
|
675
|
+
|
|
676
|
+
try:
|
|
677
|
+
if timeout > 0:
|
|
678
|
+
try:
|
|
679
|
+
await asyncio.wait_for(event.wait(), timeout=timeout)
|
|
680
|
+
return True
|
|
681
|
+
except asyncio.TimeoutError:
|
|
682
|
+
return False
|
|
683
|
+
else:
|
|
684
|
+
await event.wait()
|
|
685
|
+
return True
|
|
686
|
+
finally:
|
|
687
|
+
# Event cleanup is handled by poller
|
|
688
|
+
pass
|
|
689
|
+
|
|
690
|
+
def _lock_file_key(self, path: Path) -> str:
    """Return the cache key of a lock file.

    The key is the path relative to ``jobs_folder``
    (e.g., "task_id@identifier.json").
    """
    relative = path.relative_to(self.jobs_folder)
    return str(relative)
|
|
696
|
+
|
|
697
|
+
def _update(self) -> None:
    """Rebuild the cache and the resource state from the lock files on disk.

    Lock file objects already present in the previous cache are reused;
    new ones are built with ``lock_file_class`` and registered through
    ``lf.watch`` with a callback to ``_on_lock_released``. A lock file
    that disappears between the directory listing and its read is
    skipped, as the ``on_created`` / ``on_modified`` handlers do.

    Assumes IPC lock is held.
    """
    logging.debug("Full resource state update for %s", self.name)
    old_cache = self.cache
    self.cache = {}

    self._reset_state()

    if self.jobs_folder.exists():
        for path in self.jobs_folder.glob("*.json"):
            key = self._lock_file_key(path)
            lf = old_cache.get(key)
            if lf is None:
                try:
                    lf = self.lock_file_class(path)
                except FileNotFoundError:
                    # Removed after the glob listed it: nothing to account
                    logging.debug("Lock file vanished before read %s", path)
                    continue
                lf.watch(lambda k=key: self._on_lock_released(k))
                logging.debug("Read lock file %s", path)
            else:
                logging.debug("Lock file already in cache %s", key)

            self.cache[key] = lf
            self._account_lock_file(lf)

    logging.debug("Full resource state update finished for %s", self.name)
|
|
723
|
+
|
|
724
|
+
def _on_lock_released(self, name: str) -> None:
    """Forget a watched lock once it has been released (job finished).

    Args:
        name: Cache key of the lock file
    """
    with self.lock:
        if name not in self.cache:
            return

        logging.debug("Lock released (job finished): %s", name)
        released = self.cache.pop(name)
        self._unaccount_lock_file(released)
        # Wake up both the threaded waiters and the async ones
        self.available_condition.notify_all()
        self._notify_poller()
|
|
738
|
+
|
|
739
|
+
def _is_job_lock_file(self, path: Path) -> bool:
    """Tell whether path is a ``.json`` file located under jobs_folder."""
    try:
        path.relative_to(self.jobs_folder)
    except ValueError:
        # Not below jobs_folder
        return False
    return path.suffix == ".json"
|
|
746
|
+
|
|
747
|
+
def _notify_poller(self) -> None:
    """Tell the ResourcePoller, if one exists, that this resource changed.

    Called after file system events to wake up async waiters. The private
    ``_instance`` attribute is read directly, so no poller is created
    when none exists yet.
    """
    from experimaestro.dynamic import ResourcePoller

    if ResourcePoller._instance is None:
        return
    ResourcePoller._instance.notify(self)
|
|
756
|
+
|
|
757
|
+
def on_deleted(self, event) -> None:
    """Handle file deletion event.

    When the deleted path is a cached job lock file, it is removed from
    the cache and from the resource accounting, and waiters are notified.
    """
    logger.debug(
        "Deleted path notification %s [watched %s]",
        event.src_path,
        self.watchedpath,
    )
    deleted = Path(event.src_path)
    if not self._is_job_lock_file(deleted):
        return

    key = self._lock_file_key(deleted)
    # Unlocked check first; confirmed again once the lock is held
    if key not in self.cache:
        return

    with self.lock:
        if key not in self.cache:
            return
        logging.debug("Deleting %s from cache (event)", key)
        lf = self.cache.pop(key)
        self._unaccount_lock_file(lf)
        self.available_condition.notify_all()
        self._notify_poller()
|
|
778
|
+
|
|
779
|
+
def on_created(self, event) -> None:
    """Handle file creation event.

    A new job lock file that is not cached yet is read, watched, cached
    and accounted for.
    """
    logger.debug(
        "Created path notification %s [watched %s]",
        event.src_path,
        self.watchedpath,
    )
    created = Path(event.src_path)
    if not self._is_job_lock_file(created):
        return

    try:
        key = self._lock_file_key(created)
        # Unlocked check first; confirmed again once the lock is held
        if key in self.cache:
            return
        with self.lock:
            if key in self.cache:
                return
            lf = self.lock_file_class(created)
            lf.watch(lambda k=key: self._on_lock_released(k))
            self.cache[key] = lf
            self._account_lock_file(lf)
    except FileNotFoundError:
        # Ignored on purpose (presumably the file is already gone again)
        pass
    except Exception:
        logger.exception("Uncaught exception in on_created handler")
        raise
|
|
804
|
+
|
|
805
|
+
def on_modified(self, event) -> None:
    """Handle file modification event.

    A change to ``informations_path`` is forwarded to
    ``_on_information_modified``. A modified job lock file that is not
    cached yet is read, watched, cached and accounted for.
    """
    try:
        logger.debug(
            "on modified path: %s [watched %s]",
            event.src_path,
            self.watchedpath,
        )
        modified = Path(event.src_path)

        # Handle informations.json modification
        if event.src_path == str(self.informations_path):
            self._on_information_modified()
            return

        # Handle job lock files
        if not self._is_job_lock_file(modified):
            return

        key = self._lock_file_key(modified)
        # Unlocked check first; confirmed again once the lock is held
        if key in self.cache:
            return
        with self.lock:
            if key in self.cache:
                return
            logger.debug("Lock file not in cache %s", key)
            try:
                lf = self.lock_file_class(modified)
                lf.watch(lambda k=key: self._on_lock_released(k))
                self.cache[key] = lf
                self._account_lock_file(lf)
            except FileNotFoundError:
                # Ignored on purpose (presumably the file is already gone)
                pass
    except Exception:
        logger.exception("Uncaught exception in on_modified handler")
        raise
|
|
839
|
+
|
|
840
|
+
def _on_information_modified(self) -> None:
    """Handle informations.json modification.

    Compares the file modification time with ``self.timestamp`` to avoid
    duplicate processing, then calls ``_handle_information_change()`` for
    subclass-specific logic. If the file no longer exists when it is
    inspected (e.g. removed or replaced right after the event), the
    notification is ignored instead of raising in the watcher callback.
    """
    import os

    logger.debug("Resource information modified: %s", self.name)
    with self.lock:
        try:
            timestamp = os.path.getmtime(self.informations_path)
        except FileNotFoundError:
            logger.debug(
                "Information file %s no longer exists", self.informations_path
            )
            return

        if timestamp <= self.timestamp:
            logger.debug(
                "Not reading information file [%f <= %f]",
                timestamp,
                self.timestamp,
            )
            return

        self._handle_information_change()
|
|
860
|
+
|
|
861
|
+
def _handle_information_change(self) -> None:
    """React to a change of informations.json (subclass hook).

    Invoked by ``_on_information_modified`` once the timestamp check has
    passed. The base implementation does nothing; subclasses override it.
    """
    return None
|
|
868
|
+
|
|
869
|
+
@abstractmethod
def _reset_state(self) -> None:
    """Clear the resource state ahead of a full re-read of the lock files.

    ``_update()`` calls this before iterating over the lock files.
    """
    ...
|
|
876
|
+
|
|
877
|
+
@abstractmethod
def _account_lock_file(self, lf: DynamicLockFile) -> None:
    """Add a lock file to the resource state accounting.

    Called for every lock file kept by ``_update()`` and for lock files
    discovered by the ``on_created`` / ``on_modified`` handlers.

    Args:
        lf: The lock file to account for
    """
    ...
|
|
887
|
+
|
|
888
|
+
@abstractmethod
def _unaccount_lock_file(self, lf: DynamicLockFile) -> None:
    """Take a lock file out of the resource state accounting.

    Called by ``_on_lock_released`` and ``on_deleted`` after the lock file
    has been removed from the cache.

    Args:
        lf: The lock file to unaccount
    """
    ...
|
|
898
|
+
|
|
899
|
+
@abstractmethod
def is_available(self, dependency: "DynamicDependency") -> bool:
    """Tell whether the resource can currently serve the dependency.

    ``acquire()`` calls this right after refreshing the state.

    Args:
        dependency: The dependency requesting the resource

    Returns:
        True if resource is available
    """
    ...
|
|
910
|
+
|
|
911
|
+
@abstractmethod
def _do_acquire(self, dependency: "DynamicDependency") -> None:
    """Apply the resource-specific part of an acquisition.

    ``acquire()`` calls this once availability has been confirmed and the
    lock file has been created.

    Args:
        dependency: The dependency acquiring the resource
    """
    ...
|
|
921
|
+
|
|
922
|
+
@abstractmethod
def _do_release(self, dependency: "DynamicDependency") -> None:
    """Apply the resource-specific part of a release.

    ``release()`` calls this before the lock file is deleted.

    Args:
        dependency: The dependency releasing the resource
    """
    ...
|
|
932
|
+
|
|
933
|
+
def _get_job_lock_path(self, dependency: "DynamicDependency") -> Path:
    """Return the lock file path of a dependency.

    The path lies under jobs_folder: jobs/{task_id}@{identifier}.json
    """
    target = dependency.target
    relpath = get_job_lock_relpath(target.task_id, target.identifier)
    return self.jobs_folder / relpath
|
|
940
|
+
|
|
941
|
+
def acquire(self, dependency: "DynamicDependency") -> None:
    """Acquire the resource for a dependency.

    The state is refreshed from disk under both the thread lock and the
    IPC lock. A lock file is then created for the job and cached, and
    ``_do_acquire`` is called. If ``_do_acquire`` fails, the lock file is
    removed again so that a failed acquisition does not keep the resource
    reserved.

    Args:
        dependency: The dependency requesting the resource

    Raises:
        LockError: If resource is not available
    """
    with self.lock, self.ipc_lock:
        self._update()
        if not self.is_available(dependency):
            raise LockError(f"Resource {self.name} not available")

        # Create lock file
        lock_path = self._get_job_lock_path(dependency)
        lock_path.parent.mkdir(parents=True, exist_ok=True)
        lock_key = self._lock_file_key(lock_path)

        lf = self.lock_file_class.create(
            lock_path,
            self._get_job_uri(dependency),
            information=self._get_lock_file_information(dependency),
        )
        self.cache[lock_key] = lf

        try:
            self._do_acquire(dependency)
        except BaseException:
            # Roll back: the caller never got the resource, so it will
            # not call release() for this lock file.
            self.cache.pop(lock_key, None)
            lf.delete()
            raise

        logger.debug("Acquired %s for %s", self.name, dependency)
|
|
970
|
+
|
|
971
|
+
def release(self, dependency: "DynamicDependency") -> None:
    """Release the resource for a dependency.

    The state is refreshed from disk under both the thread lock and the
    IPC lock. If the lock file of the dependency is no longer cached,
    nothing is done. Otherwise it is removed from the cache,
    ``_do_release`` is called, waiters are notified and the lock file is
    deleted. Notification and deletion happen even when ``_do_release``
    raises, so a failing hook cannot leave the lock file behind.

    Args:
        dependency: The dependency releasing the resource
    """
    with self.lock, self.ipc_lock:
        self._update()

        lock_path = self._get_job_lock_path(dependency)
        lock_key = self._lock_file_key(lock_path)
        lf = self.cache.get(lock_key)
        if lf is None:
            # Lock file may have been released already (e.g., job completed)
            logger.debug(
                "Lock file not in cache for %s (%s) - may have been released already",
                dependency,
                lock_key,
            )
            return

        logger.debug("Deleting %s from cache", lock_key)
        del self.cache[lock_key]

        try:
            self._do_release(dependency)
        finally:
            # The cache entry is already gone: always finish the release
            self.available_condition.notify_all()
            lf.delete()
|
|
999
|
+
|
|
1000
|
+
def _get_job_uri(self, dependency: "DynamicDependency") -> str:
    """Return the job URI recorded for a dependency.

    The default is the string form of ``dependency.target.basepath``.

    Args:
        dependency: The dependency

    Returns:
        Job URI string
    """
    basepath = dependency.target.basepath
    return str(basepath)
|
|
1012
|
+
|
|
1013
|
+
def _get_lock_file_information(self, dependency: "DynamicDependency"):
    """Return the information to store in the lock file.

    The base implementation stores nothing (``None``); subclasses
    override it to record type-specific data.

    Args:
        dependency: The dependency

    Returns:
        Information for lock file creation (JSON-serializable)
    """
    information = None
    return information
|