furu 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- furu/__init__.py +82 -0
- furu/adapters/__init__.py +3 -0
- furu/adapters/submitit.py +195 -0
- furu/config.py +98 -0
- furu/core/__init__.py +4 -0
- furu/core/furu.py +999 -0
- furu/core/list.py +123 -0
- furu/dashboard/__init__.py +9 -0
- furu/dashboard/__main__.py +7 -0
- furu/dashboard/api/__init__.py +7 -0
- furu/dashboard/api/models.py +170 -0
- furu/dashboard/api/routes.py +135 -0
- furu/dashboard/frontend/dist/assets/index-CbdDfSOZ.css +1 -0
- furu/dashboard/frontend/dist/assets/index-DDv_TYB_.js +67 -0
- furu/dashboard/frontend/dist/favicon.svg +10 -0
- furu/dashboard/frontend/dist/index.html +22 -0
- furu/dashboard/main.py +134 -0
- furu/dashboard/scanner.py +931 -0
- furu/errors.py +76 -0
- furu/migrate.py +48 -0
- furu/migration.py +926 -0
- furu/runtime/__init__.py +27 -0
- furu/runtime/env.py +8 -0
- furu/runtime/logging.py +301 -0
- furu/runtime/tracebacks.py +64 -0
- furu/serialization/__init__.py +20 -0
- furu/serialization/migrations.py +246 -0
- furu/serialization/serializer.py +233 -0
- furu/storage/__init__.py +32 -0
- furu/storage/metadata.py +282 -0
- furu/storage/migration.py +81 -0
- furu/storage/state.py +1107 -0
- furu-0.0.1.dist-info/METADATA +502 -0
- furu-0.0.1.dist-info/RECORD +36 -0
- furu-0.0.1.dist-info/WHEEL +4 -0
- furu-0.0.1.dist-info/entry_points.txt +2 -0
furu/storage/state.py
ADDED
@@ -0,0 +1,1107 @@
import datetime as _dt
import json
import os
import socket
import threading
import time
import uuid
from collections.abc import Generator
from contextlib import contextmanager
from dataclasses import dataclass
from pathlib import Path
from typing import Annotated, Any, Callable, Literal, Mapping, TypedDict, TypeAlias

from pydantic import BaseModel, ConfigDict, Field, ValidationError, model_validator

from ..errors import FuruLockNotAcquired, FuruWaitTimeout


# Type alias for scheduler-specific metadata. Different schedulers (SLURM, LSF, PBS, local)
# return different fields, so this must remain dynamic.
SchedulerMetadata = dict[str, Any]

# Type alias for probe results from submitit adapter
ProbeResult = dict[str, Any]

EventValue: TypeAlias = str | int | float | bool
EventMapping: TypeAlias = Mapping[str, EventValue]


class _LockInfoDict(TypedDict, total=False):
    """TypedDict for lock file information."""

    pid: int
    host: str
    created_at: str
    lock_id: str


class _OwnerDict(TypedDict, total=False):
    """TypedDict for owner information passed to state manager functions."""

    pid: int | None
    host: str | None
    hostname: str | None
    user: str | None
    command: str | None
    timestamp: str | None
    python_version: str | None
    executable: str | None
    platform: str | None


class _ErrorDict(TypedDict, total=False):
    """TypedDict for error information passed to state manager functions."""

    type: str
    message: str
    traceback: str | None


class _StateResultBase(BaseModel):
    model_config = ConfigDict(extra="forbid", validate_assignment=True, strict=True)


class _StateResultAbsent(_StateResultBase):
    status: Literal["absent"] = "absent"


class _StateResultIncomplete(_StateResultBase):
    status: Literal["incomplete"] = "incomplete"


class _StateResultSuccess(_StateResultBase):
    status: Literal["success"] = "success"
    created_at: str


class _StateResultFailed(_StateResultBase):
    status: Literal["failed"] = "failed"


class _StateResultMigrated(_StateResultBase):
    status: Literal["migrated"] = "migrated"


_StateResult = Annotated[
    _StateResultAbsent
    | _StateResultIncomplete
    | _StateResultSuccess
    | _StateResultFailed
    | _StateResultMigrated,
    Field(discriminator="status"),
]


def _coerce_result(current: _StateResult, **updates: str) -> _StateResult:
    data = current.model_dump(mode="json")
    data.update(updates)
    status = data.get("status")
    match status:
        case "absent":
            return _StateResultAbsent(status="absent")
        case "incomplete":
            return _StateResultIncomplete(status="incomplete")
        case "success":
            created_at = data.get("created_at")
            if not isinstance(created_at, str) or not created_at:
                raise ValueError("Success result requires created_at")
            return _StateResultSuccess(status="success", created_at=created_at)
        case "failed":
            return _StateResultFailed(status="failed")
        case "migrated":
            return _StateResultMigrated(status="migrated")
        case _:
            raise ValueError(f"Invalid result status: {status!r}")

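# A minimal illustrative sketch (not part of this file) of how the
# discriminated result union round-trips through _coerce_result; the
# timestamp value is made up:
#
#     result: _StateResult = _StateResultAbsent(status="absent")
#     result = _coerce_result(result, status="success",
#                             created_at="2024-01-01T00:00:00+00:00")
#     assert isinstance(result, _StateResultSuccess)
#
# Because the union is discriminated on `status`, a dumped dict can always
# be re-validated into the correct variant.
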
class StateOwner(BaseModel):
    """Owner information for a Furu attempt."""

    model_config = ConfigDict(extra="forbid", validate_assignment=True, strict=True)

    pid: int | None = None
    host: str | None = None
    hostname: str | None = None
    user: str | None = None
    command: str | None = None
    timestamp: str | None = None
    python_version: str | None = None
    executable: str | None = None
    platform: str | None = None

    @model_validator(mode="before")
    @classmethod
    def _normalize_host_keys(
        cls, data: dict[str, str | int | None] | Any
    ) -> dict[str, str | int | None] | Any:
        if not isinstance(data, dict):
            return data
        host = data.get("host")
        hostname = data.get("hostname")
        if host is None and hostname is not None:
            data = dict(data)
            data["host"] = hostname
            return data
        if hostname is None and host is not None:
            data = dict(data)
            data["hostname"] = host
            return data
        # Fall-through must return the data unchanged: a "before" validator
        # that implicitly returns None would make pydantic validate None
        # whenever both (or neither) of host/hostname are set.
        return data

class FuruErrorState(BaseModel):
    """Error state information for a Furu attempt."""

    model_config = ConfigDict(extra="forbid", validate_assignment=True, strict=True)

    type: str = "UnknownError"
    message: str = ""
    traceback: str | None = None


class _StateAttemptBase(BaseModel):
    model_config = ConfigDict(extra="forbid", validate_assignment=True, strict=True)

    id: str
    number: int = 1
    backend: str
    started_at: str
    heartbeat_at: str
    lease_duration_sec: float
    lease_expires_at: str
    owner: StateOwner
    scheduler: SchedulerMetadata = Field(default_factory=dict)


class _StateAttemptQueued(_StateAttemptBase):
    status: Literal["queued"] = "queued"


class _StateAttemptRunning(_StateAttemptBase):
    status: Literal["running"] = "running"


class _StateAttemptSuccess(_StateAttemptBase):
    status: Literal["success"] = "success"
    ended_at: str
    reason: None = None


class _StateAttemptFailed(_StateAttemptBase):
    status: Literal["failed"] = "failed"
    ended_at: str
    error: FuruErrorState
    reason: str | None = None


class _StateAttemptTerminal(_StateAttemptBase):
    status: Literal["cancelled", "preempted", "crashed"]
    ended_at: str
    error: FuruErrorState | None = None
    reason: str | None = None


_StateAttempt = Annotated[
    _StateAttemptQueued
    | _StateAttemptRunning
    | _StateAttemptSuccess
    | _StateAttemptFailed
    | _StateAttemptTerminal,
    Field(discriminator="status"),
]


class StateAttempt(BaseModel):
    """
    Public read-only representation of a Furu attempt.

    This model is used for external APIs (like the dashboard) to expose
    attempt information without coupling to internal state variants.
    All fields that may not be present on all attempt types are optional.
    """

    model_config = ConfigDict(extra="forbid", strict=True)

    id: str
    number: int
    backend: str
    status: str
    started_at: str
    heartbeat_at: str
    lease_duration_sec: float
    lease_expires_at: str
    owner: StateOwner
    scheduler: SchedulerMetadata = Field(default_factory=dict)
    ended_at: str | None = None
    error: FuruErrorState | None = None
    reason: str | None = None

    @classmethod
    def from_internal(cls, attempt: _StateAttempt) -> "StateAttempt":
        """Create a StateAttempt from an internal attempt state."""
        return cls(
            id=attempt.id,
            number=attempt.number,
            backend=attempt.backend,
            status=attempt.status,
            started_at=attempt.started_at,
            heartbeat_at=attempt.heartbeat_at,
            lease_duration_sec=attempt.lease_duration_sec,
            lease_expires_at=attempt.lease_expires_at,
            owner=attempt.owner,
            scheduler=attempt.scheduler,
            ended_at=getattr(attempt, "ended_at", None),
            error=getattr(attempt, "error", None),
            reason=getattr(attempt, "reason", None),
        )


class _FuruState(BaseModel):
    model_config = ConfigDict(extra="forbid", validate_assignment=True, strict=True)

    schema_version: int = 1
    result: _StateResult = Field(
        default_factory=lambda: _StateResultAbsent(status="absent")
    )
    attempt: _StateAttempt | None = None
    updated_at: str | None = None

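# For reference, a _FuruState document as persisted to .furu/state.json
# might look like this (illustrative values only; key order and exact
# contents depend on the model dump):
#
#     {
#       "schema_version": 1,
#       "result": {"status": "success", "created_at": "2024-01-01T00:00:30+00:00"},
#       "attempt": {
#         "status": "success",
#         "id": "3f2a...",
#         "number": 1,
#         "backend": "local",
#         "started_at": "2024-01-01T00:00:00+00:00",
#         "heartbeat_at": "2024-01-01T00:00:15+00:00",
#         "lease_duration_sec": 60.0,
#         "lease_expires_at": "2024-01-01T00:01:15+00:00",
#         "owner": {"pid": 1234, "host": "node01", "hostname": "node01", ...},
#         "scheduler": {},
#         "ended_at": "2024-01-01T00:00:30+00:00",
#         "reason": null
#       },
#       "updated_at": "2024-01-01T00:00:30+00:00"
#     }
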
class StateManager:
    """
    Crash-safe state and liveness management for a single Furu artifact directory.

    Design principles:
    - Only `result.status == "success"` is treated as loadable by default.
    - `attempt.status == "running"` is a lease-based claim that must be reconcilable.
    - Writes are atomic (`os.replace`) and serialized via a state lock.
    """

    SCHEMA_VERSION = 1

    INTERNAL_DIR = ".furu"

    STATE_FILE = "state.json"
    EVENTS_FILE = "events.jsonl"
    SUCCESS_MARKER = "SUCCESS.json"

    COMPUTE_LOCK = ".compute.lock"
    SUBMIT_LOCK = ".submit.lock"
    STATE_LOCK = ".state.lock"

    TERMINAL_STATUSES = {
        "success",
        "failed",
        "cancelled",
        "preempted",
        "crashed",
    }

    @classmethod
    def get_internal_dir(cls, directory: Path) -> Path:
        return directory / cls.INTERNAL_DIR

    @classmethod
    def get_state_path(cls, directory: Path) -> Path:
        return cls.get_internal_dir(directory) / cls.STATE_FILE

    @classmethod
    def get_events_path(cls, directory: Path) -> Path:
        return cls.get_internal_dir(directory) / cls.EVENTS_FILE

    @classmethod
    def get_success_marker_path(cls, directory: Path) -> Path:
        return cls.get_internal_dir(directory) / cls.SUCCESS_MARKER

    @classmethod
    def get_lock_path(cls, directory: Path, lock_name: str) -> Path:
        return cls.get_internal_dir(directory) / lock_name

    @classmethod
    def _utcnow(cls) -> _dt.datetime:
        return _dt.datetime.now(_dt.timezone.utc)

    @classmethod
    def _iso_now(cls) -> str:
        return cls._utcnow().isoformat(timespec="seconds")

    @classmethod
    def _parse_time(cls, value: str | None) -> _dt.datetime | None:
        if not isinstance(value, str) or not value:
            return None
        dt = _dt.datetime.fromisoformat(value)
        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=_dt.timezone.utc)
        return dt.astimezone(_dt.timezone.utc)

    @classmethod
    def default_state(cls) -> _FuruState:
        return _FuruState(schema_version=cls.SCHEMA_VERSION)

    @classmethod
    def read_state(cls, directory: Path) -> _FuruState:
        state_path = cls.get_state_path(directory)
        if not state_path.is_file():
            return cls.default_state()

        text = state_path.read_text()

        try:
            data = json.loads(text)
        except Exception as e:
            raise ValueError(f"Invalid JSON in state file: {state_path}") from e

        if not isinstance(data, dict):
            raise ValueError(f"Invalid state file (expected object): {state_path}")
        if data.get("schema_version") != cls.SCHEMA_VERSION:
            raise ValueError(
                f"Unsupported state schema_version (expected {cls.SCHEMA_VERSION}): {state_path}"
            )
        try:
            return _FuruState.model_validate(data)
        except ValidationError as e:
            raise ValueError(f"Invalid state schema: {state_path}") from e

    @classmethod
    def _write_state_unlocked(cls, directory: Path, state: _FuruState) -> None:
        state_path = cls.get_state_path(directory)
        state_path.parent.mkdir(parents=True, exist_ok=True)
        tmp_path = state_path.with_suffix(".tmp")
        tmp_path.write_text(json.dumps(state.model_dump(mode="json"), indent=2))
        os.replace(tmp_path, state_path)

    @classmethod
    def _pid_alive(cls, pid: int) -> bool:
        try:
            os.kill(pid, 0)
            return True
        except ProcessLookupError:
            return False
        except PermissionError:
            # Process exists but we can't signal it - still alive
            return True

    @classmethod
    def try_lock(cls, lock_path: Path) -> int | None:
        try:
            lock_path.parent.mkdir(parents=True, exist_ok=True)
            fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_RDWR, 0o644)
            payload = {
                "pid": os.getpid(),
                "host": socket.gethostname(),
                "created_at": cls._iso_now(),
                "lock_id": uuid.uuid4().hex,
            }
            os.write(fd, (json.dumps(payload) + "\n").encode())
            return fd
        except FileExistsError:
            return None

    @classmethod
    def release_lock(cls, fd: int | None, lock_path: Path) -> None:
        if fd is not None:
            os.close(fd)
        lock_path.unlink(missing_ok=True)

    @classmethod
    def _read_lock_info(cls, lock_path: Path) -> _LockInfoDict | None:
        if not lock_path.is_file():
            return None
        text = lock_path.read_text().strip()
        if not text:
            return None
        lines = text.splitlines()
        if not lines:
            return None
        data = json.loads(lines[0])
        if isinstance(data, dict):
            return data  # type: ignore[return-value]
        return None

    @classmethod
    def _acquire_lock_blocking(
        cls,
        lock_path: Path,
        *,
        timeout_sec: float = 5.0,
        stale_after_sec: float = 60.0,
    ) -> int:
        deadline = time.time() + timeout_sec
        while True:
            fd = cls.try_lock(lock_path)
            if fd is not None:
                return fd

            should_break = False
            info = cls._read_lock_info(lock_path)
            if info and info.get("host") == socket.gethostname():
                pid = info.get("pid")
                if isinstance(pid, int) and not cls._pid_alive(pid):
                    should_break = True
            if not should_break:
                try:
                    stat_result = lock_path.stat()
                    age = time.time() - stat_result.st_mtime
                    if age > stale_after_sec:
                        should_break = True
                except FileNotFoundError:
                    # Lock file was deleted by another process, retry
                    pass

            if should_break:
                lock_path.unlink(missing_ok=True)
                continue

            if time.time() >= deadline:
                raise TimeoutError(f"Timeout acquiring lock: {lock_path}")
            time.sleep(0.05)

    @classmethod
    def update_state(
        cls, directory: Path, mutator: Callable[[_FuruState], None]
    ) -> _FuruState:
        lock_path = cls.get_lock_path(directory, cls.STATE_LOCK)
        fd: int | None = None
        try:
            fd = cls._acquire_lock_blocking(lock_path)
            state = cls.read_state(directory)
            mutator(state)
            state.schema_version = cls.SCHEMA_VERSION
            state.updated_at = cls._iso_now()
            validated = _FuruState.model_validate(state)
            cls._write_state_unlocked(directory, validated)
            return validated
        finally:
            cls.release_lock(fd, lock_path)

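    # Illustrative only (not part of this file): callers pass a mutator that
    # edits the state in place; update_state handles locking under
    # .state.lock, re-validation, and the atomic write. The path below is a
    # made-up example:
    #
    #     def bump(state: _FuruState) -> None:
    #         state.result = _coerce_result(state.result, status="incomplete")
    #
    #     StateManager.update_state(Path("artifacts/exp1"), bump)
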
    @classmethod
    def append_event(cls, directory: Path, event: EventMapping) -> None:
        path = cls.get_events_path(directory)
        enriched = {
            "ts": cls._iso_now(),
            "pid": os.getpid(),
            "host": socket.gethostname(),
            **event,
        }
        path.parent.mkdir(parents=True, exist_ok=True)
        with path.open("a", encoding="utf-8") as f:
            f.write(json.dumps(enriched) + "\n")

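    # A line in events.jsonl might look like this (values illustrative):
    #
    #     {"ts": "2024-01-01T00:00:00+00:00", "pid": 1234, "host": "node01",
    #      "type": "attempt_started", "attempt_id": "3f2a...",
    #      "backend": "local", "status": "running"}
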
    @classmethod
    def write_success_marker(cls, directory: Path, *, attempt_id: str) -> None:
        marker = cls.get_success_marker_path(directory)
        marker.parent.mkdir(parents=True, exist_ok=True)
        payload = {"attempt_id": attempt_id, "created_at": cls._iso_now()}
        tmp = marker.with_suffix(".tmp")
        tmp.write_text(json.dumps(payload, indent=2))
        os.replace(tmp, marker)

    @classmethod
    def success_marker_exists(cls, directory: Path) -> bool:
        return cls.get_success_marker_path(directory).is_file()

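    # SUCCESS.json is the durable "done" signal that reconcile() trusts even
    # when state.json still shows a running attempt. Its payload is just
    # (values illustrative):
    #
    #     {"attempt_id": "3f2a...", "created_at": "2024-01-01T00:00:30+00:00"}
    #
    # Writing via a .tmp file plus os.replace keeps the marker atomic, so
    # readers never observe a half-written file.
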
    @classmethod
    def _lease_expired(
        cls, attempt: _StateAttemptQueued | _StateAttemptRunning
    ) -> bool:
        expires = cls._parse_time(attempt.lease_expires_at)
        if expires is None:
            return True
        return cls._utcnow() >= expires

    @classmethod
    def start_attempt_queued(
        cls,
        directory: Path,
        *,
        backend: str,
        lease_duration_sec: float,
        owner: _OwnerDict,
        scheduler: SchedulerMetadata | None = None,
    ) -> str:
        return cls._start_attempt(
            directory,
            backend=backend,
            lease_duration_sec=lease_duration_sec,
            owner=owner,
            scheduler=scheduler,
            attempt_cls=_StateAttemptQueued,
        )

    @classmethod
    def start_attempt_running(
        cls,
        directory: Path,
        *,
        backend: str,
        lease_duration_sec: float,
        owner: _OwnerDict,
        scheduler: SchedulerMetadata | None = None,
    ) -> str:
        return cls._start_attempt(
            directory,
            backend=backend,
            lease_duration_sec=lease_duration_sec,
            owner=owner,
            scheduler=scheduler,
            attempt_cls=_StateAttemptRunning,
        )

    @classmethod
    def _start_attempt(
        cls,
        directory: Path,
        *,
        backend: str,
        lease_duration_sec: float,
        owner: _OwnerDict,
        scheduler: SchedulerMetadata | None,
        attempt_cls: type[_StateAttemptQueued] | type[_StateAttemptRunning],
    ) -> str:
        attempt_id = uuid.uuid4().hex
        now = cls._utcnow()
        expires = now + _dt.timedelta(seconds=float(lease_duration_sec))
        prev_result_failed = False
        prev_attempt_status: str | None = None
        prev_attempt_reason: str | None = None

        def mutate(state: _FuruState) -> None:
            nonlocal prev_result_failed, prev_attempt_status, prev_attempt_reason
            prev_result_failed = isinstance(state.result, _StateResultFailed)
            prev = state.attempt
            if prev is not None:
                prev_attempt_status = prev.status
                prev_attempt_reason = getattr(prev, "reason", None)

            number = (prev.number + 1) if prev is not None else 1

            owner_state = StateOwner.model_validate(owner)
            started_at = now.isoformat(timespec="seconds")
            heartbeat_at = started_at
            lease_duration = float(lease_duration_sec)
            lease_expires_at = expires.isoformat(timespec="seconds")
            scheduler_state: SchedulerMetadata = scheduler or {}

            attempt_kwargs = dict(
                id=attempt_id,
                number=int(number),
                backend=backend,
                started_at=started_at,
                heartbeat_at=heartbeat_at,
                lease_duration_sec=lease_duration,
                lease_expires_at=lease_expires_at,
                owner=owner_state,
                scheduler=scheduler_state,
            )
            state.attempt = attempt_cls(**attempt_kwargs)  # type: ignore[arg-type, misc]

            state.result = _coerce_result(state.result, status="incomplete")

        state = cls.update_state(directory, mutate)
        if attempt_cls is _StateAttemptRunning:
            from ..runtime.logging import get_logger

            logger = get_logger()
            if prev_result_failed:
                logger.warning(
                    "state: retrying after previous failure %s",
                    directory,
                )
            elif prev_attempt_status == "crashed" and prev_attempt_reason in {
                "pid_dead",
                "lease_expired",
            }:
                logger.warning(
                    "state: restarting after stale attempt (%s) %s",
                    prev_attempt_reason,
                    directory,
                )

        cls.append_event(
            directory,
            {
                "type": "attempt_started",
                "attempt_id": attempt_id,
                "backend": backend,
                "status": state.attempt.status
                if state.attempt is not None
                else "unknown",
            },
        )
        attempt = state.attempt
        if attempt is None:  # pragma: no cover
            raise RuntimeError("start_attempt did not create attempt")
        return attempt.id

    @classmethod
    def heartbeat(
        cls, directory: Path, *, attempt_id: str, lease_duration_sec: float
    ) -> bool:
        ok = False

        def mutate(state: _FuruState) -> None:
            nonlocal ok
            attempt = state.attempt
            if not isinstance(attempt, _StateAttemptRunning):
                return
            if attempt.id != attempt_id:
                return
            now = cls._utcnow()
            expires = now + _dt.timedelta(seconds=float(lease_duration_sec))
            attempt.heartbeat_at = now.isoformat(timespec="seconds")
            attempt.lease_duration_sec = float(lease_duration_sec)
            attempt.lease_expires_at = expires.isoformat(timespec="seconds")
            ok = True

        cls.update_state(directory, mutate)
        return ok

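    # Lease arithmetic, spelled out with illustrative numbers: with
    # lease_duration_sec=60 and a heartbeat every 15s, a healthy attempt
    # keeps lease_expires_at roughly 60s in the future at all times. If the
    # process dies, the lease stops being renewed and expires within one
    # lease duration, at which point reconcile() below can mark the attempt
    # as crashed with reason "lease_expired".
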
    @classmethod
    def set_attempt_fields(
        cls, directory: Path, *, attempt_id: str, fields: SchedulerMetadata
    ) -> bool:
        ok = False

        def mutate(state: _FuruState) -> None:
            nonlocal ok
            attempt = state.attempt
            if attempt is None or attempt.id != attempt_id:
                return
            for key, value in fields.items():
                if key == "scheduler" and isinstance(value, dict):
                    attempt.scheduler.update(value)
                    continue
                if hasattr(attempt, key):
                    setattr(attempt, key, value)
            ok = True

        cls.update_state(directory, mutate)
        return ok

    @classmethod
    def finish_attempt_success(cls, directory: Path, *, attempt_id: str) -> None:
        now = cls._iso_now()

        def mutate(state: _FuruState) -> None:
            attempt = state.attempt
            if attempt is not None and attempt.id == attempt_id:
                state.attempt = _StateAttemptSuccess(
                    id=attempt.id,
                    number=attempt.number,
                    backend=attempt.backend,
                    started_at=attempt.started_at,
                    heartbeat_at=attempt.heartbeat_at,
                    lease_duration_sec=attempt.lease_duration_sec,
                    lease_expires_at=attempt.lease_expires_at,
                    owner=attempt.owner,
                    scheduler=attempt.scheduler,
                    ended_at=now,
                )
                state.result = _coerce_result(
                    state.result, status="success", created_at=now
                )

        cls.update_state(directory, mutate)
        cls.append_event(
            directory,
            {"type": "attempt_finished", "attempt_id": attempt_id, "status": "success"},
        )

    @classmethod
    def finish_attempt_failed(
        cls,
        directory: Path,
        *,
        attempt_id: str,
        error: _ErrorDict,
    ) -> None:
        now = cls._iso_now()

        error_state = FuruErrorState.model_validate(error)

        def mutate(state: _FuruState) -> None:
            attempt = state.attempt
            if attempt is not None and attempt.id == attempt_id:
                state.attempt = _StateAttemptFailed(
                    id=attempt.id,
                    number=attempt.number,
                    backend=attempt.backend,
                    started_at=attempt.started_at,
                    heartbeat_at=attempt.heartbeat_at,
                    lease_duration_sec=attempt.lease_duration_sec,
                    lease_expires_at=attempt.lease_expires_at,
                    owner=attempt.owner,
                    scheduler=attempt.scheduler,
                    ended_at=now,
                    error=error_state,
                )

            state.result = _coerce_result(state.result, status="failed")

        cls.update_state(directory, mutate)
        cls.append_event(
            directory,
            {"type": "attempt_finished", "attempt_id": attempt_id, "status": "failed"},
        )

    @classmethod
    def finish_attempt_preempted(
        cls,
        directory: Path,
        *,
        attempt_id: str,
        error: _ErrorDict,
        reason: str | None = None,
    ) -> None:
        now = cls._iso_now()
        error_state = FuruErrorState.model_validate(error)

        def mutate(state: _FuruState) -> None:
            attempt = state.attempt
            if attempt is not None and attempt.id == attempt_id:
                state.attempt = _StateAttemptTerminal(
                    status="preempted",
                    id=attempt.id,
                    number=attempt.number,
                    backend=attempt.backend,
                    started_at=attempt.started_at,
                    heartbeat_at=attempt.heartbeat_at,
                    lease_duration_sec=attempt.lease_duration_sec,
                    lease_expires_at=attempt.lease_expires_at,
                    owner=attempt.owner,
                    scheduler=attempt.scheduler,
                    ended_at=now,
                    error=error_state,
                    reason=reason,
                )
                state.result = _coerce_result(state.result, status="incomplete")

        cls.update_state(directory, mutate)
        cls.append_event(
            directory,
            {
                "type": "attempt_finished",
                "attempt_id": attempt_id,
                "status": "preempted",
            },
        )

    @classmethod
    def _local_attempt_alive(
        cls, attempt: _StateAttemptQueued | _StateAttemptRunning
    ) -> bool | None:
        host = attempt.owner.host
        pid = attempt.owner.pid
        if host != socket.gethostname():
            return None
        if not isinstance(pid, int):
            return None
        return cls._pid_alive(pid)

    @classmethod
    def reconcile(
        cls,
        directory: Path,
        *,
        submitit_probe: Callable[[_FuruState], ProbeResult] | None = None,
    ) -> _FuruState:
        """
        Reconcile a possibly-stale running/queued attempt.

        - If a success marker exists, promote to success.
        - For local attempts, if PID is provably dead or lease expired, mark as crashed and
          remove compute lock so waiters can proceed.
        - For submitit attempts, rely on `submitit_probe` when provided; otherwise fall back
          to lease expiry.
        """

        def mutate(state: _FuruState) -> None:
            attempt = state.attempt
            if not isinstance(attempt, (_StateAttemptQueued, _StateAttemptRunning)):
                return

            # Fast promotion if we can see a durable success marker.
            if cls.success_marker_exists(directory):
                ended = cls._iso_now()
                state.attempt = _StateAttemptSuccess(
                    id=attempt.id,
                    number=attempt.number,
                    backend=attempt.backend,
                    started_at=attempt.started_at,
                    heartbeat_at=attempt.heartbeat_at,
                    lease_duration_sec=attempt.lease_duration_sec,
                    lease_expires_at=attempt.lease_expires_at,
                    owner=attempt.owner,
                    scheduler=attempt.scheduler,
                    ended_at=ended,
                )
                state.result = _coerce_result(
                    state.result, status="success", created_at=ended
                )
                return

            backend = attempt.backend
            now = cls._iso_now()

            terminal_status: str | None = None
            reason: str | None = None

            if backend == "local":
                alive = cls._local_attempt_alive(attempt)
                if alive is False:
                    terminal_status = "crashed"
                    reason = "pid_dead"
                elif cls._lease_expired(attempt):
                    terminal_status = "crashed"
                    reason = "lease_expired"
            elif backend == "submitit":
                if submitit_probe is not None:
                    verdict = submitit_probe(state)
                    if verdict.get("terminal_status") in cls.TERMINAL_STATUSES:
                        terminal_status = str(verdict["terminal_status"])
                        reason = str(verdict.get("reason") or "scheduler_terminal")
                        attempt.scheduler.update(
                            {k: v for k, v in verdict.items() if k != "terminal_status"}
                        )
                if terminal_status is None and cls._lease_expired(attempt):
                    terminal_status = "crashed"
                    reason = "lease_expired"
            else:
                if cls._lease_expired(attempt):
                    terminal_status = "crashed"
                    reason = "lease_expired"

            if terminal_status is None:
                return
            if terminal_status == "success":
                terminal_status = "crashed"
                reason = reason or "scheduler_success_no_success_marker"

            if terminal_status == "failed":
                state.attempt = _StateAttemptFailed(
                    id=attempt.id,
                    number=attempt.number,
                    backend=attempt.backend,
                    started_at=attempt.started_at,
                    heartbeat_at=attempt.heartbeat_at,
                    lease_duration_sec=attempt.lease_duration_sec,
                    lease_expires_at=attempt.lease_expires_at,
                    owner=attempt.owner,
                    scheduler=attempt.scheduler,
                    ended_at=now,
                    error=FuruErrorState(type="FuruComputeError", message=reason or ""),
                    reason=reason,
                )
            else:
                if terminal_status == "cancelled":
                    state.attempt = _StateAttemptTerminal(
                        status="cancelled",
                        id=attempt.id,
                        number=attempt.number,
                        backend=attempt.backend,
                        started_at=attempt.started_at,
                        heartbeat_at=attempt.heartbeat_at,
                        lease_duration_sec=attempt.lease_duration_sec,
                        lease_expires_at=attempt.lease_expires_at,
                        owner=attempt.owner,
                        scheduler=attempt.scheduler,
                        ended_at=now,
                        reason=reason,
                    )
                elif terminal_status == "preempted":
                    state.attempt = _StateAttemptTerminal(
                        status="preempted",
                        id=attempt.id,
                        number=attempt.number,
                        backend=attempt.backend,
                        started_at=attempt.started_at,
                        heartbeat_at=attempt.heartbeat_at,
                        lease_duration_sec=attempt.lease_duration_sec,
                        lease_expires_at=attempt.lease_expires_at,
                        owner=attempt.owner,
                        scheduler=attempt.scheduler,
                        ended_at=now,
                        reason=reason,
                    )
                else:
                    state.attempt = _StateAttemptTerminal(
                        status="crashed",
                        id=attempt.id,
                        number=attempt.number,
                        backend=attempt.backend,
                        started_at=attempt.started_at,
                        heartbeat_at=attempt.heartbeat_at,
                        lease_duration_sec=attempt.lease_duration_sec,
                        lease_expires_at=attempt.lease_expires_at,
                        owner=attempt.owner,
                        scheduler=attempt.scheduler,
                        ended_at=now,
                        reason=reason,
                    )

            state.result = _coerce_result(
                state.result,
                status="failed" if terminal_status == "failed" else "incomplete",
            )

        state = cls.update_state(directory, mutate)
        attempt = state.attempt
        if attempt is not None and attempt.status in {
            "crashed",
            "cancelled",
            "preempted",
        }:
            cls.get_lock_path(directory, cls.COMPUTE_LOCK).unlink(missing_ok=True)
        return state


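# Illustrative only: a reconcile pass with a scheduler probe. The probe
# receives the current state and returns a ProbeResult dict; when it reports
# a terminal scheduler status, reconcile folds it into the attempt. The
# probe body and path below are made up; the real probe comes from the
# submitit adapter:
#
#     def probe(state: _FuruState) -> ProbeResult:
#         # e.g. ask the scheduler about the job recorded in attempt.scheduler
#         return {"terminal_status": "preempted", "reason": "slurm_preempt"}
#
#     StateManager.reconcile(Path("artifacts/exp1"), submitit_probe=probe)
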
@dataclass
class ComputeLockContext:
    """Context returned when a compute lock is successfully acquired."""

    attempt_id: str
    stop_heartbeat: Callable[[], None]


@contextmanager
def compute_lock(
    directory: Path,
    *,
    backend: str,
    lease_duration_sec: float,
    heartbeat_interval_sec: float,
    owner: _OwnerDict,
    scheduler: SchedulerMetadata | None = None,
    max_wait_time_sec: float | None = None,
    poll_interval_sec: float = 10.0,
    wait_log_every_sec: float = 10.0,
    reconcile_fn: Callable[[Path], None] | None = None,
) -> Generator[ComputeLockContext, None, None]:
    """
    Context manager that atomically acquires the lock, records the attempt,
    and starts the heartbeat.

    This ensures there can never be a mismatch between the lock file and state:
    - Lock acquisition and attempt recording happen together
    - The heartbeat starts immediately after the attempt is recorded
    - On exit, the heartbeat is stopped and the lock is released

    The context manager handles the wait loop internally, blocking until the
    lock is acquired or the timeout is reached.

    Args:
        directory: The furu directory for this experiment
        backend: Backend type (e.g., "local", "submitit")
        lease_duration_sec: Duration of the lease in seconds
        heartbeat_interval_sec: Interval between heartbeats in seconds
        owner: Owner information (pid, host, user, etc.)
        scheduler: Optional scheduler metadata
        max_wait_time_sec: Maximum time to wait for the lock (None = wait forever)
        poll_interval_sec: Interval between lock acquisition attempts
        wait_log_every_sec: Interval between "waiting for lock" log messages
        reconcile_fn: Optional function called to reconcile stale attempts

    Yields:
        ComputeLockContext with attempt_id and a stop_heartbeat callable

    Raises:
        FuruLockNotAcquired: If the lock cannot be acquired (after waiting)
        FuruWaitTimeout: If max_wait_time_sec is exceeded
    """
    lock_path = StateManager.get_lock_path(directory, StateManager.COMPUTE_LOCK)

    lock_fd: int | None = None
    start_time = time.time()
    next_wait_log_at = 0.0

    # Import here to avoid circular import
    from ..runtime import get_logger

    logger = get_logger()

    # Wait loop to acquire lock
    while lock_fd is None:
        # Check timeout
        if max_wait_time_sec is not None:
            elapsed = time.time() - start_time
            if elapsed > max_wait_time_sec:
                raise FuruWaitTimeout(
                    f"Timed out waiting for compute lock after {elapsed:.1f}s"
                )

        lock_fd = StateManager.try_lock(lock_path)
        if lock_fd is not None:
            break

        # Lock held by someone else - reconcile and check state
        if reconcile_fn is not None:
            reconcile_fn(directory)

        state = StateManager.read_state(directory)
        attempt = state.attempt

        # If the result is terminal, there is no point waiting
        if isinstance(state.result, _StateResultSuccess):
            raise FuruLockNotAcquired(
                "Cannot acquire lock: experiment already succeeded"
            )
        if isinstance(state.result, _StateResultFailed):
            raise FuruLockNotAcquired("Cannot acquire lock: experiment already failed")

        # If no active attempt but the lock exists, it's orphaned - clean it up
        if attempt is None or isinstance(
            attempt,
            (
                _StateAttemptSuccess,
                _StateAttemptFailed,
                _StateAttemptTerminal,
            ),
        ):
            # Orphaned lock file - remove it and retry immediately
            lock_path.unlink(missing_ok=True)
            continue

        # Active attempt exists - wait for it
        now = time.time()
        if now >= next_wait_log_at:
            logger.info(
                "compute_lock: waiting for lock %s",
                directory,
            )
            next_wait_log_at = now + wait_log_every_sec
        time.sleep(poll_interval_sec)

    # Lock acquired - now atomically record attempt and start heartbeat
    stop_event = threading.Event()
    attempt_id: str | None = None

    try:
        # Record attempt IMMEDIATELY to minimize orphan window
        attempt_id = StateManager.start_attempt_running(
            directory,
            backend=backend,
            lease_duration_sec=lease_duration_sec,
            owner=owner,
            scheduler=scheduler,
        )

        # Start heartbeat IMMEDIATELY
        def heartbeat() -> None:
            while not stop_event.wait(heartbeat_interval_sec):
                StateManager.heartbeat(
                    directory,
                    attempt_id=attempt_id,  # type: ignore[arg-type]
                    lease_duration_sec=lease_duration_sec,
                )

        thread = threading.Thread(target=heartbeat, daemon=True)
        thread.start()

        yield ComputeLockContext(
            attempt_id=attempt_id,
            stop_heartbeat=stop_event.set,
        )
    finally:
        # Always stop heartbeat and release lock
        stop_event.set()
        StateManager.release_lock(lock_fd, lock_path)