furu 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
furu/storage/state.py ADDED
@@ -0,0 +1,1107 @@
1
+ import datetime as _dt
2
+ import json
3
+ import os
4
+ import socket
5
+ import threading
6
+ import time
7
+ import uuid
8
+ from collections.abc import Generator
9
+ from contextlib import contextmanager
10
+ from dataclasses import dataclass
11
+ from pathlib import Path
12
+ from typing import Annotated, Any, Callable, Literal, Mapping, TypedDict, TypeAlias
13
+
14
+ from pydantic import BaseModel, ConfigDict, Field, ValidationError, model_validator
15
+
16
+ from ..errors import FuruLockNotAcquired, FuruWaitTimeout
17
+
18
+
19
# Type alias for scheduler-specific metadata. Different schedulers (SLURM, LSF, PBS, local)
# return different fields, so this must remain dynamic.
SchedulerMetadata = dict[str, Any]

# Type alias for probe results from submitit adapter.
# StateManager.reconcile reads the "terminal_status" and "reason" keys and merges
# every key except "terminal_status" into the attempt's scheduler metadata.
ProbeResult = dict[str, Any]

# Scalar value types allowed in an event payload.
EventValue: TypeAlias = str | int | float | bool
# Read-only mapping of event field names to scalar values, as accepted by
# StateManager.append_event.
EventMapping: TypeAlias = Mapping[str, EventValue]
28
+
29
+
30
class _LockInfoDict(TypedDict, total=False):
    """TypedDict for lock file information.

    Mirrors the JSON payload written by ``StateManager.try_lock``. All keys are
    optional (``total=False``) because the file content is read back unvalidated.
    """

    pid: int  # os.getpid() of the process that created the lock
    host: str  # socket.gethostname() of the creating machine
    created_at: str  # ISO-8601 UTC timestamp (second resolution)
    lock_id: str  # random uuid4 hex generated per acquisition
37
+
38
+
39
class _OwnerDict(TypedDict, total=False):
    """TypedDict for owner information passed to state manager functions.

    Has the same keys as ``StateOwner``; ``StateManager._start_attempt``
    validates a value of this shape into a ``StateOwner``.
    """

    pid: int | None
    host: str | None
    hostname: str | None  # alias of ``host``; StateOwner fills in whichever is missing
    user: str | None
    command: str | None
    timestamp: str | None
    python_version: str | None
    executable: str | None
    platform: str | None
51
+
52
+
53
class _ErrorDict(TypedDict, total=False):
    """TypedDict for error information passed to state manager functions.

    Validated into a ``FuruErrorState`` by ``finish_attempt_failed`` and
    ``finish_attempt_preempted``.
    """

    type: str
    message: str
    traceback: str | None
59
+
60
+
61
# Common base for the result variants: unknown fields are rejected, assignments
# are re-validated, and no type coercion is performed (strict mode).
class _StateResultBase(BaseModel):
    model_config = ConfigDict(extra="forbid", validate_assignment=True, strict=True)
63
+
64
+
65
# Result variant used as the default for a fresh _FuruState (no result recorded).
class _StateResultAbsent(_StateResultBase):
    status: Literal["absent"] = "absent"
67
+
68
+
69
# Result variant set when an attempt starts, and when an attempt ends as
# cancelled / preempted / crashed (see StateManager._start_attempt and reconcile).
class _StateResultIncomplete(_StateResultBase):
    status: Literal["incomplete"] = "incomplete"
71
+
72
+
73
# Result variant for a completed artifact; the only variant carrying extra data.
class _StateResultSuccess(_StateResultBase):
    status: Literal["success"] = "success"
    # Required timestamp string; StateManager fills it from _iso_now().
    created_at: str
76
+
77
+
78
# Result variant set when an attempt finishes with status "failed".
class _StateResultFailed(_StateResultBase):
    status: Literal["failed"] = "failed"
80
+
81
+
82
# Result variant with status "migrated".
# NOTE(review): nothing in this module produces it other than _coerce_result
# when asked to; presumably set by migration code elsewhere - confirm.
class _StateResultMigrated(_StateResultBase):
    status: Literal["migrated"] = "migrated"
84
+
85
+
86
# Tagged union of all result variants; pydantic selects the concrete class
# from the value of the "status" field.
_StateResult = Annotated[
    _StateResultAbsent
    | _StateResultIncomplete
    | _StateResultSuccess
    | _StateResultFailed
    | _StateResultMigrated,
    Field(discriminator="status"),
]
94
+
95
+
96
def _coerce_result(current: _StateResult, **updates: str) -> _StateResult:
    """Build a fresh result model from ``current`` with ``updates`` applied.

    The current result is dumped to plain JSON-compatible data, ``updates`` are
    layered on top, and the variant matching the resulting ``status`` is
    constructed. Only the ``success`` variant carries data (``created_at``).

    Raises:
        ValueError: if the resulting status is ``"success"`` without a
            non-empty string ``created_at``, or if the status is not one of
            the known result statuses.
    """
    merged = {**current.model_dump(mode="json"), **updates}
    status = merged.get("status")

    if status == "success":
        created_at = merged.get("created_at")
        if isinstance(created_at, str) and created_at:
            return _StateResultSuccess(status="success", created_at=created_at)
        raise ValueError("Success result requires created_at")
    if status == "absent":
        return _StateResultAbsent(status="absent")
    if status == "incomplete":
        return _StateResultIncomplete(status="incomplete")
    if status == "failed":
        return _StateResultFailed(status="failed")
    if status == "migrated":
        return _StateResultMigrated(status="migrated")

    raise ValueError(f"Invalid result status: {status!r}")
117
+
118
+
119
class StateOwner(BaseModel):
    """Owner information for a Furu attempt."""

    model_config = ConfigDict(extra="forbid", validate_assignment=True, strict=True)

    pid: int | None = None
    host: str | None = None
    hostname: str | None = None
    user: str | None = None
    command: str | None = None
    timestamp: str | None = None
    python_version: str | None = None
    executable: str | None = None
    platform: str | None = None

    @model_validator(mode="before")
    @classmethod
    def _normalize_host_keys(
        cls, data: dict[str, str | int | None] | Any
    ) -> dict[str, str | int | None] | Any:
        # "host" and "hostname" are treated as aliases: when exactly one of
        # them is set, copy it onto the other. Non-dict input is passed through
        # untouched, and the caller's dict is never mutated (a copy is made).
        if not isinstance(data, dict):
            return data

        host_value = data.get("host")
        hostname_value = data.get("hostname")

        if host_value is None and hostname_value is not None:
            return {**data, "host": hostname_value}
        if hostname_value is None and host_value is not None:
            return {**data, "hostname": host_value}
        return data
151
+
152
+
153
class FuruErrorState(BaseModel):
    """Error state information for a Furu attempt."""

    model_config = ConfigDict(extra="forbid", validate_assignment=True, strict=True)

    # Name of the error kind; StateManager.reconcile uses "FuruComputeError".
    type: str = "UnknownError"
    # Error message; empty string when none was supplied.
    message: str = ""
    # Optional traceback text.
    traceback: str | None = None
161
+
162
+
163
# Fields shared by every attempt variant. The timestamp fields are strings;
# StateManager writes them as second-resolution UTC ISO-8601 values.
class _StateAttemptBase(BaseModel):
    model_config = ConfigDict(extra="forbid", validate_assignment=True, strict=True)

    id: str  # uuid4 hex assigned in StateManager._start_attempt
    number: int = 1  # previous attempt's number + 1, starting at 1
    backend: str  # StateManager.reconcile special-cases "local" and "submitit"
    started_at: str
    heartbeat_at: str  # refreshed by StateManager.heartbeat
    lease_duration_sec: float
    lease_expires_at: str  # heartbeat/start time + lease_duration_sec
    owner: StateOwner
    scheduler: SchedulerMetadata = Field(default_factory=dict)
175
+
176
+
177
# Active attempt variant created by StateManager.start_attempt_queued.
class _StateAttemptQueued(_StateAttemptBase):
    status: Literal["queued"] = "queued"
179
+
180
+
181
# Active attempt variant created by StateManager.start_attempt_running; the
# only variant StateManager.heartbeat will refresh.
class _StateAttemptRunning(_StateAttemptBase):
    status: Literal["running"] = "running"
183
+
184
+
185
# Finished attempt that succeeded.
class _StateAttemptSuccess(_StateAttemptBase):
    status: Literal["success"] = "success"
    ended_at: str
    # Typed as None: a successful attempt never carries a reason.
    reason: None = None
189
+
190
+
191
# Finished attempt that failed; unlike the other terminal variants, the error
# is mandatory here.
class _StateAttemptFailed(_StateAttemptBase):
    status: Literal["failed"] = "failed"
    ended_at: str
    error: FuruErrorState
    reason: str | None = None
196
+
197
+
198
# Finished attempt that ended without success or failure. ``status`` has no
# default, so the caller must say which of the three it is.
class _StateAttemptTerminal(_StateAttemptBase):
    status: Literal["cancelled", "preempted", "crashed"]
    ended_at: str
    error: FuruErrorState | None = None
    # StateManager.reconcile uses e.g. "pid_dead" / "lease_expired" here.
    reason: str | None = None
203
+
204
+
205
# Tagged union of all attempt variants; pydantic selects the concrete class
# from the value of the "status" field.
_StateAttempt = Annotated[
    _StateAttemptQueued
    | _StateAttemptRunning
    | _StateAttemptSuccess
    | _StateAttemptFailed
    | _StateAttemptTerminal,
    Field(discriminator="status"),
]
213
+
214
+
215
class StateAttempt(BaseModel):
    """
    Public read-only representation of a Furu attempt.

    This model is used for external APIs (like the dashboard) to expose
    attempt information without coupling to internal state variants.
    All fields that may not be present on all attempt types are optional.
    """

    model_config = ConfigDict(extra="forbid", strict=True)

    id: str
    number: int
    backend: str
    status: str
    started_at: str
    heartbeat_at: str
    lease_duration_sec: float
    lease_expires_at: str
    owner: StateOwner
    scheduler: SchedulerMetadata = Field(default_factory=dict)
    ended_at: str | None = None
    error: FuruErrorState | None = None
    reason: str | None = None

    @classmethod
    def from_internal(cls, attempt: _StateAttempt) -> "StateAttempt":
        """Flatten any internal attempt variant into the public model.

        Fields present on every variant are copied as-is; ``ended_at``,
        ``error`` and ``reason`` exist only on some variants and fall back to
        ``None`` when the given attempt does not define them.
        """
        always_present = (
            "id",
            "number",
            "backend",
            "status",
            "started_at",
            "heartbeat_at",
            "lease_duration_sec",
            "lease_expires_at",
            "owner",
            "scheduler",
        )
        variant_specific = ("ended_at", "error", "reason")

        values = {name: getattr(attempt, name) for name in always_present}
        for name in variant_specific:
            values[name] = getattr(attempt, name, None)
        return cls(**values)
258
+
259
+
260
# Root document persisted to the state file by StateManager: the durable
# result, the most recent attempt (if any), and the last-update timestamp.
class _FuruState(BaseModel):
    model_config = ConfigDict(extra="forbid", validate_assignment=True, strict=True)

    # StateManager.read_state rejects files whose version differs from
    # StateManager.SCHEMA_VERSION.
    schema_version: int = 1
    result: _StateResult = Field(
        default_factory=lambda: _StateResultAbsent(status="absent")
    )
    # Only the latest attempt is kept; starting a new one replaces it.
    attempt: _StateAttempt | None = None
    # Set by StateManager.update_state on every write.
    updated_at: str | None = None
269
+
270
+
271
class StateManager:
    """
    Crash-safe state and liveness management for a single Furu artifact directory.

    Design principles:
    - Only `result.status == "success"` is treated as loadable by default.
    - `attempt.status == "running"` is a lease-based claim that must be reconcilable.
    - Writes are atomic (`os.replace`) and serialized via a state lock.

    Every method is a classmethod/staticmethod; the class keeps no per-instance
    state and all operations are keyed by the artifact ``directory``.
    """

    SCHEMA_VERSION = 1

    # Sub-directory (inside the artifact directory) holding all bookkeeping files.
    INTERNAL_DIR = ".furu"

    STATE_FILE = "state.json"
    EVENTS_FILE = "events.jsonl"
    SUCCESS_MARKER = "SUCCESS.json"

    COMPUTE_LOCK = ".compute.lock"
    SUBMIT_LOCK = ".submit.lock"
    STATE_LOCK = ".state.lock"

    # Statuses a scheduler probe may report as final (see ``reconcile``).
    TERMINAL_STATUSES = {
        "success",
        "failed",
        "cancelled",
        "preempted",
        "crashed",
    }

    # ------------------------------------------------------------------ paths

    @classmethod
    def get_internal_dir(cls, directory: Path) -> Path:
        """Return the bookkeeping directory inside ``directory``."""
        return directory / cls.INTERNAL_DIR

    @classmethod
    def get_state_path(cls, directory: Path) -> Path:
        """Return the path of the JSON state file."""
        return cls.get_internal_dir(directory) / cls.STATE_FILE

    @classmethod
    def get_events_path(cls, directory: Path) -> Path:
        """Return the path of the append-only JSON-lines event log."""
        return cls.get_internal_dir(directory) / cls.EVENTS_FILE

    @classmethod
    def get_success_marker_path(cls, directory: Path) -> Path:
        """Return the path of the success marker file."""
        return cls.get_internal_dir(directory) / cls.SUCCESS_MARKER

    @classmethod
    def get_lock_path(cls, directory: Path, lock_name: str) -> Path:
        """Return the path of the lock file called ``lock_name``."""
        return cls.get_internal_dir(directory) / lock_name

    # ------------------------------------------------------------------- time

    @classmethod
    def _utcnow(cls) -> _dt.datetime:
        """Return the current time as a timezone-aware UTC datetime."""
        return _dt.datetime.now(_dt.timezone.utc)

    @classmethod
    def _iso_now(cls) -> str:
        """Return the current UTC time as an ISO-8601 string (second resolution)."""
        return cls._utcnow().isoformat(timespec="seconds")

    @classmethod
    def _parse_time(cls, value: str | None) -> _dt.datetime | None:
        """Parse an ISO-8601 timestamp into an aware UTC datetime.

        Returns ``None`` for a missing, empty, non-string or malformed value.
        Naive timestamps are interpreted as UTC.
        """
        if not isinstance(value, str) or not value:
            return None
        try:
            dt = _dt.datetime.fromisoformat(value)
        except ValueError:
            # A malformed timestamp is treated like a missing one so that
            # callers such as ``_lease_expired`` degrade to "expired" instead
            # of crashing reconciliation.
            return None
        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=_dt.timezone.utc)
        return dt.astimezone(_dt.timezone.utc)

    # ------------------------------------------------------------- state file

    @classmethod
    def default_state(cls) -> _FuruState:
        """Return a fresh state (absent result, no attempt) at the current schema."""
        return _FuruState(schema_version=cls.SCHEMA_VERSION)

    @classmethod
    def read_state(cls, directory: Path) -> _FuruState:
        """Load and validate the state file, or return the default state.

        Raises:
            ValueError: if the file is not valid JSON, is not a JSON object,
                has an unsupported ``schema_version``, or fails model validation.
        """
        state_path = cls.get_state_path(directory)
        if not state_path.is_file():
            return cls.default_state()

        text = state_path.read_text()

        try:
            data = json.loads(text)
        except Exception as e:
            raise ValueError(f"Invalid JSON in state file: {state_path}") from e

        if not isinstance(data, dict):
            raise ValueError(f"Invalid state file (expected object): {state_path}")
        if data.get("schema_version") != cls.SCHEMA_VERSION:
            raise ValueError(
                f"Unsupported state schema_version (expected {cls.SCHEMA_VERSION}): {state_path}"
            )
        try:
            return _FuruState.model_validate(data)
        except ValidationError as e:
            raise ValueError(f"Invalid state schema: {state_path}") from e

    @classmethod
    def _write_state_unlocked(cls, directory: Path, state: _FuruState) -> None:
        """Write ``state`` via a temp file + ``os.replace``.

        Takes no lock itself; ``update_state`` calls it while holding the
        state lock.
        """
        state_path = cls.get_state_path(directory)
        state_path.parent.mkdir(parents=True, exist_ok=True)
        tmp_path = state_path.with_suffix(".tmp")
        tmp_path.write_text(json.dumps(state.model_dump(mode="json"), indent=2))
        os.replace(tmp_path, state_path)

    # ------------------------------------------------------------------ locks

    @classmethod
    def _pid_alive(cls, pid: int) -> bool:
        """Return whether a process with ``pid`` exists on this machine.

        NOTE: relies on POSIX ``kill(pid, 0)`` semantics. The Python docs state
        that on Windows any signal other than CTRL_C_EVENT / CTRL_BREAK_EVENT
        terminates the target process, so this probe is not safe there.
        """
        if pid <= 0:
            # kill(0, ...) and kill(-n, ...) address process groups, not a
            # single process; such values can never identify a lock owner.
            return False
        try:
            os.kill(pid, 0)
        except ProcessLookupError:
            return False
        except PermissionError:
            # Process exists but we can't signal it - still alive
            return True
        return True

    @classmethod
    def try_lock(cls, lock_path: Path) -> int | None:
        """Try to create ``lock_path`` exclusively, without blocking.

        Returns:
            The open file descriptor of the new lock file, or ``None`` when the
            lock file already exists.
        """
        try:
            lock_path.parent.mkdir(parents=True, exist_ok=True)
            fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_RDWR, 0o644)
        except FileExistsError:
            return None
        try:
            payload = {
                "pid": os.getpid(),
                "host": socket.gethostname(),
                "created_at": cls._iso_now(),
                "lock_id": uuid.uuid4().hex,
            }
            os.write(fd, (json.dumps(payload) + "\n").encode())
        except BaseException:
            # We created the file, so we own it: do not leak the descriptor or
            # leave behind a lock file without owner information.
            os.close(fd)
            lock_path.unlink(missing_ok=True)
            raise
        return fd

    @classmethod
    def release_lock(cls, fd: int | None, lock_path: Path) -> None:
        """Close ``fd`` (if given) and remove the lock file.

        The lock file is removed even when ``fd`` is ``None``; callers must
        only invoke this for a lock they own or one known to be orphaned.
        """
        if fd is not None:
            os.close(fd)
        lock_path.unlink(missing_ok=True)

    @classmethod
    def _read_lock_info(cls, lock_path: Path) -> _LockInfoDict | None:
        """Return the owner payload stored in a lock file, if readable.

        Returns ``None`` when the file is missing, empty, unreadable, not valid
        JSON, or not a JSON object. A lock file may legitimately be observed in
        any of these states while another process is creating or removing it.
        """
        if not lock_path.is_file():
            return None
        try:
            text = lock_path.read_text().strip()
        except (OSError, ValueError):
            # Deleted between the check and the read, or undecodable bytes.
            return None
        if not text:
            return None
        try:
            data = json.loads(text.splitlines()[0])
        except ValueError:
            # Partially written or corrupt payload.
            return None
        if isinstance(data, dict):
            return data  # type: ignore[return-value]
        return None

    @classmethod
    def _acquire_lock_blocking(
        cls,
        lock_path: Path,
        *,
        timeout_sec: float = 5.0,
        stale_after_sec: float = 60.0,
    ) -> int:
        """Acquire ``lock_path``, polling every 50 ms until ``timeout_sec`` elapses.

        An existing lock is broken (deleted) when its recorded owner is a dead
        process on this host, or when the file's mtime is older than
        ``stale_after_sec``.

        Returns:
            The file descriptor of the acquired lock file.

        Raises:
            TimeoutError: if the lock could not be acquired in time.
        """
        # Monotonic clock for the deadline so wall-clock adjustments cannot
        # shorten or extend the wait.
        deadline = time.monotonic() + timeout_sec
        local_host = socket.gethostname()
        while True:
            fd = cls.try_lock(lock_path)
            if fd is not None:
                return fd

            stale = False
            info = cls._read_lock_info(lock_path)
            if info and info.get("host") == local_host:
                pid = info.get("pid")
                if isinstance(pid, int) and not cls._pid_alive(pid):
                    stale = True
            if not stale:
                try:
                    # Wall clock on purpose: st_mtime is a wall-clock value.
                    age = time.time() - lock_path.stat().st_mtime
                except FileNotFoundError:
                    # Lock file was deleted by another process, retry
                    pass
                else:
                    stale = age > stale_after_sec

            if stale:
                lock_path.unlink(missing_ok=True)
                continue

            if time.monotonic() >= deadline:
                raise TimeoutError(f"Timeout acquiring lock: {lock_path}")
            time.sleep(0.05)

    @classmethod
    def update_state(
        cls, directory: Path, mutator: Callable[[_FuruState], None]
    ) -> _FuruState:
        """Read-modify-write the state file while holding the state lock.

        ``mutator`` receives the current state and edits it in place;
        ``schema_version`` and ``updated_at`` are then stamped and the state is
        written atomically.

        Returns:
            The state that was written.

        Raises:
            TimeoutError: if the state lock could not be acquired.
            ValueError: if the existing state file is invalid.
        """
        lock_path = cls.get_lock_path(directory, cls.STATE_LOCK)
        # Acquire outside the try block: if acquisition times out we do not own
        # the lock, and releasing it would delete another process's lock file.
        fd = cls._acquire_lock_blocking(lock_path)
        try:
            state = cls.read_state(directory)
            mutator(state)
            state.schema_version = cls.SCHEMA_VERSION
            state.updated_at = cls._iso_now()
            # NOTE(review): pydantic's default ``revalidate_instances`` setting
            # appears to pass an existing model instance through unchanged, so
            # in-place edits such as ``attempt.scheduler.update(...)`` may not
            # be re-validated here - confirm.
            validated = _FuruState.model_validate(state)
            cls._write_state_unlocked(directory, validated)
            return validated
        finally:
            cls.release_lock(fd, lock_path)

    # ----------------------------------------------------------------- events

    @classmethod
    def append_event(cls, directory: Path, event: EventMapping) -> None:
        """Append one JSON line to the event log.

        ``ts``, ``pid`` and ``host`` are added automatically; keys in ``event``
        override them on collision.
        """
        path = cls.get_events_path(directory)
        enriched = {
            "ts": cls._iso_now(),
            "pid": os.getpid(),
            "host": socket.gethostname(),
            **event,
        }
        path.parent.mkdir(parents=True, exist_ok=True)
        with path.open("a", encoding="utf-8") as f:
            f.write(json.dumps(enriched) + "\n")

    @classmethod
    def write_success_marker(cls, directory: Path, *, attempt_id: str) -> None:
        """Atomically write the success marker naming ``attempt_id``."""
        marker = cls.get_success_marker_path(directory)
        marker.parent.mkdir(parents=True, exist_ok=True)
        payload = {"attempt_id": attempt_id, "created_at": cls._iso_now()}
        tmp = marker.with_suffix(".tmp")
        tmp.write_text(json.dumps(payload, indent=2))
        os.replace(tmp, marker)

    @classmethod
    def success_marker_exists(cls, directory: Path) -> bool:
        """Return whether the success marker file exists."""
        return cls.get_success_marker_path(directory).is_file()

    # --------------------------------------------------------------- attempts

    @staticmethod
    def _attempt_base_fields(attempt: _StateAttemptBase) -> dict[str, Any]:
        """Return the fields common to every attempt variant as keyword arguments.

        Used to rebuild an attempt as a different variant (e.g. running ->
        success) without repeating the field list at every call site.
        """
        return {
            "id": attempt.id,
            "number": attempt.number,
            "backend": attempt.backend,
            "started_at": attempt.started_at,
            "heartbeat_at": attempt.heartbeat_at,
            "lease_duration_sec": attempt.lease_duration_sec,
            "lease_expires_at": attempt.lease_expires_at,
            "owner": attempt.owner,
            "scheduler": attempt.scheduler,
        }

    @classmethod
    def _lease_expired(
        cls, attempt: _StateAttemptQueued | _StateAttemptRunning
    ) -> bool:
        """Return whether the attempt's lease has run out.

        A missing or unparseable expiry counts as expired.
        """
        expires = cls._parse_time(attempt.lease_expires_at)
        if expires is None:
            return True
        return cls._utcnow() >= expires

    @classmethod
    def start_attempt_queued(
        cls,
        directory: Path,
        *,
        backend: str,
        lease_duration_sec: float,
        owner: _OwnerDict,
        scheduler: SchedulerMetadata | None = None,
    ) -> str:
        """Record a new attempt with status ``queued`` and return its id."""
        return cls._start_attempt(
            directory,
            backend=backend,
            lease_duration_sec=lease_duration_sec,
            owner=owner,
            scheduler=scheduler,
            attempt_cls=_StateAttemptQueued,
        )

    @classmethod
    def start_attempt_running(
        cls,
        directory: Path,
        *,
        backend: str,
        lease_duration_sec: float,
        owner: _OwnerDict,
        scheduler: SchedulerMetadata | None = None,
    ) -> str:
        """Record a new attempt with status ``running`` and return its id."""
        return cls._start_attempt(
            directory,
            backend=backend,
            lease_duration_sec=lease_duration_sec,
            owner=owner,
            scheduler=scheduler,
            attempt_cls=_StateAttemptRunning,
        )

    @classmethod
    def _start_attempt(
        cls,
        directory: Path,
        *,
        backend: str,
        lease_duration_sec: float,
        owner: _OwnerDict,
        scheduler: SchedulerMetadata | None,
        attempt_cls: type[_StateAttemptQueued] | type[_StateAttemptRunning],
    ) -> str:
        """Replace the current attempt with a fresh one and mark the result incomplete.

        The new attempt's number is the previous attempt's number plus one (1
        when there was none). An ``attempt_started`` event is appended, and for
        running attempts a warning is logged when this is a retry after a
        failure or a restart after a stale (crashed) attempt.

        Returns:
            The id of the newly recorded attempt.
        """
        attempt_id = uuid.uuid4().hex
        now = cls._utcnow()
        expires = now + _dt.timedelta(seconds=float(lease_duration_sec))
        # Captured from inside ``mutate`` so the pre-update state can be
        # inspected after the lock has been released.
        prev_result_failed = False
        prev_attempt_status: str | None = None
        prev_attempt_reason: str | None = None

        def mutate(state: _FuruState) -> None:
            nonlocal prev_result_failed, prev_attempt_status, prev_attempt_reason
            prev_result_failed = isinstance(state.result, _StateResultFailed)
            prev = state.attempt
            if prev is not None:
                prev_attempt_status = prev.status
                prev_attempt_reason = getattr(prev, "reason", None)

            number = (prev.number + 1) if prev is not None else 1

            owner_state = StateOwner.model_validate(owner)
            started_at = now.isoformat(timespec="seconds")
            heartbeat_at = started_at
            lease_duration = float(lease_duration_sec)
            lease_expires_at = expires.isoformat(timespec="seconds")
            scheduler_state: SchedulerMetadata = scheduler or {}

            attempt_kwargs = dict(
                id=attempt_id,
                number=int(number),
                backend=backend,
                started_at=started_at,
                heartbeat_at=heartbeat_at,
                lease_duration_sec=lease_duration,
                lease_expires_at=lease_expires_at,
                owner=owner_state,
                scheduler=scheduler_state,
            )
            state.attempt = attempt_cls(**attempt_kwargs)  # type: ignore[arg-type, misc]

            state.result = _coerce_result(state.result, status="incomplete")

        state = cls.update_state(directory, mutate)
        if attempt_cls is _StateAttemptRunning:
            from ..runtime.logging import get_logger

            logger = get_logger()
            if prev_result_failed:
                logger.warning(
                    "state: retrying after previous failure %s",
                    directory,
                )
            elif prev_attempt_status == "crashed" and prev_attempt_reason in {
                "pid_dead",
                "lease_expired",
            }:
                logger.warning(
                    "state: restarting after stale attempt (%s) %s",
                    prev_attempt_reason,
                    directory,
                )

        cls.append_event(
            directory,
            {
                "type": "attempt_started",
                "attempt_id": attempt_id,
                "backend": backend,
                "status": state.attempt.status
                if state.attempt is not None
                else "unknown",
            },
        )
        attempt = state.attempt
        if attempt is None:  # pragma: no cover
            raise RuntimeError("start_attempt did not create attempt")
        return attempt.id

    @classmethod
    def heartbeat(
        cls, directory: Path, *, attempt_id: str, lease_duration_sec: float
    ) -> bool:
        """Extend the lease of the running attempt ``attempt_id``.

        Returns:
            ``True`` if the lease was extended; ``False`` if the current attempt
            is not running or has a different id. The state file is rewritten
            in both cases.
        """
        ok = False

        def mutate(state: _FuruState) -> None:
            nonlocal ok
            attempt = state.attempt
            if not isinstance(attempt, _StateAttemptRunning):
                return
            if attempt.id != attempt_id:
                return
            now = cls._utcnow()
            expires = now + _dt.timedelta(seconds=float(lease_duration_sec))
            attempt.heartbeat_at = now.isoformat(timespec="seconds")
            attempt.lease_duration_sec = float(lease_duration_sec)
            attempt.lease_expires_at = expires.isoformat(timespec="seconds")
            ok = True

        cls.update_state(directory, mutate)
        return ok

    @classmethod
    def set_attempt_fields(
        cls, directory: Path, *, attempt_id: str, fields: SchedulerMetadata
    ) -> bool:
        """Update fields on the current attempt if its id is ``attempt_id``.

        A ``"scheduler"`` entry holding a dict is merged into the existing
        scheduler metadata; any other key is assigned only when the attempt
        already has an attribute of that name (unknown keys are ignored).

        Returns:
            ``True`` if the attempt matched, ``False`` otherwise.
        """
        ok = False

        def mutate(state: _FuruState) -> None:
            nonlocal ok
            attempt = state.attempt
            if attempt is None or attempt.id != attempt_id:
                return
            for key, value in fields.items():
                if key == "scheduler" and isinstance(value, dict):
                    attempt.scheduler.update(value)
                    continue
                if hasattr(attempt, key):
                    setattr(attempt, key, value)
            ok = True

        cls.update_state(directory, mutate)
        return ok

    @classmethod
    def finish_attempt_success(cls, directory: Path, *, attempt_id: str) -> None:
        """Mark attempt ``attempt_id`` and the result as successful.

        The state is left untouched when the current attempt has a different
        id; the ``attempt_finished`` event is appended either way.
        """
        now = cls._iso_now()

        def mutate(state: _FuruState) -> None:
            attempt = state.attempt
            if attempt is not None and attempt.id == attempt_id:
                state.attempt = _StateAttemptSuccess(
                    **cls._attempt_base_fields(attempt),
                    ended_at=now,
                )
                state.result = _coerce_result(
                    state.result, status="success", created_at=now
                )

        cls.update_state(directory, mutate)
        cls.append_event(
            directory,
            {"type": "attempt_finished", "attempt_id": attempt_id, "status": "success"},
        )

    @classmethod
    def finish_attempt_failed(
        cls,
        directory: Path,
        *,
        attempt_id: str,
        error: _ErrorDict,
    ) -> None:
        """Mark attempt ``attempt_id`` and the result as failed with ``error``.

        The state is left untouched when the current attempt has a different
        id; the ``attempt_finished`` event is appended either way.
        """
        now = cls._iso_now()

        error_state = FuruErrorState.model_validate(error)

        def mutate(state: _FuruState) -> None:
            attempt = state.attempt
            if attempt is not None and attempt.id == attempt_id:
                state.attempt = _StateAttemptFailed(
                    **cls._attempt_base_fields(attempt),
                    ended_at=now,
                    error=error_state,
                )

                state.result = _coerce_result(state.result, status="failed")

        cls.update_state(directory, mutate)
        cls.append_event(
            directory,
            {"type": "attempt_finished", "attempt_id": attempt_id, "status": "failed"},
        )

    @classmethod
    def finish_attempt_preempted(
        cls,
        directory: Path,
        *,
        attempt_id: str,
        error: _ErrorDict,
        reason: str | None = None,
    ) -> None:
        """Mark attempt ``attempt_id`` as preempted; the result becomes incomplete.

        The state is left untouched when the current attempt has a different
        id; the ``attempt_finished`` event is appended either way.
        """
        now = cls._iso_now()
        error_state = FuruErrorState.model_validate(error)

        def mutate(state: _FuruState) -> None:
            attempt = state.attempt
            if attempt is not None and attempt.id == attempt_id:
                state.attempt = _StateAttemptTerminal(
                    **cls._attempt_base_fields(attempt),
                    status="preempted",
                    ended_at=now,
                    error=error_state,
                    reason=reason,
                )
                state.result = _coerce_result(state.result, status="incomplete")

        cls.update_state(directory, mutate)
        cls.append_event(
            directory,
            {
                "type": "attempt_finished",
                "attempt_id": attempt_id,
                "status": "preempted",
            },
        )

    # -------------------------------------------------------------- reconcile

    @classmethod
    def _local_attempt_alive(
        cls, attempt: _StateAttemptQueued | _StateAttemptRunning
    ) -> bool | None:
        """Probe the owning process of ``attempt``.

        Returns:
            ``True``/``False`` when the owner is a process on this host, or
            ``None`` when liveness cannot be determined (different host, or no
            integer pid recorded).
        """
        host = attempt.owner.host
        pid = attempt.owner.pid
        if host != socket.gethostname():
            return None
        if not isinstance(pid, int):
            return None
        return cls._pid_alive(pid)

    @classmethod
    def reconcile(
        cls,
        directory: Path,
        *,
        submitit_probe: Callable[[_FuruState], ProbeResult] | None = None,
    ) -> _FuruState:
        """
        Reconcile a possibly-stale running/queued attempt.

        - If a success marker exists, promote to success.
        - For local attempts, if PID is provably dead or lease expired, mark as crashed and
          remove compute lock so waiters can proceed.
        - For submitit attempts, rely on `submitit_probe` when provided; otherwise fall back
          to lease expiry.

        Attempts that are already finished (or absent) are left unchanged.

        Returns:
            The state as written after reconciliation.
        """

        def mutate(state: _FuruState) -> None:
            attempt = state.attempt
            if not isinstance(attempt, (_StateAttemptQueued, _StateAttemptRunning)):
                return

            # Fast promotion if we can see a durable success marker.
            if cls.success_marker_exists(directory):
                ended = cls._iso_now()
                state.attempt = _StateAttemptSuccess(
                    **cls._attempt_base_fields(attempt),
                    ended_at=ended,
                )
                state.result = _coerce_result(
                    state.result, status="success", created_at=ended
                )
                return

            backend = attempt.backend
            now = cls._iso_now()

            terminal_status: str | None = None
            reason: str | None = None

            if backend == "local":
                alive = cls._local_attempt_alive(attempt)
                if alive is False:
                    terminal_status = "crashed"
                    reason = "pid_dead"
                elif cls._lease_expired(attempt):
                    terminal_status = "crashed"
                    reason = "lease_expired"
            elif backend == "submitit":
                if submitit_probe is not None:
                    verdict = submitit_probe(state)
                    if verdict.get("terminal_status") in cls.TERMINAL_STATUSES:
                        terminal_status = str(verdict["terminal_status"])
                        reason = str(verdict.get("reason") or "scheduler_terminal")
                    # Probe details are recorded even for a non-terminal verdict.
                    attempt.scheduler.update(
                        {k: v for k, v in verdict.items() if k != "terminal_status"}
                    )
                if terminal_status is None and cls._lease_expired(attempt):
                    terminal_status = "crashed"
                    reason = "lease_expired"
            else:
                if cls._lease_expired(attempt):
                    terminal_status = "crashed"
                    reason = "lease_expired"

            if terminal_status is None:
                return
            if terminal_status == "success":
                # The scheduler reports success but no success marker was
                # found above, so the artifact cannot be trusted.
                terminal_status = "crashed"
                reason = reason or "scheduler_success_no_success_marker"

            base_fields = cls._attempt_base_fields(attempt)
            if terminal_status == "failed":
                state.attempt = _StateAttemptFailed(
                    **base_fields,
                    ended_at=now,
                    error=FuruErrorState(type="FuruComputeError", message=reason or ""),
                    reason=reason,
                )
            else:
                # Anything that is not explicitly cancelled/preempted is
                # recorded as crashed.
                final_status: Literal["cancelled", "preempted", "crashed"]
                if terminal_status == "cancelled":
                    final_status = "cancelled"
                elif terminal_status == "preempted":
                    final_status = "preempted"
                else:
                    final_status = "crashed"
                state.attempt = _StateAttemptTerminal(
                    **base_fields,
                    status=final_status,
                    ended_at=now,
                    reason=reason,
                )

            state.result = _coerce_result(
                state.result,
                status="failed" if terminal_status == "failed" else "incomplete",
            )

        state = cls.update_state(directory, mutate)
        attempt = state.attempt
        if attempt is not None and attempt.status in {
            "crashed",
            "cancelled",
            "preempted",
        }:
            # Free the compute lock so that waiters can start a new attempt.
            cls.get_lock_path(directory, cls.COMPUTE_LOCK).unlink(missing_ok=True)
        return state
957
+
958
+
959
@dataclass
class ComputeLockContext:
    """Context returned when a compute lock is successfully acquired."""

    # Id of the attempt recorded by ``compute_lock`` for this acquisition.
    attempt_id: str
    # Callable that signals the background heartbeat thread to stop
    # (``compute_lock`` passes the stop event's ``set`` method).
    stop_heartbeat: Callable[[], None]
965
+
966
+
967
@contextmanager
def compute_lock(
    directory: Path,
    *,
    backend: str,
    lease_duration_sec: float,
    heartbeat_interval_sec: float,
    owner: _OwnerDict,
    scheduler: SchedulerMetadata | None = None,
    max_wait_time_sec: float | None = None,
    poll_interval_sec: float = 10.0,
    wait_log_every_sec: float = 10.0,
    reconcile_fn: Callable[[Path], None] | None = None,
) -> Generator[ComputeLockContext, None, None]:
    """
    Context manager that atomically acquires lock + records attempt + starts heartbeat.

    This ensures there can never be a mismatch between the lock file and state:
    - Lock acquisition and attempt recording happen together
    - Heartbeat starts immediately after attempt is recorded
    - On exit, heartbeat is stopped and lock is released

    The context manager handles the wait loop internally, blocking until the lock
    is acquired or timeout is reached.

    Args:
        directory: The furu directory for this experiment
        backend: Backend type (e.g., "local", "submitit")
        lease_duration_sec: Duration of the lease in seconds
        heartbeat_interval_sec: Interval between heartbeats in seconds
        owner: Owner information (pid, host, user, etc.)
        scheduler: Optional scheduler metadata
        max_wait_time_sec: Maximum time to wait for lock (None = wait forever)
        poll_interval_sec: Interval between lock acquisition attempts
        wait_log_every_sec: Interval between "waiting for lock" log messages
        reconcile_fn: Optional function to call to reconcile stale attempts

    Yields:
        ComputeLockContext with attempt_id and stop_heartbeat callable

    Raises:
        FuruLockNotAcquired: If lock cannot be acquired (after waiting)
        FuruWaitTimeout: If max_wait_time_sec is exceeded
    """
    lock_path = StateManager.get_lock_path(directory, StateManager.COMPUTE_LOCK)

    lock_fd: int | None = None
    # Monotonic clock: elapsed-time measurement must not be affected by
    # wall-clock adjustments.
    start_time = time.monotonic()
    # Starting at ``start_time`` makes the first wait iteration log immediately.
    next_wait_log_at = start_time

    # Import here to avoid circular import
    from ..runtime import get_logger

    logger = get_logger()

    # Wait loop to acquire lock
    while lock_fd is None:
        # Check timeout
        if max_wait_time_sec is not None:
            elapsed = time.monotonic() - start_time
            if elapsed > max_wait_time_sec:
                raise FuruWaitTimeout(
                    f"Timed out waiting for compute lock after {elapsed:.1f}s"
                )

        lock_fd = StateManager.try_lock(lock_path)
        if lock_fd is not None:
            break

        # Lock held by someone else - reconcile and check state
        if reconcile_fn is not None:
            reconcile_fn(directory)

        state = StateManager.read_state(directory)
        attempt = state.attempt

        # If result is terminal, no point waiting
        if isinstance(state.result, _StateResultSuccess):
            raise FuruLockNotAcquired(
                "Cannot acquire lock: experiment already succeeded"
            )
        if isinstance(state.result, _StateResultFailed):
            raise FuruLockNotAcquired("Cannot acquire lock: experiment already failed")

        # If no active attempt but lock exists, it's orphaned - clean it up
        if attempt is None or isinstance(
            attempt,
            (
                _StateAttemptSuccess,
                _StateAttemptFailed,
                _StateAttemptTerminal,
            ),
        ):
            # Orphaned lock file - remove it and retry immediately
            lock_path.unlink(missing_ok=True)
            continue

        # Active attempt exists - wait for it
        now = time.monotonic()
        if now >= next_wait_log_at:
            logger.info(
                "compute_lock: waiting for lock %s",
                directory,
            )
            next_wait_log_at = now + wait_log_every_sec
        time.sleep(poll_interval_sec)

    # Lock acquired - now atomically record attempt and start heartbeat
    stop_event = threading.Event()

    try:
        # Record attempt IMMEDIATELY to minimize orphan window
        attempt_id = StateManager.start_attempt_running(
            directory,
            backend=backend,
            lease_duration_sec=lease_duration_sec,
            owner=owner,
            scheduler=scheduler,
        )

        # Start heartbeat IMMEDIATELY
        def heartbeat() -> None:
            # ``Event.wait`` returns False on timeout, i.e. once per interval
            # until ``stop_event`` is set.
            while not stop_event.wait(heartbeat_interval_sec):
                try:
                    StateManager.heartbeat(
                        directory,
                        attempt_id=attempt_id,
                        lease_duration_sec=lease_duration_sec,
                    )
                except Exception as exc:
                    # A single failed beat (e.g. a timeout on the state lock)
                    # must not end the thread: without further heartbeats the
                    # lease of a healthy job would expire and the job would be
                    # reconciled as crashed. Log and retry on the next tick.
                    logger.warning(
                        "compute_lock: heartbeat failed %s: %r",
                        directory,
                        exc,
                    )

        thread = threading.Thread(target=heartbeat, daemon=True)
        thread.start()

        yield ComputeLockContext(
            attempt_id=attempt_id,
            stop_heartbeat=stop_event.set,
        )
    finally:
        # Always stop heartbeat and release lock
        stop_event.set()
        StateManager.release_lock(lock_fd, lock_path)