furu 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- furu/__init__.py +8 -0
- furu/adapters/submitit.py +23 -2
- furu/config.py +40 -41
- furu/core/furu.py +479 -252
- furu/core/list.py +4 -3
- furu/dashboard/__init__.py +10 -1
- furu/dashboard/frontend/dist/assets/{index-DS3FsqcY.js → index-BjyrY-Zz.js} +1 -1
- furu/dashboard/frontend/dist/index.html +1 -1
- furu/dashboard/main.py +10 -3
- furu/errors.py +17 -4
- furu/execution/__init__.py +22 -0
- furu/execution/context.py +30 -0
- furu/execution/local.py +186 -0
- furu/execution/paths.py +20 -0
- furu/execution/plan.py +330 -0
- furu/execution/plan_utils.py +13 -0
- furu/execution/slurm_dag.py +273 -0
- furu/execution/slurm_pool.py +878 -0
- furu/execution/slurm_spec.py +38 -0
- furu/execution/submitit_factory.py +47 -0
- furu/migration.py +1 -2
- furu/runtime/env.py +1 -1
- furu/runtime/logging.py +40 -14
- furu/storage/metadata.py +25 -29
- furu/storage/migration.py +0 -1
- furu/storage/state.py +120 -98
- {furu-0.0.3.dist-info → furu-0.0.5.dist-info}/METADATA +91 -42
- furu-0.0.5.dist-info/RECORD +46 -0
- {furu-0.0.3.dist-info → furu-0.0.5.dist-info}/WHEEL +1 -1
- furu-0.0.3.dist-info/RECORD +0 -36
- {furu-0.0.3.dist-info → furu-0.0.5.dist-info}/entry_points.txt +0 -0
furu/storage/state.py
CHANGED
|
@@ -9,13 +9,12 @@ from collections.abc import Generator
|
|
|
9
9
|
from contextlib import contextmanager
|
|
10
10
|
from dataclasses import dataclass
|
|
11
11
|
from pathlib import Path
|
|
12
|
-
from typing import Annotated, Any, Callable, Literal, Mapping,
|
|
12
|
+
from typing import Annotated, Any, Callable, Literal, Mapping, TypeAlias, TypedDict
|
|
13
13
|
|
|
14
14
|
from pydantic import BaseModel, ConfigDict, Field, ValidationError, model_validator
|
|
15
15
|
|
|
16
16
|
from ..errors import FuruLockNotAcquired, FuruWaitTimeout
|
|
17
17
|
|
|
18
|
-
|
|
19
18
|
# Type alias for scheduler-specific metadata. Different schedulers (SLURM, LSF, PBS, local)
|
|
20
19
|
# return different fields, so this must remain dynamic.
|
|
21
20
|
SchedulerMetadata = dict[str, Any]
|
|
@@ -167,7 +166,6 @@ class _StateAttemptBase(BaseModel):
|
|
|
167
166
|
number: int = 1
|
|
168
167
|
backend: str
|
|
169
168
|
started_at: str
|
|
170
|
-
heartbeat_at: str
|
|
171
169
|
lease_duration_sec: float
|
|
172
170
|
lease_expires_at: str
|
|
173
171
|
owner: StateOwner
|
|
@@ -228,7 +226,6 @@ class StateAttempt(BaseModel):
|
|
|
228
226
|
backend: str
|
|
229
227
|
status: str
|
|
230
228
|
started_at: str
|
|
231
|
-
heartbeat_at: str
|
|
232
229
|
lease_duration_sec: float
|
|
233
230
|
lease_expires_at: str
|
|
234
231
|
owner: StateOwner
|
|
@@ -246,7 +243,6 @@ class StateAttempt(BaseModel):
|
|
|
246
243
|
backend=attempt.backend,
|
|
247
244
|
status=attempt.status,
|
|
248
245
|
started_at=attempt.started_at,
|
|
249
|
-
heartbeat_at=attempt.heartbeat_at,
|
|
250
246
|
lease_duration_sec=attempt.lease_duration_sec,
|
|
251
247
|
lease_expires_at=attempt.lease_expires_at,
|
|
252
248
|
owner=attempt.owner,
|
|
@@ -286,9 +282,9 @@ class StateManager:
|
|
|
286
282
|
EVENTS_FILE = "events.jsonl"
|
|
287
283
|
SUCCESS_MARKER = "SUCCESS.json"
|
|
288
284
|
|
|
289
|
-
COMPUTE_LOCK = "
|
|
290
|
-
SUBMIT_LOCK = "
|
|
291
|
-
STATE_LOCK = "
|
|
285
|
+
COMPUTE_LOCK = "compute.lock"
|
|
286
|
+
SUBMIT_LOCK = "submit.lock"
|
|
287
|
+
STATE_LOCK = "state.lock"
|
|
292
288
|
|
|
293
289
|
TERMINAL_STATUSES = {
|
|
294
290
|
"success",
|
|
@@ -302,6 +298,12 @@ class StateManager:
|
|
|
302
298
|
def get_internal_dir(cls, directory: Path) -> Path:
|
|
303
299
|
return directory / cls.INTERNAL_DIR
|
|
304
300
|
|
|
301
|
+
@classmethod
|
|
302
|
+
def ensure_internal_dir(cls, directory: Path) -> Path:
|
|
303
|
+
internal_dir = cls.get_internal_dir(directory)
|
|
304
|
+
internal_dir.mkdir(parents=True, exist_ok=True)
|
|
305
|
+
return internal_dir
|
|
306
|
+
|
|
305
307
|
@classmethod
|
|
306
308
|
def get_state_path(cls, directory: Path) -> Path:
|
|
307
309
|
return cls.get_internal_dir(directory) / cls.STATE_FILE
|
|
@@ -366,7 +368,6 @@ class StateManager:
|
|
|
366
368
|
@classmethod
|
|
367
369
|
def _write_state_unlocked(cls, directory: Path, state: _FuruState) -> None:
|
|
368
370
|
state_path = cls.get_state_path(directory)
|
|
369
|
-
state_path.parent.mkdir(parents=True, exist_ok=True)
|
|
370
371
|
tmp_path = state_path.with_suffix(".tmp")
|
|
371
372
|
tmp_path.write_text(json.dumps(state.model_dump(mode="json"), indent=2))
|
|
372
373
|
os.replace(tmp_path, state_path)
|
|
@@ -385,7 +386,6 @@ class StateManager:
|
|
|
385
386
|
@classmethod
|
|
386
387
|
def try_lock(cls, lock_path: Path) -> int | None:
|
|
387
388
|
try:
|
|
388
|
-
lock_path.parent.mkdir(parents=True, exist_ok=True)
|
|
389
389
|
fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_RDWR, 0o644)
|
|
390
390
|
payload = {
|
|
391
391
|
"pid": os.getpid(),
|
|
@@ -400,15 +400,41 @@ class StateManager:
|
|
|
400
400
|
|
|
401
401
|
@classmethod
|
|
402
402
|
def release_lock(cls, fd: int | None, lock_path: Path) -> None:
|
|
403
|
-
|
|
403
|
+
"""Release a lock acquired via :meth:`try_lock`.
|
|
404
|
+
|
|
405
|
+
We best-effort avoid deleting a lock that was broken and replaced by
|
|
406
|
+
another process by verifying the inode of the open fd matches the
|
|
407
|
+
current lock_path inode before unlinking.
|
|
408
|
+
"""
|
|
409
|
+
if fd is None:
|
|
410
|
+
return
|
|
411
|
+
try:
|
|
412
|
+
fd_stat = os.fstat(fd)
|
|
413
|
+
except OSError:
|
|
414
|
+
fd_stat = None
|
|
415
|
+
try:
|
|
416
|
+
path_stat = lock_path.stat()
|
|
417
|
+
except FileNotFoundError:
|
|
418
|
+
path_stat = None
|
|
419
|
+
try:
|
|
420
|
+
if (
|
|
421
|
+
fd_stat is not None
|
|
422
|
+
and path_stat is not None
|
|
423
|
+
and fd_stat.st_ino == path_stat.st_ino
|
|
424
|
+
and fd_stat.st_dev == path_stat.st_dev
|
|
425
|
+
):
|
|
426
|
+
lock_path.unlink(missing_ok=True)
|
|
427
|
+
finally:
|
|
404
428
|
os.close(fd)
|
|
405
|
-
lock_path.unlink(missing_ok=True)
|
|
406
429
|
|
|
407
430
|
@classmethod
|
|
408
431
|
def _read_lock_info(cls, lock_path: Path) -> _LockInfoDict | None:
|
|
409
432
|
if not lock_path.is_file():
|
|
410
433
|
return None
|
|
411
|
-
|
|
434
|
+
try:
|
|
435
|
+
text = lock_path.read_text().strip()
|
|
436
|
+
except FileNotFoundError:
|
|
437
|
+
return None
|
|
412
438
|
if not text:
|
|
413
439
|
return None
|
|
414
440
|
lines = text.splitlines()
|
|
@@ -459,19 +485,23 @@ class StateManager:
|
|
|
459
485
|
|
|
460
486
|
@classmethod
|
|
461
487
|
def update_state(
|
|
462
|
-
cls, directory: Path, mutator: Callable[[_FuruState], None]
|
|
488
|
+
cls, directory: Path, mutator: Callable[[_FuruState], bool | None]
|
|
463
489
|
) -> _FuruState:
|
|
464
490
|
lock_path = cls.get_lock_path(directory, cls.STATE_LOCK)
|
|
465
491
|
fd: int | None = None
|
|
466
492
|
try:
|
|
467
493
|
fd = cls._acquire_lock_blocking(lock_path)
|
|
494
|
+
state_path = cls.get_state_path(directory)
|
|
495
|
+
force_write = not state_path.is_file()
|
|
468
496
|
state = cls.read_state(directory)
|
|
469
|
-
mutator(state)
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
497
|
+
changed = mutator(state)
|
|
498
|
+
if force_write or changed is not False:
|
|
499
|
+
state.schema_version = cls.SCHEMA_VERSION
|
|
500
|
+
state.updated_at = cls._iso_now()
|
|
501
|
+
validated = _FuruState.model_validate(state)
|
|
502
|
+
cls._write_state_unlocked(directory, validated)
|
|
503
|
+
return validated
|
|
504
|
+
return state
|
|
475
505
|
finally:
|
|
476
506
|
cls.release_lock(fd, lock_path)
|
|
477
507
|
|
|
@@ -484,14 +514,12 @@ class StateManager:
|
|
|
484
514
|
"host": socket.gethostname(),
|
|
485
515
|
**event,
|
|
486
516
|
}
|
|
487
|
-
path.parent.mkdir(parents=True, exist_ok=True)
|
|
488
517
|
with path.open("a", encoding="utf-8") as f:
|
|
489
518
|
f.write(json.dumps(enriched) + "\n")
|
|
490
519
|
|
|
491
520
|
@classmethod
|
|
492
521
|
def write_success_marker(cls, directory: Path, *, attempt_id: str) -> None:
|
|
493
522
|
marker = cls.get_success_marker_path(directory)
|
|
494
|
-
marker.parent.mkdir(parents=True, exist_ok=True)
|
|
495
523
|
payload = {"attempt_id": attempt_id, "created_at": cls._iso_now()}
|
|
496
524
|
tmp = marker.with_suffix(".tmp")
|
|
497
525
|
tmp.write_text(json.dumps(payload, indent=2))
|
|
@@ -510,6 +538,26 @@ class StateManager:
|
|
|
510
538
|
return True
|
|
511
539
|
return cls._utcnow() >= expires
|
|
512
540
|
|
|
541
|
+
@classmethod
|
|
542
|
+
def last_heartbeat_mtime(cls, directory: Path) -> float | None:
|
|
543
|
+
lock_path = cls.get_lock_path(directory, cls.COMPUTE_LOCK)
|
|
544
|
+
try:
|
|
545
|
+
return lock_path.stat().st_mtime
|
|
546
|
+
except FileNotFoundError:
|
|
547
|
+
return None
|
|
548
|
+
|
|
549
|
+
@classmethod
|
|
550
|
+
def _running_heartbeat_reason(
|
|
551
|
+
cls, directory: Path, attempt: _StateAttemptRunning
|
|
552
|
+
) -> str | None:
|
|
553
|
+
last_heartbeat = cls.last_heartbeat_mtime(directory)
|
|
554
|
+
if last_heartbeat is None:
|
|
555
|
+
return "missing_heartbeat"
|
|
556
|
+
expires_at = last_heartbeat + float(attempt.lease_duration_sec)
|
|
557
|
+
if time.time() >= expires_at:
|
|
558
|
+
return "lease_expired"
|
|
559
|
+
return None
|
|
560
|
+
|
|
513
561
|
@classmethod
|
|
514
562
|
def start_attempt_queued(
|
|
515
563
|
cls,
|
|
@@ -578,7 +626,6 @@ class StateManager:
|
|
|
578
626
|
|
|
579
627
|
owner_state = StateOwner.model_validate(owner)
|
|
580
628
|
started_at = now.isoformat(timespec="seconds")
|
|
581
|
-
heartbeat_at = started_at
|
|
582
629
|
lease_duration = float(lease_duration_sec)
|
|
583
630
|
lease_expires_at = expires.isoformat(timespec="seconds")
|
|
584
631
|
scheduler_state: SchedulerMetadata = scheduler or {}
|
|
@@ -588,7 +635,6 @@ class StateManager:
|
|
|
588
635
|
number=int(number),
|
|
589
636
|
backend=backend,
|
|
590
637
|
started_at=started_at,
|
|
591
|
-
heartbeat_at=heartbeat_at,
|
|
592
638
|
lease_duration_sec=lease_duration,
|
|
593
639
|
lease_expires_at=lease_expires_at,
|
|
594
640
|
owner=owner_state,
|
|
@@ -635,49 +681,9 @@ class StateManager:
|
|
|
635
681
|
return attempt.id
|
|
636
682
|
|
|
637
683
|
@classmethod
|
|
638
|
-
def heartbeat(
|
|
639
|
-
cls
|
|
640
|
-
|
|
641
|
-
ok = False
|
|
642
|
-
|
|
643
|
-
def mutate(state: _FuruState) -> None:
|
|
644
|
-
nonlocal ok
|
|
645
|
-
attempt = state.attempt
|
|
646
|
-
if not isinstance(attempt, _StateAttemptRunning):
|
|
647
|
-
return
|
|
648
|
-
if attempt.id != attempt_id:
|
|
649
|
-
return
|
|
650
|
-
now = cls._utcnow()
|
|
651
|
-
expires = now + _dt.timedelta(seconds=float(lease_duration_sec))
|
|
652
|
-
attempt.heartbeat_at = now.isoformat(timespec="seconds")
|
|
653
|
-
attempt.lease_duration_sec = float(lease_duration_sec)
|
|
654
|
-
attempt.lease_expires_at = expires.isoformat(timespec="seconds")
|
|
655
|
-
ok = True
|
|
656
|
-
|
|
657
|
-
cls.update_state(directory, mutate)
|
|
658
|
-
return ok
|
|
659
|
-
|
|
660
|
-
@classmethod
|
|
661
|
-
def set_attempt_fields(
|
|
662
|
-
cls, directory: Path, *, attempt_id: str, fields: SchedulerMetadata
|
|
663
|
-
) -> bool:
|
|
664
|
-
ok = False
|
|
665
|
-
|
|
666
|
-
def mutate(state: _FuruState) -> None:
|
|
667
|
-
nonlocal ok
|
|
668
|
-
attempt = state.attempt
|
|
669
|
-
if attempt is None or attempt.id != attempt_id:
|
|
670
|
-
return
|
|
671
|
-
for key, value in fields.items():
|
|
672
|
-
if key == "scheduler" and isinstance(value, dict):
|
|
673
|
-
attempt.scheduler.update(value)
|
|
674
|
-
continue
|
|
675
|
-
if hasattr(attempt, key):
|
|
676
|
-
setattr(attempt, key, value)
|
|
677
|
-
ok = True
|
|
678
|
-
|
|
679
|
-
cls.update_state(directory, mutate)
|
|
680
|
-
return ok
|
|
684
|
+
def heartbeat(cls, directory: Path) -> None:
|
|
685
|
+
lock_path = cls.get_lock_path(directory, cls.COMPUTE_LOCK)
|
|
686
|
+
os.utime(lock_path)
|
|
681
687
|
|
|
682
688
|
@classmethod
|
|
683
689
|
def finish_attempt_success(cls, directory: Path, *, attempt_id: str) -> None:
|
|
@@ -691,7 +697,6 @@ class StateManager:
|
|
|
691
697
|
number=attempt.number,
|
|
692
698
|
backend=attempt.backend,
|
|
693
699
|
started_at=attempt.started_at,
|
|
694
|
-
heartbeat_at=attempt.heartbeat_at,
|
|
695
700
|
lease_duration_sec=attempt.lease_duration_sec,
|
|
696
701
|
lease_expires_at=attempt.lease_expires_at,
|
|
697
702
|
owner=attempt.owner,
|
|
@@ -728,7 +733,6 @@ class StateManager:
|
|
|
728
733
|
number=attempt.number,
|
|
729
734
|
backend=attempt.backend,
|
|
730
735
|
started_at=attempt.started_at,
|
|
731
|
-
heartbeat_at=attempt.heartbeat_at,
|
|
732
736
|
lease_duration_sec=attempt.lease_duration_sec,
|
|
733
737
|
lease_expires_at=attempt.lease_expires_at,
|
|
734
738
|
owner=attempt.owner,
|
|
@@ -766,7 +770,6 @@ class StateManager:
|
|
|
766
770
|
number=attempt.number,
|
|
767
771
|
backend=attempt.backend,
|
|
768
772
|
started_at=attempt.started_at,
|
|
769
|
-
heartbeat_at=attempt.heartbeat_at,
|
|
770
773
|
lease_duration_sec=attempt.lease_duration_sec,
|
|
771
774
|
lease_expires_at=attempt.lease_expires_at,
|
|
772
775
|
owner=attempt.owner,
|
|
@@ -816,10 +819,10 @@ class StateManager:
|
|
|
816
819
|
to lease expiry.
|
|
817
820
|
"""
|
|
818
821
|
|
|
819
|
-
def mutate(state: _FuruState) ->
|
|
822
|
+
def mutate(state: _FuruState) -> bool:
|
|
820
823
|
attempt = state.attempt
|
|
821
824
|
if not isinstance(attempt, (_StateAttemptQueued, _StateAttemptRunning)):
|
|
822
|
-
return
|
|
825
|
+
return False
|
|
823
826
|
|
|
824
827
|
# Fast promotion if we can see a durable success marker.
|
|
825
828
|
if cls.success_marker_exists(directory):
|
|
@@ -829,7 +832,6 @@ class StateManager:
|
|
|
829
832
|
number=attempt.number,
|
|
830
833
|
backend=attempt.backend,
|
|
831
834
|
started_at=attempt.started_at,
|
|
832
|
-
heartbeat_at=attempt.heartbeat_at,
|
|
833
835
|
lease_duration_sec=attempt.lease_duration_sec,
|
|
834
836
|
lease_expires_at=attempt.lease_expires_at,
|
|
835
837
|
owner=attempt.owner,
|
|
@@ -839,7 +841,7 @@ class StateManager:
|
|
|
839
841
|
state.result = _coerce_result(
|
|
840
842
|
state.result, status="success", created_at=ended
|
|
841
843
|
)
|
|
842
|
-
return
|
|
844
|
+
return True
|
|
843
845
|
|
|
844
846
|
backend = attempt.backend
|
|
845
847
|
now = cls._iso_now()
|
|
@@ -852,6 +854,10 @@ class StateManager:
|
|
|
852
854
|
if alive is False:
|
|
853
855
|
terminal_status = "crashed"
|
|
854
856
|
reason = "pid_dead"
|
|
857
|
+
elif isinstance(attempt, _StateAttemptRunning):
|
|
858
|
+
reason = cls._running_heartbeat_reason(directory, attempt)
|
|
859
|
+
if reason is not None:
|
|
860
|
+
terminal_status = "crashed"
|
|
855
861
|
elif cls._lease_expired(attempt):
|
|
856
862
|
terminal_status = "crashed"
|
|
857
863
|
reason = "lease_expired"
|
|
@@ -864,16 +870,25 @@ class StateManager:
|
|
|
864
870
|
attempt.scheduler.update(
|
|
865
871
|
{k: v for k, v in verdict.items() if k != "terminal_status"}
|
|
866
872
|
)
|
|
867
|
-
if terminal_status is None
|
|
868
|
-
|
|
869
|
-
|
|
873
|
+
if terminal_status is None:
|
|
874
|
+
if isinstance(attempt, _StateAttemptRunning):
|
|
875
|
+
reason = cls._running_heartbeat_reason(directory, attempt)
|
|
876
|
+
if reason is not None:
|
|
877
|
+
terminal_status = "crashed"
|
|
878
|
+
elif cls._lease_expired(attempt):
|
|
879
|
+
terminal_status = "crashed"
|
|
880
|
+
reason = "lease_expired"
|
|
870
881
|
else:
|
|
871
|
-
if
|
|
882
|
+
if isinstance(attempt, _StateAttemptRunning):
|
|
883
|
+
reason = cls._running_heartbeat_reason(directory, attempt)
|
|
884
|
+
if reason is not None:
|
|
885
|
+
terminal_status = "crashed"
|
|
886
|
+
elif cls._lease_expired(attempt):
|
|
872
887
|
terminal_status = "crashed"
|
|
873
888
|
reason = "lease_expired"
|
|
874
889
|
|
|
875
890
|
if terminal_status is None:
|
|
876
|
-
return
|
|
891
|
+
return False
|
|
877
892
|
if terminal_status == "success":
|
|
878
893
|
terminal_status = "crashed"
|
|
879
894
|
reason = reason or "scheduler_success_no_success_marker"
|
|
@@ -884,7 +899,6 @@ class StateManager:
|
|
|
884
899
|
number=attempt.number,
|
|
885
900
|
backend=attempt.backend,
|
|
886
901
|
started_at=attempt.started_at,
|
|
887
|
-
heartbeat_at=attempt.heartbeat_at,
|
|
888
902
|
lease_duration_sec=attempt.lease_duration_sec,
|
|
889
903
|
lease_expires_at=attempt.lease_expires_at,
|
|
890
904
|
owner=attempt.owner,
|
|
@@ -901,7 +915,6 @@ class StateManager:
|
|
|
901
915
|
number=attempt.number,
|
|
902
916
|
backend=attempt.backend,
|
|
903
917
|
started_at=attempt.started_at,
|
|
904
|
-
heartbeat_at=attempt.heartbeat_at,
|
|
905
918
|
lease_duration_sec=attempt.lease_duration_sec,
|
|
906
919
|
lease_expires_at=attempt.lease_expires_at,
|
|
907
920
|
owner=attempt.owner,
|
|
@@ -916,7 +929,6 @@ class StateManager:
|
|
|
916
929
|
number=attempt.number,
|
|
917
930
|
backend=attempt.backend,
|
|
918
931
|
started_at=attempt.started_at,
|
|
919
|
-
heartbeat_at=attempt.heartbeat_at,
|
|
920
932
|
lease_duration_sec=attempt.lease_duration_sec,
|
|
921
933
|
lease_expires_at=attempt.lease_expires_at,
|
|
922
934
|
owner=attempt.owner,
|
|
@@ -931,7 +943,6 @@ class StateManager:
|
|
|
931
943
|
number=attempt.number,
|
|
932
944
|
backend=attempt.backend,
|
|
933
945
|
started_at=attempt.started_at,
|
|
934
|
-
heartbeat_at=attempt.heartbeat_at,
|
|
935
946
|
lease_duration_sec=attempt.lease_duration_sec,
|
|
936
947
|
lease_expires_at=attempt.lease_expires_at,
|
|
937
948
|
owner=attempt.owner,
|
|
@@ -944,6 +955,7 @@ class StateManager:
|
|
|
944
955
|
state.result,
|
|
945
956
|
status="failed" if terminal_status == "failed" else "incomplete",
|
|
946
957
|
)
|
|
958
|
+
return True
|
|
947
959
|
|
|
948
960
|
state = cls.update_state(directory, mutate)
|
|
949
961
|
attempt = state.attempt
|
|
@@ -978,6 +990,7 @@ def compute_lock(
|
|
|
978
990
|
wait_log_every_sec: float = 10.0,
|
|
979
991
|
reconcile_fn: Callable[[Path], None] | None = None,
|
|
980
992
|
allow_failed: bool = False,
|
|
993
|
+
allow_success: bool = False,
|
|
981
994
|
) -> Generator[ComputeLockContext, None, None]:
|
|
982
995
|
"""
|
|
983
996
|
Context manager that atomically acquires lock + records attempt + starts heartbeat.
|
|
@@ -1002,6 +1015,7 @@ def compute_lock(
|
|
|
1002
1015
|
wait_log_every_sec: Interval between "waiting for lock" log messages
|
|
1003
1016
|
reconcile_fn: Optional function to call to reconcile stale attempts
|
|
1004
1017
|
allow_failed: Allow recomputation even if state is failed
|
|
1018
|
+
allow_success: Allow recomputation even if state is successful
|
|
1005
1019
|
|
|
1006
1020
|
Yields:
|
|
1007
1021
|
ComputeLockContext with attempt_id and stop_heartbeat callable
|
|
@@ -1039,16 +1053,28 @@ def compute_lock(
|
|
|
1039
1053
|
return ", ".join(parts)
|
|
1040
1054
|
|
|
1041
1055
|
def _describe_wait(attempt: _StateAttempt, waited_sec: float) -> str:
|
|
1042
|
-
label = "last heartbeat"
|
|
1043
|
-
timestamp = attempt.heartbeat_at
|
|
1044
1056
|
if attempt.status == "queued":
|
|
1045
1057
|
label = "queued at"
|
|
1046
1058
|
timestamp = attempt.started_at
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1059
|
+
parsed = StateManager._parse_time(timestamp)
|
|
1060
|
+
timestamp_info = timestamp
|
|
1061
|
+
if parsed is not None:
|
|
1062
|
+
age = (StateManager._utcnow() - parsed).total_seconds()
|
|
1063
|
+
timestamp_info = f"{timestamp} ({_format_wait_duration(age)} ago)"
|
|
1064
|
+
else:
|
|
1065
|
+
label = "last heartbeat"
|
|
1066
|
+
last_heartbeat = StateManager.last_heartbeat_mtime(directory)
|
|
1067
|
+
if last_heartbeat is None:
|
|
1068
|
+
timestamp_info = "missing"
|
|
1069
|
+
else:
|
|
1070
|
+
heartbeat_dt = _dt.datetime.fromtimestamp(
|
|
1071
|
+
last_heartbeat, tz=_dt.timezone.utc
|
|
1072
|
+
)
|
|
1073
|
+
age = time.time() - last_heartbeat
|
|
1074
|
+
timestamp_info = (
|
|
1075
|
+
f"{heartbeat_dt.isoformat(timespec='seconds')} "
|
|
1076
|
+
f"({_format_wait_duration(age)} ago)"
|
|
1077
|
+
)
|
|
1052
1078
|
return (
|
|
1053
1079
|
"waited "
|
|
1054
1080
|
f"{_format_wait_duration(waited_sec)}, {label} {timestamp_info}, "
|
|
@@ -1097,7 +1123,7 @@ def compute_lock(
|
|
|
1097
1123
|
lock_fd = StateManager.try_lock(lock_path)
|
|
1098
1124
|
if lock_fd is not None:
|
|
1099
1125
|
state = StateManager.read_state(directory)
|
|
1100
|
-
if isinstance(state.result, _StateResultSuccess):
|
|
1126
|
+
if isinstance(state.result, _StateResultSuccess) and not allow_success:
|
|
1101
1127
|
StateManager.release_lock(lock_fd, lock_path)
|
|
1102
1128
|
raise FuruLockNotAcquired(
|
|
1103
1129
|
"Cannot acquire lock: experiment already succeeded"
|
|
@@ -1117,7 +1143,7 @@ def compute_lock(
|
|
|
1117
1143
|
if reconcile_fn is not None:
|
|
1118
1144
|
reconcile_fn(directory)
|
|
1119
1145
|
state = StateManager.read_state(directory)
|
|
1120
|
-
if isinstance(state.result, _StateResultSuccess):
|
|
1146
|
+
if isinstance(state.result, _StateResultSuccess) and not allow_success:
|
|
1121
1147
|
raise FuruLockNotAcquired(
|
|
1122
1148
|
"Cannot acquire lock: experiment already succeeded"
|
|
1123
1149
|
)
|
|
@@ -1151,7 +1177,7 @@ def compute_lock(
|
|
|
1151
1177
|
attempt = state.attempt
|
|
1152
1178
|
|
|
1153
1179
|
# If result is terminal, no point waiting
|
|
1154
|
-
if isinstance(state.result, _StateResultSuccess):
|
|
1180
|
+
if isinstance(state.result, _StateResultSuccess) and not allow_success:
|
|
1155
1181
|
raise FuruLockNotAcquired(
|
|
1156
1182
|
"Cannot acquire lock: experiment already succeeded"
|
|
1157
1183
|
)
|
|
@@ -1200,11 +1226,7 @@ def compute_lock(
|
|
|
1200
1226
|
# Start heartbeat IMMEDIATELY
|
|
1201
1227
|
def heartbeat() -> None:
|
|
1202
1228
|
while not stop_event.wait(heartbeat_interval_sec):
|
|
1203
|
-
StateManager.heartbeat(
|
|
1204
|
-
directory,
|
|
1205
|
-
attempt_id=attempt_id, # type: ignore[arg-type]
|
|
1206
|
-
lease_duration_sec=lease_duration_sec,
|
|
1207
|
-
)
|
|
1229
|
+
StateManager.heartbeat(directory)
|
|
1208
1230
|
|
|
1209
1231
|
thread = threading.Thread(target=heartbeat, daemon=True)
|
|
1210
1232
|
thread.start()
|