furu 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
furu/storage/state.py CHANGED
@@ -9,13 +9,12 @@ from collections.abc import Generator
9
9
  from contextlib import contextmanager
10
10
  from dataclasses import dataclass
11
11
  from pathlib import Path
12
- from typing import Annotated, Any, Callable, Literal, Mapping, TypedDict, TypeAlias
12
+ from typing import Annotated, Any, Callable, Literal, Mapping, TypeAlias, TypedDict
13
13
 
14
14
  from pydantic import BaseModel, ConfigDict, Field, ValidationError, model_validator
15
15
 
16
16
  from ..errors import FuruLockNotAcquired, FuruWaitTimeout
17
17
 
18
-
19
18
  # Type alias for scheduler-specific metadata. Different schedulers (SLURM, LSF, PBS, local)
20
19
  # return different fields, so this must remain dynamic.
21
20
  SchedulerMetadata = dict[str, Any]
@@ -167,7 +166,6 @@ class _StateAttemptBase(BaseModel):
167
166
  number: int = 1
168
167
  backend: str
169
168
  started_at: str
170
- heartbeat_at: str
171
169
  lease_duration_sec: float
172
170
  lease_expires_at: str
173
171
  owner: StateOwner
@@ -228,7 +226,6 @@ class StateAttempt(BaseModel):
228
226
  backend: str
229
227
  status: str
230
228
  started_at: str
231
- heartbeat_at: str
232
229
  lease_duration_sec: float
233
230
  lease_expires_at: str
234
231
  owner: StateOwner
@@ -246,7 +243,6 @@ class StateAttempt(BaseModel):
246
243
  backend=attempt.backend,
247
244
  status=attempt.status,
248
245
  started_at=attempt.started_at,
249
- heartbeat_at=attempt.heartbeat_at,
250
246
  lease_duration_sec=attempt.lease_duration_sec,
251
247
  lease_expires_at=attempt.lease_expires_at,
252
248
  owner=attempt.owner,
@@ -286,9 +282,9 @@ class StateManager:
286
282
  EVENTS_FILE = "events.jsonl"
287
283
  SUCCESS_MARKER = "SUCCESS.json"
288
284
 
289
- COMPUTE_LOCK = ".compute.lock"
290
- SUBMIT_LOCK = ".submit.lock"
291
- STATE_LOCK = ".state.lock"
285
+ COMPUTE_LOCK = "compute.lock"
286
+ SUBMIT_LOCK = "submit.lock"
287
+ STATE_LOCK = "state.lock"
292
288
 
293
289
  TERMINAL_STATUSES = {
294
290
  "success",
@@ -302,6 +298,12 @@ class StateManager:
302
298
  def get_internal_dir(cls, directory: Path) -> Path:
303
299
  return directory / cls.INTERNAL_DIR
304
300
 
301
+ @classmethod
302
+ def ensure_internal_dir(cls, directory: Path) -> Path:
303
+ internal_dir = cls.get_internal_dir(directory)
304
+ internal_dir.mkdir(parents=True, exist_ok=True)
305
+ return internal_dir
306
+
305
307
  @classmethod
306
308
  def get_state_path(cls, directory: Path) -> Path:
307
309
  return cls.get_internal_dir(directory) / cls.STATE_FILE
@@ -366,7 +368,6 @@ class StateManager:
366
368
  @classmethod
367
369
  def _write_state_unlocked(cls, directory: Path, state: _FuruState) -> None:
368
370
  state_path = cls.get_state_path(directory)
369
- state_path.parent.mkdir(parents=True, exist_ok=True)
370
371
  tmp_path = state_path.with_suffix(".tmp")
371
372
  tmp_path.write_text(json.dumps(state.model_dump(mode="json"), indent=2))
372
373
  os.replace(tmp_path, state_path)
@@ -385,7 +386,6 @@ class StateManager:
385
386
  @classmethod
386
387
  def try_lock(cls, lock_path: Path) -> int | None:
387
388
  try:
388
- lock_path.parent.mkdir(parents=True, exist_ok=True)
389
389
  fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_RDWR, 0o644)
390
390
  payload = {
391
391
  "pid": os.getpid(),
@@ -400,15 +400,41 @@ class StateManager:
400
400
 
401
401
  @classmethod
402
402
  def release_lock(cls, fd: int | None, lock_path: Path) -> None:
403
- if fd is not None:
403
+ """Release a lock acquired via :meth:`try_lock`.
404
+
405
+ We best-effort avoid deleting a lock that was broken and replaced by
406
+ another process by verifying the inode of the open fd matches the
407
+ current lock_path inode before unlinking.
408
+ """
409
+ if fd is None:
410
+ return
411
+ try:
412
+ fd_stat = os.fstat(fd)
413
+ except OSError:
414
+ fd_stat = None
415
+ try:
416
+ path_stat = lock_path.stat()
417
+ except FileNotFoundError:
418
+ path_stat = None
419
+ try:
420
+ if (
421
+ fd_stat is not None
422
+ and path_stat is not None
423
+ and fd_stat.st_ino == path_stat.st_ino
424
+ and fd_stat.st_dev == path_stat.st_dev
425
+ ):
426
+ lock_path.unlink(missing_ok=True)
427
+ finally:
404
428
  os.close(fd)
405
- lock_path.unlink(missing_ok=True)
406
429
 
407
430
  @classmethod
408
431
  def _read_lock_info(cls, lock_path: Path) -> _LockInfoDict | None:
409
432
  if not lock_path.is_file():
410
433
  return None
411
- text = lock_path.read_text().strip()
434
+ try:
435
+ text = lock_path.read_text().strip()
436
+ except FileNotFoundError:
437
+ return None
412
438
  if not text:
413
439
  return None
414
440
  lines = text.splitlines()
@@ -459,19 +485,23 @@ class StateManager:
459
485
 
460
486
  @classmethod
461
487
  def update_state(
462
- cls, directory: Path, mutator: Callable[[_FuruState], None]
488
+ cls, directory: Path, mutator: Callable[[_FuruState], bool | None]
463
489
  ) -> _FuruState:
464
490
  lock_path = cls.get_lock_path(directory, cls.STATE_LOCK)
465
491
  fd: int | None = None
466
492
  try:
467
493
  fd = cls._acquire_lock_blocking(lock_path)
494
+ state_path = cls.get_state_path(directory)
495
+ force_write = not state_path.is_file()
468
496
  state = cls.read_state(directory)
469
- mutator(state)
470
- state.schema_version = cls.SCHEMA_VERSION
471
- state.updated_at = cls._iso_now()
472
- validated = _FuruState.model_validate(state)
473
- cls._write_state_unlocked(directory, validated)
474
- return validated
497
+ changed = mutator(state)
498
+ if force_write or changed is not False:
499
+ state.schema_version = cls.SCHEMA_VERSION
500
+ state.updated_at = cls._iso_now()
501
+ validated = _FuruState.model_validate(state)
502
+ cls._write_state_unlocked(directory, validated)
503
+ return validated
504
+ return state
475
505
  finally:
476
506
  cls.release_lock(fd, lock_path)
477
507
 
@@ -484,14 +514,12 @@ class StateManager:
484
514
  "host": socket.gethostname(),
485
515
  **event,
486
516
  }
487
- path.parent.mkdir(parents=True, exist_ok=True)
488
517
  with path.open("a", encoding="utf-8") as f:
489
518
  f.write(json.dumps(enriched) + "\n")
490
519
 
491
520
  @classmethod
492
521
  def write_success_marker(cls, directory: Path, *, attempt_id: str) -> None:
493
522
  marker = cls.get_success_marker_path(directory)
494
- marker.parent.mkdir(parents=True, exist_ok=True)
495
523
  payload = {"attempt_id": attempt_id, "created_at": cls._iso_now()}
496
524
  tmp = marker.with_suffix(".tmp")
497
525
  tmp.write_text(json.dumps(payload, indent=2))
@@ -510,6 +538,26 @@ class StateManager:
510
538
  return True
511
539
  return cls._utcnow() >= expires
512
540
 
541
+ @classmethod
542
+ def last_heartbeat_mtime(cls, directory: Path) -> float | None:
543
+ lock_path = cls.get_lock_path(directory, cls.COMPUTE_LOCK)
544
+ try:
545
+ return lock_path.stat().st_mtime
546
+ except FileNotFoundError:
547
+ return None
548
+
549
+ @classmethod
550
+ def _running_heartbeat_reason(
551
+ cls, directory: Path, attempt: _StateAttemptRunning
552
+ ) -> str | None:
553
+ last_heartbeat = cls.last_heartbeat_mtime(directory)
554
+ if last_heartbeat is None:
555
+ return "missing_heartbeat"
556
+ expires_at = last_heartbeat + float(attempt.lease_duration_sec)
557
+ if time.time() >= expires_at:
558
+ return "lease_expired"
559
+ return None
560
+
513
561
  @classmethod
514
562
  def start_attempt_queued(
515
563
  cls,
@@ -578,7 +626,6 @@ class StateManager:
578
626
 
579
627
  owner_state = StateOwner.model_validate(owner)
580
628
  started_at = now.isoformat(timespec="seconds")
581
- heartbeat_at = started_at
582
629
  lease_duration = float(lease_duration_sec)
583
630
  lease_expires_at = expires.isoformat(timespec="seconds")
584
631
  scheduler_state: SchedulerMetadata = scheduler or {}
@@ -588,7 +635,6 @@ class StateManager:
588
635
  number=int(number),
589
636
  backend=backend,
590
637
  started_at=started_at,
591
- heartbeat_at=heartbeat_at,
592
638
  lease_duration_sec=lease_duration,
593
639
  lease_expires_at=lease_expires_at,
594
640
  owner=owner_state,
@@ -635,49 +681,9 @@ class StateManager:
635
681
  return attempt.id
636
682
 
637
683
  @classmethod
638
- def heartbeat(
639
- cls, directory: Path, *, attempt_id: str, lease_duration_sec: float
640
- ) -> bool:
641
- ok = False
642
-
643
- def mutate(state: _FuruState) -> None:
644
- nonlocal ok
645
- attempt = state.attempt
646
- if not isinstance(attempt, _StateAttemptRunning):
647
- return
648
- if attempt.id != attempt_id:
649
- return
650
- now = cls._utcnow()
651
- expires = now + _dt.timedelta(seconds=float(lease_duration_sec))
652
- attempt.heartbeat_at = now.isoformat(timespec="seconds")
653
- attempt.lease_duration_sec = float(lease_duration_sec)
654
- attempt.lease_expires_at = expires.isoformat(timespec="seconds")
655
- ok = True
656
-
657
- cls.update_state(directory, mutate)
658
- return ok
659
-
660
- @classmethod
661
- def set_attempt_fields(
662
- cls, directory: Path, *, attempt_id: str, fields: SchedulerMetadata
663
- ) -> bool:
664
- ok = False
665
-
666
- def mutate(state: _FuruState) -> None:
667
- nonlocal ok
668
- attempt = state.attempt
669
- if attempt is None or attempt.id != attempt_id:
670
- return
671
- for key, value in fields.items():
672
- if key == "scheduler" and isinstance(value, dict):
673
- attempt.scheduler.update(value)
674
- continue
675
- if hasattr(attempt, key):
676
- setattr(attempt, key, value)
677
- ok = True
678
-
679
- cls.update_state(directory, mutate)
680
- return ok
684
+ def heartbeat(cls, directory: Path) -> None:
685
+ lock_path = cls.get_lock_path(directory, cls.COMPUTE_LOCK)
686
+ os.utime(lock_path)
681
687
 
682
688
  @classmethod
683
689
  def finish_attempt_success(cls, directory: Path, *, attempt_id: str) -> None:
@@ -691,7 +697,6 @@ class StateManager:
691
697
  number=attempt.number,
692
698
  backend=attempt.backend,
693
699
  started_at=attempt.started_at,
694
- heartbeat_at=attempt.heartbeat_at,
695
700
  lease_duration_sec=attempt.lease_duration_sec,
696
701
  lease_expires_at=attempt.lease_expires_at,
697
702
  owner=attempt.owner,
@@ -728,7 +733,6 @@ class StateManager:
728
733
  number=attempt.number,
729
734
  backend=attempt.backend,
730
735
  started_at=attempt.started_at,
731
- heartbeat_at=attempt.heartbeat_at,
732
736
  lease_duration_sec=attempt.lease_duration_sec,
733
737
  lease_expires_at=attempt.lease_expires_at,
734
738
  owner=attempt.owner,
@@ -766,7 +770,6 @@ class StateManager:
766
770
  number=attempt.number,
767
771
  backend=attempt.backend,
768
772
  started_at=attempt.started_at,
769
- heartbeat_at=attempt.heartbeat_at,
770
773
  lease_duration_sec=attempt.lease_duration_sec,
771
774
  lease_expires_at=attempt.lease_expires_at,
772
775
  owner=attempt.owner,
@@ -816,10 +819,10 @@ class StateManager:
816
819
  to lease expiry.
817
820
  """
818
821
 
819
- def mutate(state: _FuruState) -> None:
822
+ def mutate(state: _FuruState) -> bool:
820
823
  attempt = state.attempt
821
824
  if not isinstance(attempt, (_StateAttemptQueued, _StateAttemptRunning)):
822
- return
825
+ return False
823
826
 
824
827
  # Fast promotion if we can see a durable success marker.
825
828
  if cls.success_marker_exists(directory):
@@ -829,7 +832,6 @@ class StateManager:
829
832
  number=attempt.number,
830
833
  backend=attempt.backend,
831
834
  started_at=attempt.started_at,
832
- heartbeat_at=attempt.heartbeat_at,
833
835
  lease_duration_sec=attempt.lease_duration_sec,
834
836
  lease_expires_at=attempt.lease_expires_at,
835
837
  owner=attempt.owner,
@@ -839,7 +841,7 @@ class StateManager:
839
841
  state.result = _coerce_result(
840
842
  state.result, status="success", created_at=ended
841
843
  )
842
- return
844
+ return True
843
845
 
844
846
  backend = attempt.backend
845
847
  now = cls._iso_now()
@@ -852,6 +854,10 @@ class StateManager:
852
854
  if alive is False:
853
855
  terminal_status = "crashed"
854
856
  reason = "pid_dead"
857
+ elif isinstance(attempt, _StateAttemptRunning):
858
+ reason = cls._running_heartbeat_reason(directory, attempt)
859
+ if reason is not None:
860
+ terminal_status = "crashed"
855
861
  elif cls._lease_expired(attempt):
856
862
  terminal_status = "crashed"
857
863
  reason = "lease_expired"
@@ -864,16 +870,25 @@ class StateManager:
864
870
  attempt.scheduler.update(
865
871
  {k: v for k, v in verdict.items() if k != "terminal_status"}
866
872
  )
867
- if terminal_status is None and cls._lease_expired(attempt):
868
- terminal_status = "crashed"
869
- reason = "lease_expired"
873
+ if terminal_status is None:
874
+ if isinstance(attempt, _StateAttemptRunning):
875
+ reason = cls._running_heartbeat_reason(directory, attempt)
876
+ if reason is not None:
877
+ terminal_status = "crashed"
878
+ elif cls._lease_expired(attempt):
879
+ terminal_status = "crashed"
880
+ reason = "lease_expired"
870
881
  else:
871
- if cls._lease_expired(attempt):
882
+ if isinstance(attempt, _StateAttemptRunning):
883
+ reason = cls._running_heartbeat_reason(directory, attempt)
884
+ if reason is not None:
885
+ terminal_status = "crashed"
886
+ elif cls._lease_expired(attempt):
872
887
  terminal_status = "crashed"
873
888
  reason = "lease_expired"
874
889
 
875
890
  if terminal_status is None:
876
- return
891
+ return False
877
892
  if terminal_status == "success":
878
893
  terminal_status = "crashed"
879
894
  reason = reason or "scheduler_success_no_success_marker"
@@ -884,7 +899,6 @@ class StateManager:
884
899
  number=attempt.number,
885
900
  backend=attempt.backend,
886
901
  started_at=attempt.started_at,
887
- heartbeat_at=attempt.heartbeat_at,
888
902
  lease_duration_sec=attempt.lease_duration_sec,
889
903
  lease_expires_at=attempt.lease_expires_at,
890
904
  owner=attempt.owner,
@@ -901,7 +915,6 @@ class StateManager:
901
915
  number=attempt.number,
902
916
  backend=attempt.backend,
903
917
  started_at=attempt.started_at,
904
- heartbeat_at=attempt.heartbeat_at,
905
918
  lease_duration_sec=attempt.lease_duration_sec,
906
919
  lease_expires_at=attempt.lease_expires_at,
907
920
  owner=attempt.owner,
@@ -916,7 +929,6 @@ class StateManager:
916
929
  number=attempt.number,
917
930
  backend=attempt.backend,
918
931
  started_at=attempt.started_at,
919
- heartbeat_at=attempt.heartbeat_at,
920
932
  lease_duration_sec=attempt.lease_duration_sec,
921
933
  lease_expires_at=attempt.lease_expires_at,
922
934
  owner=attempt.owner,
@@ -931,7 +943,6 @@ class StateManager:
931
943
  number=attempt.number,
932
944
  backend=attempt.backend,
933
945
  started_at=attempt.started_at,
934
- heartbeat_at=attempt.heartbeat_at,
935
946
  lease_duration_sec=attempt.lease_duration_sec,
936
947
  lease_expires_at=attempt.lease_expires_at,
937
948
  owner=attempt.owner,
@@ -944,6 +955,7 @@ class StateManager:
944
955
  state.result,
945
956
  status="failed" if terminal_status == "failed" else "incomplete",
946
957
  )
958
+ return True
947
959
 
948
960
  state = cls.update_state(directory, mutate)
949
961
  attempt = state.attempt
@@ -978,6 +990,7 @@ def compute_lock(
978
990
  wait_log_every_sec: float = 10.0,
979
991
  reconcile_fn: Callable[[Path], None] | None = None,
980
992
  allow_failed: bool = False,
993
+ allow_success: bool = False,
981
994
  ) -> Generator[ComputeLockContext, None, None]:
982
995
  """
983
996
  Context manager that atomically acquires lock + records attempt + starts heartbeat.
@@ -1002,6 +1015,7 @@ def compute_lock(
1002
1015
  wait_log_every_sec: Interval between "waiting for lock" log messages
1003
1016
  reconcile_fn: Optional function to call to reconcile stale attempts
1004
1017
  allow_failed: Allow recomputation even if state is failed
1018
+ allow_success: Allow recomputation even if state is successful
1005
1019
 
1006
1020
  Yields:
1007
1021
  ComputeLockContext with attempt_id and stop_heartbeat callable
@@ -1039,16 +1053,28 @@ def compute_lock(
1039
1053
  return ", ".join(parts)
1040
1054
 
1041
1055
  def _describe_wait(attempt: _StateAttempt, waited_sec: float) -> str:
1042
- label = "last heartbeat"
1043
- timestamp = attempt.heartbeat_at
1044
1056
  if attempt.status == "queued":
1045
1057
  label = "queued at"
1046
1058
  timestamp = attempt.started_at
1047
- parsed = StateManager._parse_time(timestamp)
1048
- timestamp_info = timestamp
1049
- if parsed is not None:
1050
- age = (StateManager._utcnow() - parsed).total_seconds()
1051
- timestamp_info = f"{timestamp} ({_format_wait_duration(age)} ago)"
1059
+ parsed = StateManager._parse_time(timestamp)
1060
+ timestamp_info = timestamp
1061
+ if parsed is not None:
1062
+ age = (StateManager._utcnow() - parsed).total_seconds()
1063
+ timestamp_info = f"{timestamp} ({_format_wait_duration(age)} ago)"
1064
+ else:
1065
+ label = "last heartbeat"
1066
+ last_heartbeat = StateManager.last_heartbeat_mtime(directory)
1067
+ if last_heartbeat is None:
1068
+ timestamp_info = "missing"
1069
+ else:
1070
+ heartbeat_dt = _dt.datetime.fromtimestamp(
1071
+ last_heartbeat, tz=_dt.timezone.utc
1072
+ )
1073
+ age = time.time() - last_heartbeat
1074
+ timestamp_info = (
1075
+ f"{heartbeat_dt.isoformat(timespec='seconds')} "
1076
+ f"({_format_wait_duration(age)} ago)"
1077
+ )
1052
1078
  return (
1053
1079
  "waited "
1054
1080
  f"{_format_wait_duration(waited_sec)}, {label} {timestamp_info}, "
@@ -1097,7 +1123,7 @@ def compute_lock(
1097
1123
  lock_fd = StateManager.try_lock(lock_path)
1098
1124
  if lock_fd is not None:
1099
1125
  state = StateManager.read_state(directory)
1100
- if isinstance(state.result, _StateResultSuccess):
1126
+ if isinstance(state.result, _StateResultSuccess) and not allow_success:
1101
1127
  StateManager.release_lock(lock_fd, lock_path)
1102
1128
  raise FuruLockNotAcquired(
1103
1129
  "Cannot acquire lock: experiment already succeeded"
@@ -1117,7 +1143,7 @@ def compute_lock(
1117
1143
  if reconcile_fn is not None:
1118
1144
  reconcile_fn(directory)
1119
1145
  state = StateManager.read_state(directory)
1120
- if isinstance(state.result, _StateResultSuccess):
1146
+ if isinstance(state.result, _StateResultSuccess) and not allow_success:
1121
1147
  raise FuruLockNotAcquired(
1122
1148
  "Cannot acquire lock: experiment already succeeded"
1123
1149
  )
@@ -1151,7 +1177,7 @@ def compute_lock(
1151
1177
  attempt = state.attempt
1152
1178
 
1153
1179
  # If result is terminal, no point waiting
1154
- if isinstance(state.result, _StateResultSuccess):
1180
+ if isinstance(state.result, _StateResultSuccess) and not allow_success:
1155
1181
  raise FuruLockNotAcquired(
1156
1182
  "Cannot acquire lock: experiment already succeeded"
1157
1183
  )
@@ -1200,11 +1226,7 @@ def compute_lock(
1200
1226
  # Start heartbeat IMMEDIATELY
1201
1227
  def heartbeat() -> None:
1202
1228
  while not stop_event.wait(heartbeat_interval_sec):
1203
- StateManager.heartbeat(
1204
- directory,
1205
- attempt_id=attempt_id, # type: ignore[arg-type]
1206
- lease_duration_sec=lease_duration_sec,
1207
- )
1229
+ StateManager.heartbeat(directory)
1208
1230
 
1209
1231
  thread = threading.Thread(target=heartbeat, daemon=True)
1210
1232
  thread.start()