furu 0.0.2__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,38 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Mapping, Protocol
5
+
6
+
7
+ SlurmSpecValue = str | int | float | bool
8
+ SlurmSpecExtraValue = SlurmSpecValue | Mapping[str, "SlurmSpecExtraValue"]
9
+
10
+
11
+ @dataclass(frozen=True)
12
+ class SlurmSpec:
13
+ partition: str | None = None
14
+ gpus: int = 0
15
+ cpus: int = 4
16
+ mem_gb: int = 16
17
+ time_min: int = 60
18
+ extra: Mapping[str, SlurmSpecExtraValue] | None = None
19
+
20
+
21
+ class _SpecNode(Protocol):
22
+ _furu_hash: str
23
+
24
+ def _executor_spec_key(self) -> str: ...
25
+
26
+
27
def resolve_slurm_spec(specs: Mapping[str, SlurmSpec], node: _SpecNode) -> SlurmSpec:
    """Look up the Slurm spec for *node* in *specs*.

    The mapping must always contain a ``"default"`` entry; this is checked
    even when the node's own key is present. NOTE(review): ``"default"`` is
    not used as a fallback here — a missing node key raises instead; confirm
    that is the intended contract.

    Raises:
        KeyError: If ``"default"`` is absent, or the node's own key is absent.
    """
    if "default" not in specs:
        raise KeyError("Missing slurm spec for key 'default'.")

    key = node._executor_spec_key()
    try:
        return specs[key]
    except KeyError:
        raise KeyError(
            "Missing slurm spec for key "
            f"'{key}' for node {node.__class__.__name__} ({node._furu_hash})."
        ) from None
@@ -0,0 +1,47 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import TYPE_CHECKING
5
+
6
+ from .paths import submitit_logs_dir
7
+ from .slurm_spec import SlurmSpec, SlurmSpecExtraValue
8
+
9
+ if TYPE_CHECKING:
10
+ import submitit
11
+
12
+
13
+
14
def make_executor_for_spec(
    spec_key: str,
    spec: SlurmSpec,
    *,
    kind: str,
    submitit_root: Path | None,
    run_id: str | None = None,
) -> submitit.AutoExecutor:
    """Build a submitit ``AutoExecutor`` configured from *spec*.

    Logs go under the submitit logs directory derived from (*kind*,
    *spec_key*), which is created if it does not exist yet.

    Args:
        spec_key: Key that selected *spec*; used for the log folder layout.
        spec: Slurm resource description applied to the executor.
        kind: Job category used to pick the log directory.
        submitit_root: Optional override of the submitit logs root.
        run_id: Optional run identifier forwarded to the path helper.

    Returns:
        A configured ``submitit.AutoExecutor``.
    """
    # Imported lazily so submitit remains an optional dependency.
    import submitit

    log_folder = submitit_logs_dir(
        kind,
        spec_key,
        override=submitit_root,
        run_id=run_id,
    )
    log_folder.mkdir(parents=True, exist_ok=True)

    executor = submitit.AutoExecutor(folder=str(log_folder))

    parameters: dict[str, SlurmSpecExtraValue | None] = {
        "timeout_min": spec.time_min,
        "slurm_partition": spec.partition,
        "cpus_per_task": spec.cpus,
        "mem_gb": spec.mem_gb,
    }
    if spec.gpus:
        parameters["gpus_per_node"] = spec.gpus
    if spec.extra:
        parameters.update(spec.extra)

    # Drop unset (None) entries before handing them to submitit.
    executor.update_parameters(
        **{name: value for name, value in parameters.items() if value is not None}
    )
    return executor
furu/migration.py CHANGED
@@ -507,8 +507,10 @@ def _apply_single_migration(
507
507
  event: dict[str, str | int] = {
508
508
  "type": "migrated",
509
509
  "policy": policy,
510
- "from": f"{candidate.from_ref.namespace}:{candidate.from_ref.furu_hash}",
511
- "to": f"{candidate.to_ref.namespace}:{candidate.to_ref.furu_hash}",
510
+ "from_namespace": candidate.from_ref.namespace,
511
+ "from_hash": candidate.from_ref.furu_hash,
512
+ "to_namespace": candidate.to_ref.namespace,
513
+ "to_hash": candidate.to_ref.furu_hash,
512
514
  }
513
515
  if default_values is not None:
514
516
  event["default_values"] = json.dumps(default_values, sort_keys=True)
@@ -519,8 +521,10 @@ def _apply_single_migration(
519
521
  overwrite_event = {
520
522
  "type": "migration_overwrite",
521
523
  "policy": policy,
522
- "from": f"{candidate.from_ref.namespace}:{candidate.from_ref.furu_hash}",
523
- "to": f"{candidate.to_ref.namespace}:{candidate.to_ref.furu_hash}",
524
+ "from_namespace": candidate.from_ref.namespace,
525
+ "from_hash": candidate.from_ref.furu_hash,
526
+ "to_namespace": candidate.to_ref.namespace,
527
+ "to_hash": candidate.to_ref.furu_hash,
524
528
  "reason": "force_overwrite",
525
529
  }
526
530
  StateManager.append_event(to_dir, overwrite_event)
furu/runtime/logging.py CHANGED
@@ -28,16 +28,16 @@ _FURU_HOLDER_STACK: contextvars.ContextVar[tuple[HolderType, ...]] = (
28
28
  _FURU_LOG_LOCK = threading.Lock()
29
29
  _FURU_CONSOLE_LOCK = threading.Lock()
30
30
 
31
- _LOAD_OR_CREATE_PREFIX = "load_or_create"
31
+ _GET_PREFIX = "get"
32
32
 
33
33
 
34
- def _strip_load_or_create_decision_suffix(message: str) -> str:
34
+ def _strip_get_decision_suffix(message: str) -> str:
35
35
  """
36
- Strip a trailing `(<decision>)` suffix from `load_or_create ...` console lines.
36
+ Strip a trailing `(<decision>)` suffix from `get ...` console lines.
37
37
 
38
38
  This keeps detailed decision info in file logs, but makes console output cleaner.
39
39
  """
40
- if not message.startswith(_LOAD_OR_CREATE_PREFIX):
40
+ if not message.startswith(_GET_PREFIX):
41
41
  return message
42
42
  if not message.endswith(")"):
43
43
  return message
@@ -69,7 +69,7 @@ def enter_holder(holder: HolderType) -> Generator[None, None, None]:
69
69
  """
70
70
  Push a holder object onto the logging stack for this context.
71
71
 
72
- Furu calls this automatically during `load_or_create()`, so nested
72
+ Furu calls this automatically during `get()`, so nested
73
73
  dependencies will log to the active dependency's folder and then revert.
74
74
  """
75
75
  configure_logging()
@@ -163,7 +163,7 @@ class _FuruRichConsoleHandler(logging.Handler):
163
163
 
164
164
  @staticmethod
165
165
  def _format_location(record: logging.LogRecord) -> str:
166
- # Use caller location if available (for load_or_create messages)
166
+ # Use caller location if available (for get messages)
167
167
  caller_file = getattr(record, "furu_caller_file", None)
168
168
  caller_line = getattr(record, "furu_caller_line", None)
169
169
  if caller_file is not None and caller_line is not None:
@@ -174,10 +174,10 @@ class _FuruRichConsoleHandler(logging.Handler):
174
174
 
175
175
  @staticmethod
176
176
  def _format_message_text(record: logging.LogRecord) -> Text:
177
- message = _strip_load_or_create_decision_suffix(record.getMessage())
177
+ message = _strip_get_decision_suffix(record.getMessage())
178
178
  action_color = getattr(record, "furu_action_color", None)
179
- if isinstance(action_color, str) and message.startswith(_LOAD_OR_CREATE_PREFIX):
180
- prefix = _LOAD_OR_CREATE_PREFIX
179
+ if isinstance(action_color, str) and message.startswith(_GET_PREFIX):
180
+ prefix = _GET_PREFIX
181
181
  rest = message[len(prefix) :]
182
182
  text = Text()
183
183
  text.append(prefix, style=action_color)
@@ -288,7 +288,7 @@ def write_separator(line: str = "------------------") -> Path:
288
288
  """
289
289
  Write a raw separator line to the current holder's `furu.log`.
290
290
 
291
- This bypasses standard formatting so repeated `load_or_create()` calls are easy to spot.
291
+ This bypasses standard formatting so repeated `get()` calls are easy to spot.
292
292
  """
293
293
  directory = current_log_dir()
294
294
  log_path = directory / "furu.log"
@@ -6,9 +6,10 @@ import json
6
6
  import pathlib
7
7
  import textwrap
8
8
  from pathlib import Path
9
- from typing import Any
9
+ from typing import Any, Protocol, Sequence, cast, runtime_checkable
10
10
 
11
11
  import chz
12
+ from chz.util import MISSING as CHZ_MISSING, MISSING_TYPE
12
13
 
13
14
  from ..errors import _FuruMissing
14
15
  from pydantic import BaseModel as PydanticBaseModel
@@ -91,13 +92,34 @@ class FuruSerializer:
91
92
  def compute_hash(cls, obj: object, verbose: bool = False) -> str:
92
93
  """Compute deterministic hash of object."""
93
94
 
95
+ @runtime_checkable
96
+ class _DependencyHashProvider(Protocol):
97
+ def _dependency_hashes(self) -> Sequence[str]: ...
98
+
99
+ def _has_required_fields(
100
+ data_class: type[object],
101
+ data: dict[str, JsonValue],
102
+ ) -> bool:
103
+ if not chz.is_chz(data_class):
104
+ return False
105
+ for field in chz.chz_fields(data_class).values():
106
+ name = field.logical_name
107
+ if name in data:
108
+ continue
109
+ if field._default is not CHZ_MISSING:
110
+ continue
111
+ if not isinstance(field._default_factory, MISSING_TYPE):
112
+ continue
113
+ return False
114
+ return True
115
+
94
116
  def canonicalize(item: object) -> JsonValue:
95
117
  if isinstance(item, _FuruMissing):
96
118
  raise ValueError("Cannot hash Furu.MISSING")
97
119
 
98
120
  if chz.is_chz(item):
99
121
  fields = chz.chz_fields(item)
100
- return {
122
+ result = {
101
123
  "__class__": cls.get_classname(item),
102
124
  **{
103
125
  name: canonicalize(getattr(item, name))
@@ -105,8 +127,24 @@ class FuruSerializer:
105
127
  if not name.startswith("_")
106
128
  },
107
129
  }
130
+ if isinstance(item, _DependencyHashProvider):
131
+ dependency_hashes = list(item._dependency_hashes())
132
+ if dependency_hashes:
133
+ result["__dependencies__"] = dependency_hashes
134
+ return result
108
135
 
109
136
  if isinstance(item, dict):
137
+ if cls.CLASS_MARKER in item:
138
+ config = cast(dict[str, JsonValue], item)
139
+ module_path, _, class_name = item[cls.CLASS_MARKER].rpartition(".")
140
+ module = importlib.import_module(module_path)
141
+ data_class = getattr(module, class_name, None)
142
+ if (
143
+ data_class is not None
144
+ and hasattr(data_class, "_dependency_hashes")
145
+ and _has_required_fields(data_class, config)
146
+ ):
147
+ return canonicalize(cls.from_dict(config))
110
148
  filtered = item
111
149
  if cls.CLASS_MARKER in item:
112
150
  filtered = {
furu/storage/metadata.py CHANGED
@@ -124,7 +124,7 @@ class MetadataManager:
124
124
  try:
125
125
  head = cls.run_git_command(["rev-parse", "HEAD"])
126
126
  branch = cls.run_git_command(["rev-parse", "--abbrev-ref", "HEAD"])
127
- except subprocess.CalledProcessError:
127
+ except (subprocess.CalledProcessError, FileNotFoundError):
128
128
  return GitInfo(
129
129
  git_commit="<no-git>",
130
130
  git_branch="<no-git>",
@@ -133,15 +133,27 @@ class MetadataManager:
133
133
  git_submodules={},
134
134
  )
135
135
  else:
136
- head = cls.run_git_command(["rev-parse", "HEAD"])
137
- branch = cls.run_git_command(["rev-parse", "--abbrev-ref", "HEAD"])
136
+ try:
137
+ head = cls.run_git_command(["rev-parse", "HEAD"])
138
+ branch = cls.run_git_command(["rev-parse", "--abbrev-ref", "HEAD"])
139
+ except (subprocess.CalledProcessError, FileNotFoundError) as e:
140
+ raise RuntimeError(
141
+ "Failed to read git commit/branch for provenance. "
142
+ "If this is expected, set FURU_REQUIRE_GIT=0."
143
+ ) from e
138
144
 
139
145
  if FURU_CONFIG.require_git_remote:
140
- remote = cls.run_git_command(["remote", "get-url", "origin"])
146
+ try:
147
+ remote = cls.run_git_command(["remote", "get-url", "origin"])
148
+ except (subprocess.CalledProcessError, FileNotFoundError) as e:
149
+ raise RuntimeError(
150
+ "Git remote 'origin' is required for provenance but was not found. "
151
+ "Set FURU_REQUIRE_GIT_REMOTE=0 to allow missing origin."
152
+ ) from e
141
153
  else:
142
154
  try:
143
155
  remote = cls.run_git_command(["remote", "get-url", "origin"])
144
- except subprocess.CalledProcessError:
156
+ except (subprocess.CalledProcessError, FileNotFoundError):
145
157
  remote = None
146
158
 
147
159
  if ignore_diff:
furu/storage/state.py CHANGED
@@ -400,15 +400,41 @@ class StateManager:
400
400
 
401
401
  @classmethod
402
402
  def release_lock(cls, fd: int | None, lock_path: Path) -> None:
403
- if fd is not None:
403
+ """Release a lock acquired via :meth:`try_lock`.
404
+
405
+ We best-effort avoid deleting a lock that was broken and replaced by
406
+ another process by verifying the inode of the open fd matches the
407
+ current lock_path inode before unlinking.
408
+ """
409
+ if fd is None:
410
+ return
411
+ try:
412
+ fd_stat = os.fstat(fd)
413
+ except OSError:
414
+ fd_stat = None
415
+ try:
416
+ path_stat = lock_path.stat()
417
+ except FileNotFoundError:
418
+ path_stat = None
419
+ try:
420
+ if (
421
+ fd_stat is not None
422
+ and path_stat is not None
423
+ and fd_stat.st_ino == path_stat.st_ino
424
+ and fd_stat.st_dev == path_stat.st_dev
425
+ ):
426
+ lock_path.unlink(missing_ok=True)
427
+ finally:
404
428
  os.close(fd)
405
- lock_path.unlink(missing_ok=True)
406
429
 
407
430
  @classmethod
408
431
  def _read_lock_info(cls, lock_path: Path) -> _LockInfoDict | None:
409
432
  if not lock_path.is_file():
410
433
  return None
411
- text = lock_path.read_text().strip()
434
+ try:
435
+ text = lock_path.read_text().strip()
436
+ except FileNotFoundError:
437
+ return None
412
438
  if not text:
413
439
  return None
414
440
  lines = text.splitlines()
@@ -977,6 +1003,8 @@ def compute_lock(
977
1003
  poll_interval_sec: float = 10.0,
978
1004
  wait_log_every_sec: float = 10.0,
979
1005
  reconcile_fn: Callable[[Path], None] | None = None,
1006
+ allow_failed: bool = False,
1007
+ allow_success: bool = False,
980
1008
  ) -> Generator[ComputeLockContext, None, None]:
981
1009
  """
982
1010
  Context manager that atomically acquires lock + records attempt + starts heartbeat.
@@ -1000,6 +1028,8 @@ def compute_lock(
1000
1028
  poll_interval_sec: Interval between lock acquisition attempts
1001
1029
  wait_log_every_sec: Interval between "waiting for lock" log messages
1002
1030
  reconcile_fn: Optional function to call to reconcile stale attempts
1031
+ allow_failed: Allow recomputation even if state is failed
1032
+ allow_success: Allow recomputation even if state is successful
1003
1033
 
1004
1034
  Yields:
1005
1035
  ComputeLockContext with attempt_id and stop_heartbeat callable
@@ -1008,6 +1038,7 @@ def compute_lock(
1008
1038
  FuruLockNotAcquired: If lock cannot be acquired (after waiting)
1009
1039
  FuruWaitTimeout: If max_wait_time_sec is exceeded
1010
1040
  """
1041
+
1011
1042
  def _format_wait_duration(seconds: float) -> str:
1012
1043
  if seconds < 60.0:
1013
1044
  return f"{seconds:.1f}s"
@@ -1020,6 +1051,21 @@ def compute_lock(
1020
1051
  days = hours / 24.0
1021
1052
  return f"{days:.1f}d"
1022
1053
 
1054
+ def _format_owner(attempt: _StateAttempt) -> str:
1055
+ owner = attempt.owner
1056
+ parts: list[str] = []
1057
+ if attempt.id:
1058
+ parts.append(f"attempt {attempt.id}")
1059
+ if owner.host:
1060
+ parts.append(f"host {owner.host}")
1061
+ if owner.pid is not None:
1062
+ parts.append(f"pid {owner.pid}")
1063
+ if owner.user:
1064
+ parts.append(f"user {owner.user}")
1065
+ if not parts:
1066
+ return "owner unknown"
1067
+ return ", ".join(parts)
1068
+
1023
1069
  def _describe_wait(attempt: _StateAttempt, waited_sec: float) -> str:
1024
1070
  label = "last heartbeat"
1025
1071
  timestamp = attempt.heartbeat_at
@@ -1034,7 +1080,7 @@ def compute_lock(
1034
1080
  return (
1035
1081
  "waited "
1036
1082
  f"{_format_wait_duration(waited_sec)}, {label} {timestamp_info}, "
1037
- f"status {attempt.status}, backend {attempt.backend}"
1083
+ f"status {attempt.status}, backend {attempt.backend}, {_format_owner(attempt)}"
1038
1084
  )
1039
1085
 
1040
1086
  lock_path = StateManager.get_lock_path(directory, StateManager.COMPUTE_LOCK)
@@ -1054,21 +1100,41 @@ def compute_lock(
1054
1100
  if max_wait_time_sec is not None:
1055
1101
  elapsed = time.time() - start_time
1056
1102
  if elapsed > max_wait_time_sec:
1103
+ state = StateManager.read_state(directory)
1104
+ attempt = state.attempt
1105
+ attempt_info = "no active attempt"
1106
+ if isinstance(attempt, (_StateAttemptQueued, _StateAttemptRunning)):
1107
+ attempt_info = _describe_wait(attempt, elapsed)
1108
+ message = (
1109
+ f"Timed out waiting for compute lock after {elapsed:.1f}s."
1110
+ f"\nDirectory: {directory}"
1111
+ f"\nLock file: {lock_path}"
1112
+ f"\nDetails: {attempt_info}"
1113
+ )
1057
1114
  raise FuruWaitTimeout(
1058
- f"Timed out waiting for compute lock after {elapsed:.1f}s"
1115
+ message,
1116
+ hints=[
1117
+ "Increase max wait: set FURU_MAX_WAIT_SECS (or override Furu._max_wait_time_sec).",
1118
+ "Change poll cadence: set FURU_POLL_INTERVAL_SECS.",
1119
+ "Change wait logging cadence: set FURU_WAIT_LOG_EVERY_SECS.",
1120
+ "If locks look stale too quickly/slowly: tune FURU_LEASE_SECS and FURU_HEARTBEAT_SECS.",
1121
+ "For more logs: set FURU_LOG_LEVEL=DEBUG.",
1122
+ ],
1059
1123
  )
1060
1124
 
1061
1125
  lock_fd = StateManager.try_lock(lock_path)
1062
1126
  if lock_fd is not None:
1063
1127
  state = StateManager.read_state(directory)
1064
- if isinstance(state.result, _StateResultSuccess):
1128
+ if isinstance(state.result, _StateResultSuccess) and not allow_success:
1065
1129
  StateManager.release_lock(lock_fd, lock_path)
1066
1130
  raise FuruLockNotAcquired(
1067
1131
  "Cannot acquire lock: experiment already succeeded"
1068
1132
  )
1069
- if isinstance(state.result, _StateResultFailed):
1133
+ if isinstance(state.result, _StateResultFailed) and not allow_failed:
1070
1134
  StateManager.release_lock(lock_fd, lock_path)
1071
- raise FuruLockNotAcquired("Cannot acquire lock: experiment already failed")
1135
+ raise FuruLockNotAcquired(
1136
+ "Cannot acquire lock: experiment already failed"
1137
+ )
1072
1138
  attempt = state.attempt
1073
1139
  if (
1074
1140
  isinstance(attempt, (_StateAttemptQueued, _StateAttemptRunning))
@@ -1079,11 +1145,11 @@ def compute_lock(
1079
1145
  if reconcile_fn is not None:
1080
1146
  reconcile_fn(directory)
1081
1147
  state = StateManager.read_state(directory)
1082
- if isinstance(state.result, _StateResultSuccess):
1148
+ if isinstance(state.result, _StateResultSuccess) and not allow_success:
1083
1149
  raise FuruLockNotAcquired(
1084
1150
  "Cannot acquire lock: experiment already succeeded"
1085
1151
  )
1086
- if isinstance(state.result, _StateResultFailed):
1152
+ if isinstance(state.result, _StateResultFailed) and not allow_failed:
1087
1153
  raise FuruLockNotAcquired(
1088
1154
  "Cannot acquire lock: experiment already failed"
1089
1155
  )
@@ -1113,11 +1179,11 @@ def compute_lock(
1113
1179
  attempt = state.attempt
1114
1180
 
1115
1181
  # If result is terminal, no point waiting
1116
- if isinstance(state.result, _StateResultSuccess):
1182
+ if isinstance(state.result, _StateResultSuccess) and not allow_success:
1117
1183
  raise FuruLockNotAcquired(
1118
1184
  "Cannot acquire lock: experiment already succeeded"
1119
1185
  )
1120
- if isinstance(state.result, _StateResultFailed):
1186
+ if isinstance(state.result, _StateResultFailed) and not allow_failed:
1121
1187
  raise FuruLockNotAcquired("Cannot acquire lock: experiment already failed")
1122
1188
 
1123
1189
  # If no active attempt but lock exists, it's orphaned - clean it up