furu 0.0.3-py3-none-any.whl → 0.0.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- furu/__init__.py +8 -0
- furu/adapters/submitit.py +23 -2
- furu/config.py +40 -41
- furu/core/furu.py +479 -252
- furu/core/list.py +4 -3
- furu/dashboard/__init__.py +10 -1
- furu/dashboard/frontend/dist/assets/{index-DS3FsqcY.js → index-BjyrY-Zz.js} +1 -1
- furu/dashboard/frontend/dist/index.html +1 -1
- furu/dashboard/main.py +10 -3
- furu/errors.py +17 -4
- furu/execution/__init__.py +22 -0
- furu/execution/context.py +30 -0
- furu/execution/local.py +186 -0
- furu/execution/paths.py +20 -0
- furu/execution/plan.py +330 -0
- furu/execution/plan_utils.py +13 -0
- furu/execution/slurm_dag.py +273 -0
- furu/execution/slurm_pool.py +878 -0
- furu/execution/slurm_spec.py +38 -0
- furu/execution/submitit_factory.py +47 -0
- furu/migration.py +1 -2
- furu/runtime/env.py +1 -1
- furu/runtime/logging.py +40 -14
- furu/storage/metadata.py +25 -29
- furu/storage/migration.py +0 -1
- furu/storage/state.py +120 -98
- {furu-0.0.3.dist-info → furu-0.0.5.dist-info}/METADATA +91 -42
- furu-0.0.5.dist-info/RECORD +46 -0
- {furu-0.0.3.dist-info → furu-0.0.5.dist-info}/WHEEL +1 -1
- furu-0.0.3.dist-info/RECORD +0 -36
- {furu-0.0.3.dist-info → furu-0.0.5.dist-info}/entry_points.txt +0 -0
furu/execution/local.py
ADDED
@@ -0,0 +1,186 @@
from __future__ import annotations

import time
from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, wait

from ..config import FURU_CONFIG
from ..core import Furu
from ..errors import FuruComputeError, FuruError
from ..runtime.logging import enter_holder
from ..storage.state import StateManager
from .context import EXEC_CONTEXT, ExecContext
from .plan import PlanNode, build_plan, ready_todo
from .plan_utils import reconcile_or_timeout_in_progress


def _normalize_window_size(window_size: str | int, root_count: int) -> int:
    if root_count == 0:
        return 0
    if isinstance(window_size, str):
        match window_size:
            case "dfs":
                return 1
            case "bfs":
                return root_count
            case _:
                raise ValueError(
                    "window_size must be 'dfs', 'bfs', or a positive integer"
                )
    if isinstance(window_size, bool) or not isinstance(window_size, int):
        raise TypeError("window_size must be 'dfs', 'bfs', or a positive integer")
    if window_size < 1:
        raise ValueError("window_size must be >= 1")
    return min(window_size, root_count)


def _run_node(node: PlanNode) -> None:
    token = EXEC_CONTEXT.set(
        ExecContext(
            mode="executor",
            spec_key=node.spec_key,
            backend="local",
            current_node_hash=node.obj.furu_hash,
        )
    )
    try:
        with enter_holder(node.obj):
            node.obj.get(force=True)
    finally:
        EXEC_CONTEXT.reset(token)


def run_local(
    roots: list[Furu],
    *,
    max_workers: int = 8,
    window_size: str | int = "bfs",
    poll_interval_sec: float = 0.25,
) -> None:
    if not roots:
        return
    if max_workers < 1:
        raise ValueError("max_workers must be >= 1")

    window = _normalize_window_size(window_size, len(roots))
    active_indices = list(range(min(window, len(roots))))
    next_index = len(active_indices)
    inflight: dict[str, Future[None]] = {}
    completed_hashes: set[str] = set()
    retry_attempts: dict[str, int] = {}

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        while True:
            active_roots = [roots[index] for index in active_indices]
            plan = build_plan(active_roots, completed_hashes=completed_hashes)

            ready = [digest for digest in ready_todo(plan) if digest not in inflight]
            available = max_workers - len(inflight)
            for digest in ready[:available]:
                node = plan.nodes[digest]
                inflight[digest] = executor.submit(_run_node, node)

            completed = [digest for digest, future in inflight.items() if future.done()]
            for digest in completed:
                future = inflight.pop(digest)
                try:
                    future.result()
                except Exception as exc:
                    if isinstance(exc, FuruComputeError):
                        compute_error = exc
                        wrapped_exc: Exception | None = None
                    elif isinstance(exc, FuruError):
                        raise
                    else:
                        node = plan.nodes.get(digest)
                        if node is None:
                            raise
                        state_path = StateManager.get_state_path(
                            node.obj._base_furu_dir()
                        )
                        compute_error = FuruComputeError(
                            "local executor failed for "
                            f"{node.obj.__class__.__name__}({node.obj.furu_hash})",
                            state_path,
                            original_error=exc,
                        )
                        wrapped_exc = exc
                    if not FURU_CONFIG.retry_failed:
                        if wrapped_exc is not None:
                            raise compute_error from wrapped_exc
                        raise compute_error
                    attempt = retry_attempts.get(digest, 0) + 1
                    retry_attempts[digest] = attempt
                    if attempt <= FURU_CONFIG.max_compute_retries:
                        continue
                    if wrapped_exc is not None:
                        raise compute_error from wrapped_exc
                    raise compute_error
                completed_hashes.add(digest)
                retry_attempts.pop(digest, None)

            if not FURU_CONFIG.retry_failed:
                failed = [
                    node
                    for digest, node in plan.nodes.items()
                    if node.status == "FAILED" and digest not in inflight
                ]
                if failed:
                    names = ", ".join(
                        f"{node.obj.__class__.__name__}({node.obj.furu_hash})"
                        for node in failed
                    )
                    raise RuntimeError(
                        f"Cannot run local executor with failed dependencies: {names}"
                    )

            if completed:
                continue

            # Avoid a busy-spin loop while waiting for long-running tasks.
            if inflight and not completed:
                wait(
                    inflight.values(),
                    timeout=poll_interval_sec,
                    return_when=FIRST_COMPLETED,
                )
                continue

            finished_indices = [
                index
                for index in active_indices
                if plan.nodes.get(roots[index].furu_hash) is not None
                and plan.nodes[roots[index].furu_hash].status == "DONE"
            ]
            for index in finished_indices:
                active_indices.remove(index)

            while len(active_indices) < window and next_index < len(roots):
                active_indices.append(next_index)
                next_index += 1

            if not active_indices and not inflight and next_index >= len(roots):
                return

            if not inflight and not ready:
                if any(node.status == "IN_PROGRESS" for node in plan.nodes.values()):
                    stale_detected = reconcile_or_timeout_in_progress(
                        plan,
                        stale_timeout_sec=FURU_CONFIG.stale_timeout,
                    )
                    if stale_detected:
                        continue
                    time.sleep(poll_interval_sec)
                    continue
                todo_nodes = [
                    node for node in plan.nodes.values() if node.status == "TODO"
                ]
                if todo_nodes:
                    sample = ", ".join(
                        f"{node.obj.__class__.__name__}({node.obj.furu_hash})"
                        for node in todo_nodes[:3]
                    )
                    raise RuntimeError(
                        "run_local stalled with no progress; "
                        f"remaining TODO nodes: {sample}"
                    )
                time.sleep(poll_interval_sec)
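For orientation, a minimal sketch of how the new local executor might be driven. Only the run_local signature above comes from the package; my_roots and the chosen worker/window values are assumptions.

# Illustrative sketch (not part of the wheel): drive the local executor over
# already-constructed Furu objects. How `my_roots` is built is not shown here.
from furu.core import Furu
from furu.execution.local import run_local

def run_all(my_roots: list[Furu]) -> None:
    # "bfs" keeps every root active at once, "dfs" narrows the window to one
    # root at a time, and an integer caps the number of concurrently active roots.
    run_local(my_roots, max_workers=4, window_size="dfs", poll_interval_sec=0.5)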
furu/execution/paths.py
ADDED
@@ -0,0 +1,20 @@
from pathlib import Path

from furu.config import FURU_CONFIG


def submitit_root_dir(override: Path | None = None) -> Path:
    return (override or FURU_CONFIG.get_submitit_root()).resolve()


def submitit_logs_dir(
    kind: str,
    spec_key: str,
    override: Path | None = None,
    run_id: str | None = None,
) -> Path:
    root = submitit_root_dir(override)
    path = root / kind / spec_key
    if run_id:
        path = path / run_id
    return path
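A short sketch of the directory layout these helpers produce; the kind, spec_key, override, and run_id values are made-up examples, and the real default root comes from FURU_CONFIG.get_submitit_root().

# Illustrative only: compose a submitit log directory as root/kind/spec_key[/run_id].
from pathlib import Path
from furu.execution.paths import submitit_logs_dir

logs = submitit_logs_dir("pool", "gpu-large", override=Path("/tmp/furu-submitit"), run_id="run-001")
print(logs)  # e.g. /tmp/furu-submitit/pool/gpu-large/run-001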
furu/execution/plan.py
ADDED
@@ -0,0 +1,330 @@
from __future__ import annotations

from dataclasses import dataclass
import time
from pathlib import Path
from typing import Literal

from ..config import FURU_CONFIG
from ..core import Furu
from ..errors import FuruValidationError
from ..runtime.logging import get_logger
from ..storage.migration import MigrationManager, MigrationRecord
from ..storage.state import (
    StateManager,
    _StateAttemptFailed,
    _StateAttemptQueued,
    _StateAttemptRunning,
    _FuruState,
    _StateResultFailed,
)

Status = Literal["DONE", "IN_PROGRESS", "TODO", "FAILED"]

_MISSING_TIMESTAMP_SEEN: dict[str, float] = {}


@dataclass
class PlanNode:
    obj: Furu
    status: Status
    spec_key: str
    deps_all: set[str]
    deps_pending: set[str]
    dependents: set[str]


@dataclass
class DependencyPlan:
    roots: list[Furu]
    nodes: dict[str, PlanNode]


@dataclass
class _PlanCache:
    migration_records: dict[Path, MigrationRecord | None]
    alias_targets: dict[Path, Path | None]
    marker_exists: dict[Path, bool]
    states: dict[Path, _FuruState]


def _marker_exists(directory: Path, cache: _PlanCache) -> bool:
    if directory in cache.marker_exists:
        return cache.marker_exists[directory]
    exists = StateManager.success_marker_exists(directory)
    cache.marker_exists[directory] = exists
    return exists


def _migration_record(directory: Path, cache: _PlanCache) -> MigrationRecord | None:
    if directory not in cache.migration_records:
        cache.migration_records[directory] = MigrationManager.read_migration(directory)
    return cache.migration_records[directory]


def _alias_target_dir(base_dir: Path, cache: _PlanCache) -> Path | None:
    if base_dir in cache.alias_targets:
        return cache.alias_targets[base_dir]
    record = _migration_record(base_dir, cache)
    if record is None or record.kind != "alias" or record.overwritten_at is not None:
        cache.alias_targets[base_dir] = None
        return None
    if _marker_exists(base_dir, cache):
        cache.alias_targets[base_dir] = None
        return None
    target_dir = MigrationManager.resolve_dir(record, target="from")
    if _marker_exists(target_dir, cache):
        cache.alias_targets[base_dir] = target_dir
        return target_dir
    cache.alias_targets[base_dir] = None
    return None


def _state_for(directory: Path, cache: _PlanCache) -> _FuruState:
    if directory not in cache.states:
        cache.states[directory] = StateManager.read_state(directory)
    return cache.states[directory]


def _validate_cached(obj: Furu, *, directory: Path) -> bool:
    try:
        return obj._validate()
    except FuruValidationError as exc:
        logger = get_logger()
        logger.warning(
            "exists %s -> false (validate invalid for %s: %s)",
            directory,
            f"{obj.__class__.__name__}({obj.furu_hash})",
            exc,
        )
        return False
    except Exception as exc:
        logger = get_logger()
        logger.exception(
            "exists %s -> false (validate crashed for %s: %s)",
            directory,
            f"{obj.__class__.__name__}({obj.furu_hash})",
            exc,
        )
        return False


def _classify(
    obj: Furu,
    completed_hashes: set[str] | None,
    cache: _PlanCache,
) -> Status:
    if completed_hashes is not None and obj.furu_hash in completed_hashes:
        return "DONE"
    base_dir = obj._base_furu_dir()
    alias_target = None
    if not obj._always_rerun():
        alias_target = _alias_target_dir(base_dir, cache)
        success_dir = alias_target or base_dir
        if _marker_exists(success_dir, cache):
            if _validate_cached(obj, directory=base_dir):
                return "DONE"

    state_dir = alias_target or base_dir
    state = _state_for(state_dir, cache)
    attempt = state.attempt
    if isinstance(attempt, (_StateAttemptQueued, _StateAttemptRunning)):
        return "IN_PROGRESS"
    if isinstance(state.result, _StateResultFailed) or isinstance(
        attempt, _StateAttemptFailed
    ):
        if FURU_CONFIG.retry_failed:
            return "TODO"
        return "FAILED"
    return "TODO"


def build_plan(
    roots: list[Furu],
    *,
    completed_hashes: set[str] | None = None,
) -> DependencyPlan:
    cache = _PlanCache(
        migration_records={},
        alias_targets={},
        marker_exists={},
        states={},
    )
    nodes: dict[str, PlanNode] = {}
    stack = list(roots)
    seen: set[str] = set()

    while stack:
        obj = stack.pop()
        digest = obj.furu_hash
        if digest in seen:
            continue
        seen.add(digest)

        status = _classify(obj, completed_hashes, cache)
        node = PlanNode(
            obj=obj,
            status=status,
            spec_key=obj._executor_spec_key(),
            deps_all=set(),
            deps_pending=set(),
            dependents=set(),
        )
        nodes[digest] = node

        if status != "TODO":
            continue

        deps = obj._get_dependencies(recursive=False)
        node.deps_all = {dep.furu_hash for dep in deps}
        for dep in deps:
            stack.append(dep)

    for digest, node in nodes.items():
        if node.status != "TODO":
            continue
        node.deps_pending = {
            dep for dep in node.deps_all if dep in nodes and nodes[dep].status != "DONE"
        }

    for digest, node in nodes.items():
        for dep in node.deps_pending:
            nodes[dep].dependents.add(digest)

    return DependencyPlan(roots=roots, nodes=nodes)


def topo_order_todo(plan: DependencyPlan) -> list[str]:
    todo = {digest for digest, node in plan.nodes.items() if node.status == "TODO"}
    indeg = {digest: 0 for digest in todo}

    for digest in todo:
        node = plan.nodes[digest]
        for dep in node.deps_pending:
            if dep in todo:
                indeg[digest] += 1

    ready = sorted([digest for digest, deg in indeg.items() if deg == 0])
    out: list[str] = []

    while ready:
        digest = ready.pop(0)
        out.append(digest)
        for dep in plan.nodes[digest].dependents:
            if dep not in todo:
                continue
            indeg[dep] -= 1
            if indeg[dep] == 0:
                ready.append(dep)
        ready.sort()

    if len(out) != len(todo):
        raise ValueError("Cycle detected in TODO dependency graph")
    return out


def ready_todo(plan: DependencyPlan) -> list[str]:
    return sorted(
        [
            digest
            for digest, node in plan.nodes.items()
            if node.status == "TODO"
            and all(plan.nodes[dep].status == "DONE" for dep in node.deps_pending)
        ]
    )


def _attempt_age_sec(
    attempt: _StateAttemptQueued | _StateAttemptRunning,
    *,
    directory: Path,
    stale_timeout_sec: float,
    digest: str,
    name: str,
) -> float | None:
    if attempt.status == "queued":
        parsed = StateManager._parse_time(attempt.started_at)
        if parsed is not None:
            _MISSING_TIMESTAMP_SEEN.pop(digest, None)
            return (StateManager._utcnow() - parsed).total_seconds()
    else:
        last_heartbeat = StateManager.last_heartbeat_mtime(directory)
        if last_heartbeat is not None:
            _MISSING_TIMESTAMP_SEEN.pop(digest, None)
            return max(0.0, time.time() - last_heartbeat)
    if stale_timeout_sec <= 0:
        return None
    now = StateManager._utcnow().timestamp()
    first_seen = _MISSING_TIMESTAMP_SEEN.get(digest)
    if first_seen is None:
        _MISSING_TIMESTAMP_SEEN[digest] = now
        logger = get_logger()
        logger.warning(
            "IN_PROGRESS attempt missing heartbeat/started timestamps for %s; "
            "deferring stale timeout check.",
            name,
        )
        return None
    return now - first_seen


def reconcile_in_progress(
    plan: DependencyPlan,
    *,
    stale_timeout_sec: float,
) -> bool:
    stale_attempts: list[
        tuple[PlanNode, _StateAttemptQueued | _StateAttemptRunning]
    ] = []
    for node in plan.nodes.values():
        if node.status != "IN_PROGRESS":
            _MISSING_TIMESTAMP_SEEN.pop(node.obj.furu_hash, None)
            continue
        state = StateManager.reconcile(node.obj._base_furu_dir())
        attempt = state.attempt
        if not isinstance(attempt, (_StateAttemptQueued, _StateAttemptRunning)):
            _MISSING_TIMESTAMP_SEEN.pop(node.obj.furu_hash, None)
            continue
        if stale_timeout_sec <= 0:
            continue
        name = f"{node.obj.__class__.__name__}({node.obj.furu_hash})"
        age = _attempt_age_sec(
            attempt,
            directory=node.obj._base_furu_dir(),
            stale_timeout_sec=stale_timeout_sec,
            digest=node.obj.furu_hash,
            name=name,
        )
        if age is None or age < stale_timeout_sec:
            continue
        stale_attempts.append((node, attempt))

    if not stale_attempts:
        return False

    names = ", ".join(
        f"{node.obj.__class__.__name__}({node.obj.furu_hash})"
        for node, _attempt in stale_attempts
    )
    if not FURU_CONFIG.retry_failed:
        raise RuntimeError(
            "Stale IN_PROGRESS dependencies detected: "
            f"{names} exceeded {stale_timeout_sec:.1f}s without heartbeat."
        )

    stale_detected = False
    for node, attempt in stale_attempts:
        stale_detected = True
        StateManager.finish_attempt_preempted(
            node.obj._base_furu_dir(),
            attempt_id=attempt.id,
            error={
                "type": "StaleHeartbeat",
                "message": (
                    f"Attempt stale after {stale_timeout_sec:.1f}s without heartbeat."
                ),
            },
            reason="stale_timeout",
        )
        _MISSING_TIMESTAMP_SEEN.pop(node.obj.furu_hash, None)
    return stale_detected
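As a rough sketch of how the planner above might be inspected on its own, assuming my_roots is a list of Furu objects; the print format is illustrative.

# Illustrative sketch: build a plan and look at scheduling order.
from furu.core import Furu
from furu.execution.plan import build_plan, ready_todo, topo_order_todo

def describe(my_roots: list[Furu]) -> None:
    plan = build_plan(my_roots)
    for digest in topo_order_todo(plan):  # every TODO node, dependencies first
        node = plan.nodes[digest]
        print(node.obj.__class__.__name__, digest, sorted(node.deps_pending))
    print("ready now:", ready_todo(plan))  # TODO nodes whose pending deps are all DONE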
furu/execution/plan_utils.py
ADDED
@@ -0,0 +1,13 @@
from __future__ import annotations

from .plan import DependencyPlan, reconcile_in_progress


def reconcile_or_timeout_in_progress(
    plan: DependencyPlan,
    *,
    stale_timeout_sec: float,
) -> bool:
    if not any(node.status == "IN_PROGRESS" for node in plan.nodes.values()):
        return False
    return reconcile_in_progress(plan, stale_timeout_sec=stale_timeout_sec)
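A final sketch of one reconcile pass over a freshly built plan, mirroring how run_local uses this helper; my_roots is assumed, and FURU_CONFIG.stale_timeout is the same setting referenced in local.py.

# Illustrative sketch: mark stale IN_PROGRESS attempts as preempted, then re-plan.
from furu.config import FURU_CONFIG
from furu.execution.plan import build_plan
from furu.execution.plan_utils import reconcile_or_timeout_in_progress

def poke(my_roots):
    plan = build_plan(my_roots)
    if reconcile_or_timeout_in_progress(plan, stale_timeout_sec=FURU_CONFIG.stale_timeout):
        plan = build_plan(my_roots)  # stale attempts were finished as preempted; rebuild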