furu 0.0.3-py3-none-any.whl → 0.0.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- furu/__init__.py +8 -0
- furu/adapters/submitit.py +23 -2
- furu/config.py +13 -1
- furu/core/furu.py +355 -196
- furu/core/list.py +1 -1
- furu/dashboard/__init__.py +10 -1
- furu/dashboard/main.py +10 -3
- furu/errors.py +17 -4
- furu/execution/__init__.py +22 -0
- furu/execution/context.py +30 -0
- furu/execution/local.py +184 -0
- furu/execution/paths.py +20 -0
- furu/execution/plan.py +238 -0
- furu/execution/plan_utils.py +13 -0
- furu/execution/slurm_dag.py +271 -0
- furu/execution/slurm_pool.py +878 -0
- furu/execution/slurm_spec.py +38 -0
- furu/execution/submitit_factory.py +47 -0
- furu/runtime/logging.py +10 -10
- furu/storage/state.py +34 -6
- {furu-0.0.3.dist-info → furu-0.0.4.dist-info}/METADATA +74 -37
- {furu-0.0.3.dist-info → furu-0.0.4.dist-info}/RECORD +24 -14
- {furu-0.0.3.dist-info → furu-0.0.4.dist-info}/WHEEL +0 -0
- {furu-0.0.3.dist-info → furu-0.0.4.dist-info}/entry_points.txt +0 -0
furu/core/list.py
CHANGED
furu/dashboard/__init__.py
CHANGED
@@ -5,5 +5,14 @@ Install with: uv add furu[dashboard]
 Run with: furu-dashboard serve
 """
 
-
+from importlib import metadata
 
+
+def _resolve_version() -> str:
+    try:
+        return metadata.version("furu")
+    except metadata.PackageNotFoundError:
+        return "0.0.0"
+
+
+__version__ = _resolve_version()
furu/dashboard/main.py
CHANGED
@@ -5,11 +5,12 @@ from pathlib import Path
 
 import typer
 import uvicorn
-from fastapi import FastAPI
+from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import FileResponse
 from fastapi.staticfiles import StaticFiles
 
+from . import __version__
 from .api.routes import router as api_router
 
 
@@ -36,7 +37,7 @@ def create_app(*, serve_frontend: bool = False) -> FastAPI:
     app = FastAPI(
         title="Furu Dashboard",
         description="Monitoring dashboard for Furu experiments",
-        version=
+        version=__version__,
     )
 
     # CORS middleware for development
@@ -64,7 +65,13 @@ def create_app(*, serve_frontend: bool = False) -> FastAPI:
     @app.get("/{full_path:path}")
     async def serve_spa(full_path: str) -> FileResponse:
         """Serve the React SPA for all non-API routes."""
-
+        requested = Path(full_path)
+        if ".." in requested.parts:
+            raise HTTPException(status_code=404, detail="Not found")
+        frontend_root = frontend_dir.resolve()
+        file_path = (frontend_dir / requested).resolve()
+        if not file_path.is_relative_to(frontend_root):
+            raise HTTPException(status_code=404, detail="Not found")
         if file_path.is_file() and not full_path.startswith("api"):
             return FileResponse(file_path)
         return FileResponse(frontend_dir / "index.html")
furu/errors.py
CHANGED
@@ -30,6 +30,14 @@ class FuruError(Exception):
         return "\n".join(lines)
 
 
+class FuruExecutionError(FuruError):
+    """Raised when executor wiring or scheduling fails."""
+
+
+class FuruValidationError(FuruError):
+    """Raised by `_validate()` to indicate an invalid or missing artifact."""
+
+
 class FuruWaitTimeout(FuruError):
     """Raised when waiting for a result exceeds _max_wait_time_sec."""
 
@@ -70,11 +78,8 @@ class FuruComputeError(FuruError):
         msg = super().__str__()  # ty: ignore[invalid-super-argument]
         internal_dir = self.state_path.parent
         furu_dir = internal_dir.parent
-        log_path = internal_dir / "furu.log"
 
-        msg += f"\n\
-        msg += f"\nState file: {self.state_path}"
-        msg += f"\nLog file: {log_path}"
+        msg += f"\n\nFuru dir: {furu_dir}"
 
         if self.recorded_error_type or self.recorded_error_message:
             msg += "\n\nRecorded error (from state.json):"
@@ -116,3 +121,11 @@ class FuruMigrationRequired(FuruError):
         if self.state_path is not None:
            msg += f"\n\nState file: {self.state_path}"
         return msg
+
+
+class FuruMissingArtifact(FuruError):
+    """Raised when a dependency is missing in executor mode."""
+
+
+class FuruSpecMismatch(FuruError):
+    """Raised when executor spec keys do not match."""
furu/execution/__init__.py
ADDED
@@ -0,0 +1,22 @@
+"""Execution helpers for Furu."""
+
+from .local import run_local
+from .paths import submitit_logs_dir, submitit_root_dir
+from .slurm_dag import SlurmDagSubmission, submit_slurm_dag
+from .slurm_pool import SlurmPoolRun, run_slurm_pool
+from .slurm_spec import SlurmSpec, SlurmSpecValue, resolve_slurm_spec
+from .submitit_factory import make_executor_for_spec
+
+__all__ = [
+    "SlurmSpec",
+    "SlurmSpecValue",
+    "resolve_slurm_spec",
+    "SlurmDagSubmission",
+    "submit_slurm_dag",
+    "make_executor_for_spec",
+    "SlurmPoolRun",
+    "run_slurm_pool",
+    "run_local",
+    "submitit_logs_dir",
+    "submitit_root_dir",
+]
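The new furu.execution package re-exports the local runner, the Slurm entry points, and the submitit path helpers. A quick way to see the advertised surface, assuming furu 0.0.4 and its executor dependencies (e.g. submitit) are importable:

import furu.execution as execution

# __all__ lists the public entry points added in 0.0.4.
print(sorted(execution.__all__))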
furu/execution/context.py
ADDED
@@ -0,0 +1,30 @@
+from __future__ import annotations
+
+from contextvars import ContextVar
+from dataclasses import dataclass
+from typing import Literal
+
+Backend = Literal["local", "submitit"]
+
+
+@dataclass(frozen=True)
+class ExecContext:
+    mode: Literal["interactive", "executor"]
+    spec_key: str | None = None
+    backend: Backend | None = None
+    current_node_hash: str | None = None
+
+
+EXEC_CONTEXT: ContextVar[ExecContext] = ContextVar(
+    "FURU_EXEC_CONTEXT",
+    default=ExecContext(
+        mode="interactive",
+        spec_key=None,
+        backend=None,
+        current_node_hash=None,
+    ),
+)
+
+
+def in_executor() -> bool:
+    return EXEC_CONTEXT.get().mode == "executor"
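ExecContext is a frozen dataclass carried in a ContextVar, and in_executor() reports whether the current code runs under an executor. A small sketch of the set/reset pattern the runners use, assuming furu 0.0.4 and its executor dependencies are importable (the spec key and hash values are made up):

from furu.execution.context import EXEC_CONTEXT, ExecContext, in_executor

assert not in_executor()  # default context is "interactive"

token = EXEC_CONTEXT.set(
    ExecContext(
        mode="executor",
        spec_key="cpu-small",        # illustrative value
        backend="local",
        current_node_hash="abc123",  # illustrative value
    )
)
try:
    assert in_executor()
finally:
    EXEC_CONTEXT.reset(token)

assert not in_executor()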
furu/execution/local.py
ADDED
@@ -0,0 +1,184 @@
+from __future__ import annotations
+
+import time
+from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, wait
+
+from ..config import FURU_CONFIG
+from ..core import Furu
+from ..errors import FuruComputeError, FuruError
+from ..storage.state import StateManager
+from .context import EXEC_CONTEXT, ExecContext
+from .plan import PlanNode, build_plan, ready_todo
+from .plan_utils import reconcile_or_timeout_in_progress
+
+
+def _normalize_window_size(window_size: str | int, root_count: int) -> int:
+    if root_count == 0:
+        return 0
+    if isinstance(window_size, str):
+        match window_size:
+            case "dfs":
+                return 1
+            case "bfs":
+                return root_count
+            case _:
+                raise ValueError(
+                    "window_size must be 'dfs', 'bfs', or a positive integer"
+                )
+    if isinstance(window_size, bool) or not isinstance(window_size, int):
+        raise TypeError("window_size must be 'dfs', 'bfs', or a positive integer")
+    if window_size < 1:
+        raise ValueError("window_size must be >= 1")
+    return min(window_size, root_count)
+
+
+def _run_node(node: PlanNode) -> None:
+    token = EXEC_CONTEXT.set(
+        ExecContext(
+            mode="executor",
+            spec_key=node.spec_key,
+            backend="local",
+            current_node_hash=node.obj._furu_hash,
+        )
+    )
+    try:
+        node.obj.get(force=True)
+    finally:
+        EXEC_CONTEXT.reset(token)
+
+
+def run_local(
+    roots: list[Furu],
+    *,
+    max_workers: int = 8,
+    window_size: str | int = "bfs",
+    poll_interval_sec: float = 0.25,
+) -> None:
+    if not roots:
+        return
+    if max_workers < 1:
+        raise ValueError("max_workers must be >= 1")
+
+    window = _normalize_window_size(window_size, len(roots))
+    active_indices = list(range(min(window, len(roots))))
+    next_index = len(active_indices)
+    inflight: dict[str, Future[None]] = {}
+    completed_hashes: set[str] = set()
+    retry_attempts: dict[str, int] = {}
+
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        while True:
+            active_roots = [roots[index] for index in active_indices]
+            plan = build_plan(active_roots, completed_hashes=completed_hashes)
+
+            ready = [digest for digest in ready_todo(plan) if digest not in inflight]
+            available = max_workers - len(inflight)
+            for digest in ready[:available]:
+                node = plan.nodes[digest]
+                inflight[digest] = executor.submit(_run_node, node)
+
+            completed = [digest for digest, future in inflight.items() if future.done()]
+            for digest in completed:
+                future = inflight.pop(digest)
+                try:
+                    future.result()
+                except Exception as exc:
+                    if isinstance(exc, FuruComputeError):
+                        compute_error = exc
+                        wrapped_exc: Exception | None = None
+                    elif isinstance(exc, FuruError):
+                        raise
+                    else:
+                        node = plan.nodes.get(digest)
+                        if node is None:
+                            raise
+                        state_path = StateManager.get_state_path(
+                            node.obj._base_furu_dir()
+                        )
+                        compute_error = FuruComputeError(
+                            "local executor failed for "
+                            f"{node.obj.__class__.__name__}({node.obj._furu_hash})",
+                            state_path,
+                            original_error=exc,
+                        )
+                        wrapped_exc = exc
+                    if not FURU_CONFIG.retry_failed:
+                        if wrapped_exc is not None:
+                            raise compute_error from wrapped_exc
+                        raise compute_error
+                    attempt = retry_attempts.get(digest, 0) + 1
+                    retry_attempts[digest] = attempt
+                    if attempt <= FURU_CONFIG.max_compute_retries:
+                        continue
+                    if wrapped_exc is not None:
+                        raise compute_error from wrapped_exc
+                    raise compute_error
+                completed_hashes.add(digest)
+                retry_attempts.pop(digest, None)
+
+            if not FURU_CONFIG.retry_failed:
+                failed = [
+                    node
+                    for digest, node in plan.nodes.items()
+                    if node.status == "FAILED" and digest not in inflight
+                ]
+                if failed:
+                    names = ", ".join(
+                        f"{node.obj.__class__.__name__}({node.obj._furu_hash})"
+                        for node in failed
+                    )
+                    raise RuntimeError(
+                        f"Cannot run local executor with failed dependencies: {names}"
+                    )
+
+            if completed:
+                continue
+
+            # Avoid a busy-spin loop while waiting for long-running tasks.
+            if inflight and not completed:
+                wait(
+                    inflight.values(),
+                    timeout=poll_interval_sec,
+                    return_when=FIRST_COMPLETED,
+                )
+                continue
+
+            finished_indices = [
+                index
+                for index in active_indices
+                if plan.nodes.get(roots[index]._furu_hash) is not None
+                and plan.nodes[roots[index]._furu_hash].status == "DONE"
+            ]
+            for index in finished_indices:
+                active_indices.remove(index)
+
+            while len(active_indices) < window and next_index < len(roots):
+                active_indices.append(next_index)
+                next_index += 1
+
+            if not active_indices and not inflight and next_index >= len(roots):
+                return
+
+            if not inflight and not ready:
+                if any(node.status == "IN_PROGRESS" for node in plan.nodes.values()):
+                    stale_detected = reconcile_or_timeout_in_progress(
+                        plan,
+                        stale_timeout_sec=FURU_CONFIG.stale_timeout,
+                    )
+                    if stale_detected:
+                        continue
+                    time.sleep(poll_interval_sec)
+                    continue
+                todo_nodes = [
+                    node for node in plan.nodes.values() if node.status == "TODO"
+                ]
+                if todo_nodes:
+                    sample = ", ".join(
+                        f"{node.obj.__class__.__name__}({node.obj._furu_hash})"
+                        for node in todo_nodes[:3]
+                    )
+                    raise RuntimeError(
+                        "run_local stalled with no progress; "
+                        f"remaining TODO nodes: {sample}"
+                    )
+                time.sleep(poll_interval_sec)
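run_local builds a dependency plan over a sliding window of root tasks, submits ready nodes to a thread pool, and retries failures according to FURU_CONFIG. The window semantics are easiest to see through the (private) normalizer added here; a small sketch, assuming furu 0.0.4 is importable:

from furu.execution.local import _normalize_window_size

# "dfs" drives one root (and its dependency subtree) at a time, "bfs" keeps
# every root active, and an integer is clamped to the number of roots.
print(_normalize_window_size("dfs", 5))  # 1
print(_normalize_window_size("bfs", 5))  # 5
print(_normalize_window_size(3, 5))      # 3
print(_normalize_window_size(10, 5))     # 5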
furu/execution/paths.py
ADDED
@@ -0,0 +1,20 @@
+from pathlib import Path
+
+from furu.config import FURU_CONFIG
+
+
+def submitit_root_dir(override: Path | None = None) -> Path:
+    return (override or FURU_CONFIG.get_submitit_root()).resolve()
+
+
+def submitit_logs_dir(
+    kind: str,
+    spec_key: str,
+    override: Path | None = None,
+    run_id: str | None = None,
+) -> Path:
+    root = submitit_root_dir(override)
+    path = root / kind / spec_key
+    if run_id:
+        path = path / run_id
+    return path
furu/execution/plan.py
ADDED
@@ -0,0 +1,238 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Literal
+
+from ..config import FURU_CONFIG
+from ..core import Furu
+from ..runtime.logging import get_logger
+from ..storage.state import (
+    StateManager,
+    _StateAttemptFailed,
+    _StateAttemptQueued,
+    _StateAttemptRunning,
+    _StateResultFailed,
+)
+
+Status = Literal["DONE", "IN_PROGRESS", "TODO", "FAILED"]
+
+_MISSING_TIMESTAMP_SEEN: dict[str, float] = {}
+
+
+@dataclass
+class PlanNode:
+    obj: Furu
+    status: Status
+    spec_key: str
+    deps_all: set[str]
+    deps_pending: set[str]
+    dependents: set[str]
+
+
+@dataclass
+class DependencyPlan:
+    roots: list[Furu]
+    nodes: dict[str, PlanNode]
+
+
+def _classify(obj: Furu, completed_hashes: set[str] | None) -> Status:
+    if completed_hashes is not None and obj._furu_hash in completed_hashes:
+        return "DONE"
+    if obj._exists_quiet() and not obj._always_rerun():
+        return "DONE"
+
+    state = obj.get_state()
+    attempt = state.attempt
+    if isinstance(attempt, (_StateAttemptQueued, _StateAttemptRunning)):
+        return "IN_PROGRESS"
+    if isinstance(state.result, _StateResultFailed) or isinstance(
+        attempt, _StateAttemptFailed
+    ):
+        if FURU_CONFIG.retry_failed:
+            return "TODO"
+        return "FAILED"
+    return "TODO"
+
+
+def build_plan(
+    roots: list[Furu],
+    *,
+    completed_hashes: set[str] | None = None,
+) -> DependencyPlan:
+    nodes: dict[str, PlanNode] = {}
+    stack = list(roots)
+    seen: set[str] = set()
+
+    while stack:
+        obj = stack.pop()
+        digest = obj._furu_hash
+        if digest in seen:
+            continue
+        seen.add(digest)
+
+        status = _classify(obj, completed_hashes)
+        node = PlanNode(
+            obj=obj,
+            status=status,
+            spec_key=obj._executor_spec_key(),
+            deps_all=set(),
+            deps_pending=set(),
+            dependents=set(),
+        )
+        nodes[digest] = node
+
+        if status != "TODO":
+            continue
+
+        deps = obj._get_dependencies(recursive=False)
+        node.deps_all = {dep._furu_hash for dep in deps}
+        for dep in deps:
+            stack.append(dep)
+
+    for digest, node in nodes.items():
+        if node.status != "TODO":
+            continue
+        node.deps_pending = {
+            dep for dep in node.deps_all if dep in nodes and nodes[dep].status != "DONE"
+        }
+
+    for digest, node in nodes.items():
+        for dep in node.deps_pending:
+            nodes[dep].dependents.add(digest)
+
+    return DependencyPlan(roots=roots, nodes=nodes)
+
+
+def topo_order_todo(plan: DependencyPlan) -> list[str]:
+    todo = {digest for digest, node in plan.nodes.items() if node.status == "TODO"}
+    indeg = {digest: 0 for digest in todo}
+
+    for digest in todo:
+        node = plan.nodes[digest]
+        for dep in node.deps_pending:
+            if dep in todo:
+                indeg[digest] += 1
+
+    ready = sorted([digest for digest, deg in indeg.items() if deg == 0])
+    out: list[str] = []
+
+    while ready:
+        digest = ready.pop(0)
+        out.append(digest)
+        for dep in plan.nodes[digest].dependents:
+            if dep not in todo:
+                continue
+            indeg[dep] -= 1
+            if indeg[dep] == 0:
+                ready.append(dep)
+        ready.sort()
+
+    if len(out) != len(todo):
+        raise ValueError("Cycle detected in TODO dependency graph")
+    return out
+
+
+def ready_todo(plan: DependencyPlan) -> list[str]:
+    return sorted(
+        [
+            digest
+            for digest, node in plan.nodes.items()
+            if node.status == "TODO"
+            and all(plan.nodes[dep].status == "DONE" for dep in node.deps_pending)
+        ]
+    )
+
+
+def _attempt_age_sec(
+    attempt: _StateAttemptQueued | _StateAttemptRunning,
+    *,
+    updated_at: str | None,
+    stale_timeout_sec: float,
+    digest: str,
+    name: str,
+) -> float | None:
+    timestamp = attempt.heartbeat_at
+    if attempt.status == "queued":
+        timestamp = attempt.started_at
+    parsed = StateManager._parse_time(timestamp)
+    if parsed is None:
+        parsed = StateManager._parse_time(updated_at)
+    if parsed is not None:
+        _MISSING_TIMESTAMP_SEEN.pop(digest, None)
+        return (StateManager._utcnow() - parsed).total_seconds()
+    if stale_timeout_sec <= 0:
+        return None
+    now = StateManager._utcnow().timestamp()
+    first_seen = _MISSING_TIMESTAMP_SEEN.get(digest)
+    if first_seen is None:
+        _MISSING_TIMESTAMP_SEEN[digest] = now
+        logger = get_logger()
+        logger.warning(
+            "IN_PROGRESS attempt missing heartbeat/started timestamps for %s; "
+            "deferring stale timeout check.",
+            name,
+        )
+        return None
+    return now - first_seen
+
+
+def reconcile_in_progress(
+    plan: DependencyPlan,
+    *,
+    stale_timeout_sec: float,
+) -> bool:
+    stale_attempts: list[
+        tuple[PlanNode, _StateAttemptQueued | _StateAttemptRunning]
+    ] = []
+    for node in plan.nodes.values():
+        if node.status != "IN_PROGRESS":
+            _MISSING_TIMESTAMP_SEEN.pop(node.obj._furu_hash, None)
+            continue
+        state = StateManager.reconcile(node.obj._base_furu_dir())
+        attempt = state.attempt
+        if not isinstance(attempt, (_StateAttemptQueued, _StateAttemptRunning)):
+            _MISSING_TIMESTAMP_SEEN.pop(node.obj._furu_hash, None)
+            continue
+        if stale_timeout_sec <= 0:
+            continue
+        name = f"{node.obj.__class__.__name__}({node.obj._furu_hash})"
+        age = _attempt_age_sec(
+            attempt,
+            updated_at=state.updated_at,
+            stale_timeout_sec=stale_timeout_sec,
+            digest=node.obj._furu_hash,
+            name=name,
+        )
+        if age is None or age < stale_timeout_sec:
+            continue
+        stale_attempts.append((node, attempt))
+
+    if not stale_attempts:
+        return False
+
+    names = ", ".join(
+        f"{node.obj.__class__.__name__}({node.obj._furu_hash})"
+        for node, _attempt in stale_attempts
+    )
+    if not FURU_CONFIG.retry_failed:
+        raise RuntimeError(
+            "Stale IN_PROGRESS dependencies detected: "
+            f"{names} exceeded {stale_timeout_sec:.1f}s without heartbeat."
+        )
+
+    stale_detected = False
+    for node, attempt in stale_attempts:
+        stale_detected = True
+        StateManager.finish_attempt_preempted(
+            node.obj._base_furu_dir(),
+            attempt_id=attempt.id,
+            error={
+                "type": "StaleHeartbeat",
+                "message": (
+                    f"Attempt stale after {stale_timeout_sec:.1f}s without heartbeat."
+                ),
+            },
+            reason="stale_timeout",
+        )
+        _MISSING_TIMESTAMP_SEEN.pop(node.obj._furu_hash, None)
+    return stale_detected
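topo_order_todo is Kahn's algorithm over the pending-dependency edges with lexicographic tie-breaking on hashes, and ready_todo picks the TODO nodes whose pending dependencies are all DONE. A self-contained toy version of the same ordering logic (no Furu objects involved; the node names are made up):

# Toy graph: each node maps to the set of nodes it depends on.
deps = {"train": {"data"}, "eval": {"train"}, "data": set()}

def topo_order(deps: dict[str, set[str]]) -> list[str]:
    indeg = {node: len(deps[node]) for node in deps}
    dependents = {node: {m for m in deps if node in deps[m]} for node in deps}
    ready = sorted(node for node, deg in indeg.items() if deg == 0)
    out: list[str] = []
    while ready:
        node = ready.pop(0)
        out.append(node)
        for child in dependents[node]:
            indeg[child] -= 1
            if indeg[child] == 0:
                ready.append(child)
        ready.sort()
    if len(out) != len(deps):
        raise ValueError("Cycle detected in dependency graph")
    return out

print(topo_order(deps))  # ['data', 'train', 'eval']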
furu/execution/plan_utils.py
ADDED
@@ -0,0 +1,13 @@
+from __future__ import annotations
+
+from .plan import DependencyPlan, reconcile_in_progress
+
+
+def reconcile_or_timeout_in_progress(
+    plan: DependencyPlan,
+    *,
+    stale_timeout_sec: float,
+) -> bool:
+    if not any(node.status == "IN_PROGRESS" for node in plan.nodes.values()):
+        return False
+    return reconcile_in_progress(plan, stale_timeout_sec=stale_timeout_sec)