furu 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
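
The hunks below add four new modules: a windowed, thread-pool local executor (run_local), submitit log-path helpers (submitit_root_dir, submitit_logs_dir), a dependency planner (furu/execution/plan.py), and a thin wrapper over its reconciliation pass (furu/execution/plan_utils.py).
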
@@ -0,0 +1,186 @@
+ from __future__ import annotations
+
+ import time
+ from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, wait
+
+ from ..config import FURU_CONFIG
+ from ..core import Furu
+ from ..errors import FuruComputeError, FuruError
+ from ..runtime.logging import enter_holder
+ from ..storage.state import StateManager
+ from .context import EXEC_CONTEXT, ExecContext
+ from .plan import PlanNode, build_plan, ready_todo
+ from .plan_utils import reconcile_or_timeout_in_progress
+
+
+ def _normalize_window_size(window_size: str | int, root_count: int) -> int:
+     if root_count == 0:
+         return 0
+     if isinstance(window_size, str):
+         match window_size:
+             case "dfs":
+                 return 1
+             case "bfs":
+                 return root_count
+             case _:
+                 raise ValueError(
+                     "window_size must be 'dfs', 'bfs', or a positive integer"
+                 )
+     if isinstance(window_size, bool) or not isinstance(window_size, int):
+         raise TypeError("window_size must be 'dfs', 'bfs', or a positive integer")
+     if window_size < 1:
+         raise ValueError("window_size must be >= 1")
+     return min(window_size, root_count)
+
+
+ def _run_node(node: PlanNode) -> None:
+     token = EXEC_CONTEXT.set(
+         ExecContext(
+             mode="executor",
+             spec_key=node.spec_key,
+             backend="local",
+             current_node_hash=node.obj.furu_hash,
+         )
+     )
+     try:
+         with enter_holder(node.obj):
+             node.obj.get(force=True)
+     finally:
+         EXEC_CONTEXT.reset(token)
+
+
+ def run_local(
+     roots: list[Furu],
+     *,
+     max_workers: int = 8,
+     window_size: str | int = "bfs",
+     poll_interval_sec: float = 0.25,
+ ) -> None:
+     if not roots:
+         return
+     if max_workers < 1:
+         raise ValueError("max_workers must be >= 1")
+
+     window = _normalize_window_size(window_size, len(roots))
+     active_indices = list(range(min(window, len(roots))))
+     next_index = len(active_indices)
+     inflight: dict[str, Future[None]] = {}
+     completed_hashes: set[str] = set()
+     retry_attempts: dict[str, int] = {}
+
+     with ThreadPoolExecutor(max_workers=max_workers) as executor:
+         while True:
+             active_roots = [roots[index] for index in active_indices]
+             plan = build_plan(active_roots, completed_hashes=completed_hashes)
+
+             ready = [digest for digest in ready_todo(plan) if digest not in inflight]
+             available = max_workers - len(inflight)
+             for digest in ready[:available]:
+                 node = plan.nodes[digest]
+                 inflight[digest] = executor.submit(_run_node, node)
+
+             completed = [digest for digest, future in inflight.items() if future.done()]
+             for digest in completed:
+                 future = inflight.pop(digest)
+                 try:
+                     future.result()
+                 except Exception as exc:
+                     if isinstance(exc, FuruComputeError):
+                         compute_error = exc
+                         wrapped_exc: Exception | None = None
+                     elif isinstance(exc, FuruError):
+                         raise
+                     else:
+                         node = plan.nodes.get(digest)
+                         if node is None:
+                             raise
+                         state_path = StateManager.get_state_path(
+                             node.obj._base_furu_dir()
+                         )
+                         compute_error = FuruComputeError(
+                             "local executor failed for "
+                             f"{node.obj.__class__.__name__}({node.obj.furu_hash})",
+                             state_path,
+                             original_error=exc,
+                         )
+                         wrapped_exc = exc
+                     if not FURU_CONFIG.retry_failed:
+                         if wrapped_exc is not None:
+                             raise compute_error from wrapped_exc
+                         raise compute_error
+                     attempt = retry_attempts.get(digest, 0) + 1
+                     retry_attempts[digest] = attempt
+                     if attempt <= FURU_CONFIG.max_compute_retries:
+                         continue
+                     if wrapped_exc is not None:
+                         raise compute_error from wrapped_exc
+                     raise compute_error
+                 completed_hashes.add(digest)
+                 retry_attempts.pop(digest, None)
+
+             if not FURU_CONFIG.retry_failed:
+                 failed = [
+                     node
+                     for digest, node in plan.nodes.items()
+                     if node.status == "FAILED" and digest not in inflight
+                 ]
+                 if failed:
+                     names = ", ".join(
+                         f"{node.obj.__class__.__name__}({node.obj.furu_hash})"
+                         for node in failed
+                     )
+                     raise RuntimeError(
+                         f"Cannot run local executor with failed dependencies: {names}"
+                     )
+
+             if completed:
+                 continue
+
+             # Avoid a busy-spin loop while waiting for long-running tasks.
+             if inflight and not completed:
+                 wait(
+                     inflight.values(),
+                     timeout=poll_interval_sec,
+                     return_when=FIRST_COMPLETED,
+                 )
+                 continue
+
+             finished_indices = [
+                 index
+                 for index in active_indices
+                 if plan.nodes.get(roots[index].furu_hash) is not None
+                 and plan.nodes[roots[index].furu_hash].status == "DONE"
+             ]
+             for index in finished_indices:
+                 active_indices.remove(index)
+
+             while len(active_indices) < window and next_index < len(roots):
+                 active_indices.append(next_index)
+                 next_index += 1
+
+             if not active_indices and not inflight and next_index >= len(roots):
+                 return
+
+             if not inflight and not ready:
+                 if any(node.status == "IN_PROGRESS" for node in plan.nodes.values()):
+                     stale_detected = reconcile_or_timeout_in_progress(
+                         plan,
+                         stale_timeout_sec=FURU_CONFIG.stale_timeout,
+                     )
+                     if stale_detected:
+                         continue
+                     time.sleep(poll_interval_sec)
+                     continue
+                 todo_nodes = [
+                     node for node in plan.nodes.values() if node.status == "TODO"
+                 ]
+                 if todo_nodes:
+                     sample = ", ".join(
+                         f"{node.obj.__class__.__name__}({node.obj.furu_hash})"
+                         for node in todo_nodes[:3]
+                     )
+                     raise RuntimeError(
+                         "run_local stalled with no progress; "
+                         f"remaining TODO nodes: {sample}"
+                     )
+                 time.sleep(poll_interval_sec)
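
run_local drives a sliding window of root tasks through a thread pool: each iteration rebuilds the dependency plan for the active roots, submits every ready TODO node (up to max_workers), harvests finished futures with retry handling governed by FURU_CONFIG.retry_failed and FURU_CONFIG.max_compute_retries, and refills the window as roots reach DONE. A minimal usage sketch, assuming a hypothetical Furu subclass Train and that this module is importable as furu.execution.local:

    from furu.execution.local import run_local  # module path assumed

    # Train stands in for a user-defined Furu subclass.
    roots = [Train(seed=seed) for seed in range(16)]

    # "bfs" activates all roots at once; "dfs" (window_size=1) drains one
    # root's dependency tree before starting the next; an int caps the window.
    run_local(roots, max_workers=8, window_size="bfs")
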
@@ -0,0 +1,20 @@
+ from pathlib import Path
+
+ from furu.config import FURU_CONFIG
+
+
+ def submitit_root_dir(override: Path | None = None) -> Path:
+     return (override or FURU_CONFIG.get_submitit_root()).resolve()
+
+
+ def submitit_logs_dir(
+     kind: str,
+     spec_key: str,
+     override: Path | None = None,
+     run_id: str | None = None,
+ ) -> Path:
+     root = submitit_root_dir(override)
+     path = root / kind / spec_key
+     if run_id:
+         path = path / run_id
+     return path
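
Both helpers are pure path arithmetic over the configured submitit root. An illustrative layout, assuming FURU_CONFIG.get_submitit_root() resolves to ~/.furu/submitit and hypothetical kind/spec_key values:

    submitit_root_dir()                          # ~/.furu/submitit
    submitit_logs_dir("local", "cpu-small")      # ~/.furu/submitit/local/cpu-small
    submitit_logs_dir("local", "cpu-small", run_id="r1")  # .../cpu-small/r1
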
furu/execution/plan.py ADDED
@@ -0,0 +1,330 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ import time
+ from pathlib import Path
+ from typing import Literal
+
+ from ..config import FURU_CONFIG
+ from ..core import Furu
+ from ..errors import FuruValidationError
+ from ..runtime.logging import get_logger
+ from ..storage.migration import MigrationManager, MigrationRecord
+ from ..storage.state import (
+     StateManager,
+     _StateAttemptFailed,
+     _StateAttemptQueued,
+     _StateAttemptRunning,
+     _FuruState,
+     _StateResultFailed,
+ )
+
+ Status = Literal["DONE", "IN_PROGRESS", "TODO", "FAILED"]
+
+ _MISSING_TIMESTAMP_SEEN: dict[str, float] = {}
+
+
+ @dataclass
+ class PlanNode:
+     obj: Furu
+     status: Status
+     spec_key: str
+     deps_all: set[str]
+     deps_pending: set[str]
+     dependents: set[str]
+
+
+ @dataclass
+ class DependencyPlan:
+     roots: list[Furu]
+     nodes: dict[str, PlanNode]
+
+
+ @dataclass
+ class _PlanCache:
+     migration_records: dict[Path, MigrationRecord | None]
+     alias_targets: dict[Path, Path | None]
+     marker_exists: dict[Path, bool]
+     states: dict[Path, _FuruState]
+
+
+ def _marker_exists(directory: Path, cache: _PlanCache) -> bool:
+     if directory in cache.marker_exists:
+         return cache.marker_exists[directory]
+     exists = StateManager.success_marker_exists(directory)
+     cache.marker_exists[directory] = exists
+     return exists
+
+
+ def _migration_record(directory: Path, cache: _PlanCache) -> MigrationRecord | None:
+     if directory not in cache.migration_records:
+         cache.migration_records[directory] = MigrationManager.read_migration(directory)
+     return cache.migration_records[directory]
+
+
+ def _alias_target_dir(base_dir: Path, cache: _PlanCache) -> Path | None:
+     if base_dir in cache.alias_targets:
+         return cache.alias_targets[base_dir]
+     record = _migration_record(base_dir, cache)
+     if record is None or record.kind != "alias" or record.overwritten_at is not None:
+         cache.alias_targets[base_dir] = None
+         return None
+     if _marker_exists(base_dir, cache):
+         cache.alias_targets[base_dir] = None
+         return None
+     target_dir = MigrationManager.resolve_dir(record, target="from")
+     if _marker_exists(target_dir, cache):
+         cache.alias_targets[base_dir] = target_dir
+         return target_dir
+     cache.alias_targets[base_dir] = None
+     return None
+
+
+ def _state_for(directory: Path, cache: _PlanCache) -> _FuruState:
+     if directory not in cache.states:
+         cache.states[directory] = StateManager.read_state(directory)
+     return cache.states[directory]
+
+
+ def _validate_cached(obj: Furu, *, directory: Path) -> bool:
+     try:
+         return obj._validate()
+     except FuruValidationError as exc:
+         logger = get_logger()
+         logger.warning(
+             "exists %s -> false (validate invalid for %s: %s)",
+             directory,
+             f"{obj.__class__.__name__}({obj.furu_hash})",
+             exc,
+         )
+         return False
+     except Exception as exc:
+         logger = get_logger()
+         logger.exception(
+             "exists %s -> false (validate crashed for %s: %s)",
+             directory,
+             f"{obj.__class__.__name__}({obj.furu_hash})",
+             exc,
+         )
+         return False
+
+
+ def _classify(
+     obj: Furu,
+     completed_hashes: set[str] | None,
+     cache: _PlanCache,
+ ) -> Status:
+     if completed_hashes is not None and obj.furu_hash in completed_hashes:
+         return "DONE"
+     base_dir = obj._base_furu_dir()
+     alias_target = None
+     if not obj._always_rerun():
+         alias_target = _alias_target_dir(base_dir, cache)
+         success_dir = alias_target or base_dir
+         if _marker_exists(success_dir, cache):
+             if _validate_cached(obj, directory=base_dir):
+                 return "DONE"
+
+     state_dir = alias_target or base_dir
+     state = _state_for(state_dir, cache)
+     attempt = state.attempt
+     if isinstance(attempt, (_StateAttemptQueued, _StateAttemptRunning)):
+         return "IN_PROGRESS"
+     if isinstance(state.result, _StateResultFailed) or isinstance(
+         attempt, _StateAttemptFailed
+     ):
+         if FURU_CONFIG.retry_failed:
+             return "TODO"
+         return "FAILED"
+     return "TODO"
+
+
+ def build_plan(
+     roots: list[Furu],
+     *,
+     completed_hashes: set[str] | None = None,
+ ) -> DependencyPlan:
+     cache = _PlanCache(
+         migration_records={},
+         alias_targets={},
+         marker_exists={},
+         states={},
+     )
+     nodes: dict[str, PlanNode] = {}
+     stack = list(roots)
+     seen: set[str] = set()
+
+     while stack:
+         obj = stack.pop()
+         digest = obj.furu_hash
+         if digest in seen:
+             continue
+         seen.add(digest)
+
+         status = _classify(obj, completed_hashes, cache)
+         node = PlanNode(
+             obj=obj,
+             status=status,
+             spec_key=obj._executor_spec_key(),
+             deps_all=set(),
+             deps_pending=set(),
+             dependents=set(),
+         )
+         nodes[digest] = node
+
+         if status != "TODO":
+             continue
+
+         deps = obj._get_dependencies(recursive=False)
+         node.deps_all = {dep.furu_hash for dep in deps}
+         for dep in deps:
+             stack.append(dep)
+
+     for digest, node in nodes.items():
+         if node.status != "TODO":
+             continue
+         node.deps_pending = {
+             dep for dep in node.deps_all if dep in nodes and nodes[dep].status != "DONE"
+         }
+
+     for digest, node in nodes.items():
+         for dep in node.deps_pending:
+             nodes[dep].dependents.add(digest)
+
+     return DependencyPlan(roots=roots, nodes=nodes)
+
+
+ def topo_order_todo(plan: DependencyPlan) -> list[str]:
+     todo = {digest for digest, node in plan.nodes.items() if node.status == "TODO"}
+     indeg = {digest: 0 for digest in todo}
+
+     for digest in todo:
+         node = plan.nodes[digest]
+         for dep in node.deps_pending:
+             if dep in todo:
+                 indeg[digest] += 1
+
+     ready = sorted([digest for digest, deg in indeg.items() if deg == 0])
+     out: list[str] = []
+
+     while ready:
+         digest = ready.pop(0)
+         out.append(digest)
+         for dep in plan.nodes[digest].dependents:
+             if dep not in todo:
+                 continue
+             indeg[dep] -= 1
+             if indeg[dep] == 0:
+                 ready.append(dep)
+         ready.sort()
+
+     if len(out) != len(todo):
+         raise ValueError("Cycle detected in TODO dependency graph")
+     return out
+
+
+ def ready_todo(plan: DependencyPlan) -> list[str]:
+     return sorted(
+         [
+             digest
+             for digest, node in plan.nodes.items()
+             if node.status == "TODO"
+             and all(plan.nodes[dep].status == "DONE" for dep in node.deps_pending)
+         ]
+     )
+
+
+ def _attempt_age_sec(
+     attempt: _StateAttemptQueued | _StateAttemptRunning,
+     *,
+     directory: Path,
+     stale_timeout_sec: float,
+     digest: str,
+     name: str,
+ ) -> float | None:
+     if attempt.status == "queued":
+         parsed = StateManager._parse_time(attempt.started_at)
+         if parsed is not None:
+             _MISSING_TIMESTAMP_SEEN.pop(digest, None)
+             return (StateManager._utcnow() - parsed).total_seconds()
+     else:
+         last_heartbeat = StateManager.last_heartbeat_mtime(directory)
+         if last_heartbeat is not None:
+             _MISSING_TIMESTAMP_SEEN.pop(digest, None)
+             return max(0.0, time.time() - last_heartbeat)
+     if stale_timeout_sec <= 0:
+         return None
+     now = StateManager._utcnow().timestamp()
+     first_seen = _MISSING_TIMESTAMP_SEEN.get(digest)
+     if first_seen is None:
+         _MISSING_TIMESTAMP_SEEN[digest] = now
+         logger = get_logger()
+         logger.warning(
+             "IN_PROGRESS attempt missing heartbeat/started timestamps for %s; "
+             "deferring stale timeout check.",
+             name,
+         )
+         return None
+     return now - first_seen
+
+
+ def reconcile_in_progress(
+     plan: DependencyPlan,
+     *,
+     stale_timeout_sec: float,
+ ) -> bool:
+     stale_attempts: list[
+         tuple[PlanNode, _StateAttemptQueued | _StateAttemptRunning]
+     ] = []
+     for node in plan.nodes.values():
+         if node.status != "IN_PROGRESS":
+             _MISSING_TIMESTAMP_SEEN.pop(node.obj.furu_hash, None)
+             continue
+         state = StateManager.reconcile(node.obj._base_furu_dir())
+         attempt = state.attempt
+         if not isinstance(attempt, (_StateAttemptQueued, _StateAttemptRunning)):
+             _MISSING_TIMESTAMP_SEEN.pop(node.obj.furu_hash, None)
+             continue
+         if stale_timeout_sec <= 0:
+             continue
+         name = f"{node.obj.__class__.__name__}({node.obj.furu_hash})"
+         age = _attempt_age_sec(
+             attempt,
+             directory=node.obj._base_furu_dir(),
+             stale_timeout_sec=stale_timeout_sec,
+             digest=node.obj.furu_hash,
+             name=name,
+         )
+         if age is None or age < stale_timeout_sec:
+             continue
+         stale_attempts.append((node, attempt))
+
+     if not stale_attempts:
+         return False
+
+     names = ", ".join(
+         f"{node.obj.__class__.__name__}({node.obj.furu_hash})"
+         for node, _attempt in stale_attempts
+     )
+     if not FURU_CONFIG.retry_failed:
+         raise RuntimeError(
+             "Stale IN_PROGRESS dependencies detected: "
+             f"{names} exceeded {stale_timeout_sec:.1f}s without heartbeat."
+         )
+
+     stale_detected = False
+     for node, attempt in stale_attempts:
+         stale_detected = True
+         StateManager.finish_attempt_preempted(
+             node.obj._base_furu_dir(),
+             attempt_id=attempt.id,
+             error={
+                 "type": "StaleHeartbeat",
+                 "message": (
+                     f"Attempt stale after {stale_timeout_sec:.1f}s without heartbeat."
+                 ),
+             },
+             reason="stale_timeout",
+         )
+         _MISSING_TIMESTAMP_SEEN.pop(node.obj.furu_hash, None)
+     return stale_detected
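
build_plan walks depth-first from the roots and classifies each node from on-disk state: a validated success marker (possibly behind an un-overwritten alias migration) yields DONE, a queued or running attempt yields IN_PROGRESS, a failed result or attempt yields TODO when FURU_CONFIG.retry_failed is set and FAILED otherwise, and everything else is TODO. Only TODO nodes get dependency edges, so DONE subtrees are never traversed. A consumption sketch, reusing the hypothetical Train subclass from above:

    plan = build_plan([Train(seed=0)])

    # Nodes whose pending dependencies are all satisfied; run_local submits these.
    for digest in ready_todo(plan):
        print("runnable:", plan.nodes[digest].spec_key)

    # Deterministic full ordering over TODO nodes (Kahn's algorithm with sorted
    # tie-breaking); raises ValueError if the TODO subgraph contains a cycle.
    for digest in topo_order_todo(plan):
        node = plan.nodes[digest]
        print(digest, node.status, sorted(node.deps_pending))
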
furu/execution/plan_utils.py ADDED
@@ -0,0 +1,13 @@
+ from __future__ import annotations
+
+ from .plan import DependencyPlan, reconcile_in_progress
+
+
+ def reconcile_or_timeout_in_progress(
+     plan: DependencyPlan,
+     *,
+     stale_timeout_sec: float,
+ ) -> bool:
+     if not any(node.status == "IN_PROGRESS" for node in plan.nodes.values()):
+         return False
+     return reconcile_in_progress(plan, stale_timeout_sec=stale_timeout_sec)
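
This wrapper lets callers skip the StateManager.reconcile pass entirely when no node is IN_PROGRESS; otherwise reconcile_in_progress re-reads state, and any attempt older than stale_timeout_sec either raises (retry_failed off) or is finished as preempted so the node can be rescheduled. A sketch of the polling pattern this enables, mirroring run_local's stall branch:

    import time

    # Hypothetical wait loop for work owned by another process.
    while True:
        plan = build_plan(roots)
        if all(node.status == "DONE" for node in plan.nodes.values()):
            break
        if reconcile_or_timeout_in_progress(
            plan, stale_timeout_sec=FURU_CONFIG.stale_timeout
        ):
            continue  # stale attempts were preempted; rebuild and resubmit
        time.sleep(0.25)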