furu 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
furu/core/list.py CHANGED
@@ -116,7 +116,7 @@ class FuruList(Generic[_H], metaclass=_FuruListMeta):
 
     # Use the collection
     for exp in MyExperiments:
-        result = exp.load_or_create()
+        result = exp.get()
         print(result)
     """
 
furu/dashboard/__init__.py CHANGED
@@ -5,5 +5,14 @@ Install with: uv add furu[dashboard]
 Run with: furu-dashboard serve
 """
 
-__version__ = "0.1.0"
+from importlib import metadata
 
+
+def _resolve_version() -> str:
+    try:
+        return metadata.version("furu")
+    except metadata.PackageNotFoundError:
+        return "0.0.0"
+
+
+__version__ = _resolve_version()
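
The dashboard package now derives `__version__` from the installed distribution's metadata instead of a hardcoded string, so the dashboard version can no longer drift from the released wheel. A minimal standalone sketch of the same stdlib pattern (using `pip` as a stand-in distribution name):

    from importlib import metadata

    def resolve_version(dist_name: str) -> str:
        """Read the installed distribution's version, with a fallback."""
        try:
            # Looks up the version recorded in the installed package's
            # metadata, so no hardcoded version string is needed.
            return metadata.version(dist_name)
        except metadata.PackageNotFoundError:
            # e.g. running from a source checkout that was never installed
            return "0.0.0"

    print(resolve_version("pip"))  # prints the installed pip version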
furu/dashboard/main.py CHANGED
@@ -5,11 +5,12 @@ from pathlib import Path
 
 import typer
 import uvicorn
-from fastapi import FastAPI
+from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import FileResponse
 from fastapi.staticfiles import StaticFiles
 
+from . import __version__
 from .api.routes import router as api_router
 
 
@@ -36,7 +37,7 @@ def create_app(*, serve_frontend: bool = False) -> FastAPI:
     app = FastAPI(
         title="Furu Dashboard",
         description="Monitoring dashboard for Furu experiments",
-        version="0.1.0",
+        version=__version__,
     )
 
     # CORS middleware for development
@@ -64,7 +65,13 @@ def create_app(*, serve_frontend: bool = False) -> FastAPI:
     @app.get("/{full_path:path}")
     async def serve_spa(full_path: str) -> FileResponse:
         """Serve the React SPA for all non-API routes."""
-        file_path = frontend_dir / full_path
+        requested = Path(full_path)
+        if ".." in requested.parts:
+            raise HTTPException(status_code=404, detail="Not found")
+        frontend_root = frontend_dir.resolve()
+        file_path = (frontend_dir / requested).resolve()
+        if not file_path.is_relative_to(frontend_root):
+            raise HTTPException(status_code=404, detail="Not found")
         if file_path.is_file() and not full_path.startswith("api"):
            return FileResponse(file_path)
         return FileResponse(frontend_dir / "index.html")
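
The `serve_spa` change closes a path-traversal hole: the old code joined the user-supplied `full_path` onto `frontend_dir` unchecked. A self-contained sketch of the same two-step guard, with a hypothetical `frontend_dir` (`Path.is_relative_to` requires Python 3.9+):

    from pathlib import Path

    frontend_dir = Path("/srv/dashboard/frontend")  # hypothetical root

    def is_safe(full_path: str) -> bool:
        requested = Path(full_path)
        if ".." in requested.parts:  # reject obvious traversal early
            return False
        resolved = (frontend_dir / requested).resolve()
        # resolve() also follows symlinks, so re-check that the final
        # target still lives under the frontend root.
        return resolved.is_relative_to(frontend_dir.resolve())

    print(is_safe("assets/app.js"))     # True
    print(is_safe("../../etc/passwd"))  # False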
furu/errors.py CHANGED
@@ -30,6 +30,14 @@ class FuruError(Exception):
         return "\n".join(lines)
 
 
+class FuruExecutionError(FuruError):
+    """Raised when executor wiring or scheduling fails."""
+
+
+class FuruValidationError(FuruError):
+    """Raised by `_validate()` to indicate an invalid or missing artifact."""
+
+
 class FuruWaitTimeout(FuruError):
     """Raised when waiting for a result exceeds _max_wait_time_sec."""
 
@@ -70,11 +78,8 @@ class FuruComputeError(FuruError):
         msg = super().__str__()  # ty: ignore[invalid-super-argument]
         internal_dir = self.state_path.parent
         furu_dir = internal_dir.parent
-        log_path = internal_dir / "furu.log"
 
-        msg += f"\n\nDirectory: {furu_dir}"
-        msg += f"\nState file: {self.state_path}"
-        msg += f"\nLog file: {log_path}"
+        msg += f"\n\nFuru dir: {furu_dir}"
 
         if self.recorded_error_type or self.recorded_error_message:
             msg += "\n\nRecorded error (from state.json):"
@@ -116,3 +121,11 @@ class FuruMigrationRequired(FuruError):
         if self.state_path is not None:
             msg += f"\n\nState file: {self.state_path}"
         return msg
+
+
+class FuruMissingArtifact(FuruError):
+    """Raised when a dependency is missing in executor mode."""
+
+
+class FuruSpecMismatch(FuruError):
+    """Raised when executor spec keys do not match."""
furu/execution/__init__.py ADDED
@@ -0,0 +1,22 @@
+"""Execution helpers for Furu."""
+
+from .local import run_local
+from .paths import submitit_logs_dir, submitit_root_dir
+from .slurm_dag import SlurmDagSubmission, submit_slurm_dag
+from .slurm_pool import SlurmPoolRun, run_slurm_pool
+from .slurm_spec import SlurmSpec, SlurmSpecValue, resolve_slurm_spec
+from .submitit_factory import make_executor_for_spec
+
+__all__ = [
+    "SlurmSpec",
+    "SlurmSpecValue",
+    "resolve_slurm_spec",
+    "SlurmDagSubmission",
+    "submit_slurm_dag",
+    "make_executor_for_spec",
+    "SlurmPoolRun",
+    "run_slurm_pool",
+    "run_local",
+    "submitit_logs_dir",
+    "submitit_root_dir",
+]
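
Since every helper is re-exported here, downstream code can import the execution API from the package root rather than from the individual submodules, e.g.:

    from furu.execution import SlurmSpec, run_local, submit_slurm_dag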
furu/execution/context.py ADDED
@@ -0,0 +1,30 @@
+from __future__ import annotations
+
+from contextvars import ContextVar
+from dataclasses import dataclass
+from typing import Literal
+
+Backend = Literal["local", "submitit"]
+
+
+@dataclass(frozen=True)
+class ExecContext:
+    mode: Literal["interactive", "executor"]
+    spec_key: str | None = None
+    backend: Backend | None = None
+    current_node_hash: str | None = None
+
+
+EXEC_CONTEXT: ContextVar[ExecContext] = ContextVar(
+    "FURU_EXEC_CONTEXT",
+    default=ExecContext(
+        mode="interactive",
+        spec_key=None,
+        backend=None,
+        current_node_hash=None,
+    ),
+)
+
+
+def in_executor() -> bool:
+    return EXEC_CONTEXT.get().mode == "executor"
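
`EXEC_CONTEXT` is a `ContextVar`, so executor state stays isolated per thread or task. A small sketch of the set/try/finally/reset discipline that `local.py` below applies around each node (the import path assumes the new module is `furu/execution/context.py`, as inferred from the imports in `local.py`):

    from furu.execution.context import EXEC_CONTEXT, ExecContext, in_executor

    def with_executor_context() -> None:
        token = EXEC_CONTEXT.set(ExecContext(mode="executor", backend="local"))
        try:
            assert in_executor()  # True inside the executor scope
        finally:
            # reset() restores the prior context even if the body raises,
            # keeping concurrent workers from seeing each other's state.
            EXEC_CONTEXT.reset(token)

    with_executor_context()
    assert not in_executor()  # back to the "interactive" default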
furu/execution/local.py ADDED
@@ -0,0 +1,184 @@
+from __future__ import annotations
+
+import time
+from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, wait
+
+from ..config import FURU_CONFIG
+from ..core import Furu
+from ..errors import FuruComputeError, FuruError
+from ..storage.state import StateManager
+from .context import EXEC_CONTEXT, ExecContext
+from .plan import PlanNode, build_plan, ready_todo
+from .plan_utils import reconcile_or_timeout_in_progress
+
+
+def _normalize_window_size(window_size: str | int, root_count: int) -> int:
+    if root_count == 0:
+        return 0
+    if isinstance(window_size, str):
+        match window_size:
+            case "dfs":
+                return 1
+            case "bfs":
+                return root_count
+            case _:
+                raise ValueError(
+                    "window_size must be 'dfs', 'bfs', or a positive integer"
+                )
+    if isinstance(window_size, bool) or not isinstance(window_size, int):
+        raise TypeError("window_size must be 'dfs', 'bfs', or a positive integer")
+    if window_size < 1:
+        raise ValueError("window_size must be >= 1")
+    return min(window_size, root_count)
+
+
+def _run_node(node: PlanNode) -> None:
+    token = EXEC_CONTEXT.set(
+        ExecContext(
+            mode="executor",
+            spec_key=node.spec_key,
+            backend="local",
+            current_node_hash=node.obj._furu_hash,
+        )
+    )
+    try:
+        node.obj.get(force=True)
+    finally:
+        EXEC_CONTEXT.reset(token)
+
+
+def run_local(
+    roots: list[Furu],
+    *,
+    max_workers: int = 8,
+    window_size: str | int = "bfs",
+    poll_interval_sec: float = 0.25,
+) -> None:
+    if not roots:
+        return
+    if max_workers < 1:
+        raise ValueError("max_workers must be >= 1")
+
+    window = _normalize_window_size(window_size, len(roots))
+    active_indices = list(range(min(window, len(roots))))
+    next_index = len(active_indices)
+    inflight: dict[str, Future[None]] = {}
+    completed_hashes: set[str] = set()
+    retry_attempts: dict[str, int] = {}
+
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        while True:
+            active_roots = [roots[index] for index in active_indices]
+            plan = build_plan(active_roots, completed_hashes=completed_hashes)
+
+            ready = [digest for digest in ready_todo(plan) if digest not in inflight]
+            available = max_workers - len(inflight)
+            for digest in ready[:available]:
+                node = plan.nodes[digest]
+                inflight[digest] = executor.submit(_run_node, node)
+
+            completed = [digest for digest, future in inflight.items() if future.done()]
+            for digest in completed:
+                future = inflight.pop(digest)
+                try:
+                    future.result()
+                except Exception as exc:
+                    if isinstance(exc, FuruComputeError):
+                        compute_error = exc
+                        wrapped_exc: Exception | None = None
+                    elif isinstance(exc, FuruError):
+                        raise
+                    else:
+                        node = plan.nodes.get(digest)
+                        if node is None:
+                            raise
+                        state_path = StateManager.get_state_path(
+                            node.obj._base_furu_dir()
+                        )
+                        compute_error = FuruComputeError(
+                            "local executor failed for "
+                            f"{node.obj.__class__.__name__}({node.obj._furu_hash})",
+                            state_path,
+                            original_error=exc,
+                        )
+                        wrapped_exc = exc
+                    if not FURU_CONFIG.retry_failed:
+                        if wrapped_exc is not None:
+                            raise compute_error from wrapped_exc
+                        raise compute_error
+                    attempt = retry_attempts.get(digest, 0) + 1
+                    retry_attempts[digest] = attempt
+                    if attempt <= FURU_CONFIG.max_compute_retries:
+                        continue
+                    if wrapped_exc is not None:
+                        raise compute_error from wrapped_exc
+                    raise compute_error
+                completed_hashes.add(digest)
+                retry_attempts.pop(digest, None)
+
+            if not FURU_CONFIG.retry_failed:
+                failed = [
+                    node
+                    for digest, node in plan.nodes.items()
+                    if node.status == "FAILED" and digest not in inflight
+                ]
+                if failed:
+                    names = ", ".join(
+                        f"{node.obj.__class__.__name__}({node.obj._furu_hash})"
+                        for node in failed
+                    )
+                    raise RuntimeError(
+                        f"Cannot run local executor with failed dependencies: {names}"
+                    )
+
+            if completed:
+                continue
+
+            # Avoid a busy-spin loop while waiting for long-running tasks.
+            if inflight and not completed:
+                wait(
+                    inflight.values(),
+                    timeout=poll_interval_sec,
+                    return_when=FIRST_COMPLETED,
+                )
+                continue
+
+            finished_indices = [
+                index
+                for index in active_indices
+                if plan.nodes.get(roots[index]._furu_hash) is not None
+                and plan.nodes[roots[index]._furu_hash].status == "DONE"
+            ]
+            for index in finished_indices:
+                active_indices.remove(index)
+
+            while len(active_indices) < window and next_index < len(roots):
+                active_indices.append(next_index)
+                next_index += 1
+
+            if not active_indices and not inflight and next_index >= len(roots):
+                return
+
+            if not inflight and not ready:
+                if any(node.status == "IN_PROGRESS" for node in plan.nodes.values()):
+                    stale_detected = reconcile_or_timeout_in_progress(
+                        plan,
+                        stale_timeout_sec=FURU_CONFIG.stale_timeout,
+                    )
+                    if stale_detected:
+                        continue
+                    time.sleep(poll_interval_sec)
+                    continue
+                todo_nodes = [
+                    node for node in plan.nodes.values() if node.status == "TODO"
+                ]
+                if todo_nodes:
+                    sample = ", ".join(
+                        f"{node.obj.__class__.__name__}({node.obj._furu_hash})"
+                        for node in todo_nodes[:3]
+                    )
+                    raise RuntimeError(
+                        "run_local stalled with no progress; "
+                        f"remaining TODO nodes: {sample}"
+                    )
+            time.sleep(poll_interval_sec)
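
A worked example of the windowing rule in `_normalize_window_size`: `"dfs"` keeps one root's subtree active at a time, `"bfs"` activates every root at once, and an integer caps the active roots at `min(window_size, len(roots))`. Illustratively (this assumes the module lands at `furu/execution/local.py` as inferred above, and calls the private helper only for demonstration):

    from furu.execution.local import _normalize_window_size

    assert _normalize_window_size("dfs", 5) == 1
    assert _normalize_window_size("bfs", 5) == 5
    assert _normalize_window_size(3, 5) == 3
    assert _normalize_window_size(9, 5) == 5  # clamped to the root count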
furu/execution/paths.py ADDED
@@ -0,0 +1,20 @@
+from pathlib import Path
+
+from furu.config import FURU_CONFIG
+
+
+def submitit_root_dir(override: Path | None = None) -> Path:
+    return (override or FURU_CONFIG.get_submitit_root()).resolve()
+
+
+def submitit_logs_dir(
+    kind: str,
+    spec_key: str,
+    override: Path | None = None,
+    run_id: str | None = None,
+) -> Path:
+    root = submitit_root_dir(override)
+    path = root / kind / spec_key
+    if run_id:
+        path = path / run_id
+    return path
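
`submitit_logs_dir` simply composes `root/kind/spec_key[/run_id]`. A quick sketch with a hypothetical override root and made-up `kind`/`spec_key` values (the expected output assumes the override path contains no symlinks, since the root is resolved):

    from pathlib import Path

    from furu.execution.paths import submitit_logs_dir

    p = submitit_logs_dir(
        "dag",                # hypothetical kind
        "gpu-small",          # hypothetical spec key
        override=Path("/scratch/submitit"),
        run_id="run-001",
    )
    print(p)  # /scratch/submitit/dag/gpu-small/run-001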
furu/execution/plan.py ADDED
@@ -0,0 +1,238 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Literal
+
+from ..config import FURU_CONFIG
+from ..core import Furu
+from ..runtime.logging import get_logger
+from ..storage.state import (
+    StateManager,
+    _StateAttemptFailed,
+    _StateAttemptQueued,
+    _StateAttemptRunning,
+    _StateResultFailed,
+)
+
+Status = Literal["DONE", "IN_PROGRESS", "TODO", "FAILED"]
+
+_MISSING_TIMESTAMP_SEEN: dict[str, float] = {}
+
+
+@dataclass
+class PlanNode:
+    obj: Furu
+    status: Status
+    spec_key: str
+    deps_all: set[str]
+    deps_pending: set[str]
+    dependents: set[str]
+
+
+@dataclass
+class DependencyPlan:
+    roots: list[Furu]
+    nodes: dict[str, PlanNode]
+
+
+def _classify(obj: Furu, completed_hashes: set[str] | None) -> Status:
+    if completed_hashes is not None and obj._furu_hash in completed_hashes:
+        return "DONE"
+    if obj._exists_quiet() and not obj._always_rerun():
+        return "DONE"
+
+    state = obj.get_state()
+    attempt = state.attempt
+    if isinstance(attempt, (_StateAttemptQueued, _StateAttemptRunning)):
+        return "IN_PROGRESS"
+    if isinstance(state.result, _StateResultFailed) or isinstance(
+        attempt, _StateAttemptFailed
+    ):
+        if FURU_CONFIG.retry_failed:
+            return "TODO"
+        return "FAILED"
+    return "TODO"
+
+
+def build_plan(
+    roots: list[Furu],
+    *,
+    completed_hashes: set[str] | None = None,
+) -> DependencyPlan:
+    nodes: dict[str, PlanNode] = {}
+    stack = list(roots)
+    seen: set[str] = set()
+
+    while stack:
+        obj = stack.pop()
+        digest = obj._furu_hash
+        if digest in seen:
+            continue
+        seen.add(digest)
+
+        status = _classify(obj, completed_hashes)
+        node = PlanNode(
+            obj=obj,
+            status=status,
+            spec_key=obj._executor_spec_key(),
+            deps_all=set(),
+            deps_pending=set(),
+            dependents=set(),
+        )
+        nodes[digest] = node
+
+        if status != "TODO":
+            continue
+
+        deps = obj._get_dependencies(recursive=False)
+        node.deps_all = {dep._furu_hash for dep in deps}
+        for dep in deps:
+            stack.append(dep)
+
+    for digest, node in nodes.items():
+        if node.status != "TODO":
+            continue
+        node.deps_pending = {
+            dep for dep in node.deps_all if dep in nodes and nodes[dep].status != "DONE"
+        }
+
+    for digest, node in nodes.items():
+        for dep in node.deps_pending:
+            nodes[dep].dependents.add(digest)
+
+    return DependencyPlan(roots=roots, nodes=nodes)
+
+
+def topo_order_todo(plan: DependencyPlan) -> list[str]:
+    todo = {digest for digest, node in plan.nodes.items() if node.status == "TODO"}
+    indeg = {digest: 0 for digest in todo}
+
+    for digest in todo:
+        node = plan.nodes[digest]
+        for dep in node.deps_pending:
+            if dep in todo:
+                indeg[digest] += 1
+
+    ready = sorted([digest for digest, deg in indeg.items() if deg == 0])
+    out: list[str] = []
+
+    while ready:
+        digest = ready.pop(0)
+        out.append(digest)
+        for dep in plan.nodes[digest].dependents:
+            if dep not in todo:
+                continue
+            indeg[dep] -= 1
+            if indeg[dep] == 0:
+                ready.append(dep)
+        ready.sort()
+
+    if len(out) != len(todo):
+        raise ValueError("Cycle detected in TODO dependency graph")
+    return out
+
+
+def ready_todo(plan: DependencyPlan) -> list[str]:
+    return sorted(
+        [
+            digest
+            for digest, node in plan.nodes.items()
+            if node.status == "TODO"
+            and all(plan.nodes[dep].status == "DONE" for dep in node.deps_pending)
+        ]
+    )
+
+
+def _attempt_age_sec(
+    attempt: _StateAttemptQueued | _StateAttemptRunning,
+    *,
+    updated_at: str | None,
+    stale_timeout_sec: float,
+    digest: str,
+    name: str,
+) -> float | None:
+    timestamp = attempt.heartbeat_at
+    if attempt.status == "queued":
+        timestamp = attempt.started_at
+    parsed = StateManager._parse_time(timestamp)
+    if parsed is None:
+        parsed = StateManager._parse_time(updated_at)
+    if parsed is not None:
+        _MISSING_TIMESTAMP_SEEN.pop(digest, None)
+        return (StateManager._utcnow() - parsed).total_seconds()
+    if stale_timeout_sec <= 0:
+        return None
+    now = StateManager._utcnow().timestamp()
+    first_seen = _MISSING_TIMESTAMP_SEEN.get(digest)
+    if first_seen is None:
+        _MISSING_TIMESTAMP_SEEN[digest] = now
+        logger = get_logger()
+        logger.warning(
+            "IN_PROGRESS attempt missing heartbeat/started timestamps for %s; "
+            "deferring stale timeout check.",
+            name,
+        )
+        return None
+    return now - first_seen
+
+
+def reconcile_in_progress(
+    plan: DependencyPlan,
+    *,
+    stale_timeout_sec: float,
+) -> bool:
+    stale_attempts: list[
+        tuple[PlanNode, _StateAttemptQueued | _StateAttemptRunning]
+    ] = []
+    for node in plan.nodes.values():
+        if node.status != "IN_PROGRESS":
+            _MISSING_TIMESTAMP_SEEN.pop(node.obj._furu_hash, None)
+            continue
+        state = StateManager.reconcile(node.obj._base_furu_dir())
+        attempt = state.attempt
+        if not isinstance(attempt, (_StateAttemptQueued, _StateAttemptRunning)):
+            _MISSING_TIMESTAMP_SEEN.pop(node.obj._furu_hash, None)
+            continue
+        if stale_timeout_sec <= 0:
+            continue
+        name = f"{node.obj.__class__.__name__}({node.obj._furu_hash})"
+        age = _attempt_age_sec(
+            attempt,
+            updated_at=state.updated_at,
+            stale_timeout_sec=stale_timeout_sec,
+            digest=node.obj._furu_hash,
+            name=name,
+        )
+        if age is None or age < stale_timeout_sec:
+            continue
+        stale_attempts.append((node, attempt))
+
+    if not stale_attempts:
+        return False
+
+    names = ", ".join(
+        f"{node.obj.__class__.__name__}({node.obj._furu_hash})"
+        for node, _attempt in stale_attempts
+    )
+    if not FURU_CONFIG.retry_failed:
+        raise RuntimeError(
+            "Stale IN_PROGRESS dependencies detected: "
+            f"{names} exceeded {stale_timeout_sec:.1f}s without heartbeat."
+        )
+
+    stale_detected = False
+    for node, attempt in stale_attempts:
+        stale_detected = True
+        StateManager.finish_attempt_preempted(
+            node.obj._base_furu_dir(),
+            attempt_id=attempt.id,
+            error={
+                "type": "StaleHeartbeat",
+                "message": (
+                    f"Attempt stale after {stale_timeout_sec:.1f}s without heartbeat."
+                ),
+            },
+            reason="stale_timeout",
+        )
+        _MISSING_TIMESTAMP_SEEN.pop(node.obj._furu_hash, None)
+    return stale_detected
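
`topo_order_todo` is a Kahn-style topological sort over only the TODO subgraph, with sorted tie-breaking so the order is deterministic across runs. A generic restatement of that rule, decoupled from `Furu` objects (`deps` maps each node to the nodes it depends on):

    def topo(deps: dict[str, set[str]]) -> list[str]:
        # In-degree counts only edges between nodes present in `deps`.
        indeg = {n: sum(1 for d in deps[n] if d in deps) for n in deps}
        dependents = {n: {m for m in deps if n in deps[m]} for n in deps}
        ready = sorted(n for n, deg in indeg.items() if deg == 0)
        out: list[str] = []
        while ready:
            n = ready.pop(0)
            out.append(n)
            for m in dependents[n]:
                indeg[m] -= 1
                if indeg[m] == 0:
                    ready.append(m)
            ready.sort()  # deterministic tie-breaking, as in plan.py
        if len(out) != len(deps):
            raise ValueError("Cycle detected")
        return out

    print(topo({"a": set(), "b": {"a"}, "c": {"a"}, "d": {"b", "c"}}))
    # ['a', 'b', 'c', 'd']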
furu/execution/plan_utils.py ADDED
@@ -0,0 +1,13 @@
+from __future__ import annotations
+
+from .plan import DependencyPlan, reconcile_in_progress
+
+
+def reconcile_or_timeout_in_progress(
+    plan: DependencyPlan,
+    *,
+    stale_timeout_sec: float,
+) -> bool:
+    if not any(node.status == "IN_PROGRESS" for node in plan.nodes.values()):
+        return False
+    return reconcile_in_progress(plan, stale_timeout_sec=stale_timeout_sec)