furu 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
- furu/__init__.py +8 -0
- furu/adapters/submitit.py +23 -2
- furu/config.py +13 -1
- furu/core/furu.py +355 -196
- furu/core/list.py +1 -1
- furu/dashboard/__init__.py +10 -1
- furu/dashboard/main.py +10 -3
- furu/errors.py +17 -4
- furu/execution/__init__.py +22 -0
- furu/execution/context.py +30 -0
- furu/execution/local.py +184 -0
- furu/execution/paths.py +20 -0
- furu/execution/plan.py +238 -0
- furu/execution/plan_utils.py +13 -0
- furu/execution/slurm_dag.py +271 -0
- furu/execution/slurm_pool.py +878 -0
- furu/execution/slurm_spec.py +38 -0
- furu/execution/submitit_factory.py +47 -0
- furu/runtime/logging.py +10 -10
- furu/storage/state.py +34 -6
- {furu-0.0.3.dist-info → furu-0.0.4.dist-info}/METADATA +74 -37
- {furu-0.0.3.dist-info → furu-0.0.4.dist-info}/RECORD +24 -14
- {furu-0.0.3.dist-info → furu-0.0.4.dist-info}/WHEEL +0 -0
- {furu-0.0.3.dist-info → furu-0.0.4.dist-info}/entry_points.txt +0 -0
furu/__init__.py
CHANGED
|
@@ -17,8 +17,12 @@ from .core import DependencyChzSpec, DependencySpec, Furu, FuruList
|
|
|
17
17
|
from .errors import (
|
|
18
18
|
FuruComputeError,
|
|
19
19
|
FuruError,
|
|
20
|
+
FuruExecutionError,
|
|
20
21
|
FuruLockNotAcquired,
|
|
22
|
+
FuruMissingArtifact,
|
|
21
23
|
FuruMigrationRequired,
|
|
24
|
+
FuruSpecMismatch,
|
|
25
|
+
FuruValidationError,
|
|
22
26
|
FuruWaitTimeout,
|
|
23
27
|
MISSING,
|
|
24
28
|
)
|
|
@@ -51,9 +55,13 @@ __all__ = [
|
|
|
51
55
|
"FuruComputeError",
|
|
52
56
|
"FuruConfig",
|
|
53
57
|
"FuruError",
|
|
58
|
+
"FuruExecutionError",
|
|
54
59
|
"FuruList",
|
|
55
60
|
"FuruLockNotAcquired",
|
|
61
|
+
"FuruMissingArtifact",
|
|
56
62
|
"FuruMigrationRequired",
|
|
63
|
+
"FuruSpecMismatch",
|
|
64
|
+
"FuruValidationError",
|
|
57
65
|
"FuruSerializer",
|
|
58
66
|
"FuruWaitTimeout",
|
|
59
67
|
"DependencyChzSpec",
|
furu/adapters/submitit.py
CHANGED
|
@@ -6,6 +6,7 @@ from typing import Any, Callable, Protocol
|
|
|
6
6
|
|
|
7
7
|
from ..config import FURU_CONFIG
|
|
8
8
|
from ..storage import StateManager
|
|
9
|
+
from ..runtime.logging import get_logger
|
|
9
10
|
from ..storage.state import _FuruState, ProbeResult
|
|
10
11
|
|
|
11
12
|
|
|
@@ -102,19 +103,39 @@ class SubmititAdapter:
|
|
|
102
103
|
"""Watch for job ID in background thread and update state."""
|
|
103
104
|
|
|
104
105
|
def watcher():
|
|
106
|
+
_ = attempt_id # intentionally unused; queued->running attempt swap is expected
|
|
105
107
|
while True:
|
|
106
108
|
job_id = self.get_job_id(job)
|
|
107
109
|
if job_id:
|
|
108
110
|
|
|
109
111
|
def mutate(state: _FuruState) -> None:
|
|
110
112
|
attempt = state.attempt
|
|
111
|
-
if attempt is None
|
|
113
|
+
if attempt is None:
|
|
114
|
+
return
|
|
115
|
+
if attempt.backend != "submitit":
|
|
116
|
+
return
|
|
117
|
+
if (
|
|
118
|
+
attempt.status not in {"queued", "running"}
|
|
119
|
+
and attempt.status not in StateManager.TERMINAL_STATUSES
|
|
120
|
+
):
|
|
121
|
+
return
|
|
122
|
+
existing = attempt.scheduler.get("job_id")
|
|
123
|
+
if existing == job_id:
|
|
112
124
|
return
|
|
113
125
|
attempt.scheduler["job_id"] = job_id
|
|
114
126
|
|
|
115
127
|
StateManager.update_state(directory, mutate)
|
|
116
128
|
if callback:
|
|
117
|
-
|
|
129
|
+
try:
|
|
130
|
+
callback(job_id)
|
|
131
|
+
except Exception:
|
|
132
|
+
# Avoid killing the watcher thread; state update already happened.
|
|
133
|
+
logger = get_logger()
|
|
134
|
+
logger.exception(
|
|
135
|
+
"submitit watcher: job_id callback failed for %s: %s",
|
|
136
|
+
directory,
|
|
137
|
+
job_id,
|
|
138
|
+
)
|
|
118
139
|
break
|
|
119
140
|
|
|
120
141
|
if self.is_done(job):
|
furu/config.py
CHANGED
|
@@ -18,6 +18,11 @@ class FuruConfig:
|
|
|
18
18
|
return (project_root / self.DEFAULT_ROOT_DIR).resolve()
|
|
19
19
|
|
|
20
20
|
self.base_root = _get_base_root()
|
|
21
|
+
self.submitit_root = (
|
|
22
|
+
Path(os.getenv("FURU_SUBMITIT_PATH", str(self.base_root / "submitit")))
|
|
23
|
+
.expanduser()
|
|
24
|
+
.resolve()
|
|
25
|
+
)
|
|
21
26
|
self.version_controlled_root_override = self._get_version_controlled_override()
|
|
22
27
|
self.poll_interval = float(os.getenv("FURU_POLL_INTERVAL_SECS", "10"))
|
|
23
28
|
self.wait_log_every_sec = float(os.getenv("FURU_WAIT_LOG_EVERY_SECS", "10"))
|
|
@@ -30,6 +35,7 @@ class FuruConfig:
|
|
|
30
35
|
float(hb) if hb is not None else max(1.0, self.lease_duration_sec / 3.0)
|
|
31
36
|
)
|
|
32
37
|
self.max_requeues = int(os.getenv("FURU_PREEMPT_MAX", "5"))
|
|
38
|
+
self.max_compute_retries = int(os.getenv("FURU_MAX_COMPUTE_RETRIES", "3"))
|
|
33
39
|
self.retry_failed = os.getenv("FURU_RETRY_FAILED", "1").lower() in {
|
|
34
40
|
"1",
|
|
35
41
|
"true",
|
|
@@ -109,6 +115,9 @@ class FuruConfig:
|
|
|
109
115
|
return self._resolve_version_controlled_root()
|
|
110
116
|
return self.base_root / "data"
|
|
111
117
|
|
|
118
|
+
def get_submitit_root(self) -> Path:
|
|
119
|
+
return self.submitit_root
|
|
120
|
+
|
|
112
121
|
@classmethod
|
|
113
122
|
def _get_version_controlled_override(cls) -> Path | None:
|
|
114
123
|
env = os.getenv("FURU_VERSION_CONTROLLED_PATH")
|
|
@@ -175,4 +184,7 @@ def get_furu_root(*, version_controlled: bool = False) -> Path:
|
|
|
175
184
|
|
|
176
185
|
|
|
177
186
|
def set_furu_root(path: Path) -> None:
|
|
178
|
-
|
|
187
|
+
root = path.resolve()
|
|
188
|
+
FURU_CONFIG.base_root = root
|
|
189
|
+
if os.getenv("FURU_SUBMITIT_PATH") is None:
|
|
190
|
+
FURU_CONFIG.submitit_root = (root / "submitit").resolve()
|