starforge-kernel 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- starforge/__init__.py +86 -0
- starforge/core/__init__.py +0 -0
- starforge/core/checkpoints.py +178 -0
- starforge/core/figures.py +119 -0
- starforge/core/previews.py +109 -0
- starforge/core/provenance.py +192 -0
- starforge/core/runner.py +293 -0
- starforge/core/serializers.py +141 -0
- starforge/core/spec.py +126 -0
- starforge/index/__init__.py +9 -0
- starforge/index/scanner.py +487 -0
- starforge/kernel/__init__.py +0 -0
- starforge/kernel/__main__.py +3 -0
- starforge/kernel/server.py +351 -0
- starforge/kernel/worker.py +66 -0
- starforge/mcp.py +283 -0
- starforge_kernel-0.1.0.dist-info/METADATA +76 -0
- starforge_kernel-0.1.0.dist-info/RECORD +20 -0
- starforge_kernel-0.1.0.dist-info/WHEEL +5 -0
- starforge_kernel-0.1.0.dist-info/top_level.txt +1 -0
starforge/__init__.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""*Forge — pipeline canvas for the repo you already have open.
|
|
2
|
+
|
|
3
|
+
This top-level module is the entire public surface that user code touches.
|
|
4
|
+
It must import in microseconds and depend on nothing: the decorator lives in
|
|
5
|
+
production codebases and has to be free. Everything heavy (indexer, engine,
|
|
6
|
+
kernel) lives in submodules that only *Forge itself* imports.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
__version__ = "0.1.0"
|
|
12
|
+
|
|
13
|
+
__all__ = ["block", "progress", "BLOCK_ATTR"]
|
|
14
|
+
|
|
15
|
+
#: Attribute set on decorated functions. The AST indexer matches the decorator
#: syntactically and never imports user code; this runtime tag exists so user
#: code and future runtime introspection can also recognize blocks.
BLOCK_ATTR = "__starforge_block__"


def block(fn=None, *, label=None, category=None, outputs=None):
    """Register a function as a *Forge block.

    Usable bare or with keyword arguments::

        @block
        def clean(raw: pd.DataFrame) -> pd.DataFrame: ...

        @block(label="Clean AUC Matrix", category="QC", outputs=("clean", "stats"))
        def clean_auc(raw, min_coverage: float = 0.8): ...

    The decorated function is returned unchanged — behavior under pytest, in
    CI, or in production is identical whether or not *Forge is anywhere near.
    The only side effect is a metadata dict stored on the function under
    ``BLOCK_ATTR`` with the keys ``label``, ``category`` and ``outputs``.

    Args:
        fn: The function being decorated when used bare (``@block``); left as
            None when the decorator is called with keyword arguments.
        label: Palette display name. Defaults to the function name, title-cased.
        category: Palette grouping. Defaults to the defining module's path.
        outputs: Names for multiple return values (function must return a tuple
            of the same length). Defaults to a single output named "output".
            Any iterable of names is stored as a tuple; a bare string is
            stored as a one-element tuple rather than split into characters.

    Returns:
        The decorated function itself when used bare, otherwise a decorator
        that tags and returns the function it receives.

    Note for palette metadata: the indexer reads ``label``/``category``/
    ``outputs`` from the *source*, so they must be literals at the decoration
    site to appear in the palette.
    """
    if outputs is None:
        output_names = None
    elif isinstance(outputs, str):
        # tuple("clean") would yield ('c', 'l', 'e', 'a', 'n'); a lone string
        # means exactly one output name.
        # NOTE(review): whether the AST indexer also accepts a bare string
        # literal here is not visible from this module — confirm.
        output_names = (outputs,)
    else:
        output_names = tuple(outputs)

    def apply(f):
        setattr(
            f,
            BLOCK_ATTR,
            {
                "label": label,
                "category": category,
                "outputs": output_names,
            },
        )
        return f

    if fn is not None:
        return apply(fn)
    return apply
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
#: Installed by the *Forge run worker around each block call; None everywhere
#: else, which keeps progress() a guaranteed no-op in pytest/CI/production.
_progress_hook = None


def progress(current=None, total=None, label=None):
    """Report block progress to the *Forge canvas.

    Call freely inside a block::

        for i, chunk in enumerate(chunks):
            progress(i + 1, len(chunks), "fitting folds")
            ...

    Outside a *Forge run this does nothing and costs one attribute read —
    safe to leave in production code. Any combination of arguments works:
    (current, total) renders a determinate bar, label alone updates the text.

    The three arguments are forwarded positionally to the installed hook.
    Any ``Exception`` the hook raises is swallowed; the call always returns
    None.
    """
    reporter = _progress_hook
    if reporter is None:
        return
    try:
        reporter(current, total, label)
    except Exception:
        # progress must never break user code
        return
|
|
File without changes
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
"""Checkpoint store under ``<workspace>/.forge/checkpoints/``.
|
|
2
|
+
|
|
3
|
+
One directory per history hash (truncated to 32 hex chars — 128 bits — to
|
|
4
|
+
stay friendly to Windows path limits):
|
|
5
|
+
|
|
6
|
+
.forge/checkpoints/<hash32>/
|
|
7
|
+
├── provenance.json # written LAST: its presence marks completeness
|
|
8
|
+
└── outputs/<name>.<ext per serializer>
|
|
9
|
+
|
|
10
|
+
The store also owns ``.forge/.gitignore`` so checkpoints and caches never
|
|
11
|
+
land in the user's repo history while ``pipelines/`` remains committable.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import json
|
|
17
|
+
import os
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
import shutil
|
|
20
|
+
import time
|
|
21
|
+
from typing import Any
|
|
22
|
+
|
|
23
|
+
from starforge.core import figures as figmod
|
|
24
|
+
from starforge.core import previews, serializers
|
|
25
|
+
|
|
26
|
+
FORGE_DIR = ".forge"
|
|
27
|
+
GITIGNORE_BODY = "checkpoints/\ncache/\n"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class CheckpointStore:
    """Filesystem checkpoint store rooted at ``<workspace>/.forge``.

    Each checkpoint lives in ``checkpoints/<first 32 chars of history hash>/``
    and is considered complete once its ``provenance.json`` exists.
    """

    def __init__(self, workspace: str | Path) -> None:
        """Record the workspace root and derive the store paths (no I/O)."""
        self.workspace = Path(workspace)
        self.forge_dir = self.workspace / FORGE_DIR
        self.base = self.forge_dir / "checkpoints"

    def ensure_layout(self) -> None:
        """Create the ``pipelines``, ``cache`` and ``checkpoints`` directories
        and write ``.gitignore`` if it does not already exist."""
        (self.forge_dir / "pipelines").mkdir(parents=True, exist_ok=True)
        (self.forge_dir / "cache").mkdir(parents=True, exist_ok=True)
        self.base.mkdir(parents=True, exist_ok=True)
        gitignore = self.forge_dir / ".gitignore"
        if not gitignore.exists():
            # Never overwrite: the user may have edited the file.
            gitignore.write_text(GITIGNORE_BODY, encoding="utf-8")

    def dir_for(self, history_hash: str) -> Path:
        """Return the checkpoint directory for ``history_hash`` (truncated to
        its first 32 characters)."""
        return self.base / history_hash[:32]

    def exists(self, history_hash: str) -> bool:
        """True iff the checkpoint's ``provenance.json`` is a regular file."""
        return (self.dir_for(history_hash) / "provenance.json").is_file()

    def read_provenance(self, history_hash: str) -> dict[str, Any]:
        """Load and return the parsed ``provenance.json``.

        Raises whatever ``Path.read_text`` / ``json.loads`` raise when the
        file is missing or malformed.
        """
        path = self.dir_for(history_hash) / "provenance.json"
        return json.loads(path.read_text(encoding="utf-8"))

    def write(
        self,
        history_hash: str,
        provenance: dict[str, Any],
        outputs: dict[str, Any],
        pickle_enabled: bool = False,
        side_figures: list[Any] | None = None,
    ) -> list[dict[str, Any]]:
        """Persist outputs then provenance (in that order, for atomicity).
        Returns the output manifest, including ephemeral entries.

        ``side_figures`` are figures the block created or showed without
        returning them (plt.show() and friends); they render to artifacts
        recorded under the provenance ``figures`` key."""
        directory = self.dir_for(history_hash)
        outputs_dir = directory / "outputs"
        directory.mkdir(parents=True, exist_ok=True)
        manifest = []
        for name, value in outputs.items():
            entry = serializers.save_value(value, outputs_dir, name, pickle_enabled=pickle_enabled)
            try:
                # Previews ride inside provenance.json so the stdlib-only
                # kernel can serve them without deserializing data. Computed
                # for ephemeral outputs too — their only window is right now.
                if entry.get("artifact"):
                    entry["preview"] = {
                        "kind": "figure",
                        "file": entry["artifact"]["file"],
                        "format": entry["artifact"]["kind"],
                    }
                else:
                    entry["preview"] = previews.build_preview(value)
            except Exception:
                # A failed preview must not cost the checkpoint itself.
                entry["preview"] = {"kind": "text", "text": f"<preview failed for {type(value).__name__}>"}
            manifest.append(entry)

        rendered_figures: list[dict[str, Any]] = []
        for i, fig in enumerate(side_figures or []):
            try:
                artifact = figmod.render_figure(fig, outputs_dir, f"figure_{i}")
            except Exception:
                # Best-effort: a figure that fails to render is dropped.
                artifact = None
            if artifact is not None:
                rendered_figures.append(artifact)

        record = dict(provenance)  # shallow copy; the caller's dict is untouched
        record["history_hash"] = history_hash
        record["outputs"] = manifest
        record["figures"] = rendered_figures
        record["dir"] = directory.relative_to(self.workspace).as_posix()
        path = directory / "provenance.json"
        tmp = directory / "provenance.json.tmp"
        # Write to a temp file and rename so provenance.json never appears
        # half-written; its presence is the completeness marker.
        tmp.write_text(json.dumps(record, indent=2, default=repr), encoding="utf-8")
        tmp.replace(path)
        return manifest

    def output_entry(self, history_hash: str, name: str) -> dict[str, Any]:
        """Return the manifest entry for output ``name``.

        Raises:
            KeyError: the checkpoint has no output with that name.
        """
        for entry in self.read_provenance(history_hash).get("outputs", []):
            if entry.get("name") == name:
                return entry
        raise KeyError(f"checkpoint {history_hash[:12]} has no output named '{name}'")

    def load_output(self, history_hash: str, name: str) -> Any:
        """Raises serializers.EphemeralValueError for non-persisted outputs."""
        entry = self.output_entry(history_hash, name)
        return serializers.load_value(self.dir_for(history_hash) / "outputs", entry)

    def is_ephemeral(self, history_hash: str, name: str) -> bool:
        """True when the output's serializer is the ephemeral marker, or when
        the entry cannot be found or read (missing file, bad JSON, no such
        output)."""
        try:
            entry = self.output_entry(history_hash, name)
        except (KeyError, FileNotFoundError, json.JSONDecodeError):
            return True
        return entry.get("serializer") == serializers.EPHEMERAL

    def touch(self, history_hash: str) -> None:
        """Bump the checkpoint dir's mtime so LRU GC sees reuse as recency."""
        try:
            os.utime(self.dir_for(history_hash))
        except OSError:
            pass

    @staticmethod
    def _tree_size(directory: Path) -> int:
        """Total size in bytes of the regular files under ``directory``.

        Entries that cannot be inspected (for example, removed by another
        process mid-scan) are skipped instead of aborting the scan.
        """
        size = 0
        try:
            for entry in directory.rglob("*"):
                try:
                    if entry.is_file():
                        size += entry.stat().st_size
                except OSError:
                    continue
        except OSError:
            # The directory itself disappeared while being walked.
            pass
        return size

    def gc(self, max_bytes: int) -> dict[str, int]:
        """Least-recently-used eviction down to ``max_bytes`` total.

        Deleting a live checkpoint is always safe — the node just reads as
        stale and recomputes — so a plain LRU needs no liveness analysis.
        Recency is the checkpoint directory's mtime (see :meth:`touch`).
        Returns {"freed_bytes", "deleted", "remaining_bytes"}.

        Filesystem races are tolerated: unreadable entries are skipped while
        sizing, and a directory that could not be removed is not counted as
        freed or deleted.
        """
        entries: list[tuple[float, int, Path]] = []
        total = 0
        if self.base.is_dir():
            for directory in self.base.iterdir():
                if not directory.is_dir():
                    continue
                try:
                    mtime = directory.stat().st_mtime
                except OSError:
                    continue
                size = self._tree_size(directory)
                entries.append((mtime, size, directory))
                total += size

        freed = 0
        deleted = 0
        entries.sort()  # oldest first
        for _mtime, size, directory in entries:
            if total - freed <= max_bytes:
                break
            shutil.rmtree(directory, ignore_errors=True)
            if directory.exists():
                # rmtree swallowed a failure (e.g. a file held open); don't
                # report space that was not actually reclaimed.
                continue
            freed += size
            deleted += 1
        return {"freed_bytes": freed, "deleted": deleted, "remaining_bytes": total - freed}

    def clean_run_specs(self, max_age_seconds: float = 86400.0) -> None:
        """Run-spec files are one-shot worker inputs; sweep the stale ones.

        Deletes ``cache/runs/*.json`` files whose mtime is older than
        ``max_age_seconds``; files that cannot be stat'ed or unlinked are
        skipped.
        """
        runs_dir = self.forge_dir / "cache" / "runs"
        if not runs_dir.is_dir():
            return
        cutoff = time.time() - max_age_seconds
        for spec in runs_dir.glob("*.json"):
            try:
                if spec.stat().st_mtime < cutoff:
                    spec.unlink()
            except OSError:
                continue
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""Figure capture and artifact rendering.
|
|
2
|
+
|
|
3
|
+
The notebook muscle memory is ``plt.plot(...); plt.show()`` — or no show()
|
|
4
|
+
at all. The worker honors it with zero code changes: matplotlib runs on the
|
|
5
|
+
Agg backend, and :func:`capture` sweeps every figure that exists after the
|
|
6
|
+
block call that didn't exist before (``plt.show`` is a no-op under Agg, so
|
|
7
|
+
"shown" figures are still open when we sweep). Plotly's ``fig.show()`` is
|
|
8
|
+
intercepted by patching ``plotly.io.show`` while the block runs.
|
|
9
|
+
|
|
10
|
+
Captured and returned figures render to checkpoint artifacts — matplotlib →
|
|
11
|
+
PNG, plotly → self-contained HTML — and are closed afterward so a long run
|
|
12
|
+
never accumulates canvases.
|
|
13
|
+
|
|
14
|
+
Import discipline: stdlib-only at import time; matplotlib/plotly are only
|
|
15
|
+
touched when the user's process already loaded them.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
from contextlib import contextmanager
|
|
21
|
+
from dataclasses import dataclass, field
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
import sys
|
|
24
|
+
from typing import Any, Iterator
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _pyplot() -> Any | None:
|
|
28
|
+
return sys.modules.get("matplotlib.pyplot")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _root_module(value: Any) -> str:
|
|
32
|
+
return type(value).__module__.split(".")[0]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class CapturedFigures:
    """Figures collected by a capture context, grouped by plotting library."""

    # matplotlib figure objects
    matplotlib: list[Any] = field(default_factory=list)
    # plotly figure objects
    plotly: list[Any] = field(default_factory=list)

    def all_objects(self) -> list[Any]:
        """Return a new list: matplotlib figures first, then plotly ones."""
        combined = list(self.matplotlib)
        combined.extend(self.plotly)
        return combined
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@contextmanager
def capture() -> Iterator[CapturedFigures]:
    """Collect figures created (matplotlib) or shown (plotly) inside the
    block call. The matplotlib sweep also catches figures created during the
    block module's first import, since the import happens inside this
    context in the runner.

    Yields a :class:`CapturedFigures`. Plotly figures are appended as
    ``plotly.io.show`` is called; matplotlib figures are only filled in when
    the context exits.
    """
    captured = CapturedFigures()

    # Figure numbers that already existed on entry are not ours to claim.
    pyplot_at_entry = _pyplot()
    if pyplot_at_entry is None:
        known_fignums: set[int] = set()
    else:
        known_fignums = set(pyplot_at_entry.get_fignums())

    # Patch plotly.io.show only if plotly.io is already loaded and has one.
    plotly_io = sys.modules.get("plotly.io")
    saved_show = None if plotly_io is None else getattr(plotly_io, "show", None)
    patched = plotly_io is not None and saved_show is not None
    if patched:

        def _grab(fig: Any, *args: Any, **kwargs: Any) -> None:
            captured.plotly.append(fig)

        plotly_io.show = _grab

    try:
        yield captured
    finally:
        if patched:
            plotly_io.show = saved_show
        pyplot_now = _pyplot()  # may have been imported during the call
        if pyplot_now is not None:
            new_figures = [
                pyplot_now.figure(num)
                for num in pyplot_now.get_fignums()
                if num not in known_fignums
            ]
            captured.matplotlib.extend(new_figures)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def as_figure(value: Any) -> Any | None:
    """Return a renderable figure for ``value``, or None.

    Accepts matplotlib Figures, matplotlib Axes (``sns.heatmap`` et al.
    return Axes — we render their parent figure), and plotly figures.
    Recognition is duck-typed: the top-level package of the value's type plus
    the presence of ``savefig`` (matplotlib) or ``write_html`` (plotly).
    """
    package = type(value).__module__.split(".")[0]

    if package == "plotly":
        return value if hasattr(value, "write_html") else None
    if package != "matplotlib":
        return None

    if hasattr(value, "savefig"):
        return value
    owner = getattr(value, "figure", None)  # Axes and friends
    if owner is not None and hasattr(owner, "savefig"):
        return owner
    return None
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def render_figure(value: Any, directory: Path, basename: str) -> dict[str, Any] | None:
    """Render to ``directory/basename.(png|html)``; returns the artifact
    entry ``{"file", "kind"}`` or None if ``value`` is not a figure.

    ``directory`` is created if needed. Matplotlib figures are saved as PNG
    (kind ``"image"``); anything else :func:`as_figure` accepted is written
    as a full HTML page with plotly.js embedded (kind ``"html"``).
    """
    figure = as_figure(value)
    if figure is None:
        return None

    directory.mkdir(parents=True, exist_ok=True)

    if _root_module(figure) != "matplotlib":
        html_name = f"{basename}.html"
        figure.write_html(directory / html_name, include_plotlyjs=True, full_html=True)
        return {"file": html_name, "kind": "html"}

    png_name = f"{basename}.png"
    figure.savefig(
        directory / png_name,
        dpi=110,
        bbox_inches="tight",
        facecolor=figure.get_facecolor(),
    )
    return {"file": png_name, "kind": "image"}
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def close_figures(figures: list[Any]) -> None:
    """Close every matplotlib figure in ``figures`` via ``pyplot.close``.

    Does nothing if pyplot was never imported. Non-matplotlib objects are
    skipped, and a failure to close one figure does not stop the rest.
    """
    pyplot = _pyplot()
    if pyplot is None:
        return
    for candidate in figures:
        if _root_module(candidate) != "matplotlib":
            continue
        try:
            pyplot.close(candidate)
        except Exception:
            continue
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""Cropped, JSON-safe output previews, computed at checkpoint-write time.
|
|
2
|
+
|
|
3
|
+
Previews are precomputed artifacts stored inside ``provenance.json`` — the
|
|
4
|
+
kernel serves them by reading a file, never by deserializing data (it stays
|
|
5
|
+
stdlib-only and instant). Because they're built while the value is in the
|
|
6
|
+
worker's hands, even EPHEMERAL outputs get a preview of their last run.
|
|
7
|
+
|
|
8
|
+
Everything emitted here must survive strict JSON.parse on the TypeScript
|
|
9
|
+
side: NaN/Infinity are stringified, containers are size-capped, and unknown
|
|
10
|
+
objects fall back to repr.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
MAX_ROWS = 8
|
|
19
|
+
MAX_COLS = 10
|
|
20
|
+
MAX_ITEMS = 50
|
|
21
|
+
MAX_DEPTH = 5
|
|
22
|
+
MAX_CELL_CHARS = 120
|
|
23
|
+
MAX_TEXT_CHARS = 600
|
|
24
|
+
MAX_VALUE_CHARS = 2000
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _cell(value: Any) -> Any:
    """One scalar table/array cell, strict-JSON safe.

    None, bools and ints pass through; finite floats pass through while
    NaN/±inf become their ``str()``; everything else becomes text (the string
    itself, or ``repr`` for other objects) clipped to ``MAX_CELL_CHARS`` with
    a trailing ellipsis when clipped.
    """
    if value is None or isinstance(value, (bool, int)):
        return value
    if isinstance(value, float):
        finite = value == value and abs(value) != float("inf")  # NaN != NaN
        return value if finite else str(value)
    text = value if isinstance(value, str) else repr(value)
    clipped = text[:MAX_CELL_CHARS]
    if len(text) > MAX_CELL_CHARS:
        return clipped + "…"
    return clipped + ""
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _sanitize(value: Any, depth: int = 0) -> Any:
    """Recursively reduce ``value`` to a size-capped, JSON-safe structure.

    Dicts, lists and tuples are walked up to ``MAX_DEPTH`` levels and
    ``MAX_ITEMS`` members each, with a marker recording how many members were
    dropped. Everything else — and any container at the depth limit — goes
    through :func:`_cell`.
    """
    if depth >= MAX_DEPTH or not isinstance(value, (dict, list, tuple)):
        return _cell(value)

    overflow = len(value) - MAX_ITEMS

    if isinstance(value, dict):
        cleaned: dict[str, Any] = {}
        for key, item in list(value.items())[:MAX_ITEMS]:
            name = str(key)[:MAX_CELL_CHARS]
            cleaned[name] = _sanitize(item, depth + 1)
        if overflow > 0:
            cleaned["…"] = f"+{overflow} more"
        return cleaned

    cleaned_items = [_sanitize(item, depth + 1) for item in value[:MAX_ITEMS]]
    if overflow > 0:
        cleaned_items.append(f"… +{overflow} more")
    return cleaned_items
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _root_type_module(value: Any) -> str:
|
|
57
|
+
return type(value).__module__.split(".")[0]
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def build_preview(value: Any) -> dict[str, Any]:
    """Build a small, JSON-safe preview dict for ``value``.

    The ``"kind"`` key tells the consumer how to read the rest:

    * ``"table"`` — pandas DataFrame/Series: shape, column names and the
      top-left ``MAX_ROWS`` x ``MAX_COLS`` corner.
    * ``"array"`` — numpy ndarray: dtype, shape and a corner of at most
      ``MAX_ROWS`` entries per axis.
    * ``"value"`` — numpy scalars and plain Python data that encodes to at
      most ``MAX_VALUE_CHARS`` of JSON.
    * ``"text"`` — over-long JSON (truncated) or the ``repr`` of anything
      else, truncated to ``MAX_TEXT_CHARS``.

    pandas and numpy are imported only when the value's type already comes
    from that package.
    """
    origin = _root_type_module(value)

    if origin == "pandas":
        import pandas as pd

        frame = value.to_frame() if isinstance(value, pd.Series) else value
        if isinstance(frame, pd.DataFrame):
            n_rows, n_cols = frame.shape[0], frame.shape[1]
            column_names = [str(c) for c in frame.columns[:MAX_COLS]]
            head = frame.iloc[:MAX_ROWS, :MAX_COLS]
            index_cells = [_cell(label) for label in head.index.tolist()]
            row_cells = [
                [_cell(item) for item in row]
                for row in head.itertuples(index=False, name=None)
            ]
            return {
                "kind": "table",
                "shape": [int(n_rows), int(n_cols)],
                "columns": column_names,
                "columns_truncated": n_cols > MAX_COLS,
                "index": index_cells,
                "rows": row_cells,
            }

    if origin == "numpy":
        import numpy as np

        if isinstance(value, np.ndarray):
            corner: Any
            if value.ndim == 0:
                corner = _cell(value.item())
            else:
                # The leading MAX_ROWS entries along every axis.
                window = (slice(0, MAX_ROWS),) * value.ndim
                corner = _sanitize(value[window].tolist())
            return {
                "kind": "array",
                "dtype": str(value.dtype),
                "shape": list(value.shape),
                "corner": corner,
            }
        if isinstance(value, np.generic):
            return {"kind": "value", "value": _cell(value.item())}

    if value is None or isinstance(value, (dict, list, tuple, str, int, float, bool)):
        sanitized = _sanitize(value)
        try:
            encoded = json.dumps(sanitized, allow_nan=False)
        except (TypeError, ValueError):
            pass  # not encodable after all: fall through to the repr preview
        else:
            if len(encoded) > MAX_VALUE_CHARS:
                return {"kind": "text", "text": encoded[:MAX_VALUE_CHARS] + "…"}
            return {"kind": "value", "value": sanitized}

    # Arbitrary objects: an honest repr, marked as text rather than data.
    text = repr(value)
    suffix = "…" if len(text) > MAX_TEXT_CHARS else ""
    return {"kind": "text", "text": text[:MAX_TEXT_CHARS] + suffix}
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
"""History-hash computation — the Tier 2 staleness recipe from DESIGN.md §7.
|
|
2
|
+
|
|
3
|
+
history_hash = sha256(canonical_json({
|
|
4
|
+
fn: source hash of the decorated function (AST-normalized),
|
|
5
|
+
closure: hash of the defining module's repo import-closure,
|
|
6
|
+
env: environment fingerprint (python version + dependency files),
|
|
7
|
+
params: literal params for UNCONNECTED parameters only,
|
|
8
|
+
inputs: {param_name: [parent_history_hash, source_output]},
|
|
9
|
+
}))
|
|
10
|
+
|
|
11
|
+
A node is stale iff no checkpoint exists for its computed hash. Everything
|
|
12
|
+
here is pure stdlib and cheap enough to recompute on every document edit.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from dataclasses import dataclass, field
|
|
18
|
+
import hashlib
|
|
19
|
+
import json
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
import sys
|
|
22
|
+
from typing import Any, Callable
|
|
23
|
+
|
|
24
|
+
from starforge.core.spec import PipelineDoc
|
|
25
|
+
from starforge.index.scanner import WorkspaceIndex
|
|
26
|
+
|
|
27
|
+
#: Doc-native nodes that execute without importing user code. Constants are
|
|
28
|
+
#: the first; the snippet node (DESIGN.md §10) will join this namespace.
|
|
29
|
+
BUILTIN_PREFIX = "builtin:"
|
|
30
|
+
BUILTINS = {"builtin:constant"}
|
|
31
|
+
|
|
32
|
+
#: Dependency manifests folded into the environment fingerprint. pyproject is
|
|
33
|
+
#: deliberately excluded — version bumps would invalidate every checkpoint.
|
|
34
|
+
ENV_FILES = (
|
|
35
|
+
"requirements.txt",
|
|
36
|
+
"requirements.lock",
|
|
37
|
+
"poetry.lock",
|
|
38
|
+
"uv.lock",
|
|
39
|
+
"Pipfile.lock",
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def canonical_json(value: Any) -> str:
    """Serialize ``value`` to compact JSON with sorted keys.

    Separators carry no whitespace, and objects JSON cannot encode are
    replaced by their ``repr``.
    """
    encoder = json.JSONEncoder(
        sort_keys=True,
        separators=(",", ":"),
        default=repr,
    )
    return encoder.encode(value)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _sha(text: str) -> str:
|
|
48
|
+
return hashlib.sha256(text.encode("utf-8")).hexdigest()
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def env_fingerprint(workspace: str | Path) -> str:
    """Hash the running Python version together with the workspace's
    dependency manifests.

    Each file named in ``ENV_FILES`` that exists directly under ``workspace``
    contributes ``name:sha256(contents)``; missing or unreadable files are
    skipped. Returns the hex SHA-256 of the newline-joined parts.
    """
    root = Path(workspace)
    major, minor, micro = sys.version_info[:3]
    lines = [f"python:{major}.{minor}.{micro}"]
    for manifest in ENV_FILES:
        candidate = root / manifest
        if not candidate.is_file():
            continue
        try:
            digest = hashlib.sha256(candidate.read_bytes()).hexdigest()
        except OSError:
            continue
        lines.append(f"{manifest}:{digest}")
    return _sha("\n".join(lines))
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@dataclass
class NodeState:
    """Per-node result of hashing: the hash, staleness, and any problems."""

    # None until a hash has been computed for the node.
    history_hash: str | None = None
    # Defaults to True.
    stale: bool = True
    #: Human-readable reasons the node cannot hash or run (missing block,
    #: cycle membership, bad edge target, ...). Empty means healthy.
    problems: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return the three fields as a plain dict (``problems`` is the same
        list object, not a copy)."""
        return dict(
            history_hash=self.history_hash,
            stale=self.stale,
            problems=self.problems,
        )
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def toposort(doc: PipelineDoc) -> tuple[list[str], set[str]]:
    """Kahn's algorithm. Returns (ordered_ids, ids_stuck_in_cycles).

    Edges that mention a node id absent from ``doc.nodes`` are ignored. Among
    the nodes that are ready at the same time, the smallest id is emitted
    first, so the order is deterministic. The second element holds every node
    whose in-degree never reached zero: cycle members and anything reachable
    only through them.
    """
    remaining = {node.id: 0 for node in doc.nodes}
    downstream: dict[str, list[str]] = {node.id: [] for node in doc.nodes}
    for edge in doc.edges:
        if edge.source not in remaining or edge.target not in remaining:
            continue
        remaining[edge.target] += 1
        downstream[edge.source].append(edge.target)

    # determinism beats micro-speed at canvas scale: always take the smallest
    # ready id next.
    pending = {node_id for node_id, degree in remaining.items() if degree == 0}
    ordered: list[str] = []
    while pending:
        current = min(pending)
        pending.discard(current)
        ordered.append(current)
        for child in downstream[current]:
            remaining[child] -= 1
            if remaining[child] == 0:
                pending.add(child)

    stuck = {node_id for node_id, degree in remaining.items() if degree > 0}
    return ordered, stuck
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
#: Staleness tiers (DESIGN.md §7). The closure component per tier:
|
|
98
|
+
#: T0 — function body only (cheapest, misses helper edits entirely)
|
|
99
|
+
#: T1 — + the defining module (catches same-file helpers)
|
|
100
|
+
#: T2 — + the repo import closure (default; safe over-invalidation)
|
|
101
|
+
TIERS = ("T0", "T1", "T2")
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _closure_component(tier: str, index: WorkspaceIndex, module: str) -> str:
|
|
105
|
+
if tier == "T0":
|
|
106
|
+
return ""
|
|
107
|
+
if tier == "T1":
|
|
108
|
+
info = index.modules.get(module)
|
|
109
|
+
return (info.ast_hash or info.file_hash) if info else ""
|
|
110
|
+
return index.closure_hash(module)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def compute_states(
    doc: PipelineDoc,
    index: WorkspaceIndex,
    env_fp: str,
    checkpoint_exists: Callable[[str], bool],
    tier: str = "T2",
) -> dict[str, NodeState]:
    """Hash every node and decide staleness. Never raises on a sick document —
    problems are reported per-node so the canvas can render them in place.

    Nodes are visited in topological order so each parent's hash is known
    before its children are hashed. A node that has any problem keeps
    ``history_hash=None`` and ``stale=True``.

    Args:
        doc: The pipeline document (nodes, edges, per-node params).
        index: Workspace index supplying block metadata and closure hashes.
        env_fp: Environment fingerprint folded into every non-builtin hash.
        checkpoint_exists: Predicate on a history hash; a node is stale when
            it returns False.
        tier: Staleness tier selecting the closure component.

    Returns:
        Mapping of node id to its :class:`NodeState`.
    """
    known_blocks = index.blocks
    states: dict[str, NodeState] = {node.id: NodeState() for node in doc.nodes}
    order, cyclic = toposort(doc)
    for node_id in cyclic:
        states[node_id].problems.append("part of a dependency cycle")

    for node_id in order:
        node = doc.node(node_id)
        state = states[node_id]

        # --- doc-native builtin nodes: no user code, no env/closure --------
        if node.block.startswith(BUILTIN_PREFIX):
            if node.block not in BUILTINS:
                state.problems.append(f"unknown builtin '{node.block}'")
            elif doc.in_edges(node_id):
                state.problems.append("Constant is a source node and takes no inputs")
            else:
                # Deliberately excludes env/closure: a constant's identity is its
                # value, so checkpoints survive dependency upgrades and edits.
                state.history_hash = _sha(canonical_json({"builtin": node.block, "params": node.params}))
                state.stale = not checkpoint_exists(state.history_hash)
            continue

        # --- user blocks ----------------------------------------------------
        info = known_blocks.get(node.block)
        if info is None:
            state.problems.append(f"block '{node.block}' not found in workspace (decorator removed or file deleted?)")
            continue

        accepted_params = {param.name for param in info.params}
        wired: dict[str, list[str]] = {}
        healthy = True
        for edge in doc.in_edges(node_id):
            upstream = states.get(edge.source)
            issue = None
            if upstream is None or upstream.history_hash is None:
                issue = f"input '{edge.target_param}' depends on unresolvable node '{edge.source}'"
            elif edge.target_param not in accepted_params:
                issue = f"edge targets unknown parameter '{edge.target_param}'"
            elif edge.target_param in wired:
                issue = f"parameter '{edge.target_param}' has multiple incoming edges"
            else:
                # The output name is only checked when the parent's block is
                # in the index (builtin parents are not looked up there).
                producer = known_blocks.get(doc.node(edge.source).block)
                if producer is not None and edge.source_output not in producer.outputs:
                    issue = f"edge expects output '{edge.source_output}' but '{producer.label}' produces {producer.outputs}"

            if issue is not None:
                state.problems.append(issue)
                healthy = False
                continue
            wired[edge.target_param] = [upstream.history_hash, edge.source_output]

        if not healthy:
            continue

        # Only parameters without an incoming edge contribute literal values.
        literals = {name: val for name, val in node.params.items() if name not in wired}
        recipe = {
            "fn": info.source_hash,
            "closure": _closure_component(tier, index, info.module),
            "env": env_fp,
            "params": literals,
            "inputs": wired,
        }
        state.history_hash = _sha(canonical_json(recipe))
        state.stale = not checkpoint_exists(state.history_hash)

    return states
|