starforge-kernel 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,293 @@
1
+ """Pipeline execution. Runs INSIDE the per-run worker process — the kernel
2
+ never imports user code (DESIGN.md §3, process-per-run).
3
+
4
+ Planning rules:
5
+ - stale nodes (no checkpoint for their history hash) execute;
6
+ - a fresh node is pulled back into execution if an executing descendant needs
7
+ one of its outputs and that output was ephemeral (never persisted) — this
8
+ cascades upward until every needed value is either loadable or recomputed;
9
+ - nodes with hashing problems, and every node downstream of a problem or a
10
+ failure, are blocked (independent branches keep running).
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from dataclasses import dataclass
16
+ import datetime as dt
17
+ import importlib
18
+ import sys
19
+ import time
20
+ import traceback
21
+ from typing import Any, Callable
22
+
23
+ import starforge
24
+ from starforge.core import figures as figmod
25
+ from starforge.core.checkpoints import CheckpointStore
26
+ from starforge.core.provenance import BUILTIN_PREFIX, toposort
27
+ from starforge.core.serializers import EphemeralValueError
28
+ from starforge.core.spec import PipelineDoc, Node
29
+
30
#: Per-node floor, in seconds, on the gap between forwarded progress events.
#: Blocks may call progress() in tight loops, and the canvas needs ~12fps at
#: most.
PROGRESS_THROTTLE_S = 0.08
33
+
34
+
35
+ def _progress_hook(node_id: str, emit: Emit) -> Callable[[Any, Any, Any], None]:
36
+ last = {"at": 0.0}
37
+
38
+ def hook(current: Any, total: Any, label: Any) -> None:
39
+ now = time.monotonic()
40
+ final = total is not None and current == total
41
+ if not final and now - last["at"] < PROGRESS_THROTTLE_S:
42
+ return
43
+ last["at"] = now
44
+ event: dict[str, Any] = {"event": "node_progress", "node": node_id}
45
+ if current is not None:
46
+ event["current"] = current
47
+ if total is not None:
48
+ event["total"] = total
49
+ if label is not None:
50
+ event["label"] = str(label)
51
+ if isinstance(current, (int, float)) and isinstance(total, (int, float)) and total:
52
+ event["percent"] = max(0.0, min(1.0, current / total))
53
+ emit(event)
54
+
55
+ return hook
56
+
57
#: Signature of the event sink: called with one event dict, returns nothing.
Emit = Callable[[dict[str, Any]], None]
58
+
59
+
60
+ @dataclass
61
+ class _NodePlanInfo:
62
+ history_hash: str | None
63
+ problems: list[str]
64
+
65
+
66
+ def _ancestor_cone(doc: PipelineDoc, target: str) -> set[str]:
67
+ """The target plus everything upstream of it."""
68
+ cone = {target}
69
+ frontier = [target]
70
+ while frontier:
71
+ nid = frontier.pop()
72
+ for edge in doc.in_edges(nid):
73
+ if edge.source not in cone:
74
+ cone.add(edge.source)
75
+ frontier.append(edge.source)
76
+ return cone
77
+
78
+
79
def plan_execution(
    doc: PipelineDoc,
    states: dict[str, dict[str, Any]],
    store: CheckpointStore,
    target: str | None = None,
) -> tuple[list[str], set[str], set[str], set[str]]:
    """Returns (topo_order, execute, reuse, blocked).

    With ``target``, planning is scoped to the target's ancestor cone —
    "run to here": only what's needed to produce that node executes. A
    ``target`` that matches no node id is ignored and the whole document is
    planned.

    Args:
        doc: The pipeline document.
        states: Per-node state keyed by node id; the keys read here are
            ``problems``, ``history_hash`` and ``stale`` (missing ``stale``
            counts as stale).
        store: Checkpoint store, queried only via ``is_ephemeral``.
        target: Optional node id to scope the plan to.

    Returns:
        ``order`` (the planned node ids in ``toposort`` order) and three
        disjoint sets that together cover it: ``execute``, ``reuse`` and
        ``blocked``.
    """
    order, cyclic = toposort(doc)
    if target is not None and any(n.id == target for n in doc.nodes):
        cone = _ancestor_cone(doc, target)
        order = [nid for nid in order if nid in cone]
    blocked: set[str] = set(cyclic) & set(order)
    execute: set[str] = set()

    for nid in order:
        if nid in blocked:
            # Already ruled out as cyclic. Without this guard a cyclic node
            # that is hashable and stale would land in BOTH sets and could
            # drag fresh parents into execution through the cascade below.
            continue
        state = states.get(nid, {})
        if state.get("problems") or state.get("history_hash") is None:
            blocked.add(nid)
        elif state.get("stale", True):
            execute.add(nid)

    # Block everything downstream of a blocked node. A single pass suffices
    # because ``order`` lists parents before children.
    for nid in order:
        if nid in blocked:
            continue
        if any(edge.source in blocked for edge in doc.in_edges(nid)):
            blocked.add(nid)
            execute.discard(nid)

    # Ephemeral cascade: pull fresh parents back in when an executing child
    # needs an output that was never persisted. Iterate to a fixed point,
    # since a newly scheduled parent may itself need ephemeral inputs.
    changed = True
    while changed:
        changed = False
        for edge in doc.edges:
            if edge.target not in execute or edge.source in execute or edge.source in blocked:
                continue
            parent_hash = states.get(edge.source, {}).get("history_hash")
            if parent_hash is None:
                continue
            if store.is_ephemeral(parent_hash, edge.source_output):
                execute.add(edge.source)
                changed = True

    reuse = {nid for nid in order if nid not in execute and nid not in blocked}
    return order, execute, reuse, blocked
128
+
129
+
130
+ def _run_builtin(node: Node) -> dict[str, Any]:
131
+ if node.block == "builtin:constant":
132
+ return {"output": node.params.get("value")}
133
+ raise ValueError(f"unknown builtin '{node.block}'")
134
+
135
+
136
+ def _import_function(module: str, qualname: str) -> Callable[..., Any]:
137
+ mod = importlib.import_module(module)
138
+ fn = getattr(mod, qualname, None)
139
+ if fn is None or not callable(fn):
140
+ raise AttributeError(f"function '{qualname}' not found in module '{module}'")
141
+ return fn
142
+
143
+
144
+ def _map_outputs(result: Any, output_names: list[str], block_label: str) -> dict[str, Any]:
145
+ if len(output_names) == 1:
146
+ return {output_names[0]: result}
147
+ if not isinstance(result, (tuple, list)) or len(result) != len(output_names):
148
+ raise TypeError(
149
+ f"'{block_label}' declares outputs {output_names} but returned "
150
+ f"{type(result).__name__} (expected a {len(output_names)}-tuple)"
151
+ )
152
+ return dict(zip(output_names, result))
153
+
154
+
155
def run_pipeline(
    doc: PipelineDoc,
    blocks: dict[str, dict[str, Any]],
    states: dict[str, dict[str, Any]],
    store: CheckpointStore,
    emit: Emit,
    pickle_enabled: bool = False,
    target: str | None = None,
) -> str:
    """Execute the plan; returns terminal status: 'completed' or 'failed'.

    Every step is reported through ``emit``: one ``run_plan`` event, then per
    node ``node_blocked`` / ``node_skipped`` / ``node_started`` followed by
    ``node_completed`` or ``node_failed``, and finally ``run_finished``.

    Args:
        doc: The pipeline document.
        blocks: Block metadata keyed by block id; the keys read here are
            ``module``, ``qualname``, ``outputs`` and the optional ``label``,
            ``source_hash`` and ``optional_params``.
        states: Per-node state keyed by node id (``history_hash``,
            ``problems``; ``stale`` is read by the planner).
        store: Checkpoint store used to load parent outputs and write results.
        emit: Event sink.
        pickle_enabled: Forwarded to ``store.write``.
        target: Optional node id; forwarded to ``plan_execution``.

    Returns:
        ``"failed"`` if any node raised, otherwise ``"completed"``. A node
        failure does not abort the run; its descendants are reported blocked.
    """
    store.ensure_layout()
    order, execute, reuse, blocked = plan_execution(doc, states, store, target=target)
    emit(
        {
            "event": "run_plan",
            "execute": sorted(execute),
            "reuse": sorted(reuse),
            "blocked": sorted(blocked),
        }
    )

    # In-memory values for this run: (node_id, output_name) -> value.
    memory: dict[tuple[str, str], Any] = {}
    failed: set[str] = set()
    any_failed = False

    for nid in order:
        state = states.get(nid, {})
        node_hash = state.get("history_hash")

        # Blocked by the planner, or downstream of something that failed or
        # was blocked earlier in this run.
        if nid in blocked or any(e.source in failed or e.source in blocked for e in doc.in_edges(nid)):
            blocked.add(nid)
            emit({"event": "node_blocked", "node": nid, "problems": state.get("problems", [])})
            continue

        if nid in reuse:
            store.touch(node_hash)  # LRU signal for checkpoint GC
            emit({"event": "node_skipped", "node": nid, "history_hash": node_hash})
            continue

        node = doc.node(nid)
        emit({"event": "node_started", "node": nid, "block": node.block})
        # perf_counter is monotonic: durations cannot go negative or jump
        # when the wall clock is adjusted mid-run.
        started = time.perf_counter()
        try:
            connected: dict[str, dict[str, str]] = {}
            side_figures: list[Any] = []
            if node.block.startswith(BUILTIN_PREFIX):
                outputs = _run_builtin(node)
                label = node.block.removeprefix(BUILTIN_PREFIX).title()
                source_hash = node.block
            else:
                info = blocks[node.block]
                kwargs: dict[str, Any] = {}
                for edge in doc.in_edges(nid):
                    key = (edge.source, edge.source_output)
                    if key not in memory:
                        parent_hash = states[edge.source]["history_hash"]
                        try:
                            memory[key] = store.load_output(parent_hash, edge.source_output)
                        except EphemeralValueError as exc:
                            # plan_execution should have prevented this; surface loudly.
                            raise RuntimeError(
                                f"input '{edge.target_param}' of node '{nid}' is ephemeral and its "
                                f"producer was not scheduled — planner bug, please report"
                            ) from exc
                    kwargs[edge.target_param] = memory[key]
                    connected[edge.target_param] = {
                        "node": edge.source,
                        "output": edge.source_output,
                        "history_hash": states[edge.source]["history_hash"],
                    }
                # Connected inputs win over literal params of the same name.
                for name, value in node.params.items():
                    if name not in kwargs:
                        kwargs[name] = value
                # `T | None` params with no signature default are optional by
                # *Forge convention: inject None when nothing else fills them.
                for name in info.get("optional_params", ()):
                    kwargs.setdefault(name, None)

                # capture() wraps the import too: module-level figure code
                # and figures created/shown inside the call are both swept.
                starforge._progress_hook = _progress_hook(nid, emit)
                try:
                    with figmod.capture() as captured:
                        fn = _import_function(info["module"], info["qualname"])
                        result = fn(**kwargs)
                finally:
                    starforge._progress_hook = None
                outputs = _map_outputs(result, list(info["outputs"]), info.get("label", node.block))
                # Figures that are themselves returned outputs are persisted
                # as outputs, not as side figures.
                side_figures = [
                    fig for fig in captured.all_objects()
                    if all(fig is not value for value in outputs.values())
                ]
                label = info.get("label", node.block)
                source_hash = info.get("source_hash")

            # Duration covers the block call only, not checkpoint writing.
            duration = time.perf_counter() - started
            try:
                manifest = store.write(
                    node_hash,
                    {
                        "block_id": node.block,
                        "label": label,
                        "source_hash": source_hash,
                        "params": {k: v for k, v in node.params.items() if k not in connected},
                        "inputs": connected,
                        "duration_seconds": round(duration, 6),
                        "finished_at": dt.datetime.now(dt.timezone.utc).isoformat(),
                    },
                    outputs,
                    pickle_enabled=pickle_enabled,
                    side_figures=side_figures,
                )
            finally:
                # Close side figures even when the write fails, so a failed
                # node does not leave them open for the rest of the run.
                figmod.close_figures(side_figures)
            for name, value in outputs.items():
                memory[(nid, name)] = value
            emit(
                {
                    "event": "node_completed",
                    "node": nid,
                    "history_hash": node_hash,
                    "duration_seconds": round(duration, 6),
                    "outputs": manifest,
                }
            )
        except Exception:
            # Per-node boundary: record the failure and keep going so that
            # independent branches still run.
            any_failed = True
            failed.add(nid)
            emit(
                {
                    "event": "node_failed",
                    "node": nid,
                    "duration_seconds": round(time.perf_counter() - started, 6),
                    "traceback": traceback.format_exc(),
                }
            )

    status = "failed" if any_failed else "completed"
    emit({"event": "run_finished", "status": status})
    return status
@@ -0,0 +1,141 @@
1
+ """Typed serializer registry for checkpoint outputs (DESIGN.md §8).
2
+
3
+ Probed in order per value: parquet (DataFrame/Series) → npy (ndarray) → json
4
+ (plain data) → pickle (opt-in, default OFF) → ephemeral.
5
+
6
+ Ephemeral values flow normally to downstream nodes within the same run but
7
+ are not persisted, so future runs recompute them on demand. That keeps the
8
+ cost of unserializable types localized instead of poisoning the store.
9
+
10
+ Import discipline: this module never imports pandas/numpy itself. If a value
11
+ IS a DataFrame, pandas is by definition already imported in this process —
12
+ we detect by the type's module name first, then import for free.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import json
18
+ from pathlib import Path
19
+ import pickle
20
+ from typing import Any
21
+
22
#: Serializer tag recorded in a manifest entry whose value was not persisted.
EPHEMERAL = "ephemeral"
23
+
24
+
25
class EphemeralValueError(RuntimeError):
    """Signals an attempt to load an output whose value was never persisted."""
27
+
28
+
29
+ def _root_type_module(value: Any) -> str:
30
+ return type(value).__module__.split(".")[0]
31
+
32
+
33
+ def _meta(value: Any) -> dict[str, Any]:
34
+ meta: dict[str, Any] = {"type": type(value).__name__}
35
+ shape = getattr(value, "shape", None)
36
+ if isinstance(shape, tuple) and all(isinstance(d, int) for d in shape):
37
+ meta["shape"] = list(shape)
38
+ elif isinstance(value, (list, dict, str)):
39
+ meta["len"] = len(value)
40
+ return meta
41
+
42
+
43
def save_value(
    value: Any,
    outputs_dir: Path,
    name: str,
    pickle_enabled: bool = False,
) -> dict[str, Any]:
    """Persist one output value; returns its manifest entry.

    Serializers are tried in order and the first that succeeds wins:
    parquet (pandas DataFrame/Series) → npy (numpy ndarray) → json →
    rendered figure artifact (the value itself stays ephemeral) → pickle
    (only when ``pickle_enabled``) → ephemeral. A serializer that fails falls
    through to the next one, and any partial file it left is removed.

    Args:
        value: The output value.
        outputs_dir: Directory to write into; created if missing.
        name: Output name, used as the file stem.
        pickle_enabled: Allow the pickle fallback.

    Returns:
        Manifest entry with ``name``, ``meta``, ``serializer`` and ``file``
        (``None`` for ephemeral values), plus ``artifact`` for figures.
    """
    entry: dict[str, Any] = {"name": name, "meta": _meta(value)}
    outputs_dir.mkdir(parents=True, exist_ok=True)

    def discard(path: Path) -> None:
        # A failed attempt can leave a truncated file that no manifest entry
        # references; removing it is best-effort.
        try:
            path.unlink(missing_ok=True)
        except OSError:
            pass

    if _root_type_module(value) == "pandas":
        import pandas as pd

        frame = value
        if isinstance(value, pd.Series):
            # Parquet stores frames; an unnamed Series gets the column name "value".
            frame = value.to_frame(name=value.name if value.name is not None else "value")
            entry["meta"]["series"] = True
        if isinstance(frame, pd.DataFrame):
            filename = f"{name}.parquet"
            try:
                frame.to_parquet(outputs_dir / filename)
            except (ImportError, ValueError, TypeError, NotImplementedError, OSError):
                # No parquet engine, or unserializable dtypes — fall through.
                # pyarrow reports the latter as ArrowInvalid (a ValueError),
                # ArrowTypeError (a TypeError) or ArrowNotImplementedError
                # (a NotImplementedError).
                discard(outputs_dir / filename)
            else:
                entry.update(serializer="parquet", file=filename)
                return entry

    if _root_type_module(value) == "numpy":
        import numpy as np

        if isinstance(value, np.ndarray):
            filename = f"{name}.npy"
            try:
                np.save(outputs_dir / filename, value, allow_pickle=False)
            except (ValueError, OSError):
                # e.g. object arrays, which need pickling — fall through.
                discard(outputs_dir / filename)
            else:
                entry.update(serializer="npy", file=filename)
                return entry

    # NOTE(review): JSON does not round-trip tuples or non-string dict keys,
    # so a value reloaded from here can differ in type from the one saved.
    filename = f"{name}.json"
    try:
        text = json.dumps(value)
        (outputs_dir / filename).write_text(text, encoding="utf-8")
    except (TypeError, ValueError, OSError):
        discard(outputs_dir / filename)
    else:
        entry.update(serializer="json", file=filename)
        return entry

    # Figures can't round-trip from their rendered form, so the VALUE is
    # ephemeral (flows within a run; downstream recomputes later via the
    # cascade) — but it leaves a rendered artifact behind for display.
    from starforge.core import figures

    try:
        artifact = figures.render_figure(value, outputs_dir, name)
    except Exception:
        # Best-effort: a value that cannot be rendered is treated as not a figure.
        artifact = None
    if artifact is not None:
        entry.update(serializer=EPHEMERAL, file=None, artifact=artifact)
        return entry

    if pickle_enabled:
        filename = f"{name}.pkl"
        try:
            with (outputs_dir / filename).open("wb") as handle:
                pickle.dump(value, handle, protocol=pickle.HIGHEST_PROTOCOL)
        except Exception:
            # Best-effort by design: any pickling failure downgrades the
            # value to ephemeral instead of failing the node.
            discard(outputs_dir / filename)
        else:
            entry.update(serializer="pickle", file=filename)
            return entry

    entry.update(serializer=EPHEMERAL, file=None)
    return entry
115
+
116
+
117
def load_value(outputs_dir: Path, entry: dict[str, Any]) -> Any:
    """Load the value described by one manifest entry.

    Args:
        outputs_dir: Directory holding the entry's file.
        entry: Manifest entry as produced by ``save_value``.

    Returns:
        The deserialized value. Parquet entries flagged ``meta.series`` come
        back as the first column of the stored frame.

    Raises:
        EphemeralValueError: the output was never persisted.
        ValueError: the entry names a serializer this module does not know.
    """
    serializer = entry.get("serializer")
    name = entry.get("name")
    if serializer == EPHEMERAL:
        raise EphemeralValueError(
            f"output '{name}' was not persisted (ephemeral); the producing node must re-run"
        )
    if serializer not in ("parquet", "npy", "json", "pickle"):
        # Reject before touching entry["file"]: a malformed entry may lack it,
        # and the caller should see this error rather than a KeyError.
        raise ValueError(f"unknown serializer {serializer!r} for output '{name}'")

    path = outputs_dir / entry["file"]
    if serializer == "parquet":
        import pandas as pd

        frame = pd.read_parquet(path)
        if entry.get("meta", {}).get("series"):
            return frame.iloc[:, 0]
        return frame
    if serializer == "npy":
        import numpy as np

        return np.load(path, allow_pickle=False)
    if serializer == "json":
        return json.loads(path.read_text(encoding="utf-8"))
    # Only "pickle" remains.
    # NOTE(security): unpickling executes arbitrary code. This assumes the
    # checkpoint directory is written only by trusted runs — confirm before
    # pointing the store at shared or downloaded data.
    with path.open("rb") as handle:
        return pickle.load(handle)
starforge/core/spec.py ADDED
@@ -0,0 +1,126 @@
1
+ """The ``.forge`` pipeline document model.
2
+
3
+ Documents are plain JSON text — the VS Code custom editor is text-based so
4
+ undo/redo/diff/merge come for free. Layout and notes are durable metadata and
5
+ never participate in provenance hashing.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass, field
11
+ import json
12
+ from pathlib import Path
13
+ from typing import Any
14
+
15
#: Schema identifier written into serialized documents and accepted on load.
SCHEMA = "starforge/1"
16
+
17
+
18
@dataclass
class Node:
    """One block instance in a pipeline document."""

    id: str
    block: str  # "dotted.module:qualname"
    params: dict[str, Any] = field(default_factory=dict)  # literals for unconnected params
    position: dict[str, float] = field(default_factory=lambda: {"x": 0.0, "y": 0.0})
    notes: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Return the node as a plain dict; values are shared, not copied."""
        keys = ("id", "block", "params", "position", "notes")
        return {key: getattr(self, key) for key in keys}
34
+
35
+
36
@dataclass
class Edge:
    """A connection from one node's named output to another node's parameter."""

    id: str
    source: str
    target: str
    target_param: str  # parameter NAME on the target function — never a slot index
    source_output: str = "output"

    def to_dict(self) -> dict[str, Any]:
        """Return the edge as a plain dict in its serialized key order."""
        ordered = ("id", "source", "source_output", "target", "target_param")
        return {key: getattr(self, key) for key in ordered}
52
+
53
+
54
@dataclass
class PipelineDoc:
    """In-memory form of a ``.forge`` document: nodes, edges and canvas comments."""

    name: str = "Untitled"
    nodes: list[Node] = field(default_factory=list)
    edges: list[Edge] = field(default_factory=list)
    #: Canvas annotation boxes: [{id, title, description, position, width,
    #: height, color}]. Pure layout metadata — round-trips untouched, never
    #: participates in hashing or execution.
    comments: list[dict[str, Any]] = field(default_factory=list)

    def node(self, node_id: str) -> Node:
        """Return the first node whose id is ``node_id``; KeyError if none."""
        match = next((candidate for candidate in self.nodes if candidate.id == node_id), None)
        if match is None:
            raise KeyError(node_id)
        return match

    def in_edges(self, node_id: str) -> list[Edge]:
        """Return the edges that end at ``node_id``, in document order."""
        return list(filter(lambda edge: edge.target == node_id, self.edges))

    def to_dict(self) -> dict[str, Any]:
        """Return the document as a plain, JSON-ready dict."""
        payload: dict[str, Any] = {"schema": SCHEMA, "name": self.name}
        payload["nodes"] = [node.to_dict() for node in self.nodes]
        payload["edges"] = [edge.to_dict() for edge in self.edges]
        payload["comments"] = self.comments
        return payload

    def to_json(self) -> str:
        """Serialize the document as JSON indented by two spaces."""
        return json.dumps(self.to_dict(), indent=2)

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> "PipelineDoc":
        """Build a document from its dict form.

        A missing ``schema`` key is accepted; any other schema raises
        ValueError. Missing optional keys take the dataclass defaults.
        """
        schema = d.get("schema", SCHEMA)
        if schema != SCHEMA:
            raise ValueError(f"unsupported .forge schema {schema!r} (expected {SCHEMA!r})")

        nodes: list[Node] = []
        for raw in d.get("nodes", []):
            nodes.append(
                Node(
                    id=raw["id"],
                    block=raw["block"],
                    params=dict(raw.get("params", {})),
                    position=dict(raw.get("position", {"x": 0.0, "y": 0.0})),
                    notes=raw.get("notes", ""),
                )
            )

        edges: list[Edge] = []
        for raw in d.get("edges", []):
            edges.append(
                Edge(
                    id=raw["id"],
                    source=raw["source"],
                    source_output=raw.get("source_output", "output"),
                    target=raw["target"],
                    target_param=raw["target_param"],
                )
            )

        return cls(
            name=d.get("name", "Untitled"),
            nodes=nodes,
            edges=edges,
            comments=list(d.get("comments", [])),
        )

    @classmethod
    def from_json(cls, text: str) -> "PipelineDoc":
        """Parse JSON text into a document."""
        parsed = json.loads(text)
        return cls.from_dict(parsed)

    @classmethod
    def load(cls, path: str | Path) -> "PipelineDoc":
        """Read a UTF-8 ``.forge`` file from ``path``."""
        text = Path(path).read_text(encoding="utf-8")
        return cls.from_json(text)

    def save(self, path: str | Path) -> None:
        """Write the document to ``path`` as UTF-8 JSON with a trailing newline."""
        destination = Path(path)
        destination.write_text(self.to_json() + "\n", encoding="utf-8")
@@ -0,0 +1,9 @@
1
+ from starforge.index.scanner import (
2
+ BlockInfo,
3
+ ModuleInfo,
4
+ ParamInfo,
5
+ WorkspaceIndex,
6
+ scan_workspace,
7
+ )
8
+
9
+ __all__ = ["BlockInfo", "ModuleInfo", "ParamInfo", "WorkspaceIndex", "scan_workspace"]