starforge-kernel 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,351 @@
1
+ """The *Forge kernel: NDJSON JSON-RPC over stdio (DESIGN.md §9).
2
+
3
+ Spawned by the VS Code extension inside the workspace venv as
4
+ ``python -m starforge.kernel``. Stdlib-only by design: pandas and friends
5
+ exist only inside run workers, so an idle kernel stays small and an idle
6
+ workspace costs nothing at all (the extension kills us when unused).
7
+
8
+ Requests: {"id": .., "method": "..", "params": {..}}
9
+ Responses: {"id": .., "result": {..}} | {"id": .., "error": {"message": ..}}
10
+ Notifications (kernel → client): {"method": "run/event", "params": {..}}
11
+ {"method": "log", "params": {..}}
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import json
17
+ import os
18
+ from pathlib import Path
19
+ import subprocess
20
+ import sys
21
+ import threading
22
+ from typing import Any, BinaryIO
23
+ import uuid
24
+
25
+ import starforge
26
+ from starforge.core.checkpoints import CheckpointStore
27
+ from starforge.core.provenance import BUILTIN_PREFIX, compute_states, env_fingerprint
28
+ from starforge.core.spec import PipelineDoc
29
+ from starforge.index import WorkspaceIndex, scan_workspace
30
+
31
#: Doc-native nodes, served to the palette alongside discovered functions.
#: Each entry carries the same keys as a discovered block's dict form, so the
#: two lists can be concatenated when the palette is built.
BUILTIN_PALETTE = [
    dict(
        block_id="builtin:constant",
        module="builtin",
        qualname="constant",
        file="",
        lineno=0,
        label="Constant",
        category="Built-in",
        # The only parameter is the literal itself; it defaults to JSON null.
        params=[
            dict(
                name="value",
                annotation=None,
                default_repr="null",
                has_default=True,
                keyword_only=False,
            ),
        ],
        outputs=["output"],
        returns=None,
        doc="Inject a literal JSON value (number, string, list, object) as a source node.",
        source_hash="builtin:constant",
    ),
]
56
+
57
+
58
class Kernel:
    """NDJSON JSON-RPC server that indexes a workspace and supervises run workers.

    Requests are read line by line from ``stdin``; a request whose ``method``
    is e.g. ``"index/scan"`` is dispatched to ``rpc_index_scan``. Responses
    and notifications are written to ``stdout`` as one JSON object per line.
    Each started run gets a worker subprocess plus two daemon threads that
    forward the worker's stdout (events) and stderr (logs) as notifications.
    """

    def __init__(self, stdin: BinaryIO, stdout: BinaryIO) -> None:
        """Bind the kernel to a pair of binary protocol streams.

        Args:
            stdin: Binary stream yielding one request frame per line.
            stdout: Binary stream that receives response/notification frames.
        """
        self._stdin = stdin
        self._stdout = stdout
        # Pump threads and the request loop all write frames; the lock keeps
        # each frame contiguous on the wire.
        self._write_lock = threading.Lock()
        self.workspace: Path | None = None
        self.settings: dict[str, Any] = {}
        self.index: WorkspaceIndex | None = None
        self._scan_cache: dict[str, Any] | None = None
        # run_id -> worker process, for runs that have not been cleaned up yet.
        self._runs: dict[str, subprocess.Popen] = {}

    # ---------------------------------------------------------------- wire

    def _send(self, payload: dict[str, Any]) -> None:
        """Serialize ``payload`` as one NDJSON frame and flush it to stdout.

        Values ``json`` cannot encode are written as their ``repr``.
        """
        data = (json.dumps(payload, default=repr) + "\n").encode("utf-8")
        with self._write_lock:
            self._stdout.write(data)
            self._stdout.flush()

    def notify(self, method: str, params: dict[str, Any]) -> None:
        """Send a notification (a frame without an ``id``) to the client."""
        self._send({"method": method, "params": params})

    def serve_forever(self) -> None:
        """Answer requests until stdin closes or ``shutdown`` is received.

        Every frame gets exactly one response: malformed frames and handler
        failures are answered with an ``error`` object rather than raised.
        Active workers are terminated on the way out, including when the loop
        is left by an exception (for example a broken output pipe).
        """
        try:
            for raw in self._stdin:
                line = raw.decode("utf-8", errors="replace").strip()
                if not line:
                    continue
                try:
                    request = json.loads(line)
                except json.JSONDecodeError as exc:
                    self._send({"id": None, "error": {"message": f"bad frame: {exc}"}})
                    continue
                if not isinstance(request, dict):
                    # Valid JSON but not an object (e.g. `[1, 2]` or `42`):
                    # there is no id/method to read, so answer like a bad frame.
                    self._send(
                        {"id": None, "error": {"message": "bad frame: expected a JSON object"}}
                    )
                    continue
                request_id = request.get("id")
                method = str(request.get("method", ""))
                params = request.get("params") or {}
                if method == "shutdown":
                    self._send({"id": request_id, "result": {}})
                    break
                handler = getattr(self, "rpc_" + method.replace("/", "_"), None)
                if handler is None:
                    self._send(
                        {"id": request_id, "error": {"message": f"unknown method '{method}'"}}
                    )
                    continue
                try:
                    self._send({"id": request_id, "result": handler(params)})
                except Exception as exc:  # noqa: BLE001 — every failure must answer the request
                    self._send(
                        {"id": request_id, "error": {"message": f"{type(exc).__name__}: {exc}"}}
                    )
        finally:
            self._terminate_runs()

    # ------------------------------------------------------------- helpers

    def _require_workspace(self) -> Path:
        """Return the workspace root.

        Raises:
            RuntimeError: If ``initialize`` has not been called yet.
        """
        if self.workspace is None:
            raise RuntimeError("kernel not initialized — call 'initialize' first")
        return self.workspace

    def _tier(self) -> str:
        """Return the configured tier, falling back to ``"T2"`` if unknown."""
        tier = str(self.settings.get("tier", "T2"))
        return tier if tier in ("T0", "T1", "T2") else "T2"

    def _max_checkpoint_bytes(self) -> int:
        """Return the checkpoint budget in bytes from ``max_checkpoint_mb``.

        The setting is floored at 64 MB. A value that is missing, not
        numeric, or too large to convert (infinity, overflow) falls back to
        the 2048 MB default instead of raising.
        """
        default_bytes = 2048 * 1024 * 1024
        try:
            mb = float(self.settings.get("max_checkpoint_mb", 2048))
            return int(max(64.0, mb) * 1024 * 1024)
        except (TypeError, ValueError, OverflowError):
            return default_bytes

    def _run_gc(self) -> None:
        """Evict checkpoints over budget and log a summary if any were removed."""
        try:
            stats = self._store().gc(self._max_checkpoint_bytes())
            if stats["deleted"]:
                freed = stats["freed_bytes"] / (1024 * 1024)
                self.notify(
                    "log",
                    {"text": f"checkpoint GC: evicted {stats['deleted']} checkpoints ({freed:.1f} MB)"},
                )
        except Exception:  # noqa: BLE001 — deliberate best-effort
            pass  # hygiene must never break runs

    def _cache_path(self) -> Path:
        """Return the path of the persisted scan cache inside the workspace."""
        return self._require_workspace() / ".forge" / "cache" / "index.json"

    def _store(self) -> CheckpointStore:
        """Return a checkpoint store rooted at the current workspace."""
        return CheckpointStore(self._require_workspace())

    def _ensure_index(self) -> WorkspaceIndex:
        """Return the workspace index, scanning first if none exists yet."""
        if self.index is None:
            self.rpc_index_scan({})
        assert self.index is not None
        return self.index

    def _palette(self, index: WorkspaceIndex) -> dict[str, Any]:
        """Build the palette payload: built-ins, then discovered blocks by id."""
        discovered = [b.to_dict() for b in sorted(index.blocks.values(), key=lambda b: b.block_id)]
        return {
            "blocks": list(BUILTIN_PALETTE) + discovered,
            "errors": index.errors(),
        }

    def _terminate_runs(self) -> None:
        """Ask every still-running worker to terminate."""
        # Snapshot first: pump threads pop finished runs from the dict while
        # we iterate, which would otherwise raise "dictionary changed size".
        for proc in list(self._runs.values()):
            if proc.poll() is None:
                proc.terminate()

    # ------------------------------------------------------------- methods

    def rpc_initialize(self, params: dict[str, Any]) -> dict[str, Any]:
        """Bind the kernel to a workspace and load its persisted scan cache.

        Args:
            params: Must contain ``workspace``; may contain ``settings``.

        Returns:
            Kernel version, Python version and the environment fingerprint.

        Raises:
            KeyError: If ``workspace`` is missing from ``params``.
        """
        self.workspace = Path(params["workspace"]).resolve()
        self.settings = dict(params.get("settings") or {})
        # Drop state from any previous initialize so a different workspace
        # never gets answered from the old index or scan cache.
        self.index = None
        self._scan_cache = None
        store = self._store()
        store.ensure_layout()
        store.clean_run_specs()
        cache_path = self._cache_path()
        if cache_path.is_file():
            try:
                cached = json.loads(cache_path.read_text(encoding="utf-8"))
            except (OSError, ValueError):
                # ValueError covers both JSONDecodeError and undecodable
                # bytes; a corrupt cache only costs a full rescan.
                cached = None
            if isinstance(cached, dict):
                self._scan_cache = cached
        return {
            "kernel_version": starforge.__version__,
            "python": sys.version.split()[0],
            "env_fingerprint": env_fingerprint(self.workspace),
        }

    def rpc_index_scan(self, params: dict[str, Any]) -> dict[str, Any]:
        """Rescan the workspace, persist the scan cache and return the palette."""
        workspace = self._require_workspace()
        index, cache = scan_workspace(workspace, self._scan_cache)
        self.index, self._scan_cache = index, cache
        cache_path = self._cache_path()
        try:
            cache_path.parent.mkdir(parents=True, exist_ok=True)
            cache_path.write_text(json.dumps(cache), encoding="utf-8")
        except OSError:
            pass  # cache is an optimization; never fail a scan over it
        return self._palette(index)

    def rpc_pipeline_hashes(self, params: dict[str, Any]) -> dict[str, Any]:
        """Return the computed state of every node in ``params["doc"]``."""
        workspace = self._require_workspace()
        doc = PipelineDoc.from_dict(params["doc"])
        index = self._ensure_index()
        states = compute_states(
            doc, index, env_fingerprint(workspace), self._store().exists, tier=self._tier()
        )
        return {"nodes": {nid: state.to_dict() for nid, state in states.items()}}

    def rpc_run_start(self, params: dict[str, Any]) -> dict[str, Any]:
        """Write a run spec and spawn a worker subprocess to execute it.

        Args:
            params: Must contain ``doc``; may contain ``run_id`` and ``target``.

        Returns:
            ``{"run_id": ...}`` for the started run.

        Raises:
            ValueError: If a supplied ``run_id`` is not a plain file-name-safe
                string (letters, digits, ``.``, ``_``, ``-``).
            RuntimeError: If the ``run_id`` is already tracked, or the kernel
                is not initialized.
        """
        workspace = self._require_workspace()
        run_id = params.get("run_id") or uuid.uuid4().hex[:12]
        # The id becomes a file name under .forge/cache/runs, so separators
        # and the like must not be able to steer the write elsewhere.
        if not isinstance(run_id, str) or not all(
            ch.isalnum() or ch in "._-" for ch in run_id
        ):
            raise ValueError(f"invalid run_id {run_id!r}")
        if run_id in self._runs:
            # Reusing the id would overwrite the tracked process and make the
            # earlier worker impossible to cancel.
            raise RuntimeError(f"run '{run_id}' is already active")

        doc = PipelineDoc.from_dict(params["doc"])
        index = self._ensure_index()
        states = compute_states(
            doc, index, env_fingerprint(workspace), self._store().exists, tier=self._tier()
        )
        blocks = index.blocks
        referenced: dict[str, Any] = {}
        for node in doc.nodes:
            if node.block not in blocks or node.block.startswith(BUILTIN_PREFIX):
                continue
            block = blocks[node.block]
            referenced[node.block] = {
                "module": block.module,
                "qualname": block.qualname,
                "label": block.label,
                "outputs": block.outputs,
                "source_hash": block.source_hash,
                "optional_params": [
                    p.name for p in block.params if p.optional and not p.has_default
                ],
            }
        spec = {
            "workspace": str(workspace),
            "doc": doc.to_dict(),
            "blocks": referenced,
            "states": {nid: state.to_dict() for nid, state in states.items()},
            "pickle_enabled": bool(self.settings.get("pickle_enabled", False)),
            "target": params.get("target"),
        }
        runs_dir = workspace / ".forge" / "cache" / "runs"
        runs_dir.mkdir(parents=True, exist_ok=True)
        spec_path = runs_dir / f"{run_id}.json"
        spec_path.write_text(json.dumps(spec, default=repr), encoding="utf-8")

        env = dict(os.environ)
        package_root = str(Path(starforge.__file__).resolve().parent.parent)
        inherited = env.get("PYTHONPATH")
        # Only append the inherited value when there is one, so the variable
        # never ends in a dangling separator (an empty search-path entry).
        env["PYTHONPATH"] = package_root + os.pathsep + inherited if inherited else package_root
        try:
            proc = subprocess.Popen(
                [sys.executable, "-m", "starforge.kernel.worker", str(spec_path)],
                cwd=str(workspace),
                env=env,
                # DEVNULL is load-bearing: inheriting our stdin (the protocol
                # pipe) deadlocks the child's interpreter bootstrap on Windows
                # while our main thread is blocked reading that same handle —
                # and user code calling input() must never eat protocol frames.
                stdin=subprocess.DEVNULL,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
        except OSError:
            # No worker means no pump thread will ever remove the spec file.
            try:
                spec_path.unlink(missing_ok=True)
            except OSError:
                pass
            raise
        self._runs[run_id] = proc
        threading.Thread(target=self._pump_events, args=(run_id, proc), daemon=True).start()
        threading.Thread(target=self._pump_logs, args=(run_id, proc), daemon=True).start()
        return {"run_id": run_id}

    def _pump_events(self, run_id: str, proc: subprocess.Popen) -> None:
        """Forward worker events as ``run/event`` notifications, then clean up.

        Runs on a daemon thread for the lifetime of one worker. If the worker
        exits without a ``run_finished`` event, one is synthesized with
        status ``cancelled`` or ``failed``.
        """
        assert proc.stdout is not None
        saw_terminal = False
        for raw in proc.stdout:
            try:
                event = json.loads(raw.decode("utf-8", errors="replace"))
            except json.JSONDecodeError:
                continue
            if not isinstance(event, dict):
                # A stray scalar/array line is not an event; dropping it keeps
                # this thread alive so the terminal event and cleanup still run.
                continue
            saw_terminal = saw_terminal or event.get("event") == "run_finished"
            self.notify("run/event", {"run_id": run_id, **event})
        code = proc.wait()
        if not saw_terminal:
            # rpc_run_cancel pops the run before terminating it; a missing
            # entry with no terminal event means the worker was killed by us.
            status = "cancelled" if run_id not in self._runs else "failed"
            self.notify(
                "run/event",
                {"run_id": run_id, "event": "run_finished", "status": status, "exit_code": code},
            )
        self._runs.pop(run_id, None)
        # Post-run hygiene: the one-shot spec file and, if over budget, LRU
        # checkpoint eviction.
        try:
            spec_path = self._require_workspace() / ".forge" / "cache" / "runs" / f"{run_id}.json"
            spec_path.unlink(missing_ok=True)
        except (OSError, RuntimeError):
            pass
        self._run_gc()

    def _pump_logs(self, run_id: str, proc: subprocess.Popen) -> None:
        """Forward each non-empty worker stderr line as a ``log`` notification."""
        assert proc.stderr is not None
        for raw in proc.stderr:
            text = raw.decode("utf-8", errors="replace").rstrip()
            if text:
                self.notify("log", {"run_id": run_id, "text": text})

    def rpc_run_cancel(self, params: dict[str, Any]) -> dict[str, Any]:
        """Terminate an active run, killing it if it ignores the request for 3 s.

        Returns:
            ``{"cancelled": True}`` on success, otherwise ``cancelled: False``
            with a ``reason``.
        """
        run_id = params["run_id"]
        # Popping first is what lets the event pump report "cancelled".
        proc = self._runs.pop(run_id, None)
        if proc is None or proc.poll() is not None:
            return {"cancelled": False, "reason": "run not active"}
        proc.terminate()
        try:
            proc.wait(timeout=3)
        except subprocess.TimeoutExpired:
            proc.kill()
        return {"cancelled": True}

    def rpc_results_manifest(self, params: dict[str, Any]) -> dict[str, Any]:
        """Return the stored provenance for ``params["history_hash"]``.

        Raises:
            FileNotFoundError: If no checkpoint exists for that hash.
        """
        store = self._store()
        history_hash = params["history_hash"]
        if not store.exists(history_hash):
            raise FileNotFoundError(f"no checkpoint for hash {history_hash[:16]}")
        return store.read_provenance(history_hash)

    def rpc_maintenance_gc(self, params: dict[str, Any]) -> dict[str, Any]:
        """Manual checkpoint eviction; `max_mb` overrides the configured cap
        (pass 0 to clear everything).

        Raises:
            ValueError: If ``max_mb`` is NaN or not numeric.
        """
        store = self._store()
        store.clean_run_specs(max_age_seconds=0)
        if "max_mb" in params:
            max_mb = float(params["max_mb"])
            if max_mb != max_mb:
                # max(0.0, nan) is 0.0, so NaN would silently mean "clear
                # everything"; refuse it instead.
                raise ValueError("max_mb must be a number")
            max_bytes = int(max(0.0, max_mb) * 1024 * 1024)
        else:
            max_bytes = self._max_checkpoint_bytes()
        return store.gc(max_bytes)

    def rpc_results_figures(self, params: dict[str, Any]) -> dict[str, Any]:
        """Batch lookup of display artifacts for node thumbnails: for each
        existing checkpoint, its dir plus side-figures and output artifacts.

        Hashes without a checkpoint, with unreadable provenance, or without
        any artifact are left out of the result.
        """
        store = self._store()
        results: dict[str, Any] = {}
        # `or []` also tolerates an explicit null for the list.
        for history_hash in params.get("history_hashes") or []:
            if not store.exists(history_hash):
                continue
            try:
                provenance = store.read_provenance(history_hash)
            except (OSError, json.JSONDecodeError):
                continue
            artifacts = list(provenance.get("figures", []))
            artifacts.extend(
                entry["artifact"]
                for entry in provenance.get("outputs", [])
                if entry.get("artifact")
            )
            if artifacts:
                results[history_hash] = {"dir": provenance.get("dir"), "artifacts": artifacts}
        return {"checkpoints": results}
344
+
345
+
346
def main() -> None:
    """Run a kernel over the process's binary stdin/stdout until it stops."""
    kernel = Kernel(sys.stdin.buffer, sys.stdout.buffer)
    kernel.serve_forever()


# Entry point for ``python -m starforge.kernel``.
if __name__ == "__main__":
    main()
@@ -0,0 +1,66 @@
1
+ """Per-run worker subprocess: ``python -m starforge.kernel.worker <spec.json>``.
2
+
3
+ This is the ONLY *Forge process that imports user code. It lives for exactly
4
+ one run, so cancellation is a process kill, leaked memory returns to the OS,
5
+ and just-edited functions are picked up by fresh imports — no reload hacks.
6
+
7
+ Protocol: NDJSON events on the real stdout. ``sys.stdout`` is rebound to
8
+ stderr before any user code runs, so stray ``print()`` calls in user blocks
9
+ become log lines instead of corrupting the event stream.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ import os
16
+ from pathlib import Path
17
+ import sys
18
+
19
+
20
def main(argv: list[str]) -> int:
    """Execute the run described by a spec file and return the exit code.

    Args:
        argv: Full argument vector; ``argv[1]`` is the path of the run spec.

    Returns:
        ``2`` on a usage error, ``0`` when the pipeline reports
        ``"completed"``, and ``1`` for any other status or an exception.
    """
    if len(argv) != 2:
        print("usage: python -m starforge.kernel.worker <run-spec.json>", file=sys.stderr)
        return 2

    # Before any user import: headless matplotlib (no GUI windows, no Tk
    # errors) — plt.show() becomes a no-op and the figure sweep collects
    # what the user "showed".
    os.environ.setdefault("MPLBACKEND", "Agg")
    spec_text = Path(argv[1]).read_text(encoding="utf-8")
    spec = json.loads(spec_text)

    # Keep a handle on the real stdout for protocol frames, then point
    # sys.stdout at stderr so user prints become logs, never protocol frames.
    protocol_stream = sys.stdout
    sys.stdout = sys.stderr

    def emit(event: dict) -> None:
        """Write one event as an NDJSON line on the protocol stream."""
        frame = json.dumps(event, default=repr)
        protocol_stream.write(frame + "\n")
        protocol_stream.flush()

    emit({"event": "worker_started", "python": sys.version.split()[0]})

    # Make the workspace importable ahead of everything else on sys.path.
    workspace = spec["workspace"]
    sys.path.insert(0, workspace)

    from starforge.core.checkpoints import CheckpointStore
    from starforge.core.runner import run_pipeline
    from starforge.core.spec import PipelineDoc

    try:
        outcome = run_pipeline(
            doc=PipelineDoc.from_dict(spec["doc"]),
            blocks=spec["blocks"],
            states=spec["states"],
            store=CheckpointStore(workspace),
            emit=emit,
            pickle_enabled=bool(spec.get("pickle_enabled", False)),
            target=spec.get("target"),
        )
    except Exception:
        import traceback

        # Report the failure as the terminal event, traceback included.
        emit({"event": "run_finished", "status": "failed", "traceback": traceback.format_exc()})
        return 1

    if outcome == "completed":
        return 0
    return 1


# Entry point for ``python -m starforge.kernel.worker <spec.json>``.
if __name__ == "__main__":
    sys.exit(main(sys.argv))