starforge-kernel 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- starforge_kernel-0.1.0/PKG-INFO +76 -0
- starforge_kernel-0.1.0/README.md +59 -0
- starforge_kernel-0.1.0/pyproject.toml +31 -0
- starforge_kernel-0.1.0/setup.cfg +4 -0
- starforge_kernel-0.1.0/src/starforge/__init__.py +86 -0
- starforge_kernel-0.1.0/src/starforge/core/__init__.py +0 -0
- starforge_kernel-0.1.0/src/starforge/core/checkpoints.py +178 -0
- starforge_kernel-0.1.0/src/starforge/core/figures.py +119 -0
- starforge_kernel-0.1.0/src/starforge/core/previews.py +109 -0
- starforge_kernel-0.1.0/src/starforge/core/provenance.py +192 -0
- starforge_kernel-0.1.0/src/starforge/core/runner.py +293 -0
- starforge_kernel-0.1.0/src/starforge/core/serializers.py +141 -0
- starforge_kernel-0.1.0/src/starforge/core/spec.py +126 -0
- starforge_kernel-0.1.0/src/starforge/index/__init__.py +9 -0
- starforge_kernel-0.1.0/src/starforge/index/scanner.py +487 -0
- starforge_kernel-0.1.0/src/starforge/kernel/__init__.py +0 -0
- starforge_kernel-0.1.0/src/starforge/kernel/__main__.py +3 -0
- starforge_kernel-0.1.0/src/starforge/kernel/server.py +351 -0
- starforge_kernel-0.1.0/src/starforge/kernel/worker.py +66 -0
- starforge_kernel-0.1.0/src/starforge/mcp.py +283 -0
- starforge_kernel-0.1.0/src/starforge_kernel.egg-info/PKG-INFO +76 -0
- starforge_kernel-0.1.0/src/starforge_kernel.egg-info/SOURCES.txt +33 -0
- starforge_kernel-0.1.0/src/starforge_kernel.egg-info/dependency_links.txt +1 -0
- starforge_kernel-0.1.0/src/starforge_kernel.egg-info/requires.txt +9 -0
- starforge_kernel-0.1.0/src/starforge_kernel.egg-info/top_level.txt +1 -0
- starforge_kernel-0.1.0/tests/test_decorator.py +43 -0
- starforge_kernel-0.1.0/tests/test_figures.py +146 -0
- starforge_kernel-0.1.0/tests/test_indexer.py +142 -0
- starforge_kernel-0.1.0/tests/test_kernel_protocol.py +204 -0
- starforge_kernel-0.1.0/tests/test_m1_features.py +182 -0
- starforge_kernel-0.1.0/tests/test_mcp_module.py +60 -0
- starforge_kernel-0.1.0/tests/test_previews.py +55 -0
- starforge_kernel-0.1.0/tests/test_provenance.py +86 -0
- starforge_kernel-0.1.0/tests/test_runner_end_to_end.py +271 -0
- starforge_kernel-0.1.0/tests/test_serializers.py +61 -0
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: starforge-kernel
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: *Forge — pipeline canvas, checkpointing, and stale/hydrate execution for the repo you already have open
|
|
5
|
+
Author: Jonathan Potter
|
|
6
|
+
License-Expression: Apache-2.0
|
|
7
|
+
Project-URL: Homepage, https://github.com/Jonpot/forge
|
|
8
|
+
Requires-Python: >=3.10
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
Provides-Extra: dev
|
|
11
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
12
|
+
Requires-Dist: pandas>=2.0; extra == "dev"
|
|
13
|
+
Requires-Dist: pyarrow>=15.0; extra == "dev"
|
|
14
|
+
Requires-Dist: numpy>=1.26; extra == "dev"
|
|
15
|
+
Provides-Extra: mcp
|
|
16
|
+
Requires-Dist: mcp>=1.8; extra == "mcp"
|
|
17
|
+
|
|
18
|
+
# *Forge (`starforge`)
|
|
19
|
+
|
|
20
|
+
Forge's canvas — checkpointing, provenance, stale/hydrate execution — as a VS Code
|
|
21
|
+
extension over the repo you already have open. Blocks are ordinary Python functions
|
|
22
|
+
tagged with `@block`. See [DESIGN.md](DESIGN.md) for the full design.
|
|
23
|
+
|
|
24
|
+
## Try it (M0)
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
# 1. Install the kernel + decorator into the venv your target repo uses
|
|
28
|
+
pip install -e <forge-repo>/starforge
|
|
29
|
+
|
|
30
|
+
# 2. Build the extension
|
|
31
|
+
cd <forge-repo>/starforge/vscode
|
|
32
|
+
npm install && npm run build
|
|
33
|
+
|
|
34
|
+
# 3. Open starforge/vscode in VS Code and press F5 (extension dev host).
|
|
35
|
+
# In the dev-host window, open any Python repo.
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
In your repo:
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
# analysis/blocks.py
|
|
42
|
+
import matplotlib.pyplot as plt
|
|
43
|
+
from starforge import block
|
|
44
|
+
|
|
45
|
+
@block(category="IO")
|
|
46
|
+
def make_numbers(n: int = 5) -> dict:
|
|
47
|
+
return {"values": list(range(1, n + 1))}
|
|
48
|
+
|
|
49
|
+
@block
|
|
50
|
+
def scale(data: dict, factor: float = 2.0) -> dict:
|
|
51
|
+
return {"values": [v * factor for v in data["values"]]}
|
|
52
|
+
|
|
53
|
+
@block
|
|
54
|
+
def plot(data: dict) -> dict:
|
|
55
|
+
plt.plot(data["values"])
|
|
56
|
+
plt.show() # rendered inline on the canvas node
|
|
57
|
+
return data
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
Save, run **“*Forge: New Pipeline”**, drag the blocks from the palette, wire
|
|
61
|
+
`output → data`, hit **▶ Run**. Run again — instant, everything reused. Edit
|
|
62
|
+
`scale`, watch it (and only it) go stale.
|
|
63
|
+
|
|
64
|
+
## Layout
|
|
65
|
+
|
|
66
|
+
| Path | What |
|
|
67
|
+
|---|---|
|
|
68
|
+
| `src/starforge/__init__.py` | the `@block` decorator and the `progress()` helper — zero-dep, the only things user code touches |
|
|
69
|
+
| `src/starforge/index/` | static AST indexer (discovery, import graph, incremental cache) |
|
|
70
|
+
| `src/starforge/core/` | doc schema, history hashing, serializers, checkpoint store, runner |
|
|
71
|
+
| `src/starforge/kernel/` | stdio JSON-RPC kernel + per-run worker subprocess |
|
|
72
|
+
| `vscode/` | the extension (TS host + React Flow webview) |
|
|
73
|
+
| `tests/` | headless M0 proof — `python -m pytest starforge/tests` |
|
|
74
|
+
|
|
75
|
+
State lives in the target repo under `.forge/` — `pipelines/` is committable,
|
|
76
|
+
`checkpoints/` and `cache/` are auto-gitignored.
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# *Forge (`starforge`)
|
|
2
|
+
|
|
3
|
+
Forge's canvas — checkpointing, provenance, stale/hydrate execution — as a VS Code
|
|
4
|
+
extension over the repo you already have open. Blocks are ordinary Python functions
|
|
5
|
+
tagged with `@block`. See [DESIGN.md](DESIGN.md) for the full design.
|
|
6
|
+
|
|
7
|
+
## Try it (M0)
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
# 1. Install the kernel + decorator into the venv your target repo uses
|
|
11
|
+
pip install -e <forge-repo>/starforge
|
|
12
|
+
|
|
13
|
+
# 2. Build the extension
|
|
14
|
+
cd <forge-repo>/starforge/vscode
|
|
15
|
+
npm install && npm run build
|
|
16
|
+
|
|
17
|
+
# 3. Open starforge/vscode in VS Code and press F5 (extension dev host).
|
|
18
|
+
# In the dev-host window, open any Python repo.
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
In your repo:
|
|
22
|
+
|
|
23
|
+
```python
|
|
24
|
+
# analysis/blocks.py
|
|
25
|
+
import matplotlib.pyplot as plt
|
|
26
|
+
from starforge import block
|
|
27
|
+
|
|
28
|
+
@block(category="IO")
|
|
29
|
+
def make_numbers(n: int = 5) -> dict:
|
|
30
|
+
return {"values": list(range(1, n + 1))}
|
|
31
|
+
|
|
32
|
+
@block
|
|
33
|
+
def scale(data: dict, factor: float = 2.0) -> dict:
|
|
34
|
+
return {"values": [v * factor for v in data["values"]]}
|
|
35
|
+
|
|
36
|
+
@block
|
|
37
|
+
def plot(data: dict) -> dict:
|
|
38
|
+
plt.plot(data["values"])
|
|
39
|
+
plt.show() # rendered inline on the canvas node
|
|
40
|
+
return data
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Save, run **“*Forge: New Pipeline”**, drag the blocks from the palette, wire
|
|
44
|
+
`output → data`, hit **▶ Run**. Run again — instant, everything reused. Edit
|
|
45
|
+
`scale`, watch it (and only it) go stale.
|
|
46
|
+
|
|
47
|
+
## Layout
|
|
48
|
+
|
|
49
|
+
| Path | What |
|
|
50
|
+
|---|---|
|
|
51
|
+
| `src/starforge/__init__.py` | the `@block` decorator and the `progress()` helper — zero-dep, the only things user code touches |
|
|
52
|
+
| `src/starforge/index/` | static AST indexer (discovery, import graph, incremental cache) |
|
|
53
|
+
| `src/starforge/core/` | doc schema, history hashing, serializers, checkpoint store, runner |
|
|
54
|
+
| `src/starforge/kernel/` | stdio JSON-RPC kernel + per-run worker subprocess |
|
|
55
|
+
| `vscode/` | the extension (TS host + React Flow webview) |
|
|
56
|
+
| `tests/` | headless M0 proof — `python -m pytest starforge/tests` |
|
|
57
|
+
|
|
58
|
+
State lives in the target repo under `.forge/` — `pipelines/` is committable,
|
|
59
|
+
`checkpoints/` and `cache/` are auto-gitignored.
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
# PyPI name `starforge` is squatted by a dormant Galaxy tool (see DESIGN.md §4);
|
|
7
|
+
# the import name is still `starforge`.
|
|
8
|
+
name = "starforge-kernel"
|
|
9
|
+
version = "0.1.0"
|
|
10
|
+
description = "*Forge — pipeline canvas, checkpointing, and stale/hydrate execution for the repo you already have open"
|
|
11
|
+
readme = "README.md"
|
|
12
|
+
authors = [{ name = "Jonathan Potter" }]
|
|
13
|
+
license = "Apache-2.0"
|
|
14
|
+
requires-python = ">=3.10"
|
|
15
|
+
# Intentionally empty: the decorator must import in microseconds inside user
|
|
16
|
+
# production code, and the kernel runs stdlib-only. pandas/numpy/pyarrow are
|
|
17
|
+
# probed lazily in workers and used only if the workspace env provides them.
|
|
18
|
+
dependencies = []
|
|
19
|
+
|
|
20
|
+
[project.urls]
|
|
21
|
+
Homepage = "https://github.com/Jonpot/forge"
|
|
22
|
+
|
|
23
|
+
[project.optional-dependencies]
|
|
24
|
+
dev = ["pytest>=8.0", "pandas>=2.0", "pyarrow>=15.0", "numpy>=1.26"]
|
|
25
|
+
mcp = ["mcp>=1.8"]
|
|
26
|
+
|
|
27
|
+
[tool.setuptools.packages.find]
|
|
28
|
+
where = ["src"]
|
|
29
|
+
|
|
30
|
+
[tool.pytest.ini_options]
|
|
31
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""*Forge — pipeline canvas for the repo you already have open.
|
|
2
|
+
|
|
3
|
+
This top-level module is the entire public surface that user code touches.
|
|
4
|
+
It must import in microseconds and depend on nothing: the decorator lives in
|
|
5
|
+
production codebases and has to be free. Everything heavy (indexer, engine,
|
|
6
|
+
kernel) lives in submodules that only *Forge itself* imports.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
__version__ = "0.1.0"
|
|
12
|
+
|
|
13
|
+
__all__ = ["block", "progress", "BLOCK_ATTR"]
|
|
14
|
+
|
|
15
|
+
#: Attribute set on decorated functions. The AST indexer matches the decorator
#: syntactically and never imports user code; this runtime tag exists so user
#: code and future runtime introspection can also recognize blocks.
BLOCK_ATTR = "__starforge_block__"


def block(fn=None, *, label=None, category=None, outputs=None):
    """Register a function as a *Forge block.

    Usable bare or with keyword arguments::

        @block
        def clean(raw: pd.DataFrame) -> pd.DataFrame: ...

        @block(label="Clean AUC Matrix", category="QC", outputs=("clean", "stats"))
        def clean_auc(raw, min_coverage: float = 0.8): ...

    The decorated function is returned unchanged — behavior under pytest, in
    CI, or in production is identical whether or not *Forge is anywhere near.
    The only side effect is setting the ``BLOCK_ATTR`` attribute on it to a
    dict with the keys ``"label"``, ``"category"`` and ``"outputs"``.

    Args:
        fn: The function being decorated when used bare (``@block``); left as
            None when the decorator is called with keyword arguments.
        label: Palette display name. Defaults to the function name, title-cased.
        category: Palette grouping. Defaults to the defining module's path.
        outputs: Names for multiple return values (function must return a tuple
            of the same length). Defaults to a single output named "output".
            A bare string is treated as one name, not as a sequence of
            characters.

    Returns:
        ``fn`` itself when used bare, otherwise a decorator that tags and
        returns the function it receives.

    Note for palette metadata: the indexer reads ``label``/``category``/
    ``outputs`` from the *source*, so they must be literals at the decoration
    site to appear in the palette.
    """
    if outputs is None:
        output_names = None
    elif isinstance(outputs, str):
        # tuple("stats") would yield ('s', 't', 'a', 't', 's'); a lone string
        # can only sensibly mean a single output name.
        output_names = (outputs,)
    else:
        output_names = tuple(outputs)

    def apply(f):
        setattr(
            f,
            BLOCK_ATTR,
            {
                "label": label,
                "category": category,
                "outputs": output_names,
            },
        )
        return f

    if fn is not None:
        return apply(fn)
    return apply
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
#: Installed by the *Forge run worker around each block call; None everywhere
#: else, which keeps progress() a guaranteed no-op in pytest/CI/production.
_progress_hook = None


def progress(current=None, total=None, label=None):
    """Report block progress to the *Forge canvas.

    Intended to be sprinkled through a block body::

        for i, chunk in enumerate(chunks):
            progress(i + 1, len(chunks), "fitting folds")
            ...

    When no hook is installed (i.e. outside a *Forge run) the call reads one
    module attribute and returns, so it is safe to leave in production code.
    All arguments are optional: (current, total) renders a determinate bar,
    label alone updates the text.
    """
    # Read the module global exactly once so the check and the call see the
    # same object.
    reporter = _progress_hook
    if reporter is None:
        return None
    try:
        reporter(current, total, label)
    except Exception:
        # progress must never break user code
        pass
    return None
|
|
File without changes
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
"""Checkpoint store under ``<workspace>/.forge/checkpoints/``.
|
|
2
|
+
|
|
3
|
+
One directory per history hash (truncated to 32 hex chars — 128 bits — to
|
|
4
|
+
stay friendly to Windows path limits):
|
|
5
|
+
|
|
6
|
+
.forge/checkpoints/<hash32>/
|
|
7
|
+
├── provenance.json # written LAST: its presence marks completeness
|
|
8
|
+
└── outputs/<name>.<ext per serializer>
|
|
9
|
+
|
|
10
|
+
The store also owns ``.forge/.gitignore`` so checkpoints and caches never
|
|
11
|
+
land in the user's repo history while ``pipelines/`` remains committable.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import json
|
|
17
|
+
import os
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
import shutil
|
|
20
|
+
import time
|
|
21
|
+
from typing import Any
|
|
22
|
+
|
|
23
|
+
from starforge.core import figures as figmod
|
|
24
|
+
from starforge.core import previews, serializers
|
|
25
|
+
|
|
26
|
+
FORGE_DIR = ".forge"
|
|
27
|
+
GITIGNORE_BODY = "checkpoints/\ncache/\n"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class CheckpointStore:
|
|
31
|
+
def __init__(self, workspace: str | Path) -> None:
|
|
32
|
+
self.workspace = Path(workspace)
|
|
33
|
+
self.forge_dir = self.workspace / FORGE_DIR
|
|
34
|
+
self.base = self.forge_dir / "checkpoints"
|
|
35
|
+
|
|
36
|
+
def ensure_layout(self) -> None:
|
|
37
|
+
(self.forge_dir / "pipelines").mkdir(parents=True, exist_ok=True)
|
|
38
|
+
(self.forge_dir / "cache").mkdir(parents=True, exist_ok=True)
|
|
39
|
+
self.base.mkdir(parents=True, exist_ok=True)
|
|
40
|
+
gitignore = self.forge_dir / ".gitignore"
|
|
41
|
+
if not gitignore.exists():
|
|
42
|
+
gitignore.write_text(GITIGNORE_BODY, encoding="utf-8")
|
|
43
|
+
|
|
44
|
+
def dir_for(self, history_hash: str) -> Path:
|
|
45
|
+
return self.base / history_hash[:32]
|
|
46
|
+
|
|
47
|
+
def exists(self, history_hash: str) -> bool:
|
|
48
|
+
return (self.dir_for(history_hash) / "provenance.json").is_file()
|
|
49
|
+
|
|
50
|
+
def read_provenance(self, history_hash: str) -> dict[str, Any]:
|
|
51
|
+
path = self.dir_for(history_hash) / "provenance.json"
|
|
52
|
+
return json.loads(path.read_text(encoding="utf-8"))
|
|
53
|
+
|
|
54
|
+
def write(
|
|
55
|
+
self,
|
|
56
|
+
history_hash: str,
|
|
57
|
+
provenance: dict[str, Any],
|
|
58
|
+
outputs: dict[str, Any],
|
|
59
|
+
pickle_enabled: bool = False,
|
|
60
|
+
side_figures: list[Any] | None = None,
|
|
61
|
+
) -> list[dict[str, Any]]:
|
|
62
|
+
"""Persist outputs then provenance (in that order, for atomicity).
|
|
63
|
+
Returns the output manifest, including ephemeral entries.
|
|
64
|
+
|
|
65
|
+
``side_figures`` are figures the block created or showed without
|
|
66
|
+
returning them (plt.show() and friends); they render to artifacts
|
|
67
|
+
recorded under the provenance ``figures`` key."""
|
|
68
|
+
directory = self.dir_for(history_hash)
|
|
69
|
+
outputs_dir = directory / "outputs"
|
|
70
|
+
directory.mkdir(parents=True, exist_ok=True)
|
|
71
|
+
manifest = []
|
|
72
|
+
for name, value in outputs.items():
|
|
73
|
+
entry = serializers.save_value(value, outputs_dir, name, pickle_enabled=pickle_enabled)
|
|
74
|
+
try:
|
|
75
|
+
# Previews ride inside provenance.json so the stdlib-only
|
|
76
|
+
# kernel can serve them without deserializing data. Computed
|
|
77
|
+
# for ephemeral outputs too — their only window is right now.
|
|
78
|
+
if entry.get("artifact"):
|
|
79
|
+
entry["preview"] = {
|
|
80
|
+
"kind": "figure",
|
|
81
|
+
"file": entry["artifact"]["file"],
|
|
82
|
+
"format": entry["artifact"]["kind"],
|
|
83
|
+
}
|
|
84
|
+
else:
|
|
85
|
+
entry["preview"] = previews.build_preview(value)
|
|
86
|
+
except Exception:
|
|
87
|
+
entry["preview"] = {"kind": "text", "text": f"<preview failed for {type(value).__name__}>"}
|
|
88
|
+
manifest.append(entry)
|
|
89
|
+
|
|
90
|
+
rendered_figures: list[dict[str, Any]] = []
|
|
91
|
+
for i, fig in enumerate(side_figures or []):
|
|
92
|
+
try:
|
|
93
|
+
artifact = figmod.render_figure(fig, outputs_dir, f"figure_{i}")
|
|
94
|
+
except Exception:
|
|
95
|
+
artifact = None
|
|
96
|
+
if artifact is not None:
|
|
97
|
+
rendered_figures.append(artifact)
|
|
98
|
+
|
|
99
|
+
record = dict(provenance)
|
|
100
|
+
record["history_hash"] = history_hash
|
|
101
|
+
record["outputs"] = manifest
|
|
102
|
+
record["figures"] = rendered_figures
|
|
103
|
+
record["dir"] = directory.relative_to(self.workspace).as_posix()
|
|
104
|
+
path = directory / "provenance.json"
|
|
105
|
+
tmp = directory / "provenance.json.tmp"
|
|
106
|
+
tmp.write_text(json.dumps(record, indent=2, default=repr), encoding="utf-8")
|
|
107
|
+
tmp.replace(path)
|
|
108
|
+
return manifest
|
|
109
|
+
|
|
110
|
+
def output_entry(self, history_hash: str, name: str) -> dict[str, Any]:
|
|
111
|
+
for entry in self.read_provenance(history_hash).get("outputs", []):
|
|
112
|
+
if entry.get("name") == name:
|
|
113
|
+
return entry
|
|
114
|
+
raise KeyError(f"checkpoint {history_hash[:12]} has no output named '{name}'")
|
|
115
|
+
|
|
116
|
+
def load_output(self, history_hash: str, name: str) -> Any:
|
|
117
|
+
"""Raises serializers.EphemeralValueError for non-persisted outputs."""
|
|
118
|
+
entry = self.output_entry(history_hash, name)
|
|
119
|
+
return serializers.load_value(self.dir_for(history_hash) / "outputs", entry)
|
|
120
|
+
|
|
121
|
+
def is_ephemeral(self, history_hash: str, name: str) -> bool:
|
|
122
|
+
try:
|
|
123
|
+
entry = self.output_entry(history_hash, name)
|
|
124
|
+
except (KeyError, FileNotFoundError, json.JSONDecodeError):
|
|
125
|
+
return True
|
|
126
|
+
return entry.get("serializer") == serializers.EPHEMERAL
|
|
127
|
+
|
|
128
|
+
def touch(self, history_hash: str) -> None:
|
|
129
|
+
"""Bump the checkpoint dir's mtime so LRU GC sees reuse as recency."""
|
|
130
|
+
try:
|
|
131
|
+
os.utime(self.dir_for(history_hash))
|
|
132
|
+
except OSError:
|
|
133
|
+
pass
|
|
134
|
+
|
|
135
|
+
def gc(self, max_bytes: int) -> dict[str, int]:
|
|
136
|
+
"""Least-recently-used eviction down to ``max_bytes`` total.
|
|
137
|
+
|
|
138
|
+
Deleting a live checkpoint is always safe — the node just reads as
|
|
139
|
+
stale and recomputes — so a plain LRU needs no liveness analysis.
|
|
140
|
+
Returns {"freed_bytes", "deleted", "remaining_bytes"}.
|
|
141
|
+
"""
|
|
142
|
+
entries: list[tuple[float, int, Path]] = []
|
|
143
|
+
total = 0
|
|
144
|
+
if self.base.is_dir():
|
|
145
|
+
for directory in self.base.iterdir():
|
|
146
|
+
if not directory.is_dir():
|
|
147
|
+
continue
|
|
148
|
+
size = sum(f.stat().st_size for f in directory.rglob("*") if f.is_file())
|
|
149
|
+
try:
|
|
150
|
+
mtime = directory.stat().st_mtime
|
|
151
|
+
except OSError:
|
|
152
|
+
continue
|
|
153
|
+
entries.append((mtime, size, directory))
|
|
154
|
+
total += size
|
|
155
|
+
|
|
156
|
+
freed = 0
|
|
157
|
+
deleted = 0
|
|
158
|
+
entries.sort() # oldest first
|
|
159
|
+
for _mtime, size, directory in entries:
|
|
160
|
+
if total - freed <= max_bytes:
|
|
161
|
+
break
|
|
162
|
+
shutil.rmtree(directory, ignore_errors=True)
|
|
163
|
+
freed += size
|
|
164
|
+
deleted += 1
|
|
165
|
+
return {"freed_bytes": freed, "deleted": deleted, "remaining_bytes": total - freed}
|
|
166
|
+
|
|
167
|
+
def clean_run_specs(self, max_age_seconds: float = 86400.0) -> None:
|
|
168
|
+
"""Run-spec files are one-shot worker inputs; sweep the stale ones."""
|
|
169
|
+
runs_dir = self.forge_dir / "cache" / "runs"
|
|
170
|
+
if not runs_dir.is_dir():
|
|
171
|
+
return
|
|
172
|
+
cutoff = time.time() - max_age_seconds
|
|
173
|
+
for spec in runs_dir.glob("*.json"):
|
|
174
|
+
try:
|
|
175
|
+
if spec.stat().st_mtime < cutoff:
|
|
176
|
+
spec.unlink()
|
|
177
|
+
except OSError:
|
|
178
|
+
continue
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""Figure capture and artifact rendering.
|
|
2
|
+
|
|
3
|
+
The notebook muscle memory is ``plt.plot(...); plt.show()`` — or no show()
|
|
4
|
+
at all. The worker honors it with zero code changes: matplotlib runs on the
|
|
5
|
+
Agg backend, and :func:`capture` sweeps every figure that exists after the
|
|
6
|
+
block call that didn't exist before (``plt.show`` is a no-op under Agg, so
|
|
7
|
+
"shown" figures are still open when we sweep). Plotly's ``fig.show()`` is
|
|
8
|
+
intercepted by patching ``plotly.io.show`` while the block runs.
|
|
9
|
+
|
|
10
|
+
Captured and returned figures render to checkpoint artifacts — matplotlib →
|
|
11
|
+
PNG, plotly → self-contained HTML — and are closed afterward so a long run
|
|
12
|
+
never accumulates canvases.
|
|
13
|
+
|
|
14
|
+
Import discipline: stdlib-only at import time; matplotlib/plotly are only
|
|
15
|
+
touched when the user's process already loaded them.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
from contextlib import contextmanager
|
|
21
|
+
from dataclasses import dataclass, field
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
import sys
|
|
24
|
+
from typing import Any, Iterator
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _pyplot() -> Any | None:
|
|
28
|
+
return sys.modules.get("matplotlib.pyplot")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _root_module(value: Any) -> str:
|
|
32
|
+
return type(value).__module__.split(".")[0]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class CapturedFigures:
    """Figures collected during one block call, grouped by library."""

    # Matplotlib figures that appeared during the call.
    matplotlib: list[Any] = field(default_factory=list)
    # Plotly figures passed to the intercepted ``show``.
    plotly: list[Any] = field(default_factory=list)

    def all_objects(self) -> list[Any]:
        """Return a new list: matplotlib figures first, then plotly ones."""
        combined = list(self.matplotlib)
        combined.extend(self.plotly)
        return combined
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@contextmanager
def capture() -> Iterator[CapturedFigures]:
    """Context manager that gathers figures produced while it is active.

    Matplotlib: every figure number present on exit that was not present on
    entry is collected (this also covers figures created during the block
    module's first import, since the runner imports inside this context).
    Plotly: ``plotly.io.show`` is swapped for a recorder for the duration,
    but only if ``plotly.io`` was already imported on entry.
    """
    collected = CapturedFigures()

    pyplot = _pyplot()
    known_numbers: set[int] = set() if pyplot is None else set(pyplot.get_fignums())

    plotly_io = sys.modules.get("plotly.io")
    saved_show = None if plotly_io is None else getattr(plotly_io, "show", None)
    # saved_show can only be non-None when plotly_io is loaded and has show().
    intercepting = saved_show is not None
    if intercepting:

        def _record(fig: Any, *args: Any, **kwargs: Any) -> None:
            collected.plotly.append(fig)

        plotly_io.show = _record

    try:
        yield collected
    finally:
        if intercepting:
            plotly_io.show = saved_show
        pyplot = _pyplot()  # may have been imported during the call
        if pyplot is not None:
            fresh_numbers = [n for n in pyplot.get_fignums() if n not in known_numbers]
            for number in fresh_numbers:
                collected.matplotlib.append(pyplot.figure(number))
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def as_figure(value: Any) -> Any | None:
    """Map ``value`` to something renderable, or None if it is not a figure.

    Matplotlib objects with ``savefig`` are returned as-is; other matplotlib
    objects (Axes, as returned by ``sns.heatmap`` et al.) resolve to their
    ``.figure`` parent when that has ``savefig``. Plotly objects qualify when
    they expose ``write_html``.
    """
    package = _root_module(value)
    if package == "plotly":
        return value if hasattr(value, "write_html") else None
    if package != "matplotlib":
        return None
    if hasattr(value, "savefig"):
        return value
    owner = getattr(value, "figure", None)  # Axes and friends
    if owner is not None and hasattr(owner, "savefig"):
        return owner
    return None
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def render_figure(value: Any, directory: Path, basename: str) -> dict[str, Any] | None:
    """Write ``value`` as ``directory/basename.png`` (matplotlib) or
    ``directory/basename.html`` (anything else ``as_figure`` accepts).

    Returns the artifact entry ``{"file", "kind"}``, or None when ``value``
    is not a figure. ``directory`` is created on demand.
    """
    renderable = as_figure(value)
    if renderable is None:
        return None
    directory.mkdir(parents=True, exist_ok=True)
    if _root_module(renderable) != "matplotlib":
        html_name = f"{basename}.html"
        renderable.write_html(directory / html_name, include_plotlyjs=True, full_html=True)
        return {"file": html_name, "kind": "html"}
    png_name = f"{basename}.png"
    renderable.savefig(
        directory / png_name,
        dpi=110,
        bbox_inches="tight",
        facecolor=renderable.get_facecolor(),
    )
    return {"file": png_name, "kind": "image"}
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def close_figures(figures: list[Any]) -> None:
    """Close every matplotlib figure in ``figures`` via ``pyplot.close``.

    Does nothing when pyplot is not loaded. Non-matplotlib entries are
    skipped, and a failure to close one figure does not stop the rest.
    """
    pyplot = _pyplot()
    if pyplot is None:
        return
    for candidate in figures:
        if _root_module(candidate) != "matplotlib":
            continue
        try:
            pyplot.close(candidate)
        except Exception:
            pass
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""Cropped, JSON-safe output previews, computed at checkpoint-write time.
|
|
2
|
+
|
|
3
|
+
Previews are precomputed artifacts stored inside ``provenance.json`` — the
|
|
4
|
+
kernel serves them by reading a file, never by deserializing data (it stays
|
|
5
|
+
stdlib-only and instant). Because they're built while the value is in the
|
|
6
|
+
worker's hands, even EPHEMERAL outputs get a preview of their last run.
|
|
7
|
+
|
|
8
|
+
Everything emitted here must survive strict JSON.parse on the TypeScript
|
|
9
|
+
side: NaN/Infinity are stringified, containers are size-capped, and unknown
|
|
10
|
+
objects fall back to repr.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
# Table / array cropping: rows (and each ndarray axis) and DataFrame columns.
MAX_ROWS = 8
MAX_COLS = 10

# Plain-container cropping: entries kept per dict/list and nesting depth.
MAX_ITEMS = 50
MAX_DEPTH = 5

# Text caps: one cell or dict key, a repr fallback, and an encoded JSON value.
MAX_CELL_CHARS = 120
MAX_TEXT_CHARS = 600
MAX_VALUE_CHARS = 2000
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _cell(value: Any) -> Any:
|
|
28
|
+
"""One scalar table/array cell, strict-JSON safe."""
|
|
29
|
+
if isinstance(value, bool) or value is None:
|
|
30
|
+
return value
|
|
31
|
+
if isinstance(value, int):
|
|
32
|
+
return value
|
|
33
|
+
if isinstance(value, float):
|
|
34
|
+
return value if value == value and abs(value) != float("inf") else str(value)
|
|
35
|
+
text = value if isinstance(value, str) else repr(value)
|
|
36
|
+
return text[:MAX_CELL_CHARS] + ("…" if len(text) > MAX_CELL_CHARS else "")
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _sanitize(value: Any, depth: int = 0) -> Any:
    """Recursively crop ``value`` into a JSON-safe structure.

    Dicts, lists and tuples keep at most ``MAX_ITEMS`` entries and gain a
    "… more" marker when cropped; dict keys are stringified and cut to
    ``MAX_CELL_CHARS``. Anything else, or anything at ``MAX_DEPTH`` or
    deeper, is handed to ``_cell``.
    """
    if depth >= MAX_DEPTH or not isinstance(value, (dict, list, tuple)):
        return _cell(value)

    deeper = depth + 1
    if isinstance(value, dict):
        kept_pairs = list(value.items())[:MAX_ITEMS]
        cleaned: dict[str, Any] = {}
        for key, item in kept_pairs:
            name = str(key)[:MAX_CELL_CHARS]
            cleaned[name] = _sanitize(item, deeper)
        if len(value) > MAX_ITEMS:
            cleaned["…"] = f"+{len(value) - MAX_ITEMS} more"
        return cleaned

    cleaned_items = [_sanitize(item, deeper) for item in value[:MAX_ITEMS]]
    if len(value) > MAX_ITEMS:
        cleaned_items.append(f"… +{len(value) - MAX_ITEMS} more")
    return cleaned_items
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _root_type_module(value: Any) -> str:
|
|
57
|
+
return type(value).__module__.split(".")[0]
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def build_preview(value: Any) -> dict[str, Any]:
    """Build a small JSON-safe preview dict for ``value``.

    The ``"kind"`` key tells the consumer what it got:

    * ``"table"`` — pandas DataFrame/Series, cropped to ``MAX_ROWS`` x
      ``MAX_COLS``.
    * ``"array"`` — numpy ndarray, with dtype, shape and a corner slice.
    * ``"value"`` — numpy scalars and plain Python data that encodes to at
      most ``MAX_VALUE_CHARS`` of JSON.
    * ``"text"`` — oversized plain data (truncated JSON) or the ``repr`` of
      any other object, truncated to ``MAX_TEXT_CHARS``.

    pandas/numpy are imported only when the value's type comes from them.
    """
    package = _root_type_module(value)

    if package == "pandas":
        import pandas as pd

        table = value.to_frame() if isinstance(value, pd.Series) else value
        if isinstance(table, pd.DataFrame):
            shown_columns = [str(col) for col in table.columns[:MAX_COLS]]
            window = table.iloc[:MAX_ROWS, :MAX_COLS]
            n_rows, n_cols = table.shape[0], table.shape[1]
            index_cells = [_cell(label) for label in window.index.tolist()]
            row_cells = [
                [_cell(item) for item in record]
                for record in window.itertuples(index=False, name=None)
            ]
            return {
                "kind": "table",
                "shape": [int(n_rows), int(n_cols)],
                "columns": shown_columns,
                "columns_truncated": n_cols > MAX_COLS,
                "index": index_cells,
                "rows": row_cells,
            }

    if package == "numpy":
        import numpy as np

        if isinstance(value, np.ndarray):
            corner: Any
            if value.ndim == 0:
                corner = _cell(value.item())
            else:
                # Same leading slice on every axis.
                window_slices = (slice(0, MAX_ROWS),) * value.ndim
                corner = _sanitize(value[window_slices].tolist())
            return {
                "kind": "array",
                "dtype": str(value.dtype),
                "shape": list(value.shape),
                "corner": corner,
            }
        if isinstance(value, np.generic):
            return {"kind": "value", "value": _cell(value.item())}

    if value is None or isinstance(value, (dict, list, tuple, str, int, float, bool)):
        cleaned = _sanitize(value)
        try:
            encoded = json.dumps(cleaned, allow_nan=False)
        except (TypeError, ValueError):
            # Not encodable after all; fall through to the repr preview.
            pass
        else:
            if len(encoded) > MAX_VALUE_CHARS:
                return {"kind": "text", "text": encoded[:MAX_VALUE_CHARS] + "…"}
            return {"kind": "value", "value": cleaned}

    # Arbitrary objects: an honest repr, marked as text rather than data.
    described = repr(value)
    suffix = "…" if len(described) > MAX_TEXT_CHARS else ""
    return {"kind": "text", "text": described[:MAX_TEXT_CHARS] + suffix}
|