probelock 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- probelock/__init__.py +8 -0
- probelock/cli.py +540 -0
- probelock/clients.py +465 -0
- probelock/data/format_probes.json +12 -0
- probelock/data/restraint_probes.json +14 -0
- probelock/diff.py +91 -0
- probelock/lockfile.py +18 -0
- probelock/models.py +147 -0
- probelock/probes.py +366 -0
- probelock/runner.py +69 -0
- probelock/scoring.py +154 -0
- probelock/stats.py +46 -0
- probelock-0.1.0.dist-info/METADATA +257 -0
- probelock-0.1.0.dist-info/RECORD +17 -0
- probelock-0.1.0.dist-info/WHEEL +4 -0
- probelock-0.1.0.dist-info/entry_points.txt +2 -0
- probelock-0.1.0.dist-info/licenses/LICENSE +201 -0
probelock/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""probelock — a capability lockfile for local models.
|
|
2
|
+
|
|
3
|
+
Derive deterministic capability probes from your agent's own tool schemas, score
|
|
4
|
+
them with no LLM judge, and gate CI on within-model regression when you swap a
|
|
5
|
+
model version, quantization, or runtime.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
__version__ = "0.1.0"
|
probelock/cli.py
ADDED
|
@@ -0,0 +1,540 @@
|
|
|
1
|
+
"""probelock command line interface."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import datetime as _dt
|
|
6
|
+
import html as _html
|
|
7
|
+
import json
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Optional
|
|
10
|
+
|
|
11
|
+
import typer
|
|
12
|
+
from rich.console import Console
|
|
13
|
+
from rich.markup import escape
|
|
14
|
+
from rich.table import Table
|
|
15
|
+
|
|
16
|
+
from . import __version__
|
|
17
|
+
from .clients import AnyLlmClient, ClientError, HttpClient, LiteLlmClient, SimulatedClient
|
|
18
|
+
from .diff import diff_lockfiles
|
|
19
|
+
from .lockfile import read_lockfile, write_lockfile
|
|
20
|
+
from .probes import derive_probes, tools_fingerprint
|
|
21
|
+
from .runner import run_probes
|
|
22
|
+
|
|
23
|
+
app = typer.Typer(
|
|
24
|
+
add_completion=False,
|
|
25
|
+
help="probelock — a capability lockfile for local models. Catch silent "
|
|
26
|
+
"regressions when you swap a model, quant, or runtime.",
|
|
27
|
+
)
|
|
28
|
+
console = Console()
|
|
29
|
+
err_console = Console(stderr=True)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _err(msg: str) -> None:
    """Write ``msg`` to the stderr console in red.

    The text goes through rich's ``escape`` first, so square brackets in
    dynamic content (paths, extras such as 'probelock[anyllm]') are printed
    literally instead of being parsed as markup tags.
    """
    safe_text = escape(msg)
    err_console.print("[red]" + safe_text + "[/]")
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
_BAR = 24  # width of the rendered score bar, in character cells


def _bar(score: float) -> str:
    """Render ``score`` as a fixed-width, colour-coded rich-markup bar.

    Args:
        score: Capability score; values in [0, 1] fill the bar proportionally.

    Returns:
        A rich-markup string: ``_BAR`` block characters (filled then empty)
        followed by the score formatted to two decimals. The colour is green
        from 0.9, yellow from 0.7, red below that.
    """
    # Clamp only the fill fraction: an out-of-range score would otherwise
    # produce a bar that is not _BAR cells wide. The printed number still
    # shows the real value so the anomaly stays visible.
    fraction = min(1.0, max(0.0, score))
    filled = round(fraction * _BAR)
    color = "green" if score >= 0.9 else "yellow" if score >= 0.7 else "red"
    return f"[{color}]{'█' * filled}{'░' * (_BAR - filled)}[/] {score:.2f}"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _load_tools(path: Path):
    """Read and validate an OpenAI-style tools file.

    Args:
        path: Location of a JSON file holding an array of tool objects.

    Returns:
        The parsed list of tool dicts, unchanged.

    Raises:
        FileNotFoundError / OSError: the file cannot be read.
        ValueError: the content is not valid JSON (``json.JSONDecodeError`` is
            a ``ValueError``), is not a JSON array, or an entry lacks a
            ``function`` object containing a ``name`` key.
    """
    # JSON interchange text is UTF-8; don't depend on the platform locale.
    tools = json.loads(Path(path).read_text(encoding="utf-8"))
    if not isinstance(tools, list):
        raise ValueError("tools file must be a JSON array of OpenAI-style tools")
    for i, tool in enumerate(tools):
        function = tool.get("function") if isinstance(tool, dict) else None
        # Require a real object: a bare `"name" in function` would accept a
        # string containing "name" or a list holding it, and raise TypeError
        # (not ValueError) for numbers.
        if not isinstance(function, dict) or "name" not in function:
            raise ValueError(
                f"tool #{i} must be an object with a 'function.name' (OpenAI tools format)"
            )
    return tools
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _load_tools_or_exit(path: Path):
    """Load the tools file, turning every bad-input failure into ``Exit(2)``.

    Exit code 1 is used by ``gate`` for regressions, so a mistyped or
    unreadable --tools path must neither exit 1 nor surface a raw traceback.

    Args:
        path: Location of the tools JSON file.

    Returns:
        The validated tool list from ``_load_tools``.

    Raises:
        typer.Exit: with code 2 when the file is missing, unreadable, or invalid.
    """
    try:
        return _load_tools(path)
    except FileNotFoundError:
        _err(f"Tools file not found: {path}")
        raise typer.Exit(2)
    except (json.JSONDecodeError, ValueError, TypeError) as exc:
        _err(f"Invalid tools file {path}: {exc}")
        raise typer.Exit(2)
    except OSError as exc:
        # Remaining I/O failures (path is a directory, permission denied, ...)
        # are bad input too, not crashes.
        _err(f"Could not read tools file {path}: {exc}")
        raise typer.Exit(2)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _load_json_or_exit(path: Path, what: str):
    """Parse a JSON file, turning every bad-input failure into ``Exit(2)``.

    Args:
        path: Location of the JSON file.
        what: Human-readable name of the file's role, used in error messages
            (e.g. "simulate profile").

    Returns:
        The decoded JSON value.

    Raises:
        typer.Exit: with code 2 when the file is missing, unreadable, or not
            valid JSON.
    """
    try:
        # JSON interchange text is UTF-8; don't depend on the platform locale.
        return json.loads(Path(path).read_text(encoding="utf-8"))
    except FileNotFoundError:
        _err(f"{what} not found: {path}")
        raise typer.Exit(2)
    except (json.JSONDecodeError, ValueError) as exc:
        _err(f"Invalid {what} {path}: {exc}")
        raise typer.Exit(2)
    except OSError as exc:
        # Remaining I/O failures (path is a directory, permission denied, ...).
        _err(f"Could not read {what} {path}: {exc}")
        raise typer.Exit(2)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@app.command()
def derive(
    tools: Path = typer.Option(..., "--tools", "-t", help="OpenAI-style tools JSON file."),
) -> None:
    """Show the probe battery that would be generated from a toolset (transparency)."""
    # The docstring above is shown as the command's help text; keep it short.
    probes = derive_probes(_load_tools_or_exit(tools))
    table = Table(title=f"{len(probes)} probes derived", expand=True)
    table.add_column("Probe id", no_wrap=True)
    table.add_column("Capability", no_wrap=True)
    table.add_column("Checks")
    for p in probes:
        # Probe text originates from the user's tool schemas; escape it so any
        # square brackets are printed rather than parsed as rich markup.
        table.add_row(escape(str(p.id)), escape(str(p.capability)), escape(str(p.description)))
    console.print(table)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
@app.command()
def probe(
    tools: Path = typer.Option(..., "--tools", "-t", help="OpenAI-style tools JSON file."),
    simulate: Optional[Path] = typer.Option(
        None, "--simulate", "-s", help="Run against a deterministic profile (no model)."
    ),
    endpoint: Optional[str] = typer.Option(
        None, "--endpoint", help="OpenAI-compatible base URL (e.g. http://localhost:11434/v1)."
    ),
    via: str = typer.Option(
        "", "--via", help="Route through a library instead of --endpoint: anyllm | litellm "
        "(model is 'provider/name', e.g. anthropic/claude-3-5-sonnet)."
    ),
    model: str = typer.Option("", "--model", "-m", help="Model id (or 'provider/name' with --via)."),
    quant: str = typer.Option("", "--quant", help="Quantization tag, recorded in the lockfile."),
    runtime: str = typer.Option("", "--runtime", help="Runtime tag (ollama, llama.cpp, mlx...)."),
    api_key: str = typer.Option("", "--api-key", help="Bearer token, if the endpoint needs one."),
    timeout: float = typer.Option(60.0, "--timeout", help="Per-probe timeout in seconds."),
    samples: int = typer.Option(
        1, "--samples", help="Run each probe N times; the score becomes a pass-rate."
    ),
    temperature: float = typer.Option(
        0.0, "--temperature", help="Sampling temperature (raise it with --samples for variance)."
    ),
    label: Optional[str] = typer.Option(None, "--label", help="Override the lockfile label."),
    out: Optional[Path] = typer.Option(None, "--out", "-o", help="Write the lockfile here."),
) -> None:
    """Run the probe battery and produce a capability lockfile."""
    # Exit codes: 2 for any configuration/input problem (bad tools file, no
    # backend selected, client error, every probe failing at the API level).
    tool_list = _load_tools_or_exit(tools)
    probes = derive_probes(tool_list)
    fingerprint = tools_fingerprint(tool_list)

    # Backend precedence: --simulate, then --via, then --endpoint.
    try:
        if simulate is not None:
            client = SimulatedClient(_load_json_or_exit(simulate, "simulate profile"))
        elif via:
            kind = via.lower()
            if kind in ("anyllm", "any-llm"):
                client = AnyLlmClient(model=model, api_key=api_key, temperature=temperature, quant=quant)
            elif kind in ("litellm", "lite-llm"):
                client = LiteLlmClient(model=model, api_key=api_key, temperature=temperature, quant=quant)
            else:
                _err(f"Unknown --via '{via}' (use anyllm | litellm).")
                raise typer.Exit(2)
        elif endpoint is not None:
            client = HttpClient(
                base_url=endpoint, model=model, api_key=api_key, quant=quant,
                runtime=runtime, timeout=timeout, temperature=temperature,
            )
        else:
            err_console.print("[red]Provide --simulate PROFILE, --endpoint URL, or --via {anyllm,litellm}.[/]")
            raise typer.Exit(2)
    except ClientError as exc:  # e.g. the --via SDK isn't installed
        _err(str(exc))
        raise typer.Exit(2)

    try:
        lock = run_probes(client, probes, fingerprint, __version__, samples=samples)
    except ClientError as exc:
        _err(str(exc))
        raise typer.Exit(2)

    errored = [r for r in lock.results if r.error]
    if errored and len(errored) == lock.n_probes:
        # Every probe failed at the API level -> this is a misconfiguration, not a
        # capability profile. Refuse to write a lockfile that could become a
        # poisoned all-zeros baseline.
        _err(
            f"All {lock.n_probes} probes failed at the API level — refusing to write "
            f"a lockfile. First error: {errored[0].error}"
        )
        raise typer.Exit(2)

    # The runner recorded fewer samples than requested; tell the user why.
    if samples > lock.samples:
        err_console.print(
            f"[yellow]--samples {samples} had no effect: this endpoint is deterministic "
            f"(temperature 0 / simulated), so samples are identical. Recorded samples=1; "
            f"raise --temperature for independent samples.[/]"
        )

    lock.generated_at = _dt.datetime.now(_dt.timezone.utc).isoformat()
    if label:
        lock.label = label

    # Label, fingerprint and capability names are dynamic text: escape them so
    # brackets (e.g. a label like 'qwen[q4]') are not consumed as rich markup.
    console.print(
        f"\n[bold]{escape(str(lock.label))}[/] "
        f"([cyan]{lock.n_probes} probes[/], fp {escape(str(lock.tools_fingerprint))})"
    )
    table = Table(expand=True)
    table.add_column("Capability", no_wrap=True)
    table.add_column("Score", ratio=1)
    for cap, sc in lock.capabilities.items():
        table.add_row(escape(str(cap)), _bar(sc))
    console.print(table)

    if errored:
        console.print(
            f"[yellow]{len(errored)} probe(s) errored at the API level.[/] "
            f"e.g. {escape(str(errored[0].error))}"
        )

    if out is not None:
        write_lockfile(lock, out)
        console.print(f"[green]wrote[/] {escape(str(out))}")
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def _validate_confidence(confidence: Optional[float]) -> None:
    """Reject a --confidence value that is not strictly between 0 and 1.

    ``None`` (option not given) is accepted. Any other value outside the open
    interval prints an error on stderr and raises ``typer.Exit(2)``.
    """
    if confidence is None:
        return
    if 0.0 < confidence < 1.0:
        return
    err_console.print(
        f"[red]--confidence must be between 0 and 1 (exclusive); got {confidence}.[/]"
    )
    raise typer.Exit(2)
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def _read_lock(path: Path):
    """Read a lockfile, turning every bad-input failure into ``Exit(2)``.

    CI relies on exit 1 meaning "real regression", so a missing, unreadable,
    or malformed lockfile must exit 2 instead of crashing with a traceback.

    Args:
        path: Location of the lockfile.

    Returns:
        Whatever ``read_lockfile`` returns for ``path``.

    Raises:
        typer.Exit: with code 2 when the lockfile cannot be read or parsed.
    """
    try:
        return read_lockfile(path)
    except FileNotFoundError:
        _err(f"Lockfile not found: {path}")
        raise typer.Exit(2)
    except (json.JSONDecodeError, ValueError, TypeError, KeyError) as exc:
        _err(f"Could not read lockfile {path}: {exc}")
        raise typer.Exit(2)
    except OSError as exc:
        # Remaining I/O failures (path is a directory, permission denied, ...).
        _err(f"Could not read lockfile {path}: {exc}")
        raise typer.Exit(2)
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
# One source of truth for status display (table rich-markup, markdown text).
|
|
223
|
+
_STATUS_LABELS = {
|
|
224
|
+
"ok": ("[green]ok[/]", "✅ ok"),
|
|
225
|
+
"regression": ("[bold red]REGRESSION[/]", "⚠️ REGRESSION"),
|
|
226
|
+
"noisy": ("[yellow]noisy ↓[/]", "〰️ noisy"),
|
|
227
|
+
"improved": ("[cyan]improved[/]", "⬆️ improved"),
|
|
228
|
+
"added": ("[dim]added[/]", "➕ added"),
|
|
229
|
+
"removed": ("[bold red]REMOVED[/]", "⛔ REMOVED"),
|
|
230
|
+
}
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def _cell(value, signed=False) -> str:
    """Format a score for display.

    ``None`` becomes an em dash; any other value is formatted with two
    decimals, with an explicit leading sign when ``signed`` is true.
    """
    if value is None:
        return "—"
    spec = "+.2f" if signed else ".2f"
    return format(value, spec)
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def _diff_notes(result, b, c):
    """Collect plain-text caveats about how comparable two lockfiles are.

    Every renderer calls this one function, so all output formats carry the
    same notes. ``result`` supplies ``tools_changed``; ``b`` (baseline) and
    ``c`` (candidate) supply ``model``, ``quant``, ``runtime`` and ``samples``.

    Returns:
        A list of note strings, possibly empty, in a fixed order: toolset
        mismatch, model/quant/runtime change, sample-count mismatch.
    """
    caveats = []

    if result.tools_changed:
        caveats.append("⚠ toolsets differ — comparison may not be apples-to-apples.")

    if b.model and c.model and b.model != c.model:
        caveats.append(
            f"⚠ different models ({b.model} → {c.model}) — cross-model comparison, "
            f"not a within-model regression check."
        )
    elif b.model and (b.quant, b.runtime) != (c.quant, c.runtime):
        # Same (or unrecorded candidate) model, different quantization/runtime.
        before = f"{b.quant or 'native'}/{b.runtime or '?'}"
        after = f"{c.quant or 'native'}/{c.runtime or '?'}"
        caveats.append(f"within-model swap: {before} → {after}")

    if b.samples != c.samples:
        caveats.append(
            f"⚠ sample counts differ ({b.samples} vs {c.samples}); --confidence has "
            f"uneven statistical power across the two lockfiles."
        )

    return caveats
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def _render_diff(result, b, c) -> None:
    """Print the diff as a rich table on stdout, followed by any caveats.

    Labels, capability names and notes come from outside this module, so they
    are escaped before being handed to rich; the status badge is the only cell
    that intentionally carries markup.
    """
    heading = " → ".join(
        (escape(b.label or 'baseline'), escape(c.label or 'candidate'))
    )
    table = Table(title=heading, expand=True)

    # (header, justification, keep on one line)
    columns = (
        ("Capability", "left", True),
        ("Baseline", "right", False),
        ("Candidate", "right", False),
        ("Δ", "right", False),
        ("Status", "left", True),
    )
    for header, justify, single_line in columns:
        table.add_column(header, justify=justify, no_wrap=single_line)

    for row in result.rows:
        # Unknown statuses fall back to their raw name.
        badge, _plain = _STATUS_LABELS.get(row.status, (row.status, row.status))
        table.add_row(
            escape(row.capability),
            _cell(row.baseline),
            _cell(row.candidate),
            _cell(row.delta, signed=True),
            badge,
        )
    console.print(table)

    for caveat in _diff_notes(result, b, c):
        console.print("[yellow]" + escape(caveat) + "[/]")
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def _markdown_diff(result, b, c) -> str:
    """Render the diff as a Markdown report (heading, table, verdict, notes).

    Args:
        result: Diff result exposing ``rows``, ``regressed`` and ``regressions``.
        b: Baseline lockfile (``label`` is used in the heading).
        c: Candidate lockfile (``label`` is used in the heading).

    Returns:
        The report as a single newline-joined string.
    """
    def cell(text) -> str:
        # A literal '|' ends a GFM table cell even inside a code span, so it
        # must be backslash-escaped to keep the row intact.
        return str(text).replace("|", "\\|")

    lines = [
        f"### probelock: `{b.label or 'baseline'}` → `{c.label or 'candidate'}`",
        "",
        "| Capability | Baseline | Candidate | Δ | Status |",
        "|---|--:|--:|--:|---|",
    ]
    for r in result.rows:
        # Second tuple element is the plain-text (non-rich) label; unknown
        # statuses fall back to their raw name.
        label = _STATUS_LABELS.get(r.status, (r.status, r.status))[1]
        lines.append(
            f"| `{cell(r.capability)}` | {_cell(r.baseline)} | {_cell(r.candidate)} | "
            f"{_cell(r.delta, signed=True)} | {cell(label)} |"
        )
    lines.append("")
    if result.regressed:
        names = ", ".join(f"`{r.capability}`" for r in result.regressions)
        lines.append(f"**FAIL** — capabilities regressed or removed: {names}")
    else:
        lines.append("**PASS** — no capability regressed.")
    for note in _diff_notes(result, b, c):
        # Leading newline separates each blockquote from the previous block.
        lines.append(f"\n> {note}")
    return "\n".join(lines)
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
def _diff_payload(result, b, c) -> dict:
    """Build the plain-dict form of a diff used for ``--format json``.

    The payload holds a metadata summary of each lockfile, the threshold and
    verdict flags from ``result``, and one dict per capability row.
    """
    meta_fields = ("label", "model", "quant", "runtime", "samples")
    row_fields = ("capability", "baseline", "candidate", "delta", "status", "significant")

    def snapshot(obj, names):
        # Copy the named attributes into a dict, preserving their order.
        return {name: getattr(obj, name) for name in names}

    payload = {
        "baseline": snapshot(b, meta_fields),
        "candidate": snapshot(c, meta_fields),
        "max_drop": result.max_drop,
        "tools_changed": result.tools_changed,
        "regressed": result.regressed,
        "rows": [snapshot(row, row_fields) for row in result.rows],
    }
    return payload
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
_HTML_CSS = """
|
|
323
|
+
body{font-family:-apple-system,BlinkMacSystemFont,'Segoe UI',Roboto,sans-serif;
|
|
324
|
+
max-width:780px;margin:2rem auto;padding:0 1rem;color:#1a1a1a;line-height:1.45}
|
|
325
|
+
h1{font-size:1.2rem;margin:0 0 .2rem}.sub{color:#6b7280;font-size:.9rem;margin:0 0 1rem}
|
|
326
|
+
table{border-collapse:collapse;width:100%;font-size:.92rem}
|
|
327
|
+
th,td{padding:.45rem .6rem;border-bottom:1px solid #eee}th{text-align:left;color:#6b7280;font-weight:600}
|
|
328
|
+
td.cap{font-family:ui-monospace,Menlo,monospace}td.num{text-align:right;font-variant-numeric:tabular-nums}
|
|
329
|
+
.bar{height:.5rem;border-radius:3px;background:#eceef1;min-width:90px}
|
|
330
|
+
.bar>span{display:block;height:100%;border-radius:3px}
|
|
331
|
+
.badge{font-size:.72rem;font-weight:700;padding:.08rem .42rem;border-radius:4px;white-space:nowrap}
|
|
332
|
+
.ok,.good{background:#dcfce7;color:#166534}.bad{background:#fee2e2;color:#991b1b}
|
|
333
|
+
.warn{background:#fef9c3;color:#854d0e}.dim{background:#f3f4f6;color:#6b7280}
|
|
334
|
+
.banner{padding:.6rem .9rem;border-radius:6px;font-weight:700;margin:1rem 0}
|
|
335
|
+
.banner.fail{background:#fee2e2;color:#991b1b}.banner.pass{background:#dcfce7;color:#166534}
|
|
336
|
+
.note{color:#854d0e;font-size:.85rem;margin:.25rem 0}
|
|
337
|
+
footer{color:#9ca3af;font-size:.78rem;margin-top:1.5rem}
|
|
338
|
+
"""
|
|
339
|
+
_HTML_STATUS = { # status -> (css class, label)
|
|
340
|
+
"ok": ("ok", "ok"), "regression": ("bad", "REGRESSION"), "noisy": ("warn", "noisy ↓"),
|
|
341
|
+
"improved": ("good", "improved"), "added": ("dim", "added"), "removed": ("bad", "REMOVED"),
|
|
342
|
+
}
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def _html_diff(result, b, c) -> str:
    """Render the diff as a self-contained HTML page.

    Args:
        result: Diff result exposing ``rows``, ``regressed`` and ``regressions``.
        b: Baseline lockfile (``label`` is shown in the subtitle).
        c: Candidate lockfile (``label`` is shown in the subtitle).

    Returns:
        A complete HTML document as one string. All dynamic text is
        HTML-escaped before being embedded.
    """
    def esc(s):
        return _html.escape(str(s))

    rows = []
    for r in result.rows:
        # Unknown statuses get the neutral style and their raw name.
        cls, label = _HTML_STATUS.get(r.status, ("dim", r.status))
        # A missing candidate score (removed capability) is drawn as 0.
        score = r.candidate or 0.0
        # Clamp so an out-of-range score cannot produce a bar wider than its
        # track (or a negative CSS width).
        pct = min(100, max(0, int(round(score * 100))))
        color = "#16a34a" if score >= 0.9 else "#ca8a04" if score >= 0.7 else "#dc2626"
        rows.append(
            f"<tr><td class='cap'>{esc(r.capability)}</td>"
            f"<td class='num'>{_cell(r.baseline)}</td>"
            f"<td class='num'>{_cell(r.candidate)}</td>"
            f"<td class='num'>{_cell(r.delta, signed=True)}</td>"
            f"<td><div class='bar'><span style='width:{pct}%;background:{color}'></span></div></td>"
            f"<td><span class='badge {cls}'>{esc(label)}</span></td></tr>"
        )
    banner = (
        f"<div class='banner fail'>FAIL — capabilities regressed or removed: "
        f"{esc(', '.join(r.capability for r in result.regressions))}</div>"
        if result.regressed else
        "<div class='banner pass'>PASS — no capability regressed.</div>"
    )
    notes = "".join(f"<p class='note'>{esc(n)}</p>" for n in _diff_notes(result, b, c))
    return (
        "<!doctype html><html><head><meta charset='utf-8'>"
        f"<meta name='viewport' content='width=device-width,initial-scale=1'><style>{_HTML_CSS}</style>"
        "<title>probelock report</title></head><body>"
        f"<h1>probelock capability report</h1>"
        f"<p class='sub'>{esc(b.label or 'baseline')} → {esc(c.label or 'candidate')}</p>"
        f"{banner}"
        "<table><thead><tr><th>Capability</th><th>Baseline</th><th>Candidate</th>"
        "<th>Δ</th><th>Candidate</th><th>Status</th></tr></thead>"
        f"<tbody>{''.join(rows)}</tbody></table>{notes}"
        "<footer>Generated by probelock — deterministic, no LLM judge.</footer>"
        "</body></html>"
    )
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
@app.command()
def diff(
    baseline: Path = typer.Argument(..., help="Baseline lockfile."),
    candidate: Path = typer.Argument(..., help="Candidate lockfile."),
    max_drop: float = typer.Option(0.05, "--max-drop", help="Regression threshold."),
    confidence: Optional[float] = typer.Option(
        None, "--confidence", help="If set (e.g. 0.95), mark sub-significant drops 'noisy'."
    ),
    fmt: str = typer.Option("table", "--format", help="table | markdown | json | html"),
) -> None:
    """Show within-model capability deltas between two lockfiles (informational)."""
    # The docstring above is shown as the command's help text.
    _validate_confidence(confidence)
    b, c = _read_lock(baseline), _read_lock(candidate)
    result = diff_lockfiles(b, c, max_drop, confidence)

    # Text formats go through plain print() so rich never touches the output;
    # only the table format is rendered by rich.
    emitters = {
        "markdown": lambda: print(_markdown_diff(result, b, c)),
        "json": lambda: print(json.dumps(_diff_payload(result, b, c), indent=2)),
        "html": lambda: print(_html_diff(result, b, c)),
        "table": lambda: _render_diff(result, b, c),
    }
    emit = emitters.get(fmt)
    if emit is None:
        _err(f"Unknown --format '{fmt}' (use table | markdown | json | html).")
        raise typer.Exit(2)
    emit()
|
|
409
|
+
|
|
410
|
+
|
|
411
|
+
@app.command()
def gate(
    baseline: Path = typer.Option(..., "--baseline", "-b", help="Baseline (committed) lockfile."),
    candidate: Path = typer.Option(..., "--candidate", "-c", help="Candidate lockfile."),
    max_drop: float = typer.Option(0.05, "--max-drop", help="Regression threshold."),
    require_same_model: bool = typer.Option(
        False, "--require-same-model", help="Fail if baseline and candidate are different models."
    ),
    confidence: Optional[float] = typer.Option(
        None, "--confidence", help="Only fail on drops significant at this confidence (e.g. 0.95)."
    ),
) -> None:
    """CI gate: exit 1 if any capability regressed (or was dropped) beyond --max-drop.

    With --confidence, a drop past --max-drop that isn't statistically significant
    for the recorded sample count is reported as 'noisy' and does NOT fail the gate.
    Exit 2 is reserved for invalid input (bad lockfile, or a cross-model
    comparison under --require-same-model), so CI can tell the two apart.
    """
    _validate_confidence(confidence)
    b, c = _read_lock(baseline), _read_lock(candidate)
    result = diff_lockfiles(b, c, max_drop, confidence)
    # The table is always shown, whatever the verdict turns out to be.
    _render_diff(result, b, c)

    noisy_count = sum(1 for row in result.rows if row.status == "noisy")
    if noisy_count:
        console.print(
            f"[yellow]{noisy_count} drop(s) past --max-drop are below the {confidence} "
            f"confidence bar (noisy ↓) — raise --samples to confirm or clear them.[/]"
        )

    # A cross-model comparison is "invalid input" (exit 2), checked before the
    # regression verdict (exit 1). It only applies when both models are recorded.
    if require_same_model:
        if b.model and c.model and b.model != c.model:
            console.print(
                f"\n[bold red]INVALID[/] — --require-same-model set but models differ "
                f"({escape(b.model)} vs {escape(c.model)})."
            )
            raise typer.Exit(2)

    if not result.regressed:
        console.print(f"\n[bold green]PASS[/] — no capability regressed beyond {max_drop:.2f}.")
        return

    names = escape(", ".join(row.capability for row in result.regressions))
    console.print(f"\n[bold red]FAIL[/] — capabilities regressed or removed: {names}")
    raise typer.Exit(1)
|
|
453
|
+
|
|
454
|
+
|
|
455
|
+
_TEMPLATE_TOOLS = """\
|
|
456
|
+
[
|
|
457
|
+
{
|
|
458
|
+
"type": "function",
|
|
459
|
+
"function": {
|
|
460
|
+
"name": "create_event",
|
|
461
|
+
"description": "Create a calendar event",
|
|
462
|
+
"parameters": {
|
|
463
|
+
"type": "object",
|
|
464
|
+
"properties": {
|
|
465
|
+
"title": {"type": "string"},
|
|
466
|
+
"start": {"type": "string", "description": "ISO 8601 datetime"},
|
|
467
|
+
"visibility": {"type": "string", "enum": ["public", "private"]}
|
|
468
|
+
},
|
|
469
|
+
"required": ["title", "start"]
|
|
470
|
+
}
|
|
471
|
+
}
|
|
472
|
+
}
|
|
473
|
+
]
|
|
474
|
+
"""
|
|
475
|
+
|
|
476
|
+
_TEMPLATE_WORKFLOW = """\
|
|
477
|
+
name: probelock
|
|
478
|
+
on: [pull_request]
|
|
479
|
+
|
|
480
|
+
jobs:
|
|
481
|
+
capabilities:
|
|
482
|
+
runs-on: ubuntu-latest
|
|
483
|
+
steps:
|
|
484
|
+
- uses: actions/checkout@v4
|
|
485
|
+
- uses: astral-sh/setup-uv@v5
|
|
486
|
+
# Point --endpoint at your model server. CI needs network access to it
|
|
487
|
+
# (a hosted endpoint, or a self-hosted runner with Ollama/llama.cpp).
|
|
488
|
+
# Uses the published `probelock` from PyPI. To pin an unreleased revision,
|
|
489
|
+
# replace `uvx probelock` with:
|
|
490
|
+
# uvx --from git+https://github.com/kelkalot/probelock probelock ...
|
|
491
|
+
- name: Probe candidate
|
|
492
|
+
run: uvx probelock probe --tools probelock.tools.json
|
|
493
|
+
--endpoint "$LLM_ENDPOINT" --model "$LLM_MODEL"
|
|
494
|
+
--samples 5 --temperature 0.7 -o candidate.lock
|
|
495
|
+
env:
|
|
496
|
+
LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
|
|
497
|
+
LLM_MODEL: ${{ vars.LLM_MODEL }}
|
|
498
|
+
- name: Gate on regression
|
|
499
|
+
run: uvx probelock gate --baseline probelock.lock --candidate candidate.lock
|
|
500
|
+
--max-drop 0.05 --confidence 0.95
|
|
501
|
+
"""
|
|
502
|
+
|
|
503
|
+
|
|
504
|
+
@app.command()
def init(
    path: Path = typer.Option(Path("."), "--path", help="Directory to scaffold into."),
    force: bool = typer.Option(False, "--force", help="Overwrite existing files."),
) -> None:
    """Scaffold a tools file and a CI workflow to get started."""
    # Files to create under ``path``, each with its template content.
    targets = [
        (path / "probelock.tools.json", _TEMPLATE_TOOLS),
        (path / ".github" / "workflows" / "probelock.yml", _TEMPLATE_WORKFLOW),
    ]
    for target, content in targets:
        # Escape the path so a directory name containing brackets is printed
        # rather than parsed as rich markup.
        shown = escape(str(target))
        if target.exists() and not force:
            console.print(f"[yellow]exists, skipped[/] {shown} (use --force)")
            continue
        target.parent.mkdir(parents=True, exist_ok=True)
        # Fixed encoding so the scaffold is identical on every platform.
        target.write_text(content, encoding="utf-8")
        console.print(f"[green]created[/] {shown}")
    console.print(
        "\nNext: probe your model and commit the baseline, then gate candidates in CI:\n"
        " probelock probe --tools probelock.tools.json "
        "--endpoint http://localhost:11434/v1 --model <model> -o probelock.lock\n"
        " git add probelock.lock # this is your committed baseline"
    )
|
|
527
|
+
|
|
528
|
+
|
|
529
|
+
@app.command()
def version() -> None:
    """Print the probelock version."""
    # The docstring above is shown as the command's help text.
    installed = __version__
    console.print(installed)
|
|
533
|
+
|
|
534
|
+
|
|
535
|
+
def main() -> None:
    """Run the Typer application; called by this module's ``__main__`` guard."""
    app()
|
|
537
|
+
|
|
538
|
+
|
|
539
|
+
if __name__ == "__main__":
|
|
540
|
+
main()
|