probelock 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
probelock/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ """probelock — a capability lockfile for local models.
2
+
3
+ Derive deterministic capability probes from your agent's own tool schemas, score
4
+ them with no LLM judge, and gate CI on within-model regression when you swap a
5
+ model version, quantization, or runtime.
6
+ """
7
+
8
+ __version__ = "0.1.0"
probelock/cli.py ADDED
@@ -0,0 +1,540 @@
1
+ """probelock command line interface."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import datetime as _dt
6
+ import html as _html
7
+ import json
8
+ from pathlib import Path
9
+ from typing import Optional
10
+
11
+ import typer
12
+ from rich.console import Console
13
+ from rich.markup import escape
14
+ from rich.table import Table
15
+
16
+ from . import __version__
17
+ from .clients import AnyLlmClient, ClientError, HttpClient, LiteLlmClient, SimulatedClient
18
+ from .diff import diff_lockfiles
19
+ from .lockfile import read_lockfile, write_lockfile
20
+ from .probes import derive_probes, tools_fingerprint
21
+ from .runner import run_probes
22
+
23
# Root Typer application; every subcommand below registers on it via
# @app.command(). Shell-completion install options are switched off.
app = typer.Typer(
    help=(
        "probelock — a capability lockfile for local models. "
        "Catch silent regressions when you swap a model, quant, or runtime."
    ),
    add_completion=False,
)
# Regular output goes to stdout; errors and warnings go to stderr.
console = Console()
err_console = Console(stderr=True)
30
+
31
+
32
def _err(msg: str) -> None:
    """Write ``msg`` to stderr as a single red line.

    The text is passed through rich's ``escape`` first, so square brackets in
    dynamic content (a path, or an extra such as 'probelock[anyllm]') are shown
    literally instead of being parsed as markup tags.
    """
    safe = escape(msg)
    err_console.print(f"[red]{safe}[/]")
36
+
37
+
38
+ _BAR = 24
39
+
40
+
41
+ def _bar(score: float) -> str:
42
+ filled = round(score * _BAR)
43
+ color = "green" if score >= 0.9 else "yellow" if score >= 0.7 else "red"
44
+ return f"[{color}]{'█' * filled}{'░' * (_BAR - filled)}[/] {score:.2f}"
45
+
46
+
47
+ def _load_tools(path: Path):
48
+ tools = json.loads(Path(path).read_text())
49
+ if not isinstance(tools, list):
50
+ raise ValueError("tools file must be a JSON array of OpenAI-style tools")
51
+ for i, tool in enumerate(tools):
52
+ if not isinstance(tool, dict) or "name" not in (tool.get("function") or {}):
53
+ raise ValueError(
54
+ f"tool #{i} must be an object with a 'function.name' (OpenAI tools format)"
55
+ )
56
+ return tools
57
+
58
+
59
def _load_tools_or_exit(path: Path):
    """Load the tools file, turning every bad-input failure into Exit(2).

    Exit code 1 is what ``gate`` uses for a regression, so a typo'd or
    unreadable --tools path must never surface as a raw traceback (which would
    exit 1). Handled cases: missing file, any other OS-level read failure
    (path is a directory, permission denied, ...), invalid JSON, and a
    document that fails ``_load_tools`` validation.

    Returns:
        The validated list of tools.

    Raises:
        typer.Exit: with code 2, after printing a one-line error to stderr.
    """
    try:
        return _load_tools(path)
    except FileNotFoundError as exc:
        _err(f"Tools file not found: {path}")
        raise typer.Exit(2) from exc
    except OSError as exc:
        # FileNotFoundError is handled above; this covers the remaining read
        # failures that previously escaped as a traceback.
        _err(f"Could not read tools file {path}: {exc}")
        raise typer.Exit(2) from exc
    except (json.JSONDecodeError, ValueError, TypeError) as exc:
        _err(f"Invalid tools file {path}: {exc}")
        raise typer.Exit(2) from exc
70
+
71
+
72
def _load_json_or_exit(path: Path, what: str):
    """Parse the JSON file at ``path``, exiting with code 2 on any bad input.

    Args:
        path: File to read.
        what: Human-readable name of the file's role, used in error messages
            (e.g. "simulate profile").

    Returns:
        The decoded JSON value.

    Raises:
        typer.Exit: with code 2 if the file is missing, cannot be read, is not
            valid UTF-8, or is not valid JSON.
    """
    try:
        # Decode explicitly as UTF-8 rather than the platform locale encoding.
        return json.loads(Path(path).read_text(encoding="utf-8"))
    except FileNotFoundError as exc:
        _err(f"{what} not found: {path}")
        raise typer.Exit(2) from exc
    except OSError as exc:
        # e.g. the path is a directory or is not readable.
        _err(f"Could not read {what} {path}: {exc}")
        raise typer.Exit(2) from exc
    except (json.JSONDecodeError, ValueError) as exc:
        # UnicodeDecodeError is a ValueError, so bad encodings land here too.
        _err(f"Invalid {what} {path}: {exc}")
        raise typer.Exit(2) from exc
81
+
82
+
83
@app.command()
def derive(
    tools: Path = typer.Option(..., "--tools", "-t", help="OpenAI-style tools JSON file."),
) -> None:
    """Show the probe battery that would be generated from a toolset (transparency)."""
    battery = derive_probes(_load_tools_or_exit(tools))

    listing = Table(title=f"{len(battery)} probes derived", expand=True)
    # The id and capability columns must stay on one line; only the free-text
    # "Checks" column is allowed to wrap.
    for heading in ("Probe id", "Capability"):
        listing.add_column(heading, no_wrap=True)
    listing.add_column("Checks")

    for item in battery:
        listing.add_row(item.id, item.capability, item.description)
    console.print(listing)
96
+
97
+
98
@app.command()
def probe(
    tools: Path = typer.Option(..., "--tools", "-t", help="OpenAI-style tools JSON file."),
    simulate: Optional[Path] = typer.Option(
        None, "--simulate", "-s", help="Run against a deterministic profile (no model)."
    ),
    endpoint: Optional[str] = typer.Option(
        None, "--endpoint", help="OpenAI-compatible base URL (e.g. http://localhost:11434/v1)."
    ),
    via: str = typer.Option(
        "", "--via", help="Route through a library instead of --endpoint: anyllm | litellm "
        "(model is 'provider/name', e.g. anthropic/claude-3-5-sonnet)."
    ),
    model: str = typer.Option("", "--model", "-m", help="Model id (or 'provider/name' with --via)."),
    quant: str = typer.Option("", "--quant", help="Quantization tag, recorded in the lockfile."),
    runtime: str = typer.Option("", "--runtime", help="Runtime tag (ollama, llama.cpp, mlx...)."),
    api_key: str = typer.Option("", "--api-key", help="Bearer token, if the endpoint needs one."),
    timeout: float = typer.Option(60.0, "--timeout", help="Per-probe timeout in seconds."),
    samples: int = typer.Option(
        1, "--samples", help="Run each probe N times; the score becomes a pass-rate."
    ),
    temperature: float = typer.Option(
        0.0, "--temperature", help="Sampling temperature (raise it with --samples for variance)."
    ),
    label: Optional[str] = typer.Option(None, "--label", help="Override the lockfile label."),
    out: Optional[Path] = typer.Option(None, "--out", "-o", help="Write the lockfile here."),
) -> None:
    """Run the probe battery and produce a capability lockfile."""
    tool_list = _load_tools_or_exit(tools)
    probes = derive_probes(tool_list)
    fingerprint = tools_fingerprint(tool_list)

    # Pick the backend. Precedence when several are given: --simulate, then
    # --via, then --endpoint. Constructing a client can itself raise
    # ClientError, which is reported as invalid input (exit 2).
    try:
        if simulate is not None:
            client = SimulatedClient(_load_json_or_exit(simulate, "simulate profile"))
        elif via:
            kind = via.lower()
            if kind in ("anyllm", "any-llm"):
                client = AnyLlmClient(model=model, api_key=api_key, temperature=temperature, quant=quant)
            elif kind in ("litellm", "lite-llm"):
                client = LiteLlmClient(model=model, api_key=api_key, temperature=temperature, quant=quant)
            else:
                _err(f"Unknown --via '{via}' (use anyllm | litellm).")
                raise typer.Exit(2)
        elif endpoint is not None:
            client = HttpClient(
                base_url=endpoint, model=model, api_key=api_key, quant=quant,
                runtime=runtime, timeout=timeout, temperature=temperature,
            )
        else:
            # Routed through _err like every other error line in this module.
            _err("Provide --simulate PROFILE, --endpoint URL, or --via {anyllm,litellm}.")
            raise typer.Exit(2)
    except ClientError as exc:  # e.g. the --via SDK isn't installed
        _err(str(exc))
        raise typer.Exit(2) from exc

    try:
        lock = run_probes(client, probes, fingerprint, __version__, samples=samples)
    except ClientError as exc:
        _err(str(exc))
        raise typer.Exit(2) from exc

    errored = [r for r in lock.results if r.error]
    if errored and len(errored) == lock.n_probes:
        # Every probe failed at the API level -> this is a misconfiguration, not a
        # capability profile. Refuse to write a lockfile that could become a
        # poisoned all-zeros baseline.
        _err(
            f"All {lock.n_probes} probes failed at the API level — refusing to write "
            f"a lockfile. First error: {errored[0].error}"
        )
        raise typer.Exit(2)

    # The runner reported fewer samples than were requested; tell the user
    # that the extra samples were not recorded.
    if samples > lock.samples:
        err_console.print(
            f"[yellow]--samples {samples} had no effect: this endpoint is deterministic "
            f"(temperature 0 / simulated), so samples are identical. Recorded samples=1; "
            f"raise --temperature for independent samples.[/]"
        )

    lock.generated_at = _dt.datetime.now(_dt.timezone.utc).isoformat()
    if label:
        lock.label = label

    # Labels, fingerprints, capability names and paths are dynamic text: escape
    # them so brackets (e.g. a 'model[q4]' label) aren't eaten by rich markup.
    console.print(
        f"\n[bold]{escape(str(lock.label))}[/] "
        f"([cyan]{lock.n_probes} probes[/], fp {escape(str(lock.tools_fingerprint))})"
    )
    table = Table(expand=True)
    table.add_column("Capability", no_wrap=True)
    table.add_column("Score", ratio=1)
    for cap, sc in lock.capabilities.items():
        table.add_row(escape(str(cap)), _bar(sc))
    console.print(table)

    if errored:
        console.print(
            f"[yellow]{len(errored)} probe(s) errored at the API level.[/] "
            f"e.g. {escape(str(errored[0].error))}"
        )

    if out is not None:
        write_lockfile(lock, out)
        console.print(f"[green]wrote[/] {escape(str(out))}")
199
+
200
+
201
def _validate_confidence(confidence: Optional[float]) -> None:
    """Exit with code 2 unless ``confidence`` is unset or strictly inside (0, 1).

    ``None`` means the option was not supplied and is always accepted.
    """
    if confidence is None:
        return
    if 0.0 < confidence < 1.0:
        return
    err_console.print(
        f"[red]--confidence must be between 0 and 1 (exclusive); got {confidence}.[/]"
    )
    raise typer.Exit(2)
207
+
208
+
209
def _read_lock(path: Path):
    """Read a lockfile, converting any unusable-input error into Exit(2).

    CI relies on exit 1 meaning "a capability regressed", so a missing,
    unreadable or malformed lockfile must exit 2 with a one-line message
    rather than crash with a traceback.

    Returns:
        Whatever ``read_lockfile`` returns for ``path``.

    Raises:
        typer.Exit: with code 2 if the file is missing, cannot be read, or
            cannot be decoded into a lockfile.
    """
    try:
        return read_lockfile(path)
    except FileNotFoundError as exc:
        _err(f"Lockfile not found: {path}")
        raise typer.Exit(2) from exc
    except OSError as exc:
        # e.g. the path is a directory or is not readable.
        _err(f"Could not read lockfile {path}: {exc}")
        raise typer.Exit(2) from exc
    except (json.JSONDecodeError, ValueError, TypeError, KeyError) as exc:
        _err(f"Could not read lockfile {path}: {exc}")
        raise typer.Exit(2) from exc
220
+
221
+
222
+ # One source of truth for status display (table rich-markup, markdown text).
223
+ _STATUS_LABELS = {
224
+ "ok": ("[green]ok[/]", "✅ ok"),
225
+ "regression": ("[bold red]REGRESSION[/]", "⚠️ REGRESSION"),
226
+ "noisy": ("[yellow]noisy ↓[/]", "〰️ noisy"),
227
+ "improved": ("[cyan]improved[/]", "⬆️ improved"),
228
+ "added": ("[dim]added[/]", "➕ added"),
229
+ "removed": ("[bold red]REMOVED[/]", "⛔ REMOVED"),
230
+ }
231
+
232
+
233
+ def _cell(value, signed=False) -> str:
234
+ if value is None:
235
+ return "—"
236
+ return f"{value:+.2f}" if signed else f"{value:.2f}"
237
+
238
+
239
+ def _diff_notes(result, b, c):
240
+ """Plain-text comparison caveats, shared by the table and markdown renderers
241
+ so the two can never drift out of sync."""
242
+ notes = []
243
+ if result.tools_changed:
244
+ notes.append("⚠ toolsets differ — comparison may not be apples-to-apples.")
245
+ if b.model and c.model and b.model != c.model:
246
+ notes.append(
247
+ f"⚠ different models ({b.model} → {c.model}) — cross-model comparison, "
248
+ f"not a within-model regression check."
249
+ )
250
+ elif b.model and (b.quant, b.runtime) != (c.quant, c.runtime):
251
+ notes.append(
252
+ f"within-model swap: {b.quant or 'native'}/{b.runtime or '?'} → "
253
+ f"{c.quant or 'native'}/{c.runtime or '?'}"
254
+ )
255
+ if b.samples != c.samples:
256
+ notes.append(
257
+ f"⚠ sample counts differ ({b.samples} vs {c.samples}); --confidence has "
258
+ f"uneven statistical power across the two lockfiles."
259
+ )
260
+ return notes
261
+
262
+
263
def _render_diff(result, b, c) -> None:
    """Print the diff as a rich table on stdout, followed by any caveats.

    Labels and capability names come from lockfiles, so they are escaped
    before being handed to rich; only the status badge is intentional markup.
    """
    left = escape(b.label or 'baseline')
    right = escape(c.label or 'candidate')
    table = Table(title=f"{left} → {right}", expand=True)

    # Text columns are left-aligned and never wrap; numeric columns are
    # right-aligned.
    table.add_column("Capability", justify="left", no_wrap=True)
    table.add_column("Baseline", justify="right", no_wrap=False)
    table.add_column("Candidate", justify="right", no_wrap=False)
    table.add_column("Δ", justify="right", no_wrap=False)
    table.add_column("Status", justify="left", no_wrap=True)

    for row in result.rows:
        # Unknown statuses fall back to their raw name.
        badge = _STATUS_LABELS.get(row.status, (row.status, row.status))[0]
        table.add_row(
            escape(row.capability),
            _cell(row.baseline),
            _cell(row.candidate),
            _cell(row.delta, signed=True),
            badge,
        )
    console.print(table)

    for caveat in _diff_notes(result, b, c):
        console.print(f"[yellow]{escape(caveat)}[/]")
278
+
279
+
280
def _markdown_diff(result, b, c) -> str:
    """Build the diff as a Markdown document and return it as one string.

    Layout: a level-3 heading, a five-column table, a PASS/FAIL line, then one
    blockquote per caveat from ``_diff_notes``.
    """
    heading = f"### probelock: `{b.label or 'baseline'}` → `{c.label or 'candidate'}`"
    out = [
        heading,
        "",
        "| Capability | Baseline | Candidate | Δ | Status |",
        "|---|--:|--:|--:|---|",
    ]

    for row in result.rows:
        # Index 1 of the label pair is the markdown text; unknown statuses fall
        # back to their raw name.
        status_text = _STATUS_LABELS.get(row.status, (row.status, row.status))[1]
        out.append(
            "| `{}` | {} | {} | {} | {} |".format(
                row.capability,
                _cell(row.baseline),
                _cell(row.candidate),
                _cell(row.delta, signed=True),
                status_text,
            )
        )

    out.append("")
    if result.regressed:
        names = ", ".join(f"`{row.capability}`" for row in result.regressions)
        verdict = f"**FAIL** — capabilities regressed or removed: {names}"
    else:
        verdict = "**PASS** — no capability regressed."
    out.append(verdict)

    # The leading newline leaves a blank line before each blockquote.
    out.extend(f"\n> {caveat}" for caveat in _diff_notes(result, b, c))
    return "\n".join(out)
302
+
303
+
304
+ def _diff_payload(result, b, c) -> dict:
305
+ def meta(lock):
306
+ return {"label": lock.label, "model": lock.model, "quant": lock.quant,
307
+ "runtime": lock.runtime, "samples": lock.samples}
308
+ return {
309
+ "baseline": meta(b),
310
+ "candidate": meta(c),
311
+ "max_drop": result.max_drop,
312
+ "tools_changed": result.tools_changed,
313
+ "regressed": result.regressed,
314
+ "rows": [
315
+ {"capability": r.capability, "baseline": r.baseline, "candidate": r.candidate,
316
+ "delta": r.delta, "status": r.status, "significant": r.significant}
317
+ for r in result.rows
318
+ ],
319
+ }
320
+
321
+
322
+ _HTML_CSS = """
323
+ body{font-family:-apple-system,BlinkMacSystemFont,'Segoe UI',Roboto,sans-serif;
324
+ max-width:780px;margin:2rem auto;padding:0 1rem;color:#1a1a1a;line-height:1.45}
325
+ h1{font-size:1.2rem;margin:0 0 .2rem}.sub{color:#6b7280;font-size:.9rem;margin:0 0 1rem}
326
+ table{border-collapse:collapse;width:100%;font-size:.92rem}
327
+ th,td{padding:.45rem .6rem;border-bottom:1px solid #eee}th{text-align:left;color:#6b7280;font-weight:600}
328
+ td.cap{font-family:ui-monospace,Menlo,monospace}td.num{text-align:right;font-variant-numeric:tabular-nums}
329
+ .bar{height:.5rem;border-radius:3px;background:#eceef1;min-width:90px}
330
+ .bar>span{display:block;height:100%;border-radius:3px}
331
+ .badge{font-size:.72rem;font-weight:700;padding:.08rem .42rem;border-radius:4px;white-space:nowrap}
332
+ .ok,.good{background:#dcfce7;color:#166534}.bad{background:#fee2e2;color:#991b1b}
333
+ .warn{background:#fef9c3;color:#854d0e}.dim{background:#f3f4f6;color:#6b7280}
334
+ .banner{padding:.6rem .9rem;border-radius:6px;font-weight:700;margin:1rem 0}
335
+ .banner.fail{background:#fee2e2;color:#991b1b}.banner.pass{background:#dcfce7;color:#166534}
336
+ .note{color:#854d0e;font-size:.85rem;margin:.25rem 0}
337
+ footer{color:#9ca3af;font-size:.78rem;margin-top:1.5rem}
338
+ """
339
+ _HTML_STATUS = { # status -> (css class, label)
340
+ "ok": ("ok", "ok"), "regression": ("bad", "REGRESSION"), "noisy": ("warn", "noisy ↓"),
341
+ "improved": ("good", "improved"), "added": ("dim", "added"), "removed": ("bad", "REMOVED"),
342
+ }
343
+
344
+
345
def _html_diff(result, b, c) -> str:
    """Build the diff as a self-contained HTML page and return it as a string.

    The page embeds ``_HTML_CSS`` inline and contains a PASS/FAIL banner, a
    table with one row per capability (including a bar sized by the candidate
    score), and one paragraph per caveat from ``_diff_notes``. All lockfile
    text is HTML-escaped.
    """
    def esc(s):
        return _html.escape(str(s))

    def bar_color(score):
        # Same thresholds as the terminal bar: >= 0.9 green, >= 0.7 amber, else red.
        if score >= 0.9:
            return "#16a34a"
        if score >= 0.7:
            return "#ca8a04"
        return "#dc2626"

    body_rows = []
    for row in result.rows:
        cls, label = _HTML_STATUS.get(row.status, ("dim", row.status))
        # A missing candidate score is drawn as an empty bar.
        score = row.candidate or 0.0
        pct = int(round(score * 100))
        color = bar_color(score)
        cells = [
            f"<tr><td class='cap'>{esc(row.capability)}</td>",
            f"<td class='num'>{_cell(row.baseline)}</td>",
            f"<td class='num'>{_cell(row.candidate)}</td>",
            f"<td class='num'>{_cell(row.delta, signed=True)}</td>",
            f"<td><div class='bar'><span style='width:{pct}%;background:{color}'></span></div></td>",
            f"<td><span class='badge {cls}'>{esc(label)}</span></td></tr>",
        ]
        body_rows.append("".join(cells))

    if result.regressed:
        failed = esc(', '.join(row.capability for row in result.regressions))
        banner = (
            "<div class='banner fail'>FAIL — capabilities regressed or removed: "
            + failed
            + "</div>"
        )
    else:
        banner = "<div class='banner pass'>PASS — no capability regressed.</div>"

    notes = "".join(f"<p class='note'>{esc(n)}</p>" for n in _diff_notes(result, b, c))

    page = [
        "<!doctype html><html><head><meta charset='utf-8'>",
        f"<meta name='viewport' content='width=device-width,initial-scale=1'><style>{_HTML_CSS}</style>",
        "<title>probelock report</title></head><body>",
        "<h1>probelock capability report</h1>",
        f"<p class='sub'>{esc(b.label or 'baseline')} &nbsp;→&nbsp; {esc(c.label or 'candidate')}</p>",
        banner,
        "<table><thead><tr><th>Capability</th><th>Baseline</th><th>Candidate</th>",
        "<th>Δ</th><th>Candidate</th><th>Status</th></tr></thead>",
        f"<tbody>{''.join(body_rows)}</tbody></table>{notes}",
        "<footer>Generated by probelock — deterministic, no LLM judge.</footer>",
        "</body></html>",
    ]
    return "".join(page)
382
+
383
+
384
@app.command()
def diff(
    baseline: Path = typer.Argument(..., help="Baseline lockfile."),
    candidate: Path = typer.Argument(..., help="Candidate lockfile."),
    max_drop: float = typer.Option(0.05, "--max-drop", help="Regression threshold."),
    confidence: Optional[float] = typer.Option(
        None, "--confidence", help="If set (e.g. 0.95), mark sub-significant drops 'noisy'."
    ),
    fmt: str = typer.Option("table", "--format", help="table | markdown | json | html"),
) -> None:
    """Show within-model capability deltas between two lockfiles (informational)."""
    _validate_confidence(confidence)
    b = _read_lock(baseline)
    c = _read_lock(candidate)
    result = diff_lockfiles(b, c, max_drop, confidence)

    # The table format is drawn by rich on the console; the other formats are
    # built as strings and written with plain print().
    if fmt == "table":
        _render_diff(result, b, c)
        return

    text_renderers = {
        "markdown": _markdown_diff,
        "json": lambda *args: json.dumps(_diff_payload(*args), indent=2),
        "html": _html_diff,
    }
    render = text_renderers.get(fmt)
    if render is None:
        _err(f"Unknown --format '{fmt}' (use table | markdown | json | html).")
        raise typer.Exit(2)
    print(render(result, b, c))
409
+
410
+
411
@app.command()
def gate(
    baseline: Path = typer.Option(..., "--baseline", "-b", help="Baseline (committed) lockfile."),
    candidate: Path = typer.Option(..., "--candidate", "-c", help="Candidate lockfile."),
    max_drop: float = typer.Option(0.05, "--max-drop", help="Regression threshold."),
    require_same_model: bool = typer.Option(
        False, "--require-same-model", help="Fail if baseline and candidate are different models."
    ),
    confidence: Optional[float] = typer.Option(
        None, "--confidence", help="Only fail on drops significant at this confidence (e.g. 0.95)."
    ),
) -> None:
    """CI gate: exit 1 if any capability regressed (or was dropped) beyond --max-drop.

    With --confidence, a drop past --max-drop that isn't statistically significant
    for the recorded sample count is reported as 'noisy' and does NOT fail the gate.
    Exit 2 is reserved for invalid input (bad lockfile, or a cross-model
    comparison under --require-same-model), so CI can tell the two apart.
    """
    _validate_confidence(confidence)
    b = _read_lock(baseline)
    c = _read_lock(candidate)
    result = diff_lockfiles(b, c, max_drop, confidence)
    _render_diff(result, b, c)

    # Mention 'noisy' rows; they do not affect the exit code here.
    noisy_count = sum(1 for row in result.rows if row.status == "noisy")
    if noisy_count:
        console.print(
            f"[yellow]{noisy_count} drop(s) past --max-drop are below the {confidence} "
            f"confidence bar (noisy ↓) — raise --samples to confirm or clear them.[/]"
        )

    # The same-model check runs before the regression verdict, so a
    # cross-model comparison exits 2 even when a capability also regressed.
    if require_same_model:
        if b.model and c.model and b.model != c.model:
            console.print(
                f"\n[bold red]INVALID[/] — --require-same-model set but models differ "
                f"({escape(b.model)} vs {escape(c.model)})."
            )
            raise typer.Exit(2)

    if not result.regressed:
        console.print(f"\n[bold green]PASS[/] — no capability regressed beyond {max_drop:.2f}.")
        return

    names = escape(", ".join(row.capability for row in result.regressions))
    console.print(f"\n[bold red]FAIL[/] — capabilities regressed or removed: {names}")
    raise typer.Exit(1)
453
+
454
+
455
+ _TEMPLATE_TOOLS = """\
456
+ [
457
+ {
458
+ "type": "function",
459
+ "function": {
460
+ "name": "create_event",
461
+ "description": "Create a calendar event",
462
+ "parameters": {
463
+ "type": "object",
464
+ "properties": {
465
+ "title": {"type": "string"},
466
+ "start": {"type": "string", "description": "ISO 8601 datetime"},
467
+ "visibility": {"type": "string", "enum": ["public", "private"]}
468
+ },
469
+ "required": ["title", "start"]
470
+ }
471
+ }
472
+ }
473
+ ]
474
+ """
475
+
476
+ _TEMPLATE_WORKFLOW = """\
477
+ name: probelock
478
+ on: [pull_request]
479
+
480
+ jobs:
481
+ capabilities:
482
+ runs-on: ubuntu-latest
483
+ steps:
484
+ - uses: actions/checkout@v4
485
+ - uses: astral-sh/setup-uv@v5
486
+ # Point --endpoint at your model server. CI needs network access to it
487
+ # (a hosted endpoint, or a self-hosted runner with Ollama/llama.cpp).
488
+ # Uses the published `probelock` from PyPI. To pin an unreleased revision,
489
+ # replace `uvx probelock` with:
490
+ # uvx --from git+https://github.com/kelkalot/probelock probelock ...
491
+ - name: Probe candidate
492
+ run: uvx probelock probe --tools probelock.tools.json
493
+ --endpoint "$LLM_ENDPOINT" --model "$LLM_MODEL"
494
+ --samples 5 --temperature 0.7 -o candidate.lock
495
+ env:
496
+ LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
497
+ LLM_MODEL: ${{ vars.LLM_MODEL }}
498
+ - name: Gate on regression
499
+ run: uvx probelock gate --baseline probelock.lock --candidate candidate.lock
500
+ --max-drop 0.05 --confidence 0.95
501
+ """
502
+
503
+
504
@app.command()
def init(
    path: Path = typer.Option(Path("."), "--path", help="Directory to scaffold into."),
    force: bool = typer.Option(False, "--force", help="Overwrite existing files."),
) -> None:
    """Scaffold a tools file and a CI workflow to get started."""
    targets = [
        (path / "probelock.tools.json", _TEMPLATE_TOOLS),
        (path / ".github" / "workflows" / "probelock.yml", _TEMPLATE_WORKFLOW),
    ]
    for target, content in targets:
        # Paths are dynamic text: escape them so a directory name containing
        # brackets isn't swallowed (or rejected) by rich's markup parser.
        shown = escape(str(target))
        if target.exists() and not force:
            console.print(f"[yellow]exists, skipped[/] {shown} (use --force)")
            continue
        target.parent.mkdir(parents=True, exist_ok=True)
        # Write UTF-8 explicitly instead of the platform locale encoding.
        target.write_text(content, encoding="utf-8")
        console.print(f"[green]created[/] {shown}")
    console.print(
        "\nNext: probe your model and commit the baseline, then gate candidates in CI:\n"
        " probelock probe --tools probelock.tools.json "
        "--endpoint http://localhost:11434/v1 --model <model> -o probelock.lock\n"
        " git add probelock.lock # this is your committed baseline"
    )
527
+
528
+
529
@app.command()
def version() -> None:
    """Print the probelock version."""
    # Printed on the stdout console.
    release = __version__
    console.print(release)
533
+
534
+
535
def main() -> None:
    """Run the Typer application; also called when this module is executed as a script."""
    app()


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()