debugerai 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debugai/__init__.py +51 -0
- debugai/agents/__init__.py +43 -0
- debugai/agents/base.py +192 -0
- debugai/agents/builtin.py +246 -0
- debugai/agents/registry.py +31 -0
- debugai/agents/types.py +108 -0
- debugai/analyze.py +142 -0
- debugai/calibration.py +198 -0
- debugai/cli.py +171 -0
- debugai/config.py +134 -0
- debugai/detectors.py +206 -0
- debugai/diagnosis.py +64 -0
- debugai/explainer.py +105 -0
- debugai/integrations/__init__.py +5 -0
- debugai/integrations/langchain.py +109 -0
- debugai/judge.py +171 -0
- debugai/metrics.py +139 -0
- debugai/models.py +92 -0
- debugai/providers.py +179 -0
- debugai/schema.py +66 -0
- debugai/sdk.py +1271 -0
- debugai/signals.py +399 -0
- debugai/thresholds.json +15 -0
- debugai/thresholds.py +44 -0
- debugai/tracing.py +283 -0
- debugerai-0.2.0.dist-info/METADATA +535 -0
- debugerai-0.2.0.dist-info/RECORD +31 -0
- debugerai-0.2.0.dist-info/WHEEL +5 -0
- debugerai-0.2.0.dist-info/entry_points.txt +2 -0
- debugerai-0.2.0.dist-info/licenses/LICENSE +21 -0
- debugerai-0.2.0.dist-info/top_level.txt +1 -0
debugai/analyze.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"""Level 1 API — the single-call entry point (Architecture §3.2).
|
|
2
|
+
|
|
3
|
+
from debugai import analyze
|
|
4
|
+
result = analyze(
|
|
5
|
+
prompt="What is the refund policy?",
|
|
6
|
+
output="Refunds are issued within 90 days...",
|
|
7
|
+
chunks=[...],
|
|
8
|
+
similarity_scores=[...],
|
|
9
|
+
)
|
|
10
|
+
print(result["primary"]["failure"], result["primary"]["confidence"])
|
|
11
|
+
|
|
12
|
+
Returns the structured JSON contract from §7.3:
|
|
13
|
+
{ "healthy": bool,
|
|
14
|
+
"primary": {failure, confidence, severity, root_cause, fix, evidence},
|
|
15
|
+
"secondary": [ ... ],
|
|
16
|
+
"signals": { ...8 metrics... },
|
|
17
|
+
"explanation": "human-readable text" }
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
from typing import Any
|
|
23
|
+
|
|
24
|
+
from debugai.diagnosis import diagnose
|
|
25
|
+
from debugai.explainer import explain
|
|
26
|
+
from debugai.judge import INSTRUCTION_VIOLATION, judge_instructions
|
|
27
|
+
from debugai.schema import CaptureRecord
|
|
28
|
+
from debugai.signals import compute_signals, measure_variance
|
|
29
|
+
from debugai.thresholds import DEFAULT_THRESHOLDS, Thresholds
|
|
30
|
+
|
|
31
|
+
_IV_FIX = (
|
|
32
|
+
"Strengthen the system prompt to enforce the violated rules: reveal at most "
|
|
33
|
+
"one small hint per turn (never the full solution early), ask exactly one NEW "
|
|
34
|
+
"leading question that advances beyond what was already said (never restate a "
|
|
35
|
+
"prior question), and don't open by paraphrasing the student."
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _merge_instruction(result: dict, jd) -> dict:
    """Fold an instruction-adherence verdict into an existing diagnosis dict.

    Builds an ``INSTRUCTION_VIOLATION`` finding from the judge verdict ``jd``
    (its ``violations``, ``confidence`` and ``model`` attributes are read),
    pools it with the findings already present in ``result`` and re-ranks the
    pool by confidence, highest first. The top finding becomes ``primary`` and
    the rest become ``secondary``.

    ``result`` is updated in place (and marked unhealthy) and also returned.
    """
    violations = jd.violations
    has_critical = any(v.severity == "critical" for v in violations)
    # Only the first three rules are quoted in the root cause text.
    rule_summary = "; ".join(v.rule for v in violations[:3])

    finding = {
        "failure": INSTRUCTION_VIOLATION,
        "confidence": jd.confidence,
        "severity": "critical" if has_critical else "warning",
        "root_cause": f"The response violates {len(violations)} system-prompt "
                      f"rule(s): {rule_summary}",
        "fix": _IV_FIX,
        "evidence": {
            "violations": [v.to_dict() for v in violations],
            "judge_model": jd.model,
        },
    }

    candidates = []
    if result.get("primary"):
        candidates.append(result["primary"])
    candidates.extend(result.get("secondary", []))
    candidates.append(finding)

    # sorted() is stable, so ties keep their existing relative order.
    ranked = sorted(candidates, key=lambda r: r["confidence"], reverse=True)

    result["healthy"] = False
    result["primary"] = ranked[0]
    result["secondary"] = ranked[1:]
    return result
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def analyze(
    prompt: str,
    output: str,
    *,
    system_prompt: str = "",
    chunks: list[str] | None = None,
    similarity_scores: list[float] | None = None,
    retrieval_query: str | None = None,
    expected_output: str | None = None,
    model_name: str | None = None,
    temperature: float | None = None,
    max_tokens: int | None = None,
    context_window: int | None = None,
    latency_ms: int | None = None,
    token_usage: dict[str, int] | None = None,
    thresholds: Thresholds = DEFAULT_THRESHOLDS,
    explain_with_llm: bool = True,
    lazy: bool = False,
    judge: bool = False,
    judge_model: str | None = None,
    variance_rerun: Any = None,
    variance_runs: int = 3,
) -> dict[str, Any]:
    """Diagnose why an LLM output failed and return a structured fix.

    Only ``prompt`` and ``output`` are required (Core IO). Supplying retrieval
    and runtime fields unlocks the RAG and capacity signals.

    Args:
        prompt: The user prompt sent to the model.
        output: The model's response to diagnose.
        system_prompt: System prompt in effect; the judge step is skipped when
            it is empty.
        chunks: Retrieved context chunks (``None`` is treated as no chunks).
        similarity_scores: Retrieval similarity scores (``None`` → empty).
        retrieval_query: Query used for retrieval, if any.
        expected_output: Reference answer, if known.
        model_name, temperature, max_tokens, context_window, latency_ms,
        token_usage: Runtime metadata stored on the capture record.
        thresholds: Detection thresholds handed to ``diagnose``.
        explain_with_llm: When true, call ``explain`` for the explanation
            text; otherwise the primary finding's ``root_cause`` is used.
        lazy: Forwarded to ``compute_signals``.
        judge: When true (and a system prompt is present), run
            ``judge_instructions`` and merge any violation into the result.
        judge_model: Model passed to ``judge_instructions``.
        variance_rerun: If not ``None``, passed to ``measure_variance`` to
            replace the variance proxy with a measured value.
        variance_runs: Run count passed to ``measure_variance``.

    Returns:
        The diagnosis dict (``diag.to_dict()``) extended with ``explanation``
        and ``explainer_model`` keys, and re-ranked when the judge fired.
    """
    rec = CaptureRecord(
        user_prompt=prompt,
        llm_output=output,
        system_prompt=system_prompt,
        expected_output=expected_output,
        retrieved_chunks=chunks or [],
        similarity_scores=similarity_scores or [],
        retrieval_query=retrieval_query,
        model_name=model_name,
        temperature=temperature,
        max_tokens=max_tokens,
        context_window=context_window,
        latency_ms=latency_ms,
        token_usage=token_usage or {},
    )

    signals = compute_signals(rec, lazy=lazy)
    # Deep mode (§7.5 Tier 2): replace the variance proxy with a measured value
    # from actually re-running the model, before classifying.
    if variance_rerun is not None:
        signals.variance = measure_variance(
            variance_rerun, rec.system_prompt, rec.user_prompt,
            rec.retrieved_chunks, rec.temperature, variance_runs)
        signals.variance_method = "measured"
    diag = diagnose(signals, rec, thresholds)
    result = diag.to_dict()

    # Optional behavioural / instruction-following check (LLM-as-judge) — catches
    # failures the grounding signals can't see (e.g. a tutor revealing the answer).
    if judge and rec.system_prompt:
        jd = judge_instructions(rec.system_prompt, rec.user_prompt, rec.llm_output,
                                model=judge_model)
        if not jd.healthy:
            result = _merge_instruction(result, jd)

    if explain_with_llm:
        explanation = explain(diag)
        result["explainer_model"] = explanation["model"]
        # The explainer only saw the deterministic diagnosis (``diag``), so its
        # text and fix describe diag's primary — not a judge finding that may
        # have been promoted to primary above.
        judge_is_primary = (
            (result.get("primary") or {}).get("failure") == INSTRUCTION_VIOLATION
        )
        if judge_is_primary:
            # Keep the judge finding's own root cause and fix untouched.
            result["explanation"] = result["primary"]["root_cause"]
        else:
            result["explanation"] = explanation["explanation"]
            if diag.primary is not None and explanation.get("fix"):
                result["primary"]["fix"] = explanation["fix"]
    else:
        result["explanation"] = (
            result["primary"]["root_cause"] if result.get("primary") else "No failure detected."
        )
        result["explainer_model"] = "none"

    return result
|
debugai/calibration.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
"""Adaptive threshold calibration (Architecture §7.2).
|
|
2
|
+
|
|
3
|
+
Static thresholds break across embedding models, domains, and chunk sizes. The
|
|
4
|
+
``ThresholdStore`` learns a per-user "known good" baseline from the signals of
|
|
5
|
+
healthy requests and tightens the gating thresholds to *that user's* norms:
|
|
6
|
+
|
|
7
|
+
cold (<50 requests) sensible defaults — not enough data yet
|
|
8
|
+
warm (50-500) percentile-based (5th / 95th of healthy baseline)
|
|
9
|
+
hot (>500) rolling-window z-score (mean ± 2 std), last `window`
|
|
10
|
+
|
|
11
|
+
A signal is only adapted once it has ``MIN_SAMPLES`` healthy observations;
|
|
12
|
+
otherwise its default is kept. Every calibrated value is clamped to a sane band
|
|
13
|
+
so pathological data can't produce a runaway threshold.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import dataclasses
|
|
19
|
+
import json
|
|
20
|
+
import threading
|
|
21
|
+
from dataclasses import dataclass
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
|
|
24
|
+
from debugai.thresholds import DEFAULT_THRESHOLDS, Thresholds
|
|
25
|
+
|
|
26
|
+
COLD_MAX = 50
|
|
27
|
+
WARM_MAX = 500
|
|
28
|
+
MIN_SAMPLES = 15
|
|
29
|
+
Z = 2.0 # z-score: anomaly = 2 std beyond the healthy baseline
|
|
30
|
+
|
|
31
|
+
# Thresholds field -> (signal key, direction). "low" = anomaly below threshold,
|
|
32
|
+
# "high" = anomaly above threshold. Fields not listed stay fixed (semantic).
|
|
33
|
+
CALIBRATION_SPEC: dict[str, tuple[str, str]] = {
|
|
34
|
+
"similarity_min": ("similarity", "low"),
|
|
35
|
+
"overlap_low": ("overlap", "low"),
|
|
36
|
+
"entity_coverage_min": ("entity_coverage", "low"),
|
|
37
|
+
"contradiction_min": ("contradiction", "high"),
|
|
38
|
+
"variance_min": ("variance", "high"),
|
|
39
|
+
"context_length_ratio_max": ("context_ratio", "high"),
|
|
40
|
+
"token_usage_high": ("token_ratio", "high"),
|
|
41
|
+
"latency_high_ms": ("latency_ms", "high"),
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
# Clamp bands keep calibrated thresholds reasonable regardless of the data.
|
|
45
|
+
CLAMP: dict[str, tuple[float, float]] = {
|
|
46
|
+
"similarity_min": (0.10, 0.90),
|
|
47
|
+
"overlap_low": (0.10, 0.85),
|
|
48
|
+
"entity_coverage_min": (0.10, 0.85),
|
|
49
|
+
"contradiction_min": (0.10, 0.60),
|
|
50
|
+
"variance_min": (0.15, 0.80),
|
|
51
|
+
"context_length_ratio_max": (0.50, 0.95),
|
|
52
|
+
"token_usage_high": (0.50, 0.95),
|
|
53
|
+
"latency_high_ms": (500.0, 30000.0),
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
_SIGNAL_KEYS = sorted({k for k, _ in CALIBRATION_SPEC.values()})
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _mean(xs: list[float]) -> float:
|
|
60
|
+
return sum(xs) / len(xs)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _std(xs: list[float], mu: float) -> float:
|
|
64
|
+
if len(xs) < 2:
|
|
65
|
+
return 0.0
|
|
66
|
+
return (sum((x - mu) ** 2 for x in xs) / (len(xs) - 1)) ** 0.5
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _percentile(xs: list[float], p: float) -> float:
|
|
70
|
+
s = sorted(xs)
|
|
71
|
+
if not s:
|
|
72
|
+
return 0.0
|
|
73
|
+
idx = (len(s) - 1) * (p / 100.0)
|
|
74
|
+
lo, hi = int(idx), min(int(idx) + 1, len(s) - 1)
|
|
75
|
+
frac = idx - lo
|
|
76
|
+
return s[lo] * (1 - frac) + s[hi] * frac
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@dataclass
class SignalCalibration:
    """Calibration outcome for a single threshold field.

    One instance is produced per entry of ``CALIBRATION_SPEC`` and is what the
    calibration report serialises for each signal.
    """

    signal: str           # signal key the threshold is calibrated from
    direction: str        # "low" or "high": which side of the threshold is anomalous
    field: str            # name of the Thresholds field being calibrated
    n: int                # number of healthy baseline samples carrying this signal
    baseline_mean: float  # mean of those samples (rounded to 4 places)
    baseline_std: float   # sample standard deviation of those samples (rounded)
    default: float        # the default threshold for this field
    value: float          # threshold in effect: calibrated and clamped, or the default
    adapted: bool         # True when ``value`` came from calibration, not the default
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
class ThresholdStore:
    """Per-user adaptive threshold store. Thread-safe; optionally persisted.

    The store counts every recorded request and keeps the signal rows of the
    most recent ``window`` healthy requests as the baseline. ``current()``
    turns that baseline into calibrated ``Thresholds``; ``details()`` exposes
    the full calibration report.

    Mutating and reading methods take an internal (non-reentrant) lock;
    ``regime()`` deliberately does not, because it is called by other methods
    while they hold the lock.
    """

    def __init__(self, path: Path | None = None, window: int = WARM_MAX):
        """Create a store.

        Args:
            path: Optional JSON file used to persist the store; a ``str`` is
                accepted and converted. ``None`` keeps the store in memory.
            window: Maximum number of healthy rows kept as the baseline.
                Values below 1 are raised to 1, because a zero window would
                turn the ``[-window:]`` trimming slices into no-ops and let
                the baseline grow without bound.
        """
        self._path = Path(path) if path is not None else None
        self._window = max(1, int(window))
        self._lock = threading.Lock()
        self._total = 0
        self._healthy: list[dict[str, float]] = []  # baseline signal rows
        if self._path is not None:
            self._load()

    # --- persistence -------------------------------------------------------
    def _load(self) -> None:
        """Best-effort restore from ``self._path``.

        A missing, unreadable or malformed file leaves the store in its cold
        start state instead of raising.
        """
        try:
            data = json.loads(self._path.read_text(encoding="utf-8"))
            if not isinstance(data, dict):
                return
            total = int(data.get("total", 0) or 0)
        except (OSError, ValueError, TypeError):  # missing / corrupt / unreadable → cold start
            return
        rows = data.get("healthy", [])
        self._total = total
        if not isinstance(rows, list):
            self._healthy = []
            return
        # Keep only well-formed rows and numeric entries so a hand-edited or
        # partially corrupt file cannot crash calibration later.
        clean = [
            {k: float(v) for k, v in row.items() if isinstance(v, (int, float))}
            for row in rows
            if isinstance(row, dict)
        ]
        self._healthy = clean[-self._window:]

    def _persist(self) -> None:
        """Write the counter and the baseline window to ``self._path``, if set."""
        if self._path is None:
            return
        payload = {"total": self._total, "healthy": self._healthy[-self._window:]}
        self._path.write_text(json.dumps(payload), encoding="utf-8")

    # --- ingest ------------------------------------------------------------
    def record(self, signals: dict, healthy: bool) -> None:
        """Count one request and, if it was healthy, add it to the baseline.

        Args:
            signals: Mapping of signal key to value. Keys that are missing or
                ``None`` are recorded as 0.0.
            healthy: Whether the request was diagnosed as healthy.
        """
        with self._lock:
            self._total += 1
            if healthy:
                row = {k: float(signals.get(k) or 0.0) for k in _SIGNAL_KEYS}
                self._healthy.append(row)
                # Trim in place to the most recent ``window`` rows.
                del self._healthy[:-self._window]
            self._persist()

    # --- regime ------------------------------------------------------------
    def regime(self) -> str:
        """Return "cold", "warm" or "hot" based on the total request count."""
        if self._total < COLD_MAX:
            return "cold"
        return "warm" if self._total <= WARM_MAX else "hot"

    # --- calibration -------------------------------------------------------
    def _calibrate_field(self, field: str, regime: str) -> SignalCalibration:
        """Compute the calibration for one ``CALIBRATION_SPEC`` field.

        The caller is expected to hold the lock. The default threshold is kept
        unless the regime is warm/hot, at least ``MIN_SAMPLES`` baseline values
        exist and the baseline is not all zeros.
        """
        signal, direction = CALIBRATION_SPEC[field]
        default = getattr(DEFAULT_THRESHOLDS, field)
        values = [r[signal] for r in self._healthy if signal in r]
        mu = _mean(values) if values else default
        sd = _std(values, mu) if values else 0.0

        # A baseline that is entirely zero means the signal was never exercised
        # (e.g. no context_window supplied → context_ratio always 0). Don't adapt
        # it — keep the default rather than collapsing to a clamp floor.
        degenerate = mu == 0.0 and sd == 0.0
        adapted = regime != "cold" and len(values) >= MIN_SAMPLES and not degenerate
        value = default
        if adapted:
            if regime == "warm":  # percentile of the healthy baseline
                value = _percentile(values, 5.0 if direction == "low" else 95.0)
            else:  # hot: rolling-window z-score
                value = mu - Z * sd if direction == "low" else mu + Z * sd
            lo, hi = CLAMP[field]
            value = max(lo, min(value, hi))

        return SignalCalibration(
            signal=signal, direction=direction, field=field, n=len(values),
            baseline_mean=round(mu, 4), baseline_std=round(sd, 4),
            default=default, value=round(value, 4), adapted=adapted,
        )

    def current(self) -> Thresholds:
        """The active, calibrated thresholds for this user."""
        with self._lock:
            regime = self.regime()
            if regime == "cold":
                return DEFAULT_THRESHOLDS
            overrides = {
                name: self._calibrate_field(name, regime).value
                for name in CALIBRATION_SPEC
            }
        return dataclasses.replace(DEFAULT_THRESHOLDS, **overrides)

    def details(self) -> dict:
        """Full calibration report for the dashboard."""
        with self._lock:
            regime = self.regime()
            cals = [self._calibrate_field(f, regime) for f in CALIBRATION_SPEC]
            if regime == "cold":
                next_at = COLD_MAX
            elif regime == "warm":
                # regime() stays "warm" through WARM_MAX inclusive, so the
                # first "hot" request count is WARM_MAX + 1.
                next_at = WARM_MAX + 1
            else:
                next_at = None
            return {
                "regime": regime,
                "total_requests": self._total,
                "healthy_baseline": len(self._healthy),
                "window": self._window,
                "next_regime_at": next_at,
                "signals": [dataclasses.asdict(c) for c in cals],
            }

    def reset(self) -> None:
        """Forget all counts and baseline rows, and persist the empty state."""
        with self._lock:
            self._total = 0
            self._healthy = []
            self._persist()
|
debugai/cli.py
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
"""DebugAI command-line interface.
|
|
2
|
+
|
|
3
|
+
debugai analyze --prompt "..." --output "..." --chunk "..." --score 0.4
|
|
4
|
+
debugai diagnose cases.json
|
|
5
|
+
debugai fix cases.json --simulate
|
|
6
|
+
debugai serve --port 8000
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import argparse
|
|
12
|
+
import json
|
|
13
|
+
import sys
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
from debugai import analyze
|
|
17
|
+
from debugai.agents import propose_fix
|
|
18
|
+
from debugai.schema import CaptureRecord
|
|
19
|
+
|
|
20
|
+
_ANSI = {"red": "\033[31m", "green": "\033[32m", "yellow": "\033[33m",
|
|
21
|
+
"dim": "\033[2m", "bold": "\033[1m", "reset": "\033[0m"}
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _c(text: str, color: str) -> str:
|
|
25
|
+
if not sys.stdout.isatty():
|
|
26
|
+
return text
|
|
27
|
+
return f"{_ANSI[color]}{text}{_ANSI['reset']}"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _grounded_stub(system_prompt, user_prompt, chunks, temperature):
|
|
31
|
+
ctx = " ".join(chunks)
|
|
32
|
+
return ("Per the provided context: " + ctx) if ctx else "I don't have that information."
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _case_kwargs(case: dict) -> dict:
|
|
36
|
+
keys = ("prompt", "output", "system_prompt", "chunks", "similarity_scores",
|
|
37
|
+
"retrieval_query", "temperature", "max_tokens", "context_window",
|
|
38
|
+
"latency_ms", "model_name")
|
|
39
|
+
return {k: case[k] for k in keys if k in case}
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _print_diagnosis(diag: dict, as_json: bool) -> None:
|
|
43
|
+
if as_json:
|
|
44
|
+
print(json.dumps(diag, indent=2))
|
|
45
|
+
return
|
|
46
|
+
if diag.get("healthy"):
|
|
47
|
+
print(_c("✓ healthy", "green") + " — no failure detected")
|
|
48
|
+
return
|
|
49
|
+
p = diag["primary"]
|
|
50
|
+
color = "red" if p["severity"] == "critical" else "yellow"
|
|
51
|
+
print(_c(f"✗ {p['failure']}", color) + f" conf {p['confidence']} ({p['severity']})")
|
|
52
|
+
print(" " + p["root_cause"])
|
|
53
|
+
print(_c(" fix: ", "dim") + p["fix"])
|
|
54
|
+
if diag.get("secondary"):
|
|
55
|
+
print(_c(" secondary: ", "dim") + ", ".join(s["failure"] for s in diag["secondary"]))
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _load_cases(path: Path) -> list[dict]:
|
|
59
|
+
data = json.loads(path.read_text())
|
|
60
|
+
if isinstance(data, dict) and "cases" in data:
|
|
61
|
+
return [{k: v for k, v in c.items() if k not in ("id", "expected", "_comment")}
|
|
62
|
+
for c in data["cases"]]
|
|
63
|
+
if isinstance(data, list):
|
|
64
|
+
return data
|
|
65
|
+
return [data]
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# --------------------------------------------------------------------------- #
|
|
69
|
+
def cmd_analyze(args) -> int:
    """Handle ``debugai analyze``: diagnose one prompt/output pair and print it.

    Empty option values (no ``--system``, no ``--chunk``/``--score``) are
    normalised to ``""`` / ``None`` before being handed to ``analyze``.
    Always returns exit code 0.
    """
    options = dict(
        prompt=args.prompt,
        output=args.output,
        system_prompt=args.system or "",
        chunks=args.chunk or None,
        similarity_scores=args.score or None,
        temperature=args.temperature,
        context_window=args.context_window,
        explain_with_llm=args.explain,
    )
    diagnosis = analyze(**options)
    _print_diagnosis(diagnosis, args.json)
    return 0
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def cmd_diagnose(args) -> int:
    """Handle ``debugai diagnose``: diagnose every case in a JSON file.

    Each case is analysed without the LLM explainer. In text mode every case is
    printed as it is processed, followed by a failing/total tally; in JSON mode
    the list of all diagnoses is printed once at the end. Always returns 0.
    """
    text_mode = not args.json
    results = []
    for case in _load_cases(Path(args.file)):
        diagnosis = analyze(explain_with_llm=False, **_case_kwargs(case))
        results.append(diagnosis)
        if text_mode:
            # Fall back to the start of the prompt when the case has no label.
            label = case.get("label") or (case.get("prompt", "")[:48])
            print(_c(label, "bold"))
            _print_diagnosis(diagnosis, False)
            print()

    if text_mode:
        failing = sum(1 for r in results if not r["healthy"])
        print(_c(f"{failing}/{len(results)} failing", "dim"))
    else:
        print(json.dumps(results, indent=2))
    return 0
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def cmd_fix(args) -> int:
    """Handle ``debugai fix``: diagnose each case and propose a fix for it.

    For every case the diagnosis (without the LLM explainer) and a matching
    ``CaptureRecord`` are handed to ``propose_fix``. With ``--simulate`` the
    grounded stub model is supplied as the ``rerun`` callable; otherwise no
    rerun callable is given. The agent, verdict, test tally and any diff are
    printed per case. Always returns 0.
    """
    verdict_colors = {
        "verified": "green", "mitigated": "yellow", "escalated": "yellow",
        "failed": "red", "pending_rerun": "dim",
    }
    cases = _load_cases(Path(args.file))
    rerun = None
    if args.simulate:
        rerun = _grounded_stub

    for case in cases:
        kwargs = _case_kwargs(case)
        diagnosis = analyze(explain_with_llm=False, **kwargs)
        record = CaptureRecord(
            user_prompt=kwargs.get("prompt", ""),
            llm_output=kwargs.get("output", ""),
            system_prompt=kwargs.get("system_prompt", ""),
            retrieved_chunks=kwargs.get("chunks") or [],
            similarity_scores=kwargs.get("similarity_scores") or [],
            temperature=kwargs.get("temperature"),
            context_window=kwargs.get("context_window"),
        )
        report = propose_fix(diagnosis, record, rerun=rerun)

        label = case.get("label") or kwargs.get("prompt", "")[:48]
        print(_c(label, "bold"))
        if report is None:
            print(" healthy / no agent\n")
            continue

        tone = verdict_colors.get(report.verdict, "dim")
        verdict = _c(report.verdict, tone)
        tally = f" tests {report.tests_passed}/{report.tests_total}"
        print(f" {report.agent} → " + verdict + tally)
        if report.diff:
            # Indent continuation lines of the diff to line up with the first.
            print(_c(" " + report.diff.replace("\n", "\n "), "dim"))
        print()
    return 0
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def cmd_serve(args) -> int:
    """Handle ``debugai serve``: launch the web app with uvicorn.

    ``uvicorn`` is imported lazily so the other sub-commands work without it.
    When it is not installed, a short message is written to stderr and exit
    code 1 is returned instead of surfacing an ImportError traceback.

    Returns:
        0 after the server stops, 1 when uvicorn is unavailable.
    """
    try:
        import uvicorn
    except ImportError:
        print(
            "debugai serve requires the 'uvicorn' package; "
            "install it with `pip install uvicorn`.",
            file=sys.stderr,
        )
        return 1
    uvicorn.run("server.app:app", host=args.host, port=args.port, reload=args.reload)
    return 0
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def main(argv=None) -> int:
    """Parse the command line and dispatch to the selected sub-command.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        The exit code produced by the sub-command handler.
    """
    parser = argparse.ArgumentParser(prog="debugai", description="Diagnose & fix LLM failures.")
    commands = parser.add_subparsers(dest="cmd", required=True)

    # debugai analyze
    analyze_cmd = commands.add_parser("analyze", help="diagnose a single prompt/output")
    analyze_cmd.add_argument("--prompt", required=True)
    analyze_cmd.add_argument("--output", required=True)
    analyze_cmd.add_argument("--system", default="")
    analyze_cmd.add_argument("--chunk", action="append", help="a retrieved chunk (repeatable)")
    analyze_cmd.add_argument("--score", action="append", type=float,
                             help="similarity score (repeatable)")
    analyze_cmd.add_argument("--temperature", type=float)
    analyze_cmd.add_argument("--context-window", type=int, dest="context_window")
    analyze_cmd.add_argument("--explain", action="store_true", help="use the LLM explainer")
    analyze_cmd.add_argument("--json", action="store_true")
    analyze_cmd.set_defaults(func=cmd_analyze)

    # debugai diagnose
    diagnose_cmd = commands.add_parser("diagnose", help="diagnose a JSON file of cases")
    diagnose_cmd.add_argument("file")
    diagnose_cmd.add_argument("--json", action="store_true")
    diagnose_cmd.set_defaults(func=cmd_diagnose)

    # debugai fix
    fix_cmd = commands.add_parser("fix", help="diagnose + propose/verify a fix for each case")
    fix_cmd.add_argument("file")
    fix_cmd.add_argument("--simulate", action="store_true",
                         help="run the verify loop with a grounded stub model")
    fix_cmd.set_defaults(func=cmd_fix)

    # debugai serve
    serve_cmd = commands.add_parser("serve", help="launch the web app")
    serve_cmd.add_argument("--host", default="127.0.0.1")
    serve_cmd.add_argument("--port", type=int, default=8000)
    serve_cmd.add_argument("--reload", action="store_true")
    serve_cmd.set_defaults(func=cmd_serve)

    namespace = parser.parse_args(argv)
    handler = namespace.func
    return handler(namespace)
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
if __name__ == "__main__":
|
|
171
|
+
raise SystemExit(main())
|
debugai/config.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""DebugAI SDK configuration — a single object controls everything that runs
|
|
2
|
+
per request, replacing the scattered wrap_llm() keyword arguments.
|
|
3
|
+
|
|
4
|
+
from debugai import DebugAIConfig
|
|
5
|
+
config = DebugAIConfig(
|
|
6
|
+
enable_judge=True, # LLM-as-judge for system-prompt adherence
|
|
7
|
+
sample_rate=0.1, # diagnose 10% of requests
|
|
8
|
+
on_diagnosis=lambda d: print(d["primary"]),
|
|
9
|
+
sink_url="http://my-debugai/api/traces",
|
|
10
|
+
sink_token="dbg_...",
|
|
11
|
+
)
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from dataclasses import dataclass, field
|
|
17
|
+
from typing import Any, Callable
|
|
18
|
+
|
|
19
|
+
from debugai.thresholds import DEFAULT_THRESHOLDS, Thresholds
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class DebugAIConfig:
    """All per-request SDK settings gathered in one object.

    Every field has a default, so ``DebugAIConfig()`` is a valid configuration
    and callers override only what they need.
    """

    # ── Background workers ──────────────────────────────────────────────────
    enable_diagnosis: bool = True
    """Run the 8-signal engine and the 5 detectors on each (sampled) request."""

    enable_traces: bool = True
    """Emit one observability Trace (spans, scores, cost) for each request."""

    enable_judge: bool = False
    """Check system-prompt rule adherence with an LLM-as-judge (one extra LLM call)."""

    enable_explain: bool = False
    """Generate a human-readable explanation with the LLM explainer (one extra LLM call)."""

    lazy: bool = True
    """When the cheap signals look healthy, skip the expensive ones (embeddings/NER/NLI)."""

    sample_rate: float = 1.0
    """Fraction of requests that get diagnosed (0.0–1.0). Deterministic, count-based."""

    max_queue_depth: int = 10_000
    """Upper bound on pending jobs in the background worker queue. Jobs beyond
    it are dropped (backpressure) so diagnosis never slows the real request."""

    # ── Metrics ─────────────────────────────────────────────────────────────
    track_tokens: bool = True
    """Accumulate prompt and completion token counts per model in MetricsLedger."""

    track_cost: bool = True
    """Estimate the cost of each request and accumulate it in MetricsLedger."""

    track_latency: bool = True
    """Record each request's latency for the p50/p95 figures in MetricsLedger."""

    # ── Sinks ───────────────────────────────────────────────────────────────
    on_diagnosis: Callable[[dict], None] | None = None
    """Callback receiving each diagnosis dict once background analysis completes."""

    on_trace: Callable[[Any], None] | None = None
    """Callback receiving each Trace object once background analysis completes."""

    on_metrics: Callable[[dict], None] | None = None
    """Callback invoked after each request with a snapshot of its metrics."""

    sink_url: str | None = None
    """DebugAI server endpoint that traces are POSTed to (e.g. http://…/api/traces).
    Needs sink_token when the server has auth enabled."""

    sink_token: str | None = None
    """X-API-Key token used to authenticate against sink_url."""

    # ── Conversation ────────────────────────────────────────────────────────
    session_id: str | None = None
    """Default session ID for all traces; the session() ctx manager overrides it."""

    tags: dict[str, str] = field(default_factory=dict)
    """Key-value tags attached to every trace and diagnosis record."""

    # ── Thresholds ──────────────────────────────────────────────────────────
    thresholds: Thresholds = field(default_factory=lambda: DEFAULT_THRESHOLDS)
    """Detection thresholds. Per-user adaptive calibration overrides them at
    the server level; SDK callers can set them explicitly here."""

    # ── Provider config ──────────────────────────────────────────────────────
    ollama_base_url: str = "http://localhost:11434/v1"
    """Ollama server URL for local models (Qwen, Llama, Phi, DeepSeek…).
    The OLLAMA_BASE_URL env var takes precedence over this value."""

    model_prices: dict | None = None
    """Custom per-model pricing: {"my-model": (input_$/1M, output_$/1M)}.
    Merged with the built-in table; these entries take precedence."""

    # ── LiteLLM-parity features (B1+) ───────────────────────────────────────
    fallbacks: list = field(default_factory=list)
    """Model names to try when the primary call fails (rate limit / error / timeout),
    e.g. fallbacks=['claude-haiku-4-5', 'ollama/qwen2.5']"""

    response_schema: dict | None = None
    """JSON Schema used to validate structured outputs. Violations are surfaced
    as an instruction_violation in the diagnosis."""

    on_schema_violation: Callable | None = None
    """Callback for detected schema violations: fn(output_text, violations_list)."""

    # ── B4: Budget manager ───────────────────────────────────────────────────
    budget_usd: float | None = None
    """Soft spend cap across the MetricsLedger. Once it is crossed, each call
    raises BudgetExceededError (or calls on_budget_exceeded) before running."""

    on_budget_exceeded: Callable | None = None
    """Callback used instead of raising when the budget is exhausted: fn(spent_usd).
    When set, the call is NOT made and this callback fires instead."""

    # ── B5: Request caching ──────────────────────────────────────────────────
    cache_ttl_seconds: int | None = None
    """Cache identical (model, messages) calls for this many seconds. A cache hit
    skips the provider call and returns a CompletionResponse(from_cache=True)."""

    # ── B6: Retry tracing ────────────────────────────────────────────────────
    max_retries: int = 2
    """Retry attempts on rate-limit (429) or transient server errors (500/502/503).
    Attempts are recorded in CompletionResponse.retry_count and trace metadata."""

    retry_backoff_seconds: float = 1.0
    """Base back-off between retries; it is doubled on each attempt."""

    # ── B8: Latency SLA ──────────────────────────────────────────────────────
    latency_sla_ms: float | None = None
    """Latency threshold; a request exceeding it triggers an alert."""

    on_sla_breach: Callable | None = None
    """Callback for a latency_sla_ms breach: fn({"model", "latency_ms", "threshold_ms"})."""
|