debugerai 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
debugai/analyze.py ADDED
@@ -0,0 +1,142 @@
1
+ """Level 1 API — the single-call entry point (Architecture §3.2).
2
+
3
+ from debugai import analyze
4
+ result = analyze(
5
+ prompt="What is the refund policy?",
6
+ output="Refunds are issued within 90 days...",
7
+ chunks=[...],
8
+ similarity_scores=[...],
9
+ )
10
+ print(result["primary"]["failure"], result["primary"]["confidence"])
11
+
12
+ Returns the structured JSON contract from §7.3:
13
+ { "healthy": bool,
14
+ "primary": {failure, confidence, severity, root_cause, fix, evidence},
15
+ "secondary": [ ... ],
16
+ "signals": { ...8 metrics... },
17
+ "explanation": "human-readable text" }
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ from typing import Any
23
+
24
+ from debugai.diagnosis import diagnose
25
+ from debugai.explainer import explain
26
+ from debugai.judge import INSTRUCTION_VIOLATION, judge_instructions
27
+ from debugai.schema import CaptureRecord
28
+ from debugai.signals import compute_signals, measure_variance
29
+ from debugai.thresholds import DEFAULT_THRESHOLDS, Thresholds
30
+
31
+ _IV_FIX = (
32
+ "Strengthen the system prompt to enforce the violated rules: reveal at most "
33
+ "one small hint per turn (never the full solution early), ask exactly one NEW "
34
+ "leading question that advances beyond what was already said (never restate a "
35
+ "prior question), and don't open by paraphrasing the student."
36
+ )
37
+
38
+
39
def _merge_instruction(result: dict, jd) -> dict:
    """Fold an instruction-adherence verdict into the diagnosis.

    Builds an ``INSTRUCTION_VIOLATION`` finding from ``jd`` and re-ranks it
    together with the existing primary and secondary findings by confidence,
    highest first. The top-ranked finding becomes ``primary`` and the rest
    become ``secondary``.

    Args:
        result: Diagnosis dict; read for ``primary`` / ``secondary`` and
            updated in place.
        jd: Judge verdict exposing ``violations`` (items with ``severity``,
            ``rule`` and ``to_dict()``), ``confidence`` and ``model``.

    Returns:
        The same ``result`` object, with ``healthy`` forced to ``False``.
    """
    violations = jd.violations
    severity = "critical" if any(v.severity == "critical" for v in violations) else "warning"
    # Only the first three rules are quoted to keep the root cause readable.
    rules = "; ".join(v.rule for v in violations[:3])
    iv = {
        "failure": INSTRUCTION_VIOLATION,
        "confidence": jd.confidence,
        "severity": severity,
        "root_cause": f"The response violates {len(violations)} system-prompt "
                      f"rule(s): {rules}",
        "fix": _IV_FIX,
        "evidence": {"violations": [v.to_dict() for v in violations],
                     "judge_model": jd.model},
    }

    fired = []
    if result.get("primary"):
        fired.append(result["primary"])
    # ``secondary`` may be present but null; ``or []`` avoids a TypeError there.
    fired.extend(result.get("secondary") or [])
    fired.append(iv)
    # list.sort is stable, so on equal confidence the existing order is kept.
    fired.sort(key=lambda r: r["confidence"], reverse=True)

    result["healthy"] = False
    result["primary"] = fired[0]
    result["secondary"] = fired[1:]
    return result
60
+
61
+
62
def analyze(
    prompt: str,
    output: str,
    *,
    system_prompt: str = "",
    chunks: list[str] | None = None,
    similarity_scores: list[float] | None = None,
    retrieval_query: str | None = None,
    expected_output: str | None = None,
    model_name: str | None = None,
    temperature: float | None = None,
    max_tokens: int | None = None,
    context_window: int | None = None,
    latency_ms: int | None = None,
    token_usage: dict[str, int] | None = None,
    thresholds: Thresholds = DEFAULT_THRESHOLDS,
    explain_with_llm: bool = True,
    lazy: bool = False,
    judge: bool = False,
    judge_model: str | None = None,
    variance_rerun: Any = None,
    variance_runs: int = 3,
) -> dict[str, Any]:
    """Diagnose an LLM output and return the structured result dict.

    ``prompt`` and ``output`` are the only required inputs; the remaining
    keyword-only fields are copied into the ``CaptureRecord`` that the signal
    and diagnosis stages consume.

    Pipeline, in order: compute signals, optionally overwrite the variance
    signal with a measured value (when ``variance_rerun`` is given), diagnose,
    optionally merge an LLM-judge verdict (when ``judge`` is true and a system
    prompt exists), then attach ``explanation`` and ``explainer_model``.
    """
    record = CaptureRecord(
        user_prompt=prompt,
        llm_output=output,
        system_prompt=system_prompt,
        expected_output=expected_output,
        retrieved_chunks=chunks or [],
        similarity_scores=similarity_scores or [],
        retrieval_query=retrieval_query,
        model_name=model_name,
        temperature=temperature,
        max_tokens=max_tokens,
        context_window=context_window,
        latency_ms=latency_ms,
        token_usage=token_usage or {},
    )

    metrics = compute_signals(record, lazy=lazy)
    # Deep mode (§7.5 Tier 2): swap the variance proxy for a value measured by
    # re-running the model, before classification happens.
    if variance_rerun is not None:
        metrics.variance = measure_variance(
            variance_rerun,
            record.system_prompt,
            record.user_prompt,
            record.retrieved_chunks,
            record.temperature,
            variance_runs,
        )
        metrics.variance_method = "measured"

    diagnosis = diagnose(metrics, record, thresholds)
    report = diagnosis.to_dict()

    # Optional instruction-following check (LLM-as-judge); it only runs when
    # there is a system prompt to judge against.
    if judge and record.system_prompt:
        verdict = judge_instructions(
            record.system_prompt, record.user_prompt, record.llm_output,
            model=judge_model,
        )
        if not verdict.healthy:
            report = _merge_instruction(report, verdict)

    if not explain_with_llm:
        top = report.get("primary")
        report["explanation"] = top["root_cause"] if top else "No failure detected."
        report["explainer_model"] = "none"
        return report

    narrative = explain(diagnosis)
    report["explainer_model"] = narrative["model"]
    top = report.get("primary") or {}
    if top.get("failure") == INSTRUCTION_VIOLATION:
        # The judge's finding is primary: use its own root cause rather than
        # the explainer's text, which was generated from the pre-judge diagnosis.
        report["explanation"] = report["primary"]["root_cause"]
    else:
        report["explanation"] = narrative["explanation"]
        if diagnosis.primary is not None and narrative.get("fix"):
            report["primary"]["fix"] = narrative["fix"]
    return report
debugai/calibration.py ADDED
@@ -0,0 +1,198 @@
1
+ """Adaptive threshold calibration (Architecture §7.2).
2
+
3
+ Static thresholds break across embedding models, domains, and chunk sizes. The
4
+ ``ThresholdStore`` learns a per-user "known good" baseline from the signals of
5
+ healthy requests and tightens the gating thresholds to *that user's* norms:
6
+
7
+ cold (<50 requests) sensible defaults — not enough data yet
8
+ warm (50-500) percentile-based (5th / 95th of healthy baseline)
9
+ hot (>500) rolling-window z-score (mean ± 2 std), last `window`
10
+
11
+ A signal is only adapted once it has ``MIN_SAMPLES`` healthy observations;
12
+ otherwise its default is kept. Every calibrated value is clamped to a sane band
13
+ so pathological data can't produce a runaway threshold.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import dataclasses
19
+ import json
20
+ import threading
21
+ from dataclasses import dataclass
22
+ from pathlib import Path
23
+
24
+ from debugai.thresholds import DEFAULT_THRESHOLDS, Thresholds
25
+
26
# Regime boundaries (by total request count) and adaptation parameters.
COLD_MAX = 50
WARM_MAX = 500
MIN_SAMPLES = 15
Z = 2.0  # z-score: anomaly = 2 std beyond the healthy baseline

# Thresholds field -> (signal key, direction). "low" = anomaly below threshold,
# "high" = anomaly above threshold. Fields not listed stay fixed (semantic).
CALIBRATION_SPEC: dict[str, tuple[str, str]] = dict(
    similarity_min=("similarity", "low"),
    overlap_low=("overlap", "low"),
    entity_coverage_min=("entity_coverage", "low"),
    contradiction_min=("contradiction", "high"),
    variance_min=("variance", "high"),
    context_length_ratio_max=("context_ratio", "high"),
    token_usage_high=("token_ratio", "high"),
    latency_high_ms=("latency_ms", "high"),
)

# Clamp bands keep calibrated thresholds reasonable regardless of the data.
CLAMP: dict[str, tuple[float, float]] = dict(
    similarity_min=(0.10, 0.90),
    overlap_low=(0.10, 0.85),
    entity_coverage_min=(0.10, 0.85),
    contradiction_min=(0.10, 0.60),
    variance_min=(0.15, 0.80),
    context_length_ratio_max=(0.50, 0.95),
    token_usage_high=(0.50, 0.95),
    latency_high_ms=(500.0, 30000.0),
)

# Distinct signal keys referenced by the spec, in sorted order.
_SIGNAL_KEYS = sorted(set(signal for signal, _direction in CALIBRATION_SPEC.values()))
57
+
58
+
59
+ def _mean(xs: list[float]) -> float:
60
+ return sum(xs) / len(xs)
61
+
62
+
63
+ def _std(xs: list[float], mu: float) -> float:
64
+ if len(xs) < 2:
65
+ return 0.0
66
+ return (sum((x - mu) ** 2 for x in xs) / (len(xs) - 1)) ** 0.5
67
+
68
+
69
+ def _percentile(xs: list[float], p: float) -> float:
70
+ s = sorted(xs)
71
+ if not s:
72
+ return 0.0
73
+ idx = (len(s) - 1) * (p / 100.0)
74
+ lo, hi = int(idx), min(int(idx) + 1, len(s) - 1)
75
+ frac = idx - lo
76
+ return s[lo] * (1 - frac) + s[hi] * frac
77
+
78
+
79
@dataclass
class SignalCalibration:
    """Calibration outcome for one threshold field, as reported by ThresholdStore."""

    # Which signal feeds which Thresholds field, and the anomaly direction
    # ("low" or "high").
    signal: str
    direction: str
    field: str

    # Healthy-baseline statistics the calibration was derived from.
    n: int
    baseline_mean: float
    baseline_std: float

    # Static default, the value actually in effect, and whether it was adapted.
    default: float
    value: float
    adapted: bool
90
+
91
+
92
class ThresholdStore:
    """Per-user adaptive threshold store. Thread-safe; optionally persisted.

    Tracks the total number of recorded requests plus a bounded list of signal
    rows from healthy requests, and derives calibrated thresholds from that
    baseline. When ``path`` is given, state is loaded from it on construction
    and rewritten after every ``record`` / ``reset``.
    """

    def __init__(self, path: Path | None = None, window: int = WARM_MAX):
        self._path = path
        self._window = window
        self._lock = threading.Lock()
        self._total = 0
        self._healthy: list[dict[str, float]] = []  # baseline signal rows
        if path is not None:
            self._load()

    # --- helpers -----------------------------------------------------------
    @staticmethod
    def _as_float(value) -> float | None:
        """Convert *value* to a finite float, or return None if that fails."""
        try:
            number = float(value)
        except (TypeError, ValueError, OverflowError):
            return None
        # NaN never equals itself; infinities would poison mean/std downstream.
        if number != number or number in (float("inf"), float("-inf")):
            return None
        return number

    def _tail(self, rows: list) -> list:
        """Return the newest ``window`` rows; a non-positive window keeps none."""
        # rows[-0:] is the whole list, so a zero window needs its own branch.
        return rows[-self._window:] if self._window > 0 else []

    # --- persistence -------------------------------------------------------
    def _load(self) -> None:
        """Restore state from ``self._path``.

        A missing, unreadable or malformed file leaves the store in its
        cold-start state. Rows that are not dicts are skipped and non-numeric
        values are dropped, so they cannot break calibration later.
        """
        try:
            data = json.loads(self._path.read_text())
            if not isinstance(data, dict):
                return
            total = int(data.get("total", 0) or 0)
        except (OSError, ValueError, TypeError, OverflowError):
            # missing / corrupt / unreadable → cold start
            return

        rows: list[dict[str, float]] = []
        healthy = data.get("healthy", [])
        if isinstance(healthy, list):
            for raw in healthy:
                if not isinstance(raw, dict):
                    continue
                row = {}
                for key, val in raw.items():
                    number = self._as_float(val)
                    if number is not None:
                        row[key] = number
                rows.append(row)

        self._total = total
        self._healthy = self._tail(rows)

    def _persist(self) -> None:
        """Write the current state to ``self._path`` as JSON (no-op without a path)."""
        if self._path is None:
            return
        self._path.write_text(json.dumps(
            {"total": self._total, "healthy": self._tail(self._healthy)}
        ))

    # --- ingest ------------------------------------------------------------
    def record(self, signals: dict, healthy: bool) -> None:
        """Count one request and, if it was healthy, add its signals to the baseline.

        Missing, ``None`` or non-numeric signal values are stored as 0.0, the
        same value used for absent keys.
        """
        with self._lock:
            self._total += 1
            if healthy:
                row = {}
                for key in _SIGNAL_KEYS:
                    number = self._as_float(signals.get(key, 0.0))
                    row[key] = 0.0 if number is None else number
                self._healthy.append(row)
                self._healthy = self._tail(self._healthy)
            self._persist()

    # --- regime ------------------------------------------------------------
    def regime(self) -> str:
        """Return "cold", "warm" or "hot" based on the total request count."""
        if self._total < COLD_MAX:
            return "cold"
        return "warm" if self._total <= WARM_MAX else "hot"

    # --- calibration -------------------------------------------------------
    def _calibrate_field(self, field: str, regime: str) -> SignalCalibration:
        """Compute the calibrated value for one threshold field.

        The default is kept unless the regime is warm/hot, at least
        ``MIN_SAMPLES`` healthy values exist, and the baseline is not all zero.
        """
        signal, direction = CALIBRATION_SPEC[field]
        default = getattr(DEFAULT_THRESHOLDS, field)
        values = [r[signal] for r in self._healthy if signal in r]
        mu = _mean(values) if values else default
        sd = _std(values, mu) if values else 0.0

        # A baseline that is entirely zero means the signal was never exercised
        # (e.g. no context_window supplied → context_ratio always 0). Don't adapt
        # it — keep the default rather than collapsing to a clamp floor.
        degenerate = mu == 0.0 and sd == 0.0
        adapted = regime != "cold" and len(values) >= MIN_SAMPLES and not degenerate
        value = default
        if adapted:
            if regime == "warm":  # percentile of the healthy baseline
                value = _percentile(values, 5.0 if direction == "low" else 95.0)
            else:  # hot: rolling-window z-score
                value = mu - Z * sd if direction == "low" else mu + Z * sd
            lo, hi = CLAMP[field]
            value = max(lo, min(value, hi))

        return SignalCalibration(
            signal=signal, direction=direction, field=field, n=len(values),
            baseline_mean=round(mu, 4), baseline_std=round(sd, 4),
            default=default, value=round(value, 4), adapted=adapted,
        )

    def current(self) -> Thresholds:
        """The active, calibrated thresholds for this user."""
        with self._lock:
            regime = self.regime()
            if regime == "cold":
                return DEFAULT_THRESHOLDS
            overrides = {
                field: self._calibrate_field(field, regime).value
                for field in CALIBRATION_SPEC
            }
            return dataclasses.replace(DEFAULT_THRESHOLDS, **overrides)

    def details(self) -> dict:
        """Full calibration report for the dashboard."""
        with self._lock:
            regime = self.regime()
            cals = [self._calibrate_field(f, regime) for f in CALIBRATION_SPEC]
            return {
                "regime": regime,
                "total_requests": self._total,
                "healthy_baseline": len(self._healthy),
                "window": self._window,
                "next_regime_at": COLD_MAX if regime == "cold" else (
                    WARM_MAX if regime == "warm" else None
                ),
                "signals": [dataclasses.asdict(c) for c in cals],
            }

    def reset(self) -> None:
        """Drop all recorded state and persist the empty store."""
        with self._lock:
            self._total = 0
            self._healthy = []
            self._persist()
debugai/cli.py ADDED
@@ -0,0 +1,171 @@
1
+ """DebugAI command-line interface.
2
+
3
+ debugai analyze --prompt "..." --output "..." --chunk "..." --score 0.4
4
+ debugai diagnose cases.json
5
+ debugai fix cases.json --simulate
6
+ debugai serve --port 8000
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import argparse
12
+ import json
13
+ import sys
14
+ from pathlib import Path
15
+
16
+ from debugai import analyze
17
+ from debugai.agents import propose_fix
18
+ from debugai.schema import CaptureRecord
19
+
20
+ _ANSI = {"red": "\033[31m", "green": "\033[32m", "yellow": "\033[33m",
21
+ "dim": "\033[2m", "bold": "\033[1m", "reset": "\033[0m"}
22
+
23
+
24
+ def _c(text: str, color: str) -> str:
25
+ if not sys.stdout.isatty():
26
+ return text
27
+ return f"{_ANSI[color]}{text}{_ANSI['reset']}"
28
+
29
+
30
+ def _grounded_stub(system_prompt, user_prompt, chunks, temperature):
31
+ ctx = " ".join(chunks)
32
+ return ("Per the provided context: " + ctx) if ctx else "I don't have that information."
33
+
34
+
35
+ def _case_kwargs(case: dict) -> dict:
36
+ keys = ("prompt", "output", "system_prompt", "chunks", "similarity_scores",
37
+ "retrieval_query", "temperature", "max_tokens", "context_window",
38
+ "latency_ms", "model_name")
39
+ return {k: case[k] for k in keys if k in case}
40
+
41
+
42
+ def _print_diagnosis(diag: dict, as_json: bool) -> None:
43
+ if as_json:
44
+ print(json.dumps(diag, indent=2))
45
+ return
46
+ if diag.get("healthy"):
47
+ print(_c("✓ healthy", "green") + " — no failure detected")
48
+ return
49
+ p = diag["primary"]
50
+ color = "red" if p["severity"] == "critical" else "yellow"
51
+ print(_c(f"✗ {p['failure']}", color) + f" conf {p['confidence']} ({p['severity']})")
52
+ print(" " + p["root_cause"])
53
+ print(_c(" fix: ", "dim") + p["fix"])
54
+ if diag.get("secondary"):
55
+ print(_c(" secondary: ", "dim") + ", ".join(s["failure"] for s in diag["secondary"]))
56
+
57
+
58
+ def _load_cases(path: Path) -> list[dict]:
59
+ data = json.loads(path.read_text())
60
+ if isinstance(data, dict) and "cases" in data:
61
+ return [{k: v for k, v in c.items() if k not in ("id", "expected", "_comment")}
62
+ for c in data["cases"]]
63
+ if isinstance(data, list):
64
+ return data
65
+ return [data]
66
+
67
+
68
+ # --------------------------------------------------------------------------- #
69
def cmd_analyze(args) -> int:
    """Handle ``debugai analyze``: diagnose one prompt/output pair and print it."""
    options = {
        "prompt": args.prompt,
        "output": args.output,
        "system_prompt": args.system or "",
        # Unset repeatable flags are passed as None rather than an empty list.
        "chunks": args.chunk or None,
        "similarity_scores": args.score or None,
        "temperature": args.temperature,
        "context_window": args.context_window,
        "explain_with_llm": args.explain,
    }
    diag = analyze(**options)
    _print_diagnosis(diag, args.json)
    return 0
78
+
79
+
80
def cmd_diagnose(args) -> int:
    """Handle ``debugai diagnose``: analyze every case in a JSON file.

    With ``--json`` the collected results are printed once as a JSON array;
    otherwise each case is printed as it is analyzed, followed by a
    failing/total summary line.
    """
    results = []
    for case in _load_cases(Path(args.file)):
        diag = analyze(explain_with_llm=False, **_case_kwargs(case))
        results.append(diag)
        if args.json:
            continue
        # Fall back to the first 48 characters of the prompt as a label.
        label = case.get("label") or (case.get("prompt", "")[:48])
        print(_c(label, "bold"))
        _print_diagnosis(diag, False)
        print()

    if args.json:
        print(json.dumps(results, indent=2))
        return 0

    failing = sum(1 for r in results if not r["healthy"])
    print(_c(f"{failing}/{len(results)} failing", "dim"))
    return 0
97
+
98
+
99
def cmd_fix(args) -> int:
    """Handle ``debugai fix``: diagnose each case, then propose a fix for it.

    With ``--simulate`` the grounded stub model is passed to ``propose_fix``
    as the rerun callable; otherwise no rerun callable is supplied.
    """
    verdict_colors = {"verified": "green", "mitigated": "yellow", "escalated": "yellow",
                      "failed": "red", "pending_rerun": "dim"}
    cases = _load_cases(Path(args.file))
    rerun = _grounded_stub if args.simulate else None

    for case in cases:
        kw = _case_kwargs(case)
        diag = analyze(explain_with_llm=False, **kw)
        # Rebuild the capture record from the same inputs analyze() received.
        rec = CaptureRecord(
            user_prompt=kw.get("prompt", ""),
            llm_output=kw.get("output", ""),
            system_prompt=kw.get("system_prompt", ""),
            retrieved_chunks=kw.get("chunks") or [],
            similarity_scores=kw.get("similarity_scores") or [],
            temperature=kw.get("temperature"),
            context_window=kw.get("context_window"),
        )
        report = propose_fix(diag, rec, rerun=rerun)

        label = case.get("label") or kw.get("prompt", "")[:48]
        print(_c(label, "bold"))
        if report is None:
            print(" healthy / no agent\n")
            continue

        tint = verdict_colors.get(report.verdict, "dim")
        print(f" {report.agent} → " + _c(report.verdict, tint) +
              f" tests {report.tests_passed}/{report.tests_total}")
        if report.diff:
            print(_c(" " + report.diff.replace("\n", "\n "), "dim"))
        print()
    return 0
126
+
127
+
128
def cmd_serve(args) -> int:
    """Handle ``debugai serve``: launch the web app with uvicorn.

    uvicorn is imported lazily so the other subcommands work without it.

    Returns:
        0 after the server exits, or 1 when uvicorn is not installed.
    """
    try:
        import uvicorn
    except ImportError:
        # Report the missing optional dependency instead of a raw traceback.
        print("debugai serve requires uvicorn; install it with `pip install uvicorn`.",
              file=sys.stderr)
        return 1
    # NOTE(review): "server.app:app" names a top-level `server` package —
    # confirm it is actually shipped alongside `debugai`.
    uvicorn.run("server.app:app", host=args.host, port=args.port, reload=args.reload)
    return 0
132
+
133
+
134
def main(argv=None) -> int:
    """Parse the command line and dispatch to the chosen subcommand handler.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        The integer returned by the selected ``cmd_*`` handler.
    """
    parser = argparse.ArgumentParser(prog="debugai", description="Diagnose & fix LLM failures.")
    commands = parser.add_subparsers(dest="cmd", required=True)

    # analyze: a single prompt/output pair given on the command line
    analyze_cmd = commands.add_parser("analyze", help="diagnose a single prompt/output")
    analyze_cmd.add_argument("--prompt", required=True)
    analyze_cmd.add_argument("--output", required=True)
    analyze_cmd.add_argument("--system", default="")
    analyze_cmd.add_argument("--chunk", action="append", help="a retrieved chunk (repeatable)")
    analyze_cmd.add_argument("--score", action="append", type=float,
                             help="similarity score (repeatable)")
    analyze_cmd.add_argument("--temperature", type=float)
    analyze_cmd.add_argument("--context-window", type=int, dest="context_window")
    analyze_cmd.add_argument("--explain", action="store_true", help="use the LLM explainer")
    analyze_cmd.add_argument("--json", action="store_true")
    analyze_cmd.set_defaults(func=cmd_analyze)

    # diagnose: a JSON file of cases
    diagnose_cmd = commands.add_parser("diagnose", help="diagnose a JSON file of cases")
    diagnose_cmd.add_argument("file")
    diagnose_cmd.add_argument("--json", action="store_true")
    diagnose_cmd.set_defaults(func=cmd_diagnose)

    # fix: diagnose, then propose/verify a fix per case
    fix_cmd = commands.add_parser("fix", help="diagnose + propose/verify a fix for each case")
    fix_cmd.add_argument("file")
    fix_cmd.add_argument("--simulate", action="store_true",
                         help="run the verify loop with a grounded stub model")
    fix_cmd.set_defaults(func=cmd_fix)

    # serve: the web app
    serve_cmd = commands.add_parser("serve", help="launch the web app")
    serve_cmd.add_argument("--host", default="127.0.0.1")
    serve_cmd.add_argument("--port", type=int, default=8000)
    serve_cmd.add_argument("--reload", action="store_true")
    serve_cmd.set_defaults(func=cmd_serve)

    parsed = parser.parse_args(argv)
    return parsed.func(parsed)
168
+
169
+
170
# Script entry point: exit with the status code returned by main().
if __name__ == "__main__":
    sys.exit(main())
debugai/config.py ADDED
@@ -0,0 +1,134 @@
1
+ """DebugAI SDK configuration — a single object controls everything that runs
2
+ per request, replacing the scattered wrap_llm() keyword arguments.
3
+
4
+ from debugai import DebugAIConfig
5
+ config = DebugAIConfig(
6
+ enable_judge=True, # LLM-as-judge for system-prompt adherence
7
+ sample_rate=0.1, # diagnose 10% of requests
8
+ on_diagnosis=lambda d: print(d["primary"]),
9
+ sink_url="http://my-debugai/api/traces",
10
+ sink_token="dbg_...",
11
+ )
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from dataclasses import dataclass, field
17
+ from typing import Any, Callable
18
+
19
+ from debugai.thresholds import DEFAULT_THRESHOLDS, Thresholds
20
+
21
+
22
@dataclass
class DebugAIConfig:
    """Single configuration object for the DebugAI SDK.

    Every field has a default, so ``DebugAIConfig()`` is a valid configuration.
    Fields are grouped by concern below; each is documented by the string that
    follows it.
    """

    # ---- Background workers ------------------------------------------------
    enable_diagnosis: bool = True
    """Run the 8-signal engine and the 5 detectors on every (sampled) request."""

    enable_traces: bool = True
    """Emit one observability Trace (spans, scores, cost) per request."""

    enable_judge: bool = False
    """LLM-as-judge check of system-prompt rule adherence (costs an LLM call)."""

    enable_explain: bool = False
    """LLM explainer producing a human-readable explanation (costs an LLM call)."""

    lazy: bool = True
    """Skip expensive signals (embeddings/NER/NLI) when the cheap signals are healthy."""

    sample_rate: float = 1.0
    """Fraction of requests to diagnose (0.0–1.0). Deterministic, count-based."""

    max_queue_depth: int = 10_000
    """Maximum pending jobs in the background worker queue. Excess jobs are
    dropped (backpressure) so diagnosis never slows the real request."""

    # ---- Metrics -----------------------------------------------------------
    track_tokens: bool = True
    """Accumulate prompt + completion token counts per model in MetricsLedger."""

    track_cost: bool = True
    """Estimate the cost of each request and accumulate it in MetricsLedger."""

    track_latency: bool = True
    """Record per-request latency for p50/p95 in MetricsLedger."""

    # ---- Sinks -------------------------------------------------------------
    on_diagnosis: Callable[[dict], None] | None = None
    """Callback receiving each diagnosis dict once background analysis completes."""

    on_trace: Callable[[Any], None] | None = None
    """Callback receiving each Trace object once background analysis completes."""

    on_metrics: Callable[[dict], None] | None = None
    """Callback invoked after each request with a snapshot of per-request metrics."""

    sink_url: str | None = None
    """DebugAI server endpoint that traces are POSTed to (e.g. http://…/api/traces).
    Requires sink_token if the server has auth enabled."""

    sink_token: str | None = None
    """X-API-Key token used to authenticate against sink_url."""

    # ---- Conversation ------------------------------------------------------
    session_id: str | None = None
    """Default session ID for all traces; overridden by the session() ctx manager."""

    tags: dict[str, str] = field(default_factory=dict)
    """Key-value tags attached to every trace and diagnosis record."""

    # ---- Thresholds --------------------------------------------------------
    thresholds: Thresholds = field(default_factory=lambda: DEFAULT_THRESHOLDS)
    """Detection thresholds. Per-user adaptive calibration overrides these at
    the server level; SDK callers can override them explicitly here."""

    # ---- Provider config ---------------------------------------------------
    ollama_base_url: str = "http://localhost:11434/v1"
    """Ollama server URL for local models (Qwen, Llama, Phi, DeepSeek…).
    Overridden by the OLLAMA_BASE_URL env var."""

    model_prices: dict | None = None
    """Custom per-model pricing overrides: {"my-model": (input_$/1M, output_$/1M)}.
    Merged with the built-in table; your entries take precedence."""

    # ---- LiteLLM-parity features (B1+) -------------------------------------
    fallbacks: list = field(default_factory=list)
    """Model names to try if the primary call fails (rate limit / error / timeout).
    e.g. fallbacks=['claude-haiku-4-5', 'ollama/qwen2.5']"""

    response_schema: dict | None = None
    """JSON Schema used to validate structured outputs. Violations are surfaced
    as an instruction_violation in the diagnosis."""

    on_schema_violation: Callable | None = None
    """Called when a schema violation is detected: fn(output_text, violations_list)."""

    # ---- B4: Budget manager ------------------------------------------------
    budget_usd: float | None = None
    """Soft spend cap across the MetricsLedger. Raises BudgetExceededError (or calls
    on_budget_exceeded) before each call once this threshold is crossed."""

    on_budget_exceeded: Callable | None = None
    """Called instead of raising when the budget is exhausted: fn(spent_usd).
    If set, the call is NOT made and this callback fires instead."""

    # ---- B5: Request caching -----------------------------------------------
    cache_ttl_seconds: int | None = None
    """Cache identical (model, messages) calls for this many seconds.
    Cache hits skip the provider call and return a CompletionResponse(from_cache=True)."""

    # ---- B6: Retry tracing -------------------------------------------------
    max_retries: int = 2
    """Retry attempts on rate-limit (429) or transient server errors (500/502/503).
    Each attempt is recorded in CompletionResponse.retry_count and trace metadata."""

    retry_backoff_seconds: float = 1.0
    """Base back-off between retries (doubled each attempt)."""

    # ---- B8: Latency SLA ---------------------------------------------------
    latency_sla_ms: float | None = None
    """Alert when a request exceeds this latency threshold."""

    on_sla_breach: Callable | None = None
    """Called when latency_sla_ms is breached: fn({"model", "latency_ms", "threshold_ms"})."""