debugerai 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debugerai-0.2.0/LICENSE +21 -0
- debugerai-0.2.0/PKG-INFO +535 -0
- debugerai-0.2.0/README.md +498 -0
- debugerai-0.2.0/debugai/__init__.py +51 -0
- debugerai-0.2.0/debugai/agents/__init__.py +43 -0
- debugerai-0.2.0/debugai/agents/base.py +192 -0
- debugerai-0.2.0/debugai/agents/builtin.py +246 -0
- debugerai-0.2.0/debugai/agents/registry.py +31 -0
- debugerai-0.2.0/debugai/agents/types.py +108 -0
- debugerai-0.2.0/debugai/analyze.py +142 -0
- debugerai-0.2.0/debugai/calibration.py +198 -0
- debugerai-0.2.0/debugai/cli.py +171 -0
- debugerai-0.2.0/debugai/config.py +134 -0
- debugerai-0.2.0/debugai/detectors.py +206 -0
- debugerai-0.2.0/debugai/diagnosis.py +64 -0
- debugerai-0.2.0/debugai/explainer.py +105 -0
- debugerai-0.2.0/debugai/integrations/__init__.py +5 -0
- debugerai-0.2.0/debugai/integrations/langchain.py +109 -0
- debugerai-0.2.0/debugai/judge.py +171 -0
- debugerai-0.2.0/debugai/metrics.py +139 -0
- debugerai-0.2.0/debugai/models.py +92 -0
- debugerai-0.2.0/debugai/providers.py +179 -0
- debugerai-0.2.0/debugai/schema.py +66 -0
- debugerai-0.2.0/debugai/sdk.py +1271 -0
- debugerai-0.2.0/debugai/signals.py +399 -0
- debugerai-0.2.0/debugai/thresholds.json +15 -0
- debugerai-0.2.0/debugai/thresholds.py +44 -0
- debugerai-0.2.0/debugai/tracing.py +283 -0
- debugerai-0.2.0/debugerai.egg-info/PKG-INFO +535 -0
- debugerai-0.2.0/debugerai.egg-info/SOURCES.txt +54 -0
- debugerai-0.2.0/debugerai.egg-info/dependency_links.txt +1 -0
- debugerai-0.2.0/debugerai.egg-info/entry_points.txt +2 -0
- debugerai-0.2.0/debugerai.egg-info/requires.txt +17 -0
- debugerai-0.2.0/debugerai.egg-info/top_level.txt +1 -0
- debugerai-0.2.0/pyproject.toml +56 -0
- debugerai-0.2.0/setup.cfg +4 -0
- debugerai-0.2.0/tests/test_agents.py +174 -0
- debugerai-0.2.0/tests/test_analyze.py +62 -0
- debugerai-0.2.0/tests/test_auth.py +80 -0
- debugerai-0.2.0/tests/test_b3_b10.py +300 -0
- debugerai-0.2.0/tests/test_benchmark.py +25 -0
- debugerai-0.2.0/tests/test_calibration.py +99 -0
- debugerai-0.2.0/tests/test_cli.py +50 -0
- debugerai-0.2.0/tests/test_completion.py +219 -0
- debugerai-0.2.0/tests/test_deepmode.py +35 -0
- debugerai-0.2.0/tests/test_detectors.py +80 -0
- debugerai-0.2.0/tests/test_integrations.py +47 -0
- debugerai-0.2.0/tests/test_judge.py +85 -0
- debugerai-0.2.0/tests/test_metrics.py +89 -0
- debugerai-0.2.0/tests/test_providers.py +264 -0
- debugerai-0.2.0/tests/test_robustness.py +85 -0
- debugerai-0.2.0/tests/test_sdk.py +194 -0
- debugerai-0.2.0/tests/test_server.py +378 -0
- debugerai-0.2.0/tests/test_signals.py +95 -0
- debugerai-0.2.0/tests/test_tracing.py +60 -0
- debugerai-0.2.0/tests/test_ui_adapter.py +83 -0
debugerai-0.2.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 civicRJ
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
debugerai-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,535 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: debugerai
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: AI observability & debugging — diagnose why LLM outputs fail and get specific fixes.
|
|
5
|
+
License: MIT
|
|
6
|
+
Project-URL: Homepage, https://github.com/civicRJ/DebugAI
|
|
7
|
+
Project-URL: Repository, https://github.com/civicRJ/DebugAI
|
|
8
|
+
Project-URL: Issues, https://github.com/civicRJ/DebugAI/issues
|
|
9
|
+
Project-URL: PyPI, https://pypi.org/project/debugerai/
|
|
10
|
+
Keywords: llm,observability,debugging,rag,ai,diagnosis
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Topic :: Software Development :: Debuggers
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Requires-Python: >=3.11
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: torch>=2.2
|
|
22
|
+
Requires-Dist: sentence-transformers>=2.7
|
|
23
|
+
Requires-Dist: spacy>=3.7
|
|
24
|
+
Requires-Dist: transformers>=4.40
|
|
25
|
+
Requires-Dist: sentencepiece>=0.2
|
|
26
|
+
Requires-Dist: anthropic>=0.40
|
|
27
|
+
Requires-Dist: openai>=1.40
|
|
28
|
+
Requires-Dist: numpy>=1.26
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
31
|
+
Provides-Extra: server
|
|
32
|
+
Requires-Dist: fastapi>=0.110; extra == "server"
|
|
33
|
+
Requires-Dist: uvicorn[standard]>=0.29; extra == "server"
|
|
34
|
+
Requires-Dist: psycopg2-binary>=2.9; extra == "server"
|
|
35
|
+
Requires-Dist: sqlalchemy>=2.0; extra == "server"
|
|
36
|
+
Dynamic: license-file
|
|
37
|
+
|
|
38
|
+
# DebugAI
|
|
39
|
+
|
|
40
|
+
[CI status](https://github.com/civicRJ/DebugAI/actions/workflows/ci.yml)
|
|
41
|
+
[License: MIT](LICENSE)
|
|
42
|
+
|
|
43
|
+
> Diagnose **why** LLM outputs fail and get the **exact fix** — reducing hours of trial-and-error debugging to seconds.
|
|
44
|
+
|
|
45
|
+
DebugAI is a 3-layer root-cause engine for LLM applications (RAG systems,
|
|
46
|
+
chatbots, copilots). Unlike observability tools that stop at dashboards, it
|
|
47
|
+
classifies the failure and proposes a specific, actionable fix.
|
|
48
|
+
|
|
49
|
+
This repository implements **Phase 1 — the deterministic diagnosis core**
|
|
50
|
+
(Steps 1–3 of the roadmap in `debugai_architecture_v3.pdf`): the signal engine,
|
|
51
|
+
the rule engine, and the `analyze()` API with an LLM explainer — plus the
|
|
52
|
+
**Level 2 `wrap_llm()` SDK wrapper** (Step 5) and a **web dashboard** (Step 6)
|
|
53
|
+
built on the `Debug_AI/` design system.
|
|
54
|
+
|
|
55
|
+
## Architecture (implemented)
|
|
56
|
+
|
|
57
|
+
| Layer | Type | Module | What it does |
|
|
58
|
+
|------|------|--------|--------------|
|
|
59
|
+
| 1 — Signal Extraction | deterministic | `debugai/signals.py` | Computes the 8-metric signal vector (small CPU models + fallbacks, lazy eval) |
|
|
60
|
+
| 2 — Rule Engine | deterministic | `debugai/detectors.py`, `diagnosis.py` | 5 failure detectors → primary + secondary diagnosis |
|
|
61
|
+
| 3 — LLM Explainer | probabilistic | `debugai/explainer.py` | Translates the diagnosis into human-readable explanation + fix (Claude; deterministic fallback) |
|
|
62
|
+
| API | — | `debugai/analyze.py` | Level-1 single-call entry point |
|
|
63
|
+
|
|
64
|
+
**Detection is deterministic; only the explanation uses an LLM.** Healthy
|
|
65
|
+
requests fail open (no LLM tokens, no cost).
|
|
66
|
+
|
|
67
|
+
### The 8 signals
|
|
68
|
+
context-output overlap · entity coverage · retrieval similarity · contradiction
|
|
69
|
+
(NLI) · output variance (proxy) · latency · token-usage ratio · context-length
|
|
70
|
+
ratio.
|
|
71
|
+
|
|
72
|
+
### The 5 detectors (evaluation order)
|
|
73
|
+
context overflow → retrieval failure → entity gap → hallucination → prompt
|
|
74
|
+
brittleness. All run; results are ranked by confidence; gate patterns prevent
|
|
75
|
+
nonsensical combinations.
|
|
76
|
+
|
|
77
|
+
## Quickstart
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
python3.11 -m venv .venv && source .venv/bin/activate
|
|
81
|
+
pip install -r requirements.txt
|
|
82
|
+
python -m spacy download en_core_web_sm
|
|
83
|
+
|
|
84
|
+
./run.sh # → http://127.0.0.1:8000 (home + dashboard)
|
|
85
|
+
# or: pytest -q # run the test suite
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Set `ANTHROPIC_API_KEY` to enable the live LLM explainer and the fix-agent
|
|
89
|
+
re-run; everything works without it (deterministic detection + a grounded-stub
|
|
90
|
+
re-run for the demo).
|
|
91
|
+
|
|
92
|
+
### Frontend build
|
|
93
|
+
|
|
94
|
+
The UI is React, **pre-compiled with esbuild** (no in-browser Babel, no CDN —
|
|
95
|
+
React is vendored locally, so pages load fast, work offline, and run under a
|
|
96
|
+
strict `script-src 'self'` CSP). The built bundles (`server/static/dist/`) and
|
|
97
|
+
vendored React (`server/static/vendor/`) are committed, so a plain
|
|
98
|
+
`pip install` + run works with no Node. To rebuild after editing any `.jsx`:
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
npm install && npm run build # → server/static/dist/*.js
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
`./run.sh` builds automatically if the bundles are missing.
|
|
105
|
+
|
|
106
|
+
### CLI
|
|
107
|
+
|
|
108
|
+
Installs a `debugai` console command (`pip install debugerai` or `pip install -e .` for local dev):
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
debugai analyze --prompt "..." --output "..." --chunk "..." --score 0.41
|
|
112
|
+
debugai diagnose cases.json # a capture dict, list, or {cases:[...]}
|
|
113
|
+
debugai fix cases.json --simulate # diagnose + propose & verify a fix
|
|
114
|
+
debugai serve --port 8000 # launch the web app
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
## Install
|
|
118
|
+
|
|
119
|
+
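Install with `pip install debugerai`, then fetch the spaCy model with `python -m spacy download en_core_web_sm`. Models used (all small, CPU, downloaded once): `all-MiniLM-L6-v2` (embeddings),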
|
|
120
|
+
`en_core_web_sm` (NER), `cross-encoder/nli-deberta-v3-base` (NLI).
|
|
121
|
+
|
|
122
|
+
## Usage
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
from debugai import analyze
|
|
126
|
+
|
|
127
|
+
result = analyze(
|
|
128
|
+
prompt="What is the refund policy for electronics?",
|
|
129
|
+
output="Electronics can be returned within 90 days for a full cash refund.",
|
|
130
|
+
chunks=["Our store hours are 9am to 5pm.", "Parking is behind the building."],
|
|
131
|
+
similarity_scores=[0.42, 0.40],
|
|
132
|
+
temperature=0.2,
|
|
133
|
+
)
|
|
134
|
+
|
|
135
|
+
print(result["primary"]["failure"]) # retrieval_failure
|
|
136
|
+
print(result["primary"]["confidence"]) # 0.95
|
|
137
|
+
print(result["primary"]["fix"]) # specific, actionable fix
|
|
138
|
+
print(result["signals"]) # full 8-metric vector
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
Only `prompt` and `output` are required (Core IO). Supplying retrieval
|
|
142
|
+
(`chunks`, `similarity_scores`) and runtime fields (`latency_ms`,
|
|
143
|
+
`temperature`, `context_window`, `token_usage`) unlocks the RAG and capacity
|
|
144
|
+
signals.
|
|
145
|
+
|
|
146
|
+
### Output contract
|
|
147
|
+
|
|
148
|
+
```jsonc
|
|
149
|
+
{
|
|
150
|
+
"healthy": false,
|
|
151
|
+
"primary": { "failure", "confidence", "severity", "root_cause", "fix", "evidence" },
|
|
152
|
+
"secondary": [ /* other detected issues, ranked */ ],
|
|
153
|
+
"signals": { /* the 8-metric vector */ },
|
|
154
|
+
"explanation": "human-readable text"
|
|
155
|
+
}
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
## Level 2 — one-line SDK wrapper
|
|
159
|
+
|
|
160
|
+
Wrap your existing OpenAI or Anthropic client and every call is auto-diagnosed
|
|
161
|
+
in the background — no call-site changes, no added request latency
|
|
162
|
+
(~0.004ms overhead; diagnosis runs on a worker thread).
|
|
163
|
+
|
|
164
|
+
```python
|
|
165
|
+
from openai import OpenAI
|
|
166
|
+
from debugai import wrap_llm, retrieval_context
|
|
167
|
+
|
|
168
|
+
client = wrap_llm(OpenAI(), on_diagnosis=lambda d: print(d["primary"]))
|
|
169
|
+
|
|
170
|
+
# Attach RAG context either via a context manager around your retriever...
|
|
171
|
+
with retrieval_context(chunks, similarity_scores=scores):
|
|
172
|
+
client.chat.completions.create(model="gpt-4o", messages=[...])
|
|
173
|
+
|
|
174
|
+
# ...or inline as debugai_* kwargs (stripped before the real SDK call):
|
|
175
|
+
client.chat.completions.create(
|
|
176
|
+
model="gpt-4o", messages=[...],
|
|
177
|
+
debugai_chunks=chunks, debugai_similarity_scores=scores,
|
|
178
|
+
)
|
|
179
|
+
|
|
180
|
+
# Inspect recent diagnoses without a callback:
|
|
181
|
+
client.debugai.recent # list of diagnosis dicts
|
|
182
|
+
client.debugai.flush() # block until the queue drains
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
`wrap_llm` auto-detects the provider (OpenAI `.chat.completions.create` /
|
|
186
|
+
Anthropic `.messages.create`) and captures the Core IO, metadata, and runtime
|
|
187
|
+
data groups; retrieval is attached via the mechanisms above. Pass
|
|
188
|
+
`explain_with_llm=True` to also run the Layer-3 explainer, `sample_rate` to
|
|
189
|
+
diagnose a fraction of traffic, and `context_window` to enable capacity signals.
|
|
190
|
+
|
|
191
|
+
## Deploy (Docker)
|
|
192
|
+
|
|
193
|
+
```bash
|
|
194
|
+
cp .env.example .env # optional: add OPENAI_API_KEY / ANTHROPIC_API_KEY / hardening
|
|
195
|
+
docker compose up --build # → http://localhost:8000
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
- **Multi-stage image:** a Node stage builds the frontend bundles; the Python
|
|
199
|
+
runtime installs **CPU-only torch** and **bakes the signal models** in, so the
|
|
200
|
+
container runs fully offline (no model downloads at start). Expect a large
|
|
201
|
+
image (~2–3 GB) — it's an ML app.
|
|
202
|
+
- **Persistence:** all state (diagnoses, traces, calibration, **user accounts**)
|
|
203
|
+
is written to `DEBUGAI_DATA_DIR` (`/data` in the image), mounted as the
|
|
204
|
+
`debugai-data` volume — survives restarts and rebuilds.
|
|
205
|
+
- **TLS:** terminate at a reverse proxy (nginx/Caddy) or pass
|
|
206
|
+
`DEBUGAI_SSL_CERT`/`DEBUGAI_SSL_KEY`. Set `DEBUGAI_TRUST_PROXY=1` behind a proxy.
|
|
207
|
+
- Config is via env (see `.env.example` and the **Security & robustness** table).
|
|
208
|
+
|
|
209
|
+
## Accounts & multi-tenancy
|
|
210
|
+
|
|
211
|
+
The web app has full authentication — register, log in, manage your account —
|
|
212
|
+
and every account's data is **private**.
|
|
213
|
+
|
|
214
|
+
- **Auth:** `server/auth.py` — users + server-side sessions in a stdlib
|
|
215
|
+
`sqlite3` DB, passwords hashed with **scrypt** + per-user salt, an httpOnly
|
|
216
|
+
`SameSite=Lax` session cookie (`Secure` under HTTPS). Logout and account
|
|
217
|
+
deletion revoke sessions server-side.
|
|
218
|
+
- **Pages:** `/register`, `/login`, `/account` (update name/email/password,
|
|
219
|
+
log out, delete account). `/dashboard` and `/playground` redirect to `/login`
|
|
220
|
+
when signed out.
|
|
221
|
+
- **Per-user isolation:** diagnoses, traces, sessions, and adaptive calibration
|
|
222
|
+
are all scoped to the signed-in account (`owner`); a new account starts with
|
|
223
|
+
its own auto-seeded sample data and can never see another user's data.
|
|
224
|
+
Deleting an account purges all of its data.
|
|
225
|
+
- **API:** `POST /api/auth/register|login|logout`, `GET /api/auth/me`,
|
|
226
|
+
`PATCH /api/account`, `DELETE /api/account`. All `/api/*` data endpoints
|
|
227
|
+
require a valid session (this supersedes the older `DEBUGAI_API_KEY` gate).
|
|
228
|
+
|
|
229
|
+
### API tokens (programmatic access)
|
|
230
|
+
|
|
231
|
+
Mint per-account tokens under **Account → API tokens** (or `POST
|
|
232
|
+
/api/account/tokens`). A token authenticates `/api/*` as your account via
|
|
233
|
+
`X-API-Key: <token>` or `Authorization: Bearer <token>` — only its hash is
|
|
234
|
+
stored, and the plaintext is shown once. This lets the SDK stream traces to your
|
|
235
|
+
own server:
|
|
236
|
+
|
|
237
|
+
```python
|
|
238
|
+
from openai import OpenAI
|
|
239
|
+
from debugai import wrap_llm, http_trace_sink
|
|
240
|
+
|
|
241
|
+
client = wrap_llm(OpenAI(), on_trace=http_trace_sink(
|
|
242
|
+
"http://localhost:8000/api/traces", token="dbg_…"))
|
|
243
|
+
client.chat.completions.create(...) # → diagnosis + trace land in your dashboard
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
Tokens are revocable (`DELETE /api/account/tokens/{id}`) and are purged when the
|
|
247
|
+
account is deleted.
|
|
248
|
+
|
|
249
|
+
## Web app (Step 6)
|
|
250
|
+
|
|
251
|
+
A FastAPI backend serves the site built entirely on the `Debug_AI/` design
|
|
252
|
+
system:
|
|
253
|
+
|
|
254
|
+
```bash
|
|
255
|
+
uvicorn server.app:app --reload
|
|
256
|
+
# home page → http://127.0.0.1:8000/ (public marketing page)
|
|
257
|
+
# register → http://127.0.0.1:8000/register (create an account, then you're in)
|
|
258
|
+
# dashboard → http://127.0.0.1:8000/dashboard (requires login)
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
- **`/` — home / landing.** The marketing page (animated signal-flow hero, how-it-works
|
|
262
|
+
pipeline, features, CTA) adapted to the real LLM product; every nav link and CTA
|
|
263
|
+
routes into the dashboard.
|
|
264
|
+
- **`/dashboard` — the app.** Ranked diagnosis cards with the 8-signal breakdown +
|
|
265
|
+
confidence + fix (`DiagnosticCard` + `SignalIndicator`), filter by failure type, live
|
|
266
|
+
stats, an adaptive-calibration strip, and a per-card **Propose fix** button. Seeds the
|
|
267
|
+
20 labeled cases on first run so the board isn't empty.
|
|
268
|
+
|
|
269
|
+
### Observability (traces · sessions · cost)
|
|
270
|
+
|
|
271
|
+
A native, Langfuse-style observability layer. The dashboard's **Traces** tab
|
|
272
|
+
shows each request as a trace with a span waterfall (retrieval → generation),
|
|
273
|
+
rolled-up latency / tokens / **estimated cost**, and DebugAI's diagnosis attached
|
|
274
|
+
as **scores** (`healthy`, `failure`, `confidence`). The **Sessions** tab groups
|
|
275
|
+
traces into conversations; a metrics strip shows p50/p95 latency, tokens, and cost.
|
|
276
|
+
|
|
277
|
+
| Endpoint | Purpose |
|
|
278
|
+
|---|---|
|
|
279
|
+
| `POST /api/traces` | ingest a trace (from the SDK or any client) |
|
|
280
|
+
| `GET /api/traces` · `/api/traces/{id}` | list / detail |
|
|
281
|
+
| `GET /api/sessions` | per-session rollups |
|
|
282
|
+
| `GET /api/observability/stats` | aggregate latency / tokens / cost |
|
|
283
|
+
|
|
284
|
+
**Auto-trace from the SDK** — one line gives you traces *and* diagnoses:
|
|
285
|
+
|
|
286
|
+
```python
|
|
287
|
+
from debugai import wrap_llm, session, http_trace_sink
|
|
288
|
+
|
|
289
|
+
client = wrap_llm(OpenAI(), on_trace=http_trace_sink("http://localhost:8000/api/traces", token="dbg_…"))
|
|
290
|
+
with session("conv-42"): # group a conversation
|
|
291
|
+
client.chat.completions.create(...) # → trace + spans + scores, async
|
|
292
|
+
```
|
|
293
|
+
|
|
294
|
+
`debugai/tracing.py` is also usable standalone (`Tracer`, `Trace`, `Span`,
|
|
295
|
+
`Score`, cost table) for manual instrumentation.
|
|
296
|
+
|
|
297
|
+
### Playground
|
|
298
|
+
|
|
299
|
+
`/playground` is a live editor: tweak the system prompt, query, output, chunks,
|
|
300
|
+
scores, or temperature and the diagnosis + signal bars update as you type
|
|
301
|
+
(`POST /api/playground`, non-storing). When a fix is proposed you can **apply it
|
|
302
|
+
to the system prompt** in place and re-analyze — the interactive
|
|
303
|
+
diagnose → fix → re-check loop.
|
|
304
|
+
|
|
305
|
+
### "Debug a bug" workbench
|
|
306
|
+
|
|
307
|
+
Hit **+ Debug a bug** on the dashboard (or `POST /api/debug`) to paste a real
|
|
308
|
+
failing case and get a one-shot diagnosis **and** verified fix:
|
|
309
|
+
|
|
310
|
+
> Describe the issue (e.g. *"my chatbot answers from outside the retrieved
|
|
311
|
+
> context"*), paste the system prompt, the user query, the bad output, and the
|
|
312
|
+
> retrieved chunks (+ optional similarity scores / temperature / context
|
|
313
|
+
> window). DebugAI computes the signals, names the failure, then the matching
|
|
314
|
+
> fix agent proposes a repair, runs the regression suite, and re-diagnoses — all
|
|
315
|
+
> shown inline. "Load example" fills a sample hallucination case.
|
|
316
|
+
|
|
317
|
+
Both pages load the locally vendored React (no CDN — see **Frontend build**) and the compiled design-system bundle from the `/ds`
|
|
318
|
+
mount — the same pattern as the original template.
|
|
319
|
+
|
|
320
|
+
### LangChain integration
|
|
321
|
+
|
|
322
|
+
Drop the callback handler onto any LangChain run to auto-diagnose it — it
|
|
323
|
+
captures the retrieved documents + the LLM prompt/output and runs `analyze()`:
|
|
324
|
+
|
|
325
|
+
```python
|
|
326
|
+
from debugai.integrations import DebugAICallbackHandler
|
|
327
|
+
|
|
328
|
+
handler = DebugAICallbackHandler(on_diagnosis=lambda d: print(d["primary"]))
|
|
329
|
+
chain.invoke(question, config={"callbacks": [handler]})
|
|
330
|
+
print(handler.last) # the most recent diagnosis
|
|
331
|
+
```
|
|
332
|
+
|
|
333
|
+
Importable with or without `langchain` installed; diagnosis failures never
|
|
334
|
+
break the chain. (Without retriever scores it can't judge retrieval quality, but
|
|
335
|
+
it still catches ungrounded answers — hallucination / entity gap.)
|
|
336
|
+
|
|
337
|
+
### Adaptive thresholds (§7.2)
|
|
338
|
+
|
|
339
|
+
`debugai/calibration.py` provides a per-user `ThresholdStore` that learns a
|
|
340
|
+
"known good" baseline from healthy requests and tightens the gating thresholds
|
|
341
|
+
to that user's norms:
|
|
342
|
+
|
|
343
|
+
| Regime | Requests | Method |
|
|
344
|
+
|---|---|---|
|
|
345
|
+
| cold | < 50 | sensible defaults |
|
|
346
|
+
| warm | 50–500 | percentile (5th / 95th of healthy baseline) |
|
|
347
|
+
| hot | > 500 | rolling-window z-score (mean ± 2σ) |
|
|
348
|
+
|
|
349
|
+
A signal is only adapted after `MIN_SAMPLES` healthy observations, every value
|
|
350
|
+
is clamped to a sane band, and a signal that's never exercised (all-zero
|
|
351
|
+
baseline) keeps its default. The dashboard's **Adaptive thresholds** strip shows
|
|
352
|
+
the live regime and each `default → calibrated` shift; `GET /api/thresholds`
|
|
353
|
+
returns the full report. The server diagnoses each request with
|
|
354
|
+
`tstore.current()` and feeds the result back, so calibration improves online.
|
|
355
|
+
|
|
356
|
+
API:
|
|
357
|
+
|
|
358
|
+
| Method | Path | Purpose |
|
|
359
|
+
|---|---|---|
|
|
360
|
+
| `POST` | `/api/analyze` | run the engine on a request, store + return the diagnosis (+ `ui` props) |
|
|
361
|
+
| `GET` | `/api/diagnoses?failure=` | recent diagnoses, optionally filtered |
|
|
362
|
+
| `GET` | `/api/stats` | counts by failure type |
|
|
363
|
+
| `DELETE` | `/api/diagnoses` | clear history |
|
|
364
|
+
| `POST` | `/api/seed` | (re)seed from the labeled dataset |
|
|
365
|
+
|
|
366
|
+
`server/ui_adapter.py` maps each diagnosis to design-system props (severity,
|
|
367
|
+
per-signal anomaly status vs thresholds, normalized confidence bars), so the
|
|
368
|
+
frontend stays a thin renderer.
|
|
369
|
+
|
|
370
|
+
## Instruction-adherence judge (behavioural failures)
|
|
371
|
+
|
|
372
|
+
Some failures aren't about retrieval or hallucination — e.g. a Socratic tutor
|
|
373
|
+
that **reveals the answer in the first turn** or **re-asks the same guiding
|
|
374
|
+
question**. These violate the *system prompt's own rules*, which the grounding
|
|
375
|
+
signals can't see. `debugai/judge.py` adds an **LLM-as-judge** that scores an
|
|
376
|
+
output against its system-prompt rules and reports the violations as an
|
|
377
|
+
`instruction_violation` diagnosis.
|
|
378
|
+
|
|
379
|
+
```python
|
|
380
|
+
analyze(prompt, output, system_prompt=tutor_rules, judge=True)
|
|
381
|
+
```
|
|
382
|
+
|
|
383
|
+
- **Judge model:** OpenAI by default (`DEBUGAI_JUDGE_MODEL`, default `gpt-5.5`)
|
|
384
|
+
via `OPENAI_API_KEY`; falls back to a deterministic heuristic (question count,
|
|
385
|
+
reveal-too-much, paraphrase-of-student) when no key is set, so it runs offline.
|
|
386
|
+
- **`SocraticTutorAgent`** handles `instruction_violation`: it rewrites the
|
|
387
|
+
system prompt to enforce the broken rules, regenerates the response, and
|
|
388
|
+
**re-judges** to confirm the fix — the corrected reply is shown in the
|
|
389
|
+
dashboard. (Server runs the judge automatically whenever a system prompt is
|
|
390
|
+
supplied.)
|
|
391
|
+
|
|
392
|
+
## Fix Agent Framework (Phase 2, §8)
|
|
393
|
+
|
|
394
|
+
`debugai/agents/` implements the universal **diagnose → generate-fix →
|
|
395
|
+
regression-test → re-diagnose → review** loop. The agent (fix + test
|
|
396
|
+
generation) is the only probabilistic step; it's sandwiched between
|
|
397
|
+
deterministic verification — if Layer 1+2 still detects the failure after the
|
|
398
|
+
fix, the agent knows it failed.
|
|
399
|
+
|
|
400
|
+
```python
|
|
401
|
+
from debugai import analyze
|
|
402
|
+
from debugai.agents import propose_fix
|
|
403
|
+
from debugai.schema import CaptureRecord
|
|
404
|
+
|
|
405
|
+
diag = analyze(prompt, output, chunks=..., similarity_scores=...)
|
|
406
|
+
report = propose_fix(diag, CaptureRecord(...), rerun=my_llm_callable)
|
|
407
|
+
print(report.verdict) # verified | mitigated | failed | escalated | pending_rerun
|
|
408
|
+
print(report.diff, report.tests_passed, report.after_diagnosis)
|
|
409
|
+
```
|
|
410
|
+
|
|
411
|
+
`rerun(system_prompt, user_prompt, chunks, temperature) -> output` is injected,
|
|
412
|
+
so the framework has no hard LLM dependency (pass `None` to get the proposal +
|
|
413
|
+
test suite without executing them).
|
|
414
|
+
|
|
415
|
+
**Five built-in agents** (auto-selected by the registry):
|
|
416
|
+
|
|
417
|
+
| Agent | Handles | Strategy | Verdict behavior |
|
|
418
|
+
|---|---|---|---|
|
|
419
|
+
| Prompt Rule | hallucination | grounding constraints + "say not found" | verified when fabrication stops |
|
|
420
|
+
| Knowledge Base | retrieval failure | re-chunk + interim guard | **mitigated** (real fix is pipeline-side) |
|
|
421
|
+
| Constraint | prompt brittleness | lower temperature + format template + few-shot | verified when variance clears |
|
|
422
|
+
| Context Optimizer | context overflow | top-N chunks + summarize to fit window | verified when ratio drops |
|
|
423
|
+
| Document Patch | entity gap | flag the KB gap | **escalated** (no safe auto-fix) |
|
|
424
|
+
|
|
425
|
+
**Plugin architecture (§8.5):** custom agents register at the front and win
|
|
426
|
+
over built-ins:
|
|
427
|
+
|
|
428
|
+
```python
|
|
429
|
+
from debugai.agents import FixAgentRegistry
|
|
430
|
+
reg = FixAgentRegistry()
|
|
431
|
+
reg.register(SyllabusAgent("class10_cbse.pdf")) # checked before built-ins
|
|
432
|
+
```
|
|
433
|
+
|
|
434
|
+
In the dashboard, every failing diagnosis card has a **Propose fix** button that
|
|
435
|
+
runs the loop (`POST /api/fix/{id}`) and shows the verdict, the diff, the
|
|
436
|
+
regression suite (pass/fail), and the before → after re-diagnosis. With
|
|
437
|
+
`ANTHROPIC_API_KEY` set the re-run uses Claude; otherwise a labeled
|
|
438
|
+
grounded-stub model drives the loop for the offline demo.
|
|
439
|
+
|
|
440
|
+
### LLM explainer (optional)
|
|
441
|
+
|
|
442
|
+
Set `ANTHROPIC_API_KEY` to get LLM-generated explanations
|
|
443
|
+
(`DEBUGAI_EXPLAINER_MODEL` defaults to `claude-haiku-4-5`). Without a key, the
|
|
444
|
+
explainer falls back to the deterministic detector text — everything still
|
|
445
|
+
works offline.
|
|
446
|
+
|
|
447
|
+
## Security & robustness
|
|
448
|
+
|
|
449
|
+
The web app is hardened for safe local/self-hosted use:
|
|
450
|
+
|
|
451
|
+
- **No HTML injection** — model- and input-derived text is stripped of markup
|
|
452
|
+
server-side before it reaches the one `innerHTML` slot (`ui_adapter._plain`);
|
|
453
|
+
everything else renders as escaped React text.
|
|
454
|
+
- **Bounded inputs** — request bodies are validated Pydantic models with length
|
|
455
|
+
and item caps; `limit` query params are clamped to `[1, 500]`; trace ingest
|
|
456
|
+
uses a constrained model that drops unknown/oversized fields.
|
|
457
|
+
- **No internal leakage** — engine/LLM exceptions are logged server-side and
|
|
458
|
+
returned to clients as generic messages.
|
|
459
|
+
- **Crash-safe persistence** — JSON stores write via a temp file + atomic
|
|
460
|
+
`os.replace`, so an interrupted write can't corrupt history.
|
|
461
|
+
- **No `eval`/`exec`/`pickle`**, no path traversal (ids are server-assigned),
|
|
462
|
+
and the LLM re-run only activates with an explicit `ANTHROPIC_API_KEY`.
|
|
463
|
+
- Frontend a11y: keyboard-operable controls, `:focus-visible` rings, and
|
|
464
|
+
error/loading/empty states throughout.
|
|
465
|
+
|
|
466
|
+
### Hardening a hosted deployment
|
|
467
|
+
|
|
468
|
+
Everything above is on by default. For a public/hosted instance, set these env
|
|
469
|
+
vars (the local demo needs none):
|
|
470
|
+
|
|
471
|
+
| Env var | Effect |
|
|
472
|
+
|---|---|
|
|
473
|
+
| `DEBUGAI_API_KEY` | require a matching `X-API-Key` header on every `/api/*` call (constant-time compare). The dashboard prompts for the key via the 🔑 button and stores it in `localStorage`. |
|
|
474
|
+
| `DEBUGAI_RATE_LIMIT` | per-client `/api/*` requests per minute (default 240); over-limit → `429` + `Retry-After`. |
|
|
475
|
+
| `DEBUGAI_TRUST_PROXY` | use the first `X-Forwarded-For` hop for client identity behind a reverse proxy. |
|
|
476
|
+
| `DEBUGAI_SSL_CERT` / `DEBUGAI_SSL_KEY` | serve HTTPS directly (`./run.sh`), or terminate TLS at a proxy (nginx/Caddy). |
|
|
477
|
+
|
|
478
|
+
Security headers (CSP, `X-Frame-Options: DENY`, `nosniff`, `Referrer-Policy`,
|
|
479
|
+
COOP) are sent on every response. The CSP allows the unpkg CDN + inline/eval
|
|
480
|
+
because the dashboard transforms JSX in-browser; for a strict CSP, pre-compile
|
|
481
|
+
the JSX and drop the Babel/CDN script tags.
|
|
482
|
+
|
|
483
|
+
## Accuracy benchmark
|
|
484
|
+
|
|
485
|
+
```bash
|
|
486
|
+
python scripts/benchmark.py # tests/dataset/failures.json + eval.json
|
|
487
|
+
```
|
|
488
|
+
|
|
489
|
+
Runs every labeled case through the engine and reports **overall accuracy, a
|
|
490
|
+
confusion matrix, and per-class precision/recall/F1**. Current: **93.8% (30/32)**
|
|
491
|
+
on the seed + held-out eval set (`entity_gap` 4/4 after the DeBERTa-v3 NLI
|
|
492
|
+
upgrade — see below). A `test_benchmark.py` guard fails CI if combined accuracy
|
|
493
|
+
drops below 80%.
|
|
494
|
+
|
|
495
|
+
The NLI signal uses **`cross-encoder/nli-deberta-v3-base`** rather than the
|
|
496
|
+
smaller MiniLM2: the latter emitted confident false-positive contradictions on
|
|
497
|
+
neutral attribute-additions (e.g. an answer adding "boot space" to a spec),
|
|
498
|
+
which misclassified `entity_gap` as `hallucination`. DeBERTa scores those ~0.00
|
|
499
|
+
contradiction while still catching real contradictions ~0.99.
|
|
500
|
+
|
|
501
|
+
### Deep-mode variance & Tier-3 NER
|
|
502
|
+
|
|
503
|
+
- **Measured variance (§7.5 Tier 2):** pass `variance_rerun=<callable>` (and
|
|
504
|
+
`variance_runs`) to `analyze()` to replace the temperature proxy with a real
|
|
505
|
+
measure — it re-runs the model N times and scores `1 − mean pairwise
|
|
506
|
+
similarity` (signal `variance_method` becomes `"measured"`). Opt-in (costs N
|
|
507
|
+
calls), for async/CI.
|
|
508
|
+
- **Tier-3 NER fallback (§7.1):** when spaCy + regex extract nothing, an LLM can
|
|
509
|
+
extract entities — opt-in via `DEBUGAI_LLM_NER=1` (+ `OPENAI_API_KEY`), off by
|
|
510
|
+
default so normal runs make no LLM calls.
|
|
511
|
+
|
|
512
|
+
## Tests
|
|
513
|
+
|
|
514
|
+
```bash
|
|
515
|
+
pytest -q
|
|
516
|
+
```
|
|
517
|
+
|
|
518
|
+
`tests/dataset/failures.json` holds 20 labeled failures (Step 0). The suite
|
|
519
|
+
asserts the rule engine meets the roadmap's **≥16/20 (80%)** acceptance bar —
|
|
520
|
+
it currently classifies **20/20**, and reproduces the doc's worked Scenario A
|
|
521
|
+
(retrieval failure, confidence 0.95).
|
|
522
|
+
|
|
523
|
+
## Roadmap status
|
|
524
|
+
|
|
525
|
+
- [x] Step 0 — 20 labeled failures (`tests/dataset/failures.json`)
|
|
526
|
+
- [x] Step 1 — signal extraction layer
|
|
527
|
+
- [x] Step 2 — rule engine
|
|
528
|
+
- [x] Step 3 — `analyze()` + LLM explainer (this MVP)
|
|
529
|
+
- [ ] Step 4 — test with 5 real users
|
|
530
|
+
- [x] Step 5 — SDK wrapper (`wrap_llm()`, Level 2)
|
|
531
|
+
- [x] Step 6 — dashboard (`server/`) + adaptive thresholds (`debugai/calibration.py`)
|
|
532
|
+
- [x] Phase 2 — fix-agent framework (`debugai/agents/`)
|
|
533
|
+
- [x] Observability — native traces / spans / sessions / scores / cost (`debugai/tracing.py`)
|
|
534
|
+
- [x] Playground + `debugai` CLI
|
|
535
|
+
- [ ] Phase 2b — community plugin registry + fix-success data sharing
|