debugerai 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debugai/__init__.py +51 -0
- debugai/agents/__init__.py +43 -0
- debugai/agents/base.py +192 -0
- debugai/agents/builtin.py +246 -0
- debugai/agents/registry.py +31 -0
- debugai/agents/types.py +108 -0
- debugai/analyze.py +142 -0
- debugai/calibration.py +198 -0
- debugai/cli.py +171 -0
- debugai/config.py +134 -0
- debugai/detectors.py +206 -0
- debugai/diagnosis.py +64 -0
- debugai/explainer.py +105 -0
- debugai/integrations/__init__.py +5 -0
- debugai/integrations/langchain.py +109 -0
- debugai/judge.py +171 -0
- debugai/metrics.py +139 -0
- debugai/models.py +92 -0
- debugai/providers.py +179 -0
- debugai/schema.py +66 -0
- debugai/sdk.py +1271 -0
- debugai/signals.py +399 -0
- debugai/thresholds.json +15 -0
- debugai/thresholds.py +44 -0
- debugai/tracing.py +283 -0
- debugerai-0.2.0.dist-info/METADATA +535 -0
- debugerai-0.2.0.dist-info/RECORD +31 -0
- debugerai-0.2.0.dist-info/WHEEL +5 -0
- debugerai-0.2.0.dist-info/entry_points.txt +2 -0
- debugerai-0.2.0.dist-info/licenses/LICENSE +21 -0
- debugerai-0.2.0.dist-info/top_level.txt +1 -0
debugai/__init__.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""DebugAI — AI Observability & Debugging Platform.
|
|
2
|
+
|
|
3
|
+
A 3-layer root-cause engine for LLM application failures:
|
|
4
|
+
|
|
5
|
+
Layer 1 (deterministic) — Signal extraction: 8 metrics per request.
|
|
6
|
+
Layer 2 (deterministic) — Rule engine: 5 failure detectors, primary + secondary.
|
|
7
|
+
Layer 3 (probabilistic) — LLM explainer: human-readable explanation + fix.
|
|
8
|
+
|
|
9
|
+
Public API (Level 1 integration):
|
|
10
|
+
|
|
11
|
+
from debugai import analyze
|
|
12
|
+
result = analyze(prompt, output, chunks=..., similarity_scores=...)
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from debugai.schema import CaptureRecord
|
|
16
|
+
from debugai.analyze import analyze
|
|
17
|
+
from debugai.config import DebugAIConfig
|
|
18
|
+
from debugai.metrics import metrics
|
|
19
|
+
from debugai.sdk import (
|
|
20
|
+
wrap_llm, awrap_llm, retrieval_context, session, http_trace_sink,
|
|
21
|
+
completion, acompletion, CompletionResponse,
|
|
22
|
+
register_provider, register_adapter, set_default_config,
|
|
23
|
+
compare, ComparisonResult, BudgetExceededError,
|
|
24
|
+
_GenericOpenAICompatAdapter,
|
|
25
|
+
)
|
|
26
|
+
from debugai.tracing import Trace, Span, Tracer, Score
|
|
27
|
+
|
|
28
|
+
# Package version; also exported through ``__all__`` below.
__version__ = "0.2.0"

# Public names of the package, one per line, grouped by area.
__all__ = [
    # Core
    "analyze",
    "CaptureRecord",
    # Config & metrics
    "DebugAIConfig",
    "metrics",
    # SDK wrappers
    "wrap_llm",
    "awrap_llm",
    # Universal completion API
    "completion",
    "acompletion",
    "CompletionResponse",
    # Registration
    "register_provider",
    "register_adapter",
    "set_default_config",
    # Context managers
    "retrieval_context",
    "session",
    # Observability
    "Trace",
    "Span",
    "Tracer",
    "Score",
    # Sinks
    "http_trace_sink",
    # Adapters
    "_GenericOpenAICompatAdapter",
    # Compare + budget
    "compare",
    "ComparisonResult",
    "BudgetExceededError",
    # Version
    "__version__",
]
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""Fix Agent Framework (Architecture §8) — diagnose → fix → verify → review."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from debugai.agents.base import FixAgent
|
|
6
|
+
from debugai.agents.builtin import (
|
|
7
|
+
ConstraintAgent, ContextOptimizerAgent, DocumentPatchAgent,
|
|
8
|
+
KnowledgeBaseAgent, PromptRuleAgent, SocraticTutorAgent,
|
|
9
|
+
)
|
|
10
|
+
from debugai.agents.registry import FixAgentRegistry
|
|
11
|
+
from debugai.agents.types import (
|
|
12
|
+
ESCALATED, FAILED, MITIGATED, PENDING_RERUN, VERIFIED,
|
|
13
|
+
FixCandidate, FixReport, TestCase, TestResult,
|
|
14
|
+
)
|
|
15
|
+
from debugai.schema import CaptureRecord
|
|
16
|
+
|
|
17
|
+
# A process-wide default registry with the six built-ins.
|
|
18
|
+
DEFAULT_REGISTRY = FixAgentRegistry()
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def propose_fix(diagnosis: dict, record: CaptureRecord, rerun=None,
                registry: FixAgentRegistry | None = None) -> FixReport | None:
    """Select the right agent for a diagnosis and run the fix-verify loop.

    Args:
        diagnosis: Diagnosis dict. Only its ``"healthy"`` flag is read here;
            the dict is then handed to the registry and to the chosen agent.
        record: Captured request, passed through to the agent unchanged.
        rerun: Optional callable forwarded to ``agent.run`` as ``rerun``.
        registry: Registry to search. Only ``None`` selects
            ``DEFAULT_REGISTRY``; any registry object the caller passes is
            used as given.

    Returns:
        The agent's FixReport, or None if no agent handles the failure (or
        the request is healthy, or ``diagnosis`` is empty).
    """
    if not diagnosis or diagnosis.get("healthy"):
        return None
    # Compare against None explicitly: a truthiness test would discard a
    # caller-supplied registry that happens to evaluate as falsy.
    reg = DEFAULT_REGISTRY if registry is None else registry
    agent = reg.find_agent(diagnosis)
    if agent is None:
        return None
    return agent.run(diagnosis, record, rerun=rerun)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# Public names of the agents package, one per line.
__all__ = [
    # Framework entry points
    "FixAgent",
    "FixAgentRegistry",
    "DEFAULT_REGISTRY",
    "propose_fix",
    # Data types
    "FixCandidate",
    "FixReport",
    "TestCase",
    "TestResult",
    # Built-in agents
    "PromptRuleAgent",
    "KnowledgeBaseAgent",
    "ConstraintAgent",
    "ContextOptimizerAgent",
    "DocumentPatchAgent",
    "SocraticTutorAgent",
    # Verdicts
    "VERIFIED",
    "MITIGATED",
    "FAILED",
    "PENDING_RERUN",
    "ESCALATED",
]
|
debugai/agents/base.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
"""Abstract FixAgent + the universal diagnose-fix-verify loop (Architecture §8.1).
|
|
2
|
+
|
|
3
|
+
1. Diagnose (deterministic — already done; passed in)
|
|
4
|
+
2. Generate fix (agent — generate_fix)
|
|
5
|
+
3. Build tests (agent — build_test_cases)
|
|
6
|
+
4. Run regression (deterministic — _run_test)
|
|
7
|
+
5. Re-diagnose (deterministic — Layer 1+2 on the re-run output)
|
|
8
|
+
6. Developer review (human — consumes the FixReport)
|
|
9
|
+
|
|
10
|
+
The agent (steps 2-3) is the only probabilistic part, and it is sandwiched
|
|
11
|
+
between deterministic verification. Re-running the model is injected as a
|
|
12
|
+
`rerun` callable so the framework has no hard dependency on any LLM:
|
|
13
|
+
|
|
14
|
+
rerun(system_prompt, user_prompt, chunks, temperature) -> output_text
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import logging
|
|
20
|
+
from abc import ABC, abstractmethod
|
|
21
|
+
from dataclasses import dataclass
|
|
22
|
+
from typing import Callable
|
|
23
|
+
|
|
24
|
+
from debugai.agents.types import (
|
|
25
|
+
ESCALATED, FAILED, MITIGATED, PENDING_RERUN, VERIFIED,
|
|
26
|
+
FixCandidate, FixReport, TestCase, TestResult,
|
|
27
|
+
)
|
|
28
|
+
from debugai.analyze import analyze
|
|
29
|
+
from debugai.schema import CaptureRecord
|
|
30
|
+
from debugai.signals import _extract_entities
|
|
31
|
+
|
|
32
|
+
log = logging.getLogger("debugai.agents")
|
|
33
|
+
|
|
34
|
+
Rerun = Callable[[str, str, list, "float | None"], str]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
class _Applied:
    """Request inputs after a fix candidate has been applied to a record."""

    # System prompt handed to the rerun callable.
    system_prompt: str
    # Retrieved chunks kept for the rerun.
    chunks: list[str]
    # Similarity scores carried alongside ``chunks``.
    similarity_scores: list[float]
    # Sampling temperature for the rerun; may be None.
    temperature: float | None
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class FixAgent(ABC):
    """Base class for fix agents; owns the shared generate -> test -> re-diagnose loop.

    Subclasses implement ``generate_fix`` and ``build_test_cases``. ``run``
    applies the candidate to the record, executes the tests through the
    injected ``rerun`` callable, re-analyzes the original request and picks a
    verdict.
    """

    name: str = "fix-agent"
    handles: str = ""  # failure id this agent targets
    # When False, the fix lives in the pipeline (e.g. re-chunking) and a prompt-only
    # rerun can't structurally clear the failure — tests verify an interim guard,
    # and a clean test pass yields a MITIGATED (not VERIFIED) verdict.
    verifiable_by_rerun: bool = True

    # --- selection ---------------------------------------------------------
    def can_handle(self, diagnosis: dict) -> bool:
        """Return True when the diagnosis' primary failure id equals ``handles``.

        A missing/empty diagnosis or a missing ``"primary"`` entry is treated
        as an empty dict, so the comparison is against ``None``.
        """
        primary = (diagnosis or {}).get("primary") or {}
        return primary.get("failure") == self.handles

    # --- agent-specific (subclasses implement) -----------------------------
    @abstractmethod
    def generate_fix(self, diagnosis: dict, record: CaptureRecord) -> FixCandidate: ...

    @abstractmethod
    def build_test_cases(self, diagnosis: dict, record: CaptureRecord) -> list[TestCase]: ...

    # --- the inherited loop ------------------------------------------------
    def run(self, diagnosis: dict, record: CaptureRecord, rerun: Rerun | None = None) -> FixReport:
        """Generate a fix, test it, re-diagnose, and return the resulting report.

        Args:
            diagnosis: Diagnosis dict; its ``"primary"`` entry supplies the
                pre-fix confidence.
            record: The captured failing request.
            rerun: Callable ``(system_prompt, user_prompt, chunks, temperature)
                -> output_text``. When None nothing is executed.

        Returns:
            A FixReport whose verdict is:
            ESCALATED when the candidate sets ``escalate``;
            PENDING_RERUN when ``rerun`` is None (every test is recorded as
            not run);
            otherwise VERIFIED / MITIGATED / FAILED depending on the test
            results, the re-diagnosis and ``verifiable_by_rerun``.
        """
        candidate = self.generate_fix(diagnosis, record)
        before_conf = (diagnosis.get("primary") or {}).get("confidence")

        if candidate.escalate:
            return FixReport(
                agent=self.name, failure=self.handles, verdict=ESCALATED,
                candidate=candidate, diff=self._diff(candidate, record),
                before_confidence=before_conf,
            )

        tests = self.build_test_cases(diagnosis, record)
        report = FixReport(
            agent=self.name, failure=self.handles, verdict=PENDING_RERUN,
            candidate=candidate, diff=self._diff(candidate, record),
            tests_total=len(tests), before_confidence=before_conf,
        )
        if rerun is None:
            # Candidate + tests produced, but nothing to execute them against.
            report.test_results = [
                TestResult(case=t, passed=False, failures=["not run (no model)"])
                for t in tests
            ]
            return report

        applied = self._apply(candidate, record)

        # Step 4 — deterministic regression tests.
        results = [self._run_test(t, applied, rerun) for t in tests]
        report.test_results = results
        report.tests_passed = sum(1 for r in results if r.passed)

        # Step 5 — re-diagnose the original failing request with the fix applied.
        after, after_output = self._rediagnose(record, applied, rerun)
        report.reverified = True
        report.after_diagnosis = after
        report.after_output = after_output
        # Cleared means: healthy, or the primary failure is no longer ours.
        cleared = after.get("healthy") or (
            (after.get("primary") or {}).get("failure") != self.handles
        )
        report.reverified_cleared = bool(cleared)

        # With zero tests this is True (0 == 0).
        all_pass = report.tests_passed == report.tests_total
        if not self.verifiable_by_rerun:
            # Pipeline fix: interim guard tests verify, but Layer 1+2 still sees the
            # structural failure until the pipeline change lands.
            report.verdict = MITIGATED if all_pass else FAILED
        else:
            report.verdict = VERIFIED if (all_pass and cleared) else FAILED
        return report

    # --- deterministic helpers --------------------------------------------
    def _apply(self, c: FixCandidate, record: CaptureRecord) -> _Applied:
        """Build the rerun inputs by applying the candidate's modifications.

        Applies, in order: system-prompt additions (appended after a blank
        line), ``max_chunks`` (top-scoring chunks kept in their original
        order when scores line up with chunks, else a plain prefix),
        ``chunk_char_budget`` (each kept chunk truncated) and
        ``new_temperature``.

        NOTE(review): ``few_shot_examples`` is shown by ``_diff`` but is not
        applied here — confirm whether that is intentional.
        """
        system = record.system_prompt
        if c.system_prompt_additions:
            # ``or ""`` keeps a record without a system prompt from raising
            # on string concatenation.
            system = ((system or "") + "\n\n" + c.system_prompt_additions).strip()
        chunks, scores = list(record.retrieved_chunks), list(record.similarity_scores)
        if c.max_chunks is not None:
            if scores and len(scores) == len(chunks):
                order = sorted(range(len(chunks)), key=lambda i: scores[i], reverse=True)
                # Re-sort the kept indices so chunks stay in retrieval order.
                keep = sorted(order[: c.max_chunks])
                chunks = [chunks[i] for i in keep]
                scores = [scores[i] for i in keep]
            else:
                chunks = chunks[: c.max_chunks]
                scores = scores[: c.max_chunks]
        if c.chunk_char_budget is not None:
            # Stand-in for summarization: cap each kept chunk to the budget.
            chunks = [ch[: c.chunk_char_budget] for ch in chunks]
        temp = c.new_temperature if c.new_temperature is not None else record.temperature
        return _Applied(system_prompt=system, chunks=chunks, similarity_scores=scores, temperature=temp)

    def _run_test(self, t: TestCase, applied: _Applied, rerun: Rerun) -> TestResult:
        """Run one test case ``max(1, t.runs)`` times and collect keyword failures.

        Keyword checks are case-insensitive substring checks. Failures from
        all runs accumulate in one list; the test passes only if it is empty.
        """
        outputs, failures = [], []
        for _ in range(max(1, t.runs)):
            out = rerun(applied.system_prompt, t.input, applied.chunks, applied.temperature) or ""
            outputs.append(out)
            low = out.lower()
            failures.extend(f"missing '{kw}'" for kw in t.must_contain if kw.lower() not in low)
            failures.extend(f"contains '{kw}'" for kw in t.must_not_contain if kw.lower() in low)
        return TestResult(case=t, passed=not failures, outputs=outputs, failures=failures)

    def _rediagnose(self, record: CaptureRecord, applied: _Applied, rerun: Rerun) -> tuple[dict, str]:
        """Re-run the original user prompt with the fix applied and analyze the output.

        Returns:
            ``(analysis_result, new_output)``. The analysis is requested with
            ``explain_with_llm=False``.
        """
        new_output = rerun(applied.system_prompt, record.user_prompt, applied.chunks, applied.temperature) or ""
        result = analyze(
            prompt=record.user_prompt,
            output=new_output,
            system_prompt=applied.system_prompt,
            chunks=applied.chunks,
            similarity_scores=applied.similarity_scores,
            temperature=applied.temperature,
            context_window=record.context_window,
            explain_with_llm=False,
        )
        return result, new_output

    def _diff(self, c: FixCandidate, record: CaptureRecord) -> str:
        """Render the candidate's modifications as a short, line-oriented text diff.

        Line prefixes: ``+`` for added text, ``~`` for changed settings and
        ``#`` for advisory notes. Returns an empty string when the candidate
        changes nothing.
        """
        lines: list[str] = []
        if c.system_prompt_additions:
            lines.append("--- system_prompt (appended)")
            lines.extend("+ " + ln for ln in c.system_prompt_additions.splitlines())
        if c.new_temperature is not None:
            lines.append(f"~ temperature: {record.temperature} -> {c.new_temperature}")
        if c.max_chunks is not None:
            lines.append(f"~ retrieved_chunks: {len(record.retrieved_chunks)} -> top {c.max_chunks} by similarity")
        if c.chunk_char_budget is not None:
            # Reported because _apply truncates chunks to this budget.
            lines.append(f"~ chunk_char_budget: each kept chunk capped at {c.chunk_char_budget} chars")
        for ex in c.few_shot_examples:
            # ``or ''`` tolerates examples whose input/output is None.
            shown_in = (ex.get('input') or '')[:48]
            shown_out = (ex.get('output') or '')[:48]
            lines.append(f"+ few-shot: {shown_in} -> {shown_out}")
        if c.notes:
            lines.append(f"# note: {c.notes}")
        return "\n".join(lines)

    # --- shared NER helper for test generation -----------------------------
    @staticmethod
    def _fabricated_entities(record: CaptureRecord) -> list[str]:
        """Entities in the output that are absent from the retrieved context.

        The context is lower-cased, so each entity is lower-cased for the
        substring test as well; the returned strings are left as extracted.
        """
        out_ents = _extract_entities(record.llm_output)
        ctx = record.context_text.lower()
        return sorted(e for e in out_ents if e.lower() not in ctx)

    @staticmethod
    def _grounded_entities(record: CaptureRecord) -> list[str]:
        """Entities in the output that also appear in the retrieved context.

        Uses the same case-insensitive substring test as
        ``_fabricated_entities``.
        """
        out_ents = _extract_entities(record.llm_output)
        ctx = record.context_text.lower()
        return sorted(e for e in out_ents if e.lower() in ctx)
|
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
"""The six built-in fix agents (Architecture §8.3).
|
|
2
|
+
|
|
3
|
+
Each targets one failure type and ships a deterministic fix template. (Fix text
|
|
4
|
+
can be LLM-drafted when a key is present, but the templates make the framework
|
|
5
|
+
work — and verify — offline.) Generation is the only probabilistic step; the
|
|
6
|
+
loop in ``base.FixAgent.run`` verifies every fix deterministically.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from debugai.agents.base import FixAgent
|
|
12
|
+
from debugai.agents.types import (
|
|
13
|
+
FAILED, PENDING_RERUN, VERIFIED, FixCandidate, FixReport, TestCase,
|
|
14
|
+
)
|
|
15
|
+
from debugai.detectors import (
|
|
16
|
+
CONTEXT_OVERFLOW, ENTITY_GAP, HALLUCINATION, PROMPT_BRITTLENESS, RETRIEVAL_FAILURE,
|
|
17
|
+
)
|
|
18
|
+
from debugai.judge import INSTRUCTION_VIOLATION, judge_instructions
|
|
19
|
+
from debugai.schema import CaptureRecord
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _first(xs: list[str], n: int) -> list[str]:
    """Return the truthy items found among the first ``n`` entries of ``xs``.

    The slice is taken before filtering, so the result can hold fewer than
    ``n`` items when some of the leading entries are empty.
    """
    head = xs[:n]
    return list(filter(None, head))
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# --------------------------------------------------------------------------- #
|
|
27
|
+
# 1. Prompt Rule Agent — hallucination
|
|
28
|
+
# --------------------------------------------------------------------------- #
|
|
29
|
+
class PromptRuleAgent(FixAgent):
    """Fix agent for HALLUCINATION: appends grounding rules to the system prompt."""

    name = "Prompt Rule Agent"
    handles = HALLUCINATION

    # Text appended to the system prompt by generate_fix.
    GROUNDING = (
        "Answer ONLY using the provided context. If the context does not contain "
        "the answer, reply exactly: \"I don't have that information.\" Never invent "
        "names, numbers, dates, clauses, or citations. For every specific claim, the "
        "supporting text must appear in the context."
    )

    def generate_fix(self, diagnosis, record):
        """Return a candidate whose only modification is the GROUNDING addition."""
        why = (
            "Retrieval succeeded but the output is ungrounded — constrain "
            "generation to the supplied context."
        )
        return FixCandidate(
            agent=self.name,
            failure=self.handles,
            strategy="Add grounding constraints + an out-of-context fallback to the system prompt.",
            rationale=why,
            system_prompt_additions=self.GROUNDING,
        )

    def build_test_cases(self, diagnosis, record):
        """Build checks that replay the original prompt.

        Two cases ("original", and "variance" with 3 runs) forbid up to four
        fabricated entities; when any grounded entity exists, a "regression"
        case requires the first one to still appear.
        """
        fabricated = self._fabricated_entities(record)
        supported = self._grounded_entities(record)
        cases = [
            TestCase(input=record.user_prompt, must_not_contain=_first(fabricated, 4),
                     category=label, runs=repeat)
            for label, repeat in (("original", 1), ("variance", 3))
        ]
        if not supported:
            return cases
        cases.append(
            TestCase(input=record.user_prompt, must_contain=_first(supported, 1),
                     category="regression")
        )
        return cases
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# --------------------------------------------------------------------------- #
|
|
65
|
+
# 2. Knowledge Base Agent — retrieval failure
|
|
66
|
+
# --------------------------------------------------------------------------- #
|
|
67
|
+
class KnowledgeBaseAgent(FixAgent):
    """Fix agent for RETRIEVAL_FAILURE: adds an interim "not in KB" guard."""

    name = "Knowledge Base Agent"
    handles = RETRIEVAL_FAILURE
    verifiable_by_rerun = False  # real fix is re-chunking/re-embedding the corpus

    # Text appended to the system prompt by generate_fix.
    GUARD = (
        "If the retrieved context does not address the question, reply exactly: "
        "\"The knowledge base does not contain this information.\" Do not answer "
        "from prior knowledge."
    )

    def generate_fix(self, diagnosis, record):
        """Return a candidate carrying the GUARD addition plus a re-chunking note.

        The rationale quotes ``diagnosis["signals"]["similarity"]``; a missing
        value is rendered as ``None``.
        """
        signals = diagnosis.get("signals") or {}
        sim = signals.get("similarity")
        why = (
            f"Mean retrieval similarity {sim} is below threshold — the "
            "retriever returned irrelevant chunks."
        )
        advice = (
            "Re-chunk the source corpus with an entity-aware strategy and "
            "re-embed; verify the target document is actually indexed."
        )
        return FixCandidate(
            agent=self.name,
            failure=self.handles,
            strategy="Re-chunk source docs entity-aware + add an interim 'not in KB' guard.",
            rationale=why,
            system_prompt_additions=self.GUARD,
            notes=advice,
        )

    def build_test_cases(self, diagnosis, record):
        """Build "original" and "variance" (3 runs) checks forbidding fabricated entities."""
        fabricated = self._fabricated_entities(record)
        return [
            TestCase(input=record.user_prompt, must_not_contain=_first(fabricated, 4),
                     category=label, runs=repeat)
            for label, repeat in (("original", 1), ("variance", 3))
        ]
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
# --------------------------------------------------------------------------- #
|
|
101
|
+
# 3. Constraint Agent — prompt brittleness
|
|
102
|
+
# --------------------------------------------------------------------------- #
|
|
103
|
+
class ConstraintAgent(FixAgent):
    """Fix agent for PROMPT_BRITTLENESS: lowers temperature and pins the format."""

    name = "Constraint Agent"
    handles = PROMPT_BRITTLENESS

    # Text appended to the system prompt by generate_fix.
    TEMPLATE = (
        "Be deterministic and consistent across runs. Follow a fixed output format "
        "and do not vary phrasing, ordering, or structure between identical inputs."
    )

    def generate_fix(self, diagnosis, record):
        """Return a candidate with temperature 0.2, the TEMPLATE addition and one few-shot pair.

        The few-shot pair is the record's own prompt and output.
        """
        example = {"input": record.user_prompt, "output": record.llm_output}
        why = (
            "Grounding signals are healthy but output variance is high — "
            "constrain sampling and format."
        )
        return FixCandidate(
            agent=self.name,
            failure=self.handles,
            strategy="Lower temperature, add an output-format template, and pin behavior with a few-shot example.",
            rationale=why,
            new_temperature=0.2,
            system_prompt_additions=self.TEMPLATE,
            few_shot_examples=[example],
        )

    def build_test_cases(self, diagnosis, record):
        """Build "regression" and "variance" (3 runs) checks requiring one grounded entity.

        Both cases share the same ``must_contain`` list object.
        """
        required = _first(self._grounded_entities(record), 1)
        regression = TestCase(input=record.user_prompt, must_contain=required,
                              category="regression")
        variance = TestCase(input=record.user_prompt, must_contain=required,
                            category="variance", runs=3)
        return [regression, variance]
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
# --------------------------------------------------------------------------- #
|
|
133
|
+
# 4. Context Optimizer Agent — context overflow
|
|
134
|
+
# --------------------------------------------------------------------------- #
|
|
135
|
+
class ContextOptimizerAgent(FixAgent):
    """Fix agent for CONTEXT_OVERFLOW: keeps fewer, shorter chunks."""

    name = "Context Optimizer Agent"
    handles = CONTEXT_OVERFLOW

    def generate_fix(self, diagnosis, record):
        """Return a candidate limiting context to 8 chunks of at most 240 characters each."""
        why = (
            "The prompt overflows the context window; trim and compress "
            "the retrieved context."
        )
        return FixCandidate(
            agent=self.name,
            failure=self.handles,
            strategy="Reduce to the top-N most relevant chunks and summarize each to fit the window.",
            rationale=why,
            max_chunks=8,
            chunk_char_budget=240,
            notes="Summarize prior conversation history; consider a larger-context model.",
        )

    def build_test_cases(self, diagnosis, record):
        """Build one "regression" check requiring the first grounded entity, if any."""
        supported = self._grounded_entities(record)
        check = TestCase(input=record.user_prompt, must_contain=_first(supported, 1),
                         category="regression")
        return [check]
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
# --------------------------------------------------------------------------- #
|
|
159
|
+
# 5. Document Patch Agent — entity gap (escalates)
|
|
160
|
+
# --------------------------------------------------------------------------- #
|
|
161
|
+
class DocumentPatchAgent(FixAgent):
    """Fix agent for ENTITY_GAP: never auto-fixes, always escalates."""

    name = "Document Patch Agent"
    handles = ENTITY_GAP

    def generate_fix(self, diagnosis, record):
        """Return an escalating candidate whose note names up to six missing entities.

        When no entity is found the note refers to "the requested entities".
        """
        absent = self._fabricated_entities(record)  # entities not in the corpus
        listed = ", ".join(_first(absent, 6))
        names = listed if listed else "the requested entities"
        why = (
            "Retrieval is healthy but the corpus lacks coverage for these "
            "entities — content cannot be safely auto-generated."
        )
        return FixCandidate(
            agent=self.name,
            failure=self.handles,
            strategy="Identify missing entities and flag the knowledge-base gap for human review.",
            rationale=why,
            notes=f"Knowledge base needs articles covering: {names}.",
            escalate=True,
        )

    def build_test_cases(self, diagnosis, record):
        """Return no tests: the candidate escalates before tests would run."""
        return []
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
# --------------------------------------------------------------------------- #
|
|
182
|
+
# 6. Socratic Tutor Agent — instruction_violation (behavioural / pedagogy)
|
|
183
|
+
# --------------------------------------------------------------------------- #
|
|
184
|
+
class SocraticTutorAgent(FixAgent):
    """Fix agent for INSTRUCTION_VIOLATION, verified by re-judging instead of keyword tests."""

    name = "Socratic Tutor Agent"
    handles = INSTRUCTION_VIOLATION

    # Text appended to the system prompt by generate_fix.
    RULES = (
        "STRICT Socratic correction — these override any conflicting guidance above:\n"
        "- Do NOT explain the concept or state the answer. Give at most ONE short "
        "clue (one sentence maximum), and only if the student is stuck.\n"
        "- Lead with the question: the turn must centre on exactly ONE short leading "
        "question (exactly one '?') that moves the student one step forward, and must "
        "never restate or reword a question already asked.\n"
        "- Do not open by paraphrasing the student's message.\n"
        "- Keep the whole turn to 1-2 sentences. When unsure, ask rather than explain."
    )

    def generate_fix(self, diagnosis, record):
        """Return a candidate appending RULES; the rationale names up to four violated rules.

        Rule names come from ``diagnosis["primary"]["evidence"]["violations"]``;
        when none are present a generic label is used.
        """
        primary = diagnosis.get("primary") or {}
        evidence = primary.get("evidence") or {}
        violations = evidence.get("violations") or []
        joined = "; ".join(item.get("rule", "") for item in violations[:4])
        names = joined if joined else "Socratic-method rules"
        why = (
            f"The response broke pedagogy rules ({names}); tighten the "
            "system prompt so the tutor guides instead of answering."
        )
        return FixCandidate(
            agent=self.name,
            failure=self.handles,
            strategy="Rewrite the system prompt to enforce the violated Socratic rules.",
            rationale=why,
            system_prompt_additions=self.RULES,
        )

    def build_test_cases(self, diagnosis, record):
        """Return no tests: verification happens in ``run`` by re-judging."""
        return []

    def run(self, diagnosis, record, rerun=None):
        """Judge-based verify loop: rewrite prompt → regenerate → re-judge.

        Without ``rerun`` the report stays PENDING_RERUN. Otherwise the model
        is re-run once with the amended system prompt, the new output is
        passed to ``judge_instructions``, and the verdict is VERIFIED when the
        judgement is healthy, FAILED otherwise.
        """
        candidate = self.generate_fix(diagnosis, record)
        prior_confidence = (diagnosis.get("primary") or {}).get("confidence")
        report = FixReport(
            agent=self.name,
            failure=self.handles,
            verdict=PENDING_RERUN,
            candidate=candidate,
            diff=self._diff(candidate, record),
            before_confidence=prior_confidence,
        )
        if rerun is None:
            return report

        applied = self._apply(candidate, record)
        regenerated = rerun(applied.system_prompt, record.user_prompt,
                            applied.chunks, applied.temperature) or ""
        jd = judge_instructions(applied.system_prompt, record.user_prompt, regenerated)

        report.reverified = True
        report.after_output = regenerated
        report.reverified_cleared = jd.healthy
        if jd.healthy:
            still_failing = None
        else:
            still_failing = {"failure": INSTRUCTION_VIOLATION, "confidence": jd.confidence}
        report.after_diagnosis = {"healthy": jd.healthy, "primary": still_failing}
        report.verdict = VERIFIED if jd.healthy else FAILED
        return report
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
# Built-in agent classes (not instances), listed in section order 1-6.
BUILTIN_AGENTS = [
    PromptRuleAgent, KnowledgeBaseAgent, ConstraintAgent,
    ContextOptimizerAgent, DocumentPatchAgent, SocraticTutorAgent,
]
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Fix agent registry + plugin architecture (Architecture §8.5).
|
|
2
|
+
|
|
3
|
+
Custom agents register at the front, so they take priority over the built-ins
|
|
4
|
+
and inherit the full diagnose-fix-verify loop.
|
|
5
|
+
|
|
6
|
+
registry = FixAgentRegistry()
|
|
7
|
+
registry.register(SyllabusAgent("class10_cbse.pdf")) # custom, checked first
|
|
8
|
+
agent = registry.find_agent(diagnosis)
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from debugai.agents.base import FixAgent
|
|
14
|
+
from debugai.agents.builtin import BUILTIN_AGENTS
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class FixAgentRegistry:
    """Ordered list of fix agents; lookup returns the first agent that accepts a diagnosis."""

    def __init__(self, include_builtins: bool = True):
        """Start with one instance of every built-in agent, or empty when ``include_builtins`` is False."""
        initial = []
        if include_builtins:
            initial = [agent_cls() for agent_cls in BUILTIN_AGENTS]
        self.agents: list[FixAgent] = initial

    def register(self, agent: FixAgent) -> None:
        """Register a custom agent — placed at the front so it is checked before earlier entries."""
        self.agents[:0] = [agent]

    def find_agent(self, diagnosis: dict) -> FixAgent | None:
        """Return the first agent whose ``can_handle(diagnosis)`` is truthy, else None."""
        matches = (candidate for candidate in self.agents if candidate.can_handle(diagnosis))
        return next(matches, None)
|
debugai/agents/types.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"""Data types for the Fix Agent Framework (Architecture §8)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import asdict, dataclass, field
|
|
6
|
+
|
|
7
|
+
# Verdict strings a fix attempt can end with.

# Tests pass AND re-diagnosis clears the failure.
VERIFIED = "verified"
# Interim guard verified, but the full fix needs a pipeline change.
MITIGATED = "mitigated"
# Fix did not clear the failure / tests failed.
FAILED = "failed"
# Candidate + tests produced, but no model to verify against.
PENDING_RERUN = "pending_rerun"
# Agent declined to auto-fix; flagged for a human.
ESCALATED = "escalated"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class FixCandidate:
    """Fix proposed by an agent.

    The first three fields identify the proposal, the middle group holds the
    modifications the loop can apply before re-running the model, and the
    last two are advisory.
    """

    # Identification
    agent: str
    failure: str
    strategy: str
    rationale: str = ""

    # Modifications (any subset applies)
    system_prompt_additions: str = ""
    new_temperature: float | None = None
    max_chunks: int | None = None
    chunk_char_budget: int | None = None  # summarize/truncate each kept chunk
    few_shot_examples: list[dict] = field(default_factory=list)

    # Advisory output (not auto-applied)
    notes: str = ""
    escalate: bool = False

    def to_dict(self) -> dict:
        """Return every field as a plain dict (via ``dataclasses.asdict``)."""
        payload = asdict(self)
        return payload
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class TestCase:
    """A single deterministic regression check (§8.4).

    ``input`` is the prompt to send; ``must_contain`` and ``must_not_contain``
    hold keywords to look for in the output.
    """

    # Keep pytest from collecting this class despite its "Test" prefix.
    __test__ = False

    input: str
    must_contain: list[str] = field(default_factory=list)
    must_not_contain: list[str] = field(default_factory=list)
    category: str = "regression"  # original | edge | regression | variance
    runs: int = 1  # variance checks run 3x

    def to_dict(self) -> dict:
        """Return every field as a plain dict (via ``dataclasses.asdict``)."""
        payload = asdict(self)
        return payload
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@dataclass
class TestResult:
    """Outcome of one TestCase: pass flag, raw outputs and failure messages."""

    # Keep pytest from collecting this class despite its "Test" prefix.
    __test__ = False

    case: TestCase
    passed: bool
    outputs: list[str] = field(default_factory=list)
    failures: list[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return a flat summary of the case and its outcome.

        ``outputs`` is not part of the summary.
        """
        case = self.case
        summary = {
            "input": case.input,
            "category": case.category,
            "runs": case.runs,
            "passed": self.passed,
            "failures": self.failures,
        }
        summary["must_contain"] = case.must_contain
        summary["must_not_contain"] = case.must_not_contain
        return summary
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
@dataclass
class FixReport:
    """The full diagnose-fix-verify report presented for developer review (§8.1)."""

    # What was attempted and how it ended.
    agent: str
    failure: str
    verdict: str
    candidate: FixCandidate
    diff: str = ""

    # Regression test outcome.
    test_results: list[TestResult] = field(default_factory=list)
    tests_passed: int = 0
    tests_total: int = 0

    # Re-diagnosis outcome.
    reverified: bool = False
    reverified_cleared: bool | None = None
    before_confidence: float | None = None
    after_diagnosis: dict | None = None
    after_output: str | None = None

    def to_dict(self) -> dict:
        """Return the report as a dict, serializing the candidate and each test result via ``to_dict``."""
        candidate_payload = self.candidate.to_dict()
        result_payloads = [result.to_dict() for result in self.test_results]
        return {
            "agent": self.agent,
            "failure": self.failure,
            "verdict": self.verdict,
            "candidate": candidate_payload,
            "diff": self.diff,
            "tests_passed": self.tests_passed,
            "tests_total": self.tests_total,
            "test_results": result_payloads,
            "reverified": self.reverified,
            "reverified_cleared": self.reverified_cleared,
            "before_confidence": self.before_confidence,
            "after_diagnosis": self.after_diagnosis,
            "after_output": self.after_output,
        }
|