debugerai 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
debugai/__init__.py ADDED
@@ -0,0 +1,51 @@
1
+ """DebugAI — AI Observability & Debugging Platform.
2
+
3
+ A 3-layer root-cause engine for LLM application failures:
4
+
5
+ Layer 1 (deterministic) — Signal extraction: 8 metrics per request.
6
+ Layer 2 (deterministic) — Rule engine: 5 failure detectors, primary + secondary.
7
+ Layer 3 (probabilistic) — LLM explainer: human-readable explanation + fix.
8
+
9
+ Public API (Level 1 integration):
10
+
11
+ from debugai import analyze
12
+ result = analyze(prompt, output, chunks=..., similarity_scores=...)
13
+ """
14
+
15
+ from debugai.schema import CaptureRecord
16
+ from debugai.analyze import analyze
17
+ from debugai.config import DebugAIConfig
18
+ from debugai.metrics import metrics
19
+ from debugai.sdk import (
20
+ wrap_llm, awrap_llm, retrieval_context, session, http_trace_sink,
21
+ completion, acompletion, CompletionResponse,
22
+ register_provider, register_adapter, set_default_config,
23
+ compare, ComparisonResult, BudgetExceededError,
24
+ _GenericOpenAICompatAdapter,
25
+ )
26
+ from debugai.tracing import Trace, Span, Tracer, Score
27
+
28
# Package version; it is also exported through ``__all__`` below.
__version__ = "0.2.0"

# Names re-exported from the package root, grouped by area.
__all__ = [
    # Core
    "analyze",
    "CaptureRecord",
    # Config & metrics
    "DebugAIConfig",
    "metrics",
    # SDK wrappers
    "wrap_llm",
    "awrap_llm",
    # Universal completion API
    "completion",
    "acompletion",
    "CompletionResponse",
    # Registration
    "register_provider",
    "register_adapter",
    "set_default_config",
    # Context managers
    "retrieval_context",
    "session",
    # Observability
    "Trace",
    "Span",
    "Tracer",
    "Score",
    # Sinks
    "http_trace_sink",
    # Adapters
    "_GenericOpenAICompatAdapter",
    # Compare + budget
    "compare",
    "ComparisonResult",
    "BudgetExceededError",
    # Version string
    "__version__",
]
@@ -0,0 +1,43 @@
1
+ """Fix Agent Framework (Architecture §8) — diagnose → fix → verify → review."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from debugai.agents.base import FixAgent
6
+ from debugai.agents.builtin import (
7
+ ConstraintAgent, ContextOptimizerAgent, DocumentPatchAgent,
8
+ KnowledgeBaseAgent, PromptRuleAgent, SocraticTutorAgent,
9
+ )
10
+ from debugai.agents.registry import FixAgentRegistry
11
+ from debugai.agents.types import (
12
+ ESCALATED, FAILED, MITIGATED, PENDING_RERUN, VERIFIED,
13
+ FixCandidate, FixReport, TestCase, TestResult,
14
+ )
15
+ from debugai.schema import CaptureRecord
16
+
17
+ # A process-wide default registry with the six built-ins.
18
+ DEFAULT_REGISTRY = FixAgentRegistry()
19
+
20
+
21
def propose_fix(diagnosis: dict, record: CaptureRecord, rerun=None,
                registry: FixAgentRegistry | None = None) -> FixReport | None:
    """Pick the agent that handles ``diagnosis`` and execute its fix loop.

    The lookup goes through ``registry`` when one is supplied and through
    ``DEFAULT_REGISTRY`` otherwise. ``rerun`` is forwarded unchanged to the
    chosen agent's ``run``.

    Returns the agent's FixReport. ``None`` is returned instead when the
    diagnosis is empty or flagged healthy, or when no registered agent
    claims the failure.
    """
    needs_fix = bool(diagnosis) and not diagnosis.get("healthy")
    if not needs_fix:
        return None

    chosen = (registry or DEFAULT_REGISTRY).find_agent(diagnosis)
    return None if chosen is None else chosen.run(diagnosis, record, rerun=rerun)
35
+
36
+
37
# Public surface of the agents package.
__all__ = [
    # Framework entry points
    "FixAgent", "FixAgentRegistry", "DEFAULT_REGISTRY", "propose_fix",
    # Data types
    "FixCandidate", "FixReport", "TestCase", "TestResult",
    # Built-in agents
    "PromptRuleAgent", "KnowledgeBaseAgent", "ConstraintAgent",
    "ContextOptimizerAgent", "DocumentPatchAgent", "SocraticTutorAgent",
    # Verdict constants
    "VERIFIED", "MITIGATED", "FAILED", "PENDING_RERUN", "ESCALATED",
]
debugai/agents/base.py ADDED
@@ -0,0 +1,192 @@
1
+ """Abstract FixAgent + the universal diagnose-fix-verify loop (Architecture §8.1).
2
+
3
+ 1. Diagnose (deterministic — already done; passed in)
4
+ 2. Generate fix (agent — generate_fix)
5
+ 3. Build tests (agent — build_test_cases)
6
+ 4. Run regression (deterministic — _run_test)
7
+ 5. Re-diagnose (deterministic — Layer 1+2 on the re-run output)
8
+ 6. Developer review (human — consumes the FixReport)
9
+
10
+ The agent (steps 2-3) is the only probabilistic part, and it is sandwiched
11
+ between deterministic verification. Re-running the model is injected as a
12
+ `rerun` callable so the framework has no hard dependency on any LLM:
13
+
14
+ rerun(system_prompt, user_prompt, chunks, temperature) -> output_text
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import logging
20
+ from abc import ABC, abstractmethod
21
+ from dataclasses import dataclass
22
+ from typing import Callable
23
+
24
+ from debugai.agents.types import (
25
+ ESCALATED, FAILED, MITIGATED, PENDING_RERUN, VERIFIED,
26
+ FixCandidate, FixReport, TestCase, TestResult,
27
+ )
28
+ from debugai.analyze import analyze
29
+ from debugai.schema import CaptureRecord
30
+ from debugai.signals import _extract_entities
31
+
32
+ log = logging.getLogger("debugai.agents")
33
+
34
+ Rerun = Callable[[str, str, list, "float | None"], str]
35
+
36
+
37
@dataclass
class _Applied:
    """Request inputs produced by applying a FixCandidate to a record."""

    system_prompt: str  # system prompt handed to ``rerun``
    chunks: list[str]  # retrieved chunks handed to ``rerun``
    similarity_scores: list[float]  # scores kept alongside ``chunks``
    temperature: float | None  # temperature handed to ``rerun``; may be None
43
+
44
+
45
class FixAgent(ABC):
    """Base class for fix agents; owns the shared fix → test → re-diagnose loop.

    Subclasses set ``name`` and ``handles`` and implement ``generate_fix`` and
    ``build_test_cases``. ``run`` drives the rest using the deterministic
    helpers defined below.
    """

    name: str = "fix-agent"
    handles: str = ""  # failure id this agent targets
    # When False, the fix lives in the pipeline (e.g. re-chunking) and a prompt-only
    # rerun can't structurally clear the failure — tests verify an interim guard,
    # and a clean test pass yields a MITIGATED (not VERIFIED) verdict.
    verifiable_by_rerun: bool = True

    # --- selection ---------------------------------------------------------
    def can_handle(self, diagnosis: dict) -> bool:
        """Return True when the diagnosis' primary failure equals ``handles``.

        A missing/empty diagnosis or a missing ``primary`` entry never matches
        a non-empty ``handles``.
        """
        primary = (diagnosis or {}).get("primary") or {}
        return primary.get("failure") == self.handles

    # --- agent-specific (subclasses implement) -----------------------------
    @abstractmethod
    def generate_fix(self, diagnosis: dict, record: CaptureRecord) -> FixCandidate:
        """Produce the candidate fix for ``record`` given its ``diagnosis``."""

    @abstractmethod
    def build_test_cases(self, diagnosis: dict, record: CaptureRecord) -> list[TestCase]:
        """Produce the regression checks used to verify the candidate."""

    # --- the inherited loop ------------------------------------------------
    def run(self, diagnosis: dict, record: CaptureRecord, rerun: Rerun | None = None) -> FixReport:
        """Generate a fix, test it, re-diagnose, and return the FixReport.

        Verdicts:
          * ESCALATED      — the candidate has ``escalate`` set; no tests are built.
          * PENDING_RERUN  — ``rerun`` is None; every test is recorded as not
                             passed with the failure "not run (no model)".
          * MITIGATED      — ``verifiable_by_rerun`` is False and all tests pass.
          * VERIFIED       — all tests pass and re-diagnosis is healthy or
                             reports a different primary failure.
          * FAILED         — anything else once ``rerun`` was available.
        """
        candidate = self.generate_fix(diagnosis, record)
        before_conf = (diagnosis.get("primary") or {}).get("confidence")

        if candidate.escalate:
            return FixReport(
                agent=self.name, failure=self.handles, verdict=ESCALATED,
                candidate=candidate, diff=self._diff(candidate, record),
                before_confidence=before_conf,
            )

        tests = self.build_test_cases(diagnosis, record)
        report = FixReport(
            agent=self.name, failure=self.handles, verdict=PENDING_RERUN,
            candidate=candidate, diff=self._diff(candidate, record),
            tests_total=len(tests), before_confidence=before_conf,
        )
        if rerun is None:
            # Candidate + tests produced, but nothing to execute them against.
            report.test_results = [
                TestResult(case=t, passed=False, failures=["not run (no model)"])
                for t in tests
            ]
            return report

        applied = self._apply(candidate, record)

        # Step 4 — deterministic regression tests.
        results = [self._run_test(t, applied, rerun) for t in tests]
        report.test_results = results
        report.tests_passed = sum(1 for r in results if r.passed)

        # Step 5 — re-diagnose the original failing request with the fix applied.
        after, after_output = self._rediagnose(record, applied, rerun)
        report.reverified = True
        report.after_diagnosis = after
        report.after_output = after_output
        cleared = after.get("healthy") or (
            (after.get("primary") or {}).get("failure") != self.handles
        )
        report.reverified_cleared = bool(cleared)

        all_pass = report.tests_passed == report.tests_total
        if not self.verifiable_by_rerun:
            # Pipeline fix: interim guard tests verify, but Layer 1+2 still sees the
            # structural failure until the pipeline change lands.
            report.verdict = MITIGATED if all_pass else FAILED
        else:
            report.verdict = VERIFIED if (all_pass and cleared) else FAILED
        return report

    # --- deterministic helpers --------------------------------------------
    def _apply(self, c: FixCandidate, record: CaptureRecord) -> _Applied:
        """Apply the candidate's modifications to the record's request inputs.

        * ``system_prompt_additions`` is appended after a blank line.
        * ``max_chunks`` keeps the highest-scoring chunks (in their original
          order) when there is one score per chunk; otherwise it keeps the
          leading ``max_chunks`` entries of both lists.
        * ``chunk_char_budget`` truncates every kept chunk to that many characters.
        * ``new_temperature`` replaces the record's temperature when set.
        """
        system = record.system_prompt
        if c.system_prompt_additions:
            # A record without a system prompt (None) is treated as empty so the
            # additions can still be applied instead of raising TypeError.
            system = ((system or "") + "\n\n" + c.system_prompt_additions).strip()
        chunks, scores = list(record.retrieved_chunks), list(record.similarity_scores)
        if c.max_chunks is not None:
            if scores and len(scores) == len(chunks):
                order = sorted(range(len(chunks)), key=lambda i: scores[i], reverse=True)
                # Re-sort the winning indices so kept chunks stay in source order.
                keep = sorted(order[: c.max_chunks])
                chunks = [chunks[i] for i in keep]
                scores = [scores[i] for i in keep]
            else:
                chunks = chunks[: c.max_chunks]
                scores = scores[: c.max_chunks]
        if c.chunk_char_budget is not None:
            # Stand-in for summarization: cap each kept chunk to the budget.
            chunks = [ch[: c.chunk_char_budget] for ch in chunks]
        temp = c.new_temperature if c.new_temperature is not None else record.temperature
        return _Applied(system_prompt=system, chunks=chunks, similarity_scores=scores, temperature=temp)

    def _run_test(self, t: TestCase, applied: _Applied, rerun: Rerun) -> TestResult:
        """Run one test case ``max(1, t.runs)`` times and collect keyword failures.

        Keyword checks are case-insensitive substring matches against each
        output; a falsy ``rerun`` result is treated as an empty string. The
        test passes only when no run produced a failure.
        """
        outputs, failures = [], []
        for _ in range(max(1, t.runs)):
            out = rerun(applied.system_prompt, t.input, applied.chunks, applied.temperature) or ""
            outputs.append(out)
            low = out.lower()
            for kw in t.must_contain:
                if kw.lower() not in low:
                    failures.append(f"missing '{kw}'")
            for kw in t.must_not_contain:
                if kw.lower() in low:
                    failures.append(f"contains '{kw}'")
        return TestResult(case=t, passed=not failures, outputs=outputs, failures=failures)

    def _rediagnose(self, record: CaptureRecord, applied: _Applied, rerun: Rerun) -> tuple[dict, str]:
        """Re-run the original user prompt with the fix applied and analyze it.

        Returns ``(analysis_result, new_output)``; the analysis is requested
        with ``explain_with_llm=False``.
        """
        new_output = rerun(applied.system_prompt, record.user_prompt, applied.chunks, applied.temperature) or ""
        result = analyze(
            prompt=record.user_prompt,
            output=new_output,
            system_prompt=applied.system_prompt,
            chunks=applied.chunks,
            similarity_scores=applied.similarity_scores,
            temperature=applied.temperature,
            context_window=record.context_window,
            explain_with_llm=False,
        )
        return result, new_output

    def _diff(self, c: FixCandidate, record: CaptureRecord) -> str:
        """Render the candidate's changes as a newline-joined, human-readable summary.

        One section is emitted per populated field, in this order: system prompt
        additions, temperature, chunk count, per-chunk character budget,
        few-shot examples, notes. An empty candidate yields an empty string.
        """
        lines: list[str] = []
        if c.system_prompt_additions:
            lines.append("--- system_prompt (appended)")
            for ln in c.system_prompt_additions.splitlines():
                lines.append("+ " + ln)
        if c.new_temperature is not None:
            lines.append(f"~ temperature: {record.temperature} -> {c.new_temperature}")
        if c.max_chunks is not None:
            lines.append(f"~ retrieved_chunks: {len(record.retrieved_chunks)} -> top {c.max_chunks} by similarity")
        if c.chunk_char_budget is not None:
            # _apply truncates chunks to this budget, so the review diff shows it too.
            lines.append(f"~ chunk text: capped at {c.chunk_char_budget} chars per chunk")
        for ex in c.few_shot_examples:
            lines.append(f"+ few-shot: {ex.get('input','')[:48]} -> {ex.get('output','')[:48]}")
        if c.notes:
            lines.append(f"# note: {c.notes}")
        return "\n".join(lines)

    # --- shared NER helper for test generation -----------------------------
    @staticmethod
    def _fabricated_entities(record: CaptureRecord) -> list[str]:
        """Entities in the output that are absent from the retrieved context.

        The context is lowercased, so each entity is lowercased for the
        substring check; entities are returned sorted, in their extracted form.
        """
        out_ents = _extract_entities(record.llm_output)
        ctx = record.context_text.lower()
        return sorted(e for e in out_ents if e.lower() not in ctx)

    @staticmethod
    def _grounded_entities(record: CaptureRecord) -> list[str]:
        """Entities in the output that also appear in the retrieved context.

        Uses the same case-insensitive substring check as
        ``_fabricated_entities``; entities are returned sorted.
        """
        out_ents = _extract_entities(record.llm_output)
        ctx = record.context_text.lower()
        return sorted(e for e in out_ents if e.lower() in ctx)
@@ -0,0 +1,246 @@
1
+ """The six built-in fix agents (Architecture §8.3).
2
+
3
+ Each targets one failure type and ships a deterministic fix template. (Fix text
4
+ can be LLM-drafted when a key is present, but the templates make the framework
5
+ work — and verify — offline.) Generation is the only probabilistic step; the
6
+ loop in ``base.FixAgent.run`` verifies every fix deterministically.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from debugai.agents.base import FixAgent
12
+ from debugai.agents.types import (
13
+ FAILED, PENDING_RERUN, VERIFIED, FixCandidate, FixReport, TestCase,
14
+ )
15
+ from debugai.detectors import (
16
+ CONTEXT_OVERFLOW, ENTITY_GAP, HALLUCINATION, PROMPT_BRITTLENESS, RETRIEVAL_FAILURE,
17
+ )
18
+ from debugai.judge import INSTRUCTION_VIOLATION, judge_instructions
19
+ from debugai.schema import CaptureRecord
20
+
21
+
22
def _first(xs: list[str], n: int) -> list[str]:
    """Return the truthy items found among the first ``n`` entries of ``xs``."""
    head = xs[:n]
    return list(filter(None, head))
24
+
25
+
26
+ # --------------------------------------------------------------------------- #
27
+ # 1. Prompt Rule Agent — hallucination
28
+ # --------------------------------------------------------------------------- #
29
class PromptRuleAgent(FixAgent):
    """Agent for ``HALLUCINATION``: appends grounding rules to the system prompt."""

    name = "Prompt Rule Agent"
    handles = HALLUCINATION

    # Text appended to the system prompt by ``generate_fix``.
    GROUNDING = (
        "Answer ONLY using the provided context. If the context does not contain "
        "the answer, reply exactly: \"I don't have that information.\" Never invent "
        "names, numbers, dates, clauses, or citations. For every specific claim, the "
        "supporting text must appear in the context."
    )

    def generate_fix(self, diagnosis, record):
        """Return a candidate whose only modification is appending ``GROUNDING``."""
        strategy = "Add grounding constraints + an out-of-context fallback to the system prompt."
        rationale = (
            "Retrieval succeeded but the output is ungrounded — constrain "
            "generation to the supplied context."
        )
        return FixCandidate(
            agent=self.name,
            failure=self.handles,
            strategy=strategy,
            rationale=rationale,
            system_prompt_additions=self.GROUNDING,
        )

    def build_test_cases(self, diagnosis, record):
        """Build checks on the original prompt.

        Two cases (one single-run, one 3-run variance) forbid up to four of
        the entities absent from the context. When the output also contained
        grounded entities, a regression case requires the first of them.
        """
        fabricated = self._fabricated_entities(record)
        grounded = self._grounded_entities(record)
        prompt = record.user_prompt
        cases = [
            TestCase(input=prompt, must_not_contain=_first(fabricated, 4),
                     category=category, runs=runs)
            for category, runs in (("original", 1), ("variance", 3))
        ]
        if grounded:
            cases.append(
                TestCase(input=prompt, must_contain=_first(grounded, 1),
                         category="regression")
            )
        return cases
62
+
63
+
64
+ # --------------------------------------------------------------------------- #
65
+ # 2. Knowledge Base Agent — retrieval failure
66
+ # --------------------------------------------------------------------------- #
67
class KnowledgeBaseAgent(FixAgent):
    """Agent for ``RETRIEVAL_FAILURE``: adds an interim guard and advises re-chunking."""

    name = "Knowledge Base Agent"
    handles = RETRIEVAL_FAILURE
    verifiable_by_rerun = False  # real fix is re-chunking/re-embedding the corpus

    # Interim guard appended to the system prompt by ``generate_fix``.
    GUARD = (
        "If the retrieved context does not address the question, reply exactly: "
        "\"The knowledge base does not contain this information.\" Do not answer "
        "from prior knowledge."
    )

    def generate_fix(self, diagnosis, record):
        """Return a candidate that appends ``GUARD`` and carries re-chunking notes.

        The rationale quotes the ``similarity`` value found under the
        diagnosis' ``signals`` entry (``None`` when that entry is missing).
        """
        signals = diagnosis.get("signals") or {}
        similarity = signals.get("similarity")
        rationale = (
            f"Mean retrieval similarity {similarity} is below threshold — the "
            "retriever returned irrelevant chunks."
        )
        advice = (
            "Re-chunk the source corpus with an entity-aware strategy and "
            "re-embed; verify the target document is actually indexed."
        )
        return FixCandidate(
            agent=self.name,
            failure=self.handles,
            strategy="Re-chunk source docs entity-aware + add an interim 'not in KB' guard.",
            rationale=rationale,
            system_prompt_additions=self.GUARD,
            notes=advice,
        )

    def build_test_cases(self, diagnosis, record):
        """Build one single-run and one 3-run case on the original prompt.

        Both forbid up to four of the output entities absent from the context.
        """
        fabricated = self._fabricated_entities(record)
        prompt = record.user_prompt
        return [
            TestCase(input=prompt, must_not_contain=_first(fabricated, 4),
                     category=category, runs=runs)
            for category, runs in (("original", 1), ("variance", 3))
        ]
98
+
99
+
100
+ # --------------------------------------------------------------------------- #
101
+ # 3. Constraint Agent — prompt brittleness
102
+ # --------------------------------------------------------------------------- #
103
class ConstraintAgent(FixAgent):
    """Agent for ``PROMPT_BRITTLENESS``: constrains sampling and output format."""

    name = "Constraint Agent"
    handles = PROMPT_BRITTLENESS

    # Upper bound for the temperature proposed by the fix.
    MAX_TEMPERATURE = 0.2

    # Text appended to the system prompt by ``generate_fix``.
    TEMPLATE = (
        "Be deterministic and consistent across runs. Follow a fixed output format "
        "and do not vary phrasing, ordering, or structure between identical inputs."
    )

    def generate_fix(self, diagnosis, record):
        """Return a candidate that caps temperature, appends ``TEMPLATE``, and
        attaches the original prompt/output pair as a few-shot example.

        The proposed temperature is ``MAX_TEMPERATURE`` unless the record
        already runs at a lower numeric temperature, in which case that lower
        value is kept — the fix must never raise the temperature.
        """
        target = self.MAX_TEMPERATURE
        current = record.temperature
        if isinstance(current, (int, float)) and current < target:
            target = current
        return FixCandidate(
            agent=self.name, failure=self.handles,
            strategy="Lower temperature, add an output-format template, and pin behavior with a few-shot example.",
            rationale="Grounding signals are healthy but output variance is high — "
                      "constrain sampling and format.",
            new_temperature=target,
            system_prompt_additions=self.TEMPLATE,
            few_shot_examples=[{"input": record.user_prompt, "output": record.llm_output}],
        )

    def build_test_cases(self, diagnosis, record):
        """Build a single-run regression case and a 3-run variance case.

        Both require the first grounded entity of the original output, when
        there is one; otherwise they carry no keyword requirement.
        """
        grounded = self._grounded_entities(record)
        mc = _first(grounded, 1)
        return [
            TestCase(input=record.user_prompt, must_contain=mc, category="regression"),
            TestCase(input=record.user_prompt, must_contain=mc, category="variance", runs=3),
        ]
130
+
131
+
132
+ # --------------------------------------------------------------------------- #
133
+ # 4. Context Optimizer Agent — context overflow
134
+ # --------------------------------------------------------------------------- #
135
class ContextOptimizerAgent(FixAgent):
    """Agent for ``CONTEXT_OVERFLOW``: trims and shortens the retrieved context."""

    name = "Context Optimizer Agent"
    handles = CONTEXT_OVERFLOW

    def generate_fix(self, diagnosis, record):
        """Return a candidate limiting context to 8 chunks of at most 240 characters."""
        rationale = (
            "The prompt overflows the context window; trim and compress "
            "the retrieved context."
        )
        return FixCandidate(
            agent=self.name,
            failure=self.handles,
            strategy="Reduce to the top-N most relevant chunks and summarize each to fit the window.",
            rationale=rationale,
            max_chunks=8,
            chunk_char_budget=240,
            notes="Summarize prior conversation history; consider a larger-context model.",
        )

    def build_test_cases(self, diagnosis, record):
        """Build one regression case requiring the first grounded entity, if any."""
        required = _first(self._grounded_entities(record), 1)
        regression = TestCase(input=record.user_prompt, must_contain=required,
                              category="regression")
        return [regression]
156
+
157
+
158
+ # --------------------------------------------------------------------------- #
159
+ # 5. Document Patch Agent — entity gap (escalates)
160
+ # --------------------------------------------------------------------------- #
161
class DocumentPatchAgent(FixAgent):
    """Agent for ``ENTITY_GAP``: always escalates to a human instead of auto-fixing."""

    name = "Document Patch Agent"
    handles = ENTITY_GAP

    def generate_fix(self, diagnosis, record):
        """Return an escalating candidate naming up to six entities absent from the context."""
        absent = self._fabricated_entities(record)  # entities not in the corpus
        listed = ", ".join(_first(absent, 6))
        if not listed:
            listed = "the requested entities"
        rationale = (
            "Retrieval is healthy but the corpus lacks coverage for these "
            "entities — content cannot be safely auto-generated."
        )
        return FixCandidate(
            agent=self.name,
            failure=self.handles,
            strategy="Identify missing entities and flag the knowledge-base gap for human review.",
            rationale=rationale,
            notes=f"Knowledge base needs articles covering: {listed}.",
            escalate=True,
        )

    def build_test_cases(self, diagnosis, record):
        """Return no tests; the candidate escalates before any would run."""
        return []
179
+
180
+
181
+ # --------------------------------------------------------------------------- #
182
+ # 6. Socratic Tutor Agent — instruction_violation (behavioural / pedagogy)
183
+ # --------------------------------------------------------------------------- #
184
class SocraticTutorAgent(FixAgent):
    """Agent for ``INSTRUCTION_VIOLATION``: tightens Socratic rules and re-judges."""

    name = "Socratic Tutor Agent"
    handles = INSTRUCTION_VIOLATION

    # Text appended to the system prompt by ``generate_fix``.
    RULES = (
        "STRICT Socratic correction — these override any conflicting guidance above:\n"
        "- Do NOT explain the concept or state the answer. Give at most ONE short "
        "clue (one sentence maximum), and only if the student is stuck.\n"
        "- Lead with the question: the turn must centre on exactly ONE short leading "
        "question (exactly one '?') that moves the student one step forward, and must "
        "never restate or reword a question already asked.\n"
        "- Do not open by paraphrasing the student's message.\n"
        "- Keep the whole turn to 1-2 sentences. When unsure, ask rather than explain."
    )

    def generate_fix(self, diagnosis, record):
        """Return a candidate that appends ``RULES`` to the system prompt.

        The rationale lists the ``rule`` of up to four violations found under
        the primary failure's evidence, or a generic label when that is empty.
        """
        primary = diagnosis.get("primary") or {}
        evidence = primary.get("evidence") or {}
        violations = evidence.get("violations") or []
        broken = "; ".join(item.get("rule", "") for item in violations[:4])
        if not broken:
            broken = "Socratic-method rules"
        rationale = (
            f"The response broke pedagogy rules ({broken}); tighten the "
            "system prompt so the tutor guides instead of answering."
        )
        return FixCandidate(
            agent=self.name,
            failure=self.handles,
            strategy="Rewrite the system prompt to enforce the violated Socratic rules.",
            rationale=rationale,
            system_prompt_additions=self.RULES,
        )

    def build_test_cases(self, diagnosis, record):
        """Return no tests; ``run`` verifies by re-judging, not by keyword checks."""
        return []

    def run(self, diagnosis, record, rerun=None):
        """Judge-based verify loop: rewrite prompt → regenerate → re-judge.

        Without ``rerun`` the report stays at PENDING_RERUN. Otherwise the
        regenerated output is judged against the rewritten system prompt and
        the verdict is VERIFIED when the judge reports healthy, else FAILED.
        """
        candidate = self.generate_fix(diagnosis, record)
        primary = diagnosis.get("primary") or {}
        report = FixReport(
            agent=self.name,
            failure=self.handles,
            verdict=PENDING_RERUN,
            candidate=candidate,
            diff=self._diff(candidate, record),
            before_confidence=primary.get("confidence"),
        )
        if rerun is None:
            return report

        applied = self._apply(candidate, record)
        regenerated = rerun(applied.system_prompt, record.user_prompt,
                            applied.chunks, applied.temperature) or ""
        judged = judge_instructions(applied.system_prompt, record.user_prompt, regenerated)

        if judged.healthy:
            after_primary = None
            verdict = VERIFIED
        else:
            after_primary = {"failure": INSTRUCTION_VIOLATION,
                             "confidence": judged.confidence}
            verdict = FAILED

        report.reverified = True
        report.after_output = regenerated
        report.reverified_cleared = judged.healthy
        report.after_diagnosis = {"healthy": judged.healthy, "primary": after_primary}
        report.verdict = verdict
        return report
237
+
238
+
239
# Built-in agent classes (not instances). FixAgentRegistry instantiates each
# one, in this order, when it is created with include_builtins=True.
BUILTIN_AGENTS = [
    PromptRuleAgent,
    KnowledgeBaseAgent,
    ConstraintAgent,
    ContextOptimizerAgent,
    DocumentPatchAgent,
    SocraticTutorAgent,
]
@@ -0,0 +1,31 @@
1
+ """Fix agent registry + plugin architecture (Architecture §8.5).
2
+
3
+ Custom agents register at the front, so they take priority over the built-ins
4
+ and inherit the full diagnose-fix-verify loop.
5
+
6
+ registry = FixAgentRegistry()
7
+ registry.register(SyllabusAgent("class10_cbse.pdf")) # custom, checked first
8
+ agent = registry.find_agent(diagnosis)
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from debugai.agents.base import FixAgent
14
+ from debugai.agents.builtin import BUILTIN_AGENTS
15
+
16
+
17
class FixAgentRegistry:
    """Ordered collection of fix agents; lookups return the first match."""

    def __init__(self, include_builtins: bool = True):
        """Start with one instance of every built-in agent, or empty when
        ``include_builtins`` is False."""
        if include_builtins:
            initial = [agent_cls() for agent_cls in BUILTIN_AGENTS]
        else:
            initial = []
        self.agents: list[FixAgent] = initial

    def register(self, agent: FixAgent) -> None:
        """Register a custom agent at the front so it is consulted before
        every agent already present, built-ins included."""
        self.agents.insert(0, agent)

    def find_agent(self, diagnosis: dict) -> FixAgent | None:
        """Return the first agent whose ``can_handle`` accepts ``diagnosis``,
        or ``None`` when none does."""
        matches = (candidate for candidate in self.agents
                   if candidate.can_handle(diagnosis))
        return next(matches, None)
@@ -0,0 +1,108 @@
1
+ """Data types for the Fix Agent Framework (Architecture §8)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import asdict, dataclass, field
6
+
7
# Verdicts for a fix attempt (the values stored in ``FixReport.verdict``).
VERIFIED = "verified"            # tests pass AND re-diagnosis clears the failure
MITIGATED = "mitigated"          # interim guard verified, but full fix needs a pipeline change
FAILED = "failed"                # fix did not clear the failure / tests failed
PENDING_RERUN = "pending_rerun"  # candidate + tests produced, but no model to verify
ESCALATED = "escalated"          # agent declined to auto-fix; flagged for a human
13
+
14
+
15
@dataclass
class FixCandidate:
    """A candidate fix produced by an agent. The modifications are applied
    deterministically by the loop before re-running the model."""

    agent: str  # name of the agent that produced the candidate
    failure: str  # failure id the candidate targets
    strategy: str  # short description of the approach
    rationale: str = ""  # why the approach fits this failure
    # Modifications (any subset applies):
    system_prompt_additions: str = ""  # text appended to the system prompt
    new_temperature: float | None = None  # replacement temperature; None leaves it as is
    max_chunks: int | None = None  # cap on the number of retrieved chunks kept
    chunk_char_budget: int | None = None  # summarize/truncate each kept chunk
    # Dicts with "input"/"output" keys.
    # NOTE(review): FixAgent._apply does not read this field — confirm whether
    # few-shot examples are meant to reach the rerun.
    few_shot_examples: list[dict] = field(default_factory=list)
    # Advisory output (not auto-applied):
    notes: str = ""
    escalate: bool = False  # when True the loop returns an ESCALATED report

    def to_dict(self) -> dict:
        """Return the candidate as a plain dict via ``dataclasses.asdict``."""
        return asdict(self)
36
+
37
+
38
@dataclass
class TestCase:
    """One deterministic regression check (§8.4)."""

    __test__ = False  # not a pytest test class

    input: str  # user prompt sent to the model for this check
    must_contain: list[str] = field(default_factory=list)  # keywords every output must include
    must_not_contain: list[str] = field(default_factory=list)  # keywords no output may include
    category: str = "regression"  # original | edge | regression | variance
    runs: int = 1  # variance checks run 3x

    def to_dict(self) -> dict:
        """Return the test case as a plain dict via ``dataclasses.asdict``."""
        return asdict(self)
52
+
53
+
54
@dataclass
class TestResult:
    """Outcome of executing one TestCase."""

    __test__ = False  # not a pytest test class

    case: TestCase
    passed: bool
    outputs: list[str] = field(default_factory=list)
    failures: list[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Flatten the result together with its case into one plain dict.

        The raw ``outputs`` are not part of the returned dict.
        """
        case = self.case
        summary = dict(input=case.input, category=case.category, runs=case.runs)
        summary["passed"] = self.passed
        summary["failures"] = self.failures
        summary["must_contain"] = case.must_contain
        summary["must_not_contain"] = case.must_not_contain
        return summary
73
+
74
+
75
@dataclass
class FixReport:
    """Everything the diagnose-fix-verify loop hands to the developer for review (§8.1)."""

    agent: str
    failure: str
    verdict: str
    candidate: FixCandidate
    diff: str = ""
    test_results: list[TestResult] = field(default_factory=list)
    tests_passed: int = 0
    tests_total: int = 0
    reverified: bool = False
    reverified_cleared: bool | None = None
    before_confidence: float | None = None
    after_diagnosis: dict | None = None
    after_output: str | None = None

    def to_dict(self) -> dict:
        """Return the report as a plain dict.

        The candidate and every test result are converted through their own
        ``to_dict``; all other fields are copied by reference.
        """

        def pick(*names):
            # Copy the named attributes verbatim, in the order given.
            return {attr: getattr(self, attr) for attr in names}

        payload = pick("agent", "failure", "verdict")
        payload["candidate"] = self.candidate.to_dict()
        payload.update(pick("diff", "tests_passed", "tests_total"))
        payload["test_results"] = [result.to_dict() for result in self.test_results]
        payload.update(pick(
            "reverified", "reverified_cleared", "before_confidence",
            "after_diagnosis", "after_output",
        ))
        return payload