agent-failure-debugger 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_failure_debugger/__init__.py +53 -0
- agent_failure_debugger/abstraction.py +380 -0
- agent_failure_debugger/auto_apply.py +510 -0
- agent_failure_debugger/autofix.py +222 -0
- agent_failure_debugger/causal_resolver.py +143 -0
- agent_failure_debugger/config.py +96 -0
- agent_failure_debugger/decision_support.py +416 -0
- agent_failure_debugger/diagnose.py +210 -0
- agent_failure_debugger/evaluate_fix.py +334 -0
- agent_failure_debugger/execute_fix.py +364 -0
- agent_failure_debugger/explain.py +61 -0
- agent_failure_debugger/explainer.py +598 -0
- agent_failure_debugger/fix_templates.py +242 -0
- agent_failure_debugger/formatter.py +259 -0
- agent_failure_debugger/graph_loader.py +44 -0
- agent_failure_debugger/labels.py +196 -0
- agent_failure_debugger/main.py +39 -0
- agent_failure_debugger/pipeline.py +373 -0
- agent_failure_debugger/pipeline_post_apply.py +97 -0
- agent_failure_debugger/pipeline_summary.py +36 -0
- agent_failure_debugger/policy_loader.py +96 -0
- agent_failure_debugger/templates/system_prompt.txt +22 -0
- agent_failure_debugger/templates/user_prompt.txt +8 -0
- agent_failure_debugger-0.1.0.dist-info/METADATA +431 -0
- agent_failure_debugger-0.1.0.dist-info/RECORD +29 -0
- agent_failure_debugger-0.1.0.dist-info/WHEEL +5 -0
- agent_failure_debugger-0.1.0.dist-info/entry_points.txt +2 -0
- agent_failure_debugger-0.1.0.dist-info/licenses/LICENSE +21 -0
- agent_failure_debugger-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""
|
|
2
|
+
agent_failure_debugger — Diagnose why your LLM agent failed.
|
|
3
|
+
|
|
4
|
+
Deterministic causal analysis with fix generation.
|
|
5
|
+
|
|
6
|
+
Primary API:
|
|
7
|
+
from agent_failure_debugger import diagnose
|
|
8
|
+
result = diagnose(raw_log, adapter="langchain")
|
|
9
|
+
|
|
10
|
+
from agent_failure_debugger import watch
|
|
11
|
+
graph = watch(workflow.compile(), auto_diagnose=True)
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
__version__ = "0.1.0"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def diagnose(raw_log, adapter="langchain", **kwargs):
    """Diagnose failures from a raw agent log.

    This is the primary entry point for the tool.

    Args:
        raw_log: Raw log/response from the agent or service.
        adapter: Adapter name ("langchain", "langsmith", "crewai", "redis_help_demo").
        **kwargs: Passed to run_pipeline (e.g. use_learning, top_k).

    Returns:
        Dict with: diagnosis, fix, summary, explanation, telemetry, matcher_output.
    """
    # Deferred import keeps `import agent_failure_debugger` lightweight and
    # avoids pulling the full pipeline in at package-import time.
    from agent_failure_debugger.diagnose import diagnose as _impl

    return _impl(raw_log, adapter=adapter, **kwargs)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def watch(compiled_graph, **kwargs):
    """Wrap a LangGraph agent for live failure detection.

    Requires langchain-core: pip install agent-failure-debugger[langchain]

    Args:
        compiled_graph: A compiled LangGraph graph.
        **kwargs: auto_diagnose (bool), auto_pipeline (bool), verbose (bool).

    Returns:
        A wrapped graph with Atlas diagnosis injected.

    Raises:
        ImportError: If the optional langchain integration is not installed.
    """
    try:
        # NOTE(review): this imports from "llm_failure_atlas", not this
        # package's own namespace — confirm that dependency name is intended.
        from llm_failure_atlas.adapters.callback_handler import watch as _watch
    except ImportError as exc:
        # Chain the original error so the actual missing-module name is
        # preserved in the traceback instead of being silently discarded.
        raise ImportError(
            "watch() requires langchain-core. "
            "Install with: pip install agent-failure-debugger[langchain]"
        ) from exc
    return _watch(compiled_graph, **kwargs)
|
|
@@ -0,0 +1,380 @@
|
|
|
1
|
+
"""
|
|
2
|
+
abstraction.py
|
|
3
|
+
|
|
4
|
+
Phase 14: Abstraction / Pruning / UX layer.
|
|
5
|
+
|
|
6
|
+
Pipeline position:
|
|
7
|
+
matcher → debugger → formatter → abstraction → explainer
|
|
8
|
+
|
|
9
|
+
Three capabilities:
|
|
10
|
+
1. top-k causal path selection
|
|
11
|
+
2. failure clustering
|
|
12
|
+
3. explanation simplification (verbose / standard / brief)
|
|
13
|
+
|
|
14
|
+
This layer does NOT modify diagnostic data.
|
|
15
|
+
It only controls what is displayed and how.
|
|
16
|
+
|
|
17
|
+
GPT review fixes applied:
|
|
18
|
+
① top-k: winner paths always prioritized before scoring
|
|
19
|
+
② clustering: representative uses confidence + root_score
|
|
20
|
+
③ simplification: uses selected_paths not raw alternative_paths
|
|
21
|
+
④ cluster collapse: same-cluster split by non-cluster is correct (spec)
|
|
22
|
+
⑤ brief mode: includes representative failure name
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from agent_failure_debugger.labels import FAILURE_MAP
|
|
26
|
+
|
|
27
|
+
# ---------------------------------------------------------------------------
|
|
28
|
+
# Failure clusters
|
|
29
|
+
# ---------------------------------------------------------------------------
|
|
30
|
+
|
|
31
|
+
CLUSTERS = {
|
|
32
|
+
"reasoning_failure": {
|
|
33
|
+
"label": "Reasoning failure",
|
|
34
|
+
"members": [
|
|
35
|
+
"clarification_failure",
|
|
36
|
+
"assumption_invalidation_failure",
|
|
37
|
+
"premature_model_commitment",
|
|
38
|
+
"repair_strategy_failure",
|
|
39
|
+
],
|
|
40
|
+
},
|
|
41
|
+
"retrieval_failure": {
|
|
42
|
+
"label": "Retrieval failure",
|
|
43
|
+
"members": [
|
|
44
|
+
"semantic_cache_intent_bleeding",
|
|
45
|
+
"prompt_injection_via_retrieval",
|
|
46
|
+
"context_truncation_loss",
|
|
47
|
+
"rag_retrieval_drift",
|
|
48
|
+
],
|
|
49
|
+
},
|
|
50
|
+
"tool_failure": {
|
|
51
|
+
"label": "Tool failure",
|
|
52
|
+
"members": [
|
|
53
|
+
"agent_tool_call_loop",
|
|
54
|
+
"tool_result_misinterpretation",
|
|
55
|
+
],
|
|
56
|
+
},
|
|
57
|
+
"output_failure": {
|
|
58
|
+
"label": "Output failure",
|
|
59
|
+
"members": [
|
|
60
|
+
"incorrect_output",
|
|
61
|
+
],
|
|
62
|
+
},
|
|
63
|
+
"instruction_failure": {
|
|
64
|
+
"label": "Instruction failure",
|
|
65
|
+
"members": [
|
|
66
|
+
"instruction_priority_inversion",
|
|
67
|
+
],
|
|
68
|
+
},
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
_FAILURE_TO_CLUSTER = {}
|
|
72
|
+
for cid, cdef in CLUSTERS.items():
|
|
73
|
+
for member in cdef["members"]:
|
|
74
|
+
_FAILURE_TO_CLUSTER[member] = cid
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def get_cluster(failure_id: str) -> str | None:
|
|
78
|
+
return _FAILURE_TO_CLUSTER.get(failure_id)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# ---------------------------------------------------------------------------
|
|
82
|
+
# 1. Top-k path selection (fix ①: winner paths always prioritized)
|
|
83
|
+
# ---------------------------------------------------------------------------
|
|
84
|
+
|
|
85
|
+
def _score_path_simple(path: list, debugger_output: dict) -> float:
|
|
86
|
+
conf_map = {f["id"]: f["confidence"] for f in debugger_output.get("failures", [])}
|
|
87
|
+
root_scores = {r["id"]: r["score"] for r in debugger_output.get("root_ranking", [])}
|
|
88
|
+
|
|
89
|
+
avg_conf = sum(conf_map.get(n, 0) for n in path) / len(path) if path else 0
|
|
90
|
+
root_score = root_scores.get(path[0], 0) if path else 0
|
|
91
|
+
length_bonus = len(path) / 10.0
|
|
92
|
+
|
|
93
|
+
return avg_conf + root_score * 0.3 + length_bonus
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def select_top_k(debugger_output: dict, k: int = 2) -> dict:
    """
    Select the top-k multi-hop causal paths for display.

    Paths containing a conflict-group winner are always prioritized before
    scoring (fix ①); within each partition, paths are ranked by
    _score_path_simple. Everything not selected is reported as suppressed.

    Args:
        debugger_output: Debugger result with causal_paths, conflicts,
            failures and root_ranking.
        k: Maximum number of paths to keep.

    Returns:
        Dict with "selected_paths" and "suppressed_paths".
    """
    all_paths = debugger_output.get("causal_paths", [])
    # Single-node paths carry no causal chain; only consider >= 2 hops.
    multi_hop = [p for p in all_paths if len(p) >= 2]

    if len(multi_hop) <= k:
        return {
            "selected_paths": multi_hop,
            "suppressed_paths": [],
        }

    # Conflict winners must never be pruned out of the display.
    conflict_winners = {
        c.get("winner", "") for c in debugger_output.get("conflicts", [])
    }

    # Partition: paths touching a winner first, then the rest.
    winner_paths = []
    other_paths = []
    for p in multi_hop:
        if any(node in conflict_winners for node in p):
            winner_paths.append(p)
        else:
            other_paths.append(p)

    def _score(p):
        return _score_path_simple(p, debugger_output)

    winner_paths.sort(key=_score, reverse=True)
    other_paths.sort(key=_score, reverse=True)

    # Winners fill the quota first; best non-winners take any remaining slots.
    selected = (winner_paths + other_paths)[:k]

    # Paths are lists (unhashable), so compare by object identity — safe
    # because selected elements are the very objects found in multi_hop.
    # Using a real set keeps suppression O(n) instead of O(n*k).
    selected_ids = {id(p) for p in selected}
    suppressed = [p for p in multi_hop if id(p) not in selected_ids]

    return {
        "selected_paths": selected,
        "suppressed_paths": suppressed,
    }
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
# ---------------------------------------------------------------------------
|
|
150
|
+
# 2. Failure clustering (fix ②: representative uses conf + root_score)
|
|
151
|
+
# ---------------------------------------------------------------------------
|
|
152
|
+
|
|
153
|
+
def cluster_failures(debugger_output: dict) -> list[dict]:
    """
    Group the active failures into their clusters.

    The representative of each cluster is the member with the highest
    (confidence + 0.3 * root_score), per fix ②.
    """
    confidences = {f["id"]: f["confidence"] for f in debugger_output.get("failures", [])}
    root_scores = {r["id"]: r["score"] for r in debugger_output.get("root_ranking", [])}

    def salience(member: str) -> float:
        # Combined importance used to pick the cluster representative.
        return confidences[member] + 0.3 * root_scores.get(member, 0)

    clusters = []
    for cluster_id, cluster_def in CLUSTERS.items():
        present = [m for m in cluster_def["members"] if m in confidences]
        if not present:
            # Cluster has no active members — skip it entirely.
            continue

        representative = max(present, key=salience)
        clusters.append({
            "cluster": cluster_id,
            "label": cluster_def["label"],
            "members": present,
            "representative": representative,
            "representative_description": FAILURE_MAP.get(representative, ""),
        })

    return clusters
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
# ---------------------------------------------------------------------------
|
|
185
|
+
# 3. Explanation simplification
|
|
186
|
+
# ---------------------------------------------------------------------------
|
|
187
|
+
|
|
188
|
+
def _collapse_cluster_sequence(path: list) -> list[dict]:
    """
    Collapse consecutive same-cluster nodes into cluster entries.

    Unclustered nodes become individual {"type": "node"} entries. A cluster
    interrupted by a node from a different (or no) cluster stays split —
    that is the spec (fix ④), not an accident.
    """
    entries: list[dict] = []

    def flush(cluster, members):
        # Emit the pending run: one cluster entry for clustered runs,
        # or bare node entries for unclustered nodes.
        if not members:
            return
        if cluster:
            entries.append({
                "type": "cluster",
                "cluster": cluster,
                "label": CLUSTERS[cluster]["label"],
                "members": members,
            })
        else:
            entries.extend({"type": "node", "id": m} for m in members)

    run_cluster = None
    run_members: list = []

    for node in path:
        node_cluster = get_cluster(node)
        if node_cluster is not None and node_cluster == run_cluster:
            # Same cluster as the current run — extend it.
            run_members.append(node)
        else:
            flush(run_cluster, run_members)
            run_cluster = node_cluster
            run_members = [node]

    flush(run_cluster, run_members)
    return entries
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def _get_cluster_representative(members: list, debugger_output: dict) -> str:
|
|
235
|
+
"""Pick the representative from collapsed cluster members."""
|
|
236
|
+
if not members:
|
|
237
|
+
return ""
|
|
238
|
+
active = {f["id"]: f["confidence"] for f in debugger_output.get("failures", [])}
|
|
239
|
+
root_scores = {r["id"]: r["score"] for r in debugger_output.get("root_ranking", [])}
|
|
240
|
+
scored = [(m, active.get(m, 0) + 0.3 * root_scores.get(m, 0)) for m in members]
|
|
241
|
+
return max(scored, key=lambda x: x[1])[0]
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def _brief_summary(collapsed: list, debugger_output: dict) -> str:
    """One-sentence summary of a collapsed primary path (brief mode)."""
    if not collapsed:
        return "No causal path detected."

    if len(collapsed) == 1:
        name = collapsed[0].get("label", collapsed[0].get("id", "unknown"))
        return f"The failure is categorized as {name.lower()}."

    first = collapsed[0]
    last = collapsed[-1]
    first_label = first.get("label", first.get("id", "unknown"))
    last_label = last.get("label", last.get("id", "unknown"))

    # fix ⑤: name the representative failure of the originating cluster.
    first_rep = _get_cluster_representative(
        first.get("members", []), debugger_output)
    rep_desc = FAILURE_MAP.get(first_rep, first_rep) if first_rep else ""

    # Single format string; the representative clause is optional.
    origin = (
        f"{first_label.lower()} ({rep_desc}) " if rep_desc
        else f"{first_label.lower()} "
    )
    return f"The failure originated in {origin}and resulted in {last_label.lower()}."


def _standard_summary(collapsed: list) -> str:
    """Comma-joined causal chain, e.g. 'A (...), leading to B (...).'."""
    parts = []
    for i, entry in enumerate(collapsed):
        if entry["type"] == "cluster":
            members = entry["members"]
            if len(members) == 1:
                desc = FAILURE_MAP.get(members[0], members[0])
                part = f"{entry['label']} ({desc})"
            else:
                part = f"{entry['label']} ({members[0]} → {members[-1]})"
        else:
            desc = FAILURE_MAP.get(entry["id"], entry["id"])
            part = f"{entry['id']} ({desc})"
        parts.append(part if i == 0 else f"leading to {part}")

    return ", ".join(parts) + "." if parts else "No causal path detected."


def simplify_explanation(debugger_output: dict, selected_paths: list,
                         mode: str = "standard") -> dict:
    """
    Generate a simplified explanation for the requested display mode.

    Uses selected_paths (from top-k) for alternatives, not raw
    alternative_paths (fix ③).

    Args:
        debugger_output: Debugger result (primary_path, explanation, ...).
        selected_paths: Top-k selected paths from select_top_k.
        mode: "verbose", "brief" or "standard" (default).

    Returns:
        Dict with display_mode, summary_explanation, detailed_explanation.
    """
    primary = debugger_output.get("primary_path") or []
    explanation = debugger_output.get("explanation", "")

    if mode == "verbose":
        # Verbose passes the full explanation through untouched.
        return {
            "display_mode": "verbose",
            "summary_explanation": explanation,
            "detailed_explanation": explanation,
        }

    collapsed = _collapse_cluster_sequence(primary)

    if mode == "brief":
        return {
            "display_mode": "brief",
            "summary_explanation": _brief_summary(collapsed, debugger_output),
            "detailed_explanation": explanation,
        }

    # --- Standard mode ---
    summary = _standard_summary(collapsed)

    # Alternatives = selected paths minus primary (fix ③); only standard
    # mode displays them, so they are computed only here.
    alternatives = [p for p in selected_paths if p != primary]
    lines = [f"Primary: {summary}"]
    for alt in alternatives:
        alt_collapsed = _collapse_cluster_sequence(alt)
        alt_labels = [e.get("label", e.get("id", "?")) for e in alt_collapsed]
        lines.append(f"Alternative: {' → '.join(alt_labels)}")

    return {
        "display_mode": "standard",
        "summary_explanation": "\n".join(lines),
        "detailed_explanation": explanation,
    }
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
# ---------------------------------------------------------------------------
|
|
346
|
+
# Orchestrator
|
|
347
|
+
# ---------------------------------------------------------------------------
|
|
348
|
+
|
|
349
|
+
def abstract(debugger_output: dict, top_k: int = 2,
             mode: str = "standard") -> dict:
    """
    Run the full abstraction pipeline over a debugger result:
    1. top-k path selection
    2. failure clustering
    3. explanation simplification (uses selected_paths)

    The original diagnostic keys are passed through untouched; this layer
    only adds display-oriented keys on top.
    """
    selection = select_top_k(debugger_output, k=top_k)
    clusters = cluster_failures(debugger_output)
    simplified = simplify_explanation(
        debugger_output,
        selected_paths=selection["selected_paths"],
        mode=mode,
    )

    # Original data (preserved), in the same key order as before.
    result: dict = {}
    for key in ("root_candidates", "root_ranking", "failures",
                "causal_links", "causal_paths"):
        result[key] = debugger_output.get(key, [])
    result["primary_path"] = debugger_output.get("primary_path")
    for key in ("conflicts", "evidence"):
        result[key] = debugger_output.get(key, [])

    # Abstraction layer output.
    result["selected_paths"] = selection["selected_paths"]
    result["suppressed_paths"] = selection["suppressed_paths"]
    result["clusters"] = clusters
    result.update(simplified)
    return result
|