agent-failure-debugger 0.2.0__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/PKG-INFO +117 -15
- agent_failure_debugger-0.2.0/src/agent_failure_debugger.egg-info/PKG-INFO → agent_failure_debugger-0.2.1/README.md +604 -522
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/pyproject.toml +1 -1
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/src/agent_failure_debugger/__init__.py +1 -1
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/src/agent_failure_debugger/execution_quality.py +16 -0
- agent_failure_debugger-0.2.0/README.md → agent_failure_debugger-0.2.1/src/agent_failure_debugger.egg-info/PKG-INFO +624 -502
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/LICENSE +0 -0
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/setup.cfg +0 -0
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/src/agent_failure_debugger/abstraction.py +0 -0
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/src/agent_failure_debugger/auto_apply.py +0 -0
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/src/agent_failure_debugger/autofix.py +0 -0
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/src/agent_failure_debugger/causal_resolver.py +0 -0
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/src/agent_failure_debugger/config.py +0 -0
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/src/agent_failure_debugger/decision_support.py +0 -0
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/src/agent_failure_debugger/diagnose.py +0 -0
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/src/agent_failure_debugger/evaluate_fix.py +0 -0
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/src/agent_failure_debugger/execute_fix.py +0 -0
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/src/agent_failure_debugger/explain.py +0 -0
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/src/agent_failure_debugger/explainer.py +0 -0
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/src/agent_failure_debugger/fix_templates.py +0 -0
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/src/agent_failure_debugger/formatter.py +0 -0
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/src/agent_failure_debugger/graph_loader.py +0 -0
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/src/agent_failure_debugger/labels.py +0 -0
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/src/agent_failure_debugger/main.py +0 -0
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/src/agent_failure_debugger/pipeline.py +0 -0
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/src/agent_failure_debugger/pipeline_post_apply.py +0 -0
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/src/agent_failure_debugger/pipeline_summary.py +0 -0
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/src/agent_failure_debugger/policy_loader.py +0 -0
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/src/agent_failure_debugger/reliability.py +0 -0
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/src/agent_failure_debugger/templates/system_prompt.txt +0 -0
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/src/agent_failure_debugger/templates/user_prompt.txt +0 -0
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/src/agent_failure_debugger.egg-info/SOURCES.txt +0 -0
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/src/agent_failure_debugger.egg-info/dependency_links.txt +0 -0
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/src/agent_failure_debugger.egg-info/entry_points.txt +0 -0
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/src/agent_failure_debugger.egg-info/requires.txt +0 -0
- {agent_failure_debugger-0.2.0 → agent_failure_debugger-0.2.1}/src/agent_failure_debugger.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: agent-failure-debugger
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.1
|
|
4
4
|
Summary: Diagnose why your LLM agent failed. Deterministic causal analysis with fix generation.
|
|
5
5
|
License: MIT
|
|
6
6
|
Project-URL: Homepage, https://github.com/kiyoshisasano/agent-failure-debugger
|
|
@@ -41,15 +41,23 @@ print(result["explanation"]["context_summary"])
|
|
|
41
41
|
|
|
42
42
|
## Use the Debugger
|
|
43
43
|
|
|
44
|
-
|
|
45
|
-
- An agent gives confident answers without data
|
|
46
|
-
- Tools return empty results or errors
|
|
47
|
-
- Behavior changes between runs and you need to understand why
|
|
44
|
+
Call `diagnose()` after every agent run. It returns execution quality (healthy, degraded, or failed), root cause analysis when failures are detected, and fix proposals.
|
|
48
45
|
|
|
49
|
-
|
|
46
|
+
```python
|
|
47
|
+
result = diagnose(raw_log, adapter="langchain")
|
|
48
|
+
status = result["summary"]["execution_quality"]["status"]
|
|
49
|
+
|
|
50
|
+
# In CI/CD or automated pipelines:
|
|
51
|
+
assert status != "failed", f"Agent execution failed: {result['summary']['root_cause']}"
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
When the agent runs normally, you get `healthy` with confidence scores and grounding state. When something goes wrong, you get the root cause, causal path, and a fix proposal — without changing how you call the tool.
|
|
50
55
|
|
|
51
|
-
|
|
52
|
-
|
|
56
|
+
**Entry points:**
|
|
57
|
+
|
|
58
|
+
- **Every run** — call `diagnose()` on the raw log or trace after each execution
|
|
59
|
+
- **Live observation** — use Atlas [`watch()`](https://github.com/kiyoshisasano/llm-failure-atlas) to capture telemetry and diagnose during execution
|
|
60
|
+
- **Multi-run comparison** — use `compare_runs()` and `diff_runs()` to track stability across runs
|
|
53
61
|
|
|
54
62
|
Atlas detects failures; the debugger explains why they happened and proposes fixes. You can use Atlas alone for detection, but diagnosis requires the debugger.
|
|
55
63
|
|
|
@@ -148,7 +156,29 @@ For a copy-paste example without an API key, see [Reproducible Examples](#reprod
|
|
|
148
156
|
pip install agent-failure-debugger
|
|
149
157
|
```
|
|
150
158
|
|
|
151
|
-
###
|
|
159
|
+
### Healthy run
|
|
160
|
+
|
|
161
|
+
```python
|
|
162
|
+
from agent_failure_debugger import diagnose
|
|
163
|
+
|
|
164
|
+
raw_log = {
|
|
165
|
+
"inputs": {"query": "What was Q3 revenue?"},
|
|
166
|
+
"outputs": {"response": "Q3 revenue was $4.2M based on the latest earnings report."},
|
|
167
|
+
"steps": [
|
|
168
|
+
{"type": "tool", "name": "search_earnings", "inputs": {"quarter": "Q3"},
|
|
169
|
+
"outputs": {"revenue": "$4.2M", "source": "10-Q filing"}, "error": None},
|
|
170
|
+
{"type": "llm", "outputs": {"text": "Q3 revenue was $4.2M based on the latest earnings report."}}
|
|
171
|
+
]
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
result = diagnose(raw_log, adapter="langchain")
|
|
175
|
+
print(result["summary"]["execution_quality"]["status"]) # healthy
|
|
176
|
+
print(result["summary"]["failure_count"]) # 0
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
The tool returns a result on every run. When the agent is healthy, you get confirmation — not silence.
|
|
180
|
+
|
|
181
|
+
### Degraded run
|
|
152
182
|
|
|
153
183
|
```python
|
|
154
184
|
from agent_failure_debugger import diagnose
|
|
@@ -170,9 +200,12 @@ raw_log = {
|
|
|
170
200
|
}
|
|
171
201
|
|
|
172
202
|
result = diagnose(raw_log, adapter="langchain")
|
|
173
|
-
print(result["summary"]["root_cause"])
|
|
203
|
+
print(result["summary"]["root_cause"]) # incorrect_output
|
|
204
|
+
print(result["summary"]["execution_quality"]["status"]) # degraded
|
|
174
205
|
```
|
|
175
206
|
|
|
207
|
+
Same function, same interface. The difference is in the input, not in how you call the tool.
|
|
208
|
+
|
|
176
209
|
### From matcher output (advanced)
|
|
177
210
|
|
|
178
211
|
If you already have matcher output (e.g., from a custom integration):
|
|
@@ -184,7 +217,7 @@ result = run_pipeline(matcher_output, use_learning=True)
|
|
|
184
217
|
print(result["summary"])
|
|
185
218
|
```
|
|
186
219
|
|
|
187
|
-
See [Quick Start Guide](docs/quickstart.md) for more usage patterns including `watch()
|
|
220
|
+
See [Quick Start Guide](docs/quickstart.md) for more usage patterns including `watch()`, multi-run analysis, and direct telemetry.
|
|
188
221
|
|
|
189
222
|
## Common Mistakes
|
|
190
223
|
|
|
@@ -210,7 +243,7 @@ See [Limitations & FAQ](docs/limitations_faq.md) for details.
|
|
|
210
243
|
|
|
211
244
|
### Execution quality
|
|
212
245
|
|
|
213
|
-
Every `diagnose()` and `run_pipeline()` result
|
|
246
|
+
Every `diagnose()` and `run_pipeline()` result includes execution quality assessment — this is what makes the tool useful on every run, not just when failures occur.
|
|
214
247
|
|
|
215
248
|
```python
|
|
216
249
|
eq = result["summary"]["execution_quality"]
|
|
@@ -221,9 +254,11 @@ print(eq["summary"]) # one-line human-readable assessment
|
|
|
221
254
|
```
|
|
222
255
|
|
|
223
256
|
- **healthy** — no significant issues detected
|
|
224
|
-
- **degraded** — output may have been produced but quality indicators are weak (low alignment, weak grounding, unmodeled failures)
|
|
257
|
+
- **degraded** — output may have been produced but quality indicators are weak (low alignment, weak grounding, redundant tool results, unmodeled failures)
|
|
225
258
|
- **failed** — execution did not produce usable output (silent exit or error)
|
|
226
259
|
|
|
260
|
+
Degradation indicators include: low alignment score (< 0.5), tools called but no usable data returned, high expansion ratio without uncertainty disclosure (> 3.0), low tool result diversity (< 0.5 across 2+ calls — tools returned identical results), low observation coverage, and unmodeled or conflicting failure signals.
|
|
261
|
+
|
|
227
262
|
Execution quality uses existing telemetry and diagnosis results. No new matcher patterns are added.
|
|
228
263
|
|
|
229
264
|
### Multi-run analysis
|
|
@@ -245,6 +280,8 @@ print(diff["causal_path_diff"]) # where paths diverge
|
|
|
245
280
|
|
|
246
281
|
`compare_runs()` measures stability — whether the same task produces consistent diagnoses across runs. `diff_runs()` identifies divergence — what structural differences separate successful runs from failed ones.
|
|
247
282
|
|
|
283
|
+
For runnable examples with expected output, see [examples/multi_run_stability](examples/multi_run_stability/) (`compare_runs()` → `diff_runs()` workflow) and [examples/termination_divergence](examples/termination_divergence/) (same root cause, different exit modes).
|
|
284
|
+
|
|
248
285
|
### Enhanced explanation
|
|
249
286
|
|
|
250
287
|
```python
|
|
@@ -416,6 +453,13 @@ matcher_output.json
|
|
|
416
453
|
| `reliability.py` | Cross-run stability and differential analysis |
|
|
417
454
|
| `execution_quality.py` | Single-run execution behavior assessment |
|
|
418
455
|
|
|
456
|
+
### Examples
|
|
457
|
+
|
|
458
|
+
| Directory | Demonstrates |
|
|
459
|
+
|---|---|
|
|
460
|
+
| `examples/termination_divergence/` | `diff_runs()`: same root cause, different termination modes |
|
|
461
|
+
| `examples/multi_run_stability/` | `compare_runs()` → `diff_runs()`: two-step stability and divergence workflow |
|
|
462
|
+
|
|
419
463
|
---
|
|
420
464
|
|
|
421
465
|
## Graph Source
|
|
@@ -463,7 +507,56 @@ All scoring weights and gate thresholds are in `config.py`.
|
|
|
463
507
|
|
|
464
508
|
## Reproducible Examples
|
|
465
509
|
|
|
466
|
-
**
|
|
510
|
+
**Healthy run** (copy-paste-run, no API key needed):
|
|
511
|
+
|
|
512
|
+
```bash
|
|
513
|
+
pip install agent-failure-debugger
|
|
514
|
+
```
|
|
515
|
+
|
|
516
|
+
```python
|
|
517
|
+
from agent_failure_debugger import diagnose
|
|
518
|
+
|
|
519
|
+
raw_log = {
|
|
520
|
+
"inputs": {"query": "What was Q3 revenue?"},
|
|
521
|
+
"outputs": {"response": "Q3 revenue was $4.2M based on the latest earnings report."},
|
|
522
|
+
"steps": [
|
|
523
|
+
{"type": "tool", "name": "search_earnings", "inputs": {"quarter": "Q3"},
|
|
524
|
+
"outputs": {"revenue": "$4.2M", "source": "10-Q filing"}, "error": None},
|
|
525
|
+
{"type": "llm", "outputs": {"text": "Q3 revenue was $4.2M based on the latest earnings report."}}
|
|
526
|
+
]
|
|
527
|
+
}
|
|
528
|
+
|
|
529
|
+
result = diagnose(raw_log, adapter="langchain")
|
|
530
|
+
print(result["summary"]["execution_quality"]["status"]) # healthy
|
|
531
|
+
print(result["summary"]["failure_count"]) # 0
|
|
532
|
+
```
|
|
533
|
+
|
|
534
|
+
**Degraded run** (copy-paste-run):
|
|
535
|
+
|
|
536
|
+
```python
|
|
537
|
+
raw_log = {
|
|
538
|
+
"inputs": {"query": "Change my flight to tomorrow morning"},
|
|
539
|
+
"outputs": {"response": "I've found several hotels near the airport for you."},
|
|
540
|
+
"steps": [
|
|
541
|
+
{"type": "llm", "outputs": {"text": "Let me check available flights."}},
|
|
542
|
+
{"type": "tool", "name": "search_flights", "inputs": {"date": "2025-03-20"},
|
|
543
|
+
"outputs": {"flights": []}, "error": None},
|
|
544
|
+
{"type": "tool", "name": "search_flights", "inputs": {"date": "2025-03-20"},
|
|
545
|
+
"outputs": {"flights": []}, "error": None},
|
|
546
|
+
{"type": "tool", "name": "search_flights", "inputs": {"date": "2025-03-20"},
|
|
547
|
+
"outputs": {"flights": []}, "error": None},
|
|
548
|
+
{"type": "llm", "outputs": {"text": "I've found several hotels near the airport."}}
|
|
549
|
+
],
|
|
550
|
+
"feedback": {"user_correction": "I asked about flights, not hotels."}
|
|
551
|
+
}
|
|
552
|
+
|
|
553
|
+
result = diagnose(raw_log, adapter="langchain")
|
|
554
|
+
print(result["summary"]["root_cause"])
|
|
555
|
+
print(result["summary"]["execution_quality"]["status"])
|
|
556
|
+
# → root cause + execution quality (degraded)
|
|
557
|
+
```
|
|
558
|
+
|
|
559
|
+
**With a live agent** (requires `langchain-core` and `langgraph`):
|
|
467
560
|
|
|
468
561
|
```bash
|
|
469
562
|
pip install agent-failure-debugger[langchain] langgraph
|
|
@@ -493,14 +586,23 @@ graph = watch(workflow.compile(), auto_diagnose=True)
|
|
|
493
586
|
graph.invoke({"messages": [HumanMessage(content="What was Q3 revenue?")]})
|
|
494
587
|
```
|
|
495
588
|
|
|
589
|
+
Note: `watch()` with `FakeListLLM` demonstrates the callback integration but may not trigger failure patterns — the fake LLM produces no tool calls or user corrections. For failure detection examples, use `diagnose()` with the raw log above.
|
|
590
|
+
|
|
496
591
|
**Regression test examples:**
|
|
497
592
|
|
|
498
|
-
|
|
593
|
+
12 examples in [llm-failure-atlas](https://github.com/kiyoshisasano/llm-failure-atlas) under `examples/` (10 agent + 2 non-LLM). Each contains `log.json`, `matcher_output.json`, and `expected_debugger_output.json`.
|
|
499
594
|
|
|
500
595
|
```bash
|
|
501
596
|
python -m agent_failure_debugger.main matcher_output.json
|
|
502
597
|
```
|
|
503
598
|
|
|
599
|
+
**Multi-run analysis examples:**
|
|
600
|
+
|
|
601
|
+
2 examples in this repository under `examples/`. Each contains input fixtures, a runnable script, and `expected_output.json`:
|
|
602
|
+
|
|
603
|
+
- [termination_divergence](examples/termination_divergence/) — `diff_runs()` comparing silent exit vs error exit
|
|
604
|
+
- [multi_run_stability](examples/multi_run_stability/) — `compare_runs()` → `diff_runs()` two-step workflow
|
|
605
|
+
|
|
504
606
|
---
|
|
505
607
|
|
|
506
608
|
## Internals
|