agentdebugx 0.2.2__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/PKG-INFO +1 -1
  2. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/docs/23_status_v0_2.md +50 -9
  3. agentdebugx-0.2.4/docs/benchmarks/e2e_v0_2_3.md +373 -0
  4. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/pyproject.toml +1 -1
  5. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/src/agentdebug/__init__.py +14 -2
  6. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/src/agentdebug/analyzers.py +42 -12
  7. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/src/agentdebug/attribution.py +174 -0
  8. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/src/agentdebug/models.py +12 -0
  9. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/src/agentdebug/recorder.py +15 -1
  10. agentdebugx-0.2.4/src/agentdebug/recovery.py +314 -0
  11. agentdebugx-0.2.2/src/agentdebug/recovery.py +0 -113
  12. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/LICENSE +0 -0
  13. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/README.md +0 -0
  14. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/docs/00_overview.md +0 -0
  15. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/docs/01_literature_survey.md +0 -0
  16. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/docs/02_architecture.md +0 -0
  17. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/docs/03_taxonomy.md +0 -0
  18. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/docs/04_trace_schema.md +0 -0
  19. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/docs/05_adapters.md +0 -0
  20. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/docs/06_detectors.md +0 -0
  21. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/docs/07_attribution.md +0 -0
  22. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/docs/08_recovery.md +0 -0
  23. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/docs/09_error_database.md +0 -0
  24. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/docs/10_taxonomy_induction.md +0 -0
  25. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/docs/11_multimodal.md +0 -0
  26. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/docs/12_ui_dashboard.md +0 -0
  27. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/docs/13_class_design.md +0 -0
  28. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/docs/14_api_reference.md +0 -0
  29. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/docs/15_roadmap.md +0 -0
  30. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/docs/16_governance.md +0 -0
  31. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/docs/17_claude_code_design_patterns.md +0 -0
  32. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/docs/18_comparison_codex_vs_design.md +0 -0
  33. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/docs/19_error_hub.md +0 -0
  34. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/docs/20_deep_debug.md +0 -0
  35. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/docs/21_integrations.md +0 -0
  36. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/docs/22_industry_track_paper_eval_plan.md +0 -0
  37. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/docs/ERROR_TAXONOMY.md +0 -0
  38. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/docs/OPEN_SOURCE_DEVELOPMENT_PLAN.md +0 -0
  39. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/docs/README.md +0 -0
  40. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/docs/RESEARCH_SURVEY.md +0 -0
  41. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/docs/benchmarks/v0_1_smoke.json +0 -0
  42. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/docs/benchmarks/v0_1_smoke.md +0 -0
  43. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/src/agentdebug/adapters/__init__.py +0 -0
  44. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/src/agentdebug/adapters/base.py +0 -0
  45. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/src/agentdebug/adapters/langgraph.py +0 -0
  46. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/src/agentdebug/adapters/otel.py +0 -0
  47. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/src/agentdebug/adapters/raw.py +0 -0
  48. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/src/agentdebug/cli.py +0 -0
  49. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/src/agentdebug/deep.py +0 -0
  50. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/src/agentdebug/detectors.py +0 -0
  51. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/src/agentdebug/events.py +0 -0
  52. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/src/agentdebug/hub/__init__.py +0 -0
  53. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/src/agentdebug/hub/backend_base.py +0 -0
  54. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/src/agentdebug/hub/backends.py +0 -0
  55. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/src/agentdebug/hub/bundle.py +0 -0
  56. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/src/agentdebug/hub/scrub.py +0 -0
  57. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/src/agentdebug/instrumentation.py +0 -0
  58. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/src/agentdebug/integrations/__init__.py +0 -0
  59. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/src/agentdebug/integrations/claude_skill.py +0 -0
  60. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/src/agentdebug/integrations/openhands.py +0 -0
  61. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/src/agentdebug/judges.py +0 -0
  62. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/src/agentdebug/llm.py +0 -0
  63. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/src/agentdebug/storage.py +0 -0
  64. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/src/agentdebug/taxonomy.py +0 -0
  65. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/src/agentdebug/traceback.py +0 -0
  66. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/src/agentdebug/ui/__init__.py +0 -0
  67. {agentdebugx-0.2.2 → agentdebugx-0.2.4}/src/agentdebug/ui/server.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: agentdebugx
3
- Version: 0.2.2
3
+ Version: 0.2.4
4
4
  Summary: Portable error analysis, tracing, and recovery framework for agentic AI systems. Import as `agentdebug`.
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -1,4 +1,4 @@
1
- # 23 — Capability + Test Coverage Status (v0.2.2)
1
+ # 23 — Capability + Test Coverage Status (v0.2.3)
2
2
 
3
3
  A live audit of what's implemented, what's tested, and what's specced but
4
4
  not yet built. Pair this with [docs/15_roadmap.md](./15_roadmap.md), which is
@@ -20,7 +20,9 @@ the forward-looking plan; this doc is the rear-view mirror.
20
20
  | Attribution | `agentdebug.attribution.HeuristicAttributor` | ✅ stable | first-finding + tiebreak |
21
21
  | Attribution | `agentdebug.attribution.AllAtOnceAttributor` | ✅ stable | mocked LLM + fallback |
22
22
  | Attribution | `agentdebug.attribution.StepByStepAttributor` | ✅ **new 0.2.2** | scripted-LLM + fallback |
23
+ | Attribution | `agentdebug.attribution.BinarySearchAttributor` | ✅ **new 0.2.3** | oracle-LLM logarithmic convergence + fallback + render elision |
23
24
  | Recovery | `agentdebug.recovery.ReflexionSuggestion` | ✅ stable | per-finding + empty |
25
+ | Recovery | `agentdebug.recovery.CriticRecoverer` + `VerifierSpec` registry | ✅ **new 0.2.3** | 5 family-matched verifier templates; dedup + custom-override |
24
26
  | DeepDebug | `agentdebug.deep.DeepDebugAnalyzer` | ✅ stable | full loop + silent LLM |
25
27
  | Cascade view | `agentdebug.traceback.format_traceback` | ✅ stable | cascade + step-order + ANSI + empty |
26
28
  | Detectors | `agentdebug.detectors.RepeatedToolCall / RepeatedState / StepCountLimit` | ✅ **new 0.2.2** | threshold + window + budget |
@@ -45,13 +47,11 @@ across 32 source files.
45
47
  | [06_detectors.md](./06_detectors.md) | `trajectory_perplexity` (TrajAD) | needs token-level LM perplexity API or embedding model + baseline calibration | v0.3 |
46
48
  | [06_detectors.md](./06_detectors.md) | `topic_drift` (embedding cosine) | needs embedding client; consider reusing `OpenAICompatClient` `/embeddings` | v0.3 |
47
49
  | [06_detectors.md](./06_detectors.md) | LTL spec monitors | requires user-supplied spec or LLM-synthesized monitors; gated on RV research | v1.2 |
48
- | [07_attribution.md](./07_attribution.md) | `BinarySearchAttributor` (ddmin) | requires replayable environment; few frameworks expose it | v0.3 |
49
- | [07_attribution.md](./07_attribution.md) | `CounterfactualAttributor` | requires re-rolling agent actions; same replay constraint | v0.3 |
50
+ | [07_attribution.md](./07_attribution.md) | `CounterfactualAttributor` | requires re-rolling agent actions; framework-replay dependent | v0.3 |
50
51
  | [07_attribution.md](./07_attribution.md) | `SBFLAttributor` (Tarantula/Ochiai) | needs corpus of passing + failing traces of same task; gated on Hub adoption | v0.4 |
51
52
  | [07_attribution.md](./07_attribution.md) | `DeltaDebugAttributor` (Zeller) | same replay constraint | v0.3 |
52
- | [07_attribution.md](./07_attribution.md) | `EnsembleAttributor` | trivial once 2+ heavy backends ship; awaits BinarySearch/Counterfactual | v0.3 |
53
+ | [07_attribution.md](./07_attribution.md) | `EnsembleAttributor` | trivial once `CounterfactualAttributor` lands | v0.3 |
53
54
  | [08_recovery.md](./08_recovery.md) | `SelfRefineLoop` | small but needs a generator-critic-refiner orchestration | v0.3 |
54
- | [08_recovery.md](./08_recovery.md) | `CriticRecoverer` | needs a verifier registry (search, code-exec, type-check) | v0.3 |
55
55
  | [08_recovery.md](./08_recovery.md) | `AutoManualRules` | needs persistent project manual + injection into next-run prompts | v0.3 |
56
56
  | [08_recovery.md](./08_recovery.md) | `LangGraphRewind` | depends on LangGraph checkpointer; ships when we have a real LangGraph user | v0.3 |
57
57
  | [08_recovery.md](./08_recovery.md) | `SagaRollback` | needs compensation registry on tool definitions; new schema | v0.3 |
@@ -82,6 +82,40 @@ The audit found one real bug and a handful of test gaps:
82
82
  5. **`recovery.ReflexionSuggestion`** had only an indirect test from DeepDebug
83
83
  examples; now has direct happy + empty tests.
84
84
 
85
+ ## 3.6 Real-usage E2E (live Gemini)
86
+
87
+ Beyond unit tests, `scripts/e2e_real_usage.py` builds three realistic failing
88
+ trajectories using **only the public API** (`AgentDebug`, `traced_tool`,
89
+ `SQLiteTraceStore`) and runs the full pipeline against the live LLM.
90
+
91
+ Stage results (see [docs/benchmarks/e2e_v0_2_3.md](./benchmarks/e2e_v0_2_3.md)):
92
+
93
+ | Scenario | Stages OK |
94
+ |---|---|
95
+ | `action_format_then_hallucination` (planner → bad tool call → hallucinated answer) | 12 / 12 |
96
+ | `multiagent_handoff_loss` (researcher → handoff drops constraint → wrong summary) | 12 / 12 |
97
+ | `planning_loop` (browser clicks #submit 4× with no progress) | 12 / 12 |
98
+ | UI smoke (`/healthz`, `/api/v1/traces`, `/api/v1/traces/<id>`, `/api/v1/taxonomy`, `/`) | 5 / 5 |
99
+ | Fresh-venv `pip install agentdebugx==0.2.3` + import + CLI listing | ✅ |
100
+
101
+ **Honest issues the E2E surfaced** (none of these would have been caught by
102
+ the mocked unit tests):
103
+
104
+ 1. **LLM judge can return truncated JSON on long traces** — gemini-3-flash
105
+ spent its `max_tokens` budget on reasoning tokens before completing the
106
+ findings array; the pipeline gracefully returned 0 findings rather than
107
+ crashing. Mitigation: set per-call `max_tokens` to 6144 or higher; document the
108
+ thinking-token trap (done in [docs/20_deep_debug.md §7](./20_deep_debug.md)).
109
+ 2. **`BinarySearchAttributor` falls back to `HeuristicAttributor` when its
110
+ probe JSON is truncated** — observed in 2 of 3 scenarios. The fallback
111
+ chain works correctly, but the user loses the O(log N) advantage.
112
+ Follow-up: tighter bisection prompts; track the probe count in `result.raw['probe_count']`.
113
+ 3. **`HeuristicAnalyzer` returns `root_cause_step_index=None` when all
114
+ findings have `step_index=None`** — the event recorded via `traced_tool`
115
+ doesn't carry a step index. Real bug; `traced_tool` should auto-assign.
116
+
117
+ These are tracked as v0.2.4 fixes.
118
+
85
119
  ## 4. Coverage matrix (post-0.2.2)
86
120
 
87
121
  Run `PYTHONPATH=src pytest --cov=agentdebug --cov-report=term`. The two largest
@@ -97,10 +131,17 @@ remaining gaps are deliberate:
97
131
 
98
132
  Before v0.3 ships, this doc should record green checkmarks for:
99
133
 
100
- - [ ] One replayable counterfactual attributor (`BinarySearchAttributor` is
101
- the cheapest entry).
102
- - [ ] One tool-grounded recovery strategy (`CriticRecoverer`) wired against
103
- a `Verifier` Protocol.
134
+ - [x] **Logarithmic-cost attributor** (`BinarySearchAttributor`) shipped in
135
+ 0.2.3 — Who&When method 3, O(log N) LLM calls; bisects the trajectory
136
+ via prefix evaluation. **Note:** this is not yet a "replayable
137
+ counterfactual" attributor; it predicts whether the failure has
138
+ already occurred from the prefix without re-rolling the agent. True
139
+ counterfactual replay is still v0.3.
140
+ - [x] **Tool-grounded recovery strategy** (`CriticRecoverer` + `VerifierSpec`
141
+ registry) shipped in 0.2.3 — pattern-matches failure modes against 5
142
+ default verifier templates (JSON-schema guard, final-state check,
143
+ tool-result type-check, handoff contract, loop-detector guard) and
144
+ emits per-finding `FixProposal` with rationale + suggested code.
104
145
  - [ ] One additional framework adapter that goes through the full conformance
105
146
  suite (CrewAI is the most-requested).
106
147
  - [ ] HuggingFace Hub round-trip live test (gated on `HF_TOKEN`).
@@ -0,0 +1,373 @@
1
+ # AgentDebugX v0.2.3 End-to-End Real-Usage Smoke
2
+
3
+ Scenarios: **3**. LLM model: `gemini-3-flash`. Generated by `scripts/e2e_real_usage.py`.
4
+
5
+ ## Per-scenario pipeline status
6
+
7
+ | Scenario | trace_id | OK / Total stages | Failed stages |
8
+ |---|---|---|---|
9
+ | `action_format_then_hallucination` | `trace_f81860…` | 12 / 12 | — |
10
+ | `multiagent_handoff_loss` | `trace_45009e…` | 12 / 12 | — |
11
+ | `planning_loop` | `trace_3d1c98…` | 12 / 12 | — |
12
+
13
+ **UI smoke:** ✅ all endpoints responded
14
+
15
+ ```
16
+ GET /healthz -> 200 {"status":"ok"}
17
+ GET /api/v1/traces -> 3 trace(s)
18
+ GET /api/v1/traces/<id> -> 200 events=11 findings=4
19
+ GET /api/v1/taxonomy -> modes=19
20
+ GET / -> 200 content_length=33666 has_brand=True
21
+ ```
22
+
23
+ ## `action_format_then_hallucination`
24
+
25
+ `trace_id=trace_f81860758c6d439aaf1ecd7457de6654`
26
+
27
+ ### ✅ `heuristic_analyzer` (0.00s) — 1 finding(s); root=None
28
+ ### ✅ `cross_event_detectors` (0.00s) — 0 finding(s) from default_detectors()
29
+ ### ✅ `traceback_offline` (0.00s) — rendered
30
+
31
+ ```
32
+ AgentTraceback (root cause first, manifested failure last):
33
+ trace_id=trace_f81860758c6d439aaf1ecd7457de6654 framework=e2e-react goal='Find the latest AgentDebug paper, summarize the method, then email alice@example.com'
34
+
35
+ File "root cause", in trajectory
36
+ Step ? agent=search_web mode=system.tool_execution_error confidence=0.86
37
+ event_id=evt_582bbb55430a4be583ad6c374f7c1564
38
+ error> JSON schema validation failed: missing parameter query
39
+ evidence:
40
+ - JSON schema validation failed: missing parameter query
41
+ suggested: Capture tool stderr/status/latency and classify retryable versus non-retryable failures.
42
+
43
+ AgentFailure[system.tool_execution_error]: Likely root cause: Tool execution error in search_web at step None.
44
+ ```
45
+
46
+ ### ✅ `reflexion_suggestion` (0.00s) — 1 proposal(s)
47
+
48
+ ```
49
+ Reflexion retry hint for system.tool_execution_error at step None
50
+ ```
51
+
52
+ ### ✅ `critic_recoverer` (0.00s) — 1 verifier proposal(s)
53
+
54
+ ```
55
+ Add tool_result_typecheck before system.tool_execution_error (step None, agent search_web)
56
+ ```
57
+
58
+ ### ✅ `llm_judge` (27.97s) — 0 finding(s); root=None
59
+ ### ✅ `attribute_heuristic` (0.00s) — method=heuristic (no hypotheses)
60
+ ### ✅ `attribute_all_at_once` (6.27s) — method=all_at_once agent=search_web step=None conf=0.90
61
+
62
+ ```
63
+ The agent failed to provide the required 'query' parameter in the search tool call, which resulted in a validation error and prevented the agent from finding the paper.
64
+ ```
65
+
66
+ ### ✅ `attribute_step_by_step` (17.05s) — method=step_by_step agent=planner step=4 conf=1.00
67
+
68
+ ```
69
+ The agent prematurely terminates the task with a generic statement, failing to summarize the method or email the recipient as required by the goal.
70
+ ```
71
+
72
+ ### ✅ `attribute_binary_search` (9.48s) — method=binary_search agent=planner step=4 conf=0.90
73
+
74
+ ```
75
+ Binary search located the decisive step within 3 probes over 6 events.
76
+ ```
77
+
78
+ ### ✅ `deep_debug` (25.52s) — 3 finding(s); rounds=6
79
+
80
+ ```
81
+ rounds: plan:3794ms / hypothesize:7161ms / verify:h1:2551ms / verify:h2:2381ms / verify:h3:2097ms / refine:7534ms
82
+ summary: The agent failed to provide the required 'query' parameter for the search tool, and the planner subsequently misjudged the task as complete despite failing to find the paper, summarize it, or send the email.
83
+
84
+ AgentTraceback (root cause first, manifested failure last):
85
+ trace_id=trace_f81860758c6d439aaf1ecd7457de6654 framework=e2e-react goal='Find the latest AgentDebug paper, summarize the method, then email alice@example.com'
86
+
87
+ File "root cause", in trajectory
88
+ Step ? agent=search_web mode=action.parameter_error confidence=1.00
89
+ event_id=evt_582bbb55430a4be583ad6c374f7c1564
90
+ error> JSON schema validation failed: missing parameter query
91
+ evidence:
92
+ - JSON schema validation failed: missing parameter query
93
+ - args': '()', 'kwargs': '{}'
94
+ suggested: Validate parameters against tool schemas and ask for missing user/context fields.
95
+ ↓ cascaded to
96
+ File "cascade depth 1", in trajectory
97
+ Step 4 agent=planner mode=reflection.progress_misjudge confidence=1.00
98
+ module=reflection
99
+ event_id=evt_047e5ad596874186ac8d4413b8ba8185
100
+ output> Final answer: AgentDebug is a popular paper. Done.
101
+ evidence:
102
+ - Final answer: AgentDebug is a popular paper. Done.
103
+ - error=JSON schema validation failed: missing parameter query
104
+ suggested: Add an external task verifier before termination.
105
+ ↓ cascaded to
106
+ File "cascade depth 2", in trajectory
107
+ Step ? agent=system mode=verification.missing_task_validation confidence=1.00
108
+ event_id=evt_0e34f3f892664015b10bba11ed2ac3dd
109
+ evidence:
110
+ - meta={'success': True}
111
+ - Final answer: AgentDebug is a popular paper. Done.
112
+ suggested: Add final-state validation that is independent of the acting agent.
113
+
114
+ AgentFailure[verification.missing_task_validation]: The agent failed to provide the required 'query' parameter for the search tool, and the planner subsequently misjudged the task as complete despite failing to find the paper, summarize it, or send the email.
115
+ ```
116
+
117
+ ### ✅ `hub_round_trip` (0.01s) — pushed=/home/kunlunz2/AgentDebugX/.agentdebug/e2e_hub/bundle_b8fa2127c001463d81c86c8c03e4002a ; bundle_id=bundle_b8fa2127c001463d81c86c8c03e4002a ; listed=1 ; round-trip ok
118
+
119
+ ## `multiagent_handoff_loss`
120
+
121
+ `trace_id=trace_45009e26b64341e69af395c4d4cabc07`
122
+
123
+ ### ✅ `heuristic_analyzer` (0.00s) — 1 finding(s); root=2
124
+ ### ✅ `cross_event_detectors` (0.00s) — 0 finding(s) from default_detectors()
125
+ ### ✅ `traceback_offline` (0.00s) — rendered
126
+
127
+ ```
128
+ AgentTraceback (root cause first, manifested failure last):
129
+ trace_id=trace_45009e26b64341e69af395c4d4cabc07 framework=e2e-multiagent goal='Find the best paper on agent debugging, prefer the most recent.'
130
+
131
+ File "root cause", in trajectory
132
+ Step 2 agent=researcher mode=multiagent.handoff_loss confidence=0.70
133
+ module=multiagent
134
+ event_id=evt_09730d2f195349639b671b8278d0202c
135
+ output> Please summarize the agent debugging paper.
136
+ evidence:
137
+ - handoff/context signal in event payload
138
+ suggested: Make handoff payloads typed and include goal, constraints, evidence, confidence, and open questions.
139
+
140
+ AgentFailure[multiagent.handoff_loss]: Likely root cause: Handoff context loss in researcher at step 2.
141
+ ```
142
+
143
+ ### ✅ `reflexion_suggestion` (0.00s) — 1 proposal(s)
144
+
145
+ ```
146
+ Reflexion retry hint for multiagent.handoff_loss at step 2
147
+ ```
148
+
149
+ ### ✅ `critic_recoverer` (0.00s) — 1 verifier proposal(s)
150
+
151
+ ```
152
+ Add handoff_context_contract before multiagent.handoff_loss (step 2, agent researcher)
153
+ ```
154
+
155
+ ### ✅ `llm_judge` (8.01s) — 2 finding(s); root=2
156
+
157
+ ```
158
+ - multiagent.handoff_loss (conf=1.00) step=2 agent=researcher
159
+ - verification.missing_task_validation (conf=0.90) step=None agent=system
160
+ ```
161
+
162
+ ### ✅ `attribute_heuristic` (0.00s) — method=heuristic agent=researcher step=2 conf=1.00
163
+
164
+ ```
165
+ Earliest finding with non-trivial confidence: Handoff context loss
166
+ ```
167
+
168
+ ### ✅ `attribute_all_at_once` (3.30s) — method=all_at_once agent=researcher step=2 conf=1.00
169
+
170
+ ```
171
+ The researcher correctly identified Paper A as the most recent in step 1 but failed to communicate this specific choice or the recency constraint during the handoff in step 2, leading the summarizer to pick the wrong paper.
172
+ ```
173
+
174
+ ### ✅ `attribute_step_by_step` (20.70s) — method=step_by_step agent=researcher step=2 conf=1.00
175
+
176
+ ```
177
+ The researcher agent identified the correct paper in the previous step but failed to deliver the result, instead initiating an unnecessary handoff for summarization.
178
+ ```
179
+
180
+ ### ✅ `attribute_binary_search` (7.03s) — method=heuristic agent=researcher step=2 conf=1.00
181
+
182
+ ```
183
+ Earliest finding with non-trivial confidence: Handoff context loss
184
+ ```
185
+
186
+ ### ✅ `deep_debug` (30.30s) — 3 finding(s); rounds=6
187
+
188
+ ```
189
+ rounds: plan:3749ms / hypothesize:8433ms / verify:h1:3764ms / verify:h2:2344ms / verify:h3:4481ms / refine:7526ms
190
+ summary: The researcher agent hallucinated paper candidates without performing a search and subsequently failed to communicate the user's recency constraints and the selected paper to the summarizer, leading to an incorrect final output.
191
+
192
+ AgentTraceback (root cause first, manifested failure last):
193
+ trace_id=trace_45009e26b64341e69af395c4d4cabc07 framework=e2e-multiagent goal='Find the best paper on agent debugging, prefer the most recent.'
194
+
195
+ File "root cause", in trajectory
196
+ Step 1 agent=researcher mode=memory.hallucination confidence=0.95
197
+ module=planning
198
+ event_id=evt_1f436c0e08534b579faba36acb2e6703
199
+ output> Found two candidate papers: A (May 2025) and B (Mar 2024). A is preferred because it is more recent (per user constraint).
200
+ evidence:
201
+ - Found two candidate papers: A (May 2025) and B (Mar 2024)
202
+ - without any apparent search or data gathering steps
203
+ suggested: Require memory reads to cite the source event or artifact before use.
204
+ ↓ cascaded to
205
+ File "cascade depth 1", in trajectory
206
+ Step 2 agent=researcher mode=multiagent.handoff_loss confidence=1.00
207
+ module=multiagent
208
+ event_id=evt_09730d2f195349639b671b8278d0202c
209
+ output> Please summarize the agent debugging paper.
210
+ evidence:
211
+ - Please summarize the agent debugging paper.
212
+ - omitted_context: 'preference for A; recency constraint'
213
+ suggested: Make handoff payloads typed and include goal, constraints, evidence, confidence, and open questions.
214
+ ↓ cascaded to
215
+ File "cascade depth 2", in trajectory
216
+ Step 4 agent=summarizer mode=reflection.progress_misjudge confidence=0.90
217
+ module=reflection
218
+ event_id=evt_e7e8080c4dde47ef88c96dc0db743023
219
+ output> I summarized paper B (the only one I knew about).
220
+ evidence:
221
+ - I summarized paper B (the only one I knew about).
222
+ - prefer the most recent
223
+ suggested: Add an external task verifier before termination.
224
+
225
+ AgentFailure[reflection.progress_misjudge]: The researcher agent hallucinated paper candidates without performing a search and subsequently failed to communicate the user's recency constraints and the selected paper to the summarizer, leading to an incorrect final output.
226
+ ```
227
+
228
+ ### ✅ `hub_round_trip` (0.01s) — pushed=/home/kunlunz2/AgentDebugX/.agentdebug/e2e_hub/bundle_6c474a1056074785a0275407f24fedc1 ; bundle_id=bundle_6c474a1056074785a0275407f24fedc1 ; listed=3 ; round-trip ok
229
+
230
+ ## `planning_loop`
231
+
232
+ `trace_id=trace_3d1c98d2424a4c05ae104b942fe0a302`
233
+
234
+ ### ✅ `heuristic_analyzer` (0.00s) — 4 finding(s); root=None
235
+ ### ✅ `cross_event_detectors` (0.00s) — 3 finding(s) from default_detectors()
236
+
237
+ ```
238
+ - planning.inefficient_plan (source=repeated_tool_call)
239
+ - planning.inefficient_plan (source=repeated_state)
240
+ - planning.inefficient_plan (source=repeated_state)
241
+ ```
242
+
243
+ ### ✅ `traceback_offline` (0.00s) — rendered
244
+
245
+ ```
246
+ AgentTraceback (root cause first, manifested failure last):
247
+ trace_id=trace_3d1c98d2424a4c05ae104b942fe0a302 framework=e2e-browser goal='Submit the checkout form on shop.example.com'
248
+
249
+ File "root cause", in trajectory
250
+ Step ? agent=browser mode=planning.inefficient_plan confidence=0.67
251
+ event_id=evt_3f474425ad3841b886240a70ec694fa5
252
+ output> no progress; same checkout screen
253
+ evidence:
254
+ - loop/progress signal in event payload
255
+ suggested: Add loop detection over tool calls and state deltas.
256
+ ↓ cascaded to
257
+ File "cascade depth 1", in trajectory
258
+ Step ? agent=browser mode=planning.inefficient_plan confidence=0.67
259
+ event_id=evt_bce9260a3a024f89ac18430ac2f660ef
260
+ output> no progress; same checkout screen
261
+ evidence:
262
+ - loop/progress signal in event payload
263
+ suggested: Add loop detection over tool calls and state deltas.
264
+ ↓ cascaded to
265
+ File "cascade depth 2", in trajectory
266
+ Step ? agent=browser mode=planning.inefficient_plan confidence=0.67
267
+ event_id=evt_45dc8858899546ff8159af3ce4f8d6dd
268
+ output> no progress; same checkout screen
269
+ evidence:
270
+ - loop/progress signal in event payload
271
+ suggested: Add loop detection over tool calls and state deltas.
272
+ ↓ cascaded to
273
+ File "cascade depth 3", in trajectory
274
+ Step ? agent=browser mode=planning.inefficient_plan confidence=0.67
275
+ event_id=evt_90f10dde040341dcb9434192fba3255d
276
+ output> no progress; same checkout screen
277
+ evidence:
278
+ - loop/progress signal in event payload
279
+ suggested: Add loop detection over tool calls and state deltas.
280
+
281
+ AgentFailure[planning.inefficient_plan]: Likely root cause: Inefficient plan in browser at step None.
282
+ ```
283
+
284
+ ### ✅ `reflexion_suggestion` (0.00s) — 4 proposal(s)
285
+
286
+ ```
287
+ Reflexion retry hint for planning.inefficient_plan at step None
288
+ Reflexion retry hint for planning.inefficient_plan at step None
289
+ Reflexion retry hint for planning.inefficient_plan at step None
290
+ Reflexion retry hint for planning.inefficient_plan at step None
291
+ ```
292
+
293
+ ### ✅ `critic_recoverer` (0.00s) — 4 verifier proposal(s)
294
+
295
+ ```
296
+ Add loop_detector_guard before planning.inefficient_plan (step None, agent browser)
297
+ Add loop_detector_guard before planning.inefficient_plan (step None, agent browser)
298
+ Add loop_detector_guard before planning.inefficient_plan (step None, agent browser)
299
+ Add loop_detector_guard before planning.inefficient_plan (step None, agent browser)
300
+ ```
301
+
302
+ ### ✅ `llm_judge` (19.00s) — 2 finding(s); root=1
303
+
304
+ ```
305
+ - planning.inefficient_plan (conf=0.90) step=1 agent=planner
306
+ - reflection.progress_misjudge (conf=1.00) step=None agent=system
307
+ ```
308
+
309
+ ### ✅ `attribute_heuristic` (0.00s) — method=heuristic agent=planner step=1 conf=0.90
310
+
311
+ ```
312
+ Earliest finding with non-trivial confidence: Inefficient plan
313
+ ```
314
+
315
+ ### ✅ `attribute_all_at_once` (5.44s) — method=all_at_once agent=planner step=1 conf=0.90
316
+
317
+ ```
318
+ The planner's initial strategy was fundamentally flawed, instructing the agent to repeatedly click a button without verifying form completion or handling errors, which led to the failure.
319
+ ```
320
+
321
+ ### ✅ `attribute_step_by_step` (50.07s) — method=step_by_step agent=planner step=1 conf=0.90
322
+
323
+ ```
324
+ The planner proposed a brute-force clicking strategy without accounting for the necessary form-filling steps required for a checkout process.
325
+ ```
326
+
327
+ ### ✅ `attribute_binary_search` (6.06s) — method=heuristic agent=planner step=1 conf=0.90
328
+
329
+ ```
330
+ Earliest finding with non-trivial confidence: Inefficient plan
331
+ ```
332
+
333
+ ### ✅ `deep_debug` (38.24s) — 3 finding(s); rounds=6
334
+
335
+ ```
336
+ rounds: plan:4530ms / hypothesize:12830ms / verify:h1:4850ms / verify:h2:4577ms / verify:h3:4967ms / refine:6484ms
337
+ summary: The checkout submission failed because the planner ignored mandatory form fields and instead devised a strategy to repeatedly click the submit button, which the browser executed without success.
338
+
339
+ AgentTraceback (root cause first, manifested failure last):
340
+ trace_id=trace_3d1c98d2424a4c05ae104b942fe0a302 framework=e2e-browser goal='Submit the checkout form on shop.example.com'
341
+
342
+ File "root cause", in trajectory
343
+ Step 1 agent=planner mode=planning.constraint_ignorance confidence=1.00
344
+ module=planning
345
+ event_id=evt_b1d43814c6b642079ba7015d13b140c5
346
+ output> Strategy: click #submit until success
347
+ evidence:
348
+ - Strategy: click #submit until success
349
+ suggested: Compile task and tool constraints into pre-action checks.
350
+ ↓ cascaded to
351
+ File "cascade depth 1", in trajectory
352
+ Step ? agent=browser mode=planning.inefficient_plan confidence=0.95
353
+ event_id=evt_3ceba7d898d04e5a98f463ea8f9c8e72
354
+ input> {'tool': 'click', 'args': '()', 'kwargs': "{'selector': '#submit'}"}
355
+ evidence:
356
+ - click
357
+ - {'selector': '#submit'}
358
+ - no progress; same checkout screen
359
+ suggested: Add loop detection over tool calls and state deltas.
360
+ ↓ cascaded to
361
+ File "cascade depth 2", in trajectory
362
+ Step ? agent=browser mode=reflection.progress_misjudge confidence=0.90
363
+ event_id=evt_90f10dde040341dcb9434192fba3255d
364
+ output> no progress; same checkout screen
365
+ evidence:
366
+ - no progress; same checkout screen
367
+ - meta={'success': True}
368
+ suggested: Add an external task verifier before termination.
369
+
370
+ AgentFailure[reflection.progress_misjudge]: The checkout submission failed because the planner ignored mandatory form fields and instead devised a strategy to repeatedly click the submit button, which the browser executed without success.
371
+ ```
372
+
373
+ ### ✅ `hub_round_trip` (0.01s) — pushed=/home/kunlunz2/AgentDebugX/.agentdebug/e2e_hub/bundle_15650f55244b4ec98adf6ef0042496ec ; bundle_id=bundle_15650f55244b4ec98adf6ef0042496ec ; listed=4 ; round-trip ok
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "agentdebugx"
3
- version = "0.2.2"
3
+ version = "0.2.4"
4
4
  description = "Portable error analysis, tracing, and recovery framework for agentic AI systems. Import as `agentdebug`."
5
5
  authors = ["ULab @ UIUC <ulab@illinois.edu>"]
6
6
  license = "MIT"
@@ -13,6 +13,7 @@ from agentdebug.attribution import (
13
13
  AllAtOnceAttributor,
14
14
  AttributionResult,
15
15
  Attributor,
16
+ BinarySearchAttributor,
16
17
  Blame,
17
18
  HeuristicAttributor,
18
19
  StepByStepAttributor,
@@ -38,7 +39,14 @@ from agentdebug.models import (
38
39
  Modality,
39
40
  )
40
41
  from agentdebug.recorder import AgentDebug, TraceSession
41
- from agentdebug.recovery import FixProposal, Recoverer, ReflexionSuggestion
42
+ from agentdebug.recovery import (
43
+ DEFAULT_VERIFIERS,
44
+ CriticRecoverer,
45
+ FixProposal,
46
+ Recoverer,
47
+ ReflexionSuggestion,
48
+ VerifierSpec,
49
+ )
42
50
  from agentdebug.traceback import CascadeFrame, build_cascade, format_traceback
43
51
  from agentdebug.storage import JsonlTraceStore, SQLiteTraceStore
44
52
  from agentdebug.taxonomy import SEED_FAILURE_MODES, get_failure_mode
@@ -53,13 +61,17 @@ __all__ = [
53
61
  'Attributor',
54
62
  'Blame',
55
63
  'BusEvent',
64
+ 'BinarySearchAttributor',
56
65
  'CascadeFrame',
66
+ 'CriticRecoverer',
67
+ 'DEFAULT_VERIFIERS',
57
68
  'Detector',
58
69
  'DetectorConfig',
59
70
  'RepeatedStateDetector',
60
71
  'RepeatedToolCallDetector',
61
72
  'StepByStepAttributor',
62
73
  'StepCountLimitDetector',
74
+ 'VerifierSpec',
63
75
  'build_cascade',
64
76
  'default_detectors',
65
77
  'format_traceback',
@@ -84,4 +96,4 @@ __all__ = [
84
96
  'get_failure_mode',
85
97
  ]
86
98
 
87
- __version__ = '0.2.2'
99
+ __version__ = '0.2.4'
@@ -25,7 +25,11 @@ class HeuristicAnalyzer:
25
25
 
26
26
  def analyze(self, trajectory: AgentTrajectory) -> DiagnosticReport:
27
27
  findings = [finding for event in trajectory.events for finding in self._event_findings(event)]
28
- root = self._select_root_cause(findings)
28
+ # Build event-order map so the root selector can fall back to it when
29
+ # findings lack step_index (e.g., events recorded via traced_tool on
30
+ # captures made before 0.2.4, which pre-date the auto-counter).
31
+ event_order = {evt.event_id: i for i, evt in enumerate(trajectory.events)}
32
+ root = self._select_root_cause(findings, event_order=event_order)
29
33
  suggestions = self._dedupe(
30
34
  finding.suggestion for finding in findings if finding.suggestion is not None
31
35
  )
@@ -40,11 +44,19 @@ class HeuristicAnalyzer:
40
44
  if root is not None:
41
45
  report.root_cause_event_id = root.event_id
42
46
  report.root_cause_agent = root.agent_name
43
- report.root_cause_step_index = root.step_index
47
+ # If the finding lacked an explicit step_index, fall back to the
48
+ # event's position so root_cause_step_index is non-null in the UI
49
+ # and the AgentTraceback header reads sensibly.
50
+ inferred_step = root.step_index
51
+ if inferred_step is None and root.event_id is not None:
52
+ pos = event_order.get(root.event_id)
53
+ if pos is not None:
54
+ inferred_step = pos
55
+ report.root_cause_step_index = inferred_step
44
56
  report.summary = (
45
57
  f'Likely root cause: {root.failure_mode.name}'
46
58
  f' in {root.agent_name or "unknown agent"}'
47
- f' at step {root.step_index}.'
59
+ f' at step {inferred_step if inferred_step is not None else "?"}.'
48
60
  )
49
61
  return report
50
62
 
@@ -106,17 +118,35 @@ class HeuristicAnalyzer:
106
118
  ) -> Tuple[FailureMode, float, List[str]]:
107
119
  return SEED_FAILURE_MODES[mode_id], confidence, evidence
108
120
 
109
- def _select_root_cause(self, findings: List[FailureFinding]) -> Optional[FailureFinding]:
121
+ def _select_root_cause(
122
+ self,
123
+ findings: List[FailureFinding],
124
+ *,
125
+ event_order: Optional[dict[str, int]] = None,
126
+ ) -> Optional[FailureFinding]:
110
127
  if not findings:
111
128
  return None
112
- return sorted(
113
- findings,
114
- key=lambda finding: (
115
- finding.step_index is None,
116
- finding.step_index if finding.step_index is not None else 10**9,
117
- -finding.confidence,
118
- ),
119
- )[0]
129
+ # Primary key: step_index (None pushed to end).
130
+ # Fallback: among findings whose step_index is None, use the event's position in the
131
+ # trajectory so we still pick the *earliest* finding rather than just
132
+ # the highest-confidence one.
133
+ event_order = event_order or {}
134
+
135
+ def _key(f: FailureFinding) -> tuple[int, int, int, float]:
136
+ step = f.step_index if f.step_index is not None else 10**9
137
+ order = (
138
+ event_order.get(f.event_id, 10**9)
139
+ if f.event_id is not None and f.step_index is None
140
+ else 10**9
141
+ )
142
+ return (
143
+ 0 if f.step_index is not None else 1,
144
+ step,
145
+ order,
146
+ -f.confidence,
147
+ )
148
+
149
+ return sorted(findings, key=_key)[0]
120
150
 
121
151
  def _event_text(self, event: AgentEvent) -> str:
122
152
  parts = [