agentdebugx 0.2.3__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/PKG-INFO +1 -1
  2. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/23_status_v0_2.md +34 -0
  3. agentdebugx-0.2.4/docs/benchmarks/e2e_v0_2_3.md +373 -0
  4. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/pyproject.toml +1 -1
  5. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/__init__.py +1 -1
  6. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/analyzers.py +42 -12
  7. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/attribution.py +11 -8
  8. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/recorder.py +15 -1
  9. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/LICENSE +0 -0
  10. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/README.md +0 -0
  11. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/00_overview.md +0 -0
  12. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/01_literature_survey.md +0 -0
  13. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/02_architecture.md +0 -0
  14. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/03_taxonomy.md +0 -0
  15. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/04_trace_schema.md +0 -0
  16. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/05_adapters.md +0 -0
  17. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/06_detectors.md +0 -0
  18. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/07_attribution.md +0 -0
  19. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/08_recovery.md +0 -0
  20. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/09_error_database.md +0 -0
  21. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/10_taxonomy_induction.md +0 -0
  22. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/11_multimodal.md +0 -0
  23. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/12_ui_dashboard.md +0 -0
  24. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/13_class_design.md +0 -0
  25. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/14_api_reference.md +0 -0
  26. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/15_roadmap.md +0 -0
  27. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/16_governance.md +0 -0
  28. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/17_claude_code_design_patterns.md +0 -0
  29. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/18_comparison_codex_vs_design.md +0 -0
  30. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/19_error_hub.md +0 -0
  31. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/20_deep_debug.md +0 -0
  32. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/21_integrations.md +0 -0
  33. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/22_industry_track_paper_eval_plan.md +0 -0
  34. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/ERROR_TAXONOMY.md +0 -0
  35. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/OPEN_SOURCE_DEVELOPMENT_PLAN.md +0 -0
  36. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/README.md +0 -0
  37. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/RESEARCH_SURVEY.md +0 -0
  38. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/benchmarks/v0_1_smoke.json +0 -0
  39. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/benchmarks/v0_1_smoke.md +0 -0
  40. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/adapters/__init__.py +0 -0
  41. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/adapters/base.py +0 -0
  42. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/adapters/langgraph.py +0 -0
  43. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/adapters/otel.py +0 -0
  44. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/adapters/raw.py +0 -0
  45. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/cli.py +0 -0
  46. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/deep.py +0 -0
  47. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/detectors.py +0 -0
  48. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/events.py +0 -0
  49. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/hub/__init__.py +0 -0
  50. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/hub/backend_base.py +0 -0
  51. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/hub/backends.py +0 -0
  52. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/hub/bundle.py +0 -0
  53. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/hub/scrub.py +0 -0
  54. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/instrumentation.py +0 -0
  55. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/integrations/__init__.py +0 -0
  56. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/integrations/claude_skill.py +0 -0
  57. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/integrations/openhands.py +0 -0
  58. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/judges.py +0 -0
  59. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/llm.py +0 -0
  60. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/models.py +0 -0
  61. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/recovery.py +0 -0
  62. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/storage.py +0 -0
  63. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/taxonomy.py +0 -0
  64. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/traceback.py +0 -0
  65. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/ui/__init__.py +0 -0
  66. {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/ui/server.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: agentdebugx
3
- Version: 0.2.3
3
+ Version: 0.2.4
4
4
  Summary: Portable error analysis, tracing, and recovery framework for agentic AI systems. Import as `agentdebug`.
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -82,6 +82,40 @@ The audit found one real bug and a handful of test gaps:
82
82
  5. **`recovery.ReflexionSuggestion`** had only an indirect test from DeepDebug
83
83
  examples; now has direct happy + empty tests.
84
84
 
85
+ ## 3.6 Real-usage E2E (live Gemini)
86
+
87
+ Beyond unit tests, `scripts/e2e_real_usage.py` builds three realistic failing
88
+ trajectories using **only the public API** (`AgentDebug`, `traced_tool`,
89
+ `SQLiteTraceStore`) and runs the full pipeline against the live LLM.
90
+
91
+ Stage results (see [docs/benchmarks/e2e_v0_2_3.md](./benchmarks/e2e_v0_2_3.md)):
92
+
93
+ | Scenario | Stages OK |
94
+ |---|---|
95
+ | `action_format_then_hallucination` (planner → bad tool call → hallucinated answer) | 12 / 12 |
96
+ | `multiagent_handoff_loss` (researcher → handoff drops constraint → wrong summary) | 12 / 12 |
97
+ | `planning_loop` (browser clicks #submit 4× with no progress) | 12 / 12 |
98
+ | UI smoke (`/healthz`, `/api/v1/traces`, `/api/v1/traces/<id>`, `/api/v1/taxonomy`, `/`) | 5 / 5 |
99
+ | Fresh-venv `pip install agentdebugx==0.2.3` + import + CLI listing | ✅ |
100
+
101
+ **Honest issues the E2E surfaced** (none of these would have been caught by
102
+ the mocked unit tests):
103
+
104
+ 1. **LLM judge can return truncated JSON on long traces** — gemini-3-flash
105
+ spent its `max_tokens` budget on reasoning tokens before completing the
106
+ findings array; the pipeline gracefully returned 0 findings rather than
107
+ crashing. Mitigation: set per-call `max_tokens` to 6144 or higher, and document the
108
+ thinking-token trap (done in [docs/20_deep_debug.md §7](./20_deep_debug.md)).
109
+ 2. **`BinarySearchAttributor` falls back to `HeuristicAttributor` when its
110
+ probe JSON is truncated** — observed in 2 of 3 scenarios. The fallback
111
+ chain works correctly, but the user loses the O(log N) advantage.
112
+ Follow-up: tighten the bisection prompts and track the probe count in `result.raw['probe_count']`.
113
+ 3. **`HeuristicAnalyzer` returns `root_cause_step_index=None` when all
114
+ findings have `step_index=None`** — the event recorded via `traced_tool`
115
+ doesn't carry a step index. Real bug; `traced_tool` should auto-assign one.
116
+
117
+ These are tracked as v0.2.4 fixes.
118
+
85
119
  ## 4. Coverage matrix (post-0.2.2)
86
120
 
87
121
  Run `PYTHONPATH=src pytest --cov=agentdebug --cov-report=term`. The two largest
@@ -0,0 +1,373 @@
1
+ # AgentDebugX v0.2.3 End-to-End Real-Usage Smoke
2
+
3
+ Scenarios: **3**. LLM model: `gemini-3-flash`. Generated by `scripts/e2e_real_usage.py`.
4
+
5
+ ## Per-scenario pipeline status
6
+
7
+ | Scenario | trace_id | OK / Total stages | Failed stages |
8
+ |---|---|---|---|
9
+ | `action_format_then_hallucination` | `trace_f81860…` | 12 / 12 | — |
10
+ | `multiagent_handoff_loss` | `trace_45009e…` | 12 / 12 | — |
11
+ | `planning_loop` | `trace_3d1c98…` | 12 / 12 | — |
12
+
13
+ **UI smoke:** ✅ all endpoints responded
14
+
15
+ ```
16
+ GET /healthz -> 200 {"status":"ok"}
17
+ GET /api/v1/traces -> 3 trace(s)
18
+ GET /api/v1/traces/<id> -> 200 events=11 findings=4
19
+ GET /api/v1/taxonomy -> modes=19
20
+ GET / -> 200 content_length=33666 has_brand=True
21
+ ```
22
+
23
+ ## `action_format_then_hallucination`
24
+
25
+ `trace_id=trace_f81860758c6d439aaf1ecd7457de6654`
26
+
27
+ ### ✅ `heuristic_analyzer` (0.00s) — 1 finding(s); root=None
28
+ ### ✅ `cross_event_detectors` (0.00s) — 0 finding(s) from default_detectors()
29
+ ### ✅ `traceback_offline` (0.00s) — rendered
30
+
31
+ ```
32
+ AgentTraceback (root cause first, manifested failure last):
33
+ trace_id=trace_f81860758c6d439aaf1ecd7457de6654 framework=e2e-react goal='Find the latest AgentDebug paper, summarize the method, then email alice@example.com'
34
+
35
+ File "root cause", in trajectory
36
+ Step ? agent=search_web mode=system.tool_execution_error confidence=0.86
37
+ event_id=evt_582bbb55430a4be583ad6c374f7c1564
38
+ error> JSON schema validation failed: missing parameter query
39
+ evidence:
40
+ - JSON schema validation failed: missing parameter query
41
+ suggested: Capture tool stderr/status/latency and classify retryable versus non-retryable failures.
42
+
43
+ AgentFailure[system.tool_execution_error]: Likely root cause: Tool execution error in search_web at step None.
44
+ ```
45
+
46
+ ### ✅ `reflexion_suggestion` (0.00s) — 1 proposal(s)
47
+
48
+ ```
49
+ Reflexion retry hint for system.tool_execution_error at step None
50
+ ```
51
+
52
+ ### ✅ `critic_recoverer` (0.00s) — 1 verifier proposal(s)
53
+
54
+ ```
55
+ Add tool_result_typecheck before system.tool_execution_error (step None, agent search_web)
56
+ ```
57
+
58
+ ### ✅ `llm_judge` (27.97s) — 0 finding(s); root=None
59
+ ### ✅ `attribute_heuristic` (0.00s) — method=heuristic (no hypotheses)
60
+ ### ✅ `attribute_all_at_once` (6.27s) — method=all_at_once agent=search_web step=None conf=0.90
61
+
62
+ ```
63
+ The agent failed to provide the required 'query' parameter in the search tool call, which resulted in a validation error and prevented the agent from finding the paper.
64
+ ```
65
+
66
+ ### ✅ `attribute_step_by_step` (17.05s) — method=step_by_step agent=planner step=4 conf=1.00
67
+
68
+ ```
69
+ The agent prematurely terminates the task with a generic statement, failing to summarize the method or email the recipient as required by the goal.
70
+ ```
71
+
72
+ ### ✅ `attribute_binary_search` (9.48s) — method=binary_search agent=planner step=4 conf=0.90
73
+
74
+ ```
75
+ Binary search located the decisive step within 3 probes over 6 events.
76
+ ```
77
+
78
+ ### ✅ `deep_debug` (25.52s) — 3 finding(s); rounds=6
79
+
80
+ ```
81
+ rounds: plan:3794ms / hypothesize:7161ms / verify:h1:2551ms / verify:h2:2381ms / verify:h3:2097ms / refine:7534ms
82
+ summary: The agent failed to provide the required 'query' parameter for the search tool, and the planner subsequently misjudged the task as complete despite failing to find the paper, summarize it, or send the email.
83
+
84
+ AgentTraceback (root cause first, manifested failure last):
85
+ trace_id=trace_f81860758c6d439aaf1ecd7457de6654 framework=e2e-react goal='Find the latest AgentDebug paper, summarize the method, then email alice@example.com'
86
+
87
+ File "root cause", in trajectory
88
+ Step ? agent=search_web mode=action.parameter_error confidence=1.00
89
+ event_id=evt_582bbb55430a4be583ad6c374f7c1564
90
+ error> JSON schema validation failed: missing parameter query
91
+ evidence:
92
+ - JSON schema validation failed: missing parameter query
93
+ - args': '()', 'kwargs': '{}'
94
+ suggested: Validate parameters against tool schemas and ask for missing user/context fields.
95
+ ↓ cascaded to
96
+ File "cascade depth 1", in trajectory
97
+ Step 4 agent=planner mode=reflection.progress_misjudge confidence=1.00
98
+ module=reflection
99
+ event_id=evt_047e5ad596874186ac8d4413b8ba8185
100
+ output> Final answer: AgentDebug is a popular paper. Done.
101
+ evidence:
102
+ - Final answer: AgentDebug is a popular paper. Done.
103
+ - error=JSON schema validation failed: missing parameter query
104
+ suggested: Add an external task verifier before termination.
105
+ ↓ cascaded to
106
+ File "cascade depth 2", in trajectory
107
+ Step ? agent=system mode=verification.missing_task_validation confidence=1.00
108
+ event_id=evt_0e34f3f892664015b10bba11ed2ac3dd
109
+ evidence:
110
+ - meta={'success': True}
111
+ - Final answer: AgentDebug is a popular paper. Done.
112
+ suggested: Add final-state validation that is independent of the acting agent.
113
+
114
+ AgentFailure[verification.missing_task_validation]: The agent failed to provide the required 'query' parameter for the search tool, and the planner subsequently misjudged the task as complete despite failing to find the paper, summarize it, or send the email.
115
+ ```
116
+
117
+ ### ✅ `hub_round_trip` (0.01s) — pushed=/home/kunlunz2/AgentDebugX/.agentdebug/e2e_hub/bundle_b8fa2127c001463d81c86c8c03e4002a ; bundle_id=bundle_b8fa2127c001463d81c86c8c03e4002a ; listed=1 ; round-trip ok
118
+
119
+ ## `multiagent_handoff_loss`
120
+
121
+ `trace_id=trace_45009e26b64341e69af395c4d4cabc07`
122
+
123
+ ### ✅ `heuristic_analyzer` (0.00s) — 1 finding(s); root=2
124
+ ### ✅ `cross_event_detectors` (0.00s) — 0 finding(s) from default_detectors()
125
+ ### ✅ `traceback_offline` (0.00s) — rendered
126
+
127
+ ```
128
+ AgentTraceback (root cause first, manifested failure last):
129
+ trace_id=trace_45009e26b64341e69af395c4d4cabc07 framework=e2e-multiagent goal='Find the best paper on agent debugging, prefer the most recent.'
130
+
131
+ File "root cause", in trajectory
132
+ Step 2 agent=researcher mode=multiagent.handoff_loss confidence=0.70
133
+ module=multiagent
134
+ event_id=evt_09730d2f195349639b671b8278d0202c
135
+ output> Please summarize the agent debugging paper.
136
+ evidence:
137
+ - handoff/context signal in event payload
138
+ suggested: Make handoff payloads typed and include goal, constraints, evidence, confidence, and open questions.
139
+
140
+ AgentFailure[multiagent.handoff_loss]: Likely root cause: Handoff context loss in researcher at step 2.
141
+ ```
142
+
143
+ ### ✅ `reflexion_suggestion` (0.00s) — 1 proposal(s)
144
+
145
+ ```
146
+ Reflexion retry hint for multiagent.handoff_loss at step 2
147
+ ```
148
+
149
+ ### ✅ `critic_recoverer` (0.00s) — 1 verifier proposal(s)
150
+
151
+ ```
152
+ Add handoff_context_contract before multiagent.handoff_loss (step 2, agent researcher)
153
+ ```
154
+
155
+ ### ✅ `llm_judge` (8.01s) — 2 finding(s); root=2
156
+
157
+ ```
158
+ - multiagent.handoff_loss (conf=1.00) step=2 agent=researcher
159
+ - verification.missing_task_validation (conf=0.90) step=None agent=system
160
+ ```
161
+
162
+ ### ✅ `attribute_heuristic` (0.00s) — method=heuristic agent=researcher step=2 conf=1.00
163
+
164
+ ```
165
+ Earliest finding with non-trivial confidence: Handoff context loss
166
+ ```
167
+
168
+ ### ✅ `attribute_all_at_once` (3.30s) — method=all_at_once agent=researcher step=2 conf=1.00
169
+
170
+ ```
171
+ The researcher correctly identified Paper A as the most recent in step 1 but failed to communicate this specific choice or the recency constraint during the handoff in step 2, leading the summarizer to pick the wrong paper.
172
+ ```
173
+
174
+ ### ✅ `attribute_step_by_step` (20.70s) — method=step_by_step agent=researcher step=2 conf=1.00
175
+
176
+ ```
177
+ The researcher agent identified the correct paper in the previous step but failed to deliver the result, instead initiating an unnecessary handoff for summarization.
178
+ ```
179
+
180
+ ### ✅ `attribute_binary_search` (7.03s) — method=heuristic agent=researcher step=2 conf=1.00
181
+
182
+ ```
183
+ Earliest finding with non-trivial confidence: Handoff context loss
184
+ ```
185
+
186
+ ### ✅ `deep_debug` (30.30s) — 3 finding(s); rounds=6
187
+
188
+ ```
189
+ rounds: plan:3749ms / hypothesize:8433ms / verify:h1:3764ms / verify:h2:2344ms / verify:h3:4481ms / refine:7526ms
190
+ summary: The researcher agent hallucinated paper candidates without performing a search and subsequently failed to communicate the user's recency constraints and the selected paper to the summarizer, leading to an incorrect final output.
191
+
192
+ AgentTraceback (root cause first, manifested failure last):
193
+ trace_id=trace_45009e26b64341e69af395c4d4cabc07 framework=e2e-multiagent goal='Find the best paper on agent debugging, prefer the most recent.'
194
+
195
+ File "root cause", in trajectory
196
+ Step 1 agent=researcher mode=memory.hallucination confidence=0.95
197
+ module=planning
198
+ event_id=evt_1f436c0e08534b579faba36acb2e6703
199
+ output> Found two candidate papers: A (May 2025) and B (Mar 2024). A is preferred because it is more recent (per user constraint).
200
+ evidence:
201
+ - Found two candidate papers: A (May 2025) and B (Mar 2024)
202
+ - without any apparent search or data gathering steps
203
+ suggested: Require memory reads to cite the source event or artifact before use.
204
+ ↓ cascaded to
205
+ File "cascade depth 1", in trajectory
206
+ Step 2 agent=researcher mode=multiagent.handoff_loss confidence=1.00
207
+ module=multiagent
208
+ event_id=evt_09730d2f195349639b671b8278d0202c
209
+ output> Please summarize the agent debugging paper.
210
+ evidence:
211
+ - Please summarize the agent debugging paper.
212
+ - omitted_context: 'preference for A; recency constraint'
213
+ suggested: Make handoff payloads typed and include goal, constraints, evidence, confidence, and open questions.
214
+ ↓ cascaded to
215
+ File "cascade depth 2", in trajectory
216
+ Step 4 agent=summarizer mode=reflection.progress_misjudge confidence=0.90
217
+ module=reflection
218
+ event_id=evt_e7e8080c4dde47ef88c96dc0db743023
219
+ output> I summarized paper B (the only one I knew about).
220
+ evidence:
221
+ - I summarized paper B (the only one I knew about).
222
+ - prefer the most recent
223
+ suggested: Add an external task verifier before termination.
224
+
225
+ AgentFailure[reflection.progress_misjudge]: The researcher agent hallucinated paper candidates without performing a search and subsequently failed to communicate the user's recency constraints and the selected paper to the summarizer, leading to an incorrect final output.
226
+ ```
227
+
228
+ ### ✅ `hub_round_trip` (0.01s) — pushed=/home/kunlunz2/AgentDebugX/.agentdebug/e2e_hub/bundle_6c474a1056074785a0275407f24fedc1 ; bundle_id=bundle_6c474a1056074785a0275407f24fedc1 ; listed=3 ; round-trip ok
229
+
230
+ ## `planning_loop`
231
+
232
+ `trace_id=trace_3d1c98d2424a4c05ae104b942fe0a302`
233
+
234
+ ### ✅ `heuristic_analyzer` (0.00s) — 4 finding(s); root=None
235
+ ### ✅ `cross_event_detectors` (0.00s) — 3 finding(s) from default_detectors()
236
+
237
+ ```
238
+ - planning.inefficient_plan (source=repeated_tool_call)
239
+ - planning.inefficient_plan (source=repeated_state)
240
+ - planning.inefficient_plan (source=repeated_state)
241
+ ```
242
+
243
+ ### ✅ `traceback_offline` (0.00s) — rendered
244
+
245
+ ```
246
+ AgentTraceback (root cause first, manifested failure last):
247
+ trace_id=trace_3d1c98d2424a4c05ae104b942fe0a302 framework=e2e-browser goal='Submit the checkout form on shop.example.com'
248
+
249
+ File "root cause", in trajectory
250
+ Step ? agent=browser mode=planning.inefficient_plan confidence=0.67
251
+ event_id=evt_3f474425ad3841b886240a70ec694fa5
252
+ output> no progress; same checkout screen
253
+ evidence:
254
+ - loop/progress signal in event payload
255
+ suggested: Add loop detection over tool calls and state deltas.
256
+ ↓ cascaded to
257
+ File "cascade depth 1", in trajectory
258
+ Step ? agent=browser mode=planning.inefficient_plan confidence=0.67
259
+ event_id=evt_bce9260a3a024f89ac18430ac2f660ef
260
+ output> no progress; same checkout screen
261
+ evidence:
262
+ - loop/progress signal in event payload
263
+ suggested: Add loop detection over tool calls and state deltas.
264
+ ↓ cascaded to
265
+ File "cascade depth 2", in trajectory
266
+ Step ? agent=browser mode=planning.inefficient_plan confidence=0.67
267
+ event_id=evt_45dc8858899546ff8159af3ce4f8d6dd
268
+ output> no progress; same checkout screen
269
+ evidence:
270
+ - loop/progress signal in event payload
271
+ suggested: Add loop detection over tool calls and state deltas.
272
+ ↓ cascaded to
273
+ File "cascade depth 3", in trajectory
274
+ Step ? agent=browser mode=planning.inefficient_plan confidence=0.67
275
+ event_id=evt_90f10dde040341dcb9434192fba3255d
276
+ output> no progress; same checkout screen
277
+ evidence:
278
+ - loop/progress signal in event payload
279
+ suggested: Add loop detection over tool calls and state deltas.
280
+
281
+ AgentFailure[planning.inefficient_plan]: Likely root cause: Inefficient plan in browser at step None.
282
+ ```
283
+
284
+ ### ✅ `reflexion_suggestion` (0.00s) — 4 proposal(s)
285
+
286
+ ```
287
+ Reflexion retry hint for planning.inefficient_plan at step None
288
+ Reflexion retry hint for planning.inefficient_plan at step None
289
+ Reflexion retry hint for planning.inefficient_plan at step None
290
+ Reflexion retry hint for planning.inefficient_plan at step None
291
+ ```
292
+
293
+ ### ✅ `critic_recoverer` (0.00s) — 4 verifier proposal(s)
294
+
295
+ ```
296
+ Add loop_detector_guard before planning.inefficient_plan (step None, agent browser)
297
+ Add loop_detector_guard before planning.inefficient_plan (step None, agent browser)
298
+ Add loop_detector_guard before planning.inefficient_plan (step None, agent browser)
299
+ Add loop_detector_guard before planning.inefficient_plan (step None, agent browser)
300
+ ```
301
+
302
+ ### ✅ `llm_judge` (19.00s) — 2 finding(s); root=1
303
+
304
+ ```
305
+ - planning.inefficient_plan (conf=0.90) step=1 agent=planner
306
+ - reflection.progress_misjudge (conf=1.00) step=None agent=system
307
+ ```
308
+
309
+ ### ✅ `attribute_heuristic` (0.00s) — method=heuristic agent=planner step=1 conf=0.90
310
+
311
+ ```
312
+ Earliest finding with non-trivial confidence: Inefficient plan
313
+ ```
314
+
315
+ ### ✅ `attribute_all_at_once` (5.44s) — method=all_at_once agent=planner step=1 conf=0.90
316
+
317
+ ```
318
+ The planner's initial strategy was fundamentally flawed, instructing the agent to repeatedly click a button without verifying form completion or handling errors, which led to the failure.
319
+ ```
320
+
321
+ ### ✅ `attribute_step_by_step` (50.07s) — method=step_by_step agent=planner step=1 conf=0.90
322
+
323
+ ```
324
+ The planner proposed a brute-force clicking strategy without accounting for the necessary form-filling steps required for a checkout process.
325
+ ```
326
+
327
+ ### ✅ `attribute_binary_search` (6.06s) — method=heuristic agent=planner step=1 conf=0.90
328
+
329
+ ```
330
+ Earliest finding with non-trivial confidence: Inefficient plan
331
+ ```
332
+
333
+ ### ✅ `deep_debug` (38.24s) — 3 finding(s); rounds=6
334
+
335
+ ```
336
+ rounds: plan:4530ms / hypothesize:12830ms / verify:h1:4850ms / verify:h2:4577ms / verify:h3:4967ms / refine:6484ms
337
+ summary: The checkout submission failed because the planner ignored mandatory form fields and instead devised a strategy to repeatedly click the submit button, which the browser executed without success.
338
+
339
+ AgentTraceback (root cause first, manifested failure last):
340
+ trace_id=trace_3d1c98d2424a4c05ae104b942fe0a302 framework=e2e-browser goal='Submit the checkout form on shop.example.com'
341
+
342
+ File "root cause", in trajectory
343
+ Step 1 agent=planner mode=planning.constraint_ignorance confidence=1.00
344
+ module=planning
345
+ event_id=evt_b1d43814c6b642079ba7015d13b140c5
346
+ output> Strategy: click #submit until success
347
+ evidence:
348
+ - Strategy: click #submit until success
349
+ suggested: Compile task and tool constraints into pre-action checks.
350
+ ↓ cascaded to
351
+ File "cascade depth 1", in trajectory
352
+ Step ? agent=browser mode=planning.inefficient_plan confidence=0.95
353
+ event_id=evt_3ceba7d898d04e5a98f463ea8f9c8e72
354
+ input> {'tool': 'click', 'args': '()', 'kwargs': "{'selector': '#submit'}"}
355
+ evidence:
356
+ - click
357
+ - {'selector': '#submit'}
358
+ - no progress; same checkout screen
359
+ suggested: Add loop detection over tool calls and state deltas.
360
+ ↓ cascaded to
361
+ File "cascade depth 2", in trajectory
362
+ Step ? agent=browser mode=reflection.progress_misjudge confidence=0.90
363
+ event_id=evt_90f10dde040341dcb9434192fba3255d
364
+ output> no progress; same checkout screen
365
+ evidence:
366
+ - no progress; same checkout screen
367
+ - meta={'success': True}
368
+ suggested: Add an external task verifier before termination.
369
+
370
+ AgentFailure[reflection.progress_misjudge]: The checkout submission failed because the planner ignored mandatory form fields and instead devised a strategy to repeatedly click the submit button, which the browser executed without success.
371
+ ```
372
+
373
+ ### ✅ `hub_round_trip` (0.01s) — pushed=/home/kunlunz2/AgentDebugX/.agentdebug/e2e_hub/bundle_15650f55244b4ec98adf6ef0042496ec ; bundle_id=bundle_15650f55244b4ec98adf6ef0042496ec ; listed=4 ; round-trip ok
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "agentdebugx"
3
- version = "0.2.3"
3
+ version = "0.2.4"
4
4
  description = "Portable error analysis, tracing, and recovery framework for agentic AI systems. Import as `agentdebug`."
5
5
  authors = ["ULab @ UIUC <ulab@illinois.edu>"]
6
6
  license = "MIT"
@@ -96,4 +96,4 @@ __all__ = [
96
96
  'get_failure_mode',
97
97
  ]
98
98
 
99
- __version__ = '0.2.3'
99
+ __version__ = '0.2.4'
@@ -25,7 +25,11 @@ class HeuristicAnalyzer:
25
25
 
26
26
  def analyze(self, trajectory: AgentTrajectory) -> DiagnosticReport:
27
27
  findings = [finding for event in trajectory.events for finding in self._event_findings(event)]
28
- root = self._select_root_cause(findings)
28
+ # Build event-order map so the root selector can fall back to it when
29
+ # findings lack step_index (e.g., events recorded via traced_tool on
30
+ # pre-0.2.4 captures that pre-date the auto-counter).
31
+ event_order = {evt.event_id: i for i, evt in enumerate(trajectory.events)}
32
+ root = self._select_root_cause(findings, event_order=event_order)
29
33
  suggestions = self._dedupe(
30
34
  finding.suggestion for finding in findings if finding.suggestion is not None
31
35
  )
@@ -40,11 +44,19 @@ class HeuristicAnalyzer:
40
44
  if root is not None:
41
45
  report.root_cause_event_id = root.event_id
42
46
  report.root_cause_agent = root.agent_name
43
- report.root_cause_step_index = root.step_index
47
+ # If the finding lacked an explicit step_index, fall back to the
48
+ # event's position so root_cause_step_index is non-null in the UI
49
+ # and the AgentTraceback header reads sensibly.
50
+ inferred_step = root.step_index
51
+ if inferred_step is None and root.event_id is not None:
52
+ pos = event_order.get(root.event_id)
53
+ if pos is not None:
54
+ inferred_step = pos
55
+ report.root_cause_step_index = inferred_step
44
56
  report.summary = (
45
57
  f'Likely root cause: {root.failure_mode.name}'
46
58
  f' in {root.agent_name or "unknown agent"}'
47
- f' at step {root.step_index}.'
59
+ f' at step {inferred_step if inferred_step is not None else "?"}.'
48
60
  )
49
61
  return report
50
62
 
@@ -106,17 +118,35 @@ class HeuristicAnalyzer:
106
118
  ) -> Tuple[FailureMode, float, List[str]]:
107
119
  return SEED_FAILURE_MODES[mode_id], confidence, evidence
108
120
 
109
- def _select_root_cause(self, findings: List[FailureFinding]) -> Optional[FailureFinding]:
121
+ def _select_root_cause(
122
+ self,
123
+ findings: List[FailureFinding],
124
+ *,
125
+ event_order: Optional[dict[str, int]] = None,
126
+ ) -> Optional[FailureFinding]:
110
127
  if not findings:
111
128
  return None
112
- return sorted(
113
- findings,
114
- key=lambda finding: (
115
- finding.step_index is None,
116
- finding.step_index if finding.step_index is not None else 10**9,
117
- -finding.confidence,
118
- ),
119
- )[0]
129
+ # Primary key: step_index (None pushed to end).
130
+ # Fallback: when step_index is None, use the event's position in the
131
+ # trajectory so we still pick the *earliest* finding rather than just
132
+ # the highest-confidence one.
133
+ event_order = event_order or {}
134
+
135
+ def _key(f: FailureFinding) -> tuple[int, int, int, float]:
136
+ step = f.step_index if f.step_index is not None else 10**9
137
+ order = (
138
+ event_order.get(f.event_id, 10**9)
139
+ if f.event_id is not None and f.step_index is None
140
+ else 10**9
141
+ )
142
+ return (
143
+ 0 if f.step_index is not None else 1,
144
+ step,
145
+ order,
146
+ -f.confidence,
147
+ )
148
+
149
+ return sorted(findings, key=_key)[0]
120
150
 
121
151
  def _event_text(self, event: AgentEvent) -> str:
122
152
  parts = [
@@ -238,14 +238,14 @@ shown a PREFIX of a failed agent trajectory truncated to its first N events.
238
238
  Decide whether the failure has ALREADY occurred within this prefix, i.e.,
239
239
  whether the trajectory is unrecoverable as of the last event shown.
240
240
 
241
- Respond ONLY with a JSON object (no prose, no markdown):
241
+ CRITICAL OUTPUT RULES (these maximize the chance your response parses):
242
+ 1. Output ONLY a JSON object. No prose before or after. No markdown fences.
243
+ 2. Keep the rationale to ONE short sentence (<= 200 chars). Do NOT include
244
+ line breaks inside string values.
245
+ 3. Emit the JSON object COMPLETE; do not stop mid-key.
242
246
 
243
- {
244
- "failure_already_happened": true | false,
245
- "confidence": <float in [0,1]>,
246
- "rationale": "<one or two sentences>",
247
- "decisive_event_id": "<event_id or null>"
248
- }
247
+ Schema:
248
+ {"failure_already_happened": true|false, "confidence": <0..1>, "rationale": "<short>", "decisive_event_id": "<event_id or null>"}
249
249
 
250
250
  Be conservative: only return true when you can point to evidence in the
251
251
  prefix that the agent has already taken (or omitted) the decisive step.
@@ -443,9 +443,12 @@ class BinarySearchAttributor:
443
443
  llm: LLMClient,
444
444
  *,
445
445
  fallback: Optional[Attributor] = None,
446
- max_tokens: int = 1024,
446
+ max_tokens: int = 2048,
447
447
  context_window: int = 6,
448
448
  ) -> None:
449
+ # max_tokens default doubled in 0.2.4: thinking models (Gemini, o-series)
450
+ # consume most of the budget on reasoning before any JSON is emitted, so
451
+ # 1024 was empirically truncating bisect probes in the v0.2.3 E2E.
449
452
  self.llm = llm
450
453
  self.fallback: Attributor = fallback or HeuristicAttributor()
451
454
  self.max_tokens = max_tokens
@@ -127,11 +127,23 @@ class AgentDebug:
127
127
 
128
128
 
129
129
  class TraceSession:
130
- """Context manager around a trajectory."""
130
+ """Context manager around a trajectory.
131
+
132
+ Maintains a monotonic step counter so that callers (and instrumentation
133
+ helpers like :func:`agentdebug.instrumentation.traced_tool`) can omit
134
+ ``step_index`` and still have downstream analyzers see properly ordered
135
+ steps. Explicit ``step_index`` always wins; passing ``None`` triggers
136
+ auto-assignment.
137
+ """
131
138
 
132
139
  def __init__(self, debugger: AgentDebug, trajectory: AgentTrajectory) -> None:
133
140
  self.debugger = debugger
134
141
  self.trajectory = trajectory
142
+ self._step_counter = 0
143
+
144
+ def _next_step(self) -> int:
145
+ self._step_counter += 1
146
+ return self._step_counter
135
147
 
136
148
  def __enter__(self) -> 'TraceSession':
137
149
  return self
@@ -165,6 +177,8 @@ class TraceSession:
165
177
  duration_ms: Optional[float] = None,
166
178
  **metadata: Any,
167
179
  ) -> AgentEvent:
180
+ if step_index is None:
181
+ step_index = self._next_step()
168
182
  return self.debugger.record_event(
169
183
  self.trajectory,
170
184
  event_type=event_type,
File without changes
File without changes
File without changes