agentdebugx 0.2.3__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/PKG-INFO +1 -1
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/23_status_v0_2.md +34 -0
- agentdebugx-0.2.4/docs/benchmarks/e2e_v0_2_3.md +373 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/pyproject.toml +1 -1
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/__init__.py +1 -1
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/analyzers.py +42 -12
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/attribution.py +11 -8
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/recorder.py +15 -1
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/LICENSE +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/README.md +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/00_overview.md +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/01_literature_survey.md +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/02_architecture.md +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/03_taxonomy.md +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/04_trace_schema.md +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/05_adapters.md +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/06_detectors.md +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/07_attribution.md +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/08_recovery.md +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/09_error_database.md +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/10_taxonomy_induction.md +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/11_multimodal.md +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/12_ui_dashboard.md +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/13_class_design.md +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/14_api_reference.md +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/15_roadmap.md +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/16_governance.md +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/17_claude_code_design_patterns.md +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/18_comparison_codex_vs_design.md +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/19_error_hub.md +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/20_deep_debug.md +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/21_integrations.md +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/22_industry_track_paper_eval_plan.md +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/ERROR_TAXONOMY.md +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/OPEN_SOURCE_DEVELOPMENT_PLAN.md +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/README.md +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/RESEARCH_SURVEY.md +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/benchmarks/v0_1_smoke.json +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/docs/benchmarks/v0_1_smoke.md +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/adapters/__init__.py +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/adapters/base.py +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/adapters/langgraph.py +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/adapters/otel.py +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/adapters/raw.py +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/cli.py +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/deep.py +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/detectors.py +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/events.py +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/hub/__init__.py +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/hub/backend_base.py +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/hub/backends.py +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/hub/bundle.py +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/hub/scrub.py +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/instrumentation.py +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/integrations/__init__.py +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/integrations/claude_skill.py +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/integrations/openhands.py +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/judges.py +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/llm.py +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/models.py +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/recovery.py +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/storage.py +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/taxonomy.py +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/traceback.py +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/ui/__init__.py +0 -0
- {agentdebugx-0.2.3 → agentdebugx-0.2.4}/src/agentdebug/ui/server.py +0 -0
|
@@ -82,6 +82,40 @@ The audit found one real bug and a handful of test gaps:
|
|
|
82
82
|
5. **`recovery.ReflexionSuggestion`** had only an indirect test from DeepDebug
|
|
83
83
|
examples; now has direct happy + empty tests.
|
|
84
84
|
|
|
85
|
+
## 3.6 Real-usage E2E (live Gemini)
|
|
86
|
+
|
|
87
|
+
Beyond unit tests, `scripts/e2e_real_usage.py` builds three realistic failing
|
|
88
|
+
trajectories using **only the public API** (`AgentDebug`, `traced_tool`,
|
|
89
|
+
`SQLiteTraceStore`) and runs the full pipeline against the live LLM.
|
|
90
|
+
|
|
91
|
+
Stage results (see [docs/benchmarks/e2e_v0_2_3.md](./benchmarks/e2e_v0_2_3.md)):
|
|
92
|
+
|
|
93
|
+
| Scenario | Stages OK |
|
|
94
|
+
|---|---|
|
|
95
|
+
| `action_format_then_hallucination` (planner → bad tool call → hallucinated answer) | 12 / 12 |
|
|
96
|
+
| `multiagent_handoff_loss` (researcher → handoff drops constraint → wrong summary) | 12 / 12 |
|
|
97
|
+
| `planning_loop` (browser clicks #submit 4× with no progress) | 12 / 12 |
|
|
98
|
+
| UI smoke (`/healthz`, `/api/v1/traces`, `/api/v1/traces/<id>`, `/api/v1/taxonomy`, `/`) | 5 / 5 |
|
|
99
|
+
| Fresh-venv `pip install agentdebugx==0.2.3` + import + CLI listing | ✅ |
|
|
100
|
+
|
|
101
|
+
**Honest issues the E2E surfaced** (none of these would have been caught by
|
|
102
|
+
the mocked unit tests):
|
|
103
|
+
|
|
104
|
+
1. **LLM judge can return truncated JSON on long traces** — gemini-3-flash
|
|
105
|
+
spent its `max_tokens` budget on reasoning tokens before completing the
|
|
106
|
+
findings array; the pipeline gracefully returned 0 findings rather than
|
|
107
|
+
crashing. Mitigation: set per-call `max_tokens` to 6144 or higher; document the
|
|
108
|
+
thinking-token trap (done in [docs/20_deep_debug.md §7](./20_deep_debug.md)).
|
|
109
|
+
2. **`BinarySearchAttributor` falls back to `HeuristicAttributor` when its
|
|
110
|
+
probe JSON is truncated** — observed in 2 of 3 scenarios. The fallback
|
|
111
|
+
chain works correctly, but the user loses the O(log N) advantage.
|
|
112
|
+
Follow-up: tighter bisection prompts; track the probe count in `result.raw['probe_count']`.
|
|
113
|
+
3. **`HeuristicAnalyzer` returns `root_cause_step_index=None` when all
|
|
114
|
+
findings have `step_index=None`** — the event recorded via `traced_tool`
|
|
115
|
+
doesn't carry a step index. This is a real bug; `traced_tool` should auto-assign one.
|
|
116
|
+
|
|
117
|
+
These are tracked as v0.2.4 fixes.
|
|
118
|
+
|
|
85
119
|
## 4. Coverage matrix (post-0.2.2)
|
|
86
120
|
|
|
87
121
|
Run `PYTHONPATH=src pytest --cov=agentdebug --cov-report=term`. The two largest
|
|
@@ -0,0 +1,373 @@
|
|
|
1
|
+
# AgentDebugX v0.2.3 End-to-End Real-Usage Smoke
|
|
2
|
+
|
|
3
|
+
Scenarios: **3**. LLM model: `gemini-3-flash`. Generated by `scripts/e2e_real_usage.py`.
|
|
4
|
+
|
|
5
|
+
## Per-scenario pipeline status
|
|
6
|
+
|
|
7
|
+
| Scenario | trace_id | OK / Total stages | Failed stages |
|
|
8
|
+
|---|---|---|---|
|
|
9
|
+
| `action_format_then_hallucination` | `trace_f81860…` | 12 / 12 | — |
|
|
10
|
+
| `multiagent_handoff_loss` | `trace_45009e…` | 12 / 12 | — |
|
|
11
|
+
| `planning_loop` | `trace_3d1c98…` | 12 / 12 | — |
|
|
12
|
+
|
|
13
|
+
**UI smoke:** ✅ all endpoints responded
|
|
14
|
+
|
|
15
|
+
```
|
|
16
|
+
GET /healthz -> 200 {"status":"ok"}
|
|
17
|
+
GET /api/v1/traces -> 3 trace(s)
|
|
18
|
+
GET /api/v1/traces/<id> -> 200 events=11 findings=4
|
|
19
|
+
GET /api/v1/taxonomy -> modes=19
|
|
20
|
+
GET / -> 200 content_length=33666 has_brand=True
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## `action_format_then_hallucination`
|
|
24
|
+
|
|
25
|
+
`trace_id=trace_f81860758c6d439aaf1ecd7457de6654`
|
|
26
|
+
|
|
27
|
+
### ✅ `heuristic_analyzer` (0.00s) — 1 finding(s); root=None
|
|
28
|
+
### ✅ `cross_event_detectors` (0.00s) — 0 finding(s) from default_detectors()
|
|
29
|
+
### ✅ `traceback_offline` (0.00s) — rendered
|
|
30
|
+
|
|
31
|
+
```
|
|
32
|
+
AgentTraceback (root cause first, manifested failure last):
|
|
33
|
+
trace_id=trace_f81860758c6d439aaf1ecd7457de6654 framework=e2e-react goal='Find the latest AgentDebug paper, summarize the method, then email alice@example.com'
|
|
34
|
+
|
|
35
|
+
File "root cause", in trajectory
|
|
36
|
+
Step ? agent=search_web mode=system.tool_execution_error confidence=0.86
|
|
37
|
+
event_id=evt_582bbb55430a4be583ad6c374f7c1564
|
|
38
|
+
error> JSON schema validation failed: missing parameter query
|
|
39
|
+
evidence:
|
|
40
|
+
- JSON schema validation failed: missing parameter query
|
|
41
|
+
suggested: Capture tool stderr/status/latency and classify retryable versus non-retryable failures.
|
|
42
|
+
|
|
43
|
+
AgentFailure[system.tool_execution_error]: Likely root cause: Tool execution error in search_web at step None.
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
### ✅ `reflexion_suggestion` (0.00s) — 1 proposal(s)
|
|
47
|
+
|
|
48
|
+
```
|
|
49
|
+
Reflexion retry hint for system.tool_execution_error at step None
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### ✅ `critic_recoverer` (0.00s) — 1 verifier proposal(s)
|
|
53
|
+
|
|
54
|
+
```
|
|
55
|
+
Add tool_result_typecheck before system.tool_execution_error (step None, agent search_web)
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### ✅ `llm_judge` (27.97s) — 0 finding(s); root=None
|
|
59
|
+
### ✅ `attribute_heuristic` (0.00s) — method=heuristic (no hypotheses)
|
|
60
|
+
### ✅ `attribute_all_at_once` (6.27s) — method=all_at_once agent=search_web step=None conf=0.90
|
|
61
|
+
|
|
62
|
+
```
|
|
63
|
+
The agent failed to provide the required 'query' parameter in the search tool call, which resulted in a validation error and prevented the agent from finding the paper.
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### ✅ `attribute_step_by_step` (17.05s) — method=step_by_step agent=planner step=4 conf=1.00
|
|
67
|
+
|
|
68
|
+
```
|
|
69
|
+
The agent prematurely terminates the task with a generic statement, failing to summarize the method or email the recipient as required by the goal.
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### ✅ `attribute_binary_search` (9.48s) — method=binary_search agent=planner step=4 conf=0.90
|
|
73
|
+
|
|
74
|
+
```
|
|
75
|
+
Binary search located the decisive step within 3 probes over 6 events.
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
### ✅ `deep_debug` (25.52s) — 3 finding(s); rounds=6
|
|
79
|
+
|
|
80
|
+
```
|
|
81
|
+
rounds: plan:3794ms / hypothesize:7161ms / verify:h1:2551ms / verify:h2:2381ms / verify:h3:2097ms / refine:7534ms
|
|
82
|
+
summary: The agent failed to provide the required 'query' parameter for the search tool, and the planner subsequently misjudged the task as complete despite failing to find the paper, summarize it, or send the email.
|
|
83
|
+
|
|
84
|
+
AgentTraceback (root cause first, manifested failure last):
|
|
85
|
+
trace_id=trace_f81860758c6d439aaf1ecd7457de6654 framework=e2e-react goal='Find the latest AgentDebug paper, summarize the method, then email alice@example.com'
|
|
86
|
+
|
|
87
|
+
File "root cause", in trajectory
|
|
88
|
+
Step ? agent=search_web mode=action.parameter_error confidence=1.00
|
|
89
|
+
event_id=evt_582bbb55430a4be583ad6c374f7c1564
|
|
90
|
+
error> JSON schema validation failed: missing parameter query
|
|
91
|
+
evidence:
|
|
92
|
+
- JSON schema validation failed: missing parameter query
|
|
93
|
+
- args': '()', 'kwargs': '{}'
|
|
94
|
+
suggested: Validate parameters against tool schemas and ask for missing user/context fields.
|
|
95
|
+
↓ cascaded to
|
|
96
|
+
File "cascade depth 1", in trajectory
|
|
97
|
+
Step 4 agent=planner mode=reflection.progress_misjudge confidence=1.00
|
|
98
|
+
module=reflection
|
|
99
|
+
event_id=evt_047e5ad596874186ac8d4413b8ba8185
|
|
100
|
+
output> Final answer: AgentDebug is a popular paper. Done.
|
|
101
|
+
evidence:
|
|
102
|
+
- Final answer: AgentDebug is a popular paper. Done.
|
|
103
|
+
- error=JSON schema validation failed: missing parameter query
|
|
104
|
+
suggested: Add an external task verifier before termination.
|
|
105
|
+
↓ cascaded to
|
|
106
|
+
File "cascade depth 2", in trajectory
|
|
107
|
+
Step ? agent=system mode=verification.missing_task_validation confidence=1.00
|
|
108
|
+
event_id=evt_0e34f3f892664015b10bba11ed2ac3dd
|
|
109
|
+
evidence:
|
|
110
|
+
- meta={'success': True}
|
|
111
|
+
- Final answer: AgentDebug is a popular paper. Done.
|
|
112
|
+
suggested: Add final-state validation that is independent of the acting agent.
|
|
113
|
+
|
|
114
|
+
AgentFailure[verification.missing_task_validation]: The agent failed to provide the required 'query' parameter for the search tool, and the planner subsequently misjudged the task as complete despite failing to find the paper, summarize it, or send the email.
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
### ✅ `hub_round_trip` (0.01s) — pushed=/home/kunlunz2/AgentDebugX/.agentdebug/e2e_hub/bundle_b8fa2127c001463d81c86c8c03e4002a ; bundle_id=bundle_b8fa2127c001463d81c86c8c03e4002a ; listed=1 ; round-trip ok
|
|
118
|
+
|
|
119
|
+
## `multiagent_handoff_loss`
|
|
120
|
+
|
|
121
|
+
`trace_id=trace_45009e26b64341e69af395c4d4cabc07`
|
|
122
|
+
|
|
123
|
+
### ✅ `heuristic_analyzer` (0.00s) — 1 finding(s); root=2
|
|
124
|
+
### ✅ `cross_event_detectors` (0.00s) — 0 finding(s) from default_detectors()
|
|
125
|
+
### ✅ `traceback_offline` (0.00s) — rendered
|
|
126
|
+
|
|
127
|
+
```
|
|
128
|
+
AgentTraceback (root cause first, manifested failure last):
|
|
129
|
+
trace_id=trace_45009e26b64341e69af395c4d4cabc07 framework=e2e-multiagent goal='Find the best paper on agent debugging, prefer the most recent.'
|
|
130
|
+
|
|
131
|
+
File "root cause", in trajectory
|
|
132
|
+
Step 2 agent=researcher mode=multiagent.handoff_loss confidence=0.70
|
|
133
|
+
module=multiagent
|
|
134
|
+
event_id=evt_09730d2f195349639b671b8278d0202c
|
|
135
|
+
output> Please summarize the agent debugging paper.
|
|
136
|
+
evidence:
|
|
137
|
+
- handoff/context signal in event payload
|
|
138
|
+
suggested: Make handoff payloads typed and include goal, constraints, evidence, confidence, and open questions.
|
|
139
|
+
|
|
140
|
+
AgentFailure[multiagent.handoff_loss]: Likely root cause: Handoff context loss in researcher at step 2.
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
### ✅ `reflexion_suggestion` (0.00s) — 1 proposal(s)
|
|
144
|
+
|
|
145
|
+
```
|
|
146
|
+
Reflexion retry hint for multiagent.handoff_loss at step 2
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
### ✅ `critic_recoverer` (0.00s) — 1 verifier proposal(s)
|
|
150
|
+
|
|
151
|
+
```
|
|
152
|
+
Add handoff_context_contract before multiagent.handoff_loss (step 2, agent researcher)
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### ✅ `llm_judge` (8.01s) — 2 finding(s); root=2
|
|
156
|
+
|
|
157
|
+
```
|
|
158
|
+
- multiagent.handoff_loss (conf=1.00) step=2 agent=researcher
|
|
159
|
+
- verification.missing_task_validation (conf=0.90) step=None agent=system
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
### ✅ `attribute_heuristic` (0.00s) — method=heuristic agent=researcher step=2 conf=1.00
|
|
163
|
+
|
|
164
|
+
```
|
|
165
|
+
Earliest finding with non-trivial confidence: Handoff context loss
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
### ✅ `attribute_all_at_once` (3.30s) — method=all_at_once agent=researcher step=2 conf=1.00
|
|
169
|
+
|
|
170
|
+
```
|
|
171
|
+
The researcher correctly identified Paper A as the most recent in step 1 but failed to communicate this specific choice or the recency constraint during the handoff in step 2, leading the summarizer to pick the wrong paper.
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
### ✅ `attribute_step_by_step` (20.70s) — method=step_by_step agent=researcher step=2 conf=1.00
|
|
175
|
+
|
|
176
|
+
```
|
|
177
|
+
The researcher agent identified the correct paper in the previous step but failed to deliver the result, instead initiating an unnecessary handoff for summarization.
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
### ✅ `attribute_binary_search` (7.03s) — method=heuristic agent=researcher step=2 conf=1.00
|
|
181
|
+
|
|
182
|
+
```
|
|
183
|
+
Earliest finding with non-trivial confidence: Handoff context loss
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
### ✅ `deep_debug` (30.30s) — 3 finding(s); rounds=6
|
|
187
|
+
|
|
188
|
+
```
|
|
189
|
+
rounds: plan:3749ms / hypothesize:8433ms / verify:h1:3764ms / verify:h2:2344ms / verify:h3:4481ms / refine:7526ms
|
|
190
|
+
summary: The researcher agent hallucinated paper candidates without performing a search and subsequently failed to communicate the user's recency constraints and the selected paper to the summarizer, leading to an incorrect final output.
|
|
191
|
+
|
|
192
|
+
AgentTraceback (root cause first, manifested failure last):
|
|
193
|
+
trace_id=trace_45009e26b64341e69af395c4d4cabc07 framework=e2e-multiagent goal='Find the best paper on agent debugging, prefer the most recent.'
|
|
194
|
+
|
|
195
|
+
File "root cause", in trajectory
|
|
196
|
+
Step 1 agent=researcher mode=memory.hallucination confidence=0.95
|
|
197
|
+
module=planning
|
|
198
|
+
event_id=evt_1f436c0e08534b579faba36acb2e6703
|
|
199
|
+
output> Found two candidate papers: A (May 2025) and B (Mar 2024). A is preferred because it is more recent (per user constraint).
|
|
200
|
+
evidence:
|
|
201
|
+
- Found two candidate papers: A (May 2025) and B (Mar 2024)
|
|
202
|
+
- without any apparent search or data gathering steps
|
|
203
|
+
suggested: Require memory reads to cite the source event or artifact before use.
|
|
204
|
+
↓ cascaded to
|
|
205
|
+
File "cascade depth 1", in trajectory
|
|
206
|
+
Step 2 agent=researcher mode=multiagent.handoff_loss confidence=1.00
|
|
207
|
+
module=multiagent
|
|
208
|
+
event_id=evt_09730d2f195349639b671b8278d0202c
|
|
209
|
+
output> Please summarize the agent debugging paper.
|
|
210
|
+
evidence:
|
|
211
|
+
- Please summarize the agent debugging paper.
|
|
212
|
+
- omitted_context: 'preference for A; recency constraint'
|
|
213
|
+
suggested: Make handoff payloads typed and include goal, constraints, evidence, confidence, and open questions.
|
|
214
|
+
↓ cascaded to
|
|
215
|
+
File "cascade depth 2", in trajectory
|
|
216
|
+
Step 4 agent=summarizer mode=reflection.progress_misjudge confidence=0.90
|
|
217
|
+
module=reflection
|
|
218
|
+
event_id=evt_e7e8080c4dde47ef88c96dc0db743023
|
|
219
|
+
output> I summarized paper B (the only one I knew about).
|
|
220
|
+
evidence:
|
|
221
|
+
- I summarized paper B (the only one I knew about).
|
|
222
|
+
- prefer the most recent
|
|
223
|
+
suggested: Add an external task verifier before termination.
|
|
224
|
+
|
|
225
|
+
AgentFailure[reflection.progress_misjudge]: The researcher agent hallucinated paper candidates without performing a search and subsequently failed to communicate the user's recency constraints and the selected paper to the summarizer, leading to an incorrect final output.
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
### ✅ `hub_round_trip` (0.01s) — pushed=/home/kunlunz2/AgentDebugX/.agentdebug/e2e_hub/bundle_6c474a1056074785a0275407f24fedc1 ; bundle_id=bundle_6c474a1056074785a0275407f24fedc1 ; listed=3 ; round-trip ok
|
|
229
|
+
|
|
230
|
+
## `planning_loop`
|
|
231
|
+
|
|
232
|
+
`trace_id=trace_3d1c98d2424a4c05ae104b942fe0a302`
|
|
233
|
+
|
|
234
|
+
### ✅ `heuristic_analyzer` (0.00s) — 4 finding(s); root=None
|
|
235
|
+
### ✅ `cross_event_detectors` (0.00s) — 3 finding(s) from default_detectors()
|
|
236
|
+
|
|
237
|
+
```
|
|
238
|
+
- planning.inefficient_plan (source=repeated_tool_call)
|
|
239
|
+
- planning.inefficient_plan (source=repeated_state)
|
|
240
|
+
- planning.inefficient_plan (source=repeated_state)
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
### ✅ `traceback_offline` (0.00s) — rendered
|
|
244
|
+
|
|
245
|
+
```
|
|
246
|
+
AgentTraceback (root cause first, manifested failure last):
|
|
247
|
+
trace_id=trace_3d1c98d2424a4c05ae104b942fe0a302 framework=e2e-browser goal='Submit the checkout form on shop.example.com'
|
|
248
|
+
|
|
249
|
+
File "root cause", in trajectory
|
|
250
|
+
Step ? agent=browser mode=planning.inefficient_plan confidence=0.67
|
|
251
|
+
event_id=evt_3f474425ad3841b886240a70ec694fa5
|
|
252
|
+
output> no progress; same checkout screen
|
|
253
|
+
evidence:
|
|
254
|
+
- loop/progress signal in event payload
|
|
255
|
+
suggested: Add loop detection over tool calls and state deltas.
|
|
256
|
+
↓ cascaded to
|
|
257
|
+
File "cascade depth 1", in trajectory
|
|
258
|
+
Step ? agent=browser mode=planning.inefficient_plan confidence=0.67
|
|
259
|
+
event_id=evt_bce9260a3a024f89ac18430ac2f660ef
|
|
260
|
+
output> no progress; same checkout screen
|
|
261
|
+
evidence:
|
|
262
|
+
- loop/progress signal in event payload
|
|
263
|
+
suggested: Add loop detection over tool calls and state deltas.
|
|
264
|
+
↓ cascaded to
|
|
265
|
+
File "cascade depth 2", in trajectory
|
|
266
|
+
Step ? agent=browser mode=planning.inefficient_plan confidence=0.67
|
|
267
|
+
event_id=evt_45dc8858899546ff8159af3ce4f8d6dd
|
|
268
|
+
output> no progress; same checkout screen
|
|
269
|
+
evidence:
|
|
270
|
+
- loop/progress signal in event payload
|
|
271
|
+
suggested: Add loop detection over tool calls and state deltas.
|
|
272
|
+
↓ cascaded to
|
|
273
|
+
File "cascade depth 3", in trajectory
|
|
274
|
+
Step ? agent=browser mode=planning.inefficient_plan confidence=0.67
|
|
275
|
+
event_id=evt_90f10dde040341dcb9434192fba3255d
|
|
276
|
+
output> no progress; same checkout screen
|
|
277
|
+
evidence:
|
|
278
|
+
- loop/progress signal in event payload
|
|
279
|
+
suggested: Add loop detection over tool calls and state deltas.
|
|
280
|
+
|
|
281
|
+
AgentFailure[planning.inefficient_plan]: Likely root cause: Inefficient plan in browser at step None.
|
|
282
|
+
```
|
|
283
|
+
|
|
284
|
+
### ✅ `reflexion_suggestion` (0.00s) — 4 proposal(s)
|
|
285
|
+
|
|
286
|
+
```
|
|
287
|
+
Reflexion retry hint for planning.inefficient_plan at step None
|
|
288
|
+
Reflexion retry hint for planning.inefficient_plan at step None
|
|
289
|
+
Reflexion retry hint for planning.inefficient_plan at step None
|
|
290
|
+
Reflexion retry hint for planning.inefficient_plan at step None
|
|
291
|
+
```
|
|
292
|
+
|
|
293
|
+
### ✅ `critic_recoverer` (0.00s) — 4 verifier proposal(s)
|
|
294
|
+
|
|
295
|
+
```
|
|
296
|
+
Add loop_detector_guard before planning.inefficient_plan (step None, agent browser)
|
|
297
|
+
Add loop_detector_guard before planning.inefficient_plan (step None, agent browser)
|
|
298
|
+
Add loop_detector_guard before planning.inefficient_plan (step None, agent browser)
|
|
299
|
+
Add loop_detector_guard before planning.inefficient_plan (step None, agent browser)
|
|
300
|
+
```
|
|
301
|
+
|
|
302
|
+
### ✅ `llm_judge` (19.00s) — 2 finding(s); root=1
|
|
303
|
+
|
|
304
|
+
```
|
|
305
|
+
- planning.inefficient_plan (conf=0.90) step=1 agent=planner
|
|
306
|
+
- reflection.progress_misjudge (conf=1.00) step=None agent=system
|
|
307
|
+
```
|
|
308
|
+
|
|
309
|
+
### ✅ `attribute_heuristic` (0.00s) — method=heuristic agent=planner step=1 conf=0.90
|
|
310
|
+
|
|
311
|
+
```
|
|
312
|
+
Earliest finding with non-trivial confidence: Inefficient plan
|
|
313
|
+
```
|
|
314
|
+
|
|
315
|
+
### ✅ `attribute_all_at_once` (5.44s) — method=all_at_once agent=planner step=1 conf=0.90
|
|
316
|
+
|
|
317
|
+
```
|
|
318
|
+
The planner's initial strategy was fundamentally flawed, instructing the agent to repeatedly click a button without verifying form completion or handling errors, which led to the failure.
|
|
319
|
+
```
|
|
320
|
+
|
|
321
|
+
### ✅ `attribute_step_by_step` (50.07s) — method=step_by_step agent=planner step=1 conf=0.90
|
|
322
|
+
|
|
323
|
+
```
|
|
324
|
+
The planner proposed a brute-force clicking strategy without accounting for the necessary form-filling steps required for a checkout process.
|
|
325
|
+
```
|
|
326
|
+
|
|
327
|
+
### ✅ `attribute_binary_search` (6.06s) — method=heuristic agent=planner step=1 conf=0.90
|
|
328
|
+
|
|
329
|
+
```
|
|
330
|
+
Earliest finding with non-trivial confidence: Inefficient plan
|
|
331
|
+
```
|
|
332
|
+
|
|
333
|
+
### ✅ `deep_debug` (38.24s) — 3 finding(s); rounds=6
|
|
334
|
+
|
|
335
|
+
```
|
|
336
|
+
rounds: plan:4530ms / hypothesize:12830ms / verify:h1:4850ms / verify:h2:4577ms / verify:h3:4967ms / refine:6484ms
|
|
337
|
+
summary: The checkout submission failed because the planner ignored mandatory form fields and instead devised a strategy to repeatedly click the submit button, which the browser executed without success.
|
|
338
|
+
|
|
339
|
+
AgentTraceback (root cause first, manifested failure last):
|
|
340
|
+
trace_id=trace_3d1c98d2424a4c05ae104b942fe0a302 framework=e2e-browser goal='Submit the checkout form on shop.example.com'
|
|
341
|
+
|
|
342
|
+
File "root cause", in trajectory
|
|
343
|
+
Step 1 agent=planner mode=planning.constraint_ignorance confidence=1.00
|
|
344
|
+
module=planning
|
|
345
|
+
event_id=evt_b1d43814c6b642079ba7015d13b140c5
|
|
346
|
+
output> Strategy: click #submit until success
|
|
347
|
+
evidence:
|
|
348
|
+
- Strategy: click #submit until success
|
|
349
|
+
suggested: Compile task and tool constraints into pre-action checks.
|
|
350
|
+
↓ cascaded to
|
|
351
|
+
File "cascade depth 1", in trajectory
|
|
352
|
+
Step ? agent=browser mode=planning.inefficient_plan confidence=0.95
|
|
353
|
+
event_id=evt_3ceba7d898d04e5a98f463ea8f9c8e72
|
|
354
|
+
input> {'tool': 'click', 'args': '()', 'kwargs': "{'selector': '#submit'}"}
|
|
355
|
+
evidence:
|
|
356
|
+
- click
|
|
357
|
+
- {'selector': '#submit'}
|
|
358
|
+
- no progress; same checkout screen
|
|
359
|
+
suggested: Add loop detection over tool calls and state deltas.
|
|
360
|
+
↓ cascaded to
|
|
361
|
+
File "cascade depth 2", in trajectory
|
|
362
|
+
Step ? agent=browser mode=reflection.progress_misjudge confidence=0.90
|
|
363
|
+
event_id=evt_90f10dde040341dcb9434192fba3255d
|
|
364
|
+
output> no progress; same checkout screen
|
|
365
|
+
evidence:
|
|
366
|
+
- no progress; same checkout screen
|
|
367
|
+
- meta={'success': True}
|
|
368
|
+
suggested: Add an external task verifier before termination.
|
|
369
|
+
|
|
370
|
+
AgentFailure[reflection.progress_misjudge]: The checkout submission failed because the planner ignored mandatory form fields and instead devised a strategy to repeatedly click the submit button, which the browser executed without success.
|
|
371
|
+
```
|
|
372
|
+
|
|
373
|
+
### ✅ `hub_round_trip` (0.01s) — pushed=/home/kunlunz2/AgentDebugX/.agentdebug/e2e_hub/bundle_15650f55244b4ec98adf6ef0042496ec ; bundle_id=bundle_15650f55244b4ec98adf6ef0042496ec ; listed=4 ; round-trip ok
|
|
@@ -25,7 +25,11 @@ class HeuristicAnalyzer:
|
|
|
25
25
|
|
|
26
26
|
def analyze(self, trajectory: AgentTrajectory) -> DiagnosticReport:
|
|
27
27
|
findings = [finding for event in trajectory.events for finding in self._event_findings(event)]
|
|
28
|
-
root
|
|
28
|
+
# Build event-order map so the root selector can fall back to it when
|
|
29
|
+
# findings lack step_index (e.g., events recorded via traced_tool on
|
|
30
|
+
# pre-0.2.4 captures that pre-date the auto-counter).
|
|
31
|
+
event_order = {evt.event_id: i for i, evt in enumerate(trajectory.events)}
|
|
32
|
+
root = self._select_root_cause(findings, event_order=event_order)
|
|
29
33
|
suggestions = self._dedupe(
|
|
30
34
|
finding.suggestion for finding in findings if finding.suggestion is not None
|
|
31
35
|
)
|
|
@@ -40,11 +44,19 @@ class HeuristicAnalyzer:
|
|
|
40
44
|
if root is not None:
|
|
41
45
|
report.root_cause_event_id = root.event_id
|
|
42
46
|
report.root_cause_agent = root.agent_name
|
|
43
|
-
|
|
47
|
+
# If the finding lacked an explicit step_index, fall back to the
|
|
48
|
+
# event's position so root_cause_step_index is non-null in the UI
|
|
49
|
+
# and the AgentTraceback header reads sensibly.
|
|
50
|
+
inferred_step = root.step_index
|
|
51
|
+
if inferred_step is None and root.event_id is not None:
|
|
52
|
+
pos = event_order.get(root.event_id)
|
|
53
|
+
if pos is not None:
|
|
54
|
+
inferred_step = pos
|
|
55
|
+
report.root_cause_step_index = inferred_step
|
|
44
56
|
report.summary = (
|
|
45
57
|
f'Likely root cause: {root.failure_mode.name}'
|
|
46
58
|
f' in {root.agent_name or "unknown agent"}'
|
|
47
|
-
f' at step {
|
|
59
|
+
f' at step {inferred_step if inferred_step is not None else "?"}.'
|
|
48
60
|
)
|
|
49
61
|
return report
|
|
50
62
|
|
|
@@ -106,17 +118,35 @@ class HeuristicAnalyzer:
|
|
|
106
118
|
) -> Tuple[FailureMode, float, List[str]]:
|
|
107
119
|
return SEED_FAILURE_MODES[mode_id], confidence, evidence
|
|
108
120
|
|
|
109
|
-
def _select_root_cause(
|
|
121
|
+
def _select_root_cause(
|
|
122
|
+
self,
|
|
123
|
+
findings: List[FailureFinding],
|
|
124
|
+
*,
|
|
125
|
+
event_order: Optional[dict[str, int]] = None,
|
|
126
|
+
) -> Optional[FailureFinding]:
|
|
110
127
|
if not findings:
|
|
111
128
|
return None
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
129
|
+
# Primary key: step_index (None pushed to end).
|
|
130
|
+
# Fallback: when step_index is None, use the event's position in the
|
|
131
|
+
# trajectory so we still pick the *earliest* finding rather than just
|
|
132
|
+
# the highest-confidence one.
|
|
133
|
+
event_order = event_order or {}
|
|
134
|
+
|
|
135
|
+
def _key(f: FailureFinding) -> tuple[int, int, int, float]:
|
|
136
|
+
step = f.step_index if f.step_index is not None else 10**9
|
|
137
|
+
order = (
|
|
138
|
+
event_order.get(f.event_id, 10**9)
|
|
139
|
+
if f.event_id is not None and f.step_index is None
|
|
140
|
+
else 10**9
|
|
141
|
+
)
|
|
142
|
+
return (
|
|
143
|
+
0 if f.step_index is not None else 1,
|
|
144
|
+
step,
|
|
145
|
+
order,
|
|
146
|
+
-f.confidence,
|
|
147
|
+
)
|
|
148
|
+
|
|
149
|
+
return sorted(findings, key=_key)[0]
|
|
120
150
|
|
|
121
151
|
def _event_text(self, event: AgentEvent) -> str:
|
|
122
152
|
parts = [
|
|
@@ -238,14 +238,14 @@ shown a PREFIX of a failed agent trajectory truncated to its first N events.
|
|
|
238
238
|
Decide whether the failure has ALREADY occurred within this prefix, i.e.,
|
|
239
239
|
whether the trajectory is unrecoverable as of the last event shown.
|
|
240
240
|
|
|
241
|
-
|
|
241
|
+
CRITICAL OUTPUT RULES — these maximize the chance your response parses:
|
|
242
|
+
1. Output ONLY a JSON object. No prose before or after. No markdown fences.
|
|
243
|
+
2. Keep the rationale to ONE short sentence (<= 200 chars). Do NOT include
|
|
244
|
+
line breaks inside string values.
|
|
245
|
+
3. Emit the JSON object COMPLETE; do not stop mid-key.
|
|
242
246
|
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
"confidence": <float in [0,1]>,
|
|
246
|
-
"rationale": "<one or two sentences>",
|
|
247
|
-
"decisive_event_id": "<event_id or null>"
|
|
248
|
-
}
|
|
247
|
+
Schema:
|
|
248
|
+
{"failure_already_happened": true|false, "confidence": <0..1>, "rationale": "<short>", "decisive_event_id": "<event_id or null>"}
|
|
249
249
|
|
|
250
250
|
Be conservative: only return true when you can point to evidence in the
|
|
251
251
|
prefix that the agent has already taken (or omitted) the decisive step.
|
|
@@ -443,9 +443,12 @@ class BinarySearchAttributor:
|
|
|
443
443
|
llm: LLMClient,
|
|
444
444
|
*,
|
|
445
445
|
fallback: Optional[Attributor] = None,
|
|
446
|
-
max_tokens: int =
|
|
446
|
+
max_tokens: int = 2048,
|
|
447
447
|
context_window: int = 6,
|
|
448
448
|
) -> None:
|
|
449
|
+
# max_tokens default doubled in 0.2.4: thinking models (Gemini, o-series)
|
|
450
|
+
# consume most of the budget on reasoning before any JSON is emitted, so
|
|
451
|
+
# 1024 was empirically truncating bisect probes in the v0.2.3 E2E.
|
|
449
452
|
self.llm = llm
|
|
450
453
|
self.fallback: Attributor = fallback or HeuristicAttributor()
|
|
451
454
|
self.max_tokens = max_tokens
|
|
@@ -127,11 +127,23 @@ class AgentDebug:
|
|
|
127
127
|
|
|
128
128
|
|
|
129
129
|
class TraceSession:
|
|
130
|
-
"""Context manager around a trajectory.
|
|
130
|
+
"""Context manager around a trajectory.
|
|
131
|
+
|
|
132
|
+
Maintains a monotonic step counter so that callers (and instrumentation
|
|
133
|
+
helpers like :func:`agentdebug.instrumentation.traced_tool`) can omit
|
|
134
|
+
``step_index`` and still have downstream analyzers see properly ordered
|
|
135
|
+
steps. Explicit ``step_index`` always wins; passing ``None`` triggers
|
|
136
|
+
auto-assignment.
|
|
137
|
+
"""
|
|
131
138
|
|
|
132
139
|
def __init__(self, debugger: AgentDebug, trajectory: AgentTrajectory) -> None:
|
|
133
140
|
self.debugger = debugger
|
|
134
141
|
self.trajectory = trajectory
|
|
142
|
+
self._step_counter = 0
|
|
143
|
+
|
|
144
|
+
def _next_step(self) -> int:
|
|
145
|
+
self._step_counter += 1
|
|
146
|
+
return self._step_counter
|
|
135
147
|
|
|
136
148
|
def __enter__(self) -> 'TraceSession':
|
|
137
149
|
return self
|
|
@@ -165,6 +177,8 @@ class TraceSession:
|
|
|
165
177
|
duration_ms: Optional[float] = None,
|
|
166
178
|
**metadata: Any,
|
|
167
179
|
) -> AgentEvent:
|
|
180
|
+
if step_index is None:
|
|
181
|
+
step_index = self._next_step()
|
|
168
182
|
return self.debugger.record_event(
|
|
169
183
|
self.trajectory,
|
|
170
184
|
event_type=event_type,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|