agentdebugx 0.2.4__tar.gz → 0.2.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/PKG-INFO +2 -1
  2. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/docs/21_integrations.md +56 -0
  3. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/docs/22_industry_track_paper_eval_plan.md +25 -0
  4. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/docs/23_status_v0_2.md +39 -5
  5. agentdebugx-0.2.6/docs/benchmarks/e2e_v0_2_4.md +365 -0
  6. agentdebugx-0.2.6/docs/benchmarks/who_when_v0_2_6_leaderboard.md +74 -0
  7. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/pyproject.toml +3 -1
  8. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/src/agentdebug/__init__.py +1 -1
  9. agentdebugx-0.2.6/src/agentdebug/adapters/crewai.py +233 -0
  10. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/src/agentdebug/cli.py +6 -0
  11. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/src/agentdebug/judges.py +23 -17
  12. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/src/agentdebug/ui/server.py +52 -5
  13. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/LICENSE +0 -0
  14. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/README.md +0 -0
  15. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/docs/00_overview.md +0 -0
  16. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/docs/01_literature_survey.md +0 -0
  17. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/docs/02_architecture.md +0 -0
  18. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/docs/03_taxonomy.md +0 -0
  19. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/docs/04_trace_schema.md +0 -0
  20. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/docs/05_adapters.md +0 -0
  21. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/docs/06_detectors.md +0 -0
  22. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/docs/07_attribution.md +0 -0
  23. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/docs/08_recovery.md +0 -0
  24. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/docs/09_error_database.md +0 -0
  25. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/docs/10_taxonomy_induction.md +0 -0
  26. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/docs/11_multimodal.md +0 -0
  27. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/docs/12_ui_dashboard.md +0 -0
  28. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/docs/13_class_design.md +0 -0
  29. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/docs/14_api_reference.md +0 -0
  30. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/docs/15_roadmap.md +0 -0
  31. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/docs/16_governance.md +0 -0
  32. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/docs/17_claude_code_design_patterns.md +0 -0
  33. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/docs/18_comparison_codex_vs_design.md +0 -0
  34. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/docs/19_error_hub.md +0 -0
  35. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/docs/20_deep_debug.md +0 -0
  36. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/docs/ERROR_TAXONOMY.md +0 -0
  37. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/docs/OPEN_SOURCE_DEVELOPMENT_PLAN.md +0 -0
  38. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/docs/README.md +0 -0
  39. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/docs/RESEARCH_SURVEY.md +0 -0
  40. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/docs/benchmarks/e2e_v0_2_3.md +0 -0
  41. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/docs/benchmarks/v0_1_smoke.json +0 -0
  42. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/docs/benchmarks/v0_1_smoke.md +0 -0
  43. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/src/agentdebug/adapters/__init__.py +0 -0
  44. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/src/agentdebug/adapters/base.py +0 -0
  45. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/src/agentdebug/adapters/langgraph.py +0 -0
  46. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/src/agentdebug/adapters/otel.py +0 -0
  47. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/src/agentdebug/adapters/raw.py +0 -0
  48. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/src/agentdebug/analyzers.py +0 -0
  49. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/src/agentdebug/attribution.py +0 -0
  50. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/src/agentdebug/deep.py +0 -0
  51. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/src/agentdebug/detectors.py +0 -0
  52. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/src/agentdebug/events.py +0 -0
  53. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/src/agentdebug/hub/__init__.py +0 -0
  54. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/src/agentdebug/hub/backend_base.py +0 -0
  55. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/src/agentdebug/hub/backends.py +0 -0
  56. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/src/agentdebug/hub/bundle.py +0 -0
  57. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/src/agentdebug/hub/scrub.py +0 -0
  58. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/src/agentdebug/instrumentation.py +0 -0
  59. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/src/agentdebug/integrations/__init__.py +0 -0
  60. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/src/agentdebug/integrations/claude_skill.py +0 -0
  61. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/src/agentdebug/integrations/openhands.py +0 -0
  62. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/src/agentdebug/llm.py +0 -0
  63. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/src/agentdebug/models.py +0 -0
  64. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/src/agentdebug/recorder.py +0 -0
  65. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/src/agentdebug/recovery.py +0 -0
  66. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/src/agentdebug/storage.py +0 -0
  67. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/src/agentdebug/taxonomy.py +0 -0
  68. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/src/agentdebug/traceback.py +0 -0
  69. {agentdebugx-0.2.4 → agentdebugx-0.2.6}/src/agentdebug/ui/__init__.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: agentdebugx
3
- Version: 0.2.4
3
+ Version: 0.2.6
4
4
  Summary: Portable error analysis, tracing, and recovery framework for agentic AI systems. Import as `agentdebug`.
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -26,6 +26,7 @@ Classifier: Topic :: Software Development :: Quality Assurance
26
26
  Classifier: Topic :: System :: Monitoring
27
27
  Classifier: Typing :: Typed
28
28
  Provides-Extra: all
29
+ Provides-Extra: crewai
29
30
  Provides-Extra: hub-hf
30
31
  Provides-Extra: langgraph
31
32
  Provides-Extra: openhands
@@ -58,6 +58,62 @@ command:
58
58
  | "I need a thorough postmortem" | `agentdebug deep <file|trace_id>` |
59
59
  | "share this with the team" | `agentdebug hub push <trace_id> --to git:...` |
60
60
 
61
+ ## 1.5 CrewAI integration
62
+
63
+ CrewAI emits a typed event stream via a process-global
64
+ `crewai.events.crewai_event_bus`. `agentdebug.adapters.crewai` ships two
65
+ pieces:
66
+
67
+ - `CrewAIBridge(debugger, trajectory)` — context manager that subscribes
68
+ to the bus and translates events into AgentDebug records.
69
+ - `CrewAIAdapter().instrument(debugger)` — used by `agentdebug doctor` to
70
+ report whether CrewAI is importable.
71
+
72
+ ### Recording a Crew run
73
+
74
+ ```python
75
+ from agentdebug import AgentDebug, SQLiteTraceStore
76
+ from agentdebug.adapters.crewai import CrewAIBridge
77
+
78
+ debugger = AgentDebug(store=SQLiteTraceStore('.agentdebug/errors.sqlite'))
79
+ trajectory = debugger.start_trace(goal='build a marketing plan', framework='crewai')
80
+
81
+ with CrewAIBridge(debugger, trajectory):
82
+     crew.kickoff(inputs={...})  # standard CrewAI call
83
+
84
+ debugger.finish_trace(trajectory, success=True)
85
+ ```
86
+
87
+ ### What gets recorded
88
+
89
+ | CrewAI event class | AgentDebug `EventType` | Module |
90
+ |---|---|---|
91
+ | `CrewKickoffStartedEvent` | `AGENT_STEP` | planning |
92
+ | `CrewKickoffCompletedEvent` | `OBSERVATION` | planning |
93
+ | `TaskStartedEvent` | `PLAN` | planning |
94
+ | `TaskCompletedEvent` | `OBSERVATION` | planning |
95
+ | `AgentExecutionStartedEvent` | `AGENT_STEP` | planning |
96
+ | `AgentExecutionCompletedEvent` | `OBSERVATION` | reflection |
97
+ | `LLMCallStartedEvent` | `LLM_CALL` | planning |
98
+ | `LLMCallCompletedEvent` | `LLM_RESPONSE` | planning |
99
+ | `ToolUsageStartedEvent` | `TOOL_CALL` | action |
100
+ | `ToolUsageFinishedEvent` | `TOOL_RESULT` | action |
101
+ | `ToolUsageErrorEvent` | `TOOL_RESULT` (with `error`) | action |
102
+
103
+ Event class names not present in the installed CrewAI version are silently
104
+ skipped — useful for forward/backward compatibility.
105
+
106
+ ### Install
107
+
108
+ ```bash
109
+ pip install 'agentdebugx[crewai]'
110
+ ```
111
+
112
+ ### See also
113
+
114
+ `examples/crewai_demo.py` shows a complete two-agent crew (researcher +
115
+ editor) instrumented with `CrewAIBridge`.
116
+
61
117
  ## 2. OpenHands integration
62
118
 
63
119
  `agentdebug.integrations.openhands` ships two complementary pieces.
@@ -233,3 +233,28 @@ Storage/export:
233
233
  4. Run rule, judge, All-at-Once, and DeepDebug on the same split.
234
234
  5. Add a small human/expert review with raw trace vs paired trace view.
235
235
  6. Trim main body to 6 pages while keeping appendix rich.
236
+
237
+ ## 9. Implemented Experiment Harness
238
+
239
+ The repository now contains an `experiments/` directory for the first paper
240
+ pipeline:
241
+
242
+ - `prepare_who_when.py`: downloads the two public Who&When parquet files from
243
+ Hugging Face and normalizes them into `AgentTrajectory` JSONL plus separate
244
+ gold labels. On 2026-05-16 this produced 184 traces and 4,092 events.
245
+ - `run_e2e_smoke.py`: exercises the public AgentDebugX modules end to end:
246
+ analyzer, detectors, traceback, recoverers, attribution, and Error Hub local
247
+ push/pull. It can optionally call a live OpenAI-compatible endpoint.
248
+ - `run_who_when_eval.py`: runs attribution metrics against Who&When labels,
249
+ including responsible-agent match, exact step match, +/-1 step match, and
250
+ joint agent/step accuracy.
251
+ - `generate_paper_figures_openai.py`: generates the optional paper pipeline
252
+ figure with `gpt-image-2` when `OPENAI_API_KEY` is available.
253
+
254
+ The AgentDebug / AgentErrorBench dataset is currently linked from the upstream
255
+ paper repository as a Google Drive folder rather than a stable direct download
256
+ URL. `prepare_agenterrorbench.py` accepts a local extracted folder under
257
+ `data/agenterrorbench/raw/`, can attempt `gdown` download when the optional
258
+ dependency is installed, preserves environment metadata for ALFWorld, GAIA, and
259
+ WebShop, and outputs the same `AgentTrajectory` + labels format used by the
260
+ Who&When loader.
@@ -82,6 +82,26 @@ The audit found one real bug and a handful of test gaps:
82
82
  5. **`recovery.ReflexionSuggestion`** had only an indirect test from DeepDebug
83
83
  examples; now has direct happy + empty tests.
84
84
 
85
+ ## 3.7 Judge hardening (0.2.6)
86
+
87
+ A v0.2.5 Who&When 5-trace live run had `llm_judge_root.agent_match=0.00`
88
+ because the judge truncated mid-array on long multi-agent debate
89
+ transcripts. Three changes in 0.2.6 lifted that to **0.40** on the same
90
+ sample (same model, same traces):
91
+
92
+ 1. `LLMJudgeAnalyzer.max_tokens` default **4096 → 8192** — leaves room for
93
+ thinking-model reasoning tokens before the JSON object starts.
94
+ 2. `LLMJudgeAnalyzer.max_findings_per_chunk` parameter (default 6) — the
95
+ system prompt now asks the model to cap its findings array, forcing it
96
+ to close the JSON even when many candidates are visible.
97
+ 3. System prompt now has explicit "CRITICAL OUTPUT RULES" — output ONLY
98
+ JSON, no markdown fences, no newlines in string values, complete the
99
+ array.
100
+
101
+ Numbers: see [docs/benchmarks/who_when_v0_2_6_leaderboard.md](./benchmarks/who_when_v0_2_6_leaderboard.md).
102
+ The same trick works for `BinarySearchAttributor` (shipped in 0.2.4) — apply it
103
+ to the remaining LLM-using analyzers as more thinking models surface this truncation problem.
104
+
85
105
  ## 3.6 Real-usage E2E (live Gemini)
86
106
 
87
107
  Beyond unit tests, `scripts/e2e_real_usage.py` builds three realistic failing
@@ -142,8 +162,22 @@ Before v0.3 ships, this doc should record green checkmarks for:
142
162
  default verifier templates (JSON-schema guard, final-state check,
143
163
  tool-result type-check, handoff contract, loop-detector guard) and
144
164
  emits per-finding `FixProposal` with rationale + suggested code.
145
- - [ ] One additional framework adapter that goes through the full conformance
146
- suite (CrewAI is the most-requested).
147
- - [ ] HuggingFace Hub round-trip live test (gated on `HF_TOKEN`).
148
- - [ ] Bench harness extended with one published-benchmark loader (Who&When
149
- is the obvious first target we already cite its method).
165
+ - [x] **One additional framework adapter** — CrewAI adapter shipped in
166
+ 0.2.5 (`agentdebug.adapters.crewai`). `CrewAIBridge` context manager
167
+ subscribes to `crewai_event_bus`, translates 11 CrewAI event types
168
+ into `AgentEvent`s. Conformance test mocks the bus and verifies
169
+ every documented event mapping plus the version-skew degradation
170
+ path. `examples/crewai_demo.py` shows a working two-agent crew.
171
+ - [x] **HuggingFace Hub round-trip live test** — shipped in 0.2.6 as
172
+ `tests/test_hub_huggingface_live.py`. Gated on `HF_TOKEN` +
173
+ `AGENTDEBUG_HF_LIVE=1` so it never runs in default CI. Creates the
174
+ dataset repo if missing, pushes a bundle, lists, pulls back, verifies
175
+ the trajectory round-trips bit-for-bit. Live-validated against
176
+ `KunlunZhu/agentdebugx-live-test`.
177
+ - [x] **Bench harness with Who&When loader** — `experiments/prepare_who_when.py`
178
+ ingests 184 Algorithm-Generated + Hand-Crafted traces (4092 events) and
179
+ stores labels separately. `experiments/run_who_when_eval.py` runs all
180
+ 4 attributors + DeepDebug against gold labels; reports agent_match,
181
+ exact_step, near_step. Live-Gemini 5-trace validation captured at
182
+ [docs/benchmarks/who_when_v0_2_6_leaderboard.md](./benchmarks/who_when_v0_2_6_leaderboard.md).
183
+ Headline 184-trace run deferred (~6h / ~$5-10 on a frontier model).
@@ -0,0 +1,365 @@
1
+ # AgentDebugX v0.2.4 End-to-End Real-Usage Smoke
2
+
3
+ Scenarios: **3**. LLM model: `gemini-3-flash`. Generated by `scripts/e2e_real_usage.py`.
4
+
5
+ ## Per-scenario pipeline status
6
+
7
+ | Scenario | trace_id | OK / Total stages | Failed stages |
8
+ |---|---|---|---|
9
+ | `action_format_then_hallucination` | `trace_cde22f…` | 12 / 12 | — |
10
+ | `multiagent_handoff_loss` | `trace_84fb3a…` | 12 / 12 | — |
11
+ | `planning_loop` | `trace_c59dc1…` | 12 / 12 | — |
12
+
13
+ **UI smoke:** ✅ all endpoints responded
14
+
15
+ ```
16
+ GET /healthz -> 200 {"status":"ok"}
17
+ GET /api/v1/traces -> 5 trace(s)
18
+ GET /api/v1/traces/<id> -> 200 events=11 findings=4
19
+ GET /api/v1/taxonomy -> modes=19
20
+ GET / -> 200 content_length=33710 has_brand=True
21
+ ```
22
+
23
+ ## `action_format_then_hallucination`
24
+
25
+ `trace_id=trace_cde22f0eb1ec499aa3bc5b88f472257d`
26
+
27
+ ### ✅ `heuristic_analyzer` (0.00s) — 1 finding(s); root=2
28
+ ### ✅ `cross_event_detectors` (0.00s) — 0 finding(s) from default_detectors()
29
+ ### ✅ `traceback_offline` (0.00s) — rendered
30
+
31
+ ```
32
+ AgentTraceback (root cause first, manifested failure last):
33
+ trace_id=trace_cde22f0eb1ec499aa3bc5b88f472257d framework=e2e-react goal='Find the latest AgentDebug paper, summarize the method, then email alice@example.com'
34
+
35
+ File "root cause", in trajectory
36
+ Step 2 agent=search_web mode=system.tool_execution_error confidence=0.86
37
+ event_id=evt_1eb26b89fce447fa8d9d908b50741ac3
38
+ error> JSON schema validation failed: missing parameter query
39
+ evidence:
40
+ - JSON schema validation failed: missing parameter query
41
+ suggested: Capture tool stderr/status/latency and classify retryable versus non-retryable failures.
42
+
43
+ AgentFailure[system.tool_execution_error]: Likely root cause: Tool execution error in search_web at step 2.
44
+ ```
45
+
46
+ ### ✅ `reflexion_suggestion` (0.00s) — 1 proposal(s)
47
+
48
+ ```
49
+ Reflexion retry hint for system.tool_execution_error at step 2
50
+ ```
51
+
52
+ ### ✅ `critic_recoverer` (0.00s) — 1 verifier proposal(s)
53
+
54
+ ```
55
+ Add tool_result_typecheck before system.tool_execution_error (step 2, agent search_web)
56
+ ```
57
+
58
+ ### ✅ `llm_judge` (14.31s) — 3 finding(s); root=1
59
+
60
+ ```
61
+ - action.parameter_error (conf=1.00) step=1 agent=search_web
62
+ - verification.premature_stop (conf=1.00) step=4 agent=planner
63
+ - verification.missing_task_validation (conf=0.90) step=None agent=system
64
+ ```
65
+
66
+ ### ✅ `attribute_heuristic` (0.00s) — method=heuristic agent=search_web step=1 conf=1.00
67
+
68
+ ```
69
+ Earliest finding with non-trivial confidence: Parameter error
70
+ ```
71
+
72
+ ### ✅ `attribute_all_at_once` (7.52s) — method=all_at_once agent=search_web step=1 conf=0.90
73
+
74
+ ```
75
+ The root cause is the malformed tool call in step 1, where the search_web tool was invoked without the required 'query' parameter, leading to a validation error that derailed the entire process.
76
+ ```
77
+
78
+ ### ✅ `attribute_step_by_step` (23.64s) — method=step_by_step agent=search_web step=1 conf=1.00
79
+
80
+ ```
81
+ The agent invoked the search tool without providing a search query, which is a parameter error that prevents the retrieval of the required information.
82
+ ```
83
+
84
+ ### ✅ `attribute_binary_search` (8.01s) — method=binary_search agent=search_web step=2 conf=0.80
85
+
86
+ ```
87
+ Binary search located the decisive step within 2 probes over 6 events.
88
+ ```
89
+
90
+ ### ✅ `deep_debug` (28.00s) — 3 finding(s); rounds=6
91
+
92
+ ```
93
+ rounds: plan:4588ms / hypothesize:8051ms / verify:h1:2874ms / verify:h2:2392ms / verify:h3:2514ms / refine:7581ms
94
+ summary: The agent failed to provide the required 'query' parameter when calling the search tool, and the planner subsequently ignored this failure, incorrectly claiming the task was complete without summarizing the paper or sending the email.
95
+
96
+ AgentTraceback (root cause first, manifested failure last):
97
+ trace_id=trace_cde22f0eb1ec499aa3bc5b88f472257d framework=e2e-react goal='Find the latest AgentDebug paper, summarize the method, then email alice@example.com'
98
+
99
+ File "root cause", in trajectory
100
+ Step 2 agent=search_web mode=action.parameter_error confidence=1.00
101
+ event_id=evt_1eb26b89fce447fa8d9d908b50741ac3
102
+ error> JSON schema validation failed: missing parameter query
103
+ evidence:
104
+ - JSON schema validation failed: missing parameter query
105
+ - kwargs='{}'
106
+ suggested: Validate parameters against tool schemas and ask for missing user/context fields.
107
+ ↓ cascaded to
108
+ File "cascade depth 1", in trajectory
109
+ Step 4 agent=planner mode=reflection.progress_misjudge confidence=1.00
110
+ module=reflection
111
+ event_id=evt_2b76fd64970544908fdb65eee79bbd9d
112
+ output> Final answer: AgentDebug is a popular paper. Done.
113
+ evidence:
114
+ - Final answer: AgentDebug is a popular paper. Done.
115
+ - meta={'metadata': {'final_answer': True}}
116
+ suggested: Add an external task verifier before termination.
117
+ ↓ cascaded to
118
+ File "cascade depth 2", in trajectory
119
+ Step ? agent=system mode=verification.missing_task_validation confidence=1.00
120
+ event_id=evt_15fe36c6a23247cea25efb4a34fccedf
121
+ evidence:
122
+ - meta={'success': True}
123
+ suggested: Add final-state validation that is independent of the acting agent.
124
+
125
+ AgentFailure[verification.missing_task_validation]: The agent failed to provide the required 'query' parameter when calling the search tool, and the planner subsequently ignored this failure, incorrectly claiming the task was complete without summarizing the paper or sending the email.
126
+ ```
127
+
128
+ ### ✅ `hub_round_trip` (0.00s) — pushed=/home/kunlunz2/AgentDebugX/.agentdebug/e2e_hub/bundle_e79cadee55524420ba4719ecd2934536 ; bundle_id=bundle_e79cadee55524420ba4719ecd2934536 ; listed=1 ; round-trip ok
129
+
130
+ ## `multiagent_handoff_loss`
131
+
132
+ `trace_id=trace_84fb3ac2083c449b8f2ca80503861dc5`
133
+
134
+ ### ✅ `heuristic_analyzer` (0.00s) — 1 finding(s); root=2
135
+ ### ✅ `cross_event_detectors` (0.00s) — 0 finding(s) from default_detectors()
136
+ ### ✅ `traceback_offline` (0.00s) — rendered
137
+
138
+ ```
139
+ AgentTraceback (root cause first, manifested failure last):
140
+ trace_id=trace_84fb3ac2083c449b8f2ca80503861dc5 framework=e2e-multiagent goal='Find the best paper on agent debugging, prefer the most recent.'
141
+
142
+ File "root cause", in trajectory
143
+ Step 2 agent=researcher mode=multiagent.handoff_loss confidence=0.70
144
+ module=multiagent
145
+ event_id=evt_800ad2ca86644768b07990f0af08e1b6
146
+ output> Please summarize the agent debugging paper.
147
+ evidence:
148
+ - handoff/context signal in event payload
149
+ suggested: Make handoff payloads typed and include goal, constraints, evidence, confidence, and open questions.
150
+
151
+ AgentFailure[multiagent.handoff_loss]: Likely root cause: Handoff context loss in researcher at step 2.
152
+ ```
153
+
154
+ ### ✅ `reflexion_suggestion` (0.00s) — 1 proposal(s)
155
+
156
+ ```
157
+ Reflexion retry hint for multiagent.handoff_loss at step 2
158
+ ```
159
+
160
+ ### ✅ `critic_recoverer` (0.00s) — 1 verifier proposal(s)
161
+
162
+ ```
163
+ Add handoff_context_contract before multiagent.handoff_loss (step 2, agent researcher)
164
+ ```
165
+
166
+ ### ✅ `llm_judge` (9.36s) — 2 finding(s); root=2
167
+
168
+ ```
169
+ - multiagent.handoff_loss (conf=1.00) step=2 agent=researcher
170
+ - verification.missing_task_validation (conf=0.90) step=None agent=system
171
+ ```
172
+
173
+ ### ✅ `attribute_heuristic` (0.00s) — method=heuristic agent=researcher step=2 conf=1.00
174
+
175
+ ```
176
+ Earliest finding with non-trivial confidence: Handoff context loss
177
+ ```
178
+
179
+ ### ✅ `attribute_all_at_once` (3.72s) — method=all_at_once agent=researcher step=2 conf=1.00
180
+
181
+ ```
182
+ The researcher correctly identified Paper A as the most recent in step 1 but failed to specify this choice or the recency constraint during the handoff in step 2, leading the summarizer to pick the wrong paper.
183
+ ```
184
+
185
+ ### ✅ `attribute_step_by_step` (21.99s) — method=step_by_step agent=researcher step=2 conf=1.00
186
+
187
+ ```
188
+ The researcher failed to specify which paper to summarize during the handoff, losing the context of the 'most recent' paper identified in the previous step.
189
+ ```
190
+
191
+ ### ✅ `attribute_binary_search` (16.41s) — method=binary_search agent=summarizer step=4 conf=0.90
192
+
193
+ ```
194
+ Binary search located the decisive step within 3 probes over 6 events.
195
+ ```
196
+
197
+ ### ✅ `deep_debug` (45.65s) — 3 finding(s); rounds=6
198
+
199
+ ```
200
+ rounds: plan:4990ms / hypothesize:7110ms / verify:h1:2939ms / verify:h2:2320ms / verify:h3:5377ms / refine:22907ms
201
+ summary: The researcher agent hallucinated finding specific papers without performing a search and subsequently failed to communicate the paper selection or user constraints to the summarizer, leading to a total failure of the summarization step.
202
+
203
+ AgentTraceback (root cause first, manifested failure last):
204
+ trace_id=trace_84fb3ac2083c449b8f2ca80503861dc5 framework=e2e-multiagent goal='Find the best paper on agent debugging, prefer the most recent.'
205
+
206
+ File "root cause", in trajectory
207
+ Step 1 agent=researcher mode=memory.hallucination confidence=0.95
208
+ module=planning
209
+ event_id=evt_a4f22a565af84663bab9f0460b122ea8
210
+ output> Found two candidate papers: A (May 2025) and B (Mar 2024). A is preferred because it is more recent (per user constraint).
211
+ evidence:
212
+ - Found two candidate papers: A (May 2025) and B (Mar 2024)
213
+ - The trajectory shows a premature handoff... without any evidence of search or data retrieval occurring.
214
+ suggested: Require memory reads to cite the source event or artifact before use.
215
+ ↓ cascaded to
216
+ File "cascade depth 1", in trajectory
217
+ Step 2 agent=researcher mode=multiagent.handoff_loss confidence=1.00
218
+ module=multiagent
219
+ event_id=evt_800ad2ca86644768b07990f0af08e1b6
220
+ output> Please summarize the agent debugging paper.
221
+ evidence:
222
+ - Please summarize the agent debugging paper.
223
+ - omitted_context: 'preference for A; recency constraint'
224
+ suggested: Make handoff payloads typed and include goal, constraints, evidence, confidence, and open questions.
225
+ ↓ cascaded to
226
+ File "cascade depth 2", in trajectory
227
+ Step 3 agent=summarizer mode=memory.retrieval_failure confidence=0.90
228
+ module=planning
229
+ event_id=evt_8f20248aa2b64a6dad966bd9f336cc77
230
+ input> Please summarize the agent debugging paper.
231
+ evidence:
232
+ - input=Please summarize the agent debugging paper.
233
+ - output=None
234
+ - omitted_context: 'preference for A; recency constraint'
235
+ suggested: Persist the missing state as structured memory and attach it to the next planning step.
236
+
237
+ AgentFailure[memory.retrieval_failure]: The researcher agent hallucinated finding specific papers without performing a search and subsequently failed to communicate the paper selection or user constraints to the summarizer, leading to a total failure of the summarization step.
238
+ ```
239
+
240
+ ### ✅ `hub_round_trip` (0.00s) — pushed=/home/kunlunz2/AgentDebugX/.agentdebug/e2e_hub/bundle_27e79ad1f5454df4b89f9f7fe75bab3e ; bundle_id=bundle_27e79ad1f5454df4b89f9f7fe75bab3e ; listed=3 ; round-trip ok
241
+
242
+ ## `planning_loop`
243
+
244
+ `trace_id=trace_c59dc17f26994841ad361176ddf6b7c0`
245
+
246
+ ### ✅ `heuristic_analyzer` (0.00s) — 4 finding(s); root=2
247
+ ### ✅ `cross_event_detectors` (0.00s) — 3 finding(s) from default_detectors()
248
+
249
+ ```
250
+ - planning.inefficient_plan (source=repeated_tool_call)
251
+ - planning.inefficient_plan (source=repeated_state)
252
+ - planning.inefficient_plan (source=repeated_state)
253
+ ```
254
+
255
+ ### ✅ `traceback_offline` (0.00s) — rendered
256
+
257
+ ```
258
+ AgentTraceback (root cause first, manifested failure last):
259
+ trace_id=trace_c59dc17f26994841ad361176ddf6b7c0 framework=e2e-browser goal='Submit the checkout form on shop.example.com'
260
+
261
+ File "root cause", in trajectory
262
+ Step 2 agent=browser mode=planning.inefficient_plan confidence=0.67
263
+ event_id=evt_1d189d0f66de4329a6d11f567721eea4
264
+ output> no progress; same checkout screen
265
+ evidence:
266
+ - loop/progress signal in event payload
267
+ suggested: Add loop detection over tool calls and state deltas.
268
+ ↓ cascaded to
269
+ File "cascade depth 1", in trajectory
270
+ Step 4 agent=browser mode=planning.inefficient_plan confidence=0.67
271
+ event_id=evt_d6e7dd127fab458896b58bae5cac1954
272
+ output> no progress; same checkout screen
273
+ evidence:
274
+ - loop/progress signal in event payload
275
+ suggested: Add loop detection over tool calls and state deltas.
276
+ ↓ cascaded to
277
+ File "cascade depth 2", in trajectory
278
+ Step 6 agent=browser mode=planning.inefficient_plan confidence=0.67
279
+ event_id=evt_c92a15e9cc894bffaff61ec5f6169150
280
+ output> no progress; same checkout screen
281
+ evidence:
282
+ - loop/progress signal in event payload
283
+ suggested: Add loop detection over tool calls and state deltas.
284
+ ↓ cascaded to
285
+ File "cascade depth 3", in trajectory
286
+ Step 8 agent=browser mode=planning.inefficient_plan confidence=0.67
287
+ event_id=evt_df07344c3e9e4f7fba9cb3df92a73bfd
288
+ output> no progress; same checkout screen
289
+ evidence:
290
+ - loop/progress signal in event payload
291
+ suggested: Add loop detection over tool calls and state deltas.
292
+
293
+ AgentFailure[planning.inefficient_plan]: Likely root cause: Inefficient plan in browser at step 2.
294
+ ```
295
+
296
+ ### ✅ `reflexion_suggestion` (0.00s) — 4 proposal(s)
297
+
298
+ ```
299
+ Reflexion retry hint for planning.inefficient_plan at step 2
300
+ Reflexion retry hint for planning.inefficient_plan at step 4
301
+ Reflexion retry hint for planning.inefficient_plan at step 6
302
+ Reflexion retry hint for planning.inefficient_plan at step 8
303
+ ```
304
+
305
+ ### ✅ `critic_recoverer` (0.00s) — 4 verifier proposal(s)
306
+
307
+ ```
308
+ Add loop_detector_guard before planning.inefficient_plan (step 2, agent browser)
309
+ Add loop_detector_guard before planning.inefficient_plan (step 4, agent browser)
310
+ Add loop_detector_guard before planning.inefficient_plan (step 6, agent browser)
311
+ Add loop_detector_guard before planning.inefficient_plan (step 8, agent browser)
312
+ ```
313
+
314
+ ### ✅ `llm_judge` (23.10s) — 0 finding(s); root=None
315
+ ### ✅ `attribute_heuristic` (0.00s) — method=heuristic (no hypotheses)
316
+ ### ✅ `attribute_all_at_once` (5.36s) — method=all_at_once agent=planner step=1 conf=0.90
317
+
318
+ ```
319
+ The planner established a flawed strategy of repeatedly clicking the submit button without any logic to handle potential form validation errors or investigate why the submission was failing.
320
+ ```
321
+
322
+ ### ✅ `attribute_step_by_step` (57.37s) — method=step_by_step agent=planner step=1 conf=0.90
323
+
324
+ ```
325
+ The planner's strategy is fundamentally flawed as it attempts to click a submit button without first navigating to the website or filling out the required form fields.
326
+ ```
327
+
328
+ ### ✅ `attribute_binary_search` (24.84s) — method=binary_search agent=browser step=4 conf=0.90
329
+
330
+ ```
331
+ Binary search located the decisive step within 3 probes over 11 events.
332
+ ```
333
+
334
+ ### ✅ `deep_debug` (38.32s) — 2 finding(s); rounds=6
335
+
336
+ ```
337
+ rounds: plan:3303ms / hypothesize:9554ms / verify:h1:3739ms / verify:h2:4204ms / verify:h3:8529ms / refine:8988ms
338
+ summary: The agent failed to submit the checkout form because it repeatedly attempted to click the submit button without ensuring all required fields (such as address or payment details) were populated, ignoring the lack of progress between attempts.
339
+
340
+ AgentTraceback (root cause first, manifested failure last):
341
+ trace_id=trace_c59dc17f26994841ad361176ddf6b7c0 framework=e2e-browser goal='Submit the checkout form on shop.example.com'
342
+
343
+ File "root cause", in trajectory
344
+ Step 7 agent=browser mode=planning.inefficient_plan confidence=0.95
345
+ event_id=evt_0b22e12ef8bb48bdad049706d6b77d33
346
+ input> {'tool': 'click', 'args': '()', 'kwargs': "{'selector': '#submit'}"}
347
+ evidence:
348
+ - step=5 ... click ... #submit
349
+ - step=7 ... click ... #submit
350
+ - output=no progress; same checkout screen
351
+ suggested: Add loop detection over tool calls and state deltas.
352
+ ↓ cascaded to
353
+ File "cascade depth 1", in trajectory
354
+ Step 8 agent=browser mode=planning.constraint_ignorance confidence=1.00
355
+ event_id=evt_df07344c3e9e4f7fba9cb3df92a73bfd
356
+ output> no progress; same checkout screen
357
+ evidence:
358
+ - no progress; same checkout screen
359
+ - Strategy: click #submit until success
360
+ suggested: Compile task and tool constraints into pre-action checks.
361
+
362
+ AgentFailure[planning.constraint_ignorance]: The agent failed to submit the checkout form because it repeatedly attempted to click the submit button without ensuring all required fields (such as address or payment details) were populated, ignoring the lack of progress between attempts.
363
+ ```
364
+
365
+ ### ✅ `hub_round_trip` (0.01s) — pushed=/home/kunlunz2/AgentDebugX/.agentdebug/e2e_hub/bundle_8f2dab82d8e74638ab8656f834d353ab ; bundle_id=bundle_8f2dab82d8e74638ab8656f834d353ab ; listed=4 ; round-trip ok
@@ -0,0 +1,74 @@
1
+ # Who&When — 5-trace Live Leaderboard (v0.2.6, gemini-3-flash)
2
+
3
+ Tiny validation sample drawn from `data/who_when/processed/labels.jsonl`
4
+ (first 5 algorithm-generated traces). **Not a publishable benchmark** — the
5
+ full benchmark requires the 184-trace dataset + a frontier model and is
6
+ deferred for cost reasons. This run exists to verify that the analysis stack
7
+ produces plausible numbers and to surface regressions early.
8
+
9
+ ## Aggregate (per attribution method)
10
+
11
+ | Method | agent_match | exact_step | near_step | both_near | DeepDebug rounds |
12
+ |---|---:|---:|---:|---:|---:|
13
+ | `heuristic` (rule baseline) | 0.20 | 0.00 | **0.20** | **0.20** | n/a |
14
+ | `llm_judge_root` (judge's root_cause field) | **0.40** | 0.00 | **0.20** | **0.20** | n/a |
15
+ | `all_at_once` (Who&When method 1) | 0.20 | 0.00 | 0.00 | 0.00 | n/a |
16
+ | `step_by_step` (Who&When method 2) | **0.40** | 0.00 | **0.20** | **0.20** | n/a |
17
+ | `deep_debug_root` (DeepDebug refined root) | 0.20 | 0.00 | **0.20** | 0.00 | 6 / trace |
18
+
19
+ ## What changed in 0.2.6 vs 0.2.5
20
+
21
+ Same 5 traces, same model:
22
+
23
+ | Method | 0.2.5 agent_match | 0.2.6 agent_match | Δ |
24
+ |---|---:|---:|---:|
25
+ | `heuristic` | 0.20 | 0.20 | — |
26
+ | `llm_judge_root` | 0.00 | **0.40** | +0.40 |
27
+ | `all_at_once` | 0.00 | 0.20 | +0.20 |
28
+ | `step_by_step` | 0.00 | **0.40** | +0.40 |
29
+
30
+ The driver was the v0.2.6 judge prompt hardening: `max_tokens` default
31
+ 4096 → 8192, an explicit `max_findings_per_chunk=6` cap surfaced through
32
+ the system prompt, and a "CRITICAL OUTPUT RULES" header (output ONLY JSON,
33
+ no markdown, no newlines in strings, complete the array). Before the
34
+ hardening, the judge truncated mid-array on Who&When debate transcripts
35
+ and returned no findings; after, the structured root_cause is populated.
36
+
37
+ ## Honest caveats
38
+
39
+ * n=5; the per-method binomial standard error is up to ±0.22 (at p=0.4), so these absolute numbers should
40
+ not be over-interpreted. The 0.4 vs 0.0 jump for two methods is the
41
+ signal worth reporting; everything else is noise.
42
+ * `deep_debug_root` underperformed `step_by_step` on this sample. The
43
+ refine round on 7-event traces tends to converge to the *visible*
44
+ failure rather than the *causal* root (a known Who&When difficulty —
45
+ manifestation vs root cause).
46
+ * No method beats `near_step=0.20` on this sample. Step-localization
47
+ remains hard, consistent with the published Who&When ceiling (~14% step on
48
+ 127 traces with frontier models).
49
+
50
+ ## Reproducing
51
+
52
+ ```bash
53
+ # Prepare data (once)
54
+ PYTHONPATH=src python experiments/prepare_who_when.py
55
+
56
+ # Set live LLM creds (any OpenAI-compatible endpoint works)
57
+ export AGENTDEBUG_LLM_BASE_URL=...
58
+ export AGENTDEBUG_LLM_API_KEY=...
59
+ export AGENTDEBUG_LLM_MODEL=gemini-3-flash
60
+
61
+ # Without DeepDebug (~1 min)
62
+ PYTHONPATH=src python experiments/run_who_when_eval.py \
63
+ --limit 5 --live-openai \
64
+ --out-dir experiments/runs/who_when_eval_subset
65
+
66
+ # With DeepDebug (~5 min)
67
+ PYTHONPATH=src python experiments/run_who_when_eval.py \
68
+ --limit 5 --live-openai --deep \
69
+ --out-dir experiments/runs/who_when_eval_subset_deep
70
+ ```
71
+
72
+ The headline benchmark (184 traces × 5 methods × DeepDebug) would take
73
+ ~6 hours and ~$5-10 in API cost on a frontier model. Run it once before
74
+ paper submission; do not run on every iteration.
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "agentdebugx"
3
- version = "0.2.4"
3
+ version = "0.2.6"
4
4
  description = "Portable error analysis, tracing, and recovery framework for agentic AI systems. Import as `agentdebug`."
5
5
  authors = ["ULab @ UIUC <ulab@illinois.edu>"]
6
6
  license = "MIT"
@@ -52,12 +52,14 @@ httpx = ">=0.24,<1.0"
52
52
  # Optional integrations — install via `pip install agentdebugx[langgraph]` etc.
53
53
  [tool.poetry.extras]
54
54
  langgraph = ["langchain-core"]
55
+ crewai = ["crewai"]
55
56
  otel = ["opentelemetry-api", "opentelemetry-sdk"]
56
57
  ui = ["fastapi", "uvicorn"]
57
58
  hub-hf = ["huggingface_hub"]
58
59
  openhands = ["openhands-ai"]
59
60
  all = [
60
61
  "langchain-core",
62
+ "crewai",
61
63
  "opentelemetry-api",
62
64
  "opentelemetry-sdk",
63
65
  "fastapi",
@@ -96,4 +96,4 @@ __all__ = [
96
96
  'get_failure_mode',
97
97
  ]
98
98
 
99
- __version__ = '0.2.4'
99
+ __version__ = '0.2.6'