agentdebugx 0.2.1__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/PKG-INFO +1 -1
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/14_api_reference.md +26 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/15_roadmap.md +11 -0
- agentdebugx-0.2.2/docs/22_industry_track_paper_eval_plan.md +235 -0
- agentdebugx-0.2.2/docs/23_status_v0_2.md +108 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/README.md +3 -1
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/pyproject.toml +1 -1
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/__init__.py +19 -1
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/attribution.py +172 -3
- agentdebugx-0.2.2/src/agentdebug/detectors.py +284 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/hub/__init__.py +8 -1
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/ui/server.py +12 -11
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/LICENSE +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/README.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/00_overview.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/01_literature_survey.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/02_architecture.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/03_taxonomy.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/04_trace_schema.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/05_adapters.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/06_detectors.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/07_attribution.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/08_recovery.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/09_error_database.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/10_taxonomy_induction.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/11_multimodal.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/12_ui_dashboard.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/13_class_design.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/16_governance.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/17_claude_code_design_patterns.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/18_comparison_codex_vs_design.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/19_error_hub.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/20_deep_debug.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/21_integrations.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/ERROR_TAXONOMY.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/OPEN_SOURCE_DEVELOPMENT_PLAN.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/RESEARCH_SURVEY.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/benchmarks/v0_1_smoke.json +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/benchmarks/v0_1_smoke.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/adapters/__init__.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/adapters/base.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/adapters/langgraph.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/adapters/otel.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/adapters/raw.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/analyzers.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/cli.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/deep.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/events.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/hub/backend_base.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/hub/backends.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/hub/bundle.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/hub/scrub.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/instrumentation.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/integrations/__init__.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/integrations/claude_skill.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/integrations/openhands.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/judges.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/llm.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/models.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/recorder.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/recovery.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/storage.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/taxonomy.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/traceback.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/ui/__init__.py +0 -0
|
@@ -215,6 +215,32 @@ agentdebugx taxonomy export --format yaml|json|md
|
|
|
215
215
|
agentdebugx doctor
|
|
216
216
|
```
|
|
217
217
|
|
|
218
|
+
## 11.1 Current shipped `agentdebug` CLI surface
|
|
219
|
+
|
|
220
|
+
The public design above is the long-term `agentdebugx` contract. The current
|
|
221
|
+
package already ships a smaller but working `agentdebug` CLI:
|
|
222
|
+
|
|
223
|
+
```bash
|
|
224
|
+
agentdebug analyze <trajectory.json> [--suggest] [--traceback]
|
|
225
|
+
agentdebug list --store-sqlite .agentdebug/errors.sqlite
|
|
226
|
+
agentdebug show <trace_id> --store-sqlite .agentdebug/errors.sqlite
|
|
227
|
+
agentdebug judge <trajectory.json|trace_id> --attribute [--traceback]
|
|
228
|
+
agentdebug deep <trajectory.json|trace_id> [--traceback]
|
|
229
|
+
agentdebug hub push <trace_id> --to local:/tmp/hub --store-sqlite ...
|
|
230
|
+
agentdebug hub pull <spec> --bundle <bundle_id> --into .agentdebug/hub_pulls
|
|
231
|
+
agentdebug hub list <spec>
|
|
232
|
+
agentdebug integrations skill --target ~/.claude/skills --name agentdebug
|
|
233
|
+
agentdebug integrations openhands-microagent --target .openhands/microagents
|
|
234
|
+
agentdebug serve --store-sqlite .agentdebug/errors.sqlite --port 7777
|
|
235
|
+
agentdebug doctor
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
`--traceback` renders `AgentTraceback`, a Python-traceback-style cascade view
|
|
239
|
+
implemented by `agentdebug.traceback.format_traceback(report, trajectory)`.
|
|
240
|
+
DeepDebug can provide explicit cascade edges through
|
|
241
|
+
`finding.metadata['cascading_from_event_id']`; heuristic and single-pass judge
|
|
242
|
+
reports fall back to step-index ordering.
|
|
243
|
+
|
|
218
244
|
## 12. Configuration file
|
|
219
245
|
|
|
220
246
|
`~/.agentdebugx/settings.yaml`:
|
|
@@ -28,6 +28,17 @@ Acceptance:
|
|
|
28
28
|
|
|
29
29
|
## v0.2 — Coverage + UI (4 weeks)
|
|
30
30
|
|
|
31
|
+
Already shipped in the v0.2/v0.2.1 line:
|
|
32
|
+
|
|
33
|
+
- Error Hub bundle format + Local/Git/Hugging Face backends.
|
|
34
|
+
- DeepDebug iterative analysis loop.
|
|
35
|
+
- Claude Code Skill generator and OpenHands microagent/EventStream bridge.
|
|
36
|
+
- `AgentTraceback` cascade renderer and CLI `--traceback` support.
|
|
37
|
+
- FastAPI local console with native-trace + error-trace alignment for human
|
|
38
|
+
review.
|
|
39
|
+
|
|
40
|
+
Remaining scope from the original v0.2 plan:
|
|
41
|
+
|
|
31
42
|
- Adapters: CrewAI, OpenHands, smolagents, LlamaIndex, DSPy, Pydantic-AI.
|
|
32
43
|
- Detectors: anomaly family (perplexity, repeated-state, topic-drift).
|
|
33
44
|
- Attribution: `BinarySearchAttributor`, `CounterfactualAttributor`, `EnsembleAttributor`.
|
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
# 22 - EMNLP Industry Track Paper + Evaluation Plan
|
|
2
|
+
|
|
3
|
+
This note translates the EMNLP 2026 Industry Track call and recent ACL/EMNLP
|
|
4
|
+
Industry Track patterns into a concrete writing and evaluation plan for the
|
|
5
|
+
AgentDebugX paper.
|
|
6
|
+
|
|
7
|
+
## 1. What Industry Track Reviewers Reward
|
|
8
|
+
|
|
9
|
+
Industry-track papers are not judged like pure method papers. The recurring
|
|
10
|
+
shape in strong papers is:
|
|
11
|
+
|
|
12
|
+
1. Real deployment pain, not a benchmark-only motivation.
|
|
13
|
+
2. A system that can actually be used by practitioners.
|
|
14
|
+
3. Evaluation under practical constraints: cost, latency, scale, privacy,
|
|
15
|
+
maintainability, human workflow, and failure modes.
|
|
16
|
+
4. Lessons learned that future builders can reuse.
|
|
17
|
+
5. Clear limitations and responsible deployment boundaries.
|
|
18
|
+
|
|
19
|
+
Examples worth emulating:
|
|
20
|
+
|
|
21
|
+
- **Experience Report: Implementing Machine Translation in a Regulated
|
|
22
|
+
Industry** (EMNLP 2025 Industry): emphasizes legal/security constraints,
|
|
23
|
+
human-in-the-loop validation, and reviewer preferences over more than 11k
|
|
24
|
+
ranked translations. Source: https://aclanthology.org/2025.emnlp-industry/
|
|
25
|
+
- **STREAQ** (EMNLP 2025 Industry): frames the contribution as an industrial
|
|
26
|
+
cost-quality routing system and reports both model quality and operational
|
|
27
|
+
cost reduction. Source: https://aclanthology.org/2025.emnlp-industry.121/
|
|
28
|
+
- **RAVEN** (ACL 2025 Industry): combines industrial data, public benchmarks,
|
|
29
|
+
deployment pipeline, and online A/B validation. Source:
|
|
30
|
+
https://aclanthology.org/2025.acl-industry.3/
|
|
31
|
+
- **ARIA** (EMNLP 2025 Industry): uses a realistic deployed domain plus public
|
|
32
|
+
dynamic-knowledge tasks, and states deployment scope.
|
|
33
|
+
- **AutoPenBench** (EMNLP 2025 Industry): releases an open benchmark, reports
|
|
34
|
+
autonomous versus human-assisted agent success, and uses intermediate
|
|
35
|
+
milestones to show where agents struggle. Source:
|
|
36
|
+
https://aclanthology.org/2025.emnlp-industry.114/
|
|
37
|
+
|
|
38
|
+
For AgentDebugX, the paper should therefore be framed as:
|
|
39
|
+
|
|
40
|
+
> A deployment-oriented debugging layer for agentic NLP systems, evaluated on
|
|
41
|
+
> whether it makes failures observable, attributable, shareable, and easier for
|
|
42
|
+
> humans to fix.
|
|
43
|
+
|
|
44
|
+
The central claim should not be "we beat every attributor." A stronger claim
|
|
45
|
+
for the Industry Track is:
|
|
46
|
+
|
|
47
|
+
> AgentDebugX provides the missing operating layer between raw agent traces and
|
|
48
|
+
> actionable debugging workflows: aligned native/error traces, taxonomy-backed
|
|
49
|
+
> reports, Error Hub bundles, and cost-aware analysis profiles.
|
|
50
|
+
|
|
51
|
+
## 2. Appendix Rule
|
|
52
|
+
|
|
53
|
+
The EMNLP 2026 Industry Track permits appendices after the bibliography. The
|
|
54
|
+
appendix does not count against the 6-page review limit, but the main paper
|
|
55
|
+
must be self-contained and reviewers are not required to review appendices.
|
|
56
|
+
Source: https://2026.emnlp.org/calls/industry_track/
|
|
57
|
+
So:
|
|
58
|
+
|
|
59
|
+
- Main body: problem, system, screenshot, concise evaluation table, key
|
|
60
|
+
findings, limitations.
|
|
61
|
+
- Appendix: benchmark matrix, annotation schema, prompts, model settings,
|
|
62
|
+
study protocol, redaction examples, additional error traces.
|
|
63
|
+
|
|
64
|
+
Do not hide the core evaluation logic in the appendix. Put enough in the main
|
|
65
|
+
body that a reviewer can assess technical quality without reading extra pages.
|
|
66
|
+
|
|
67
|
+
## 3. Main-Body Narrative
|
|
68
|
+
|
|
69
|
+
Recommended six-page spine:
|
|
70
|
+
|
|
71
|
+
1. **Introduction**: agent systems fail through cascades; raw traces are not
|
|
72
|
+
enough; teams need who/when/why/fix.
|
|
73
|
+
2. **Deployment Requirements**: low-friction instrumentation, privacy,
|
|
74
|
+
portable schema, cost-aware analysis, human review.
|
|
75
|
+
3. **System**: recorder + schema + taxonomy + detectors/attributors +
|
|
76
|
+
Error Hub + UI.
|
|
77
|
+
4. **Use Case Figure**: paired native trace and AgentDebugX error trace.
|
|
78
|
+
5. **Evaluation**: benchmark coverage, diagnostic accuracy, human utility,
|
|
79
|
+
operational overhead.
|
|
80
|
+
6. **Lessons/Limitations**: where it works, where it does not, safety.
|
|
81
|
+
|
|
82
|
+
## 4. Evaluation Questions
|
|
83
|
+
|
|
84
|
+
### Q1. Coverage
|
|
85
|
+
|
|
86
|
+
Can AgentDebugX ingest different agent classes without bespoke debugging code?
|
|
87
|
+
|
|
88
|
+
Benchmarks:
|
|
89
|
+
|
|
90
|
+
- AgentErrorBench: failure-labeled traces over ALFWorld, GAIA, WebShop.
|
|
91
|
+
- MAST and Who&When: multi-agent attribution and failure taxonomy labels.
|
|
92
|
+
- AgentRx: 115 failed trajectories with critical-step labels.
|
|
93
|
+
- WebShop/WebArena: web navigation and tool-use.
|
|
94
|
+
- tau-bench: retail/airline tool-agent-user interaction.
|
|
95
|
+
- SWE-bench Lite/Verified: coding agents with executable tests.
|
|
96
|
+
- OSWorld: multimodal desktop/GUI agents.
|
|
97
|
+
|
|
98
|
+
Metric: conversion success rate, required adapter LOC, event coverage, artifact
|
|
99
|
+
coverage, and schema loss notes.
|
|
100
|
+
|
|
101
|
+
### Q2. Diagnostic Accuracy
|
|
102
|
+
|
|
103
|
+
Can AgentDebugX classify and localize failures?
|
|
104
|
+
|
|
105
|
+
Labels:
|
|
106
|
+
|
|
107
|
+
- failure family
|
|
108
|
+
- failure mode
|
|
109
|
+
- root event ID
|
|
110
|
+
- root agent
|
|
111
|
+
- root step
|
|
112
|
+
- cascade edges
|
|
113
|
+
- evidence spans
|
|
114
|
+
- accepted repair
|
|
115
|
+
|
|
116
|
+
Metrics:
|
|
117
|
+
|
|
118
|
+
- family macro-F1
|
|
119
|
+
- mode macro-F1
|
|
120
|
+
- responsible-agent accuracy
|
|
121
|
+
- root-step exact match
|
|
122
|
+
- root-step +/- 1 match
|
|
123
|
+
- cascade-edge F1
|
|
124
|
+
- false-positive rate on successful traces
|
|
125
|
+
- calibration: confidence vs correctness
|
|
126
|
+
|
|
127
|
+
Baselines:
|
|
128
|
+
|
|
129
|
+
- rule analyzer
|
|
130
|
+
- single-pass LLM judge
|
|
131
|
+
- All-at-Once attribution
|
|
132
|
+
- Step-by-Step attribution when implemented
|
|
133
|
+
- DeepDebug verify/refine loop
|
|
134
|
+
- benchmark-native labels or published baselines where available
|
|
135
|
+
|
|
136
|
+
### Q3. Human Utility
|
|
137
|
+
|
|
138
|
+
Does the paired trace view reduce debugging effort?
|
|
139
|
+
|
|
140
|
+
Study design:
|
|
141
|
+
|
|
142
|
+
- 12-24 developers.
|
|
143
|
+
- Within-subject comparison: raw framework trace/logs vs AgentDebugX report.
|
|
144
|
+
- 24-48 total debugging sessions.
|
|
145
|
+
- Counterbalance task order and UI order.
|
|
146
|
+
|
|
147
|
+
Metrics:
|
|
148
|
+
|
|
149
|
+
- time to first plausible root cause
|
|
150
|
+
- time to accepted repair
|
|
151
|
+
- correctness against adjudicated labels
|
|
152
|
+
- number of trace events inspected
|
|
153
|
+
- confidence and workload rating
|
|
154
|
+
- free-text feedback on missing evidence
|
|
155
|
+
|
|
156
|
+
Fallback if recruiting slips:
|
|
157
|
+
|
|
158
|
+
- 3-5 expert agent builders review 30 traces.
|
|
159
|
+
- Ask them to choose between raw trace and AgentDebugX report, rate usefulness,
|
|
160
|
+
and mark incorrect/misleading diagnoses.
|
|
161
|
+
|
|
162
|
+
### Q4. Operational Viability
|
|
163
|
+
|
|
164
|
+
Can teams run this in real workflows?
|
|
165
|
+
|
|
166
|
+
Metrics:
|
|
167
|
+
|
|
168
|
+
- analyzer latency by profile: rule, judge, DeepDebug
|
|
169
|
+
- token cost by trace length
|
|
170
|
+
- local storage overhead
|
|
171
|
+
- UI load time for 100, 1k, 10k events
|
|
172
|
+
- scrubber redaction hit rate
|
|
173
|
+
- scrubber false positives on benign strings
|
|
174
|
+
- Error Hub bundle size and push/pull time
|
|
175
|
+
|
|
176
|
+
## 5. Data Scale
|
|
177
|
+
|
|
178
|
+
Minimum credible submission target:
|
|
179
|
+
|
|
180
|
+
- 500 failed trajectories.
|
|
181
|
+
- 100 successful trajectories for false-positive calibration.
|
|
182
|
+
- At least 30 examples per high-level family where source benchmarks permit.
|
|
183
|
+
- Two annotators per newly labeled trace plus adjudication.
|
|
184
|
+
- DeepDebug on a stratified hard subset of 100 traces.
|
|
185
|
+
|
|
186
|
+
Stronger target:
|
|
187
|
+
|
|
188
|
+
- 1,000 failed trajectories.
|
|
189
|
+
- 200 successful trajectories.
|
|
190
|
+
- DeepDebug on every trace where rule and single-pass judge disagree.
|
|
191
|
+
- 50-100 private pilot traces, scrubbed and reported only in aggregate.
|
|
192
|
+
|
|
193
|
+
## 6. API and Infra Needed
|
|
194
|
+
|
|
195
|
+
Model APIs:
|
|
196
|
+
|
|
197
|
+
- OpenAI-compatible endpoint as the default abstraction.
|
|
198
|
+
- OpenAI, Gemini, Anthropic-through-proxy/LiteLLM, and local vLLM/Ollama where
|
|
199
|
+
feasible.
|
|
200
|
+
|
|
201
|
+
Benchmark APIs:
|
|
202
|
+
|
|
203
|
+
- WebShop/ALFWorld/GAIA loaders for AgentErrorBench.
|
|
204
|
+
- MAST/Who&When/AgentRx importers preserving existing labels.
|
|
205
|
+
- tau-bench user/tool simulator wrapper.
|
|
206
|
+
- WebArena browser harness with DOM/text/action capture.
|
|
207
|
+
- SWE-bench Docker harness with shell, patch, and test-output capture.
|
|
208
|
+
- OSWorld capture path for screenshot, accessibility tree, click/action, and
|
|
209
|
+
verifier result.
|
|
210
|
+
|
|
211
|
+
Storage/export:
|
|
212
|
+
|
|
213
|
+
- SQLite for local experiments.
|
|
214
|
+
- Error Hub bundles for sharing.
|
|
215
|
+
- Parquet manifest roll-up for large result analysis.
|
|
216
|
+
|
|
217
|
+
## 7. What Should Go in the Appendix
|
|
218
|
+
|
|
219
|
+
- Full benchmark matrix.
|
|
220
|
+
- Exact label schema and examples.
|
|
221
|
+
- Prompts for LLM judge, attributor, and DeepDebug.
|
|
222
|
+
- Model settings and token budgets.
|
|
223
|
+
- Human study instructions and consent/safety notes.
|
|
224
|
+
- Redaction examples.
|
|
225
|
+
- Two or three full trace/report examples.
|
|
226
|
+
- Failure cases where AgentDebugX is wrong.
|
|
227
|
+
|
|
228
|
+
## 8. Immediate TODO Before Submission
|
|
229
|
+
|
|
230
|
+
1. Convert at least two public benchmark sources into AgentTrajectory.
|
|
231
|
+
2. Produce a first 100-trace labeled set.
|
|
232
|
+
3. Add an evaluation runner that outputs a single CSV/JSONL.
|
|
233
|
+
4. Run rule, judge, All-at-Once, and DeepDebug on the same split.
|
|
234
|
+
5. Add a small human/expert review with raw trace vs paired trace view.
|
|
235
|
+
6. Trim main body to 6 pages while keeping appendix rich.
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
# 23 — Capability + Test Coverage Status (v0.2.2)
|
|
2
|
+
|
|
3
|
+
A live audit of what's implemented, what's tested, and what's specced but
|
|
4
|
+
not yet built. Pair this with [docs/15_roadmap.md](./15_roadmap.md), which is
|
|
5
|
+
the forward-looking plan; this doc is the rear-view mirror.
|
|
6
|
+
|
|
7
|
+
## 1. What ships in v0.2.2 (live on PyPI)
|
|
8
|
+
|
|
9
|
+
| Layer | Module | Status | Tests |
|
|
10
|
+
|---|---|---|---|
|
|
11
|
+
| Trace IR | `agentdebug.models` | ✅ stable | round-trip + enum tests |
|
|
12
|
+
| Storage | `agentdebug.storage` (JSONL + SQLite) | ✅ stable | round-trip + ctx-mgr |
|
|
13
|
+
| Recorder | `agentdebug.recorder` (`AgentDebug`, `TraceSession`) | ✅ stable | record + analyze flow |
|
|
14
|
+
| Rule analyzer | `agentdebug.analyzers.HeuristicAnalyzer` | ✅ stable | match + suggest |
|
|
15
|
+
| Taxonomy | `agentdebug.taxonomy` (19 seed modes) | ✅ stable | get_mode + list |
|
|
16
|
+
| Function instrumentation | `agentdebug.instrumentation.traced_tool` | ✅ stable | happy + raise |
|
|
17
|
+
| Event bus | `agentdebug.events.EventBus` | ✅ stable | fan-out + auto-detach |
|
|
18
|
+
| LLM client | `agentdebug.llm.OpenAICompatClient` | ✅ stable | mocked httpx + env |
|
|
19
|
+
| LLM judge | `agentdebug.judges.LLMJudgeAnalyzer` | ✅ stable | scripted-LLM happy + silent |
|
|
20
|
+
| Attribution | `agentdebug.attribution.HeuristicAttributor` | ✅ stable | first-finding + tiebreak |
|
|
21
|
+
| Attribution | `agentdebug.attribution.AllAtOnceAttributor` | ✅ stable | mocked LLM + fallback |
|
|
22
|
+
| Attribution | `agentdebug.attribution.StepByStepAttributor` | ✅ **new 0.2.2** | scripted-LLM + fallback |
|
|
23
|
+
| Recovery | `agentdebug.recovery.ReflexionSuggestion` | ✅ stable | per-finding + empty |
|
|
24
|
+
| DeepDebug | `agentdebug.deep.DeepDebugAnalyzer` | ✅ stable | full loop + silent LLM |
|
|
25
|
+
| Cascade view | `agentdebug.traceback.format_traceback` | ✅ stable | cascade + step-order + ANSI + empty |
|
|
26
|
+
| Detectors | `agentdebug.detectors` (`RepeatedToolCallDetector`, `RepeatedStateDetector`, `StepCountLimitDetector`) | ✅ **new 0.2.2** | threshold + window + budget |
|
|
27
|
+
| Hub bundle | `agentdebug.hub.Bundle / pack_bundle / unpack_bundle` | ✅ stable | round-trip |
|
|
28
|
+
| Hub scrubber | `agentdebug.hub.Scrubber` | ✅ stable | 12 redactions + idempotent |
|
|
29
|
+
| Hub backends | `LocalHubBackend`, `GitHubBackend`, `HuggingFaceBackend` | ✅ stable | local-bare-git + local |
|
|
30
|
+
| Adapters | `agentdebug.adapters.raw` (`trace_loop`, `mark_step`) | ✅ stable | end-to-end + ctxvar |
|
|
31
|
+
| Adapters | `agentdebug.adapters.langgraph.LangChainCallbackAdapter` | ✅ stable | gracefully degrades w/o dep |
|
|
32
|
+
| Adapters | `agentdebug.adapters.otel.OTelExportAdapter` | ✅ stable | branch test |
|
|
33
|
+
| Integrations | `agentdebug.integrations.claude_skill` | ✅ stable | skill-bundle write |
|
|
34
|
+
| Integrations | `agentdebug.integrations.openhands` (microagent + bridge) | ⚠️ microagent stable; bridge needs live OpenHands | microagent YAML test |
|
|
35
|
+
| CLI | `agentdebug.cli` (`analyze \| judge \| deep \| list \| show \| hub \| integrations \| serve \| doctor`) | ✅ stable | 12 subcommand smoke tests |
|
|
36
|
+
| Local UI | `agentdebug.ui` (FastAPI + vanilla JS console) | ✅ stable | endpoint round-trip |
|
|
37
|
+
|
|
38
|
+
**Test counts:** 60+ unit tests + 1 live-LLM smoke test, `mypy --strict` clean
|
|
39
|
+
across 32 source files.
|
|
40
|
+
|
|
41
|
+
## 2. Designed in docs, not yet implemented
|
|
42
|
+
|
|
43
|
+
| Doc | Component | Why deferred | Realistic ship |
|
|
44
|
+
|---|---|---|---|
|
|
45
|
+
| [06_detectors.md](./06_detectors.md) | `trajectory_perplexity` (TrajAD) | needs token-level LM perplexity API or embedding model + baseline calibration | v0.3 |
|
|
46
|
+
| [06_detectors.md](./06_detectors.md) | `topic_drift` (embedding cosine) | needs embedding client; consider reusing `OpenAICompatClient` `/embeddings` | v0.3 |
|
|
47
|
+
| [06_detectors.md](./06_detectors.md) | LTL spec monitors | requires user-supplied spec or LLM-synthesized monitors; gated on RV research | v1.2 |
|
|
48
|
+
| [07_attribution.md](./07_attribution.md) | `BinarySearchAttributor` (ddmin) | requires replayable environment; few frameworks expose it | v0.3 |
|
|
49
|
+
| [07_attribution.md](./07_attribution.md) | `CounterfactualAttributor` | requires re-rolling agent actions; same replay constraint | v0.3 |
|
|
50
|
+
| [07_attribution.md](./07_attribution.md) | `SBFLAttributor` (Tarantula/Ochiai) | needs corpus of passing + failing traces of same task; gated on Hub adoption | v0.4 |
|
|
51
|
+
| [07_attribution.md](./07_attribution.md) | `DeltaDebugAttributor` (Zeller) | same replay constraint | v0.3 |
|
|
52
|
+
| [07_attribution.md](./07_attribution.md) | `EnsembleAttributor` | trivial once 2+ heavy backends ship; awaits BinarySearch/Counterfactual | v0.3 |
|
|
53
|
+
| [08_recovery.md](./08_recovery.md) | `SelfRefineLoop` | small but needs a generator-critic-refiner orchestration | v0.3 |
|
|
54
|
+
| [08_recovery.md](./08_recovery.md) | `CriticRecoverer` | needs a verifier registry (search, code-exec, type-check) | v0.3 |
|
|
55
|
+
| [08_recovery.md](./08_recovery.md) | `AutoManualRules` | needs persistent project manual + injection into next-run prompts | v0.3 |
|
|
56
|
+
| [08_recovery.md](./08_recovery.md) | `LangGraphRewind` | depends on LangGraph checkpointer; ships when we have a real LangGraph user | v0.3 |
|
|
57
|
+
| [08_recovery.md](./08_recovery.md) | `SagaRollback` | needs compensation registry on tool definitions; new schema | v0.3 |
|
|
58
|
+
| [08_recovery.md](./08_recovery.md) | `MCTSBranchExploration` (LATS) | heavy; v2 feature | v2.0 |
|
|
59
|
+
| [09_error_database.md](./09_error_database.md) | DuckDB analytical + Parquet archive | optional; Hub bundles already give per-project corpus | v0.3 |
|
|
60
|
+
| [09_error_database.md](./09_error_database.md) | Vector similarity search | needs embedding model + index choice | v0.3 |
|
|
61
|
+
| [10_taxonomy_induction.md](./10_taxonomy_induction.md) | TnT-LLM + BERTopic pipeline | needs ≥ 1k labeled traces to be useful | v0.4 |
|
|
62
|
+
| [11_multimodal.md](./11_multimodal.md) | Screenshot/DOM capture, VLM judge | gated on multimodal user (Claude Computer Use / OpenAI CUA / OpenHands browser) | v1.1 |
|
|
63
|
+
| [12_ui_dashboard.md](./12_ui_dashboard.md) | TUI (Textual) | low priority; CLI + web UI cover the use cases | v0.4 |
|
|
64
|
+
| [12_ui_dashboard.md](./12_ui_dashboard.md) | VSCode extension | needs TS extension scaffolding | v1.0 |
|
|
65
|
+
| [05_adapters.md](./05_adapters.md) | CrewAI, OpenAI Agents SDK, AutoGen, smolagents, LlamaIndex, DSPy, Pydantic-AI | each is ~150 LOC + conformance test; ship as users land | rolling |
|
|
66
|
+
|
|
67
|
+
## 3. Implementation gaps surfaced by the audit
|
|
68
|
+
|
|
69
|
+
The audit found one real bug and a handful of test gaps:
|
|
70
|
+
|
|
71
|
+
1. **`agentdebug.hub.build_manifest` was used by the CLI but not re-exported** —
|
|
72
|
+
would have surfaced as `ImportError` for any user calling
|
|
73
|
+
`agentdebug hub push`. Fixed in 0.2.2 (`hub/__init__.py`) and locked in by a
|
|
74
|
+
CLI smoke test.
|
|
75
|
+
2. **`cli.py` had 0% coverage** — every subcommand now has a smoke test that
|
|
76
|
+
exercises the argparse path; the LLM-required commands assert the
|
|
77
|
+
"missing credentials" exit code without hitting the network.
|
|
78
|
+
3. **`instrumentation.py` (`traced_tool`) had 0% coverage** — happy and
|
|
79
|
+
exception paths now tested.
|
|
80
|
+
4. **`llm.OpenAICompatClient.complete` had no test** — covered by a custom
|
|
81
|
+
`httpx.BaseTransport` that returns canned JSON without a network call.
|
|
82
|
+
5. **`recovery.ReflexionSuggestion`** had only an indirect test from DeepDebug
|
|
83
|
+
examples; now has direct happy + empty tests.
|
|
84
|
+
|
|
85
|
+
## 4. Coverage matrix (post-0.2.2)
|
|
86
|
+
|
|
87
|
+
Run `PYTHONPATH=src pytest --cov=agentdebug --cov-report=term`. The two largest
|
|
88
|
+
remaining gaps are deliberate:
|
|
89
|
+
|
|
90
|
+
- `agentdebug.adapters.langgraph` — exercised only when `langchain_core` is
|
|
91
|
+
installed. The status-test verifies graceful degradation when it isn't.
|
|
92
|
+
- `agentdebug.hub.backends.HuggingFaceBackend` — gated on `huggingface_hub`.
|
|
93
|
+
Round-tripping through real HF requires `HF_TOKEN`; covered by the local
|
|
94
|
+
bare-git test for the analogous push/pull flow.
|
|
95
|
+
|
|
96
|
+
## 5. Acceptance gates for v0.3 (next minor)
|
|
97
|
+
|
|
98
|
+
Before v0.3 ships, this doc should record green checkmarks for:
|
|
99
|
+
|
|
100
|
+
- [ ] One replayable counterfactual attributor (`BinarySearchAttributor` is
|
|
101
|
+
the cheapest entry).
|
|
102
|
+
- [ ] One tool-grounded recovery strategy (`CriticRecoverer`) wired against
|
|
103
|
+
a `Verifier` Protocol.
|
|
104
|
+
- [ ] One additional framework adapter that goes through the full conformance
|
|
105
|
+
suite (CrewAI is the most-requested).
|
|
106
|
+
- [ ] HuggingFace Hub round-trip live test (gated on `HF_TOKEN`).
|
|
107
|
+
- [ ] Bench harness extended with one published-benchmark loader (Who&When
|
|
108
|
+
is the obvious first target — we already cite its method).
|
|
@@ -31,6 +31,8 @@ This `docs/` directory contains the full design specification.
|
|
|
31
31
|
| 19 | [19_error_hub.md](./19_error_hub.md) | **Error Hub** — bundle format, Local / Git / HF backends, scrubbing |
|
|
32
32
|
| 20 | [20_deep_debug.md](./20_deep_debug.md) | **DeepDebug** — iterative multi-turn analysis (plan → hypothesize → verify → refine) |
|
|
33
33
|
| 21 | [21_integrations.md](./21_integrations.md) | **Claude Code Skill** + **OpenHands** microagent + EventStream bridge |
|
|
34
|
+
| 22 | [22_industry_track_paper_eval_plan.md](./22_industry_track_paper_eval_plan.md) | EMNLP Industry Track writing strategy + benchmark / human-study evaluation plan |
|
|
35
|
+
| 23 | [23_status_v0_2.md](./23_status_v0_2.md) | **Capability + test coverage status (v0.2.2)** — what's implemented, what's tested, what's specced but not built |
|
|
34
36
|
|
|
35
37
|
Plus three **narrative** docs that pre-dated this engineering spec and are kept for paper-style framing:
|
|
36
38
|
|
|
@@ -45,7 +47,7 @@ Plus three **narrative** docs that pre-dated this engineering spec and are kept
|
|
|
45
47
|
## How to read this
|
|
46
48
|
|
|
47
49
|
- **First-time reader:** start with [00_overview.md](./00_overview.md), then [02_architecture.md](./02_architecture.md), then [14_api_reference.md](./14_api_reference.md).
|
|
48
|
-
- **Researcher / paper author:** read [01_literature_survey.md](./01_literature_survey.md), [03_taxonomy.md](./03_taxonomy.md), [07_attribution.md](./07_attribution.md), [10_taxonomy_induction.md](./10_taxonomy_induction.md).
|
|
50
|
+
- **Researcher / paper author:** read [01_literature_survey.md](./01_literature_survey.md), [03_taxonomy.md](./03_taxonomy.md), [07_attribution.md](./07_attribution.md), [10_taxonomy_induction.md](./10_taxonomy_induction.md), and [22_industry_track_paper_eval_plan.md](./22_industry_track_paper_eval_plan.md).
|
|
49
51
|
- **Framework integrator:** read [04_trace_schema.md](./04_trace_schema.md), [05_adapters.md](./05_adapters.md), [13_class_design.md](./13_class_design.md).
|
|
50
52
|
- **UI / product:** read [12_ui_dashboard.md](./12_ui_dashboard.md), [09_error_database.md](./09_error_database.md).
|
|
51
53
|
- **Runtime / agent UX designer:** read [17_claude_code_design_patterns.md](./17_claude_code_design_patterns.md), then [02_architecture.md](./02_architecture.md), [08_recovery.md](./08_recovery.md), and [14_api_reference.md](./14_api_reference.md).
|
|
@@ -15,6 +15,16 @@ from agentdebug.attribution import (
|
|
|
15
15
|
Attributor,
|
|
16
16
|
Blame,
|
|
17
17
|
HeuristicAttributor,
|
|
18
|
+
StepByStepAttributor,
|
|
19
|
+
)
|
|
20
|
+
from agentdebug.detectors import (
|
|
21
|
+
Detector,
|
|
22
|
+
DetectorConfig,
|
|
23
|
+
RepeatedStateDetector,
|
|
24
|
+
RepeatedToolCallDetector,
|
|
25
|
+
StepCountLimitDetector,
|
|
26
|
+
default_detectors,
|
|
27
|
+
run_detectors,
|
|
18
28
|
)
|
|
19
29
|
from agentdebug.events import DEFAULT_BUS, BusEvent, EventBus, EventSubscription
|
|
20
30
|
from agentdebug.models import (
|
|
@@ -44,8 +54,16 @@ __all__ = [
|
|
|
44
54
|
'Blame',
|
|
45
55
|
'BusEvent',
|
|
46
56
|
'CascadeFrame',
|
|
57
|
+
'Detector',
|
|
58
|
+
'DetectorConfig',
|
|
59
|
+
'RepeatedStateDetector',
|
|
60
|
+
'RepeatedToolCallDetector',
|
|
61
|
+
'StepByStepAttributor',
|
|
62
|
+
'StepCountLimitDetector',
|
|
47
63
|
'build_cascade',
|
|
64
|
+
'default_detectors',
|
|
48
65
|
'format_traceback',
|
|
66
|
+
'run_detectors',
|
|
49
67
|
'DEFAULT_BUS',
|
|
50
68
|
'DiagnosticReport',
|
|
51
69
|
'EventBus',
|
|
@@ -66,4 +84,4 @@ __all__ = [
|
|
|
66
84
|
'get_failure_mode',
|
|
67
85
|
]
|
|
68
86
|
|
|
69
|
-
__version__ = '0.2.1'
|
|
87
|
+
__version__ = '0.2.2'
|
|
@@ -19,10 +19,10 @@ from __future__ import annotations
|
|
|
19
19
|
|
|
20
20
|
import logging
|
|
21
21
|
from dataclasses import dataclass, field
|
|
22
|
-
from typing import Any, Dict, List, Optional, Protocol
|
|
22
|
+
from typing import Any, Dict, List, Optional, Protocol, cast
|
|
23
23
|
|
|
24
24
|
from agentdebug.llm import LLMClient, extract_json_block
|
|
25
|
-
from agentdebug.models import AgentTrajectory, FailureFinding, new_id
|
|
25
|
+
from agentdebug.models import AgentEvent, AgentTrajectory, FailureFinding, new_id
|
|
26
26
|
|
|
27
27
|
LOG = logging.getLogger('agentdebug.attribution')
|
|
28
28
|
|
|
@@ -227,4 +227,173 @@ class AllAtOnceAttributor:
|
|
|
227
227
|
return [str(value)]
|
|
228
228
|
|
|
229
229
|
|
|
230
|
-
|
|
230
|
+
_STEP_SYSTEM_PROMPT = """You are AgentDebugX-Attributor, scanning a failed
|
|
231
|
+
agent trajectory one step at a time (the Who&When "Step-by-Step" method,
|
|
232
|
+
arXiv:2505.00212).
|
|
233
|
+
|
|
234
|
+
You will be given the goal, framework, the prior judge findings, AND a
|
|
235
|
+
SINGLE step from the trajectory plus a short window of preceding steps for
|
|
236
|
+
context. Decide whether THIS step is the decisive failure step.
|
|
237
|
+
|
|
238
|
+
Respond ONLY with a JSON object matching this schema (no prose, no markdown):
|
|
239
|
+
|
|
240
|
+
{
|
|
241
|
+
"is_failure_step": true | false,
|
|
242
|
+
"confidence": <float in [0,1]>,
|
|
243
|
+
"rationale": "<one sentence>",
|
|
244
|
+
"evidence": ["<short quoted evidence>", ...]
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
Be CONSERVATIVE: only return true when the trajectory evidence on this step
|
|
248
|
+
specifically caused the cascading failure.
|
|
249
|
+
"""
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
class StepByStepAttributor:
|
|
253
|
+
"""LLM-based attributor mirroring Who&When's Step-by-Step method.
|
|
254
|
+
|
|
255
|
+
Walks the trajectory in order, asking the LLM about each step. Returns
|
|
256
|
+
every step that answered ``is_failure_step=true`` as a Blame hypothesis,
|
|
257
|
+
sorted by step index. Costs O(N) LLM calls; pair with ``max_steps`` to
|
|
258
|
+
bound the budget for long trajectories.
|
|
259
|
+
"""
|
|
260
|
+
|
|
261
|
+
id = 'step_by_step'
|
|
262
|
+
|
|
263
|
+
def __init__(
|
|
264
|
+
self,
|
|
265
|
+
llm: LLMClient,
|
|
266
|
+
*,
|
|
267
|
+
fallback: Optional[Attributor] = None,
|
|
268
|
+
max_steps: int = 30,
|
|
269
|
+
context_window: int = 3,
|
|
270
|
+
max_tokens: int = 1024,
|
|
271
|
+
) -> None:
|
|
272
|
+
self.llm = llm
|
|
273
|
+
self.fallback: Attributor = fallback or HeuristicAttributor()
|
|
274
|
+
self.max_steps = max_steps
|
|
275
|
+
self.context_window = context_window
|
|
276
|
+
self.max_tokens = max_tokens
|
|
277
|
+
|
|
278
|
+
def attribute(
|
|
279
|
+
self,
|
|
280
|
+
trajectory: AgentTrajectory,
|
|
281
|
+
findings: List[FailureFinding],
|
|
282
|
+
) -> AttributionResult:
|
|
283
|
+
events = list(trajectory.events)
|
|
284
|
+
if not events:
|
|
285
|
+
return self.fallback.attribute(trajectory, findings)
|
|
286
|
+
# Scan only the suffix of size max_steps so long traces stay affordable.
|
|
287
|
+
budget = min(self.max_steps, len(events))
|
|
288
|
+
scanned = events[-budget:]
|
|
289
|
+
hypotheses: List[Blame] = []
|
|
290
|
+
for idx, evt in enumerate(scanned):
|
|
291
|
+
absolute_index = len(events) - budget + idx
|
|
292
|
+
ctx_start = max(0, absolute_index - self.context_window)
|
|
293
|
+
context = events[ctx_start:absolute_index]
|
|
294
|
+
verdict = self._classify_step(
|
|
295
|
+
trajectory, findings, evt, context=context
|
|
296
|
+
)
|
|
297
|
+
if verdict is None or not verdict.get('is_failure_step'):
|
|
298
|
+
continue
|
|
299
|
+
hypotheses.append(Blame(
|
|
300
|
+
span_id=evt.event_id,
|
|
301
|
+
step_index=evt.step_index,
|
|
302
|
+
agent_name=evt.agent_name,
|
|
303
|
+
confidence=self._coerce_float(verdict.get('confidence'), 0.5),
|
|
304
|
+
rationale=str(verdict.get('rationale') or ''),
|
|
305
|
+
evidence=self._coerce_str_list(verdict.get('evidence')),
|
|
306
|
+
sources=[self.id],
|
|
307
|
+
))
|
|
308
|
+
if not hypotheses:
|
|
309
|
+
return AttributionResult(
|
|
310
|
+
method=self.id,
|
|
311
|
+
hypotheses=[],
|
|
312
|
+
raw={'scanned_steps': len(scanned)},
|
|
313
|
+
)
|
|
314
|
+
hypotheses.sort(
|
|
315
|
+
key=lambda h: (
|
|
316
|
+
h.step_index is None,
|
|
317
|
+
h.step_index if h.step_index is not None else 10**9,
|
|
318
|
+
)
|
|
319
|
+
)
|
|
320
|
+
return AttributionResult(
|
|
321
|
+
method=self.id,
|
|
322
|
+
hypotheses=hypotheses,
|
|
323
|
+
raw={'scanned_steps': len(scanned)},
|
|
324
|
+
)
|
|
325
|
+
|
|
326
|
+
def _classify_step(
|
|
327
|
+
self,
|
|
328
|
+
trajectory: AgentTrajectory,
|
|
329
|
+
findings: List[FailureFinding],
|
|
330
|
+
event: 'AgentEvent',
|
|
331
|
+
*,
|
|
332
|
+
context: List['AgentEvent'],
|
|
333
|
+
) -> Optional[Dict[str, Any]]:
|
|
334
|
+
findings_doc = '\n'.join(self._render_finding(f) for f in findings[:10]) \
|
|
335
|
+
or '(no judge findings yet)'
|
|
336
|
+
context_doc = '\n'.join(
|
|
337
|
+
f'context event_id={e.event_id} step={e.step_index} agent={e.agent_name}'
|
|
338
|
+
f' type={getattr(e.event_type, "value", e.event_type)}'
|
|
339
|
+
f' output={str(e.output)[:120]} error={str(e.error)[:120]}'
|
|
340
|
+
for e in context
|
|
341
|
+
) or '(no preceding context)'
|
|
342
|
+
prompt = (
|
|
343
|
+
f'GOAL: {trajectory.goal!r}\n'
|
|
344
|
+
f'FRAMEWORK: {trajectory.framework!r}\n\n'
|
|
345
|
+
f'JUDGE FINDINGS:\n{findings_doc}\n\n'
|
|
346
|
+
f'PRECEDING CONTEXT:\n{context_doc}\n\n'
|
|
347
|
+
f'CANDIDATE STEP:\n'
|
|
348
|
+
f' event_id={event.event_id}\n'
|
|
349
|
+
f' step={event.step_index} agent={event.agent_name} '
|
|
350
|
+
f'module={event.module}\n'
|
|
351
|
+
f' type={getattr(event.event_type, "value", event.event_type)}\n'
|
|
352
|
+
f' input={str(event.input)[:300]}\n'
|
|
353
|
+
f' output={str(event.output)[:300]}\n'
|
|
354
|
+
f' error={str(event.error)[:300]}\n'
|
|
355
|
+
)
|
|
356
|
+
try:
|
|
357
|
+
result = self.llm.complete(
|
|
358
|
+
messages=[
|
|
359
|
+
{'role': 'system', 'content': _STEP_SYSTEM_PROMPT},
|
|
360
|
+
{'role': 'user', 'content': prompt},
|
|
361
|
+
],
|
|
362
|
+
max_tokens=self.max_tokens,
|
|
363
|
+
)
|
|
364
|
+
except Exception as exc: # pragma: no cover
|
|
365
|
+
LOG.warning('step_by_step LLM call failed at step %s: %s',
|
|
366
|
+
event.step_index, exc)
|
|
367
|
+
return None
|
|
368
|
+
parsed = extract_json_block(result.text)
|
|
369
|
+
if parsed is None:
|
|
370
|
+
return None
|
|
371
|
+
return cast(Dict[str, Any], parsed)
|
|
372
|
+
|
|
373
|
+
def _render_finding(self, finding: FailureFinding) -> str:
|
|
374
|
+
return (
|
|
375
|
+
f'- mode={finding.failure_mode.mode_id} '
|
|
376
|
+
f'agent={finding.agent_name} step={finding.step_index} '
|
|
377
|
+
f'confidence={finding.confidence:.2f}'
|
|
378
|
+
)
|
|
379
|
+
|
|
380
|
+
@staticmethod
|
|
381
|
+
def _coerce_float(value: Any, default: float) -> float:
|
|
382
|
+
try:
|
|
383
|
+
return float(value)
|
|
384
|
+
except (TypeError, ValueError):
|
|
385
|
+
return default
|
|
386
|
+
|
|
387
|
+
@staticmethod
|
|
388
|
+
def _coerce_str_list(value: Any) -> List[str]:
|
|
389
|
+
if value is None:
|
|
390
|
+
return []
|
|
391
|
+
if isinstance(value, list):
|
|
392
|
+
return [str(v) for v in value]
|
|
393
|
+
return [str(value)]
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
__all__ = [
|
|
397
|
+
'Attributor', 'Blame', 'AttributionResult',
|
|
398
|
+
'HeuristicAttributor', 'AllAtOnceAttributor', 'StepByStepAttributor',
|
|
399
|
+
]
|
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
"""Lightweight rule + anomaly detectors that don't need an LLM.
|
|
2
|
+
|
|
3
|
+
These were specified in `docs/06_detectors.md` but not implemented in the
|
|
4
|
+
initial v0.1 ship. The :class:`HeuristicAnalyzer` covers per-event string
|
|
5
|
+
matching; the detectors here cover *cross-event* signals (loops, repeated
|
|
6
|
+
tool calls, no-op streaks) which the heuristic analyzer cannot see in a
|
|
7
|
+
single pass.
|
|
8
|
+
|
|
9
|
+
Each detector implements :class:`Detector` and returns a list of
|
|
10
|
+
:class:`FailureFinding`. They compose with the existing analyzer pipeline
|
|
11
|
+
via :func:`run_detectors`.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import logging
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
from typing import List, Optional, Protocol
|
|
19
|
+
|
|
20
|
+
from agentdebug.models import (
|
|
21
|
+
AgentEvent,
|
|
22
|
+
AgentTrajectory,
|
|
23
|
+
EventType,
|
|
24
|
+
FailureFinding,
|
|
25
|
+
FailureMode,
|
|
26
|
+
new_id,
|
|
27
|
+
)
|
|
28
|
+
from agentdebug.taxonomy import SEED_FAILURE_MODES
|
|
29
|
+
|
|
30
|
+
LOG = logging.getLogger('agentdebug.detectors')
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass
|
|
34
|
+
class DetectorConfig:
|
|
35
|
+
"""Tunables for the rule + anomaly detectors."""
|
|
36
|
+
|
|
37
|
+
repeated_tool_call_threshold: int = 3
|
|
38
|
+
repeated_state_window: int = 4
|
|
39
|
+
repeated_state_threshold: int = 3
|
|
40
|
+
step_count_limit: int = 50
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class Detector(Protocol):
|
|
44
|
+
id: str
|
|
45
|
+
|
|
46
|
+
def detect(self, trajectory: AgentTrajectory) -> List[FailureFinding]:
|
|
47
|
+
...
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# ---------------------------------------------------------------------------
|
|
51
|
+
# RepeatedToolCallDetector
|
|
52
|
+
# ---------------------------------------------------------------------------
|
|
53
|
+
|
|
54
|
+
class RepeatedToolCallDetector:
|
|
55
|
+
"""Flag a tool that's called with identical args >= threshold times.
|
|
56
|
+
|
|
57
|
+
Matches ``FM-1.3 step_repetition`` / ``planning.inefficient_plan``.
|
|
58
|
+
"""
|
|
59
|
+
|
|
60
|
+
id = 'repeated_tool_call'
|
|
61
|
+
|
|
62
|
+
def __init__(self, *, threshold: int = 3) -> None:
|
|
63
|
+
self.threshold = threshold
|
|
64
|
+
|
|
65
|
+
def detect(self, trajectory: AgentTrajectory) -> List[FailureFinding]:
|
|
66
|
+
counts: dict[tuple[str, str], List[AgentEvent]] = {}
|
|
67
|
+
for evt in trajectory.events:
|
|
68
|
+
if evt.event_type != EventType.TOOL_CALL:
|
|
69
|
+
continue
|
|
70
|
+
key = (evt.agent_name, _normalize(evt.input))
|
|
71
|
+
counts.setdefault(key, []).append(evt)
|
|
72
|
+
findings: List[FailureFinding] = []
|
|
73
|
+
mode = SEED_FAILURE_MODES['planning.inefficient_plan']
|
|
74
|
+
for (agent, signature), events in counts.items():
|
|
75
|
+
if len(events) < self.threshold:
|
|
76
|
+
continue
|
|
77
|
+
target = events[-1]
|
|
78
|
+
findings.append(
|
|
79
|
+
FailureFinding(
|
|
80
|
+
finding_id=new_id('finding'),
|
|
81
|
+
failure_mode=mode,
|
|
82
|
+
event_id=target.event_id,
|
|
83
|
+
agent_name=target.agent_name,
|
|
84
|
+
step_index=target.step_index,
|
|
85
|
+
confidence=min(0.5 + 0.1 * len(events), 0.95),
|
|
86
|
+
evidence=[
|
|
87
|
+
f'{len(events)} identical TOOL_CALL events with '
|
|
88
|
+
f'signature={_truncate(signature, 80)} on agent={agent}',
|
|
89
|
+
],
|
|
90
|
+
suggestion=_suggestion(mode),
|
|
91
|
+
metadata={'source': self.id, 'detected_count': len(events)},
|
|
92
|
+
)
|
|
93
|
+
)
|
|
94
|
+
return findings
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
# ---------------------------------------------------------------------------
|
|
98
|
+
# RepeatedStateDetector
|
|
99
|
+
# ---------------------------------------------------------------------------
|
|
100
|
+
|
|
101
|
+
class RepeatedStateDetector:
|
|
102
|
+
"""Flag a sliding window where the agent's outputs/observations don't change.
|
|
103
|
+
|
|
104
|
+
Catches no-progress loops the per-event matcher cannot see (e.g., agent
|
|
105
|
+
keeps responding "checking..." or producing the same plan over several
|
|
106
|
+
steps). Matches ``planning.inefficient_plan``.
|
|
107
|
+
"""
|
|
108
|
+
|
|
109
|
+
id = 'repeated_state'
|
|
110
|
+
|
|
111
|
+
def __init__(self, *, window: int = 4, threshold: int = 3) -> None:
|
|
112
|
+
if threshold > window:
|
|
113
|
+
raise ValueError('threshold must be <= window')
|
|
114
|
+
self.window = window
|
|
115
|
+
self.threshold = threshold
|
|
116
|
+
|
|
117
|
+
def detect(self, trajectory: AgentTrajectory) -> List[FailureFinding]:
|
|
118
|
+
events = [
|
|
119
|
+
e for e in trajectory.events
|
|
120
|
+
if e.event_type in {
|
|
121
|
+
EventType.OBSERVATION,
|
|
122
|
+
EventType.AGENT_STEP,
|
|
123
|
+
EventType.PLAN,
|
|
124
|
+
EventType.TOOL_RESULT,
|
|
125
|
+
}
|
|
126
|
+
]
|
|
127
|
+
findings: List[FailureFinding] = []
|
|
128
|
+
if len(events) < self.window:
|
|
129
|
+
return findings
|
|
130
|
+
mode = SEED_FAILURE_MODES['planning.inefficient_plan']
|
|
131
|
+
flagged_event_ids: set[str] = set()
|
|
132
|
+
for i in range(len(events) - self.window + 1):
|
|
133
|
+
window = events[i : i + self.window]
|
|
134
|
+
sigs = [_normalize(_state_signature(e)) for e in window]
|
|
135
|
+
most_common, count = _mode_count(sigs)
|
|
136
|
+
if count < self.threshold:
|
|
137
|
+
continue
|
|
138
|
+
target = window[-1]
|
|
139
|
+
if target.event_id in flagged_event_ids:
|
|
140
|
+
continue
|
|
141
|
+
flagged_event_ids.add(target.event_id)
|
|
142
|
+
findings.append(
|
|
143
|
+
FailureFinding(
|
|
144
|
+
finding_id=new_id('finding'),
|
|
145
|
+
failure_mode=mode,
|
|
146
|
+
event_id=target.event_id,
|
|
147
|
+
agent_name=target.agent_name,
|
|
148
|
+
step_index=target.step_index,
|
|
149
|
+
confidence=min(0.5 + 0.1 * count, 0.9),
|
|
150
|
+
evidence=[
|
|
151
|
+
f'state repeated {count}x within window of '
|
|
152
|
+
f'{self.window} events; signature={_truncate(most_common, 80)}',
|
|
153
|
+
],
|
|
154
|
+
suggestion=_suggestion(mode),
|
|
155
|
+
metadata={
|
|
156
|
+
'source': self.id,
|
|
157
|
+
'window': self.window,
|
|
158
|
+
'repeated_count': count,
|
|
159
|
+
},
|
|
160
|
+
)
|
|
161
|
+
)
|
|
162
|
+
return findings
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
# ---------------------------------------------------------------------------
|
|
166
|
+
# StepCountLimitDetector
|
|
167
|
+
# ---------------------------------------------------------------------------
|
|
168
|
+
|
|
169
|
+
class StepCountLimitDetector:
|
|
170
|
+
"""Flag a trajectory that exceeded a configured step budget.
|
|
171
|
+
|
|
172
|
+
Matches ``AgentBench TLE`` and ``verification.premature_stop`` proxies.
|
|
173
|
+
"""
|
|
174
|
+
|
|
175
|
+
id = 'step_count_limit'
|
|
176
|
+
|
|
177
|
+
def __init__(self, *, max_steps: int = 50) -> None:
|
|
178
|
+
self.max_steps = max_steps
|
|
179
|
+
|
|
180
|
+
def detect(self, trajectory: AgentTrajectory) -> List[FailureFinding]:
|
|
181
|
+
if len(trajectory.events) <= self.max_steps:
|
|
182
|
+
return []
|
|
183
|
+
target = trajectory.events[-1]
|
|
184
|
+
mode = SEED_FAILURE_MODES['planning.inefficient_plan']
|
|
185
|
+
return [FailureFinding(
|
|
186
|
+
finding_id=new_id('finding'),
|
|
187
|
+
failure_mode=mode,
|
|
188
|
+
event_id=target.event_id,
|
|
189
|
+
agent_name=target.agent_name,
|
|
190
|
+
step_index=target.step_index,
|
|
191
|
+
confidence=0.7,
|
|
192
|
+
evidence=[
|
|
193
|
+
f'{len(trajectory.events)} events exceeds configured '
|
|
194
|
+
f'max_steps={self.max_steps}',
|
|
195
|
+
],
|
|
196
|
+
suggestion=_suggestion(mode),
|
|
197
|
+
metadata={'source': self.id, 'event_count': len(trajectory.events)},
|
|
198
|
+
)]
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
# ---------------------------------------------------------------------------
|
|
202
|
+
# Pipeline runner
|
|
203
|
+
# ---------------------------------------------------------------------------
|
|
204
|
+
|
|
205
|
+
def default_detectors(config: Optional[DetectorConfig] = None) -> List[Detector]:
|
|
206
|
+
cfg = config or DetectorConfig()
|
|
207
|
+
return [
|
|
208
|
+
RepeatedToolCallDetector(threshold=cfg.repeated_tool_call_threshold),
|
|
209
|
+
RepeatedStateDetector(
|
|
210
|
+
window=cfg.repeated_state_window,
|
|
211
|
+
threshold=cfg.repeated_state_threshold,
|
|
212
|
+
),
|
|
213
|
+
StepCountLimitDetector(max_steps=cfg.step_count_limit),
|
|
214
|
+
]
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def run_detectors(
|
|
218
|
+
trajectory: AgentTrajectory,
|
|
219
|
+
detectors: Optional[List[Detector]] = None,
|
|
220
|
+
) -> List[FailureFinding]:
|
|
221
|
+
"""Run a list of detectors over a trajectory and return merged findings."""
|
|
222
|
+
detectors = detectors or default_detectors()
|
|
223
|
+
out: List[FailureFinding] = []
|
|
224
|
+
for d in detectors:
|
|
225
|
+
try:
|
|
226
|
+
out.extend(d.detect(trajectory))
|
|
227
|
+
except Exception as exc: # pragma: no cover - defensive
|
|
228
|
+
LOG.warning('detector %s raised: %s', d.id, exc)
|
|
229
|
+
return out
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
# ---------------------------------------------------------------------------
|
|
233
|
+
# helpers
|
|
234
|
+
# ---------------------------------------------------------------------------
|
|
235
|
+
|
|
236
|
+
def _state_signature(event: AgentEvent) -> str:
|
|
237
|
+
"""A compact, comparison-friendly view of an event's observable state."""
|
|
238
|
+
return '|'.join([
|
|
239
|
+
getattr(event.event_type, 'value', str(event.event_type)),
|
|
240
|
+
str(event.agent_name or ''),
|
|
241
|
+
str(event.module or ''),
|
|
242
|
+
str(event.output or '')[:200],
|
|
243
|
+
str(event.error or '')[:200],
|
|
244
|
+
])
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def _normalize(value: object) -> str:
|
|
248
|
+
text = '' if value is None else str(value)
|
|
249
|
+
return ' '.join(text.split())
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def _truncate(text: str, max_chars: int) -> str:
|
|
253
|
+
if len(text) > max_chars:
|
|
254
|
+
return text[:max_chars] + '…'
|
|
255
|
+
return text
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def _mode_count(items: List[str]) -> tuple[str, int]:
|
|
259
|
+
counts: dict[str, int] = {}
|
|
260
|
+
best: str = ''
|
|
261
|
+
best_count = 0
|
|
262
|
+
for it in items:
|
|
263
|
+
counts[it] = counts.get(it, 0) + 1
|
|
264
|
+
if counts[it] > best_count:
|
|
265
|
+
best_count = counts[it]
|
|
266
|
+
best = it
|
|
267
|
+
return best, best_count
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def _suggestion(mode: FailureMode) -> Optional[str]:
|
|
271
|
+
if mode.suggestion_templates:
|
|
272
|
+
return str(mode.suggestion_templates[0])
|
|
273
|
+
return None
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
__all__ = [
|
|
277
|
+
'Detector',
|
|
278
|
+
'DetectorConfig',
|
|
279
|
+
'RepeatedStateDetector',
|
|
280
|
+
'RepeatedToolCallDetector',
|
|
281
|
+
'StepCountLimitDetector',
|
|
282
|
+
'default_detectors',
|
|
283
|
+
'run_detectors',
|
|
284
|
+
]
|
|
@@ -34,7 +34,13 @@ from agentdebug.hub.backends import (
|
|
|
34
34
|
LocalHubBackend,
|
|
35
35
|
backend_from_spec,
|
|
36
36
|
)
|
|
37
|
-
from agentdebug.hub.bundle import
|
|
37
|
+
from agentdebug.hub.bundle import (
|
|
38
|
+
Bundle,
|
|
39
|
+
BundleManifest,
|
|
40
|
+
build_manifest,
|
|
41
|
+
pack_bundle,
|
|
42
|
+
unpack_bundle,
|
|
43
|
+
)
|
|
38
44
|
from agentdebug.hub.scrub import (
|
|
39
45
|
DEFAULT_REDACTIONS,
|
|
40
46
|
ScrubReport,
|
|
@@ -53,6 +59,7 @@ __all__ = [
|
|
|
53
59
|
'ScrubReport',
|
|
54
60
|
'Scrubber',
|
|
55
61
|
'backend_from_spec',
|
|
62
|
+
'build_manifest',
|
|
56
63
|
'pack_bundle',
|
|
57
64
|
'parse_spec',
|
|
58
65
|
'scrub_trajectory',
|
|
@@ -2,12 +2,12 @@
|
|
|
2
2
|
|
|
3
3
|
Endpoints:
|
|
4
4
|
|
|
5
|
-
* ``GET /``
|
|
6
|
-
* ``GET /api/v1/traces``
|
|
7
|
-
* ``GET /api/v1/traces/{tid}``
|
|
8
|
-
* ``GET /api/v1/traces/{tid}/raw``
|
|
9
|
-
* ``GET /api/v1/taxonomy``
|
|
10
|
-
* ``GET /healthz``
|
|
5
|
+
* ``GET /`` - single-page HTML console.
|
|
6
|
+
* ``GET /api/v1/traces`` - list trace IDs in the store.
|
|
7
|
+
* ``GET /api/v1/traces/{tid}`` - fetch a trajectory + freshly analyzed report.
|
|
8
|
+
* ``GET /api/v1/traces/{tid}/raw``- raw trajectory JSON.
|
|
9
|
+
* ``GET /api/v1/taxonomy`` - list seed failure modes.
|
|
10
|
+
* ``GET /healthz`` - liveness.
|
|
11
11
|
|
|
12
12
|
The server is intentionally tiny and built on a no-build (vanilla JS) frontend
|
|
13
13
|
so it ships with the wheel and runs without `npm`.
|
|
@@ -126,7 +126,7 @@ def store_from_path(path: str) -> TraceStore:
|
|
|
126
126
|
return JsonlTraceStore(path)
|
|
127
127
|
|
|
128
128
|
|
|
129
|
-
# Single-file HTML console. Plain DOM + fetch
|
|
129
|
+
# Single-file HTML console. Plain DOM + fetch; no build step required.
|
|
130
130
|
_INDEX_HTML = """<!doctype html>
|
|
131
131
|
<html lang="en">
|
|
132
132
|
<head>
|
|
@@ -394,7 +394,7 @@ function eventProblem(ev) {
|
|
|
394
394
|
const payload = (fmt(ev.error) + ' ' + fmt(ev.output) + ' ' + fmt(ev.metadata)).toLowerCase();
|
|
395
395
|
return Boolean(ev.error || payload.includes('missing context') || payload.includes('premature') || payload.includes('loop') || payload.includes('handoff'));
|
|
396
396
|
}
|
|
397
|
-
async function loadTraceList() {
|
|
397
|
+
async function loadTraceList(selectFirst) {
|
|
398
398
|
const data = await api('/api/v1/traces');
|
|
399
399
|
const ul = document.getElementById('trace-list');
|
|
400
400
|
ul.innerHTML = '';
|
|
@@ -407,7 +407,7 @@ async function loadTraceList() {
|
|
|
407
407
|
li.dataset.tid = tid;
|
|
408
408
|
li.onclick = () => { selectTrace(tid, li); };
|
|
409
409
|
ul.appendChild(li);
|
|
410
|
-
if (idx === 0) selectTrace(tid, li);
|
|
410
|
+
if (idx === 0 && selectFirst) selectTrace(tid, li);
|
|
411
411
|
});
|
|
412
412
|
if (data.traces.length === 0) {
|
|
413
413
|
document.getElementById('detail').innerHTML = '<div class="empty">No traces in store.</div>';
|
|
@@ -442,6 +442,7 @@ function renderTrace(traj, report) {
|
|
|
442
442
|
const events = traj.events || [];
|
|
443
443
|
const findings = report.findings || [];
|
|
444
444
|
const rootId = report.root_cause_event_id;
|
|
445
|
+
const alignmentEvents = events.filter(ev => ev.step_index !== null && ev.step_index !== undefined);
|
|
445
446
|
const families = [...new Set(findings.map(f => f.failure_mode?.family).filter(Boolean))];
|
|
446
447
|
const errorEvents = events.filter(eventProblem).length;
|
|
447
448
|
const rootEvent = events.find(ev => ev.event_id === rootId) || {};
|
|
@@ -467,7 +468,7 @@ function renderTrace(traj, report) {
|
|
|
467
468
|
html += '<div class="trace-legend"><div class="legend-cell"><div class="legend-label">Agent native trace</div><div class="legend-title">What the agent logged, thought, called, or observed.</div></div>';
|
|
468
469
|
html += '<div class="legend-cell"><div class="legend-label">AgentDebugX error trace</div><div class="legend-title">Normalized failure signal, attribution, and repair hint for human review.</div></div></div>';
|
|
469
470
|
html += '<div class="timeline">';
|
|
470
|
-
for (const ev of events) html += renderEvent(ev, ev.event_id === rootId, findingForEvent(findings, ev.event_id));
|
|
471
|
+
for (const ev of alignmentEvents) html += renderEvent(ev, ev.event_id === rootId, findingForEvent(findings, ev.event_id));
|
|
471
472
|
html += '</div></div></div>';
|
|
472
473
|
|
|
473
474
|
html += '<div class="rail">';
|
|
@@ -589,7 +590,7 @@ if (BOOTSTRAP && BOOTSTRAP.traces) {
|
|
|
589
590
|
document.getElementById('detail').innerHTML = '<div class="empty">No traces in store.</div>';
|
|
590
591
|
}
|
|
591
592
|
}
|
|
592
|
-
loadTraceList();
|
|
593
|
+
loadTraceList(!(BOOTSTRAP && BOOTSTRAP.selected));
|
|
593
594
|
</script>
|
|
594
595
|
</body>
|
|
595
596
|
</html>
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|