agentdebugx 0.2.1__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/PKG-INFO +1 -1
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/docs/14_api_reference.md +26 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/docs/15_roadmap.md +11 -0
- agentdebugx-0.2.3/docs/22_industry_track_paper_eval_plan.md +235 -0
- agentdebugx-0.2.3/docs/23_status_v0_2.md +115 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/docs/README.md +3 -1
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/pyproject.toml +1 -1
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/src/agentdebug/__init__.py +32 -2
- agentdebugx-0.2.3/src/agentdebug/attribution.py +570 -0
- agentdebugx-0.2.3/src/agentdebug/detectors.py +284 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/src/agentdebug/hub/__init__.py +8 -1
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/src/agentdebug/models.py +12 -0
- agentdebugx-0.2.3/src/agentdebug/recovery.py +314 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/src/agentdebug/ui/server.py +12 -11
- agentdebugx-0.2.1/src/agentdebug/attribution.py +0 -230
- agentdebugx-0.2.1/src/agentdebug/recovery.py +0 -113
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/LICENSE +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/README.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/docs/00_overview.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/docs/01_literature_survey.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/docs/02_architecture.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/docs/03_taxonomy.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/docs/04_trace_schema.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/docs/05_adapters.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/docs/06_detectors.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/docs/07_attribution.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/docs/08_recovery.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/docs/09_error_database.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/docs/10_taxonomy_induction.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/docs/11_multimodal.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/docs/12_ui_dashboard.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/docs/13_class_design.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/docs/16_governance.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/docs/17_claude_code_design_patterns.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/docs/18_comparison_codex_vs_design.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/docs/19_error_hub.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/docs/20_deep_debug.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/docs/21_integrations.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/docs/ERROR_TAXONOMY.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/docs/OPEN_SOURCE_DEVELOPMENT_PLAN.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/docs/RESEARCH_SURVEY.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/docs/benchmarks/v0_1_smoke.json +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/docs/benchmarks/v0_1_smoke.md +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/src/agentdebug/adapters/__init__.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/src/agentdebug/adapters/base.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/src/agentdebug/adapters/langgraph.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/src/agentdebug/adapters/otel.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/src/agentdebug/adapters/raw.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/src/agentdebug/analyzers.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/src/agentdebug/cli.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/src/agentdebug/deep.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/src/agentdebug/events.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/src/agentdebug/hub/backend_base.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/src/agentdebug/hub/backends.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/src/agentdebug/hub/bundle.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/src/agentdebug/hub/scrub.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/src/agentdebug/instrumentation.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/src/agentdebug/integrations/__init__.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/src/agentdebug/integrations/claude_skill.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/src/agentdebug/integrations/openhands.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/src/agentdebug/judges.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/src/agentdebug/llm.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/src/agentdebug/recorder.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/src/agentdebug/storage.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/src/agentdebug/taxonomy.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/src/agentdebug/traceback.py +0 -0
- {agentdebugx-0.2.1 → agentdebugx-0.2.3}/src/agentdebug/ui/__init__.py +0 -0
|
@@ -215,6 +215,32 @@ agentdebugx taxonomy export --format yaml|json|md
|
|
|
215
215
|
agentdebugx doctor
|
|
216
216
|
```
|
|
217
217
|
|
|
218
|
+
## 11.1 Current shipped `agentdebug` CLI surface
|
|
219
|
+
|
|
220
|
+
The public design above is the long-term `agentdebugx` contract. The current
|
|
221
|
+
package already ships a smaller but working `agentdebug` CLI:
|
|
222
|
+
|
|
223
|
+
```bash
|
|
224
|
+
agentdebug analyze <trajectory.json> [--suggest] [--traceback]
|
|
225
|
+
agentdebug list --store-sqlite .agentdebug/errors.sqlite
|
|
226
|
+
agentdebug show <trace_id> --store-sqlite .agentdebug/errors.sqlite
|
|
227
|
+
agentdebug judge <trajectory.json|trace_id> --attribute [--traceback]
|
|
228
|
+
agentdebug deep <trajectory.json|trace_id> [--traceback]
|
|
229
|
+
agentdebug hub push <trace_id> --to local:/tmp/hub --store-sqlite ...
|
|
230
|
+
agentdebug hub pull <spec> --bundle <bundle_id> --into .agentdebug/hub_pulls
|
|
231
|
+
agentdebug hub list <spec>
|
|
232
|
+
agentdebug integrations skill --target ~/.claude/skills --name agentdebug
|
|
233
|
+
agentdebug integrations openhands-microagent --target .openhands/microagents
|
|
234
|
+
agentdebug serve --store-sqlite .agentdebug/errors.sqlite --port 7777
|
|
235
|
+
agentdebug doctor
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
`--traceback` renders `AgentTraceback`, a Python-traceback-style cascade view
|
|
239
|
+
implemented by `agentdebug.traceback.format_traceback(report, trajectory)`.
|
|
240
|
+
DeepDebug can provide explicit cascade edges through
|
|
241
|
+
`finding.metadata['cascading_from_event_id']`; heuristic and single-pass judge
|
|
242
|
+
reports fall back to step-index ordering.
|
|
243
|
+
|
|
218
244
|
## 12. Configuration file
|
|
219
245
|
|
|
220
246
|
`~/.agentdebugx/settings.yaml`:
|
|
@@ -28,6 +28,17 @@ Acceptance:
|
|
|
28
28
|
|
|
29
29
|
## v0.2 — Coverage + UI (4 weeks)
|
|
30
30
|
|
|
31
|
+
Already shipped in the v0.2/v0.2.1 line:
|
|
32
|
+
|
|
33
|
+
- Error Hub bundle format + Local/Git/Hugging Face backends.
|
|
34
|
+
- DeepDebug iterative analysis loop.
|
|
35
|
+
- Claude Code Skill generator and OpenHands microagent/EventStream bridge.
|
|
36
|
+
- `AgentTraceback` cascade renderer and CLI `--traceback` support.
|
|
37
|
+
- FastAPI local console with native-trace + error-trace alignment for human
|
|
38
|
+
review.
|
|
39
|
+
|
|
40
|
+
Remaining scope from the original v0.2 plan:
|
|
41
|
+
|
|
31
42
|
- Adapters: CrewAI, OpenHands, smolagents, LlamaIndex, DSPy, Pydantic-AI.
|
|
32
43
|
- Detectors: anomaly family (perplexity, repeated-state, topic-drift).
|
|
33
44
|
- Attribution: `BinarySearchAttributor`, `CounterfactualAttributor`, `EnsembleAttributor`.
|
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
# 22 - EMNLP Industry Track Paper + Evaluation Plan
|
|
2
|
+
|
|
3
|
+
This note translates the EMNLP 2026 Industry Track call and recent ACL/EMNLP
|
|
4
|
+
Industry Track patterns into a concrete writing and evaluation plan for the
|
|
5
|
+
AgentDebugX paper.
|
|
6
|
+
|
|
7
|
+
## 1. What Industry Track Reviewers Reward
|
|
8
|
+
|
|
9
|
+
Industry-track papers are not judged like pure method papers. The recurring
|
|
10
|
+
shape in strong papers is:
|
|
11
|
+
|
|
12
|
+
1. Real deployment pain, not a benchmark-only motivation.
|
|
13
|
+
2. A system that can actually be used by practitioners.
|
|
14
|
+
3. Evaluation under practical constraints: cost, latency, scale, privacy,
|
|
15
|
+
maintainability, human workflow, and failure modes.
|
|
16
|
+
4. Lessons learned that future builders can reuse.
|
|
17
|
+
5. Clear limitations and responsible deployment boundaries.
|
|
18
|
+
|
|
19
|
+
Examples worth emulating:
|
|
20
|
+
|
|
21
|
+
- **Experience Report: Implementing Machine Translation in a Regulated
|
|
22
|
+
Industry** (EMNLP 2025 Industry): emphasizes legal/security constraints,
|
|
23
|
+
human-in-the-loop validation, and reviewer preferences over more than 11k
|
|
24
|
+
ranked translations. Source: https://aclanthology.org/2025.emnlp-industry/
|
|
25
|
+
- **STREAQ** (EMNLP 2025 Industry): frames the contribution as an industrial
|
|
26
|
+
cost-quality routing system and reports both model quality and operational
|
|
27
|
+
cost reduction. Source: https://aclanthology.org/2025.emnlp-industry.121/
|
|
28
|
+
- **RAVEN** (ACL 2025 Industry): combines industrial data, public benchmarks,
|
|
29
|
+
deployment pipeline, and online A/B validation. Source:
|
|
30
|
+
https://aclanthology.org/2025.acl-industry.3/
|
|
31
|
+
- **ARIA** (EMNLP 2025 Industry): uses a realistic deployed domain plus public
|
|
32
|
+
dynamic-knowledge tasks, and states deployment scope.
|
|
33
|
+
- **AutoPenBench** (EMNLP 2025 Industry): releases an open benchmark, reports
|
|
34
|
+
autonomous versus human-assisted agent success, and uses intermediate
|
|
35
|
+
milestones to show where agents struggle. Source:
|
|
36
|
+
https://aclanthology.org/2025.emnlp-industry.114/
|
|
37
|
+
|
|
38
|
+
For AgentDebugX, the paper should therefore be framed as:
|
|
39
|
+
|
|
40
|
+
> A deployment-oriented debugging layer for agentic NLP systems, evaluated on
|
|
41
|
+
> whether it makes failures observable, attributable, shareable, and easier for
|
|
42
|
+
> humans to fix.
|
|
43
|
+
|
|
44
|
+
The central claim should not be "we beat every attributor." A stronger claim
|
|
45
|
+
for the Industry Track is:
|
|
46
|
+
|
|
47
|
+
> AgentDebugX provides the missing operating layer between raw agent traces and
|
|
48
|
+
> actionable debugging workflows: aligned native/error traces, taxonomy-backed
|
|
49
|
+
> reports, Error Hub bundles, and cost-aware analysis profiles.
|
|
50
|
+
|
|
51
|
+
## 2. Appendix Rule
|
|
52
|
+
|
|
53
|
+
The EMNLP 2026 Industry Track permits appendices after the bibliography. The
|
|
54
|
+
appendix does not count against the 6-page review limit, but the main paper
|
|
55
|
+
must be self-contained and reviewers are not required to review appendices.
|
|
56
|
+
Source: https://2026.emnlp.org/calls/industry_track/
|
|
57
|
+
So:
|
|
58
|
+
|
|
59
|
+
- Main body: problem, system, screenshot, concise evaluation table, key
|
|
60
|
+
findings, limitations.
|
|
61
|
+
- Appendix: benchmark matrix, annotation schema, prompts, model settings,
|
|
62
|
+
study protocol, redaction examples, additional error traces.
|
|
63
|
+
|
|
64
|
+
Do not hide the core evaluation logic in the appendix. Put enough in the main
|
|
65
|
+
body that a reviewer can assess technical quality without reading extra pages.
|
|
66
|
+
|
|
67
|
+
## 3. Main-Body Narrative
|
|
68
|
+
|
|
69
|
+
Recommended six-page spine:
|
|
70
|
+
|
|
71
|
+
1. **Introduction**: agent systems fail through cascades; raw traces are not
|
|
72
|
+
enough; teams need who/when/why/fix.
|
|
73
|
+
2. **Deployment Requirements**: low-friction instrumentation, privacy,
|
|
74
|
+
portable schema, cost-aware analysis, human review.
|
|
75
|
+
3. **System**: recorder + schema + taxonomy + detectors/attributors +
|
|
76
|
+
Error Hub + UI.
|
|
77
|
+
4. **Use Case Figure**: paired native trace and AgentDebugX error trace.
|
|
78
|
+
5. **Evaluation**: benchmark coverage, diagnostic accuracy, human utility,
|
|
79
|
+
operational overhead.
|
|
80
|
+
6. **Lessons/Limitations**: where it works, where it does not, safety.
|
|
81
|
+
|
|
82
|
+
## 4. Evaluation Questions
|
|
83
|
+
|
|
84
|
+
### Q1. Coverage
|
|
85
|
+
|
|
86
|
+
Can AgentDebugX ingest different agent classes without bespoke debugging code?
|
|
87
|
+
|
|
88
|
+
Benchmarks:
|
|
89
|
+
|
|
90
|
+
- AgentErrorBench: failure-labeled traces over ALFWorld, GAIA, WebShop.
|
|
91
|
+
- MAST and Who&When: multi-agent attribution and failure taxonomy labels.
|
|
92
|
+
- AgentRx: 115 failed trajectories with critical-step labels.
|
|
93
|
+
- WebShop/WebArena: web navigation and tool-use.
|
|
94
|
+
- tau-bench: retail/airline tool-agent-user interaction.
|
|
95
|
+
- SWE-bench Lite/Verified: coding agents with executable tests.
|
|
96
|
+
- OSWorld: multimodal desktop/GUI agents.
|
|
97
|
+
|
|
98
|
+
Metrics: conversion success rate, required adapter LOC, event coverage, artifact
|
|
99
|
+
coverage, and schema loss notes.
|
|
100
|
+
|
|
101
|
+
### Q2. Diagnostic Accuracy
|
|
102
|
+
|
|
103
|
+
Can AgentDebugX classify and localize failures?
|
|
104
|
+
|
|
105
|
+
Labels:
|
|
106
|
+
|
|
107
|
+
- failure family
|
|
108
|
+
- failure mode
|
|
109
|
+
- root event ID
|
|
110
|
+
- root agent
|
|
111
|
+
- root step
|
|
112
|
+
- cascade edges
|
|
113
|
+
- evidence spans
|
|
114
|
+
- accepted repair
|
|
115
|
+
|
|
116
|
+
Metrics:
|
|
117
|
+
|
|
118
|
+
- family macro-F1
|
|
119
|
+
- mode macro-F1
|
|
120
|
+
- responsible-agent accuracy
|
|
121
|
+
- root-step exact match
|
|
122
|
+
- root-step +/- 1 match
|
|
123
|
+
- cascade-edge F1
|
|
124
|
+
- false-positive rate on successful traces
|
|
125
|
+
- calibration: confidence vs correctness
|
|
126
|
+
|
|
127
|
+
Baselines:
|
|
128
|
+
|
|
129
|
+
- rule analyzer
|
|
130
|
+
- single-pass LLM judge
|
|
131
|
+
- All-at-Once attribution
|
|
132
|
+
- Step-by-Step attribution when implemented
|
|
133
|
+
- DeepDebug verify/refine loop
|
|
134
|
+
- benchmark-native labels or published baselines where available
|
|
135
|
+
|
|
136
|
+
### Q3. Human Utility
|
|
137
|
+
|
|
138
|
+
Does the paired trace view reduce debugging effort?
|
|
139
|
+
|
|
140
|
+
Study design:
|
|
141
|
+
|
|
142
|
+
- 12-24 developers.
|
|
143
|
+
- Within-subject comparison: raw framework trace/logs vs AgentDebugX report.
|
|
144
|
+
- 24-48 total debugging sessions.
|
|
145
|
+
- Counterbalance task order and UI order.
|
|
146
|
+
|
|
147
|
+
Metrics:
|
|
148
|
+
|
|
149
|
+
- time to first plausible root cause
|
|
150
|
+
- time to accepted repair
|
|
151
|
+
- correctness against adjudicated labels
|
|
152
|
+
- number of trace events inspected
|
|
153
|
+
- confidence and workload rating
|
|
154
|
+
- free-text feedback on missing evidence
|
|
155
|
+
|
|
156
|
+
Fallback if recruiting slips:
|
|
157
|
+
|
|
158
|
+
- 3-5 expert agent builders review 30 traces.
|
|
159
|
+
- Ask them to choose between raw trace and AgentDebugX report, rate usefulness,
|
|
160
|
+
and mark incorrect/misleading diagnoses.
|
|
161
|
+
|
|
162
|
+
### Q4. Operational Viability
|
|
163
|
+
|
|
164
|
+
Can teams run this in real workflows?
|
|
165
|
+
|
|
166
|
+
Metrics:
|
|
167
|
+
|
|
168
|
+
- analyzer latency by profile: rule, judge, DeepDebug
|
|
169
|
+
- token cost by trace length
|
|
170
|
+
- local storage overhead
|
|
171
|
+
- UI load time for 100, 1k, 10k events
|
|
172
|
+
- scrubber redaction hit rate
|
|
173
|
+
- scrubber false positives on benign strings
|
|
174
|
+
- Error Hub bundle size and push/pull time
|
|
175
|
+
|
|
176
|
+
## 5. Data Scale
|
|
177
|
+
|
|
178
|
+
Minimum credible submission target:
|
|
179
|
+
|
|
180
|
+
- 500 failed trajectories.
|
|
181
|
+
- 100 successful trajectories for false-positive calibration.
|
|
182
|
+
- At least 30 examples per high-level family where source benchmarks permit.
|
|
183
|
+
- Two annotators per newly labeled trace plus adjudication.
|
|
184
|
+
- DeepDebug on a stratified hard subset of 100 traces.
|
|
185
|
+
|
|
186
|
+
Stronger target:
|
|
187
|
+
|
|
188
|
+
- 1,000 failed trajectories.
|
|
189
|
+
- 200 successful trajectories.
|
|
190
|
+
- DeepDebug on every trace where rule and single-pass judge disagree.
|
|
191
|
+
- 50-100 private pilot traces, scrubbed and reported only in aggregate.
|
|
192
|
+
|
|
193
|
+
## 6. API and Infra Needed
|
|
194
|
+
|
|
195
|
+
Model APIs:
|
|
196
|
+
|
|
197
|
+
- OpenAI-compatible endpoint as the default abstraction.
|
|
198
|
+
- OpenAI, Gemini, Anthropic-through-proxy/LiteLLM, and local vLLM/Ollama where
|
|
199
|
+
feasible.
|
|
200
|
+
|
|
201
|
+
Benchmark APIs:
|
|
202
|
+
|
|
203
|
+
- WebShop/ALFWorld/GAIA loaders for AgentErrorBench.
|
|
204
|
+
- MAST/Who&When/AgentRx importers preserving existing labels.
|
|
205
|
+
- tau-bench user/tool simulator wrapper.
|
|
206
|
+
- WebArena browser harness with DOM/text/action capture.
|
|
207
|
+
- SWE-bench Docker harness with shell, patch, and test-output capture.
|
|
208
|
+
- OSWorld capture path for screenshot, accessibility tree, click/action, and
|
|
209
|
+
verifier result.
|
|
210
|
+
|
|
211
|
+
Storage/export:
|
|
212
|
+
|
|
213
|
+
- SQLite for local experiments.
|
|
214
|
+
- Error Hub bundles for sharing.
|
|
215
|
+
- Parquet manifest roll-up for large result analysis.
|
|
216
|
+
|
|
217
|
+
## 7. What Should Go in the Appendix
|
|
218
|
+
|
|
219
|
+
- Full benchmark matrix.
|
|
220
|
+
- Exact label schema and examples.
|
|
221
|
+
- Prompts for LLM judge, attributor, and DeepDebug.
|
|
222
|
+
- Model settings and token budgets.
|
|
223
|
+
- Human study instructions and consent/safety notes.
|
|
224
|
+
- Redaction examples.
|
|
225
|
+
- Two or three full trace/report examples.
|
|
226
|
+
- Failure cases where AgentDebugX is wrong.
|
|
227
|
+
|
|
228
|
+
## 8. Immediate TODO Before Submission
|
|
229
|
+
|
|
230
|
+
1. Convert at least two public benchmark sources into AgentTrajectory.
|
|
231
|
+
2. Produce a first 100-trace labeled set.
|
|
232
|
+
3. Add an evaluation runner that outputs a single CSV/JSONL.
|
|
233
|
+
4. Run rule, judge, All-at-Once, and DeepDebug on the same split.
|
|
234
|
+
5. Add a small human/expert review with raw trace vs paired trace view.
|
|
235
|
+
6. Trim main body to 6 pages while keeping appendix rich.
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# 23 — Capability + Test Coverage Status (v0.2.3)
|
|
2
|
+
|
|
3
|
+
A live audit of what's implemented, what's tested, and what's specced but
|
|
4
|
+
not yet built. Pair this with [docs/15_roadmap.md](./15_roadmap.md), which is
|
|
5
|
+
the forward-looking plan; this doc is the rear-view mirror.
|
|
6
|
+
|
|
7
|
+
## 1. What ships in v0.2.3 (live on PyPI)
|
|
8
|
+
|
|
9
|
+
| Layer | Module | Status | Tests |
|
|
10
|
+
|---|---|---|---|
|
|
11
|
+
| Trace IR | `agentdebug.models` | ✅ stable | round-trip + enum tests |
|
|
12
|
+
| Storage | `agentdebug.storage` (JSONL + SQLite) | ✅ stable | round-trip + ctx-mgr |
|
|
13
|
+
| Recorder | `agentdebug.recorder` (`AgentDebug`, `TraceSession`) | ✅ stable | record + analyze flow |
|
|
14
|
+
| Rule analyzer | `agentdebug.analyzers.HeuristicAnalyzer` | ✅ stable | match + suggest |
|
|
15
|
+
| Taxonomy | `agentdebug.taxonomy` (19 seed modes) | ✅ stable | get_mode + list |
|
|
16
|
+
| Function instrumentation | `agentdebug.instrumentation.traced_tool` | ✅ stable | happy + raise |
|
|
17
|
+
| Event bus | `agentdebug.events.EventBus` | ✅ stable | fan-out + auto-detach |
|
|
18
|
+
| LLM client | `agentdebug.llm.OpenAICompatClient` | ✅ stable | mocked httpx + env |
|
|
19
|
+
| LLM judge | `agentdebug.judges.LLMJudgeAnalyzer` | ✅ stable | scripted-LLM happy + silent |
|
|
20
|
+
| Attribution | `agentdebug.attribution.HeuristicAttributor` | ✅ stable | first-finding + tiebreak |
|
|
21
|
+
| Attribution | `agentdebug.attribution.AllAtOnceAttributor` | ✅ stable | mocked LLM + fallback |
|
|
22
|
+
| Attribution | `agentdebug.attribution.StepByStepAttributor` | ✅ **new 0.2.2** | scripted-LLM + fallback |
|
|
23
|
+
| Attribution | `agentdebug.attribution.BinarySearchAttributor` | ✅ **new 0.2.3** | oracle-LLM logarithmic convergence + fallback + render elision |
|
|
24
|
+
| Recovery | `agentdebug.recovery.ReflexionSuggestion` | ✅ stable | per-finding + empty |
|
|
25
|
+
| Recovery | `agentdebug.recovery.CriticRecoverer` + `VerifierSpec` registry | ✅ **new 0.2.3** | 5 family-matched verifier templates; dedup + custom-override |
|
|
26
|
+
| DeepDebug | `agentdebug.deep.DeepDebugAnalyzer` | ✅ stable | full loop + silent LLM |
|
|
27
|
+
| Cascade view | `agentdebug.traceback.format_traceback` | ✅ stable | cascade + step-order + ANSI + empty |
|
|
28
|
+
| Detectors | `agentdebug.detectors.RepeatedToolCall / RepeatedState / StepCountLimit` | ✅ **new 0.2.2** | threshold + window + budget |
|
|
29
|
+
| Hub bundle | `agentdebug.hub.Bundle / pack_bundle / unpack_bundle` | ✅ stable | round-trip |
|
|
30
|
+
| Hub scrubber | `agentdebug.hub.Scrubber` | ✅ stable | 12 redactions + idempotent |
|
|
31
|
+
| Hub backends | `LocalHubBackend`, `GitHubBackend`, `HuggingFaceBackend` | ✅ stable | local-bare-git + local |
|
|
32
|
+
| Adapters | `agentdebug.adapters.raw` (`trace_loop`, `mark_step`) | ✅ stable | end-to-end + ctxvar |
|
|
33
|
+
| Adapters | `agentdebug.adapters.langgraph.LangChainCallbackAdapter` | ✅ stable | gracefully degrades w/o dep |
|
|
34
|
+
| Adapters | `agentdebug.adapters.otel.OTelExportAdapter` | ✅ stable | branch test |
|
|
35
|
+
| Integrations | `agentdebug.integrations.claude_skill` | ✅ stable | skill-bundle write |
|
|
36
|
+
| Integrations | `agentdebug.integrations.openhands` (microagent + bridge) | ⚠️ microagent stable; bridge needs live OpenHands | microagent YAML test |
|
|
37
|
+
| CLI | `agentdebug.cli` (`analyze`, `judge`, `deep`, `list`, `show`, `hub`, `integrations`, `serve`, `doctor`) | ✅ stable | 12 subcommand smoke tests |
|
|
38
|
+
| Local UI | `agentdebug.ui` (FastAPI + vanilla JS console) | ✅ stable | endpoint round-trip |
|
|
39
|
+
|
|
40
|
+
**Test counts:** 60+ unit tests + 1 live-LLM smoke test, `mypy --strict` clean
|
|
41
|
+
across 32 source files.
|
|
42
|
+
|
|
43
|
+
## 2. Designed in docs, not yet implemented
|
|
44
|
+
|
|
45
|
+
| Doc | Component | Why deferred | Realistic ship |
|
|
46
|
+
|---|---|---|---|
|
|
47
|
+
| [06_detectors.md](./06_detectors.md) | `trajectory_perplexity` (TrajAD) | needs token-level LM perplexity API or embedding model + baseline calibration | v0.3 |
|
|
48
|
+
| [06_detectors.md](./06_detectors.md) | `topic_drift` (embedding cosine) | needs embedding client; consider reusing `OpenAICompatClient` `/embeddings` | v0.3 |
|
|
49
|
+
| [06_detectors.md](./06_detectors.md) | LTL spec monitors | requires user-supplied spec or LLM-synthesized monitors; gated on RV research | v1.2 |
|
|
50
|
+
| [07_attribution.md](./07_attribution.md) | `CounterfactualAttributor` | requires re-rolling agent actions; framework-replay dependent | v0.3 |
|
|
51
|
+
| [07_attribution.md](./07_attribution.md) | `SBFLAttributor` (Tarantula/Ochiai) | needs corpus of passing + failing traces of same task; gated on Hub adoption | v0.4 |
|
|
52
|
+
| [07_attribution.md](./07_attribution.md) | `DeltaDebugAttributor` (Zeller) | same replay constraint | v0.3 |
|
|
53
|
+
| [07_attribution.md](./07_attribution.md) | `EnsembleAttributor` | trivial once Counterfactual lands; awaits Counterfactual | v0.3 |
|
|
54
|
+
| [08_recovery.md](./08_recovery.md) | `SelfRefineLoop` | small but needs a generator-critic-refiner orchestration | v0.3 |
|
|
55
|
+
| [08_recovery.md](./08_recovery.md) | `AutoManualRules` | needs persistent project manual + injection into next-run prompts | v0.3 |
|
|
56
|
+
| [08_recovery.md](./08_recovery.md) | `LangGraphRewind` | depends on LangGraph checkpointer; ships when we have a real LangGraph user | v0.3 |
|
|
57
|
+
| [08_recovery.md](./08_recovery.md) | `SagaRollback` | needs compensation registry on tool definitions; new schema | v0.3 |
|
|
58
|
+
| [08_recovery.md](./08_recovery.md) | `MCTSBranchExploration` (LATS) | heavy; v2 feature | v2.0 |
|
|
59
|
+
| [09_error_database.md](./09_error_database.md) | DuckDB analytical + Parquet archive | optional; Hub bundles already give per-project corpus | v0.3 |
|
|
60
|
+
| [09_error_database.md](./09_error_database.md) | Vector similarity search | needs embedding model + index choice | v0.3 |
|
|
61
|
+
| [10_taxonomy_induction.md](./10_taxonomy_induction.md) | TnT-LLM + BERTopic pipeline | needs ≥ 1k labeled traces to be useful | v0.4 |
|
|
62
|
+
| [11_multimodal.md](./11_multimodal.md) | Screenshot/DOM capture, VLM judge | gated on multimodal user (Claude Computer Use / OpenAI CUA / OpenHands browser) | v1.1 |
|
|
63
|
+
| [12_ui_dashboard.md](./12_ui_dashboard.md) | TUI (Textual) | low priority; CLI + web UI cover the use cases | v0.4 |
|
|
64
|
+
| [12_ui_dashboard.md](./12_ui_dashboard.md) | VSCode extension | needs TS extension scaffolding | v1.0 |
|
|
65
|
+
| [05_adapters.md](./05_adapters.md) | CrewAI, OpenAI Agents SDK, AutoGen, smolagents, LlamaIndex, DSPy, Pydantic-AI | each is ~150 LOC + conformance test; ship as users land | rolling |
|
|
66
|
+
|
|
67
|
+
## 3. Implementation gaps surfaced by the audit
|
|
68
|
+
|
|
69
|
+
The audit found one real bug and a handful of test gaps:
|
|
70
|
+
|
|
71
|
+
1. **`agentdebug.hub.build_manifest` was used by the CLI but not re-exported** —
|
|
72
|
+
would have surfaced as `ImportError` for any user calling
|
|
73
|
+
`agentdebug hub push`. Fixed in 0.2.2 (`hub/__init__.py`) and locked in by a
|
|
74
|
+
CLI smoke test.
|
|
75
|
+
2. **`cli.py` had 0% coverage** — every subcommand now has a smoke test that
|
|
76
|
+
exercises the argparse path; the LLM-required commands assert the
|
|
77
|
+
"missing credentials" exit code without hitting the network.
|
|
78
|
+
3. **`instrumentation.py` (`traced_tool`) had 0% coverage** — happy and
|
|
79
|
+
exception paths now tested.
|
|
80
|
+
4. **`llm.OpenAICompatClient.complete` had no test** — covered by a custom
|
|
81
|
+
`httpx.BaseTransport` that returns canned JSON without a network call.
|
|
82
|
+
5. **`recovery.ReflexionSuggestion`** had only an indirect test from DeepDebug
|
|
83
|
+
examples; now has direct happy + empty tests.
|
|
84
|
+
|
|
85
|
+
## 4. Coverage matrix (post-0.2.2)
|
|
86
|
+
|
|
87
|
+
Run `PYTHONPATH=src pytest --cov=agentdebug --cov-report=term`. The two largest
|
|
88
|
+
remaining gaps are deliberate:
|
|
89
|
+
|
|
90
|
+
- `agentdebug.adapters.langgraph` — exercised only when `langchain_core` is
|
|
91
|
+
installed. The status-test verifies graceful degradation when it isn't.
|
|
92
|
+
- `agentdebug.hub.backends.HuggingFaceBackend` — gated on `huggingface_hub`.
|
|
93
|
+
Round-tripping through real HF requires `HF_TOKEN`; covered by the local
|
|
94
|
+
bare-git test for the analogous push/pull flow.
|
|
95
|
+
|
|
96
|
+
## 5. Acceptance gates for v0.3 (next minor)
|
|
97
|
+
|
|
98
|
+
Before v0.3 ships, this doc should record green checkmarks for:
|
|
99
|
+
|
|
100
|
+
- [x] **Logarithmic-cost attributor** (`BinarySearchAttributor`) shipped in
|
|
101
|
+
0.2.3 — Who&When method 3, O(log N) LLM calls, bisects the trajectory
|
|
102
|
+
via prefix evaluation. **Note:** this is not yet a "replayable
|
|
103
|
+
counterfactual" attributor; it predicts whether the failure has
|
|
104
|
+
already occurred from the prefix without re-rolling the agent. True
|
|
105
|
+
counterfactual replay is still planned for v0.3.
|
|
106
|
+
- [x] **Tool-grounded recovery strategy** (`CriticRecoverer` + `VerifierSpec`
|
|
107
|
+
registry) shipped in 0.2.3 — pattern-matches failure modes against 5
|
|
108
|
+
default verifier templates (JSON-schema guard, final-state check,
|
|
109
|
+
tool-result type-check, handoff contract, loop-detector guard) and
|
|
110
|
+
emits per-finding `FixProposal` with rationale + suggested code.
|
|
111
|
+
- [ ] One additional framework adapter that goes through the full conformance
|
|
112
|
+
suite (CrewAI is the most-requested).
|
|
113
|
+
- [ ] HuggingFace Hub round-trip live test (gated on `HF_TOKEN`).
|
|
114
|
+
- [ ] Bench harness extended with one published-benchmark loader (Who&When
|
|
115
|
+
is the obvious first target — we already cite its method).
|
|
@@ -31,6 +31,8 @@ This `docs/` directory contains the full design specification.
|
|
|
31
31
|
| 19 | [19_error_hub.md](./19_error_hub.md) | **Error Hub** — bundle format, Local / Git / HF backends, scrubbing |
|
|
32
32
|
| 20 | [20_deep_debug.md](./20_deep_debug.md) | **DeepDebug** — iterative multi-turn analysis (plan → hypothesize → verify → refine) |
|
|
33
33
|
| 21 | [21_integrations.md](./21_integrations.md) | **Claude Code Skill** + **OpenHands** microagent + EventStream bridge |
|
|
34
|
+
| 22 | [22_industry_track_paper_eval_plan.md](./22_industry_track_paper_eval_plan.md) | EMNLP Industry Track writing strategy + benchmark / human-study evaluation plan |
|
|
35
|
+
| 23 | [23_status_v0_2.md](./23_status_v0_2.md) | **Capability + test coverage status (v0.2.3)** — what's implemented, what's tested, what's specced but not built |
|
|
34
36
|
|
|
35
37
|
Plus three **narrative** docs that pre-dated this engineering spec and are kept for paper-style framing:
|
|
36
38
|
|
|
@@ -45,7 +47,7 @@ Plus three **narrative** docs that pre-dated this engineering spec and are kept
|
|
|
45
47
|
## How to read this
|
|
46
48
|
|
|
47
49
|
- **First-time reader:** start with [00_overview.md](./00_overview.md), then [02_architecture.md](./02_architecture.md), then [14_api_reference.md](./14_api_reference.md).
|
|
48
|
-
- **Researcher / paper author:** read [01_literature_survey.md](./01_literature_survey.md), [03_taxonomy.md](./03_taxonomy.md), [07_attribution.md](./07_attribution.md), [10_taxonomy_induction.md](./10_taxonomy_induction.md).
|
|
50
|
+
- **Researcher / paper author:** read [01_literature_survey.md](./01_literature_survey.md), [03_taxonomy.md](./03_taxonomy.md), [07_attribution.md](./07_attribution.md), [10_taxonomy_induction.md](./10_taxonomy_induction.md), and [22_industry_track_paper_eval_plan.md](./22_industry_track_paper_eval_plan.md).
|
|
49
51
|
- **Framework integrator:** read [04_trace_schema.md](./04_trace_schema.md), [05_adapters.md](./05_adapters.md), [13_class_design.md](./13_class_design.md).
|
|
50
52
|
- **UI / product:** read [12_ui_dashboard.md](./12_ui_dashboard.md), [09_error_database.md](./09_error_database.md).
|
|
51
53
|
- **Runtime / agent UX designer:** read [17_claude_code_design_patterns.md](./17_claude_code_design_patterns.md), then [02_architecture.md](./02_architecture.md), [08_recovery.md](./08_recovery.md), and [14_api_reference.md](./14_api_reference.md).
|
|
@@ -13,8 +13,19 @@ from agentdebug.attribution import (
|
|
|
13
13
|
AllAtOnceAttributor,
|
|
14
14
|
AttributionResult,
|
|
15
15
|
Attributor,
|
|
16
|
+
BinarySearchAttributor,
|
|
16
17
|
Blame,
|
|
17
18
|
HeuristicAttributor,
|
|
19
|
+
StepByStepAttributor,
|
|
20
|
+
)
|
|
21
|
+
from agentdebug.detectors import (
|
|
22
|
+
Detector,
|
|
23
|
+
DetectorConfig,
|
|
24
|
+
RepeatedStateDetector,
|
|
25
|
+
RepeatedToolCallDetector,
|
|
26
|
+
StepCountLimitDetector,
|
|
27
|
+
default_detectors,
|
|
28
|
+
run_detectors,
|
|
18
29
|
)
|
|
19
30
|
from agentdebug.events import DEFAULT_BUS, BusEvent, EventBus, EventSubscription
|
|
20
31
|
from agentdebug.models import (
|
|
@@ -28,7 +39,14 @@ from agentdebug.models import (
|
|
|
28
39
|
Modality,
|
|
29
40
|
)
|
|
30
41
|
from agentdebug.recorder import AgentDebug, TraceSession
|
|
31
|
-
from agentdebug.recovery import
|
|
42
|
+
from agentdebug.recovery import (
|
|
43
|
+
DEFAULT_VERIFIERS,
|
|
44
|
+
CriticRecoverer,
|
|
45
|
+
FixProposal,
|
|
46
|
+
Recoverer,
|
|
47
|
+
ReflexionSuggestion,
|
|
48
|
+
VerifierSpec,
|
|
49
|
+
)
|
|
32
50
|
from agentdebug.traceback import CascadeFrame, build_cascade, format_traceback
|
|
33
51
|
from agentdebug.storage import JsonlTraceStore, SQLiteTraceStore
|
|
34
52
|
from agentdebug.taxonomy import SEED_FAILURE_MODES, get_failure_mode
|
|
@@ -43,9 +61,21 @@ __all__ = [
|
|
|
43
61
|
'Attributor',
|
|
44
62
|
'Blame',
|
|
45
63
|
'BusEvent',
|
|
64
|
+
'BinarySearchAttributor',
|
|
46
65
|
'CascadeFrame',
|
|
66
|
+
'CriticRecoverer',
|
|
67
|
+
'DEFAULT_VERIFIERS',
|
|
68
|
+
'Detector',
|
|
69
|
+
'DetectorConfig',
|
|
70
|
+
'RepeatedStateDetector',
|
|
71
|
+
'RepeatedToolCallDetector',
|
|
72
|
+
'StepByStepAttributor',
|
|
73
|
+
'StepCountLimitDetector',
|
|
74
|
+
'VerifierSpec',
|
|
47
75
|
'build_cascade',
|
|
76
|
+
'default_detectors',
|
|
48
77
|
'format_traceback',
|
|
78
|
+
'run_detectors',
|
|
49
79
|
'DEFAULT_BUS',
|
|
50
80
|
'DiagnosticReport',
|
|
51
81
|
'EventBus',
|
|
@@ -66,4 +96,4 @@ __all__ = [
|
|
|
66
96
|
'get_failure_mode',
|
|
67
97
|
]
|
|
68
98
|
|
|
69
|
-
__version__ = '0.2.1'
|
|
99
|
+
__version__ = '0.2.3'
|