agentdebugx 0.2.0__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/PKG-INFO +5 -1
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/README.md +4 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/14_api_reference.md +26 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/15_roadmap.md +11 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/20_deep_debug.md +46 -0
- agentdebugx-0.2.2/docs/22_industry_track_paper_eval_plan.md +235 -0
- agentdebugx-0.2.2/docs/23_status_v0_2.md +108 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/README.md +3 -1
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/pyproject.toml +1 -1
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/__init__.py +23 -1
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/attribution.py +172 -3
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/cli.py +42 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/deep.py +23 -2
- agentdebugx-0.2.2/src/agentdebug/detectors.py +284 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/hub/__init__.py +8 -1
- agentdebugx-0.2.2/src/agentdebug/traceback.py +302 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/ui/server.py +141 -20
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/LICENSE +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/00_overview.md +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/01_literature_survey.md +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/02_architecture.md +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/03_taxonomy.md +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/04_trace_schema.md +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/05_adapters.md +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/06_detectors.md +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/07_attribution.md +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/08_recovery.md +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/09_error_database.md +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/10_taxonomy_induction.md +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/11_multimodal.md +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/12_ui_dashboard.md +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/13_class_design.md +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/16_governance.md +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/17_claude_code_design_patterns.md +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/18_comparison_codex_vs_design.md +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/19_error_hub.md +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/21_integrations.md +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/ERROR_TAXONOMY.md +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/OPEN_SOURCE_DEVELOPMENT_PLAN.md +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/RESEARCH_SURVEY.md +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/benchmarks/v0_1_smoke.json +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/benchmarks/v0_1_smoke.md +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/adapters/__init__.py +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/adapters/base.py +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/adapters/langgraph.py +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/adapters/otel.py +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/adapters/raw.py +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/analyzers.py +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/events.py +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/hub/backend_base.py +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/hub/backends.py +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/hub/bundle.py +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/hub/scrub.py +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/instrumentation.py +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/integrations/__init__.py +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/integrations/claude_skill.py +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/integrations/openhands.py +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/judges.py +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/llm.py +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/models.py +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/recorder.py +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/recovery.py +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/storage.py +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/taxonomy.py +0 -0
- {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/ui/__init__.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: agentdebugx
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.2
|
|
4
4
|
Summary: Portable error analysis, tracing, and recovery framework for agentic AI systems. Import as `agentdebug`.
|
|
5
5
|
License: MIT
|
|
6
6
|
License-File: LICENSE
|
|
@@ -187,6 +187,10 @@ agentdebug serve --store-sqlite .agentdebug/errors.sqlite
|
|
|
187
187
|
# DeepDebug — iterative multi-turn analysis (plan -> hypothesize -> verify -> refine)
|
|
188
188
|
agentdebug deep <trajectory.json>
|
|
189
189
|
|
|
190
|
+
# Render the cascade as a Python-style traceback (root cause first, manifested failure last)
|
|
191
|
+
agentdebug deep <trajectory.json> --traceback
|
|
192
|
+
agentdebug analyze <trajectory.json> --traceback # works without an LLM too
|
|
193
|
+
|
|
190
194
|
# Error Hub: package + push a trace to a Git remote or HF dataset
|
|
191
195
|
agentdebug hub push <trace_id> \
|
|
192
196
|
--to git:git@github.com:your-org/agentdebug-bundles.git#bundles \
|
|
@@ -147,6 +147,10 @@ agentdebug serve --store-sqlite .agentdebug/errors.sqlite
|
|
|
147
147
|
# DeepDebug — iterative multi-turn analysis (plan -> hypothesize -> verify -> refine)
|
|
148
148
|
agentdebug deep <trajectory.json>
|
|
149
149
|
|
|
150
|
+
# Render the cascade as a Python-style traceback (root cause first, manifested failure last)
|
|
151
|
+
agentdebug deep <trajectory.json> --traceback
|
|
152
|
+
agentdebug analyze <trajectory.json> --traceback # works without an LLM too
|
|
153
|
+
|
|
150
154
|
# Error Hub: package + push a trace to a Git remote or HF dataset
|
|
151
155
|
agentdebug hub push <trace_id> \
|
|
152
156
|
--to git:git@github.com:your-org/agentdebug-bundles.git#bundles \
|
|
@@ -215,6 +215,32 @@ agentdebugx taxonomy export --format yaml|json|md
|
|
|
215
215
|
agentdebugx doctor
|
|
216
216
|
```
|
|
217
217
|
|
|
218
|
+
## 11.1 Current shipped `agentdebug` CLI surface
|
|
219
|
+
|
|
220
|
+
The public design above is the long-term `agentdebugx` contract. The current
|
|
221
|
+
package already ships a smaller but working `agentdebug` CLI:
|
|
222
|
+
|
|
223
|
+
```bash
|
|
224
|
+
agentdebug analyze <trajectory.json> [--suggest] [--traceback]
|
|
225
|
+
agentdebug list --store-sqlite .agentdebug/errors.sqlite
|
|
226
|
+
agentdebug show <trace_id> --store-sqlite .agentdebug/errors.sqlite
|
|
227
|
+
agentdebug judge <trajectory.json|trace_id> --attribute [--traceback]
|
|
228
|
+
agentdebug deep <trajectory.json|trace_id> [--traceback]
|
|
229
|
+
agentdebug hub push <trace_id> --to local:/tmp/hub --store-sqlite ...
|
|
230
|
+
agentdebug hub pull <spec> --bundle <bundle_id> --into .agentdebug/hub_pulls
|
|
231
|
+
agentdebug hub list <spec>
|
|
232
|
+
agentdebug integrations skill --target ~/.claude/skills --name agentdebug
|
|
233
|
+
agentdebug integrations openhands-microagent --target .openhands/microagents
|
|
234
|
+
agentdebug serve --store-sqlite .agentdebug/errors.sqlite --port 7777
|
|
235
|
+
agentdebug doctor
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
`--traceback` renders `AgentTraceback`, a Python-traceback-style cascade view
|
|
239
|
+
implemented by `agentdebug.traceback.format_traceback(report, trajectory)`.
|
|
240
|
+
DeepDebug can provide explicit cascade edges through
|
|
241
|
+
`finding.metadata['cascading_from_event_id']`; heuristic and single-pass judge
|
|
242
|
+
reports fall back to step-index ordering.
|
|
243
|
+
|
|
218
244
|
## 12. Configuration file
|
|
219
245
|
|
|
220
246
|
`~/.agentdebugx/settings.yaml`:
|
|
@@ -28,6 +28,17 @@ Acceptance:
|
|
|
28
28
|
|
|
29
29
|
## v0.2 — Coverage + UI (4 weeks)
|
|
30
30
|
|
|
31
|
+
Already shipped in the v0.2/v0.2.1 line:
|
|
32
|
+
|
|
33
|
+
- Error Hub bundle format + Local/Git/Hugging Face backends.
|
|
34
|
+
- DeepDebug iterative analysis loop.
|
|
35
|
+
- Claude Code Skill generator and OpenHands microagent/EventStream bridge.
|
|
36
|
+
- `AgentTraceback` cascade renderer and CLI `--traceback` support.
|
|
37
|
+
- FastAPI local console with native-trace + error-trace alignment for human
|
|
38
|
+
review.
|
|
39
|
+
|
|
40
|
+
Remaining scope from the original v0.2 plan:
|
|
41
|
+
|
|
31
42
|
- Adapters: CrewAI, OpenHands, smolagents, LlamaIndex, DSPy, Pydantic-AI.
|
|
32
43
|
- Detectors: anomaly family (perplexity, repeated-state, topic-drift).
|
|
33
44
|
- Attribution: `BinarySearchAttributor`, `CounterfactualAttributor`, `EnsembleAttributor`.
|
|
@@ -115,6 +115,52 @@ rounds : plan (4.6s) hypothesize (11.0s)
|
|
|
115
115
|
The single-pass `LLMJudgeAnalyzer` on the same trace returned only the first
|
|
116
116
|
finding. DeepDebug recovered the full cascade and selected the upstream cause.
|
|
117
117
|
|
|
118
|
+
## 6.1 AgentTraceback — Python-traceback-style cascade view
|
|
119
|
+
|
|
120
|
+
Once DeepDebug has populated `finding.metadata['cascading_from_event_id']`,
|
|
121
|
+
`agentdebug.traceback.format_traceback(report, trajectory)` renders the
|
|
122
|
+
cascade in a layout that mirrors Python's `Traceback (most recent call last)`
|
|
123
|
+
— root cause first, manifested failure last, with arrows between hops:
|
|
124
|
+
|
|
125
|
+
```text
|
|
126
|
+
AgentTraceback (root cause first, manifested failure last):
|
|
127
|
+
trace_id=trace_… framework=live-cascade-demo goal='Find latest paper, summarize, then email …'
|
|
128
|
+
|
|
129
|
+
File "root cause", in trajectory
|
|
130
|
+
Step 3 agent=search mode=action.parameter_error confidence=1.00
|
|
131
|
+
module=action
|
|
132
|
+
error> JSON schema validation failed: missing parameter query
|
|
133
|
+
evidence:
|
|
134
|
+
- args={}
|
|
135
|
+
suggested: Validate parameters against tool schemas before execution.
|
|
136
|
+
↓ cascaded to
|
|
137
|
+
File "cascade depth 1", in trajectory
|
|
138
|
+
Step 4 agent=planner mode=verification.premature_stop confidence=1.00
|
|
139
|
+
output> Final answer: AgentDebug is a popular paper.
|
|
140
|
+
↓ cascaded to
|
|
141
|
+
File "cascade depth 1", in trajectory
|
|
142
|
+
Step 4 agent=planner mode=memory.hallucination confidence=0.95
|
|
143
|
+
output> Final answer: AgentDebug is a popular paper.
|
|
144
|
+
|
|
145
|
+
AgentFailure[memory.hallucination]: The search agent failed to provide the
|
|
146
|
+
required 'query' parameter in its tool call, leading to a tool error. The
|
|
147
|
+
planner then hallucinated a generic fact about the paper and prematurely
|
|
148
|
+
terminated the task without completing the summary or email steps.
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
CLI:
|
|
152
|
+
|
|
153
|
+
```bash
|
|
154
|
+
agentdebug deep <trajectory.json> --traceback # render to stdout
|
|
155
|
+
agentdebug analyze <trajectory.json> --traceback # works for rule analyzer too
|
|
156
|
+
agentdebug judge <traj|trace_id> --attribute --traceback
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
When DeepDebug isn't available (heuristic analyzer or single-pass judge),
|
|
160
|
+
the renderer falls back to **step-index ordering** — the earliest finding
|
|
161
|
+
becomes the root and later findings cascade from it. This means
|
|
162
|
+
`--traceback` works on any analyzer in the pipeline, not just DeepDebug.
|
|
163
|
+
|
|
118
164
|
## 7. Failure modes
|
|
119
165
|
|
|
120
166
|
- **Cost blowout** — if `max_hypotheses_to_verify` is high and verify is
|
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
# 22 - EMNLP Industry Track Paper + Evaluation Plan
|
|
2
|
+
|
|
3
|
+
This note translates the EMNLP 2026 Industry Track call and recent ACL/EMNLP
|
|
4
|
+
Industry Track patterns into a concrete writing and evaluation plan for the
|
|
5
|
+
AgentDebugX paper.
|
|
6
|
+
|
|
7
|
+
## 1. What Industry Track Reviewers Reward
|
|
8
|
+
|
|
9
|
+
Industry-track papers are not judged like pure method papers. The recurring
|
|
10
|
+
shape in strong papers is:
|
|
11
|
+
|
|
12
|
+
1. Real deployment pain, not a benchmark-only motivation.
|
|
13
|
+
2. A system that can actually be used by practitioners.
|
|
14
|
+
3. Evaluation under practical constraints: cost, latency, scale, privacy,
|
|
15
|
+
maintainability, human workflow, and failure modes.
|
|
16
|
+
4. Lessons learned that future builders can reuse.
|
|
17
|
+
5. Clear limitations and responsible deployment boundaries.
|
|
18
|
+
|
|
19
|
+
Examples worth emulating:
|
|
20
|
+
|
|
21
|
+
- **Experience Report: Implementing Machine Translation in a Regulated
|
|
22
|
+
Industry** (EMNLP 2025 Industry): emphasizes legal/security constraints,
|
|
23
|
+
human-in-the-loop validation, and reviewer preferences over more than 11k
|
|
24
|
+
ranked translations. Source: https://aclanthology.org/2025.emnlp-industry/
|
|
25
|
+
- **STREAQ** (EMNLP 2025 Industry): frames the contribution as an industrial
|
|
26
|
+
cost-quality routing system and reports both model quality and operational
|
|
27
|
+
cost reduction. Source: https://aclanthology.org/2025.emnlp-industry.121/
|
|
28
|
+
- **RAVEN** (ACL 2025 Industry): combines industrial data, public benchmarks,
|
|
29
|
+
deployment pipeline, and online A/B validation. Source:
|
|
30
|
+
https://aclanthology.org/2025.acl-industry.3/
|
|
31
|
+
- **ARIA** (EMNLP 2025 Industry): uses a realistic deployed domain plus public
|
|
32
|
+
dynamic-knowledge tasks, and states deployment scope.
|
|
33
|
+
- **AutoPenBench** (EMNLP 2025 Industry): releases an open benchmark, reports
|
|
34
|
+
autonomous versus human-assisted agent success, and uses intermediate
|
|
35
|
+
milestones to show where agents struggle. Source:
|
|
36
|
+
https://aclanthology.org/2025.emnlp-industry.114/
|
|
37
|
+
|
|
38
|
+
For AgentDebugX, the paper should therefore be framed as:
|
|
39
|
+
|
|
40
|
+
> A deployment-oriented debugging layer for agentic NLP systems, evaluated on
|
|
41
|
+
> whether it makes failures observable, attributable, shareable, and easier for
|
|
42
|
+
> humans to fix.
|
|
43
|
+
|
|
44
|
+
The central claim should not be "we beat every attributor." A stronger claim
|
|
45
|
+
for the Industry Track is:
|
|
46
|
+
|
|
47
|
+
> AgentDebugX provides the missing operating layer between raw agent traces and
|
|
48
|
+
> actionable debugging workflows: aligned native/error traces, taxonomy-backed
|
|
49
|
+
> reports, Error Hub bundles, and cost-aware analysis profiles.
|
|
50
|
+
|
|
51
|
+
## 2. Appendix Rule
|
|
52
|
+
|
|
53
|
+
The EMNLP 2026 Industry Track permits appendices after the bibliography. The
|
|
54
|
+
appendix does not count against the 6-page review limit, but the main paper
|
|
55
|
+
must be self-contained and reviewers are not required to review appendices.
|
|
56
|
+
Source: https://2026.emnlp.org/calls/industry_track/
|
|
57
|
+
So:
|
|
58
|
+
|
|
59
|
+
- Main body: problem, system, screenshot, concise evaluation table, key
|
|
60
|
+
findings, limitations.
|
|
61
|
+
- Appendix: benchmark matrix, annotation schema, prompts, model settings,
|
|
62
|
+
study protocol, redaction examples, additional error traces.
|
|
63
|
+
|
|
64
|
+
Do not hide the core evaluation logic in the appendix. Put enough in the main
|
|
65
|
+
body that a reviewer can assess technical quality without reading extra pages.
|
|
66
|
+
|
|
67
|
+
## 3. Main-Body Narrative
|
|
68
|
+
|
|
69
|
+
Recommended six-page spine:
|
|
70
|
+
|
|
71
|
+
1. **Introduction**: agent systems fail through cascades; raw traces are not
|
|
72
|
+
enough; teams need who/when/why/fix.
|
|
73
|
+
2. **Deployment Requirements**: low-friction instrumentation, privacy,
|
|
74
|
+
portable schema, cost-aware analysis, human review.
|
|
75
|
+
3. **System**: recorder + schema + taxonomy + detectors/attributors +
|
|
76
|
+
Error Hub + UI.
|
|
77
|
+
4. **Use Case Figure**: paired native trace and AgentDebugX error trace.
|
|
78
|
+
5. **Evaluation**: benchmark coverage, diagnostic accuracy, human utility,
|
|
79
|
+
operational overhead.
|
|
80
|
+
6. **Lessons/Limitations**: where it works, where it does not, safety.
|
|
81
|
+
|
|
82
|
+
## 4. Evaluation Questions
|
|
83
|
+
|
|
84
|
+
### Q1. Coverage
|
|
85
|
+
|
|
86
|
+
Can AgentDebugX ingest different agent classes without bespoke debugging code?
|
|
87
|
+
|
|
88
|
+
Benchmarks:
|
|
89
|
+
|
|
90
|
+
- AgentErrorBench: failure-labeled traces over ALFWorld, GAIA, WebShop.
|
|
91
|
+
- MAST and Who&When: multi-agent attribution and failure taxonomy labels.
|
|
92
|
+
- AgentRx: 115 failed trajectories with critical-step labels.
|
|
93
|
+
- WebShop/WebArena: web navigation and tool-use.
|
|
94
|
+
- tau-bench: retail/airline tool-agent-user interaction.
|
|
95
|
+
- SWE-bench Lite/Verified: coding agents with executable tests.
|
|
96
|
+
- OSWorld: multimodal desktop/GUI agents.
|
|
97
|
+
|
|
98
|
+
Metric: conversion success rate, required adapter LOC, event coverage, artifact
|
|
99
|
+
coverage, and schema loss notes.
|
|
100
|
+
|
|
101
|
+
### Q2. Diagnostic Accuracy
|
|
102
|
+
|
|
103
|
+
Can AgentDebugX classify and localize failures?
|
|
104
|
+
|
|
105
|
+
Labels:
|
|
106
|
+
|
|
107
|
+
- failure family
|
|
108
|
+
- failure mode
|
|
109
|
+
- root event ID
|
|
110
|
+
- root agent
|
|
111
|
+
- root step
|
|
112
|
+
- cascade edges
|
|
113
|
+
- evidence spans
|
|
114
|
+
- accepted repair
|
|
115
|
+
|
|
116
|
+
Metrics:
|
|
117
|
+
|
|
118
|
+
- family macro-F1
|
|
119
|
+
- mode macro-F1
|
|
120
|
+
- responsible-agent accuracy
|
|
121
|
+
- root-step exact match
|
|
122
|
+
- root-step +/- 1 match
|
|
123
|
+
- cascade-edge F1
|
|
124
|
+
- false-positive rate on successful traces
|
|
125
|
+
- calibration: confidence vs correctness
|
|
126
|
+
|
|
127
|
+
Baselines:
|
|
128
|
+
|
|
129
|
+
- rule analyzer
|
|
130
|
+
- single-pass LLM judge
|
|
131
|
+
- All-at-Once attribution
|
|
132
|
+
- Step-by-Step attribution (`StepByStepAttributor`, shipped in 0.2.2)
|
|
133
|
+
- DeepDebug verify/refine loop
|
|
134
|
+
- benchmark-native labels or published baselines where available
|
|
135
|
+
|
|
136
|
+
### Q3. Human Utility
|
|
137
|
+
|
|
138
|
+
Does the paired trace view reduce debugging effort?
|
|
139
|
+
|
|
140
|
+
Study design:
|
|
141
|
+
|
|
142
|
+
- 12-24 developers.
|
|
143
|
+
- Within-subject comparison: raw framework trace/logs vs AgentDebugX report.
|
|
144
|
+
- 24-48 total debugging sessions.
|
|
145
|
+
- Counterbalance task order and UI order.
|
|
146
|
+
|
|
147
|
+
Metrics:
|
|
148
|
+
|
|
149
|
+
- time to first plausible root cause
|
|
150
|
+
- time to accepted repair
|
|
151
|
+
- correctness against adjudicated labels
|
|
152
|
+
- number of trace events inspected
|
|
153
|
+
- confidence and workload rating
|
|
154
|
+
- free-text feedback on missing evidence
|
|
155
|
+
|
|
156
|
+
Fallback if recruiting slips:
|
|
157
|
+
|
|
158
|
+
- 3-5 expert agent builders review 30 traces.
|
|
159
|
+
- Ask them to choose between raw trace and AgentDebugX report, rate usefulness,
|
|
160
|
+
and mark incorrect/misleading diagnoses.
|
|
161
|
+
|
|
162
|
+
### Q4. Operational Viability
|
|
163
|
+
|
|
164
|
+
Can teams run this in real workflows?
|
|
165
|
+
|
|
166
|
+
Metrics:
|
|
167
|
+
|
|
168
|
+
- analyzer latency by profile: rule, judge, DeepDebug
|
|
169
|
+
- token cost by trace length
|
|
170
|
+
- local storage overhead
|
|
171
|
+
- UI load time for 100, 1k, 10k events
|
|
172
|
+
- scrubber redaction hit rate
|
|
173
|
+
- scrubber false positives on benign strings
|
|
174
|
+
- Error Hub bundle size and push/pull time
|
|
175
|
+
|
|
176
|
+
## 5. Data Scale
|
|
177
|
+
|
|
178
|
+
Minimum credible submission target:
|
|
179
|
+
|
|
180
|
+
- 500 failed trajectories.
|
|
181
|
+
- 100 successful trajectories for false-positive calibration.
|
|
182
|
+
- At least 30 examples per high-level family where source benchmarks permit.
|
|
183
|
+
- Two annotators per newly labeled trace plus adjudication.
|
|
184
|
+
- DeepDebug on a stratified hard subset of 100 traces.
|
|
185
|
+
|
|
186
|
+
Stronger target:
|
|
187
|
+
|
|
188
|
+
- 1,000 failed trajectories.
|
|
189
|
+
- 200 successful trajectories.
|
|
190
|
+
- DeepDebug on every trace where rule and single-pass judge disagree.
|
|
191
|
+
- 50-100 private pilot traces, scrubbed and reported only in aggregate.
|
|
192
|
+
|
|
193
|
+
## 6. API and Infra Needed
|
|
194
|
+
|
|
195
|
+
Model APIs:
|
|
196
|
+
|
|
197
|
+
- OpenAI-compatible endpoint as the default abstraction.
|
|
198
|
+
- OpenAI, Gemini, Anthropic-through-proxy/LiteLLM, and local vLLM/Ollama where
|
|
199
|
+
feasible.
|
|
200
|
+
|
|
201
|
+
Benchmark APIs:
|
|
202
|
+
|
|
203
|
+
- WebShop/ALFWorld/GAIA loaders for AgentErrorBench.
|
|
204
|
+
- MAST/Who&When/AgentRx importers preserving existing labels.
|
|
205
|
+
- tau-bench user/tool simulator wrapper.
|
|
206
|
+
- WebArena browser harness with DOM/text/action capture.
|
|
207
|
+
- SWE-bench Docker harness with shell, patch, and test-output capture.
|
|
208
|
+
- OSWorld capture path for screenshot, accessibility tree, click/action, and
|
|
209
|
+
verifier result.
|
|
210
|
+
|
|
211
|
+
Storage/export:
|
|
212
|
+
|
|
213
|
+
- SQLite for local experiments.
|
|
214
|
+
- Error Hub bundles for sharing.
|
|
215
|
+
- Parquet manifest roll-up for large result analysis.
|
|
216
|
+
|
|
217
|
+
## 7. What Should Go in the Appendix
|
|
218
|
+
|
|
219
|
+
- Full benchmark matrix.
|
|
220
|
+
- Exact label schema and examples.
|
|
221
|
+
- Prompts for LLM judge, attributor, and DeepDebug.
|
|
222
|
+
- Model settings and token budgets.
|
|
223
|
+
- Human study instructions and consent/safety notes.
|
|
224
|
+
- Redaction examples.
|
|
225
|
+
- Two or three full trace/report examples.
|
|
226
|
+
- Failure cases where AgentDebugX is wrong.
|
|
227
|
+
|
|
228
|
+
## 8. Immediate TODO Before Submission
|
|
229
|
+
|
|
230
|
+
1. Convert at least two public benchmark sources into AgentTrajectory.
|
|
231
|
+
2. Produce a first 100-trace labeled set.
|
|
232
|
+
3. Add an evaluation runner that outputs a single CSV/JSONL.
|
|
233
|
+
4. Run rule, judge, All-at-Once, and DeepDebug on the same split.
|
|
234
|
+
5. Add a small human/expert review with raw trace vs paired trace view.
|
|
235
|
+
6. Trim main body to 6 pages while keeping appendix rich.
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
# 23 — Capability + Test Coverage Status (v0.2.2)
|
|
2
|
+
|
|
3
|
+
A live audit of what's implemented, what's tested, and what's specced but
|
|
4
|
+
not yet built. Pair this with [docs/15_roadmap.md](./15_roadmap.md), which is
|
|
5
|
+
the forward-looking plan; this doc is the rear-view mirror.
|
|
6
|
+
|
|
7
|
+
## 1. What ships in v0.2.2 (live on PyPI)
|
|
8
|
+
|
|
9
|
+
| Layer | Module | Status | Tests |
|
|
10
|
+
|---|---|---|---|
|
|
11
|
+
| Trace IR | `agentdebug.models` | ✅ stable | round-trip + enum tests |
|
|
12
|
+
| Storage | `agentdebug.storage` (JSONL + SQLite) | ✅ stable | round-trip + ctx-mgr |
|
|
13
|
+
| Recorder | `agentdebug.recorder` (`AgentDebug`, `TraceSession`) | ✅ stable | record + analyze flow |
|
|
14
|
+
| Rule analyzer | `agentdebug.analyzers.HeuristicAnalyzer` | ✅ stable | match + suggest |
|
|
15
|
+
| Taxonomy | `agentdebug.taxonomy` (19 seed modes) | ✅ stable | get_mode + list |
|
|
16
|
+
| Function instrumentation | `agentdebug.instrumentation.traced_tool` | ✅ stable | happy + raise |
|
|
17
|
+
| Event bus | `agentdebug.events.EventBus` | ✅ stable | fan-out + auto-detach |
|
|
18
|
+
| LLM client | `agentdebug.llm.OpenAICompatClient` | ✅ stable | mocked httpx + env |
|
|
19
|
+
| LLM judge | `agentdebug.judges.LLMJudgeAnalyzer` | ✅ stable | scripted-LLM happy + silent |
|
|
20
|
+
| Attribution | `agentdebug.attribution.HeuristicAttributor` | ✅ stable | first-finding + tiebreak |
|
|
21
|
+
| Attribution | `agentdebug.attribution.AllAtOnceAttributor` | ✅ stable | mocked LLM + fallback |
|
|
22
|
+
| Attribution | `agentdebug.attribution.StepByStepAttributor` | ✅ **new 0.2.2** | scripted-LLM + fallback |
|
|
23
|
+
| Recovery | `agentdebug.recovery.ReflexionSuggestion` | ✅ stable | per-finding + empty |
|
|
24
|
+
| DeepDebug | `agentdebug.deep.DeepDebugAnalyzer` | ✅ stable | full loop + silent LLM |
|
|
25
|
+
| Cascade view | `agentdebug.traceback.format_traceback` | ✅ stable | cascade + step-order + ANSI + empty |
|
|
26
|
+
| Detectors | `agentdebug.detectors.RepeatedToolCall / RepeatedState / StepCountLimit` | ✅ **new 0.2.2** | threshold + window + budget |
|
|
27
|
+
| Hub bundle | `agentdebug.hub.Bundle / pack_bundle / unpack_bundle` | ✅ stable | round-trip |
|
|
28
|
+
| Hub scrubber | `agentdebug.hub.Scrubber` | ✅ stable | 12 redactions + idempotent |
|
|
29
|
+
| Hub backends | `LocalHubBackend`, `GitHubBackend`, `HuggingFaceBackend` | ✅ stable | local-bare-git + local |
|
|
30
|
+
| Adapters | `agentdebug.adapters.raw` (`trace_loop`, `mark_step`) | ✅ stable | end-to-end + ctxvar |
|
|
31
|
+
| Adapters | `agentdebug.adapters.langgraph.LangChainCallbackAdapter` | ✅ stable | gracefully degrades w/o dep |
|
|
32
|
+
| Adapters | `agentdebug.adapters.otel.OTelExportAdapter` | ✅ stable | branch test |
|
|
33
|
+
| Integrations | `agentdebug.integrations.claude_skill` | ✅ stable | skill-bundle write |
|
|
34
|
+
| Integrations | `agentdebug.integrations.openhands` (microagent + bridge) | ⚠️ microagent stable; bridge needs live OpenHands | microagent YAML test |
|
|
35
|
+
| CLI | `agentdebug.cli` (`analyze \| judge \| deep \| list \| show \| hub \| integrations \| serve \| doctor`) | ✅ stable | 12 subcommand smoke tests |
|
|
36
|
+
| Local UI | `agentdebug.ui` (FastAPI + vanilla JS console) | ✅ stable | endpoint round-trip |
|
|
37
|
+
|
|
38
|
+
**Test counts:** 60+ unit tests + 1 live-LLM smoke test, `mypy --strict` clean
|
|
39
|
+
across 32 source files.
|
|
40
|
+
|
|
41
|
+
## 2. Designed in docs, not yet implemented
|
|
42
|
+
|
|
43
|
+
| Doc | Component | Why deferred | Realistic ship |
|
|
44
|
+
|---|---|---|---|
|
|
45
|
+
| [06_detectors.md](./06_detectors.md) | `trajectory_perplexity` (TrajAD) | needs token-level LM perplexity API or embedding model + baseline calibration | v0.3 |
|
|
46
|
+
| [06_detectors.md](./06_detectors.md) | `topic_drift` (embedding cosine) | needs embedding client; consider reusing `OpenAICompatClient` `/embeddings` | v0.3 |
|
|
47
|
+
| [06_detectors.md](./06_detectors.md) | LTL spec monitors | requires user-supplied spec or LLM-synthesized monitors; gated on RV research | v1.2 |
|
|
48
|
+
| [07_attribution.md](./07_attribution.md) | `BinarySearchAttributor` (ddmin) | requires replayable environment; few frameworks expose it | v0.3 |
|
|
49
|
+
| [07_attribution.md](./07_attribution.md) | `CounterfactualAttributor` | requires re-rolling agent actions; same replay constraint | v0.3 |
|
|
50
|
+
| [07_attribution.md](./07_attribution.md) | `SBFLAttributor` (Tarantula/Ochiai) | needs corpus of passing + failing traces of same task; gated on Hub adoption | v0.4 |
|
|
51
|
+
| [07_attribution.md](./07_attribution.md) | `DeltaDebugAttributor` (Zeller) | same replay constraint | v0.3 |
|
|
52
|
+
| [07_attribution.md](./07_attribution.md) | `EnsembleAttributor` | trivial once 2+ heavy backends ship; awaits BinarySearch/Counterfactual | v0.3 |
|
|
53
|
+
| [08_recovery.md](./08_recovery.md) | `SelfRefineLoop` | small but needs a generator-critic-refiner orchestration | v0.3 |
|
|
54
|
+
| [08_recovery.md](./08_recovery.md) | `CriticRecoverer` | needs a verifier registry (search, code-exec, type-check) | v0.3 |
|
|
55
|
+
| [08_recovery.md](./08_recovery.md) | `AutoManualRules` | needs persistent project manual + injection into next-run prompts | v0.3 |
|
|
56
|
+
| [08_recovery.md](./08_recovery.md) | `LangGraphRewind` | depends on LangGraph checkpointer; ships when we have a real LangGraph user | v0.3 |
|
|
57
|
+
| [08_recovery.md](./08_recovery.md) | `SagaRollback` | needs compensation registry on tool definitions; new schema | v0.3 |
|
|
58
|
+
| [08_recovery.md](./08_recovery.md) | `MCTSBranchExploration` (LATS) | heavy; v2 feature | v2.0 |
|
|
59
|
+
| [09_error_database.md](./09_error_database.md) | DuckDB analytical + Parquet archive | optional; Hub bundles already give per-project corpus | v0.3 |
|
|
60
|
+
| [09_error_database.md](./09_error_database.md) | Vector similarity search | needs embedding model + index choice | v0.3 |
|
|
61
|
+
| [10_taxonomy_induction.md](./10_taxonomy_induction.md) | TnT-LLM + BERTopic pipeline | needs ≥ 1k labeled traces to be useful | v0.4 |
|
|
62
|
+
| [11_multimodal.md](./11_multimodal.md) | Screenshot/DOM capture, VLM judge | gated on multimodal user (Claude Computer Use / OpenAI CUA / OpenHands browser) | v1.1 |
|
|
63
|
+
| [12_ui_dashboard.md](./12_ui_dashboard.md) | TUI (Textual) | low priority; CLI + web UI cover the use cases | v0.4 |
|
|
64
|
+
| [12_ui_dashboard.md](./12_ui_dashboard.md) | VSCode extension | needs TS extension scaffolding | v1.0 |
|
|
65
|
+
| [05_adapters.md](./05_adapters.md) | CrewAI, OpenAI Agents SDK, AutoGen, smolagents, LlamaIndex, DSPy, Pydantic-AI | each is ~150 LOC + conformance test; ship as users land | rolling |
|
|
66
|
+
|
|
67
|
+
## 3. Implementation gaps surfaced by the audit
|
|
68
|
+
|
|
69
|
+
The audit found one real bug and a handful of test gaps:
|
|
70
|
+
|
|
71
|
+
1. **`agentdebug.hub.build_manifest` was used by the CLI but not re-exported** —
|
|
72
|
+
would have surfaced as `ImportError` for any user calling
|
|
73
|
+
`agentdebug hub push`. Fixed in 0.2.2 (`hub/__init__.py`) and locked in by a
|
|
74
|
+
CLI smoke test.
|
|
75
|
+
2. **`cli.py` had 0% coverage** — every subcommand now has a smoke test that
|
|
76
|
+
exercises the argparse path; the LLM-required commands assert the
|
|
77
|
+
"missing credentials" exit code without hitting the network.
|
|
78
|
+
3. **`instrumentation.py` (`traced_tool`) had 0% coverage** — happy and
|
|
79
|
+
exception paths now tested.
|
|
80
|
+
4. **`llm.OpenAICompatClient.complete` had no test** — covered by a custom
|
|
81
|
+
`httpx.BaseTransport` that returns canned JSON without a network call.
|
|
82
|
+
5. **`recovery.ReflexionSuggestion`** had only an indirect test from DeepDebug
|
|
83
|
+
examples; now has direct happy + empty tests.
|
|
84
|
+
|
|
85
|
+
## 4. Coverage matrix (post-0.2.2)
|
|
86
|
+
|
|
87
|
+
Run `PYTHONPATH=src pytest --cov=agentdebug --cov-report=term`. The two largest
|
|
88
|
+
remaining gaps are deliberate:
|
|
89
|
+
|
|
90
|
+
- `agentdebug.adapters.langgraph` — exercised only when `langchain_core` is
|
|
91
|
+
installed. The status-test verifies graceful degradation when it isn't.
|
|
92
|
+
- `agentdebug.hub.backends.HuggingFaceBackend` — gated on `huggingface_hub`.
|
|
93
|
+
Round-tripping through real HF requires `HF_TOKEN`; covered by the local
|
|
94
|
+
bare-git test for the analogous push/pull flow.
|
|
95
|
+
|
|
96
|
+
## 5. Acceptance gates for v0.3 (next minor)
|
|
97
|
+
|
|
98
|
+
Before v0.3 ships, this doc should record green checkmarks for:
|
|
99
|
+
|
|
100
|
+
- [ ] One replayable counterfactual attributor (`BinarySearchAttributor` is
|
|
101
|
+
the cheapest entry).
|
|
102
|
+
- [ ] One tool-grounded recovery strategy (`CriticRecoverer`) wired against
|
|
103
|
+
a `Verifier` Protocol.
|
|
104
|
+
- [ ] One additional framework adapter that goes through the full conformance
|
|
105
|
+
suite (CrewAI is the most-requested).
|
|
106
|
+
- [ ] HuggingFace Hub round-trip live test (gated on `HF_TOKEN`).
|
|
107
|
+
- [ ] Bench harness extended with one published-benchmark loader (Who&When
|
|
108
|
+
is the obvious first target — we already cite its method).
|
|
@@ -31,6 +31,8 @@ This `docs/` directory contains the full design specification.
|
|
|
31
31
|
| 19 | [19_error_hub.md](./19_error_hub.md) | **Error Hub** — bundle format, Local / Git / HF backends, scrubbing |
|
|
32
32
|
| 20 | [20_deep_debug.md](./20_deep_debug.md) | **DeepDebug** — iterative multi-turn analysis (plan → hypothesize → verify → refine) |
|
|
33
33
|
| 21 | [21_integrations.md](./21_integrations.md) | **Claude Code Skill** + **OpenHands** microagent + EventStream bridge |
|
|
34
|
+
| 22 | [22_industry_track_paper_eval_plan.md](./22_industry_track_paper_eval_plan.md) | EMNLP Industry Track writing strategy + benchmark / human-study evaluation plan |
|
|
35
|
+
| 23 | [23_status_v0_2.md](./23_status_v0_2.md) | **Capability + test coverage status (v0.2.2)** — what's implemented, what's tested, what's specced but not built |
|
|
34
36
|
|
|
35
37
|
Plus three **narrative** docs that pre-dated this engineering spec and are kept for paper-style framing:
|
|
36
38
|
|
|
@@ -45,7 +47,7 @@ Plus three **narrative** docs that pre-dated this engineering spec and are kept
|
|
|
45
47
|
## How to read this
|
|
46
48
|
|
|
47
49
|
- **First-time reader:** start with [00_overview.md](./00_overview.md), then [02_architecture.md](./02_architecture.md), then [14_api_reference.md](./14_api_reference.md).
|
|
48
|
-
- **Researcher / paper author:** read [01_literature_survey.md](./01_literature_survey.md), [03_taxonomy.md](./03_taxonomy.md), [07_attribution.md](./07_attribution.md), [10_taxonomy_induction.md](./10_taxonomy_induction.md).
|
|
50
|
+
- **Researcher / paper author:** read [01_literature_survey.md](./01_literature_survey.md), [03_taxonomy.md](./03_taxonomy.md), [07_attribution.md](./07_attribution.md), [10_taxonomy_induction.md](./10_taxonomy_induction.md), and [22_industry_track_paper_eval_plan.md](./22_industry_track_paper_eval_plan.md).
|
|
49
51
|
- **Framework integrator:** read [04_trace_schema.md](./04_trace_schema.md), [05_adapters.md](./05_adapters.md), [13_class_design.md](./13_class_design.md).
|
|
50
52
|
- **UI / product:** read [12_ui_dashboard.md](./12_ui_dashboard.md), [09_error_database.md](./09_error_database.md).
|
|
51
53
|
- **Runtime / agent UX designer:** read [17_claude_code_design_patterns.md](./17_claude_code_design_patterns.md), then [02_architecture.md](./02_architecture.md), [08_recovery.md](./08_recovery.md), and [14_api_reference.md](./14_api_reference.md).
|
|
@@ -15,6 +15,16 @@ from agentdebug.attribution import (
|
|
|
15
15
|
Attributor,
|
|
16
16
|
Blame,
|
|
17
17
|
HeuristicAttributor,
|
|
18
|
+
StepByStepAttributor,
|
|
19
|
+
)
|
|
20
|
+
from agentdebug.detectors import (
|
|
21
|
+
Detector,
|
|
22
|
+
DetectorConfig,
|
|
23
|
+
RepeatedStateDetector,
|
|
24
|
+
RepeatedToolCallDetector,
|
|
25
|
+
StepCountLimitDetector,
|
|
26
|
+
default_detectors,
|
|
27
|
+
run_detectors,
|
|
18
28
|
)
|
|
19
29
|
from agentdebug.events import DEFAULT_BUS, BusEvent, EventBus, EventSubscription
|
|
20
30
|
from agentdebug.models import (
|
|
@@ -29,6 +39,7 @@ from agentdebug.models import (
|
|
|
29
39
|
)
|
|
30
40
|
from agentdebug.recorder import AgentDebug, TraceSession
|
|
31
41
|
from agentdebug.recovery import FixProposal, Recoverer, ReflexionSuggestion
|
|
42
|
+
from agentdebug.traceback import CascadeFrame, build_cascade, format_traceback
|
|
32
43
|
from agentdebug.storage import JsonlTraceStore, SQLiteTraceStore
|
|
33
44
|
from agentdebug.taxonomy import SEED_FAILURE_MODES, get_failure_mode
|
|
34
45
|
|
|
@@ -42,6 +53,17 @@ __all__ = [
|
|
|
42
53
|
'Attributor',
|
|
43
54
|
'Blame',
|
|
44
55
|
'BusEvent',
|
|
56
|
+
'CascadeFrame',
|
|
57
|
+
'Detector',
|
|
58
|
+
'DetectorConfig',
|
|
59
|
+
'RepeatedStateDetector',
|
|
60
|
+
'RepeatedToolCallDetector',
|
|
61
|
+
'StepByStepAttributor',
|
|
62
|
+
'StepCountLimitDetector',
|
|
63
|
+
'build_cascade',
|
|
64
|
+
'default_detectors',
|
|
65
|
+
'format_traceback',
|
|
66
|
+
'run_detectors',
|
|
45
67
|
'DEFAULT_BUS',
|
|
46
68
|
'DiagnosticReport',
|
|
47
69
|
'EventBus',
|
|
@@ -62,4 +84,4 @@ __all__ = [
|
|
|
62
84
|
'get_failure_mode',
|
|
63
85
|
]
|
|
64
86
|
|
|
65
|
-
__version__ = '0.2.0'
|
|
87
|
+
__version__ = '0.2.2'
|