agentdebugx 0.2.0__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/PKG-INFO +5 -1
  2. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/README.md +4 -0
  3. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/14_api_reference.md +26 -0
  4. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/15_roadmap.md +11 -0
  5. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/20_deep_debug.md +46 -0
  6. agentdebugx-0.2.2/docs/22_industry_track_paper_eval_plan.md +235 -0
  7. agentdebugx-0.2.2/docs/23_status_v0_2.md +108 -0
  8. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/README.md +3 -1
  9. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/pyproject.toml +1 -1
  10. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/__init__.py +23 -1
  11. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/attribution.py +172 -3
  12. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/cli.py +42 -0
  13. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/deep.py +23 -2
  14. agentdebugx-0.2.2/src/agentdebug/detectors.py +284 -0
  15. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/hub/__init__.py +8 -1
  16. agentdebugx-0.2.2/src/agentdebug/traceback.py +302 -0
  17. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/ui/server.py +141 -20
  18. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/LICENSE +0 -0
  19. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/00_overview.md +0 -0
  20. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/01_literature_survey.md +0 -0
  21. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/02_architecture.md +0 -0
  22. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/03_taxonomy.md +0 -0
  23. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/04_trace_schema.md +0 -0
  24. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/05_adapters.md +0 -0
  25. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/06_detectors.md +0 -0
  26. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/07_attribution.md +0 -0
  27. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/08_recovery.md +0 -0
  28. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/09_error_database.md +0 -0
  29. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/10_taxonomy_induction.md +0 -0
  30. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/11_multimodal.md +0 -0
  31. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/12_ui_dashboard.md +0 -0
  32. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/13_class_design.md +0 -0
  33. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/16_governance.md +0 -0
  34. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/17_claude_code_design_patterns.md +0 -0
  35. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/18_comparison_codex_vs_design.md +0 -0
  36. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/19_error_hub.md +0 -0
  37. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/21_integrations.md +0 -0
  38. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/ERROR_TAXONOMY.md +0 -0
  39. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/OPEN_SOURCE_DEVELOPMENT_PLAN.md +0 -0
  40. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/RESEARCH_SURVEY.md +0 -0
  41. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/benchmarks/v0_1_smoke.json +0 -0
  42. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/docs/benchmarks/v0_1_smoke.md +0 -0
  43. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/adapters/__init__.py +0 -0
  44. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/adapters/base.py +0 -0
  45. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/adapters/langgraph.py +0 -0
  46. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/adapters/otel.py +0 -0
  47. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/adapters/raw.py +0 -0
  48. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/analyzers.py +0 -0
  49. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/events.py +0 -0
  50. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/hub/backend_base.py +0 -0
  51. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/hub/backends.py +0 -0
  52. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/hub/bundle.py +0 -0
  53. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/hub/scrub.py +0 -0
  54. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/instrumentation.py +0 -0
  55. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/integrations/__init__.py +0 -0
  56. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/integrations/claude_skill.py +0 -0
  57. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/integrations/openhands.py +0 -0
  58. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/judges.py +0 -0
  59. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/llm.py +0 -0
  60. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/models.py +0 -0
  61. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/recorder.py +0 -0
  62. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/recovery.py +0 -0
  63. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/storage.py +0 -0
  64. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/taxonomy.py +0 -0
  65. {agentdebugx-0.2.0 → agentdebugx-0.2.2}/src/agentdebug/ui/__init__.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: agentdebugx
3
- Version: 0.2.0
3
+ Version: 0.2.2
4
4
  Summary: Portable error analysis, tracing, and recovery framework for agentic AI systems. Import as `agentdebug`.
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -187,6 +187,10 @@ agentdebug serve --store-sqlite .agentdebug/errors.sqlite
187
187
  # DeepDebug — iterative multi-turn analysis (plan -> hypothesize -> verify -> refine)
188
188
  agentdebug deep <trajectory.json>
189
189
 
190
+ # Render the cascade as a Python-style traceback (root cause first, manifested failure last)
191
+ agentdebug deep <trajectory.json> --traceback
192
+ agentdebug analyze <trajectory.json> --traceback # works without an LLM too
193
+
190
194
  # Error Hub: package + push a trace to a Git remote or HF dataset
191
195
  agentdebug hub push <trace_id> \
192
196
  --to git:git@github.com:your-org/agentdebug-bundles.git#bundles \
@@ -147,6 +147,10 @@ agentdebug serve --store-sqlite .agentdebug/errors.sqlite
147
147
  # DeepDebug — iterative multi-turn analysis (plan -> hypothesize -> verify -> refine)
148
148
  agentdebug deep <trajectory.json>
149
149
 
150
+ # Render the cascade as a Python-style traceback (root cause first, manifested failure last)
151
+ agentdebug deep <trajectory.json> --traceback
152
+ agentdebug analyze <trajectory.json> --traceback # works without an LLM too
153
+
150
154
  # Error Hub: package + push a trace to a Git remote or HF dataset
151
155
  agentdebug hub push <trace_id> \
152
156
  --to git:git@github.com:your-org/agentdebug-bundles.git#bundles \
@@ -215,6 +215,32 @@ agentdebugx taxonomy export --format yaml|json|md
215
215
  agentdebugx doctor
216
216
  ```
217
217
 
218
+ ## 11.1 Current shipped `agentdebug` CLI surface
219
+
220
+ The public design above is the long-term `agentdebugx` contract. The current
221
+ package already ships a smaller but working `agentdebug` CLI:
222
+
223
+ ```bash
224
+ agentdebug analyze <trajectory.json> [--suggest] [--traceback]
225
+ agentdebug list --store-sqlite .agentdebug/errors.sqlite
226
+ agentdebug show <trace_id> --store-sqlite .agentdebug/errors.sqlite
227
+ agentdebug judge <trajectory.json|trace_id> --attribute [--traceback]
228
+ agentdebug deep <trajectory.json|trace_id> [--traceback]
229
+ agentdebug hub push <trace_id> --to local:/tmp/hub --store-sqlite ...
230
+ agentdebug hub pull <spec> --bundle <bundle_id> --into .agentdebug/hub_pulls
231
+ agentdebug hub list <spec>
232
+ agentdebug integrations skill --target ~/.claude/skills --name agentdebug
233
+ agentdebug integrations openhands-microagent --target .openhands/microagents
234
+ agentdebug serve --store-sqlite .agentdebug/errors.sqlite --port 7777
235
+ agentdebug doctor
236
+ ```
237
+
238
+ `--traceback` renders `AgentTraceback`, a Python-traceback-style cascade view
239
+ implemented by `agentdebug.traceback.format_traceback(report, trajectory)`.
240
+ DeepDebug can provide explicit cascade edges through
241
+ `finding.metadata['cascading_from_event_id']`; heuristic and single-pass judge
242
+ reports fall back to step-index ordering.
243
+
218
244
  ## 12. Configuration file
219
245
 
220
246
  `~/.agentdebugx/settings.yaml`:
@@ -28,6 +28,17 @@ Acceptance:
28
28
 
29
29
  ## v0.2 — Coverage + UI (4 weeks)
30
30
 
31
+ Already shipped in the v0.2/v0.2.1 line:
32
+
33
+ - Error Hub bundle format + Local/Git/Hugging Face backends.
34
+ - DeepDebug iterative analysis loop.
35
+ - Claude Code Skill generator and OpenHands microagent/EventStream bridge.
36
+ - `AgentTraceback` cascade renderer and CLI `--traceback` support.
37
+ - FastAPI local console with native-trace + error-trace alignment for human
38
+ review.
39
+
40
+ Remaining scope from the original v0.2 plan:
41
+
31
42
  - Adapters: CrewAI, OpenHands, smolagents, LlamaIndex, DSPy, Pydantic-AI.
32
43
  - Detectors: anomaly family (perplexity, repeated-state, topic-drift).
33
44
  - Attribution: `BinarySearchAttributor`, `CounterfactualAttributor`, `EnsembleAttributor`.
@@ -115,6 +115,52 @@ rounds : plan (4.6s) hypothesize (11.0s)
115
115
  The single-pass `LLMJudgeAnalyzer` on the same trace returned only the first
116
116
  finding. DeepDebug recovered the full cascade and selected the upstream cause.
117
117
 
118
+ ## 6.1 AgentTraceback — Python-traceback-style cascade view
119
+
120
+ Once DeepDebug has populated `finding.metadata['cascading_from_event_id']`,
121
+ `agentdebug.traceback.format_traceback(report, trajectory)` renders the
122
+ cascade in a layout that mirrors Python's `Traceback (most recent call last)`
123
+ — root cause first, manifested failure last, with arrows between hops:
124
+
125
+ ```text
126
+ AgentTraceback (root cause first, manifested failure last):
127
+ trace_id=trace_… framework=live-cascade-demo goal='Find latest paper, summarize, then email …'
128
+
129
+ File "root cause", in trajectory
130
+ Step 3 agent=search mode=action.parameter_error confidence=1.00
131
+ module=action
132
+ error> JSON schema validation failed: missing parameter query
133
+ evidence:
134
+ - args={}
135
+ suggested: Validate parameters against tool schemas before execution.
136
+ ↓ cascaded to
137
+ File "cascade depth 1", in trajectory
138
+ Step 4 agent=planner mode=verification.premature_stop confidence=1.00
139
+ output> Final answer: AgentDebug is a popular paper.
140
+ ↓ cascaded to
141
+ File "cascade depth 1", in trajectory
142
+ Step 4 agent=planner mode=memory.hallucination confidence=0.95
143
+ output> Final answer: AgentDebug is a popular paper.
144
+
145
+ AgentFailure[memory.hallucination]: The search agent failed to provide the
146
+ required 'query' parameter in its tool call, leading to a tool error. The
147
+ planner then hallucinated a generic fact about the paper and prematurely
148
+ terminated the task without completing the summary or email steps.
149
+ ```
150
+
151
+ CLI:
152
+
153
+ ```bash
154
+ agentdebug deep <trajectory.json> --traceback # render to stdout
155
+ agentdebug analyze <trajectory.json> --traceback # works for rule analyzer too
156
+ agentdebug judge <traj|trace_id> --attribute --traceback
157
+ ```
158
+
159
+ When DeepDebug isn't available (heuristic analyzer or single-pass judge),
160
+ the renderer falls back to **step-index ordering** — the earliest finding
161
+ becomes the root and later findings cascade from it. This means
162
+ `--traceback` works on any analyzer in the pipeline, not just DeepDebug.
163
+
118
164
  ## 7. Failure modes
119
165
 
120
166
  - **Cost blowout** — if `max_hypotheses_to_verify` is high and verify is
@@ -0,0 +1,235 @@
1
+ # 22 - EMNLP Industry Track Paper + Evaluation Plan
2
+
3
+ This note translates the EMNLP 2026 Industry Track call and recent ACL/EMNLP
4
+ Industry Track patterns into a concrete writing and evaluation plan for the
5
+ AgentDebugX paper.
6
+
7
+ ## 1. What Industry Track Reviewers Reward
8
+
9
+ Industry-track papers are not judged like pure method papers. The recurring
10
+ shape in strong papers is:
11
+
12
+ 1. Real deployment pain, not a benchmark-only motivation.
13
+ 2. A system that can actually be used by practitioners.
14
+ 3. Evaluation under practical constraints: cost, latency, scale, privacy,
15
+ maintainability, human workflow, and failure modes.
16
+ 4. Lessons learned that future builders can reuse.
17
+ 5. Clear limitations and responsible deployment boundaries.
18
+
19
+ Examples worth emulating:
20
+
21
+ - **Experience Report: Implementing Machine Translation in a Regulated
22
+ Industry** (EMNLP 2025 Industry): emphasizes legal/security constraints,
23
+ human-in-the-loop validation, and reviewer preferences over more than 11k
24
+ ranked translations. Source: https://aclanthology.org/2025.emnlp-industry/
25
+ - **STREAQ** (EMNLP 2025 Industry): frames the contribution as an industrial
26
+ cost-quality routing system and reports both model quality and operational
27
+ cost reduction. Source: https://aclanthology.org/2025.emnlp-industry.121/
28
+ - **RAVEN** (ACL 2025 Industry): combines industrial data, public benchmarks,
29
+ deployment pipeline, and online A/B validation. Source:
30
+ https://aclanthology.org/2025.acl-industry.3/
31
+ - **ARIA** (EMNLP 2025 Industry): uses a realistic deployed domain plus public
32
+ dynamic-knowledge tasks, and states deployment scope.
33
+ - **AutoPenBench** (EMNLP 2025 Industry): releases an open benchmark, reports
34
+ autonomous versus human-assisted agent success, and uses intermediate
35
+ milestones to show where agents struggle. Source:
36
+ https://aclanthology.org/2025.emnlp-industry.114/
37
+
38
+ For AgentDebugX, the paper should therefore be framed as:
39
+
40
+ > A deployment-oriented debugging layer for agentic NLP systems, evaluated on
41
+ > whether it makes failures observable, attributable, shareable, and easier for
42
+ > humans to fix.
43
+
44
+ The central claim should not be "we beat every attributor." A stronger claim
45
+ for the Industry Track is:
46
+
47
+ > AgentDebugX provides the missing operating layer between raw agent traces and
48
+ > actionable debugging workflows: aligned native/error traces, taxonomy-backed
49
+ > reports, Error Hub bundles, and cost-aware analysis profiles.
50
+
51
+ ## 2. Appendix Rule
52
+
53
+ The EMNLP 2026 Industry Track permits appendices after the bibliography. The
54
+ appendix does not count against the 6-page review limit, but the main paper
55
+ must be self-contained and reviewers are not required to review appendices.
56
+ Source: https://2026.emnlp.org/calls/industry_track/
57
+ In practice, split the material as follows:
58
+
59
+ - Main body: problem, system, screenshot, concise evaluation table, key
60
+ findings, limitations.
61
+ - Appendix: benchmark matrix, annotation schema, prompts, model settings,
62
+ study protocol, redaction examples, additional error traces.
63
+
64
+ Do not hide the core evaluation logic in the appendix. Put enough in the main
65
+ body that a reviewer can assess technical quality without reading extra pages.
66
+
67
+ ## 3. Main-Body Narrative
68
+
69
+ Recommended six-page spine:
70
+
71
+ 1. **Introduction**: agent systems fail through cascades; raw traces are not
72
+ enough; teams need who/when/why/fix.
73
+ 2. **Deployment Requirements**: low-friction instrumentation, privacy,
74
+ portable schema, cost-aware analysis, human review.
75
+ 3. **System**: recorder + schema + taxonomy + detectors/attributors +
76
+ Error Hub + UI.
77
+ 4. **Use Case Figure**: paired native trace and AgentDebugX error trace.
78
+ 5. **Evaluation**: benchmark coverage, diagnostic accuracy, human utility,
79
+ operational overhead.
80
+ 6. **Lessons/Limitations**: where it works, where it does not, safety.
81
+
82
+ ## 4. Evaluation Questions
83
+
84
+ ### Q1. Coverage
85
+
86
+ Can AgentDebugX ingest different agent classes without bespoke debugging code?
87
+
88
+ Benchmarks:
89
+
90
+ - AgentErrorBench: failure-labeled traces over ALFWorld, GAIA, WebShop.
91
+ - MAST and Who&When: multi-agent attribution and failure taxonomy labels.
92
+ - AgentRx: 115 failed trajectories with critical-step labels.
93
+ - WebShop/WebArena: web navigation and tool-use.
94
+ - tau-bench: retail/airline tool-agent-user interaction.
95
+ - SWE-bench Lite/Verified: coding agents with executable tests.
96
+ - OSWorld: multimodal desktop/GUI agents.
97
+
98
+ Metric: conversion success rate, required adapter LOC, event coverage, artifact
99
+ coverage, and schema loss notes.
100
+
101
+ ### Q2. Diagnostic Accuracy
102
+
103
+ Can AgentDebugX classify and localize failures?
104
+
105
+ Labels:
106
+
107
+ - failure family
108
+ - failure mode
109
+ - root event ID
110
+ - root agent
111
+ - root step
112
+ - cascade edges
113
+ - evidence spans
114
+ - accepted repair
115
+
116
+ Metrics:
117
+
118
+ - family macro-F1
119
+ - mode macro-F1
120
+ - responsible-agent accuracy
121
+ - root-step exact match
122
+ - root-step +/- 1 match
123
+ - cascade-edge F1
124
+ - false-positive rate on successful traces
125
+ - calibration: confidence vs correctness
126
+
127
+ Baselines:
128
+
129
+ - rule analyzer
130
+ - single-pass LLM judge
131
+ - All-at-Once attribution
132
+ - Step-by-Step attribution (`StepByStepAttributor`, shipped in 0.2.2)
133
+ - DeepDebug verify/refine loop
134
+ - benchmark-native labels or published baselines where available
135
+
136
+ ### Q3. Human Utility
137
+
138
+ Does the paired trace view reduce debugging effort?
139
+
140
+ Study design:
141
+
142
+ - 12-24 developers.
143
+ - Within-subject comparison: raw framework trace/logs vs AgentDebugX report.
144
+ - 24-48 total debugging sessions.
145
+ - Counterbalance task order and UI order.
146
+
147
+ Metrics:
148
+
149
+ - time to first plausible root cause
150
+ - time to accepted repair
151
+ - correctness against adjudicated labels
152
+ - number of trace events inspected
153
+ - confidence and workload rating
154
+ - free-text feedback on missing evidence
155
+
156
+ Fallback if recruiting slips:
157
+
158
+ - 3-5 expert agent builders review 30 traces.
159
+ - Ask them to choose between raw trace and AgentDebugX report, rate usefulness,
160
+ and mark incorrect/misleading diagnoses.
161
+
162
+ ### Q4. Operational Viability
163
+
164
+ Can teams run this in real workflows?
165
+
166
+ Metrics:
167
+
168
+ - analyzer latency by profile: rule, judge, DeepDebug
169
+ - token cost by trace length
170
+ - local storage overhead
171
+ - UI load time for 100, 1k, 10k events
172
+ - scrubber redaction hit rate
173
+ - scrubber false positives on benign strings
174
+ - Error Hub bundle size and push/pull time
175
+
176
+ ## 5. Data Scale
177
+
178
+ Minimum credible submission target:
179
+
180
+ - 500 failed trajectories.
181
+ - 100 successful trajectories for false-positive calibration.
182
+ - At least 30 examples per high-level family where source benchmarks permit.
183
+ - Two annotators per newly labeled trace plus adjudication.
184
+ - DeepDebug on a stratified hard subset of 100 traces.
185
+
186
+ Stronger target:
187
+
188
+ - 1,000 failed trajectories.
189
+ - 200 successful trajectories.
190
+ - DeepDebug on every trace where rule and single-pass judge disagree.
191
+ - 50-100 private pilot traces, scrubbed and reported only in aggregate.
192
+
193
+ ## 6. API and Infra Needed
194
+
195
+ Model APIs:
196
+
197
+ - OpenAI-compatible endpoint as the default abstraction.
198
+ - OpenAI, Gemini, Anthropic-through-proxy/LiteLLM, and local vLLM/Ollama where
199
+ feasible.
200
+
201
+ Benchmark APIs:
202
+
203
+ - WebShop/ALFWorld/GAIA loaders for AgentErrorBench.
204
+ - MAST/Who&When/AgentRx importers preserving existing labels.
205
+ - tau-bench user/tool simulator wrapper.
206
+ - WebArena browser harness with DOM/text/action capture.
207
+ - SWE-bench Docker harness with shell, patch, and test-output capture.
208
+ - OSWorld capture path for screenshot, accessibility tree, click/action, and
209
+ verifier result.
210
+
211
+ Storage/export:
212
+
213
+ - SQLite for local experiments.
214
+ - Error Hub bundles for sharing.
215
+ - Parquet manifest roll-up for large result analysis.
216
+
217
+ ## 7. What Should Go in the Appendix
218
+
219
+ - Full benchmark matrix.
220
+ - Exact label schema and examples.
221
+ - Prompts for LLM judge, attributor, and DeepDebug.
222
+ - Model settings and token budgets.
223
+ - Human study instructions and consent/safety notes.
224
+ - Redaction examples.
225
+ - Two or three full trace/report examples.
226
+ - Failure cases where AgentDebugX is wrong.
227
+
228
+ ## 8. Immediate TODO Before Submission
229
+
230
+ 1. Convert at least two public benchmark sources into AgentTrajectory.
231
+ 2. Produce a first 100-trace labeled set.
232
+ 3. Add an evaluation runner that outputs a single CSV/JSONL.
233
+ 4. Run rule, judge, All-at-Once, and DeepDebug on the same split.
234
+ 5. Add a small human/expert review with raw trace vs paired trace view.
235
+ 6. Trim main body to 6 pages while keeping appendix rich.
@@ -0,0 +1,108 @@
1
+ # 23 — Capability + Test Coverage Status (v0.2.2)
2
+
3
+ A live audit of what's implemented, what's tested, and what's specced but
4
+ not yet built. Pair this with [docs/15_roadmap.md](./15_roadmap.md), which is
5
+ the forward-looking plan; this doc is the rear-view mirror.
6
+
7
+ ## 1. What ships in v0.2.2 (live on PyPI)
8
+
9
+ | Layer | Module | Status | Tests |
10
+ |---|---|---|---|
11
+ | Trace IR | `agentdebug.models` | ✅ stable | round-trip + enum tests |
12
+ | Storage | `agentdebug.storage` (JSONL + SQLite) | ✅ stable | round-trip + ctx-mgr |
13
+ | Recorder | `agentdebug.recorder` (`AgentDebug`, `TraceSession`) | ✅ stable | record + analyze flow |
14
+ | Rule analyzer | `agentdebug.analyzers.HeuristicAnalyzer` | ✅ stable | match + suggest |
15
+ | Taxonomy | `agentdebug.taxonomy` (19 seed modes) | ✅ stable | get_mode + list |
16
+ | Function instrumentation | `agentdebug.instrumentation.traced_tool` | ✅ stable | happy + raise |
17
+ | Event bus | `agentdebug.events.EventBus` | ✅ stable | fan-out + auto-detach |
18
+ | LLM client | `agentdebug.llm.OpenAICompatClient` | ✅ stable | mocked httpx + env |
19
+ | LLM judge | `agentdebug.judges.LLMJudgeAnalyzer` | ✅ stable | scripted-LLM happy + silent |
20
+ | Attribution | `agentdebug.attribution.HeuristicAttributor` | ✅ stable | first-finding + tiebreak |
21
+ | Attribution | `agentdebug.attribution.AllAtOnceAttributor` | ✅ stable | mocked LLM + fallback |
22
+ | Attribution | `agentdebug.attribution.StepByStepAttributor` | ✅ **new 0.2.2** | scripted-LLM + fallback |
23
+ | Recovery | `agentdebug.recovery.ReflexionSuggestion` | ✅ stable | per-finding + empty |
24
+ | DeepDebug | `agentdebug.deep.DeepDebugAnalyzer` | ✅ stable | full loop + silent LLM |
25
+ | Cascade view | `agentdebug.traceback.format_traceback` | ✅ stable | cascade + step-order + ANSI + empty |
26
+ | Detectors | `agentdebug.detectors` (`RepeatedToolCallDetector`, `RepeatedStateDetector`, `StepCountLimitDetector`) | ✅ **new 0.2.2** | threshold + window + budget |
27
+ | Hub bundle | `agentdebug.hub.Bundle / pack_bundle / unpack_bundle` | ✅ stable | round-trip |
28
+ | Hub scrubber | `agentdebug.hub.Scrubber` | ✅ stable | 12 redactions + idempotent |
29
+ | Hub backends | `LocalHubBackend`, `GitHubBackend`, `HuggingFaceBackend` | ✅ stable | local-bare-git + local |
30
+ | Adapters | `agentdebug.adapters.raw` (`trace_loop`, `mark_step`) | ✅ stable | end-to-end + ctxvar |
31
+ | Adapters | `agentdebug.adapters.langgraph.LangChainCallbackAdapter` | ✅ stable | gracefully degrades w/o dep |
32
+ | Adapters | `agentdebug.adapters.otel.OTelExportAdapter` | ✅ stable | branch test |
33
+ | Integrations | `agentdebug.integrations.claude_skill` | ✅ stable | skill-bundle write |
34
+ | Integrations | `agentdebug.integrations.openhands` (microagent + bridge) | ⚠️ microagent stable; bridge needs live OpenHands | microagent YAML test |
35
+ | CLI | `agentdebug.cli` (`analyze`, `judge`, `deep`, `list`, `show`, `hub`, `integrations`, `serve`, `doctor`) | ✅ stable | 12 subcommand smoke tests |
36
+ | Local UI | `agentdebug.ui` (FastAPI + vanilla JS console) | ✅ stable | endpoint round-trip |
37
+
38
+ **Test counts:** 60+ unit tests + 1 live-LLM smoke test, `mypy --strict` clean
39
+ across 32 source files.
40
+
41
+ ## 2. Designed in docs, not yet implemented
42
+
43
+ | Doc | Component | Why deferred | Realistic ship |
44
+ |---|---|---|---|
45
+ | [06_detectors.md](./06_detectors.md) | `trajectory_perplexity` (TrajAD) | needs token-level LM perplexity API or embedding model + baseline calibration | v0.3 |
46
+ | [06_detectors.md](./06_detectors.md) | `topic_drift` (embedding cosine) | needs embedding client; consider reusing `OpenAICompatClient` `/embeddings` | v0.3 |
47
+ | [06_detectors.md](./06_detectors.md) | LTL spec monitors | requires user-supplied spec or LLM-synthesized monitors; gated on RV research | v1.2 |
48
+ | [07_attribution.md](./07_attribution.md) | `BinarySearchAttributor` (ddmin) | requires replayable environment; few frameworks expose it | v0.3 |
49
+ | [07_attribution.md](./07_attribution.md) | `CounterfactualAttributor` | requires re-rolling agent actions; same replay constraint | v0.3 |
50
+ | [07_attribution.md](./07_attribution.md) | `SBFLAttributor` (Tarantula/Ochiai) | needs corpus of passing + failing traces of same task; gated on Hub adoption | v0.4 |
51
+ | [07_attribution.md](./07_attribution.md) | `DeltaDebugAttributor` (Zeller) | same replay constraint | v0.3 |
52
+ | [07_attribution.md](./07_attribution.md) | `EnsembleAttributor` | trivial once 2+ heavy backends ship; awaits BinarySearch/Counterfactual | v0.3 |
53
+ | [08_recovery.md](./08_recovery.md) | `SelfRefineLoop` | small but needs a generator-critic-refiner orchestration | v0.3 |
54
+ | [08_recovery.md](./08_recovery.md) | `CriticRecoverer` | needs a verifier registry (search, code-exec, type-check) | v0.3 |
55
+ | [08_recovery.md](./08_recovery.md) | `AutoManualRules` | needs persistent project manual + injection into next-run prompts | v0.3 |
56
+ | [08_recovery.md](./08_recovery.md) | `LangGraphRewind` | depends on LangGraph checkpointer; ships when we have a real LangGraph user | v0.3 |
57
+ | [08_recovery.md](./08_recovery.md) | `SagaRollback` | needs compensation registry on tool definitions; new schema | v0.3 |
58
+ | [08_recovery.md](./08_recovery.md) | `MCTSBranchExploration` (LATS) | heavy; v2 feature | v2.0 |
59
+ | [09_error_database.md](./09_error_database.md) | DuckDB analytical + Parquet archive | optional; Hub bundles already give per-project corpus | v0.3 |
60
+ | [09_error_database.md](./09_error_database.md) | Vector similarity search | needs embedding model + index choice | v0.3 |
61
+ | [10_taxonomy_induction.md](./10_taxonomy_induction.md) | TnT-LLM + BERTopic pipeline | needs ≥ 1k labeled traces to be useful | v0.4 |
62
+ | [11_multimodal.md](./11_multimodal.md) | Screenshot/DOM capture, VLM judge | gated on multimodal user (Claude Computer Use / OpenAI CUA / OpenHands browser) | v1.1 |
63
+ | [12_ui_dashboard.md](./12_ui_dashboard.md) | TUI (Textual) | low priority; CLI + web UI cover the use cases | v0.4 |
64
+ | [12_ui_dashboard.md](./12_ui_dashboard.md) | VSCode extension | needs TS extension scaffolding | v1.0 |
65
+ | [05_adapters.md](./05_adapters.md) | CrewAI, OpenAI Agents SDK, AutoGen, smolagents, LlamaIndex, DSPy, Pydantic-AI | each is ~150 LOC + conformance test; ship as users land | rolling |
66
+
67
+ ## 3. Implementation gaps surfaced by the audit
68
+
69
+ The audit found one real bug and a handful of test gaps:
70
+
71
+ 1. **`agentdebug.hub.build_manifest` was used by the CLI but not re-exported** —
72
+ would have surfaced as `ImportError` for any user calling
73
+ `agentdebug hub push`. Fixed in 0.2.2 (`hub/__init__.py`) and locked in by a
74
+ CLI smoke test.
75
+ 2. **`cli.py` had 0% coverage** — every subcommand now has a smoke test that
76
+ exercises the argparse path; the LLM-required commands assert the
77
+ "missing credentials" exit code without hitting the network.
78
+ 3. **`instrumentation.py` (`traced_tool`) had 0% coverage** — happy and
79
+ exception paths now tested.
80
+ 4. **`llm.OpenAICompatClient.complete` had no test** — covered by a custom
81
+ `httpx.BaseTransport` that returns canned JSON without a network call.
82
+ 5. **`recovery.ReflexionSuggestion`** had only an indirect test from DeepDebug
83
+ examples; now has direct happy + empty tests.
84
+
85
+ ## 4. Coverage matrix (post-0.2.2)
86
+
87
+ Run `PYTHONPATH=src pytest --cov=agentdebug --cov-report=term`. The two largest
88
+ remaining gaps are deliberate:
89
+
90
+ - `agentdebug.adapters.langgraph` — exercised only when `langchain_core` is
91
+ installed. The status-test verifies graceful degradation when it isn't.
92
+ - `agentdebug.hub.backends.HuggingFaceBackend` — gated on `huggingface_hub`.
93
+ Round-tripping through real HF requires `HF_TOKEN`; covered by the local
94
+ bare-git test for the analogous push/pull flow.
95
+
96
+ ## 5. Acceptance gates for v0.3 (next minor)
97
+
98
+ Before v0.3 ships, this doc should record green checkmarks for:
99
+
100
+ - [ ] One replayable counterfactual attributor (`BinarySearchAttributor` is
101
+ the cheapest entry).
102
+ - [ ] One tool-grounded recovery strategy (`CriticRecoverer`) wired against
103
+ a `Verifier` Protocol.
104
+ - [ ] One additional framework adapter that goes through the full conformance
105
+ suite (CrewAI is the most-requested).
106
+ - [ ] Hugging Face Hub round-trip live test (gated on `HF_TOKEN`).
107
+ - [ ] Bench harness extended with one published-benchmark loader (Who&When
108
+ is the obvious first target — we already cite its method).
@@ -31,6 +31,8 @@ This `docs/` directory contains the full design specification.
31
31
  | 19 | [19_error_hub.md](./19_error_hub.md) | **Error Hub** — bundle format, Local / Git / HF backends, scrubbing |
32
32
  | 20 | [20_deep_debug.md](./20_deep_debug.md) | **DeepDebug** — iterative multi-turn analysis (plan → hypothesize → verify → refine) |
33
33
  | 21 | [21_integrations.md](./21_integrations.md) | **Claude Code Skill** + **OpenHands** microagent + EventStream bridge |
34
+ | 22 | [22_industry_track_paper_eval_plan.md](./22_industry_track_paper_eval_plan.md) | EMNLP Industry Track writing strategy + benchmark / human-study evaluation plan |
35
+ | 23 | [23_status_v0_2.md](./23_status_v0_2.md) | **Capability + test coverage status (v0.2.2)** — what's implemented, what's tested, what's specced but not built |
34
36
 
35
37
  Plus three **narrative** docs that pre-dated this engineering spec and are kept for paper-style framing:
36
38
 
@@ -45,7 +47,7 @@ Plus three **narrative** docs that pre-dated this engineering spec and are kept
45
47
  ## How to read this
46
48
 
47
49
  - **First-time reader:** start with [00_overview.md](./00_overview.md), then [02_architecture.md](./02_architecture.md), then [14_api_reference.md](./14_api_reference.md).
48
- - **Researcher / paper author:** read [01_literature_survey.md](./01_literature_survey.md), [03_taxonomy.md](./03_taxonomy.md), [07_attribution.md](./07_attribution.md), [10_taxonomy_induction.md](./10_taxonomy_induction.md).
50
+ - **Researcher / paper author:** read [01_literature_survey.md](./01_literature_survey.md), [03_taxonomy.md](./03_taxonomy.md), [07_attribution.md](./07_attribution.md), [10_taxonomy_induction.md](./10_taxonomy_induction.md), and [22_industry_track_paper_eval_plan.md](./22_industry_track_paper_eval_plan.md).
49
51
  - **Framework integrator:** read [04_trace_schema.md](./04_trace_schema.md), [05_adapters.md](./05_adapters.md), [13_class_design.md](./13_class_design.md).
50
52
  - **UI / product:** read [12_ui_dashboard.md](./12_ui_dashboard.md), [09_error_database.md](./09_error_database.md).
51
53
  - **Runtime / agent UX designer:** read [17_claude_code_design_patterns.md](./17_claude_code_design_patterns.md), then [02_architecture.md](./02_architecture.md), [08_recovery.md](./08_recovery.md), and [14_api_reference.md](./14_api_reference.md).
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "agentdebugx"
3
- version = "0.2.0"
3
+ version = "0.2.2"
4
4
  description = "Portable error analysis, tracing, and recovery framework for agentic AI systems. Import as `agentdebug`."
5
5
  authors = ["ULab @ UIUC <ulab@illinois.edu>"]
6
6
  license = "MIT"
@@ -15,6 +15,16 @@ from agentdebug.attribution import (
15
15
  Attributor,
16
16
  Blame,
17
17
  HeuristicAttributor,
18
+ StepByStepAttributor,
19
+ )
20
+ from agentdebug.detectors import (
21
+ Detector,
22
+ DetectorConfig,
23
+ RepeatedStateDetector,
24
+ RepeatedToolCallDetector,
25
+ StepCountLimitDetector,
26
+ default_detectors,
27
+ run_detectors,
18
28
  )
19
29
  from agentdebug.events import DEFAULT_BUS, BusEvent, EventBus, EventSubscription
20
30
  from agentdebug.models import (
@@ -29,6 +39,7 @@ from agentdebug.models import (
29
39
  )
30
40
  from agentdebug.recorder import AgentDebug, TraceSession
31
41
  from agentdebug.recovery import FixProposal, Recoverer, ReflexionSuggestion
42
+ from agentdebug.traceback import CascadeFrame, build_cascade, format_traceback
32
43
  from agentdebug.storage import JsonlTraceStore, SQLiteTraceStore
33
44
  from agentdebug.taxonomy import SEED_FAILURE_MODES, get_failure_mode
34
45
 
@@ -42,6 +53,17 @@ __all__ = [
42
53
  'Attributor',
43
54
  'Blame',
44
55
  'BusEvent',
56
+ 'CascadeFrame',
57
+ 'Detector',
58
+ 'DetectorConfig',
59
+ 'RepeatedStateDetector',
60
+ 'RepeatedToolCallDetector',
61
+ 'StepByStepAttributor',
62
+ 'StepCountLimitDetector',
63
+ 'build_cascade',
64
+ 'default_detectors',
65
+ 'format_traceback',
66
+ 'run_detectors',
45
67
  'DEFAULT_BUS',
46
68
  'DiagnosticReport',
47
69
  'EventBus',
@@ -62,4 +84,4 @@ __all__ = [
62
84
  'get_failure_mode',
63
85
  ]
64
86
 
65
- __version__ = '0.2.0'
87
+ __version__ = '0.2.2'