agentdebugx 0.2.1__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/PKG-INFO +1 -1
  2. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/14_api_reference.md +26 -0
  3. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/15_roadmap.md +11 -0
  4. agentdebugx-0.2.2/docs/22_industry_track_paper_eval_plan.md +235 -0
  5. agentdebugx-0.2.2/docs/23_status_v0_2.md +108 -0
  6. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/README.md +3 -1
  7. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/pyproject.toml +1 -1
  8. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/__init__.py +19 -1
  9. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/attribution.py +172 -3
  10. agentdebugx-0.2.2/src/agentdebug/detectors.py +284 -0
  11. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/hub/__init__.py +8 -1
  12. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/ui/server.py +12 -11
  13. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/LICENSE +0 -0
  14. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/README.md +0 -0
  15. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/00_overview.md +0 -0
  16. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/01_literature_survey.md +0 -0
  17. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/02_architecture.md +0 -0
  18. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/03_taxonomy.md +0 -0
  19. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/04_trace_schema.md +0 -0
  20. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/05_adapters.md +0 -0
  21. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/06_detectors.md +0 -0
  22. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/07_attribution.md +0 -0
  23. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/08_recovery.md +0 -0
  24. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/09_error_database.md +0 -0
  25. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/10_taxonomy_induction.md +0 -0
  26. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/11_multimodal.md +0 -0
  27. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/12_ui_dashboard.md +0 -0
  28. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/13_class_design.md +0 -0
  29. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/16_governance.md +0 -0
  30. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/17_claude_code_design_patterns.md +0 -0
  31. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/18_comparison_codex_vs_design.md +0 -0
  32. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/19_error_hub.md +0 -0
  33. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/20_deep_debug.md +0 -0
  34. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/21_integrations.md +0 -0
  35. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/ERROR_TAXONOMY.md +0 -0
  36. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/OPEN_SOURCE_DEVELOPMENT_PLAN.md +0 -0
  37. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/RESEARCH_SURVEY.md +0 -0
  38. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/benchmarks/v0_1_smoke.json +0 -0
  39. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/docs/benchmarks/v0_1_smoke.md +0 -0
  40. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/adapters/__init__.py +0 -0
  41. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/adapters/base.py +0 -0
  42. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/adapters/langgraph.py +0 -0
  43. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/adapters/otel.py +0 -0
  44. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/adapters/raw.py +0 -0
  45. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/analyzers.py +0 -0
  46. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/cli.py +0 -0
  47. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/deep.py +0 -0
  48. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/events.py +0 -0
  49. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/hub/backend_base.py +0 -0
  50. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/hub/backends.py +0 -0
  51. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/hub/bundle.py +0 -0
  52. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/hub/scrub.py +0 -0
  53. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/instrumentation.py +0 -0
  54. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/integrations/__init__.py +0 -0
  55. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/integrations/claude_skill.py +0 -0
  56. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/integrations/openhands.py +0 -0
  57. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/judges.py +0 -0
  58. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/llm.py +0 -0
  59. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/models.py +0 -0
  60. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/recorder.py +0 -0
  61. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/recovery.py +0 -0
  62. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/storage.py +0 -0
  63. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/taxonomy.py +0 -0
  64. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/traceback.py +0 -0
  65. {agentdebugx-0.2.1 → agentdebugx-0.2.2}/src/agentdebug/ui/__init__.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: agentdebugx
3
- Version: 0.2.1
3
+ Version: 0.2.2
4
4
  Summary: Portable error analysis, tracing, and recovery framework for agentic AI systems. Import as `agentdebug`.
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -215,6 +215,32 @@ agentdebugx taxonomy export --format yaml|json|md
215
215
  agentdebugx doctor
216
216
  ```
217
217
 
218
+ ## 11.1 Current shipped `agentdebug` CLI surface
219
+
220
+ The public design above is the long-term `agentdebugx` contract. The current
221
+ package already ships a smaller but working `agentdebug` CLI:
222
+
223
+ ```bash
224
+ agentdebug analyze <trajectory.json> [--suggest] [--traceback]
225
+ agentdebug list --store-sqlite .agentdebug/errors.sqlite
226
+ agentdebug show <trace_id> --store-sqlite .agentdebug/errors.sqlite
227
+ agentdebug judge <trajectory.json|trace_id> --attribute [--traceback]
228
+ agentdebug deep <trajectory.json|trace_id> [--traceback]
229
+ agentdebug hub push <trace_id> --to local:/tmp/hub --store-sqlite ...
230
+ agentdebug hub pull <spec> --bundle <bundle_id> --into .agentdebug/hub_pulls
231
+ agentdebug hub list <spec>
232
+ agentdebug integrations skill --target ~/.claude/skills --name agentdebug
233
+ agentdebug integrations openhands-microagent --target .openhands/microagents
234
+ agentdebug serve --store-sqlite .agentdebug/errors.sqlite --port 7777
235
+ agentdebug doctor
236
+ ```
237
+
238
+ `--traceback` renders `AgentTraceback`, a Python-traceback-style cascade view
239
+ implemented by `agentdebug.traceback.format_traceback(report, trajectory)`.
240
+ DeepDebug can provide explicit cascade edges through
241
+ `finding.metadata['cascading_from_event_id']`; heuristic and single-pass judge
242
+ reports fall back to step-index ordering.
243
+
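For programmatic use, a minimal sketch of the same cascade view; it assumes you already hold a `DiagnosticReport`/`AgentTrajectory` pair from an earlier analyze or judge run (how those objects are obtained depends on your setup):

```python
from agentdebug.traceback import format_traceback

# `report` is a DiagnosticReport and `trajectory` the matching AgentTrajectory,
# e.g. produced by a prior analyze/judge run (assumed to be in scope here).
# Findings whose metadata carries 'cascading_from_event_id' yield explicit
# cascade edges; otherwise the renderer falls back to step-index ordering.
print(format_traceback(report, trajectory))
```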
218
244
  ## 12. Configuration file
219
245
 
220
246
  `~/.agentdebugx/settings.yaml`:
@@ -28,6 +28,17 @@ Acceptance:
28
28
 
29
29
  ## v0.2 — Coverage + UI (4 weeks)
30
30
 
31
+ Already shipped in the v0.2/v0.2.1 line:
32
+
33
+ - Error Hub bundle format + Local/Git/Hugging Face backends.
34
+ - DeepDebug iterative analysis loop.
35
+ - Claude Code Skill generator and OpenHands microagent/EventStream bridge.
36
+ - `AgentTraceback` cascade renderer and CLI `--traceback` support.
37
+ - FastAPI local console with native-trace + error-trace alignment for human
38
+ review.
39
+
40
+ Remaining scope from the original v0.2 plan:
41
+
31
42
  - Adapters: CrewAI, OpenHands, smolagents, LlamaIndex, DSPy, Pydantic-AI.
32
43
  - Detectors: anomaly family (perplexity, repeated-state, topic-drift).
33
44
  - Attribution: `BinarySearchAttributor`, `CounterfactualAttributor`, `EnsembleAttributor`.
@@ -0,0 +1,235 @@
1
+ # 22 - EMNLP Industry Track Paper + Evaluation Plan
2
+
3
+ This note translates the EMNLP 2026 Industry Track call and recent ACL/EMNLP
4
+ Industry Track patterns into a concrete writing and evaluation plan for the
5
+ AgentDebugX paper.
6
+
7
+ ## 1. What Industry Track Reviewers Reward
8
+
9
+ Industry-track papers are not judged like pure method papers. The recurring
10
+ shape in strong papers is:
11
+
12
+ 1. Real deployment pain, not a benchmark-only motivation.
13
+ 2. A system that can actually be used by practitioners.
14
+ 3. Evaluation under practical constraints: cost, latency, scale, privacy,
15
+ maintainability, human workflow, and failure modes.
16
+ 4. Lessons learned that future builders can reuse.
17
+ 5. Clear limitations and responsible deployment boundaries.
18
+
19
+ Examples worth emulating:
20
+
21
+ - **Experience Report: Implementing Machine Translation in a Regulated
22
+ Industry** (EMNLP 2025 Industry): emphasizes legal/security constraints,
23
+ human-in-the-loop validation, and reviewer preferences over more than 11k
24
+ ranked translations. Source: https://aclanthology.org/2025.emnlp-industry/
25
+ - **STREAQ** (EMNLP 2025 Industry): frames the contribution as an industrial
26
+ cost-quality routing system and reports both model quality and operational
27
+ cost reduction. Source: https://aclanthology.org/2025.emnlp-industry.121/
28
+ - **RAVEN** (ACL 2025 Industry): combines industrial data, public benchmarks,
29
+ deployment pipeline, and online A/B validation. Source:
30
+ https://aclanthology.org/2025.acl-industry.3/
31
+ - **ARIA** (EMNLP 2025 Industry): uses a realistic deployed domain plus public
32
+ dynamic-knowledge tasks, and states deployment scope.
33
+ - **AutoPenBench** (EMNLP 2025 Industry): releases an open benchmark, reports
34
+ autonomous versus human-assisted agent success, and uses intermediate
35
+ milestones to show where agents struggle. Source:
36
+ https://aclanthology.org/2025.emnlp-industry.114/
37
+
38
+ For AgentDebugX, the paper should therefore be framed as:
39
+
40
+ > A deployment-oriented debugging layer for agentic NLP systems, evaluated on
41
+ > whether it makes failures observable, attributable, shareable, and easier for
42
+ > humans to fix.
43
+
44
+ The central claim should not be "we beat every attributor." A stronger claim
45
+ for the Industry Track is:
46
+
47
+ > AgentDebugX provides the missing operating layer between raw agent traces and
48
+ > actionable debugging workflows: aligned native/error traces, taxonomy-backed
49
+ > reports, Error Hub bundles, and cost-aware analysis profiles.
50
+
51
+ ## 2. Appendix Rule
52
+
53
+ The EMNLP 2026 Industry Track permits appendices after the bibliography. The
54
+ appendix does not count against the 6-page review limit, but the main paper
55
+ must be self-contained and reviewers are not required to review appendices.
56
+ Source: https://2026.emnlp.org/calls/industry_track/
57
+ So the split is:
58
+
59
+ - Main body: problem, system, screenshot, concise evaluation table, key
60
+ findings, limitations.
61
+ - Appendix: benchmark matrix, annotation schema, prompts, model settings,
62
+ study protocol, redaction examples, additional error traces.
63
+
64
+ Do not hide the core evaluation logic in the appendix. Put enough in the main
65
+ body that a reviewer can assess technical quality without reading extra pages.
66
+
67
+ ## 3. Main-Body Narrative
68
+
69
+ Recommended six-page spine:
70
+
71
+ 1. **Introduction**: agent systems fail through cascades; raw traces are not
72
+ enough; teams need who/when/why/fix.
73
+ 2. **Deployment Requirements**: low-friction instrumentation, privacy,
74
+ portable schema, cost-aware analysis, human review.
75
+ 3. **System**: recorder + schema + taxonomy + detectors/attributors +
76
+ Error Hub + UI.
77
+ 4. **Use Case Figure**: paired native trace and AgentDebugX error trace.
78
+ 5. **Evaluation**: benchmark coverage, diagnostic accuracy, human utility,
79
+ operational overhead.
80
+ 6. **Lessons/Limitations**: where it works, where it does not, safety.
81
+
82
+ ## 4. Evaluation Questions
83
+
84
+ ### Q1. Coverage
85
+
86
+ Can AgentDebugX ingest different agent classes without bespoke debugging code?
87
+
88
+ Benchmarks:
89
+
90
+ - AgentErrorBench: failure-labeled traces over ALFWorld, GAIA, WebShop.
91
+ - MAST and Who&When: multi-agent attribution and failure taxonomy labels.
92
+ - AgentRx: 115 failed trajectories with critical-step labels.
93
+ - WebShop/WebArena: web navigation and tool-use.
94
+ - tau-bench: retail/airline tool-agent-user interaction.
95
+ - SWE-bench Lite/Verified: coding agents with executable tests.
96
+ - OSWorld: multimodal desktop/GUI agents.
97
+
98
+ Metrics: conversion success rate, required adapter LOC, event coverage, artifact
99
+ coverage, and schema loss notes.
100
+
101
+ ### Q2. Diagnostic Accuracy
102
+
103
+ Can AgentDebugX classify and localize failures?
104
+
105
+ Labels:
106
+
107
+ - failure family
108
+ - failure mode
109
+ - root event ID
110
+ - root agent
111
+ - root step
112
+ - cascade edges
113
+ - evidence spans
114
+ - accepted repair
115
+
116
+ Metrics:
117
+
118
+ - family macro-F1
119
+ - mode macro-F1
120
+ - responsible-agent accuracy
121
+ - root-step exact match
122
+ - root-step +/- 1 match (see the sketch after this list)
123
+ - cascade-edge F1
124
+ - false-positive rate on successful traces
125
+ - calibration: confidence vs correctness
126
+
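A minimal sketch of the two root-step localization metrics, assuming per-trace predicted and gold root-step indices; the names below are illustrative, not a fixed evaluation API:

```python
from typing import List, Optional

def root_step_match(pred: List[Optional[int]], gold: List[int], tolerance: int = 0) -> float:
    """Fraction of traces whose predicted root step is within `tolerance` of gold."""
    hits = sum(
        1 for p, g in zip(pred, gold)
        if p is not None and abs(p - g) <= tolerance
    )
    return hits / len(gold) if gold else 0.0

# exact match:       root_step_match(pred, gold, tolerance=0)
# +/- 1 step match:  root_step_match(pred, gold, tolerance=1)
```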
127
+ Baselines:
128
+
129
+ - rule analyzer
130
+ - single-pass LLM judge
131
+ - All-at-Once attribution
132
+ - Step-by-Step attribution when implemented
133
+ - DeepDebug verify/refine loop
134
+ - benchmark-native labels or published baselines where available
135
+
136
+ ### Q3. Human Utility
137
+
138
+ Does the paired trace view reduce debugging effort?
139
+
140
+ Study design:
141
+
142
+ - 12-24 developers.
143
+ - Within-subject comparison: raw framework trace/logs vs AgentDebugX report.
144
+ - 24-48 total debugging sessions.
145
+ - Counterbalance task order and UI order.
146
+
147
+ Metrics:
148
+
149
+ - time to first plausible root cause
150
+ - time to accepted repair
151
+ - correctness against adjudicated labels
152
+ - number of trace events inspected
153
+ - confidence and workload rating
154
+ - free-text feedback on missing evidence
155
+
156
+ Fallback if recruiting slips:
157
+
158
+ - 3-5 expert agent builders review 30 traces.
159
+ - Ask them to choose between raw trace and AgentDebugX report, rate usefulness,
160
+ and mark incorrect/misleading diagnoses.
161
+
162
+ ### Q4. Operational Viability
163
+
164
+ Can teams run this in real workflows?
165
+
166
+ Metrics:
167
+
168
+ - analyzer latency by profile: rule, judge, DeepDebug
169
+ - token cost by trace length
170
+ - local storage overhead
171
+ - UI load time for 100, 1k, 10k events
172
+ - scrubber redaction hit rate
173
+ - scrubber false positives on benign strings
174
+ - Error Hub bundle size and push/pull time
175
+
176
+ ## 5. Data Scale
177
+
178
+ Minimum credible submission target:
179
+
180
+ - 500 failed trajectories.
181
+ - 100 successful trajectories for false-positive calibration.
182
+ - At least 30 examples per high-level family where source benchmarks permit.
183
+ - Two annotators per newly labeled trace plus adjudication.
184
+ - DeepDebug on a stratified hard subset of 100 traces.
185
+
186
+ Stronger target:
187
+
188
+ - 1,000 failed trajectories.
189
+ - 200 successful trajectories.
190
+ - DeepDebug on every trace where rule and single-pass judge disagree.
191
+ - 50-100 private pilot traces, scrubbed and reported only in aggregate.
192
+
193
+ ## 6. API and Infra Needed
194
+
195
+ Model APIs:
196
+
197
+ - OpenAI-compatible endpoint as the default abstraction.
198
+ - OpenAI, Gemini, Anthropic-through-proxy/LiteLLM, and local vLLM/Ollama where
199
+ feasible.
200
+
201
+ Benchmark APIs:
202
+
203
+ - WebShop/ALFWorld/GAIA loaders for AgentErrorBench.
204
+ - MAST/Who&When/AgentRx importers preserving existing labels.
205
+ - tau-bench user/tool simulator wrapper.
206
+ - WebArena browser harness with DOM/text/action capture.
207
+ - SWE-bench Docker harness with shell, patch, and test-output capture.
208
+ - OSWorld capture path for screenshot, accessibility tree, click/action, and
209
+ verifier result.
210
+
211
+ Storage/export:
212
+
213
+ - SQLite for local experiments.
214
+ - Error Hub bundles for sharing.
215
+ - Parquet manifest roll-up for large result analysis (see the sketch just below).
216
+
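A minimal sketch of that roll-up, assuming one JSONL file of per-trace results per run and a pandas + pyarrow environment; paths and field names are illustrative:

```python
import glob
import json

import pandas as pd  # to_parquet needs pyarrow or fastparquet installed

rows = []
for path in glob.glob("results/*.jsonl"):
    with open(path, encoding="utf-8") as fh:
        # one JSON object per line, e.g. {"trace_id": ..., "method": ..., "root_step": ...}
        rows.extend(json.loads(line) for line in fh if line.strip())

pd.DataFrame(rows).to_parquet("results/manifest.parquet", index=False)
```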
217
+ ## 7. What Should Go in the Appendix
218
+
219
+ - Full benchmark matrix.
220
+ - Exact label schema and examples.
221
+ - Prompts for LLM judge, attributor, and DeepDebug.
222
+ - Model settings and token budgets.
223
+ - Human study instructions and consent/safety notes.
224
+ - Redaction examples.
225
+ - Two or three full trace/report examples.
226
+ - Failure cases where AgentDebugX is wrong.
227
+
228
+ ## 8. Immediate TODO Before Submission
229
+
230
+ 1. Convert at least two public benchmark sources into AgentTrajectory.
231
+ 2. Produce a first 100-trace labeled set.
232
+ 3. Add an evaluation runner that outputs a single CSV/JSONL.
233
+ 4. Run rule, judge, All-at-Once, and DeepDebug on the same split.
234
+ 5. Add a small human/expert review with raw trace vs paired trace view.
235
+ 6. Trim main body to 6 pages while keeping appendix rich.
@@ -0,0 +1,108 @@
1
+ # 23 — Capability + Test Coverage Status (v0.2.2)
2
+
3
+ A live audit of what's implemented, what's tested, and what's specced but
4
+ not yet built. Pair this with [docs/15_roadmap.md](./15_roadmap.md), which is
5
+ the forward-looking plan; this doc is the rear-view mirror.
6
+
7
+ ## 1. What ships in v0.2.2 (live on PyPI)
8
+
9
+ | Layer | Module | Status | Tests |
10
+ |---|---|---|---|
11
+ | Trace IR | `agentdebug.models` | ✅ stable | round-trip + enum tests |
12
+ | Storage | `agentdebug.storage` (JSONL + SQLite) | ✅ stable | round-trip + ctx-mgr |
13
+ | Recorder | `agentdebug.recorder` (`AgentDebug`, `TraceSession`) | ✅ stable | record + analyze flow |
14
+ | Rule analyzer | `agentdebug.analyzers.HeuristicAnalyzer` | ✅ stable | match + suggest |
15
+ | Taxonomy | `agentdebug.taxonomy` (19 seed modes) | ✅ stable | get_mode + list |
16
+ | Function instrumentation | `agentdebug.instrumentation.traced_tool` | ✅ stable | happy + raise |
17
+ | Event bus | `agentdebug.events.EventBus` | ✅ stable | fan-out + auto-detach |
18
+ | LLM client | `agentdebug.llm.OpenAICompatClient` | ✅ stable | mocked httpx + env |
19
+ | LLM judge | `agentdebug.judges.LLMJudgeAnalyzer` | ✅ stable | scripted-LLM happy + silent |
20
+ | Attribution | `agentdebug.attribution.HeuristicAttributor` | ✅ stable | first-finding + tiebreak |
21
+ | Attribution | `agentdebug.attribution.AllAtOnceAttributor` | ✅ stable | mocked LLM + fallback |
22
+ | Attribution | `agentdebug.attribution.StepByStepAttributor` | ✅ **new 0.2.2** | scripted-LLM + fallback |
23
+ | Recovery | `agentdebug.recovery.ReflexionSuggestion` | ✅ stable | per-finding + empty |
24
+ | DeepDebug | `agentdebug.deep.DeepDebugAnalyzer` | ✅ stable | full loop + silent LLM |
25
+ | Cascade view | `agentdebug.traceback.format_traceback` | ✅ stable | cascade + step-order + ANSI + empty |
26
+ | Detectors | `agentdebug.detectors.RepeatedToolCall / RepeatedState / StepCountLimit` | ✅ **new 0.2.2** | threshold + window + budget |
27
+ | Hub bundle | `agentdebug.hub.Bundle / pack_bundle / unpack_bundle` | ✅ stable | round-trip |
28
+ | Hub scrubber | `agentdebug.hub.Scrubber` | ✅ stable | 12 redactions + idempotent |
29
+ | Hub backends | `LocalHubBackend`, `GitHubBackend`, `HuggingFaceBackend` | ✅ stable | local-bare-git + local |
30
+ | Adapters | `agentdebug.adapters.raw` (`trace_loop`, `mark_step`) | ✅ stable | end-to-end + ctxvar |
31
+ | Adapters | `agentdebug.adapters.langgraph.LangChainCallbackAdapter` | ✅ stable | gracefully degrades w/o dep |
32
+ | Adapters | `agentdebug.adapters.otel.OTelExportAdapter` | ✅ stable | branch test |
33
+ | Integrations | `agentdebug.integrations.claude_skill` | ✅ stable | skill-bundle write |
34
+ | Integrations | `agentdebug.integrations.openhands` (microagent + bridge) | ⚠️ microagent stable; bridge needs live OpenHands | microagent YAML test |
35
+ | CLI | `agentdebug.cli` (`analyze \| judge \| deep \| list \| show \| hub \| integrations \| serve \| doctor`) | ✅ stable | 12 subcommand smoke tests |
36
+ | Local UI | `agentdebug.ui` (FastAPI + vanilla JS console) | ✅ stable | endpoint round-trip |
37
+
38
+ **Test counts:** 60+ unit tests + 1 live-LLM smoke test, `mypy --strict` clean
39
+ across 32 source files.
40
+
41
+ ## 2. Designed in docs, not yet implemented
42
+
43
+ | Doc | Component | Why deferred | Realistic ship |
44
+ |---|---|---|---|
45
+ | [06_detectors.md](./06_detectors.md) | `trajectory_perplexity` (TrajAD) | needs token-level LM perplexity API or embedding model + baseline calibration | v0.3 |
46
+ | [06_detectors.md](./06_detectors.md) | `topic_drift` (embedding cosine) | needs embedding client; consider reusing `OpenAICompatClient` `/embeddings`; see the sketch after this table | v0.3 |
47
+ | [06_detectors.md](./06_detectors.md) | LTL spec monitors | requires user-supplied spec or LLM-synthesized monitors; gated on RV research | v1.2 |
48
+ | [07_attribution.md](./07_attribution.md) | `BinarySearchAttributor` (ddmin) | requires replayable environment; few frameworks expose it | v0.3 |
49
+ | [07_attribution.md](./07_attribution.md) | `CounterfactualAttributor` | requires re-rolling agent actions; same replay constraint | v0.3 |
50
+ | [07_attribution.md](./07_attribution.md) | `SBFLAttributor` (Tarantula/Ochiai) | needs corpus of passing + failing traces of same task; gated on Hub adoption | v0.4 |
51
+ | [07_attribution.md](./07_attribution.md) | `DeltaDebugAttributor` (Zeller) | same replay constraint | v0.3 |
52
+ | [07_attribution.md](./07_attribution.md) | `EnsembleAttributor` | trivial once 2+ heavy backends ship; awaits BinarySearch/Counterfactual | v0.3 |
53
+ | [08_recovery.md](./08_recovery.md) | `SelfRefineLoop` | small but needs a generator-critic-refiner orchestration | v0.3 |
54
+ | [08_recovery.md](./08_recovery.md) | `CriticRecoverer` | needs a verifier registry (search, code-exec, type-check) | v0.3 |
55
+ | [08_recovery.md](./08_recovery.md) | `AutoManualRules` | needs persistent project manual + injection into next-run prompts | v0.3 |
56
+ | [08_recovery.md](./08_recovery.md) | `LangGraphRewind` | depends on LangGraph checkpointer; ships when we have a real LangGraph user | v0.3 |
57
+ | [08_recovery.md](./08_recovery.md) | `SagaRollback` | needs compensation registry on tool definitions; new schema | v0.3 |
58
+ | [08_recovery.md](./08_recovery.md) | `MCTSBranchExploration` (LATS) | heavy; v2 feature | v2.0 |
59
+ | [09_error_database.md](./09_error_database.md) | DuckDB analytical + Parquet archive | optional; Hub bundles already give per-project corpus | v0.3 |
60
+ | [09_error_database.md](./09_error_database.md) | Vector similarity search | needs embedding model + index choice | v0.3 |
61
+ | [10_taxonomy_induction.md](./10_taxonomy_induction.md) | TnT-LLM + BERTopic pipeline | needs ≥ 1k labeled traces to be useful | v0.4 |
62
+ | [11_multimodal.md](./11_multimodal.md) | Screenshot/DOM capture, VLM judge | gated on multimodal user (Claude Computer Use / OpenAI CUA / OpenHands browser) | v1.1 |
63
+ | [12_ui_dashboard.md](./12_ui_dashboard.md) | TUI (Textual) | low priority; CLI + web UI cover the use cases | v0.4 |
64
+ | [12_ui_dashboard.md](./12_ui_dashboard.md) | VSCode extension | needs TS extension scaffolding | v1.0 |
65
+ | [05_adapters.md](./05_adapters.md) | CrewAI, OpenAI Agents SDK, AutoGen, smolagents, LlamaIndex, DSPy, Pydantic-AI | each is ~150 LOC + conformance test; ship as users land | rolling |
66
+
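For the deferred `topic_drift` detector, a minimal sketch of the intended signal: cosine distance between consecutive step embeddings, with `embed` standing in for whichever embedding client is eventually wired up (purely illustrative, not the planned interface):

```python
import math
from typing import Callable, List, Sequence

def topic_drift_scores(
    steps: List[str],
    embed: Callable[[str], Sequence[float]],
) -> List[float]:
    """Cosine distance between each step and its predecessor; high values suggest drift."""
    def cosine(a: Sequence[float], b: Sequence[float]) -> float:
        dot = sum(x * y for x, y in zip(a, b))
        na = math.sqrt(sum(x * x for x in a))
        nb = math.sqrt(sum(x * x for x in b))
        return dot / (na * nb) if na and nb else 0.0

    vecs = [embed(s) for s in steps]
    return [1.0 - cosine(vecs[i - 1], vecs[i]) for i in range(1, len(vecs))]
```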
67
+ ## 3. Implementation gaps surfaced by the audit
68
+
69
+ The audit found one real bug and a handful of test gaps:
70
+
71
+ 1. **`agentdebug.hub.build_manifest` was used by the CLI but not re-exported** —
72
+ would have surfaced as `ImportError` for any user calling
73
+ `agentdebug hub push`. Fixed in 0.2.2 (`hub/__init__.py`) and locked in by a
74
+ CLI smoke test.
75
+ 2. **`cli.py` had 0% coverage** — every subcommand now has a smoke test that
76
+ exercises the argparse path; the LLM-required commands assert the
77
+ "missing credentials" exit code without hitting the network.
78
+ 3. **`instrumentation.py` (`traced_tool`) had 0% coverage** — happy and
79
+ exception paths now tested.
80
+ 4. **`llm.OpenAICompatClient.complete` had no test** — covered by a custom
81
+ `httpx.BaseTransport` that returns canned JSON without a network call (see the sketch after this list).
82
+ 5. **`recovery.ReflexionSuggestion`** had only an indirect test from DeepDebug
83
+ examples; now has direct happy + empty tests.
84
+
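A minimal sketch of that canned-transport pattern; the response body below assumes an OpenAI-style chat-completions shape and is illustrative rather than the exact fixture used in the test suite:

```python
import httpx

class CannedTransport(httpx.BaseTransport):
    """Serve a fixed chat-completions-style body for every request; no network I/O."""

    def handle_request(self, request: httpx.Request) -> httpx.Response:
        body = {"choices": [{"message": {"role": "assistant", "content": "{}"}}]}
        return httpx.Response(200, json=body, request=request)

# Inject it wherever the code under test accepts an httpx.Client:
client = httpx.Client(transport=CannedTransport())
```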
85
+ ## 4. Coverage matrix (post-0.2.2)
86
+
87
+ Run `PYTHONPATH=src pytest --cov=agentdebug --cov-report=term`. The two largest
88
+ remaining gaps are deliberate:
89
+
90
+ - `agentdebug.adapters.langgraph` — exercised only when `langchain_core` is
91
+ installed. The status-test verifies graceful degradation when it isn't.
92
+ - `agentdebug.hub.backends.HuggingFaceBackend` — gated on `huggingface_hub`.
93
+ Round-tripping through real HF requires `HF_TOKEN`; covered by the local
94
+ bare-git test for the analogous push/pull flow.
95
+
96
+ ## 5. Acceptance gates for v0.3 (next minor)
97
+
98
+ Before v0.3 ships, this doc should record green checkmarks for:
99
+
100
+ - [ ] One replayable counterfactual attributor (`BinarySearchAttributor` is
101
+ the cheapest entry).
102
+ - [ ] One tool-grounded recovery strategy (`CriticRecoverer`) wired against
103
+ a `Verifier` Protocol.
104
+ - [ ] One additional framework adapter that goes through the full conformance
105
+ suite (CrewAI is the most-requested).
106
+ - [ ] HuggingFace Hub round-trip live test (gated on `HF_TOKEN`).
107
+ - [ ] Bench harness extended with one published-benchmark loader (Who&When
108
+ is the obvious first target — we already cite its method).
@@ -31,6 +31,8 @@ This `docs/` directory contains the full design specification.
31
31
  | 19 | [19_error_hub.md](./19_error_hub.md) | **Error Hub** — bundle format, Local / Git / HF backends, scrubbing |
32
32
  | 20 | [20_deep_debug.md](./20_deep_debug.md) | **DeepDebug** — iterative multi-turn analysis (plan → hypothesize → verify → refine) |
33
33
  | 21 | [21_integrations.md](./21_integrations.md) | **Claude Code Skill** + **OpenHands** microagent + EventStream bridge |
34
+ | 22 | [22_industry_track_paper_eval_plan.md](./22_industry_track_paper_eval_plan.md) | EMNLP Industry Track writing strategy + benchmark / human-study evaluation plan |
35
+ | 23 | [23_status_v0_2.md](./23_status_v0_2.md) | **Capability + test coverage status (v0.2.2)** — what's implemented, what's tested, what's specced but not built |
34
36
 
35
37
  Plus three **narrative** docs that pre-dated this engineering spec and are kept for paper-style framing:
36
38
 
@@ -45,7 +47,7 @@ Plus three **narrative** docs that pre-dated this engineering spec and are kept
45
47
  ## How to read this
46
48
 
47
49
  - **First-time reader:** start with [00_overview.md](./00_overview.md), then [02_architecture.md](./02_architecture.md), then [14_api_reference.md](./14_api_reference.md).
48
- - **Researcher / paper author:** read [01_literature_survey.md](./01_literature_survey.md), [03_taxonomy.md](./03_taxonomy.md), [07_attribution.md](./07_attribution.md), [10_taxonomy_induction.md](./10_taxonomy_induction.md).
50
+ - **Researcher / paper author:** read [01_literature_survey.md](./01_literature_survey.md), [03_taxonomy.md](./03_taxonomy.md), [07_attribution.md](./07_attribution.md), [10_taxonomy_induction.md](./10_taxonomy_induction.md), and [22_industry_track_paper_eval_plan.md](./22_industry_track_paper_eval_plan.md).
49
51
  - **Framework integrator:** read [04_trace_schema.md](./04_trace_schema.md), [05_adapters.md](./05_adapters.md), [13_class_design.md](./13_class_design.md).
50
52
  - **UI / product:** read [12_ui_dashboard.md](./12_ui_dashboard.md), [09_error_database.md](./09_error_database.md).
51
53
  - **Runtime / agent UX designer:** read [17_claude_code_design_patterns.md](./17_claude_code_design_patterns.md), then [02_architecture.md](./02_architecture.md), [08_recovery.md](./08_recovery.md), and [14_api_reference.md](./14_api_reference.md).
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "agentdebugx"
3
- version = "0.2.1"
3
+ version = "0.2.2"
4
4
  description = "Portable error analysis, tracing, and recovery framework for agentic AI systems. Import as `agentdebug`."
5
5
  authors = ["ULab @ UIUC <ulab@illinois.edu>"]
6
6
  license = "MIT"
@@ -15,6 +15,16 @@ from agentdebug.attribution import (
15
15
  Attributor,
16
16
  Blame,
17
17
  HeuristicAttributor,
18
+ StepByStepAttributor,
19
+ )
20
+ from agentdebug.detectors import (
21
+ Detector,
22
+ DetectorConfig,
23
+ RepeatedStateDetector,
24
+ RepeatedToolCallDetector,
25
+ StepCountLimitDetector,
26
+ default_detectors,
27
+ run_detectors,
18
28
  )
19
29
  from agentdebug.events import DEFAULT_BUS, BusEvent, EventBus, EventSubscription
20
30
  from agentdebug.models import (
@@ -44,8 +54,16 @@ __all__ = [
44
54
  'Blame',
45
55
  'BusEvent',
46
56
  'CascadeFrame',
57
+ 'Detector',
58
+ 'DetectorConfig',
59
+ 'RepeatedStateDetector',
60
+ 'RepeatedToolCallDetector',
61
+ 'StepByStepAttributor',
62
+ 'StepCountLimitDetector',
47
63
  'build_cascade',
64
+ 'default_detectors',
48
65
  'format_traceback',
66
+ 'run_detectors',
49
67
  'DEFAULT_BUS',
50
68
  'DiagnosticReport',
51
69
  'EventBus',
@@ -66,4 +84,4 @@ __all__ = [
66
84
  'get_failure_mode',
67
85
  ]
68
86
 
69
- __version__ = '0.2.1'
87
+ __version__ = '0.2.2'
@@ -19,10 +19,10 @@ from __future__ import annotations
19
19
 
20
20
  import logging
21
21
  from dataclasses import dataclass, field
22
- from typing import Any, Dict, List, Optional, Protocol
22
+ from typing import Any, Dict, List, Optional, Protocol, cast
23
23
 
24
24
  from agentdebug.llm import LLMClient, extract_json_block
25
- from agentdebug.models import AgentTrajectory, FailureFinding, new_id
25
+ from agentdebug.models import AgentEvent, AgentTrajectory, FailureFinding, new_id
26
26
 
27
27
  LOG = logging.getLogger('agentdebug.attribution')
28
28
 
@@ -227,4 +227,173 @@ class AllAtOnceAttributor:
227
227
  return [str(value)]
228
228
 
229
229
 
230
- __all__ = ['Attributor', 'Blame', 'AttributionResult', 'HeuristicAttributor', 'AllAtOnceAttributor']
230
+ _STEP_SYSTEM_PROMPT = """You are AgentDebugX-Attributor, scanning a failed
231
+ agent trajectory one step at a time (the Who&When "Step-by-Step" method,
232
+ arXiv:2505.00212).
233
+
234
+ You will be given the goal, framework, the prior judge findings, AND a
235
+ SINGLE step from the trajectory plus a short window of preceding steps for
236
+ context. Decide whether THIS step is the decisive failure step.
237
+
238
+ Respond ONLY with a JSON object matching this schema (no prose, no markdown):
239
+
240
+ {
241
+ "is_failure_step": true | false,
242
+ "confidence": <float in [0,1]>,
243
+ "rationale": "<one sentence>",
244
+ "evidence": ["<short quoted evidence>", ...]
245
+ }
246
+
247
+ Be CONSERVATIVE: only return true when the trajectory evidence on this step
248
+ specifically caused the cascading failure.
249
+ """
250
+
251
+
252
+ class StepByStepAttributor:
253
+ """LLM-based attributor mirroring Who&When's Step-by-Step method.
254
+
255
+ Walks the trajectory in order, asking the LLM about each step. Returns
256
+ every step that answered ``is_failure_step=true`` as a Blame hypothesis,
257
+ sorted by step index. Costs O(N) LLM calls; pair with ``max_steps`` to
258
+ bound the budget for long trajectories.
259
+ """
260
+
261
+ id = 'step_by_step'
262
+
263
+ def __init__(
264
+ self,
265
+ llm: LLMClient,
266
+ *,
267
+ fallback: Optional[Attributor] = None,
268
+ max_steps: int = 30,
269
+ context_window: int = 3,
270
+ max_tokens: int = 1024,
271
+ ) -> None:
272
+ self.llm = llm
273
+ self.fallback: Attributor = fallback or HeuristicAttributor()
274
+ self.max_steps = max_steps
275
+ self.context_window = context_window
276
+ self.max_tokens = max_tokens
277
+
278
+ def attribute(
279
+ self,
280
+ trajectory: AgentTrajectory,
281
+ findings: List[FailureFinding],
282
+ ) -> AttributionResult:
283
+ events = list(trajectory.events)
284
+ if not events:
285
+ return self.fallback.attribute(trajectory, findings)
286
+ # Scan only the suffix of size max_steps so long traces stay affordable.
287
+ budget = min(self.max_steps, len(events))
288
+ scanned = events[-budget:]
289
+ hypotheses: List[Blame] = []
290
+ for idx, evt in enumerate(scanned):
291
+ absolute_index = len(events) - budget + idx
292
+ ctx_start = max(0, absolute_index - self.context_window)
293
+ context = events[ctx_start:absolute_index]
294
+ verdict = self._classify_step(
295
+ trajectory, findings, evt, context=context
296
+ )
297
+ if verdict is None or not verdict.get('is_failure_step'):
298
+ continue
299
+ hypotheses.append(Blame(
300
+ span_id=evt.event_id,
301
+ step_index=evt.step_index,
302
+ agent_name=evt.agent_name,
303
+ confidence=self._coerce_float(verdict.get('confidence'), 0.5),
304
+ rationale=str(verdict.get('rationale') or ''),
305
+ evidence=self._coerce_str_list(verdict.get('evidence')),
306
+ sources=[self.id],
307
+ ))
308
+ if not hypotheses:
309
+ return AttributionResult(
310
+ method=self.id,
311
+ hypotheses=[],
312
+ raw={'scanned_steps': len(scanned)},
313
+ )
314
+ hypotheses.sort(
315
+ key=lambda h: (
316
+ h.step_index is None,
317
+ h.step_index if h.step_index is not None else 10**9,
318
+ )
319
+ )
320
+ return AttributionResult(
321
+ method=self.id,
322
+ hypotheses=hypotheses,
323
+ raw={'scanned_steps': len(scanned)},
324
+ )
325
+
326
+ def _classify_step(
327
+ self,
328
+ trajectory: AgentTrajectory,
329
+ findings: List[FailureFinding],
330
+ event: 'AgentEvent',
331
+ *,
332
+ context: List['AgentEvent'],
333
+ ) -> Optional[Dict[str, Any]]:
334
+ findings_doc = '\n'.join(self._render_finding(f) for f in findings[:10]) \
335
+ or '(no judge findings yet)'
336
+ context_doc = '\n'.join(
337
+ f'context event_id={e.event_id} step={e.step_index} agent={e.agent_name}'
338
+ f' type={getattr(e.event_type, "value", e.event_type)}'
339
+ f' output={str(e.output)[:120]} error={str(e.error)[:120]}'
340
+ for e in context
341
+ ) or '(no preceding context)'
342
+ prompt = (
343
+ f'GOAL: {trajectory.goal!r}\n'
344
+ f'FRAMEWORK: {trajectory.framework!r}\n\n'
345
+ f'JUDGE FINDINGS:\n{findings_doc}\n\n'
346
+ f'PRECEDING CONTEXT:\n{context_doc}\n\n'
347
+ f'CANDIDATE STEP:\n'
348
+ f' event_id={event.event_id}\n'
349
+ f' step={event.step_index} agent={event.agent_name} '
350
+ f'module={event.module}\n'
351
+ f' type={getattr(event.event_type, "value", event.event_type)}\n'
352
+ f' input={str(event.input)[:300]}\n'
353
+ f' output={str(event.output)[:300]}\n'
354
+ f' error={str(event.error)[:300]}\n'
355
+ )
356
+ try:
357
+ result = self.llm.complete(
358
+ messages=[
359
+ {'role': 'system', 'content': _STEP_SYSTEM_PROMPT},
360
+ {'role': 'user', 'content': prompt},
361
+ ],
362
+ max_tokens=self.max_tokens,
363
+ )
364
+ except Exception as exc: # pragma: no cover
365
+ LOG.warning('step_by_step LLM call failed at step %s: %s',
366
+ event.step_index, exc)
367
+ return None
368
+ parsed = extract_json_block(result.text)
369
+ if parsed is None:
370
+ return None
371
+ return cast(Dict[str, Any], parsed)
372
+
373
+ def _render_finding(self, finding: FailureFinding) -> str:
374
+ return (
375
+ f'- mode={finding.failure_mode.mode_id} '
376
+ f'agent={finding.agent_name} step={finding.step_index} '
377
+ f'confidence={finding.confidence:.2f}'
378
+ )
379
+
380
+ @staticmethod
381
+ def _coerce_float(value: Any, default: float) -> float:
382
+ try:
383
+ return float(value)
384
+ except (TypeError, ValueError):
385
+ return default
386
+
387
+ @staticmethod
388
+ def _coerce_str_list(value: Any) -> List[str]:
389
+ if value is None:
390
+ return []
391
+ if isinstance(value, list):
392
+ return [str(v) for v in value]
393
+ return [str(value)]
394
+
395
+
396
+ __all__ = [
397
+ 'Attributor', 'Blame', 'AttributionResult',
398
+ 'HeuristicAttributor', 'AllAtOnceAttributor', 'StepByStepAttributor',
399
+ ]
@@ -0,0 +1,284 @@
1
+ """Lightweight rule + anomaly detectors that don't need an LLM.
2
+
3
+ These were specified in `docs/06_detectors.md` but not implemented in the
4
+ initial v0.1 ship. The :class:`HeuristicAnalyzer` covers per-event string
5
+ matching; the detectors here cover *cross-event* signals (loops, repeated
6
+ tool calls, no-op streaks) which the heuristic analyzer cannot see in a
7
+ single pass.
8
+
9
+ Each detector implements :class:`Detector` and returns a list of
10
+ :class:`FailureFinding`. They compose with the existing analyzer pipeline
11
+ via :func:`run_detectors`.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import logging
17
+ from dataclasses import dataclass
18
+ from typing import List, Optional, Protocol
19
+
20
+ from agentdebug.models import (
21
+ AgentEvent,
22
+ AgentTrajectory,
23
+ EventType,
24
+ FailureFinding,
25
+ FailureMode,
26
+ new_id,
27
+ )
28
+ from agentdebug.taxonomy import SEED_FAILURE_MODES
29
+
30
+ LOG = logging.getLogger('agentdebug.detectors')
31
+
32
+
33
+ @dataclass
34
+ class DetectorConfig:
35
+ """Tunables for the rule + anomaly detectors."""
36
+
37
+ repeated_tool_call_threshold: int = 3
38
+ repeated_state_window: int = 4
39
+ repeated_state_threshold: int = 3
40
+ step_count_limit: int = 50
41
+
42
+
43
+ class Detector(Protocol):
44
+ id: str
45
+
46
+ def detect(self, trajectory: AgentTrajectory) -> List[FailureFinding]:
47
+ ...
48
+
49
+
50
+ # ---------------------------------------------------------------------------
51
+ # RepeatedToolCallDetector
52
+ # ---------------------------------------------------------------------------
53
+
54
+ class RepeatedToolCallDetector:
55
+ """Flag a tool that's called with identical args >= threshold times.
56
+
57
+ Matches ``FM-1.3 step_repetition`` / ``planning.inefficient_plan``.
58
+ """
59
+
60
+ id = 'repeated_tool_call'
61
+
62
+ def __init__(self, *, threshold: int = 3) -> None:
63
+ self.threshold = threshold
64
+
65
+ def detect(self, trajectory: AgentTrajectory) -> List[FailureFinding]:
66
+ counts: dict[tuple[str, str], List[AgentEvent]] = {}
67
+ for evt in trajectory.events:
68
+ if evt.event_type != EventType.TOOL_CALL:
69
+ continue
70
+ key = (evt.agent_name, _normalize(evt.input))
71
+ counts.setdefault(key, []).append(evt)
72
+ findings: List[FailureFinding] = []
73
+ mode = SEED_FAILURE_MODES['planning.inefficient_plan']
74
+ for (agent, signature), events in counts.items():
75
+ if len(events) < self.threshold:
76
+ continue
77
+ target = events[-1]
78
+ findings.append(
79
+ FailureFinding(
80
+ finding_id=new_id('finding'),
81
+ failure_mode=mode,
82
+ event_id=target.event_id,
83
+ agent_name=target.agent_name,
84
+ step_index=target.step_index,
85
+ confidence=min(0.5 + 0.1 * len(events), 0.95),
86
+ evidence=[
87
+ f'{len(events)} identical TOOL_CALL events with '
88
+ f'signature={_truncate(signature, 80)} on agent={agent}',
89
+ ],
90
+ suggestion=_suggestion(mode),
91
+ metadata={'source': self.id, 'detected_count': len(events)},
92
+ )
93
+ )
94
+ return findings
95
+
96
+
97
+ # ---------------------------------------------------------------------------
98
+ # RepeatedStateDetector
99
+ # ---------------------------------------------------------------------------
100
+
101
+ class RepeatedStateDetector:
102
+ """Flag a sliding window where the agent's outputs/observations don't change.
103
+
104
+ Catches no-progress loops the per-event matcher cannot see (e.g., agent
105
+ keeps responding "checking..." or producing the same plan over several
106
+ steps). Matches ``planning.inefficient_plan``.
107
+ """
108
+
109
+ id = 'repeated_state'
110
+
111
+ def __init__(self, *, window: int = 4, threshold: int = 3) -> None:
112
+ if threshold > window:
113
+ raise ValueError('threshold must be <= window')
114
+ self.window = window
115
+ self.threshold = threshold
116
+
117
+ def detect(self, trajectory: AgentTrajectory) -> List[FailureFinding]:
118
+ events = [
119
+ e for e in trajectory.events
120
+ if e.event_type in {
121
+ EventType.OBSERVATION,
122
+ EventType.AGENT_STEP,
123
+ EventType.PLAN,
124
+ EventType.TOOL_RESULT,
125
+ }
126
+ ]
127
+ findings: List[FailureFinding] = []
128
+ if len(events) < self.window:
129
+ return findings
130
+ mode = SEED_FAILURE_MODES['planning.inefficient_plan']
131
+ flagged_event_ids: set[str] = set()
132
+ for i in range(len(events) - self.window + 1):
133
+ window = events[i : i + self.window]
134
+ sigs = [_normalize(_state_signature(e)) for e in window]
135
+ most_common, count = _mode_count(sigs)
136
+ if count < self.threshold:
137
+ continue
138
+ target = window[-1]
139
+ if target.event_id in flagged_event_ids:
140
+ continue
141
+ flagged_event_ids.add(target.event_id)
142
+ findings.append(
143
+ FailureFinding(
144
+ finding_id=new_id('finding'),
145
+ failure_mode=mode,
146
+ event_id=target.event_id,
147
+ agent_name=target.agent_name,
148
+ step_index=target.step_index,
149
+ confidence=min(0.5 + 0.1 * count, 0.9),
150
+ evidence=[
151
+ f'state repeated {count}x within window of '
152
+ f'{self.window} events; signature={_truncate(most_common, 80)}',
153
+ ],
154
+ suggestion=_suggestion(mode),
155
+ metadata={
156
+ 'source': self.id,
157
+ 'window': self.window,
158
+ 'repeated_count': count,
159
+ },
160
+ )
161
+ )
162
+ return findings
163
+
164
+
165
+ # ---------------------------------------------------------------------------
166
+ # StepCountLimitDetector
167
+ # ---------------------------------------------------------------------------
168
+
169
+ class StepCountLimitDetector:
170
+ """Flag a trajectory that exceeded a configured step budget.
171
+
172
+ Matches ``AgentBench TLE`` and ``verification.premature_stop`` proxies.
173
+ """
174
+
175
+ id = 'step_count_limit'
176
+
177
+ def __init__(self, *, max_steps: int = 50) -> None:
178
+ self.max_steps = max_steps
179
+
180
+ def detect(self, trajectory: AgentTrajectory) -> List[FailureFinding]:
181
+ if len(trajectory.events) <= self.max_steps:
182
+ return []
183
+ target = trajectory.events[-1]
184
+ mode = SEED_FAILURE_MODES['planning.inefficient_plan']
185
+ return [FailureFinding(
186
+ finding_id=new_id('finding'),
187
+ failure_mode=mode,
188
+ event_id=target.event_id,
189
+ agent_name=target.agent_name,
190
+ step_index=target.step_index,
191
+ confidence=0.7,
192
+ evidence=[
193
+ f'{len(trajectory.events)} events exceeds configured '
194
+ f'max_steps={self.max_steps}',
195
+ ],
196
+ suggestion=_suggestion(mode),
197
+ metadata={'source': self.id, 'event_count': len(trajectory.events)},
198
+ )]
199
+
200
+
201
+ # ---------------------------------------------------------------------------
202
+ # Pipeline runner
203
+ # ---------------------------------------------------------------------------
204
+
205
+ def default_detectors(config: Optional[DetectorConfig] = None) -> List[Detector]:
206
+ cfg = config or DetectorConfig()
207
+ return [
208
+ RepeatedToolCallDetector(threshold=cfg.repeated_tool_call_threshold),
209
+ RepeatedStateDetector(
210
+ window=cfg.repeated_state_window,
211
+ threshold=cfg.repeated_state_threshold,
212
+ ),
213
+ StepCountLimitDetector(max_steps=cfg.step_count_limit),
214
+ ]
215
+
216
+
217
+ def run_detectors(
218
+ trajectory: AgentTrajectory,
219
+ detectors: Optional[List[Detector]] = None,
220
+ ) -> List[FailureFinding]:
221
+ """Run a list of detectors over a trajectory and return merged findings."""
222
+ detectors = detectors or default_detectors()
223
+ out: List[FailureFinding] = []
224
+ for d in detectors:
225
+ try:
226
+ out.extend(d.detect(trajectory))
227
+ except Exception as exc: # pragma: no cover - defensive
228
+ LOG.warning('detector %s raised: %s', d.id, exc)
229
+ return out
230
+
231
+
232
+ # ---------------------------------------------------------------------------
233
+ # helpers
234
+ # ---------------------------------------------------------------------------
235
+
236
+ def _state_signature(event: AgentEvent) -> str:
237
+ """A compact, comparison-friendly view of an event's observable state."""
238
+ return '|'.join([
239
+ getattr(event.event_type, 'value', str(event.event_type)),
240
+ str(event.agent_name or ''),
241
+ str(event.module or ''),
242
+ str(event.output or '')[:200],
243
+ str(event.error or '')[:200],
244
+ ])
245
+
246
+
247
+ def _normalize(value: object) -> str:
248
+ text = '' if value is None else str(value)
249
+ return ' '.join(text.split())
250
+
251
+
252
+ def _truncate(text: str, max_chars: int) -> str:
253
+ if len(text) > max_chars:
254
+ return text[:max_chars] + '…'
255
+ return text
256
+
257
+
258
+ def _mode_count(items: List[str]) -> tuple[str, int]:
259
+ counts: dict[str, int] = {}
260
+ best: str = ''
261
+ best_count = 0
262
+ for it in items:
263
+ counts[it] = counts.get(it, 0) + 1
264
+ if counts[it] > best_count:
265
+ best_count = counts[it]
266
+ best = it
267
+ return best, best_count
268
+
269
+
270
+ def _suggestion(mode: FailureMode) -> Optional[str]:
271
+ if mode.suggestion_templates:
272
+ return str(mode.suggestion_templates[0])
273
+ return None
274
+
275
+
276
+ __all__ = [
277
+ 'Detector',
278
+ 'DetectorConfig',
279
+ 'RepeatedStateDetector',
280
+ 'RepeatedToolCallDetector',
281
+ 'StepCountLimitDetector',
282
+ 'default_detectors',
283
+ 'run_detectors',
284
+ ]
@@ -34,7 +34,13 @@ from agentdebug.hub.backends import (
34
34
  LocalHubBackend,
35
35
  backend_from_spec,
36
36
  )
37
- from agentdebug.hub.bundle import Bundle, BundleManifest, pack_bundle, unpack_bundle
37
+ from agentdebug.hub.bundle import (
38
+ Bundle,
39
+ BundleManifest,
40
+ build_manifest,
41
+ pack_bundle,
42
+ unpack_bundle,
43
+ )
38
44
  from agentdebug.hub.scrub import (
39
45
  DEFAULT_REDACTIONS,
40
46
  ScrubReport,
@@ -53,6 +59,7 @@ __all__ = [
53
59
  'ScrubReport',
54
60
  'Scrubber',
55
61
  'backend_from_spec',
62
+ 'build_manifest',
56
63
  'pack_bundle',
57
64
  'parse_spec',
58
65
  'scrub_trajectory',
@@ -2,12 +2,12 @@
2
2
 
3
3
  Endpoints:
4
4
 
5
- * ``GET /`` single-page HTML console.
6
- * ``GET /api/v1/traces`` list trace IDs in the store.
7
- * ``GET /api/v1/traces/{tid}`` fetch a trajectory + freshly analyzed report.
8
- * ``GET /api/v1/traces/{tid}/raw``— raw trajectory JSON.
9
- * ``GET /api/v1/taxonomy`` list seed failure modes.
10
- * ``GET /healthz`` liveness.
5
+ * ``GET /`` - single-page HTML console.
6
+ * ``GET /api/v1/traces`` - list trace IDs in the store.
7
+ * ``GET /api/v1/traces/{tid}`` - fetch a trajectory + freshly analyzed report.
8
+ * ``GET /api/v1/traces/{tid}/raw`` - raw trajectory JSON.
9
+ * ``GET /api/v1/taxonomy`` - list seed failure modes.
10
+ * ``GET /healthz`` - liveness.
11
11
 
12
12
  The server is intentionally tiny and built on a no-build (vanilla JS) frontend
13
13
  so it ships with the wheel and runs without `npm`.
@@ -126,7 +126,7 @@ def store_from_path(path: str) -> TraceStore:
126
126
  return JsonlTraceStore(path)
127
127
 
128
128
 
129
- # Single-file HTML console. Plain DOM + fetch no build step required.
129
+ # Single-file HTML console. Plain DOM + fetch; no build step required.
130
130
  _INDEX_HTML = """<!doctype html>
131
131
  <html lang="en">
132
132
  <head>
@@ -394,7 +394,7 @@ function eventProblem(ev) {
394
394
  const payload = (fmt(ev.error) + ' ' + fmt(ev.output) + ' ' + fmt(ev.metadata)).toLowerCase();
395
395
  return Boolean(ev.error || payload.includes('missing context') || payload.includes('premature') || payload.includes('loop') || payload.includes('handoff'));
396
396
  }
397
- async function loadTraceList() {
397
+ async function loadTraceList(selectFirst) {
398
398
  const data = await api('/api/v1/traces');
399
399
  const ul = document.getElementById('trace-list');
400
400
  ul.innerHTML = '';
@@ -407,7 +407,7 @@ async function loadTraceList() {
407
407
  li.dataset.tid = tid;
408
408
  li.onclick = () => { selectTrace(tid, li); };
409
409
  ul.appendChild(li);
410
- if (idx === 0) selectTrace(tid, li);
410
+ if (idx === 0 && selectFirst) selectTrace(tid, li);
411
411
  });
412
412
  if (data.traces.length === 0) {
413
413
  document.getElementById('detail').innerHTML = '<div class="empty">No traces in store.</div>';
@@ -442,6 +442,7 @@ function renderTrace(traj, report) {
442
442
  const events = traj.events || [];
443
443
  const findings = report.findings || [];
444
444
  const rootId = report.root_cause_event_id;
445
+ const alignmentEvents = events.filter(ev => ev.step_index !== null && ev.step_index !== undefined);
445
446
  const families = [...new Set(findings.map(f => f.failure_mode?.family).filter(Boolean))];
446
447
  const errorEvents = events.filter(eventProblem).length;
447
448
  const rootEvent = events.find(ev => ev.event_id === rootId) || {};
@@ -467,7 +468,7 @@ function renderTrace(traj, report) {
467
468
  html += '<div class="trace-legend"><div class="legend-cell"><div class="legend-label">Agent native trace</div><div class="legend-title">What the agent logged, thought, called, or observed.</div></div>';
468
469
  html += '<div class="legend-cell"><div class="legend-label">AgentDebugX error trace</div><div class="legend-title">Normalized failure signal, attribution, and repair hint for human review.</div></div></div>';
469
470
  html += '<div class="timeline">';
470
- for (const ev of events) html += renderEvent(ev, ev.event_id === rootId, findingForEvent(findings, ev.event_id));
471
+ for (const ev of alignmentEvents) html += renderEvent(ev, ev.event_id === rootId, findingForEvent(findings, ev.event_id));
471
472
  html += '</div></div></div>';
472
473
 
473
474
  html += '<div class="rail">';
@@ -589,7 +590,7 @@ if (BOOTSTRAP && BOOTSTRAP.traces) {
589
590
  document.getElementById('detail').innerHTML = '<div class="empty">No traces in store.</div>';
590
591
  }
591
592
  }
592
- loadTraceList();
593
+ loadTraceList(!(BOOTSTRAP && BOOTSTRAP.selected));
593
594
  </script>
594
595
  </body>
595
596
  </html>