agentdebugx 0.1.0__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/PKG-INFO +29 -2
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/README.md +26 -1
- agentdebugx-0.2.0/docs/19_error_hub.md +148 -0
- agentdebugx-0.2.0/docs/20_deep_debug.md +135 -0
- agentdebugx-0.2.0/docs/21_integrations.md +140 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/README.md +3 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/pyproject.toml +4 -1
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/__init__.py +1 -1
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/cli.py +209 -0
- agentdebugx-0.2.0/src/agentdebug/deep.py +533 -0
- agentdebugx-0.2.0/src/agentdebug/hub/__init__.py +60 -0
- agentdebugx-0.2.0/src/agentdebug/hub/backend_base.py +62 -0
- agentdebugx-0.2.0/src/agentdebug/hub/backends.py +275 -0
- agentdebugx-0.2.0/src/agentdebug/hub/bundle.py +293 -0
- agentdebugx-0.2.0/src/agentdebug/hub/scrub.py +142 -0
- agentdebugx-0.2.0/src/agentdebug/integrations/__init__.py +32 -0
- agentdebugx-0.2.0/src/agentdebug/integrations/claude_skill.py +161 -0
- agentdebugx-0.2.0/src/agentdebug/integrations/openhands.py +159 -0
- agentdebugx-0.2.0/src/agentdebug/ui/server.py +476 -0
- agentdebugx-0.1.0/src/agentdebug/ui/server.py +0 -260
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/LICENSE +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/00_overview.md +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/01_literature_survey.md +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/02_architecture.md +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/03_taxonomy.md +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/04_trace_schema.md +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/05_adapters.md +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/06_detectors.md +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/07_attribution.md +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/08_recovery.md +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/09_error_database.md +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/10_taxonomy_induction.md +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/11_multimodal.md +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/12_ui_dashboard.md +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/13_class_design.md +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/14_api_reference.md +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/15_roadmap.md +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/16_governance.md +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/17_claude_code_design_patterns.md +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/18_comparison_codex_vs_design.md +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/ERROR_TAXONOMY.md +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/OPEN_SOURCE_DEVELOPMENT_PLAN.md +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/RESEARCH_SURVEY.md +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/benchmarks/v0_1_smoke.json +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/benchmarks/v0_1_smoke.md +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/adapters/__init__.py +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/adapters/base.py +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/adapters/langgraph.py +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/adapters/otel.py +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/adapters/raw.py +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/analyzers.py +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/attribution.py +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/events.py +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/instrumentation.py +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/judges.py +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/llm.py +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/models.py +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/recorder.py +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/recovery.py +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/storage.py +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/taxonomy.py +0 -0
- {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/ui/__init__.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: agentdebugx
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: Portable error analysis, tracing, and recovery framework for agentic AI systems. Import as `agentdebug`.
|
|
5
5
|
License: MIT
|
|
6
6
|
License-File: LICENSE
|
|
@@ -26,7 +26,9 @@ Classifier: Topic :: Software Development :: Quality Assurance
|
|
|
26
26
|
Classifier: Topic :: System :: Monitoring
|
|
27
27
|
Classifier: Typing :: Typed
|
|
28
28
|
Provides-Extra: all
|
|
29
|
+
Provides-Extra: hub-hf
|
|
29
30
|
Provides-Extra: langgraph
|
|
31
|
+
Provides-Extra: openhands
|
|
30
32
|
Provides-Extra: otel
|
|
31
33
|
Provides-Extra: ui
|
|
32
34
|
Requires-Dist: httpx (>=0.24,<1.0)
|
|
@@ -101,7 +103,7 @@ spell out the proposed path before we lock into heavy abstractions.
|
|
|
101
103
|
## Install
|
|
102
104
|
|
|
103
105
|
```bash
|
|
104
|
-
# From PyPI (distribution name
|
|
106
|
+
# From PyPI (distribution name `agentdebugx`; the short import is `agentdebug`)
|
|
105
107
|
pip install agentdebugx
|
|
106
108
|
|
|
107
109
|
# With the optional local dashboard
|
|
@@ -113,6 +115,12 @@ pip install 'agentdebugx[langgraph]'
|
|
|
113
115
|
# With OpenTelemetry GenAI export shim
|
|
114
116
|
pip install 'agentdebugx[otel]'
|
|
115
117
|
|
|
118
|
+
# With Hugging Face Datasets backend for the Error Hub
|
|
119
|
+
pip install 'agentdebugx[hub-hf]'
|
|
120
|
+
|
|
121
|
+
# With OpenHands EventStream bridge
|
|
122
|
+
pip install 'agentdebugx[openhands]'
|
|
123
|
+
|
|
116
124
|
# Everything
|
|
117
125
|
pip install 'agentdebugx[all]'
|
|
118
126
|
```
|
|
@@ -123,6 +131,10 @@ From source:
|
|
|
123
131
|
pip install -e . # or: poetry install
|
|
124
132
|
```
|
|
125
133
|
|
|
134
|
+
> The PyPI distribution name is `agentdebugx` because PyPI's name policy
|
|
135
|
+
> (a name-similarity check stricter than PEP 503 normalization) collides `agentdebug` with the existing
|
|
136
|
+
> `agent-debug` package. The Python import is always `import agentdebug`.
|
|
137
|
+
|
|
126
138
|
## Quick Start
|
|
127
139
|
|
|
128
140
|
```python
|
|
@@ -172,6 +184,21 @@ agentdebug judge examples/sample_trace.json --attribute
|
|
|
172
184
|
# Launch the local dashboard at http://127.0.0.1:7777
|
|
173
185
|
agentdebug serve --store-sqlite .agentdebug/errors.sqlite
|
|
174
186
|
|
|
187
|
+
# DeepDebug — iterative multi-turn analysis (plan -> hypothesize -> verify -> refine)
|
|
188
|
+
agentdebug deep <trajectory.json>
|
|
189
|
+
|
|
190
|
+
# Error Hub: package + push a trace to a Git remote or HF dataset
|
|
191
|
+
agentdebug hub push <trace_id> \
|
|
192
|
+
--to git:git@github.com:your-org/agentdebug-bundles.git#bundles \
|
|
193
|
+
--store-sqlite .agentdebug/errors.sqlite
|
|
194
|
+
|
|
195
|
+
# Pull a bundle someone else shared
|
|
196
|
+
agentdebug hub pull git:... --bundle bundle_01ABCD --into .agentdebug/hub_pulls
|
|
197
|
+
|
|
198
|
+
# Host integrations: generate a Claude Code Skill, or an OpenHands microagent
|
|
199
|
+
agentdebug integrations skill --target ~/.claude/skills
|
|
200
|
+
agentdebug integrations openhands-microagent --target .openhands/microagents
|
|
201
|
+
|
|
175
202
|
# Diagnose which adapters / integrations are available
|
|
176
203
|
agentdebug doctor
|
|
177
204
|
```
|
|
@@ -63,7 +63,7 @@ spell out the proposed path before we lock into heavy abstractions.
|
|
|
63
63
|
## Install
|
|
64
64
|
|
|
65
65
|
```bash
|
|
66
|
-
# From PyPI (distribution name
|
|
66
|
+
# From PyPI (distribution name `agentdebugx`; the short import is `agentdebug`)
|
|
67
67
|
pip install agentdebugx
|
|
68
68
|
|
|
69
69
|
# With the optional local dashboard
|
|
@@ -75,6 +75,12 @@ pip install 'agentdebugx[langgraph]'
|
|
|
75
75
|
# With OpenTelemetry GenAI export shim
|
|
76
76
|
pip install 'agentdebugx[otel]'
|
|
77
77
|
|
|
78
|
+
# With Hugging Face Datasets backend for the Error Hub
|
|
79
|
+
pip install 'agentdebugx[hub-hf]'
|
|
80
|
+
|
|
81
|
+
# With OpenHands EventStream bridge
|
|
82
|
+
pip install 'agentdebugx[openhands]'
|
|
83
|
+
|
|
78
84
|
# Everything
|
|
79
85
|
pip install 'agentdebugx[all]'
|
|
80
86
|
```
|
|
@@ -85,6 +91,10 @@ From source:
|
|
|
85
91
|
pip install -e . # or: poetry install
|
|
86
92
|
```
|
|
87
93
|
|
|
94
|
+
> The PyPI distribution name is `agentdebugx` because PyPI's name policy
|
|
95
|
+
> (a name-similarity check stricter than PEP 503 normalization) collides `agentdebug` with the existing
|
|
96
|
+
> `agent-debug` package. The Python import is always `import agentdebug`.
|
|
97
|
+
|
|
88
98
|
## Quick Start
|
|
89
99
|
|
|
90
100
|
```python
|
|
@@ -134,6 +144,21 @@ agentdebug judge examples/sample_trace.json --attribute
|
|
|
134
144
|
# Launch the local dashboard at http://127.0.0.1:7777
|
|
135
145
|
agentdebug serve --store-sqlite .agentdebug/errors.sqlite
|
|
136
146
|
|
|
147
|
+
# DeepDebug — iterative multi-turn analysis (plan -> hypothesize -> verify -> refine)
|
|
148
|
+
agentdebug deep <trajectory.json>
|
|
149
|
+
|
|
150
|
+
# Error Hub: package + push a trace to a Git remote or HF dataset
|
|
151
|
+
agentdebug hub push <trace_id> \
|
|
152
|
+
--to git:git@github.com:your-org/agentdebug-bundles.git#bundles \
|
|
153
|
+
--store-sqlite .agentdebug/errors.sqlite
|
|
154
|
+
|
|
155
|
+
# Pull a bundle someone else shared
|
|
156
|
+
agentdebug hub pull git:... --bundle bundle_01ABCD --into .agentdebug/hub_pulls
|
|
157
|
+
|
|
158
|
+
# Host integrations: generate a Claude Code Skill, or an OpenHands microagent
|
|
159
|
+
agentdebug integrations skill --target ~/.claude/skills
|
|
160
|
+
agentdebug integrations openhands-microagent --target .openhands/microagents
|
|
161
|
+
|
|
137
162
|
# Diagnose which adapters / integrations are available
|
|
138
163
|
agentdebug doctor
|
|
139
164
|
```
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
# 19 — Error Hub
|
|
2
|
+
|
|
3
|
+
## 1. What it solves
|
|
4
|
+
|
|
5
|
+
A team-or-community channel for **packaging a failing agent trajectory + its
|
|
6
|
+
analysis** and pushing it to a backend (Git remote, Hugging Face Datasets
|
|
7
|
+
repo, or a local directory), plus pulling and listing what others have
|
|
8
|
+
shared. The Hub is how AgentDebugX turns the trace your colleague is staring
|
|
9
|
+
at into the regression case your CI rejects.
|
|
10
|
+
|
|
11
|
+
## 2. Unit of share — the `Bundle`
|
|
12
|
+
|
|
13
|
+
```
|
|
14
|
+
bundle_<id>/
|
|
15
|
+
├── manifest.json # typed metadata (BundleManifest, schema 1.0.0)
|
|
16
|
+
├── trajectory.json # AgentTrajectory JSON
|
|
17
|
+
├── report.json # DiagnosticReport JSON (optional)
|
|
18
|
+
├── artifacts/ # binary attachments (screenshots, files) — optional
|
|
19
|
+
└── README.md # auto-generated human summary
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
`BundleManifest` carries: `bundle_id`, `trace_id`, `framework`, `goal_summary`
|
|
23
|
+
(truncated, post-scrub), `failure_families`, `failure_mode_ids`,
|
|
24
|
+
`root_cause_step_index`, `root_cause_agent`, `n_events`, `has_report`,
|
|
25
|
+
`has_artifacts`, `license`, `contributor`, `contributor_org`, `scrubbed`,
|
|
26
|
+
`scrubber_version`, `notes`.
|
|
27
|
+
|
|
28
|
+
The directory layout is intentionally framework-, language-, and
|
|
29
|
+
storage-agnostic — anything that can read JSON can ingest a bundle.
|
|
30
|
+
|
|
31
|
+
## 3. Backends — `HubBackend` Protocol
|
|
32
|
+
|
|
33
|
+
| Scheme | Class | Dependencies | Use for |
|
|
34
|
+
|---|---|---|---|
|
|
35
|
+
| `local:/path` | `LocalHubBackend` | stdlib only | tests, air-gapped envs, NFS shares |
|
|
36
|
+
| `git:<remote>[#<subpath>]` | `GitHubBackend` | `git` CLI | GitHub / GitLab / Gitea / self-hosted git |
|
|
37
|
+
| `hf:<repo_id>[#<subpath>]` | `HuggingFaceBackend` | `huggingface_hub` (`[hub-hf]` extra) | community datasets, discoverable corpora |
|
|
38
|
+
|
|
39
|
+
All three implement:
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
class HubBackend(Protocol):
|
|
43
|
+
scheme: str
|
|
44
|
+
def push(self, bundle: Bundle, *, message: str | None = None) -> str: ...
|
|
45
|
+
def pull(self, bundle_id: str, *, into: Path) -> Path: ...
|
|
46
|
+
def list_bundles(self, *, limit: int = 100) -> list[str]: ...
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
`backend_from_spec("git:...#...")` dispatches by scheme prefix; the CLI
|
|
50
|
+
calls this so a single `--to` arg can target any backend.
|
|
51
|
+
|
|
52
|
+
## 4. Privacy: scrubbing is default-on
|
|
53
|
+
|
|
54
|
+
By default, every `agentdebug hub push` runs `Scrubber.scrub_trajectory` first. The
|
|
55
|
+
default `DEFAULT_REDACTIONS` cover:
|
|
56
|
+
|
|
57
|
+
- API keys: OpenAI, Anthropic, AWS access key + secret, Google, GitHub, PyPI
|
|
58
|
+
- Bearer tokens, URL-embedded credentials
|
|
59
|
+
- PII: emails, E.164 phone numbers, US SSNs, credit cards
|
|
60
|
+
|
|
61
|
+
Opt-out via `--no-scrub` for trusted internal hubs only — the CLI prints a
|
|
62
|
+
warning when used. The scrubber is **idempotent** (running it twice changes
|
|
63
|
+
nothing), versioned (`SCRUBBER_VERSION` is recorded in the manifest), and
|
|
64
|
+
produces an audit report (`ScrubReport.replacements`).
|
|
65
|
+
|
|
66
|
+
For higher recall, drop in a Microsoft Presidio backend via a custom
|
|
67
|
+
`Scrubber` subclass — the architecture is built for replacement.
|
|
68
|
+
|
|
69
|
+
## 5. CLI
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
# Push to a local hub
|
|
73
|
+
agentdebug hub push <trace_id> \
|
|
74
|
+
--to local:/srv/agentdebug-hub \
|
|
75
|
+
--store-sqlite .agentdebug/errors.sqlite
|
|
76
|
+
|
|
77
|
+
# Push to a Git remote (any SSH/HTTPS git host)
|
|
78
|
+
agentdebug hub push <trace_id> \
|
|
79
|
+
--to git:git@github.com:your-org/agentdebug-bundles.git#bundles \
|
|
80
|
+
--store-sqlite .agentdebug/errors.sqlite \
|
|
81
|
+
--message "Customer escalation 2026-05-16"
|
|
82
|
+
|
|
83
|
+
# Push to a Hugging Face dataset (requires HF_TOKEN)
|
|
84
|
+
agentdebug hub push <trace_id> \
|
|
85
|
+
--to hf:your-org/agentdebug-bundles \
|
|
86
|
+
--store-sqlite .agentdebug/errors.sqlite \
|
|
87
|
+
--license CC-BY-4.0
|
|
88
|
+
|
|
89
|
+
# Pull a bundle from anywhere
|
|
90
|
+
agentdebug hub pull git:git@github.com:org/repo.git#bundles \
|
|
91
|
+
--bundle bundle_01ABCD --into .agentdebug/hub_pulls
|
|
92
|
+
|
|
93
|
+
# List what's there
|
|
94
|
+
agentdebug hub list hf:your-org/agentdebug-bundles --limit 100
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## 6. Authentication
|
|
98
|
+
|
|
99
|
+
By design, AgentDebugX never holds credentials.
|
|
100
|
+
|
|
101
|
+
- Git: relies on `git` being configured (SSH key, credential helper, env-based
|
|
102
|
+
PAT). Whatever lets you `git push` works.
|
|
103
|
+
- Hugging Face: relies on `HF_TOKEN` or `huggingface-cli login`. Standard.
|
|
104
|
+
- Local: filesystem permissions.
|
|
105
|
+
|
|
106
|
+
This keeps the security surface small and lets ops teams use their existing
|
|
107
|
+
key-management.
|
|
108
|
+
|
|
109
|
+
## 7. Programmatic API
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
from agentdebug.hub import (
|
|
113
|
+
Bundle, build_manifest, backend_from_spec, scrub_trajectory,
|
|
114
|
+
)
|
|
115
|
+
from agentdebug.analyzers import HeuristicAnalyzer
|
|
116
|
+
|
|
117
|
+
scrub_trajectory(trajectory) # mutates in place
|
|
118
|
+
report = HeuristicAnalyzer().analyze(trajectory)
|
|
119
|
+
manifest = build_manifest(trajectory, report=report, scrubbed=True)
|
|
120
|
+
backend = backend_from_spec("local:/srv/hub")
|
|
121
|
+
ref = backend.push(Bundle(manifest=manifest, trajectory=trajectory, report=report))
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## 8. Discoverability + Hugging Face
|
|
125
|
+
|
|
126
|
+
The HF backend uploads each bundle under `bundles/<bundle_id>/` in a Datasets
|
|
127
|
+
repo. Once a project accumulates enough bundles, generate a Parquet roll-up
|
|
128
|
+
of the manifests (planned for v0.3) so you can search by failure family,
|
|
129
|
+
framework, or root-cause step in the HF dataset viewer.
|
|
130
|
+
|
|
131
|
+
## 9. Status (v0.1)
|
|
132
|
+
|
|
133
|
+
Shipping:
|
|
134
|
+
|
|
135
|
+
- `Bundle`, `BundleManifest`, `pack_bundle`, `unpack_bundle`
|
|
136
|
+
- `Scrubber` + `DEFAULT_REDACTIONS` (12 patterns) + `ScrubReport`
|
|
137
|
+
- `LocalHubBackend` (stdlib only)
|
|
138
|
+
- `GitHubBackend` (via `git` CLI; tested against a local bare repo in CI)
|
|
139
|
+
- `HuggingFaceBackend` (gated on `huggingface_hub`)
|
|
140
|
+
- CLI: `hub push | pull | list`
|
|
141
|
+
|
|
142
|
+
Roadmap:
|
|
143
|
+
|
|
144
|
+
- Bundle signing + provenance (Sigstore)
|
|
145
|
+
- Parquet manifest roll-up for fast `hub search`
|
|
146
|
+
- Community moderation queue + take-down workflow
|
|
147
|
+
- Differential bundles (push only what changed since last push)
|
|
148
|
+
- Dataset card auto-generation for HF uploads
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# 20 — DeepDebug: Iterative Multi-Turn Error Analysis
|
|
2
|
+
|
|
3
|
+
## 1. Why a deeper loop
|
|
4
|
+
|
|
5
|
+
A one-shot LLM judge (`LLMJudgeAnalyzer`) is fast but biased:
|
|
6
|
+
|
|
7
|
+
- It tends to flag *manifestations* (tool exceptions, system errors) and miss
|
|
8
|
+
*root causes* (planner ignored a constraint, agent dropped handoff context).
|
|
9
|
+
- It does not separate "I see a finding" from "the trajectory actually supports
|
|
10
|
+
this finding."
|
|
11
|
+
- It returns a flat list, not a causal chain.
|
|
12
|
+
|
|
13
|
+
DeepDebug is the **deep-research-style** counterpart, modeled after recent
|
|
14
|
+
agent-as-judge and deep-research methods. The cost is higher (multiple LLM
|
|
15
|
+
calls per trace) and the quality is higher.
|
|
16
|
+
|
|
17
|
+
## 2. Loop structure
|
|
18
|
+
|
|
19
|
+
```
|
|
20
|
+
R0 plan — LLM proposes investigation focus (which event_ids, what questions)
|
|
21
|
+
R1 hypothesize — LLM produces candidate findings on the focus set
|
|
22
|
+
R2 verify (per-h) — LLM re-reads the trajectory and rates each hypothesis:
|
|
23
|
+
corroborated / weak / contradicted
|
|
24
|
+
R3 refine — LLM merges, drops weak/contradicted, picks single root cause
|
|
25
|
+
Rfinal report — DiagnosticReport + DeepDebugTrace audit trail
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
Each round is recorded as a typed `DeepDebugRound`:
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
@dataclass
|
|
32
|
+
class DeepDebugRound:
|
|
33
|
+
name: str # plan | hypothesize | verify:<h_id> | refine
|
|
34
|
+
request_summary: str
|
|
35
|
+
response_summary: str
|
|
36
|
+
duration_ms: int
|
|
37
|
+
payload: dict
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
This audit trail is the point. Users can replay *how* the diagnosis was
|
|
41
|
+
reached, not just *what* it is.
|
|
42
|
+
|
|
43
|
+
## 3. Public API
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
from agentdebug.deep import DeepDebugAnalyzer
|
|
47
|
+
from agentdebug.llm import OpenAICompatClient
|
|
48
|
+
|
|
49
|
+
llm = OpenAICompatClient(base_url=..., api_key=..., model='gpt-4o-mini',
|
|
50
|
+
default_max_tokens=8192, timeout=180.0)
|
|
51
|
+
|
|
52
|
+
result = DeepDebugAnalyzer(
|
|
53
|
+
llm=llm,
|
|
54
|
+
max_focus_events=7,
|
|
55
|
+
max_hypotheses_to_verify=6,
|
|
56
|
+
max_tokens=4096,
|
|
57
|
+
).analyze(trajectory)
|
|
58
|
+
|
|
59
|
+
print(result.report.summary)
|
|
60
|
+
for r in result.rounds: # audit trail
|
|
61
|
+
print(r.name, r.duration_ms, 'ms')
|
|
62
|
+
for h in result.hypotheses: # all hypotheses, verified or not
|
|
63
|
+
print(h.id, h.verdict, h.confidence_posterior)
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
CLI:
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
agentdebug deep <trajectory.json> --out deep.json
|
|
70
|
+
agentdebug deep <trace_id> --store-sqlite .agentdebug/errors.sqlite \
|
|
71
|
+
--base-url https://.../v1 --api-key sk-... --model gemini-3-flash
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## 4. Cost and latency model
|
|
75
|
+
|
|
76
|
+
Per trace: roughly `1 plan + 1 hypothesize + K verify + 1 refine` calls,
|
|
77
|
+
where `K ≤ max_hypotheses_to_verify`. For a 4-event trace with K=3 against
|
|
78
|
+
Gemini-3-flash we observed 6 rounds in ~29 seconds. Use:
|
|
79
|
+
|
|
80
|
+
- `LLMJudgeAnalyzer` for inline / per-step quick scans.
|
|
81
|
+
- `DeepDebugAnalyzer` for postmortems, escalations, hard regressions.
|
|
82
|
+
|
|
83
|
+
## 5. What "verify" actually does
|
|
84
|
+
|
|
85
|
+
Crucially, **verification does not re-execute the agent's tools**. It re-reads
|
|
86
|
+
the *existing* trajectory more carefully against the specific hypothesis.
|
|
87
|
+
Verdict choices:
|
|
88
|
+
|
|
89
|
+
- `corroborated` — trajectory evidence supports the hypothesis.
|
|
90
|
+
- `weak` — trajectory is ambiguous; no clear support either way.
|
|
91
|
+
- `contradicted` — trajectory explicitly conflicts with the hypothesis.
|
|
92
|
+
|
|
93
|
+
Counterfactual replay (where the agent IS re-rolled to test causality) is a
|
|
94
|
+
separate v2 feature; it composes with DeepDebug but is not required for it.
|
|
95
|
+
|
|
96
|
+
## 6. Live result (v0.1, gemini-3-flash, 4-event trace)
|
|
97
|
+
|
|
98
|
+
DeepDebug on a synthetic trace where the search agent called `search_web` with
|
|
99
|
+
`args={}` and the planner then summarized a fabricated answer:
|
|
100
|
+
|
|
101
|
+
```
|
|
102
|
+
summary : "The search agent failed to provide a query parameter to the
|
|
103
|
+
search tool, and the planner subsequently ignored this failure,
|
|
104
|
+
hallucinated a result, and terminated the task without emailing
|
|
105
|
+
the summary as required."
|
|
106
|
+
findings : action.parameter_error (s2, 1.00)
|
|
107
|
+
reflection.progress_misjudge (s4, 1.00)
|
|
108
|
+
verification.premature_stop (s4, 1.00)
|
|
109
|
+
root_cause : step=2, agent=search
|
|
110
|
+
rounds : plan (4.6s) hypothesize (11.0s)
|
|
111
|
+
verify:h1 (2.0s) verify:h2 (2.4s) verify:h3 (2.7s)
|
|
112
|
+
refine (6.4s)
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
The single-pass `LLMJudgeAnalyzer` on the same trace returned only the first
|
|
116
|
+
finding. DeepDebug recovered the full cascade and selected the upstream cause.
|
|
117
|
+
|
|
118
|
+
## 7. Failure modes
|
|
119
|
+
|
|
120
|
+
- **Cost blowout** — if `max_hypotheses_to_verify` is high and verify is
|
|
121
|
+
expensive, costs scale linearly. Pin a budget.
|
|
122
|
+
- **Hypothesis fishing** — the LLM may invent hypotheses to look thorough.
|
|
123
|
+
The verify round filters most of these (`contradicted` / `weak`); the
|
|
124
|
+
refine round drops any low-confidence stragglers.
|
|
125
|
+
- **Thinking-token budgets** — Gemini-3-flash and o-series models spend a
|
|
126
|
+
large fraction of `max_tokens` on reasoning. Default `max_tokens=4096` is
|
|
127
|
+
conservative; raise to 8192 for long traces or fine-grained verifiers.
|
|
128
|
+
|
|
129
|
+
## 8. Roadmap
|
|
130
|
+
|
|
131
|
+
- Counterfactual-replay round (re-roll oracle-corrected step + re-judge).
|
|
132
|
+
- Step-by-step verifier mode (Who&When-style sweep).
|
|
133
|
+
- Adaptive K — decide how many hypotheses to verify based on round-1 spread.
|
|
134
|
+
- Specialized verifiers (tool-call-arg verifier, multi-agent handoff verifier).
|
|
135
|
+
- Persistent reasoning cache so re-runs on the same trace skip re-thought.
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
# 21 — Host-Runtime Integrations: Claude Code Skill + OpenHands
|
|
2
|
+
|
|
3
|
+
This doc covers how AgentDebugX plugs into existing host agent runtimes as a
|
|
4
|
+
sub-module the host can call on demand.
|
|
5
|
+
|
|
6
|
+
The constraints we honor:
|
|
7
|
+
|
|
8
|
+
1. **No fork.** The integration always calls back into the locally installed
|
|
9
|
+
`agentdebug` CLI / Python API, never duplicates AgentDebugX code into the
|
|
10
|
+
host's runtime tree. Upgrading AgentDebugX upgrades the integration.
|
|
11
|
+
2. **Suggest-only by default.** Recovery proposals are never auto-applied;
|
|
12
|
+
the host agent surfaces them to the user.
|
|
13
|
+
3. **Privacy boundary.** Integrations never auto-upload to the community Hub
|
|
14
|
+
without explicit user opt-in.
|
|
15
|
+
|
|
16
|
+
## 1. Claude Code Skill
|
|
17
|
+
|
|
18
|
+
`agentdebug.integrations.claude_skill` generates a Claude Code
|
|
19
|
+
[Skill](https://www.claude.com/claude-code) package that wraps the
|
|
20
|
+
AgentDebugX CLI. Skills are folders of `SKILL.md` + assets that Claude can
|
|
21
|
+
match against and invoke on demand.
|
|
22
|
+
|
|
23
|
+
### Generate
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
agentdebug integrations skill --target ~/.claude/skills --name agentdebug
|
|
27
|
+
# wrote Claude Skill -> /home/<user>/.claude/skills/agentdebug
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
Or programmatically:
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
from pathlib import Path
|
|
34
|
+
from agentdebug.integrations import build_skill_bundle, write_skill_bundle
|
|
35
|
+
|
|
36
|
+
bundle = build_skill_bundle(name='agentdebug')
|
|
37
|
+
write_skill_bundle(bundle, target_dir=Path.home() / '.claude' / 'skills')
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
### What it contains
|
|
41
|
+
|
|
42
|
+
- `SKILL.md` — frontmatter (`name`, `description`, `license`) + trigger
|
|
43
|
+
phrases (e.g., "debug this agent run", "why did this agent fail", "find
|
|
44
|
+
the root cause", "analyze this trajectory") + a recipe for choosing
|
|
45
|
+
between quick-scan / judge / DeepDebug / Hub push based on user intent.
|
|
46
|
+
- `README.md` — refresh instructions.
|
|
47
|
+
|
|
48
|
+
### How Claude uses it
|
|
49
|
+
|
|
50
|
+
When the user mentions a trajectory file, store path, or asks debugging
|
|
51
|
+
questions, Claude's skill matcher fires and Claude invokes the right CLI
|
|
52
|
+
command:
|
|
53
|
+
|
|
54
|
+
| User intent | Skill recipe |
|
|
55
|
+
|---|---|
|
|
56
|
+
| "what failed?" | `agentdebug analyze <file> --suggest` |
|
|
57
|
+
| "who caused it?" | `agentdebug judge <file|trace_id> --attribute` |
|
|
58
|
+
| "I need a thorough postmortem" | `agentdebug deep <file|trace_id>` |
|
|
59
|
+
| "share this with the team" | `agentdebug hub push <trace_id> --to git:...` |
|
|
60
|
+
|
|
61
|
+
## 2. OpenHands integration
|
|
62
|
+
|
|
63
|
+
`agentdebug.integrations.openhands` ships two complementary pieces.
|
|
64
|
+
|
|
65
|
+
### 2.1 Microagent contract (passive — host invokes us)
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
agentdebug integrations openhands-microagent --target .openhands/microagents
|
|
69
|
+
# wrote OpenHands microagent -> .openhands/microagents/agentdebug.md
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
The generated markdown is a [knowledge
|
|
73
|
+
microagent](https://github.com/All-Hands-AI/OpenHands) — Claude / GPT
|
|
74
|
+
running inside OpenHands reads it as context when a trigger phrase fires
|
|
75
|
+
("debug agent failure", "root cause analysis", "analyze trajectory", …),
|
|
76
|
+
then calls the `agentdebug` CLI through OpenHands' shell tool.
|
|
77
|
+
|
|
78
|
+
### 2.2 Event-stream bridge (active — we record OpenHands)
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from agentdebug import AgentDebug
|
|
82
|
+
from agentdebug.analyzers import HeuristicAnalyzer
from agentdebug.integrations import OpenHandsBridge
|
|
83
|
+
|
|
84
|
+
debugger = AgentDebug()
|
|
85
|
+
trajectory = debugger.start_trace(goal="...", framework="openhands")
|
|
86
|
+
bridge = OpenHandsBridge(debugger=debugger, trajectory=trajectory).attach(
|
|
87
|
+
conversation.event_stream
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
# ...run the OpenHands session normally...
|
|
91
|
+
|
|
92
|
+
bridge.detach()
|
|
93
|
+
debugger.finish_trace(trajectory, success=success)
|
|
94
|
+
report = HeuristicAnalyzer().analyze(trajectory)
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
The bridge subscribes to `EventStreamSubscriber.MAIN` and translates each
|
|
98
|
+
typed `Action` / `Observation` into an `AgentEvent`. `*Action` → `TOOL_CALL`,
|
|
99
|
+
`*Observation` → `TOOL_RESULT`, everything else → `OBSERVATION`. The
|
|
100
|
+
mapping is heuristic but consistent because OpenHands' class names follow
|
|
101
|
+
a strict convention.
|
|
102
|
+
|
|
103
|
+
## 3. Why two entry points?
|
|
104
|
+
|
|
105
|
+
- **Microagent** is the *deferred* path: AgentDebugX runs only when the host
|
|
106
|
+
agent or user asks for it. Zero overhead when idle, no instrumentation of
|
|
107
|
+
the host's hot loop.
|
|
108
|
+
- **Event-stream bridge** is the *always-on* path: AgentDebugX captures every
|
|
109
|
+
session, even when nothing went wrong. Costs a small per-step overhead but
|
|
110
|
+
gives you a complete trajectory store you can later analyze (or push to a
|
|
111
|
+
Hub for review).
|
|
112
|
+
|
|
113
|
+
Most teams want both — microagent for on-demand debugging during
|
|
114
|
+
development, bridge for production passive observability.
|
|
115
|
+
|
|
116
|
+
## 4. Beyond Claude Code / OpenHands
|
|
117
|
+
|
|
118
|
+
The same pattern applies to any host runtime that has a tool/skill registry
|
|
119
|
+
+ an event stream. Concretely, the abstractions to copy are:
|
|
120
|
+
|
|
121
|
+
- A **trigger-matched documentation contract** (Skill.md, microagent.md) that
|
|
122
|
+
describes when to invoke AgentDebugX and what arguments to pass.
|
|
123
|
+
- A **structured event stream subscription** that maps host events into
|
|
124
|
+
`AgentEvent`.
|
|
125
|
+
|
|
126
|
+
Future targets include LangGraph's checkpointer hooks, AutoGen's group-chat
|
|
127
|
+
event bus, and Pydantic-AI's OTel integration. See
|
|
128
|
+
[docs/05_adapters.md](./05_adapters.md) for the per-framework adapter plan.
|
|
129
|
+
|
|
130
|
+
## 5. Testing
|
|
131
|
+
|
|
132
|
+
`tests/test_deep_and_integrations.py` covers:
|
|
133
|
+
|
|
134
|
+
- Skill bundle generates valid `SKILL.md` with trigger phrases.
|
|
135
|
+
- OpenHands microagent emits valid YAML front-matter and references
|
|
136
|
+
`CodeActAgent` / suggest-only safety language.
|
|
137
|
+
|
|
138
|
+
Live integration tests (Claude Code + OpenHands) are out of scope for CI —
|
|
139
|
+
they require the host runtimes installed. The generated artifacts are pure
|
|
140
|
+
files, so eyeballing them after generation is the recommended QA path.
|
|
@@ -28,6 +28,9 @@ This `docs/` directory contains the full design specification.
|
|
|
28
28
|
| 17 | [17_claude_code_design_patterns.md](./17_claude_code_design_patterns.md) | Claude Code-inspired runtime patterns: hooks, permissions, skills, subagents, MCP, plugins |
|
|
29
29
|
| 18 | [18_comparison_codex_vs_design.md](./18_comparison_codex_vs_design.md) | Comparison of the working Codex scaffold and the expanded design spec |
|
|
30
30
|
| 18 | [18_comparison_codex_vs_design.md](./18_comparison_codex_vs_design.md) | Reconciliation of the Codex scaffold with this design spec — the merged v0.1 plan |
|
|
31
|
+
| 19 | [19_error_hub.md](./19_error_hub.md) | **Error Hub** — bundle format, Local / Git / HF backends, scrubbing |
|
|
32
|
+
| 20 | [20_deep_debug.md](./20_deep_debug.md) | **DeepDebug** — iterative multi-turn analysis (plan → hypothesize → verify → refine) |
|
|
33
|
+
| 21 | [21_integrations.md](./21_integrations.md) | **Claude Code Skill** + **OpenHands** microagent + EventStream bridge |
|
|
31
34
|
|
|
32
35
|
Plus three **narrative** docs that pre-dated this engineering spec and are kept for paper-style framing:
|
|
33
36
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "agentdebugx"
|
|
3
|
-
version = "0.1.0"
|
|
3
|
+
version = "0.2.0"
|
|
4
4
|
description = "Portable error analysis, tracing, and recovery framework for agentic AI systems. Import as `agentdebug`."
|
|
5
5
|
authors = ["ULab @ UIUC <ulab@illinois.edu>"]
|
|
6
6
|
license = "MIT"
|
|
@@ -54,12 +54,15 @@ httpx = ">=0.24,<1.0"
|
|
|
54
54
|
langgraph = ["langchain-core"]
|
|
55
55
|
otel = ["opentelemetry-api", "opentelemetry-sdk"]
|
|
56
56
|
ui = ["fastapi", "uvicorn"]
|
|
57
|
+
hub-hf = ["huggingface_hub"]
|
|
58
|
+
openhands = ["openhands-ai"]
|
|
57
59
|
all = [
|
|
58
60
|
"langchain-core",
|
|
59
61
|
"opentelemetry-api",
|
|
60
62
|
"opentelemetry-sdk",
|
|
61
63
|
"fastapi",
|
|
62
64
|
"uvicorn",
|
|
65
|
+
"huggingface_hub",
|
|
63
66
|
]
|
|
64
67
|
|
|
65
68
|
[tool.poetry.scripts]
|