agentdebugx 0.1.0__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/PKG-INFO +29 -2
  2. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/README.md +26 -1
  3. agentdebugx-0.2.0/docs/19_error_hub.md +148 -0
  4. agentdebugx-0.2.0/docs/20_deep_debug.md +135 -0
  5. agentdebugx-0.2.0/docs/21_integrations.md +140 -0
  6. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/README.md +3 -0
  7. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/pyproject.toml +4 -1
  8. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/__init__.py +1 -1
  9. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/cli.py +209 -0
  10. agentdebugx-0.2.0/src/agentdebug/deep.py +533 -0
  11. agentdebugx-0.2.0/src/agentdebug/hub/__init__.py +60 -0
  12. agentdebugx-0.2.0/src/agentdebug/hub/backend_base.py +62 -0
  13. agentdebugx-0.2.0/src/agentdebug/hub/backends.py +275 -0
  14. agentdebugx-0.2.0/src/agentdebug/hub/bundle.py +293 -0
  15. agentdebugx-0.2.0/src/agentdebug/hub/scrub.py +142 -0
  16. agentdebugx-0.2.0/src/agentdebug/integrations/__init__.py +32 -0
  17. agentdebugx-0.2.0/src/agentdebug/integrations/claude_skill.py +161 -0
  18. agentdebugx-0.2.0/src/agentdebug/integrations/openhands.py +159 -0
  19. agentdebugx-0.2.0/src/agentdebug/ui/server.py +476 -0
  20. agentdebugx-0.1.0/src/agentdebug/ui/server.py +0 -260
  21. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/LICENSE +0 -0
  22. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/00_overview.md +0 -0
  23. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/01_literature_survey.md +0 -0
  24. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/02_architecture.md +0 -0
  25. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/03_taxonomy.md +0 -0
  26. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/04_trace_schema.md +0 -0
  27. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/05_adapters.md +0 -0
  28. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/06_detectors.md +0 -0
  29. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/07_attribution.md +0 -0
  30. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/08_recovery.md +0 -0
  31. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/09_error_database.md +0 -0
  32. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/10_taxonomy_induction.md +0 -0
  33. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/11_multimodal.md +0 -0
  34. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/12_ui_dashboard.md +0 -0
  35. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/13_class_design.md +0 -0
  36. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/14_api_reference.md +0 -0
  37. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/15_roadmap.md +0 -0
  38. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/16_governance.md +0 -0
  39. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/17_claude_code_design_patterns.md +0 -0
  40. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/18_comparison_codex_vs_design.md +0 -0
  41. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/ERROR_TAXONOMY.md +0 -0
  42. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/OPEN_SOURCE_DEVELOPMENT_PLAN.md +0 -0
  43. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/RESEARCH_SURVEY.md +0 -0
  44. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/benchmarks/v0_1_smoke.json +0 -0
  45. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/docs/benchmarks/v0_1_smoke.md +0 -0
  46. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/adapters/__init__.py +0 -0
  47. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/adapters/base.py +0 -0
  48. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/adapters/langgraph.py +0 -0
  49. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/adapters/otel.py +0 -0
  50. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/adapters/raw.py +0 -0
  51. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/analyzers.py +0 -0
  52. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/attribution.py +0 -0
  53. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/events.py +0 -0
  54. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/instrumentation.py +0 -0
  55. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/judges.py +0 -0
  56. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/llm.py +0 -0
  57. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/models.py +0 -0
  58. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/recorder.py +0 -0
  59. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/recovery.py +0 -0
  60. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/storage.py +0 -0
  61. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/taxonomy.py +0 -0
  62. {agentdebugx-0.1.0 → agentdebugx-0.2.0}/src/agentdebug/ui/__init__.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: agentdebugx
3
- Version: 0.1.0
3
+ Version: 0.2.0
4
4
  Summary: Portable error analysis, tracing, and recovery framework for agentic AI systems. Import as `agentdebug`.
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -26,7 +26,9 @@ Classifier: Topic :: Software Development :: Quality Assurance
26
26
  Classifier: Topic :: System :: Monitoring
27
27
  Classifier: Typing :: Typed
28
28
  Provides-Extra: all
29
+ Provides-Extra: hub-hf
29
30
  Provides-Extra: langgraph
31
+ Provides-Extra: openhands
30
32
  Provides-Extra: otel
31
33
  Provides-Extra: ui
32
34
  Requires-Dist: httpx (>=0.24,<1.0)
@@ -101,7 +103,7 @@ spell out the proposed path before we lock into heavy abstractions.
101
103
  ## Install
102
104
 
103
105
  ```bash
104
- # From PyPI (distribution name: agentdebugx; import as `agentdebug`)
106
+ # From PyPI (distribution name `agentdebugx`; the short import is `agentdebug`)
105
107
  pip install agentdebugx
106
108
 
107
109
  # With the optional local dashboard
@@ -113,6 +115,12 @@ pip install 'agentdebugx[langgraph]'
113
115
  # With OpenTelemetry GenAI export shim
114
116
  pip install 'agentdebugx[otel]'
115
117
 
118
+ # With Hugging Face Datasets backend for the Error Hub
119
+ pip install 'agentdebugx[hub-hf]'
120
+
121
+ # With OpenHands EventStream bridge
122
+ pip install 'agentdebugx[openhands]'
123
+
116
124
  # Everything
117
125
  pip install 'agentdebugx[all]'
118
126
  ```
@@ -123,6 +131,10 @@ From source:
123
131
  pip install -e . # or: poetry install
124
132
  ```
125
133
 
134
+ > The PyPI distribution name is `agentdebugx` because PyPI rejects new names
135
+ > that differ from an existing project only by separators (`-`, `_`, `.`), so
136
+ > `agentdebug` collides with the existing `agent-debug` package. The Python import is always `import agentdebug`.
137
+
126
138
  ## Quick Start
127
139
 
128
140
  ```python
@@ -172,6 +184,21 @@ agentdebug judge examples/sample_trace.json --attribute
172
184
  # Launch the local dashboard at http://127.0.0.1:7777
173
185
  agentdebug serve --store-sqlite .agentdebug/errors.sqlite
174
186
 
187
+ # DeepDebug — iterative multi-turn analysis (plan -> hypothesize -> verify -> refine)
188
+ agentdebug deep <trajectory.json>
189
+
190
+ # Error Hub: package + push a trace to a Git remote or HF dataset
191
+ agentdebug hub push <trace_id> \
192
+ --to git:git@github.com:your-org/agentdebug-bundles.git#bundles \
193
+ --store-sqlite .agentdebug/errors.sqlite
194
+
195
+ # Pull a bundle someone else shared
196
+ agentdebug hub pull git:... --bundle bundle_01ABCD --into .agentdebug/hub_pulls
197
+
198
+ # Host integrations: generate a Claude Code Skill, or an OpenHands microagent
199
+ agentdebug integrations skill --target ~/.claude/skills
200
+ agentdebug integrations openhands-microagent --target .openhands/microagents
201
+
175
202
  # Diagnose which adapters / integrations are available
176
203
  agentdebug doctor
177
204
  ```
@@ -63,7 +63,7 @@ spell out the proposed path before we lock into heavy abstractions.
63
63
  ## Install
64
64
 
65
65
  ```bash
66
- # From PyPI (distribution name: agentdebugx; import as `agentdebug`)
66
+ # From PyPI (distribution name `agentdebugx`; the short import is `agentdebug`)
67
67
  pip install agentdebugx
68
68
 
69
69
  # With the optional local dashboard
@@ -75,6 +75,12 @@ pip install 'agentdebugx[langgraph]'
75
75
  # With OpenTelemetry GenAI export shim
76
76
  pip install 'agentdebugx[otel]'
77
77
 
78
+ # With Hugging Face Datasets backend for the Error Hub
79
+ pip install 'agentdebugx[hub-hf]'
80
+
81
+ # With OpenHands EventStream bridge
82
+ pip install 'agentdebugx[openhands]'
83
+
78
84
  # Everything
79
85
  pip install 'agentdebugx[all]'
80
86
  ```
@@ -85,6 +91,10 @@ From source:
85
91
  pip install -e . # or: poetry install
86
92
  ```
87
93
 
94
+ > The PyPI distribution name is `agentdebugx` because PyPI rejects new names
95
+ > that differ from an existing project only by separators (`-`, `_`, `.`), so
96
+ > `agentdebug` collides with the existing `agent-debug` package. The Python import is always `import agentdebug`.
97
+
88
98
  ## Quick Start
89
99
 
90
100
  ```python
@@ -134,6 +144,21 @@ agentdebug judge examples/sample_trace.json --attribute
134
144
  # Launch the local dashboard at http://127.0.0.1:7777
135
145
  agentdebug serve --store-sqlite .agentdebug/errors.sqlite
136
146
 
147
+ # DeepDebug — iterative multi-turn analysis (plan -> hypothesize -> verify -> refine)
148
+ agentdebug deep <trajectory.json>
149
+
150
+ # Error Hub: package + push a trace to a Git remote or HF dataset
151
+ agentdebug hub push <trace_id> \
152
+ --to git:git@github.com:your-org/agentdebug-bundles.git#bundles \
153
+ --store-sqlite .agentdebug/errors.sqlite
154
+
155
+ # Pull a bundle someone else shared
156
+ agentdebug hub pull git:... --bundle bundle_01ABCD --into .agentdebug/hub_pulls
157
+
158
+ # Host integrations: generate a Claude Code Skill, or an OpenHands microagent
159
+ agentdebug integrations skill --target ~/.claude/skills
160
+ agentdebug integrations openhands-microagent --target .openhands/microagents
161
+
137
162
  # Diagnose which adapters / integrations are available
138
163
  agentdebug doctor
139
164
  ```
@@ -0,0 +1,148 @@
1
+ # 19 — Error Hub
2
+
3
+ ## 1. What it solves
4
+
5
+ A team-or-community channel for **packaging a failing agent trajectory + its
6
+ analysis** and pushing it to a backend (Git remote, Hugging Face Datasets
7
+ repo, or a local directory), plus pulling and listing what others have
8
+ shared. The Hub is how AgentDebugX turns the trace your colleague is staring
9
+ at into the regression case your CI rejects.
10
+
11
+ ## 2. Unit of share — the `Bundle`
12
+
13
+ ```
14
+ bundle_<id>/
15
+ ├── manifest.json # typed metadata (BundleManifest, schema 1.0.0)
16
+ ├── trajectory.json # AgentTrajectory JSON
17
+ ├── report.json # DiagnosticReport JSON (optional)
18
+ ├── artifacts/ # binary attachments (screenshots, files) — optional
19
+ └── README.md # auto-generated human summary
20
+ ```
21
+
22
+ `BundleManifest` carries: `bundle_id`, `trace_id`, `framework`, `goal_summary`
23
+ (truncated, post-scrub), `failure_families`, `failure_mode_ids`,
24
+ `root_cause_step_index`, `root_cause_agent`, `n_events`, `has_report`,
25
+ `has_artifacts`, `license`, `contributor`, `contributor_org`, `scrubbed`,
26
+ `scrubber_version`, `notes`.
27
+
28
+ The directory layout is intentionally framework-, language-, and
29
+ storage-agnostic — anything that can read JSON can ingest a bundle.
30
+
31
+ ## 3. Backends — `HubBackend` Protocol
32
+
33
+ | Scheme | Class | Dependencies | Use for |
34
+ |---|---|---|---|
35
+ | `local:/path` | `LocalHubBackend` | stdlib only | tests, air-gapped envs, NFS shares |
36
+ | `git:<remote>[#<subpath>]` | `GitHubBackend` | `git` CLI | GitHub / GitLab / Gitea / self-hosted git |
37
+ | `hf:<repo_id>[#<subpath>]` | `HuggingFaceBackend` | `huggingface_hub` (`[hub-hf]` extra) | community datasets, discoverable corpora |
38
+
39
+ All three implement:
40
+
41
+ ```python
42
+ class HubBackend(Protocol):
43
+ scheme: str
44
+ def push(self, bundle: Bundle, *, message: str | None = None) -> str: ...
45
+ def pull(self, bundle_id: str, *, into: Path) -> Path: ...
46
+ def list_bundles(self, *, limit: int = 100) -> list[str]: ...
47
+ ```
48
+
49
+ `backend_from_spec("git:...#...")` dispatches by scheme prefix; the CLI
50
+ calls this so a single `--to` arg can target any backend.
51
+
52
+ ## 4. Privacy: scrubbing is default-on
53
+
54
+ Every `agentdebug hub push` runs `Scrubber.scrub_trajectory` first. The
55
+ default `DEFAULT_REDACTIONS` cover:
56
+
57
+ - API keys: OpenAI, Anthropic, AWS access key + secret, Google, GitHub, PyPI
58
+ - Bearer tokens, URL-embedded credentials
59
+ - PII: emails, E.164 phone numbers, US SSNs, credit cards
60
+
61
+ Opt-out via `--no-scrub` for trusted internal hubs only — the CLI prints a
62
+ warning when used. The scrubber is **idempotent** (running it twice changes
63
+ nothing), versioned (`SCRUBBER_VERSION` is recorded in the manifest), and
64
+ produces an audit report (`ScrubReport.replacements`).
65
+
66
+ For higher recall, drop in a Microsoft Presidio backend via a custom
67
+ `Scrubber` subclass — the architecture is built for replacement.
68
+
69
+ ## 5. CLI
70
+
71
+ ```bash
72
+ # Push to a local hub
73
+ agentdebug hub push <trace_id> \
74
+ --to local:/srv/agentdebug-hub \
75
+ --store-sqlite .agentdebug/errors.sqlite
76
+
77
+ # Push to a Git remote (any SSH/HTTPS git host)
78
+ agentdebug hub push <trace_id> \
79
+ --to git:git@github.com:your-org/agentdebug-bundles.git#bundles \
80
+ --store-sqlite .agentdebug/errors.sqlite \
81
+ --message "Customer escalation 2026-05-16"
82
+
83
+ # Push to a Hugging Face dataset (requires HF_TOKEN)
84
+ agentdebug hub push <trace_id> \
85
+ --to hf:your-org/agentdebug-bundles \
86
+ --store-sqlite .agentdebug/errors.sqlite \
87
+ --license CC-BY-4.0
88
+
89
+ # Pull a bundle from anywhere
90
+ agentdebug hub pull git:git@github.com:org/repo.git#bundles \
91
+ --bundle bundle_01ABCD --into .agentdebug/hub_pulls
92
+
93
+ # List what's there
94
+ agentdebug hub list hf:your-org/agentdebug-bundles --limit 100
95
+ ```
96
+
97
+ ## 6. Authentication
98
+
99
+ By design, AgentDebugX never holds credentials.
100
+
101
+ - Git: relies on `git` being configured (SSH key, credential helper, env-based
102
+ PAT). Whatever lets you `git push` works.
103
+ - Hugging Face: relies on `HF_TOKEN` or `huggingface-cli login`. Standard.
104
+ - Local: filesystem permissions.
105
+
106
+ This keeps the security surface small and lets ops teams use their existing
107
+ key-management.
108
+
109
+ ## 7. Programmatic API
110
+
111
+ ```python
112
+ from agentdebug.hub import (
113
+ Bundle, build_manifest, backend_from_spec, scrub_trajectory,
114
+ )
115
+ from agentdebug.analyzers import HeuristicAnalyzer
116
+
117
+ scrub_trajectory(trajectory) # mutates in place
118
+ report = HeuristicAnalyzer().analyze(trajectory)
119
+ manifest = build_manifest(trajectory, report=report, scrubbed=True)
120
+ backend = backend_from_spec("local:/srv/hub")
121
+ ref = backend.push(Bundle(manifest=manifest, trajectory=trajectory, report=report))
122
+ ```
123
+
124
+ ## 8. Discoverability + Hugging Face
125
+
126
+ The HF backend uploads each bundle under `bundles/<bundle_id>/` in a Datasets
127
+ repo. Once a project accumulates enough bundles, generate a Parquet roll-up
128
+ of the manifests (planned for v0.3) so you can search by failure family,
129
+ framework, or root-cause step in the HF dataset viewer.
130
+
131
+ ## 9. Status (v0.2)
132
+
133
+ Shipping:
134
+
135
+ - `Bundle`, `BundleManifest`, `pack_bundle`, `unpack_bundle`
136
+ - `Scrubber` + `DEFAULT_REDACTIONS` (12 patterns) + `ScrubReport`
137
+ - `LocalHubBackend` (stdlib only)
138
+ - `GitHubBackend` (via `git` CLI; tested against a local bare repo in CI)
139
+ - `HuggingFaceBackend` (gated on `huggingface_hub`)
140
+ - CLI: `hub push | pull | list`
141
+
142
+ Roadmap:
143
+
144
+ - Bundle signing + provenance (Sigstore)
145
+ - Parquet manifest roll-up for fast `hub search`
146
+ - Community moderation queue + take-down workflow
147
+ - Differential bundles (push only what changed since last push)
148
+ - Dataset card auto-generation for HF uploads
@@ -0,0 +1,135 @@
1
+ # 20 — DeepDebug: Iterative Multi-Turn Error Analysis
2
+
3
+ ## 1. Why a deeper loop
4
+
5
+ A one-shot LLM judge (`LLMJudgeAnalyzer`) is fast but biased:
6
+
7
+ - It tends to flag *manifestations* (tool exceptions, system errors) and miss
8
+ *root causes* (planner ignored a constraint, agent dropped handoff context).
9
+ - It does not separate "I see a finding" from "the trajectory actually supports
10
+ this finding."
11
+ - It returns a flat list, not a causal chain.
12
+
13
+ DeepDebug is the **deep-research-style** counterpart, modeled after recent
14
+ agent-as-judge and deep-research methods. It costs more (multiple LLM
15
+ calls per trace) in exchange for a higher-quality diagnosis.
16
+
17
+ ## 2. Loop structure
18
+
19
+ ```
20
+ R0 plan — LLM proposes investigation focus (which event_ids, what questions)
21
+ R1 hypothesize — LLM produces candidate findings on the focus set
22
+ R2 verify (per-h) — LLM re-reads the trajectory and rates each hypothesis:
23
+ corroborated / weak / contradicted
24
+ R3 refine — LLM merges, drops weak/contradicted, picks single root cause
25
+ Rfinal report — DiagnosticReport + DeepDebugTrace audit trail
26
+ ```
27
+
28
+ Each round is recorded as a typed `DeepDebugRound`:
29
+
30
+ ```python
31
+ @dataclass
32
+ class DeepDebugRound:
33
+ name: str # plan | hypothesize | verify:<h_id> | refine
34
+ request_summary: str
35
+ response_summary: str
36
+ duration_ms: int
37
+ payload: dict
38
+ ```
39
+
40
+ This audit trail is the point. Users can replay *how* the diagnosis was
41
+ reached, not just *what* it is.
42
+
43
+ ## 3. Public API
44
+
45
+ ```python
46
+ from agentdebug.deep import DeepDebugAnalyzer
47
+ from agentdebug.llm import OpenAICompatClient
48
+
49
+ llm = OpenAICompatClient(base_url=..., api_key=..., model='gpt-4o-mini',
50
+ default_max_tokens=8192, timeout=180.0)
51
+
52
+ result = DeepDebugAnalyzer(
53
+ llm=llm,
54
+ max_focus_events=7,
55
+ max_hypotheses_to_verify=6,
56
+ max_tokens=4096,
57
+ ).analyze(trajectory)
58
+
59
+ print(result.report.summary)
60
+ for r in result.rounds: # audit trail
61
+ print(r.name, r.duration_ms, 'ms')
62
+ for h in result.hypotheses: # all hypotheses, verified or not
63
+ print(h.id, h.verdict, h.confidence_posterior)
64
+ ```
65
+
66
+ CLI:
67
+
68
+ ```bash
69
+ agentdebug deep <trajectory.json> --out deep.json
70
+ agentdebug deep <trace_id> --store-sqlite .agentdebug/errors.sqlite \
71
+ --base-url https://.../v1 --api-key sk-... --model gemini-3-flash
72
+ ```
73
+
74
+ ## 4. Cost and latency model
75
+
76
+ Per trace: roughly `1 plan + 1 hypothesize + K verify + 1 refine` calls,
77
+ where `K ≤ max_hypotheses_to_verify`. For a 4-event trace with K=3 against
78
+ Gemini-3-flash we observed 6 rounds in ~29 seconds. Use:
79
+
80
+ - `LLMJudgeAnalyzer` for inline / per-step quick scans.
81
+ - `DeepDebugAnalyzer` for postmortems, escalations, hard regressions.
82
+
83
+ ## 5. What "verify" actually does
84
+
85
+ Crucially, **verification does not re-execute the agent's tools**. It re-reads
86
+ the *existing* trajectory more carefully against the specific hypothesis.
87
+ Verdict choices:
88
+
89
+ - `corroborated` — trajectory evidence supports the hypothesis.
90
+ - `weak` — trajectory is ambiguous; no clear support either way.
91
+ - `contradicted` — trajectory explicitly conflicts with the hypothesis.
92
+
93
+ Counterfactual replay (where the agent IS re-rolled to test causality) is a
94
+ separate v2 feature; it composes with DeepDebug but is not required for it.
95
+
96
+ ## 6. Live result (v0.1, gemini-3-flash, 4-event trace)
97
+
98
+ DeepDebug on a synthetic trace where the planner called `search_web` with
99
+ `args={}` and then summarized a fabricated answer:
100
+
101
+ ```
102
+ summary : "The search agent failed to provide a query parameter to the
103
+ search tool, and the planner subsequently ignored this failure,
104
+ hallucinated a result, and terminated the task without emailing
105
+ the summary as required."
106
+ findings : action.parameter_error (s2, 1.00)
107
+ reflection.progress_misjudge (s4, 1.00)
108
+ verification.premature_stop (s4, 1.00)
109
+ root_cause : step=2, agent=search
110
+ rounds : plan (4.6s) hypothesize (11.0s)
111
+ verify:h1 (2.0s) verify:h2 (2.4s) verify:h3 (2.7s)
112
+ refine (6.4s)
113
+ ```
114
+
115
+ The single-pass `LLMJudgeAnalyzer` on the same trace returned only the first
116
+ finding. DeepDebug recovered the full cascade and selected the upstream cause.
117
+
118
+ ## 7. Failure modes
119
+
120
+ - **Cost blowout** — if `max_hypotheses_to_verify` is high and verify is
121
+ expensive, costs scale linearly. Pin a budget.
122
+ - **Hypothesis fishing** — the LLM may invent hypotheses to look thorough.
123
+ The verify round filters most of these (`contradicted` / `weak`); the
124
+ refine round drops any low-confidence stragglers.
125
+ - **Thinking-token budgets** — Gemini-3-flash and o-series models spend a
126
+ large fraction of `max_tokens` on reasoning. Default `max_tokens=4096` is
127
+ conservative; raise to 8192 for long traces or fine-grained verifiers.
128
+
129
+ ## 8. Roadmap
130
+
131
+ - Counterfactual-replay round (re-roll oracle-corrected step + re-judge).
132
+ - Step-by-step verifier mode (Who&When-style sweep).
133
+ - Adaptive K — decide how many hypotheses to verify based on round-1 spread.
134
+ - Specialized verifiers (tool-call-arg verifier, multi-agent handoff verifier).
135
+ - Persistent reasoning cache so re-runs on the same trace skip re-thought.
@@ -0,0 +1,140 @@
1
+ # 21 — Host-Runtime Integrations: Claude Code Skill + OpenHands
2
+
3
+ This doc covers how AgentDebugX plugs into existing host agent runtimes as a
4
+ sub-module the host can call on demand.
5
+
6
+ The constraints we honor:
7
+
8
+ 1. **No fork.** The integration always calls back into the locally installed
9
+ `agentdebug` CLI / Python API, never duplicates AgentDebugX code into the
10
+ host's runtime tree. Upgrading AgentDebugX upgrades the integration.
11
+ 2. **Suggest-only by default.** Recovery proposals are never auto-applied;
12
+ the host agent surfaces them to the user.
13
+ 3. **Privacy boundary.** Integrations never auto-upload to the community Hub
14
+ without explicit user opt-in.
15
+
16
+ ## 1. Claude Code Skill
17
+
18
+ `agentdebug.integrations.claude_skill` generates a Claude Code
19
+ [Skill](https://www.claude.com/claude-code) package that wraps the
20
+ AgentDebugX CLI. Skills are folders of `SKILL.md` + assets that Claude can
21
+ match against and invoke on demand.
22
+
23
+ ### Generate
24
+
25
+ ```bash
26
+ agentdebug integrations skill --target ~/.claude/skills --name agentdebug
27
+ # wrote Claude Skill -> /home/<user>/.claude/skills/agentdebug
28
+ ```
29
+
30
+ Or programmatically:
31
+
32
+ ```python
33
+ from pathlib import Path
34
+ from agentdebug.integrations import build_skill_bundle, write_skill_bundle
35
+
36
+ bundle = build_skill_bundle(name='agentdebug')
37
+ write_skill_bundle(bundle, target_dir=Path.home() / '.claude' / 'skills')
38
+ ```
39
+
40
+ ### What it contains
41
+
42
+ - `SKILL.md` — frontmatter (`name`, `description`, `license`) + trigger
43
+ phrases (e.g., "debug this agent run", "why did this agent fail", "find
44
+ the root cause", "analyze this trajectory") + a recipe for choosing
45
+ between quick-scan / judge / DeepDebug / Hub push based on user intent.
46
+ - `README.md` — refresh instructions.
47
+
48
+ ### How Claude uses it
49
+
50
+ When the user mentions a trajectory file or store path, or asks debugging
51
+ questions, Claude's skill matcher fires and Claude invokes the right CLI
52
+ command:
53
+
54
+ | User intent | Skill recipe |
55
+ |---|---|
56
+ | "what failed?" | `agentdebug analyze <file> --suggest` |
57
+ | "who caused it?" | `agentdebug judge <file\|trace_id> --attribute` |
58
+ | "I need a thorough postmortem" | `agentdebug deep <file\|trace_id>` |
59
+ | "share this with the team" | `agentdebug hub push <trace_id> --to git:...` |
60
+
61
+ ## 2. OpenHands integration
62
+
63
+ `agentdebug.integrations.openhands` ships two complementary pieces.
64
+
65
+ ### 2.1 Microagent contract (passive — host invokes us)
66
+
67
+ ```bash
68
+ agentdebug integrations openhands-microagent --target .openhands/microagents
69
+ # wrote OpenHands microagent -> .openhands/microagents/agentdebug.md
70
+ ```
71
+
72
+ The generated markdown is a [knowledge
73
+ microagent](https://github.com/All-Hands-AI/OpenHands) — Claude / GPT
74
+ running inside OpenHands reads it as context when a trigger phrase fires
75
+ ("debug agent failure", "root cause analysis", "analyze trajectory", …),
76
+ then calls the `agentdebug` CLI through OpenHands' shell tool.
77
+
78
+ ### 2.2 Event-stream bridge (active — we record OpenHands)
79
+
80
+ ```python
81
+ from agentdebug import AgentDebug
82
+ from agentdebug.integrations import OpenHandsBridge
83
+
84
+ debugger = AgentDebug()
85
+ trajectory = debugger.start_trace(goal="...", framework="openhands")
86
+ bridge = OpenHandsBridge(debugger=debugger, trajectory=trajectory).attach(
87
+ conversation.event_stream
88
+ )
89
+
90
+ # ...run the OpenHands session normally...
91
+
92
+ bridge.detach()
93
+ debugger.finish_trace(trajectory, success=success)
94
+ report = HeuristicAnalyzer().analyze(trajectory)  # from agentdebug.analyzers import HeuristicAnalyzer
95
+ ```
96
+
97
+ The bridge subscribes to `EventStreamSubscriber.MAIN` and translates each
98
+ typed `Action` / `Observation` into an `AgentEvent`. `*Action` → `TOOL_CALL`,
99
+ `*Observation` → `TOOL_RESULT`, everything else → `OBSERVATION`. The
100
+ mapping is heuristic but consistent because OpenHands' class names follow
101
+ a strict convention.
102
+
103
+ ## 3. Why two entry points?
104
+
105
+ - **Microagent** is the *deferred* path: AgentDebugX runs only when the host
106
+ agent or user asks for it. Zero overhead when idle, no instrumentation of
107
+ the host's hot loop.
108
+ - **Event-stream bridge** (`OpenHandsBridge`) is the *always-on* path: AgentDebugX captures every
109
+ session, even when nothing went wrong. Costs a small per-step overhead but
110
+ gives you a complete trajectory store you can later analyze (or push to a
111
+ Hub for review).
112
+
113
+ Most teams want both — microagent for on-demand debugging during
114
+ development, bridge for production passive observability.
115
+
116
+ ## 4. Beyond Claude Code / OpenHands
117
+
118
+ The same pattern applies to any host runtime that has a tool/skill registry
119
+ and an event stream. Concretely, the abstractions to copy are:
120
+
121
+ - A **trigger-matched documentation contract** (`SKILL.md`, a microagent `.md` file) that
122
+ describes when to invoke AgentDebugX and what arguments to pass.
123
+ - A **structured event stream subscription** that maps host events into
124
+ `AgentEvent`.
125
+
126
+ Future targets include LangGraph's checkpointer hooks, AutoGen's group-chat
127
+ event bus, and Pydantic-AI's OTel integration. See
128
+ [docs/05_adapters.md](./05_adapters.md) for the per-framework adapter plan.
129
+
130
+ ## 5. Testing
131
+
132
+ `tests/test_deep_and_integrations.py` covers:
133
+
134
+ - Skill bundle generates valid `SKILL.md` with trigger phrases.
135
+ - OpenHands microagent emits valid YAML front-matter and references
136
+ `CodeActAgent` / suggest-only safety language.
137
+
138
+ Live integration tests (Claude Code + OpenHands) are out of scope for CI —
139
+ they require the host runtimes installed. The generated artifacts are pure
140
+ files, so eyeballing them after generation is the recommended QA path.
@@ -28,6 +28,9 @@ This `docs/` directory contains the full design specification.
28
28
  | 17 | [17_claude_code_design_patterns.md](./17_claude_code_design_patterns.md) | Claude Code-inspired runtime patterns: hooks, permissions, skills, subagents, MCP, plugins |
29
29
  | 18 | [18_comparison_codex_vs_design.md](./18_comparison_codex_vs_design.md) | Comparison of the working Codex scaffold and the expanded design spec |
30
30
  | 18 | [18_comparison_codex_vs_design.md](./18_comparison_codex_vs_design.md) | Reconciliation of the Codex scaffold with this design spec — the merged v0.1 plan |
31
+ | 19 | [19_error_hub.md](./19_error_hub.md) | **Error Hub** — bundle format, Local / Git / HF backends, scrubbing |
32
+ | 20 | [20_deep_debug.md](./20_deep_debug.md) | **DeepDebug** — iterative multi-turn analysis (plan → hypothesize → verify → refine) |
33
+ | 21 | [21_integrations.md](./21_integrations.md) | **Claude Code Skill** + **OpenHands** microagent + EventStream bridge |
31
34
 
32
35
  Plus three **narrative** docs that pre-dated this engineering spec and are kept for paper-style framing:
33
36
 
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "agentdebugx"
3
- version = "0.1.0"
3
+ version = "0.2.0"
4
4
  description = "Portable error analysis, tracing, and recovery framework for agentic AI systems. Import as `agentdebug`."
5
5
  authors = ["ULab @ UIUC <ulab@illinois.edu>"]
6
6
  license = "MIT"
@@ -54,12 +54,15 @@ httpx = ">=0.24,<1.0"
54
54
  langgraph = ["langchain-core"]
55
55
  otel = ["opentelemetry-api", "opentelemetry-sdk"]
56
56
  ui = ["fastapi", "uvicorn"]
57
+ hub-hf = ["huggingface_hub"]
58
+ openhands = ["openhands-ai"]
57
59
  all = [
58
60
  "langchain-core",
59
61
  "opentelemetry-api",
60
62
  "opentelemetry-sdk",
61
63
  "fastapi",
62
64
  "uvicorn",
65
+ "huggingface_hub",
63
66
  ]
64
67
 
65
68
  [tool.poetry.scripts]
@@ -62,4 +62,4 @@ __all__ = [
62
62
  'get_failure_mode',
63
63
  ]
64
64
 
65
- __version__ = '0.1.0'
65
+ __version__ = '0.2.0'