agentdebugx 0.2.5__tar.gz → 0.2.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/PKG-INFO +1 -1
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/23_status_v0_2.md +33 -3
- agentdebugx-0.2.6/docs/benchmarks/who_when_v0_2_6_leaderboard.md +74 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/pyproject.toml +1 -1
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/__init__.py +1 -1
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/judges.py +23 -17
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/ui/server.py +52 -5
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/LICENSE +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/README.md +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/00_overview.md +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/01_literature_survey.md +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/02_architecture.md +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/03_taxonomy.md +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/04_trace_schema.md +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/05_adapters.md +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/06_detectors.md +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/07_attribution.md +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/08_recovery.md +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/09_error_database.md +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/10_taxonomy_induction.md +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/11_multimodal.md +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/12_ui_dashboard.md +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/13_class_design.md +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/14_api_reference.md +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/15_roadmap.md +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/16_governance.md +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/17_claude_code_design_patterns.md +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/18_comparison_codex_vs_design.md +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/19_error_hub.md +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/20_deep_debug.md +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/21_integrations.md +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/22_industry_track_paper_eval_plan.md +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/ERROR_TAXONOMY.md +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/OPEN_SOURCE_DEVELOPMENT_PLAN.md +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/README.md +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/RESEARCH_SURVEY.md +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/benchmarks/e2e_v0_2_3.md +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/benchmarks/e2e_v0_2_4.md +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/benchmarks/v0_1_smoke.json +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/benchmarks/v0_1_smoke.md +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/adapters/__init__.py +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/adapters/base.py +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/adapters/crewai.py +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/adapters/langgraph.py +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/adapters/otel.py +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/adapters/raw.py +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/analyzers.py +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/attribution.py +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/cli.py +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/deep.py +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/detectors.py +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/events.py +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/hub/__init__.py +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/hub/backend_base.py +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/hub/backends.py +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/hub/bundle.py +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/hub/scrub.py +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/instrumentation.py +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/integrations/__init__.py +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/integrations/claude_skill.py +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/integrations/openhands.py +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/llm.py +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/models.py +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/recorder.py +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/recovery.py +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/storage.py +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/taxonomy.py +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/traceback.py +0 -0
- {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/ui/__init__.py +0 -0
docs/23_status_v0_2.md

```diff
@@ -82,6 +82,26 @@ The audit found one real bug and a handful of test gaps:
 5. **`recovery.ReflexionSuggestion`** had only an indirect test from DeepDebug
    examples; now has direct happy + empty tests.
 
+## 3.7 Judge hardening (0.2.6)
+
+A v0.2.5 Who&When 5-trace live run had `llm_judge_root.agent_match=0.00`
+because the judge truncated mid-array on long multi-agent debate
+transcripts. Three changes in 0.2.6 lifted that to **0.40** on the same
+sample (same model, same traces):
+
+1. `LLMJudgeAnalyzer.max_tokens` default **4096 → 8192** — leaves room for
+   thinking-model reasoning tokens before the JSON object starts.
+2. `LLMJudgeAnalyzer.max_findings_per_chunk` parameter (default 6) — the
+   system prompt now asks the model to cap its findings array, forcing it
+   to close the JSON even when many candidates are visible.
+3. System prompt now has explicit "CRITICAL OUTPUT RULES" — output ONLY
+   JSON, no markdown fences, no newlines in string values, complete the
+   array.
+
+Numbers: see [docs/benchmarks/who_when_v0_2_6_leaderboard.md](./benchmarks/who_when_v0_2_6_leaderboard.md).
+Same trick works for `BinarySearchAttributor` (shipped in 0.2.4) — apply
+to remaining LLM-using analyzers as more thinking models surface this.
+
 ## 3.6 Real-usage E2E (live Gemini)
 
 Beyond unit tests, `scripts/e2e_real_usage.py` builds three realistic failing
```
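For callers who want to pin this behavior rather than rely on defaults, a minimal sketch of constructing the hardened judge explicitly. The import path is inferred from `src/agentdebug/judges.py` in this diff, and the `llm` argument is whatever client wrapper you already pass today:

```python
# Sketch: opt into the 0.2.6 hardened judge settings explicitly, so a pin
# survives any future default changes. Import path inferred from this diff.
from agentdebug.judges import LLMJudgeAnalyzer

def make_judge(llm):
    return LLMJudgeAnalyzer(
        llm,
        max_events_per_call=80,    # unchanged default
        max_evidence_chars=300,    # unchanged default
        max_tokens=8192,           # was 4096; headroom for reasoning tokens
        max_findings_per_chunk=6,  # new in 0.2.6; keeps the JSON array closable
    )
```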
docs/23_status_v0_2.md

```diff
@@ -148,6 +168,16 @@ Before v0.3 ships, this doc should record green checkmarks for:
   into `AgentEvent`s. Conformance test mocks the bus and verifies
   every documented event mapping plus the version-skew degradation
   path. `examples/crewai_demo.py` shows a working two-agent crew.
-- [
-
-
+- [x] **HuggingFace Hub round-trip live test** — shipped in 0.2.6 as
+  `tests/test_hub_huggingface_live.py`. Gated on `HF_TOKEN` +
+  `AGENTDEBUG_HF_LIVE=1` so it never runs in default CI. Creates the
+  dataset repo if missing, pushes a bundle, lists, pulls back, verifies
+  the trajectory round-trips bit-for-bit. Live-validated against
+  `KunlunZhu/agentdebugx-live-test`.
+- [x] **Bench harness with Who&When loader** — `experiments/prepare_who_when.py`
+  ingests 184 Algorithm-Generated + Hand-Crafted traces (4092 events) and
+  stores labels separately. `experiments/run_who_when_eval.py` runs all
+  4 attributors + DeepDebug against gold labels; reports agent_match,
+  exact_step, near_step. Live-Gemini 5-trace validation captured at
+  [docs/benchmarks/who_when_v0_2_6_leaderboard.md](./benchmarks/who_when_v0_2_6_leaderboard.md).
+  Headline 184-trace run deferred (~6h / ~$5-10 on a frontier model).
```
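The double env-var gate described above is what keeps the live test out of default CI. A minimal pytest sketch of the same gating idea (the actual implementation in `tests/test_hub_huggingface_live.py` may differ in detail):

```python
# Sketch of the HF_TOKEN + AGENTDEBUG_HF_LIVE=1 gate: the test only runs
# when both are set, so plain `pytest` in CI always skips it.
import os
import pytest

requires_hf_live = pytest.mark.skipif(
    not (os.environ.get("HF_TOKEN") and os.environ.get("AGENTDEBUG_HF_LIVE") == "1"),
    reason="live Hub test: set HF_TOKEN and AGENTDEBUG_HF_LIVE=1 to enable",
)

@requires_hf_live
def test_hub_round_trip():
    ...  # push bundle, list, pull back, assert byte-for-byte equality
```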
docs/benchmarks/who_when_v0_2_6_leaderboard.md (new file)

````diff
@@ -0,0 +1,74 @@
+# Who&When — 5-trace Live Leaderboard (v0.2.6, gemini-3-flash)
+
+Tiny validation sample drawn from `data/who_when/processed/labels.jsonl`
+(first 5 algorithm-generated traces). **Not a publishable benchmark** — the
+full benchmark requires the 184-trace dataset + a frontier model and is
+deferred for cost reasons. This run exists to verify the analysis stack
+produces sensible-shaped numbers and to surface regressions early.
+
+## Aggregate (per attribution method)
+
+| Method | agent_match | exact_step | near_step | both_near | DeepDebug rounds |
+|---|---:|---:|---:|---:|---:|
+| `heuristic` (rule baseline) | 0.20 | 0.00 | 0.20 | 0.20 | n/a |
+| `llm_judge_root` (judge's root_cause field) | **0.40** | 0.00 | **0.20** | **0.20** | n/a |
+| `all_at_once` (Who&When method 1) | 0.20 | 0.00 | 0.00 | 0.00 | n/a |
+| `step_by_step` (Who&When method 2) | **0.40** | 0.00 | **0.20** | **0.20** | n/a |
+| `deep_debug_root` (DeepDebug refined root) | 0.20 | 0.00 | 0.20 | 0.00 | 6 / trace |
+
+## What changed in 0.2.6 vs 0.2.5
+
+Same 5 traces, same model:
+
+| Method | 0.2.5 agent_match | 0.2.6 agent_match | Δ |
+|---|---:|---:|---:|
+| `heuristic` | 0.20 | 0.20 | — |
+| `llm_judge_root` | 0.00 | **0.40** | +0.40 |
+| `all_at_once` | 0.00 | 0.20 | +0.20 |
+| `step_by_step` | 0.00 | **0.40** | +0.40 |
+
+The driver was the v0.2.6 judge prompt hardening: `max_tokens` default
+4096 → 8192, an explicit `max_findings_per_chunk=6` cap surfaced through
+the system prompt, and a "CRITICAL OUTPUT RULES" header (output ONLY JSON,
+no markdown, no newlines in strings, complete the array). Before the
+hardening, the judge truncated mid-array on Who&When debate transcripts
+and returned no findings; after, the structured root_cause is populated.
+
+## Honest caveats
+
+* n=5; per-method standard error is ±0.22 — these absolute numbers should
+  not be over-interpreted. The 0.4 vs 0.0 jump for two methods is the
+  signal worth reporting; everything else is noise.
+* `deep_debug_root` underperformed `step_by_step` on this sample. The
+  refine round on 7-event traces tends to converge to the *visible*
+  failure rather than the *causal* root (a known Who&When difficulty —
+  manifestation vs root cause).
+* No method beats `near_step=0.20` on this sample. Step-localization
+  remains hard, matching the published Who&When ceiling (~14% step on
+  127 traces with frontier models).
+
+## Reproducing
+
+```bash
+# Prepare data (once)
+PYTHONPATH=src python experiments/prepare_who_when.py
+
+# Set live LLM creds (any OpenAI-compatible endpoint works)
+export AGENTDEBUG_LLM_BASE_URL=...
+export AGENTDEBUG_LLM_API_KEY=...
+export AGENTDEBUG_LLM_MODEL=gemini-3-flash
+
+# Without DeepDebug (~1 min)
+PYTHONPATH=src python experiments/run_who_when_eval.py \
+  --limit 5 --live-openai \
+  --out-dir experiments/runs/who_when_eval_subset
+
+# With DeepDebug (~5 min)
+PYTHONPATH=src python experiments/run_who_when_eval.py \
+  --limit 5 --live-openai --deep \
+  --out-dir experiments/runs/who_when_eval_subset_deep
+```
+
+The headline benchmark (184 traces × 5 methods × DeepDebug) would take
+~6 hours and ~$5-10 in API cost on a frontier model. Run it once before
+paper submission; do not run on every iteration.
````
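For reference, the ±0.22 quoted under "Honest caveats" is the binomial standard error sqrt(p(1-p)/n) at the observed rates with n=5, a quick check:

```python
# Verify the +-0.22 standard-error claim: binomial SE at the observed
# agent_match rates (0.2 and 0.4) with n=5 traces.
from math import sqrt

n = 5
for p in (0.2, 0.4):
    print(f"p={p}: SE = {sqrt(p * (1 - p) / n):.2f}")
# p=0.2: SE = 0.18
# p=0.4: SE = 0.22   <- the quoted +-0.22 is the worst case among reported rates
```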
src/agentdebug/judges.py

```diff
@@ -39,21 +39,18 @@ the allowed failure mode codes. Be conservative — only flag steps where the
 evidence in the event payload supports the label. If the trajectory contains no
 failure, return an empty findings list.
 
-
+CRITICAL OUTPUT RULES (these maximize the chance your reply parses):
+1. Output ONLY a JSON object. No prose before/after. No markdown fences.
+2. Cap the findings array at {max_findings} entries — pick the most important.
+3. Keep each "evidence" entry under 120 characters; keep each "rationale" /
+   "summary" under 200 characters.
+4. Do NOT include newlines inside string values.
+5. Emit the JSON object COMPLETE — never stop mid-key or mid-array.
 
-
-
-
-
-      "step_index": <int or null>,
-      "agent_name": "<agent_name from the input>",
-      "failure_mode_id": "<one of the allowed codes>",
-      "confidence": <float between 0 and 1>,
-      "evidence": ["<short quote or summary of the supporting payload>"]
-    }
-  ],
-  "summary": "<one-sentence diagnosis or 'No failure detected.'>"
-}
+Schema (compact — fields in this order):
+{{"findings":[{{"event_id":"...", "step_index":N|null, "agent_name":"...",
+"failure_mode_id":"...", "confidence":0..1, "evidence":["..."]}}, ...],
+"summary":"<short>"}}
 """
 
 
```
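The doubled braces in the new compact schema exist because the prompt is now passed through `str.format` (see the constructor and chunk hunks below): `{{`/`}}` survive as literal braces while `{max_findings}` is substituted. A toy reproduction:

```python
# Why the schema uses {{ }}: str.format treats doubled braces as literals,
# so only the {max_findings} placeholder is substituted.
template = 'Cap at {max_findings}. Schema: {{"findings": [...], "summary": "..."}}'
print(template.format(max_findings=6))
# Cap at 6. Schema: {"findings": [...], "summary": "..."}
```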
src/agentdebug/judges.py

```diff
@@ -66,15 +63,21 @@ class LLMJudgeAnalyzer:
         *,
         max_events_per_call: int = 80,
         max_evidence_chars: int = 300,
-        max_tokens: int = 4096,
+        max_tokens: int = 8192,
+        max_findings_per_chunk: int = 6,
     ) -> None:
         self.llm = llm
         self.max_events_per_call = max_events_per_call
         self.max_evidence_chars = max_evidence_chars
         # NOTE: thinking models (Gemini 2.x/3.x, o-series) spend a substantial
         # fraction of `max_tokens` on reasoning tokens before any text is
-        # emitted.
+        # emitted. 8192 is the safe default after the v0.2.6 Who&When debate-
+        # trace observation that 4096 truncated mid-array on long traces.
         self.max_tokens = max_tokens
+        # The system prompt asks the model to cap its findings array so the
+        # JSON closes even when many candidate failures exist; the cap is
+        # injected via the prompt's {max_findings} placeholder.
+        self.max_findings_per_chunk = max_findings_per_chunk
 
     def analyze(self, trajectory: AgentTrajectory) -> DiagnosticReport:
         events = trajectory.events
```
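Since `llm.complete(messages=..., max_tokens=...)` is the only LLM call made in the chunk path (next hunk), a recording stub is enough to assert the new 8192 pass-through offline. A sketch; the string return shape is an assumption for illustration:

```python
# Minimal stub satisfying the one interface the analyzer uses here:
# llm.complete(messages=..., max_tokens=...). Records every call so a test
# can assert the hardened defaults reach the client without a live model.
class RecordingStubLLM:
    def __init__(self):
        self.calls = []

    def complete(self, *, messages, max_tokens):
        self.calls.append({"messages": messages, "max_tokens": max_tokens})
        return '{"findings": [], "summary": "No failure detected."}'

# After judge.analyze(trajectory), stub.calls[0]["max_tokens"] should be 8192.
```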
src/agentdebug/judges.py

```diff
@@ -121,8 +124,11 @@ class LLMJudgeAnalyzer:
         self, trajectory: AgentTrajectory, chunk: List[AgentEvent]
     ) -> tuple[List[FailureFinding], str]:
         user = self._render_user_prompt(trajectory, chunk)
+        # Inject the max_findings cap into the system prompt at format time so
+        # we can tune it per-call without forking the prompt.
+        system = _SYSTEM_PROMPT.format(max_findings=self.max_findings_per_chunk)
         messages = [
-            {'role': 'system', 'content': _SYSTEM_PROMPT},
+            {'role': 'system', 'content': system},
             {'role': 'user', 'content': user},
         ]
         result = self.llm.complete(messages=messages, max_tokens=self.max_tokens)
```
src/agentdebug/ui/server.py

```diff
@@ -211,8 +211,10 @@ _INDEX_HTML = """<!doctype html>
 .button {
   border:1px solid #373b3a; border-radius:8px; background:#1a1c1c; color:var(--fg);
   height:32px; padding:0 11px; font-size:12px; display:inline-flex;
-  align-items:center; gap:7px;
+  align-items:center; justify-content:center; gap:7px; cursor:pointer;
+  font-family:inherit; white-space:nowrap;
 }
+.button:hover { border-color:#4b5250; background:#202323; }
 .button.primary { border-color:#356568; color:#d8fdff; background:#173033; }
 .content { padding:22px; max-width:1440px; margin:0 auto; }
 .hero {
```
```diff
@@ -330,6 +332,18 @@ _INDEX_HTML = """<!doctype html>
   .topbar { position:static; }
   .trace-legend, .trace-pair { grid-template-columns:1fr; }
 }
+@media (max-width: 640px) {
+  .topbar { display:grid; grid-template-columns:1fr; align-items:start; padding:14px 16px; }
+  .top-actions { width:100%; display:grid; grid-template-columns:repeat(3,minmax(0,1fr)); }
+  .button { width:100%; min-width:0; padding:0 8px; overflow:hidden; text-overflow:ellipsis; }
+  .content { padding:22px 16px; }
+  h1 { font-size:27px; line-height:1.1; }
+  .stats { grid-template-columns:repeat(2,minmax(0,1fr)); }
+  .root-grid { grid-template-columns:1fr; }
+  .event { grid-template-columns:46px minmax(0,1fr); padding:10px; }
+  .step-index { width:38px; height:38px; }
+  .event-grid { grid-template-columns:1fr; }
+}
 </style>
 </head>
 <body>
```
```diff
@@ -358,9 +372,9 @@ _INDEX_HTML = """<!doctype html>
     <div class="brand-sub" id="trace-count">Loading traces</div>
   </div>
   <div class="top-actions">
-    <
-    <
-    <
+    <button class="button" id="analyze-btn" type="button">Analyze</button>
+    <button class="button" id="export-btn" type="button">Bundle</button>
+    <button class="button primary" id="hub-btn" type="button">Hub</button>
   </div>
 </div>
 <div class="content" id="detail">
```
```diff
@@ -370,6 +384,8 @@ _INDEX_HTML = """<!doctype html>
 </div>
 <script>
 const BOOTSTRAP = __BOOTSTRAP_JSON__;
+let CURRENT_TRACE_ID = null;
+let CURRENT_TRACE_DATA = null;
 async function api(path) {
   const r = await fetch(path);
   if (!r.ok) throw new Error('HTTP ' + r.status);
```
```diff
@@ -430,9 +446,11 @@ function renderTraceList(traceIds, selectedId) {
 async function selectTrace(tid, li) {
   document.querySelectorAll('.run').forEach(el => el.classList.remove('active'));
   li.classList.add('active');
+  CURRENT_TRACE_ID = tid;
   document.getElementById('detail').innerHTML = '<div class="empty">Loading trace...</div>';
   try {
     const data = await api('/api/v1/traces/' + encodeURIComponent(tid));
+    CURRENT_TRACE_DATA = data;
     renderTrace(data.trajectory, data.report);
   } catch (e) {
     document.getElementById('detail').innerHTML = '<div class="empty">' + escapeHtml(e) + '</div>';
```
```diff
@@ -485,7 +503,7 @@ function renderTrace(traj, report) {
   for (const f of findings) html += renderFinding(f);
   html += '</div></div></div>';
 
-  html += '<div class="panel"><div class="panel-head"><div class="panel-title">Use Case Flow</div><span class="chip cyan">Error Hub</span></div><div class="panel-body"><div class="flow">';
+  html += '<div class="panel" id="error-hub-flow"><div class="panel-head"><div class="panel-title">Use Case Flow</div><span class="chip cyan">Error Hub</span></div><div class="panel-body"><div class="flow">';
   html += flow(1, 'Capture trajectory from the running agent with the lightweight recorder or adapter.');
   html += flow(2, 'Diagnose the trace, localize the likely root cause, and generate recovery suggestions.');
   html += flow(3, 'Scrub secrets and PII, package a reproducible error bundle, and publish to Git or Hugging Face.');
```
```diff
@@ -566,6 +584,32 @@ function renderEvent(ev, isRoot, finding) {
   html += '</div></div></div>';
   return html;
 }
+function downloadJson(filename, value) {
+  const blob = new Blob([JSON.stringify(value, null, 2)], {type: 'application/json'});
+  const url = URL.createObjectURL(blob);
+  const a = document.createElement('a');
+  a.href = url;
+  a.download = filename;
+  document.body.appendChild(a);
+  a.click();
+  a.remove();
+  URL.revokeObjectURL(url);
+}
+function bindTopActions() {
+  document.getElementById('analyze-btn').onclick = () => {
+    const active = document.querySelector('.run.active');
+    if (CURRENT_TRACE_ID && active) selectTrace(CURRENT_TRACE_ID, active);
+  };
+  document.getElementById('export-btn').onclick = () => {
+    if (!CURRENT_TRACE_DATA) return;
+    const name = (CURRENT_TRACE_ID || 'trace') + '.agentdebugx.report.json';
+    downloadJson(name, CURRENT_TRACE_DATA);
+  };
+  document.getElementById('hub-btn').onclick = () => {
+    const flow = document.getElementById('error-hub-flow');
+    if (flow) flow.scrollIntoView({behavior: 'smooth', block: 'start'});
+  };
+}
 function field(label, value, isError) {
   return '<div class="field ' + (isError ? 'error' : '') + '"><div class="field-label">' + escapeHtml(label) + '</div><div class="field-value">' + escapeHtml(value || '-') + '</div></div>';
 }
```
```diff
@@ -585,11 +629,14 @@ if (BOOTSTRAP && BOOTSTRAP.traces) {
   const selected = BOOTSTRAP.selected ? BOOTSTRAP.selected.trajectory.trace_id : null;
   renderTraceList(BOOTSTRAP.traces, selected);
   if (BOOTSTRAP.selected) {
+    CURRENT_TRACE_ID = selected;
+    CURRENT_TRACE_DATA = BOOTSTRAP.selected;
     renderTrace(BOOTSTRAP.selected.trajectory, BOOTSTRAP.selected.report);
   } else {
     document.getElementById('detail').innerHTML = '<div class="empty">No traces in store.</div>';
   }
 }
+bindTopActions();
 loadTraceList(!(BOOTSTRAP && BOOTSTRAP.selected));
 </script>
 </body>
```