agentdebugx 0.2.5__tar.gz → 0.2.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/PKG-INFO +1 -1
  2. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/23_status_v0_2.md +33 -3
  3. agentdebugx-0.2.6/docs/benchmarks/who_when_v0_2_6_leaderboard.md +74 -0
  4. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/pyproject.toml +1 -1
  5. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/__init__.py +1 -1
  6. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/judges.py +23 -17
  7. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/ui/server.py +52 -5
  8. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/LICENSE +0 -0
  9. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/README.md +0 -0
  10. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/00_overview.md +0 -0
  11. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/01_literature_survey.md +0 -0
  12. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/02_architecture.md +0 -0
  13. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/03_taxonomy.md +0 -0
  14. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/04_trace_schema.md +0 -0
  15. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/05_adapters.md +0 -0
  16. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/06_detectors.md +0 -0
  17. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/07_attribution.md +0 -0
  18. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/08_recovery.md +0 -0
  19. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/09_error_database.md +0 -0
  20. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/10_taxonomy_induction.md +0 -0
  21. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/11_multimodal.md +0 -0
  22. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/12_ui_dashboard.md +0 -0
  23. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/13_class_design.md +0 -0
  24. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/14_api_reference.md +0 -0
  25. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/15_roadmap.md +0 -0
  26. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/16_governance.md +0 -0
  27. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/17_claude_code_design_patterns.md +0 -0
  28. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/18_comparison_codex_vs_design.md +0 -0
  29. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/19_error_hub.md +0 -0
  30. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/20_deep_debug.md +0 -0
  31. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/21_integrations.md +0 -0
  32. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/22_industry_track_paper_eval_plan.md +0 -0
  33. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/ERROR_TAXONOMY.md +0 -0
  34. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/OPEN_SOURCE_DEVELOPMENT_PLAN.md +0 -0
  35. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/README.md +0 -0
  36. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/RESEARCH_SURVEY.md +0 -0
  37. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/benchmarks/e2e_v0_2_3.md +0 -0
  38. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/benchmarks/e2e_v0_2_4.md +0 -0
  39. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/benchmarks/v0_1_smoke.json +0 -0
  40. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/docs/benchmarks/v0_1_smoke.md +0 -0
  41. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/adapters/__init__.py +0 -0
  42. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/adapters/base.py +0 -0
  43. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/adapters/crewai.py +0 -0
  44. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/adapters/langgraph.py +0 -0
  45. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/adapters/otel.py +0 -0
  46. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/adapters/raw.py +0 -0
  47. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/analyzers.py +0 -0
  48. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/attribution.py +0 -0
  49. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/cli.py +0 -0
  50. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/deep.py +0 -0
  51. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/detectors.py +0 -0
  52. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/events.py +0 -0
  53. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/hub/__init__.py +0 -0
  54. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/hub/backend_base.py +0 -0
  55. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/hub/backends.py +0 -0
  56. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/hub/bundle.py +0 -0
  57. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/hub/scrub.py +0 -0
  58. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/instrumentation.py +0 -0
  59. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/integrations/__init__.py +0 -0
  60. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/integrations/claude_skill.py +0 -0
  61. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/integrations/openhands.py +0 -0
  62. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/llm.py +0 -0
  63. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/models.py +0 -0
  64. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/recorder.py +0 -0
  65. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/recovery.py +0 -0
  66. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/storage.py +0 -0
  67. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/taxonomy.py +0 -0
  68. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/traceback.py +0 -0
  69. {agentdebugx-0.2.5 → agentdebugx-0.2.6}/src/agentdebug/ui/__init__.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: agentdebugx
3
- Version: 0.2.5
3
+ Version: 0.2.6
4
4
  Summary: Portable error analysis, tracing, and recovery framework for agentic AI systems. Import as `agentdebug`.
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -82,6 +82,26 @@ The audit found one real bug and a handful of test gaps:
82
82
  5. **`recovery.ReflexionSuggestion`** had only an indirect test from DeepDebug
83
83
  examples; now has direct happy + empty tests.
84
84
 
85
+ ## 3.7 Judge hardening (0.2.6)
86
+
87
+ A v0.2.5 Who&When 5-trace live run had `llm_judge_root.agent_match=0.00`
88
+ because the judge truncated mid-array on long multi-agent debate
89
+ transcripts. Three changes in 0.2.6 lifted that to **0.40** on the same
90
+ sample (same model, same traces):
91
+
92
+ 1. `LLMJudgeAnalyzer.max_tokens` default **4096 → 8192** — leaves room for
93
+ thinking-model reasoning tokens before the JSON object starts.
94
+ 2. `LLMJudgeAnalyzer.max_findings_per_chunk` parameter (default 6) — the
95
+ system prompt now asks the model to cap its findings array, forcing it
96
+ to close the JSON even when many candidates are visible.
97
+ 3. System prompt now has explicit "CRITICAL OUTPUT RULES" — output ONLY
98
+ JSON, no markdown fences, no newlines in string values, complete the
99
+ array.
100
+
101
+ Numbers: see [docs/benchmarks/who_when_v0_2_6_leaderboard.md](./benchmarks/who_when_v0_2_6_leaderboard.md).
102
+ Same trick works for `BinarySearchAttributor` (shipped in 0.2.4) — apply
103
+ to the remaining LLM-using analyzers as more thinking models surface this same mid-array truncation failure.
104
+
85
105
  ## 3.6 Real-usage E2E (live Gemini)
86
106
 
87
107
  Beyond unit tests, `scripts/e2e_real_usage.py` builds three realistic failing
@@ -148,6 +168,16 @@ Before v0.3 ships, this doc should record green checkmarks for:
148
168
  into `AgentEvent`s. Conformance test mocks the bus and verifies
149
169
  every documented event mapping plus the version-skew degradation
150
170
  path. `examples/crewai_demo.py` shows a working two-agent crew.
151
- - [ ] HuggingFace Hub round-trip live test (gated on `HF_TOKEN`).
152
- - [ ] Bench harness extended with one published-benchmark loader (Who&When
153
- is the obvious first target we already cite its method).
171
+ - [x] **HuggingFace Hub round-trip live test** shipped in 0.2.6 as
172
+ `tests/test_hub_huggingface_live.py`. Gated on `HF_TOKEN` +
173
+ `AGENTDEBUG_HF_LIVE=1` so it never runs in default CI. Creates the
174
+ dataset repo if missing, pushes a bundle, lists, pulls back, verifies
175
+ the trajectory round-trips bit-for-bit. Live-validated against
176
+ `KunlunZhu/agentdebugx-live-test`.
177
+ - [x] **Bench harness with Who&When loader** — `experiments/prepare_who_when.py`
178
+ ingests 184 Algorithm-Generated + Hand-Crafted traces (4092 events) and
179
+ stores labels separately. `experiments/run_who_when_eval.py` runs all
180
+ 4 attributors + DeepDebug against gold labels; reports agent_match,
181
+ exact_step, near_step. Live-Gemini 5-trace validation captured at
182
+ [docs/benchmarks/who_when_v0_2_6_leaderboard.md](./benchmarks/who_when_v0_2_6_leaderboard.md).
183
+ Headline 184-trace run deferred (~6h / ~$5-10 on a frontier model).
@@ -0,0 +1,74 @@
1
+ # Who&When — 5-trace Live Leaderboard (v0.2.6, gemini-3-flash)
2
+
3
+ Tiny validation sample drawn from `data/who_when/processed/labels.jsonl`
4
+ (first 5 algorithm-generated traces). **Not a publishable benchmark** — the
5
+ full benchmark requires the 184-trace dataset + a frontier model and is
6
+ deferred for cost reasons. This run exists to verify the analysis stack
7
+ produces sensible-shaped numbers and to surface regressions early.
8
+
9
+ ## Aggregate (per attribution method)
10
+
11
+ | Method | agent_match | exact_step | near_step | both_near | DeepDebug rounds |
12
+ |---|---:|---:|---:|---:|---:|
13
+ | `heuristic` (rule baseline) | 0.20 | 0.00 | 0.20 | 0.20 | n/a |
14
+ | `llm_judge_root` (judge's root_cause field) | **0.40** | 0.00 | **0.20** | **0.20** | n/a |
15
+ | `all_at_once` (Who&When method 1) | 0.20 | 0.00 | 0.00 | 0.00 | n/a |
16
+ | `step_by_step` (Who&When method 2) | **0.40** | 0.00 | **0.20** | **0.20** | n/a |
17
+ | `deep_debug_root` (DeepDebug refined root) | 0.20 | 0.00 | 0.20 | 0.00 | 6 / trace |
18
+
19
+ ## What changed in 0.2.6 vs 0.2.5
20
+
21
+ Same 5 traces, same model:
22
+
23
+ | Method | 0.2.5 agent_match | 0.2.6 agent_match | Δ |
24
+ |---|---:|---:|---:|
25
+ | `heuristic` | 0.20 | 0.20 | — |
26
+ | `llm_judge_root` | 0.00 | **0.40** | +0.40 |
27
+ | `all_at_once` | 0.00 | 0.20 | +0.20 |
28
+ | `step_by_step` | 0.00 | **0.40** | +0.40 |
29
+
30
+ The driver was the v0.2.6 judge prompt hardening: `max_tokens` default
31
+ 4096 → 8192, an explicit `max_findings_per_chunk=6` cap surfaced through
32
+ the system prompt, and a "CRITICAL OUTPUT RULES" header (output ONLY JSON,
33
+ no markdown, no newlines in strings, complete the array). Before the
34
+ hardening, the judge truncated mid-array on Who&When debate transcripts
35
+ and returned no findings; after, the structured root_cause is populated.
36
+
37
+ ## Honest caveats
38
+
39
+ * n=5; per-method standard error is ±0.22 — these absolute numbers should
40
+ not be over-interpreted. The 0.4 vs 0.0 jump for two methods is the
41
+ signal worth reporting; everything else is noise.
42
+ * `deep_debug_root` underperformed `step_by_step` on this sample. The
43
+ refine round on 7-event traces tends to converge to the *visible*
44
+ failure rather than the *causal* root (a known Who&When difficulty —
45
+ manifestation vs root cause).
46
+ * No method beats `near_step=0.20` on this sample. Step-localization
47
+ remains hard, matching the published Who&When ceiling (~14% step accuracy
48
+ on 127 traces with frontier models).
49
+
50
+ ## Reproducing
51
+
52
+ ```bash
53
+ # Prepare data (once)
54
+ PYTHONPATH=src python experiments/prepare_who_when.py
55
+
56
+ # Set live LLM creds (any OpenAI-compatible endpoint works)
57
+ export AGENTDEBUG_LLM_BASE_URL=...
58
+ export AGENTDEBUG_LLM_API_KEY=...
59
+ export AGENTDEBUG_LLM_MODEL=gemini-3-flash
60
+
61
+ # Without DeepDebug (~1 min)
62
+ PYTHONPATH=src python experiments/run_who_when_eval.py \
63
+ --limit 5 --live-openai \
64
+ --out-dir experiments/runs/who_when_eval_subset
65
+
66
+ # With DeepDebug (~5 min)
67
+ PYTHONPATH=src python experiments/run_who_when_eval.py \
68
+ --limit 5 --live-openai --deep \
69
+ --out-dir experiments/runs/who_when_eval_subset_deep
70
+ ```
71
+
72
+ The headline benchmark (184 traces × 5 methods × DeepDebug) would take
73
+ ~6 hours and ~$5-10 in API cost on a frontier model. Run it once before
74
+ paper submission; do not run on every iteration.
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "agentdebugx"
3
- version = "0.2.5"
3
+ version = "0.2.6"
4
4
  description = "Portable error analysis, tracing, and recovery framework for agentic AI systems. Import as `agentdebug`."
5
5
  authors = ["ULab @ UIUC <ulab@illinois.edu>"]
6
6
  license = "MIT"
@@ -96,4 +96,4 @@ __all__ = [
96
96
  'get_failure_mode',
97
97
  ]
98
98
 
99
- __version__ = '0.2.5'
99
+ __version__ = '0.2.6'
@@ -39,21 +39,18 @@ the allowed failure mode codes. Be conservative — only flag steps where the
39
39
  evidence in the event payload supports the label. If the trajectory contains no
40
40
  failure, return an empty findings list.
41
41
 
42
- Respond ONLY with a JSON object matching this schema (no prose, no markdown):
42
+ CRITICAL OUTPUT RULES (these maximize the chance your reply parses):
43
+ 1. Output ONLY a JSON object. No prose before/after. No markdown fences.
44
+ 2. Cap the findings array at {max_findings} entries — pick the most important.
45
+ 3. Keep each "evidence" entry under 120 characters; keep each "rationale" /
46
+ "summary" under 200 characters.
47
+ 4. Do NOT include newlines inside string values.
48
+ 5. Emit the JSON object COMPLETE — never stop mid-key or mid-array.
43
49
 
44
- {
45
- "findings": [
46
- {
47
- "event_id": "<event_id from the input>",
48
- "step_index": <int or null>,
49
- "agent_name": "<agent_name from the input>",
50
- "failure_mode_id": "<one of the allowed codes>",
51
- "confidence": <float between 0 and 1>,
52
- "evidence": ["<short quote or summary of the supporting payload>"]
53
- }
54
- ],
55
- "summary": "<one-sentence diagnosis or 'No failure detected.'>"
56
- }
50
+ Schema (compact — fields in this order):
51
+ {{"findings":[{{"event_id":"...", "step_index":N|null, "agent_name":"...",
52
+ "failure_mode_id":"...", "confidence":0..1, "evidence":["..."]}}, ...],
53
+ "summary":"<short>"}}
57
54
  """
58
55
 
59
56
 
@@ -66,15 +63,21 @@ class LLMJudgeAnalyzer:
66
63
  *,
67
64
  max_events_per_call: int = 80,
68
65
  max_evidence_chars: int = 300,
69
- max_tokens: int = 4096,
66
+ max_tokens: int = 8192,
67
+ max_findings_per_chunk: int = 6,
70
68
  ) -> None:
71
69
  self.llm = llm
72
70
  self.max_events_per_call = max_events_per_call
73
71
  self.max_evidence_chars = max_evidence_chars
74
72
  # NOTE: thinking models (Gemini 2.x/3.x, o-series) spend a substantial
75
73
  # fraction of `max_tokens` on reasoning tokens before any text is
76
- # emitted. 4096 is the safe default; bump higher for long traces.
74
+ # emitted. 8192 is the safe default after the v0.2.6 Who&When debate-
75
+ # trace observation that 4096 truncated mid-array on long traces.
77
76
  self.max_tokens = max_tokens
77
+ # The system prompt asks the model to cap its findings array so the
78
+ # JSON closes even when many candidate failures exist. Reuse the prompt
79
+ # placeholder for this cap.
80
+ self.max_findings_per_chunk = max_findings_per_chunk
78
81
 
79
82
  def analyze(self, trajectory: AgentTrajectory) -> DiagnosticReport:
80
83
  events = trajectory.events
@@ -121,8 +124,11 @@ class LLMJudgeAnalyzer:
121
124
  self, trajectory: AgentTrajectory, chunk: List[AgentEvent]
122
125
  ) -> tuple[List[FailureFinding], str]:
123
126
  user = self._render_user_prompt(trajectory, chunk)
127
+ # Inject the max_findings cap into the system prompt at format time so
128
+ # we can tune it per-call without forking the prompt.
129
+ system = _SYSTEM_PROMPT.format(max_findings=self.max_findings_per_chunk)
124
130
  messages = [
125
- {'role': 'system', 'content': _SYSTEM_PROMPT},
131
+ {'role': 'system', 'content': system},
126
132
  {'role': 'user', 'content': user},
127
133
  ]
128
134
  result = self.llm.complete(messages=messages, max_tokens=self.max_tokens)
@@ -211,8 +211,10 @@ _INDEX_HTML = """<!doctype html>
211
211
  .button {
212
212
  border:1px solid #373b3a; border-radius:8px; background:#1a1c1c; color:var(--fg);
213
213
  height:32px; padding:0 11px; font-size:12px; display:inline-flex;
214
- align-items:center; gap:7px;
214
+ align-items:center; justify-content:center; gap:7px; cursor:pointer;
215
+ font-family:inherit; white-space:nowrap;
215
216
  }
217
+ .button:hover { border-color:#4b5250; background:#202323; }
216
218
  .button.primary { border-color:#356568; color:#d8fdff; background:#173033; }
217
219
  .content { padding:22px; max-width:1440px; margin:0 auto; }
218
220
  .hero {
@@ -330,6 +332,18 @@ _INDEX_HTML = """<!doctype html>
330
332
  .topbar { position:static; }
331
333
  .trace-legend, .trace-pair { grid-template-columns:1fr; }
332
334
  }
335
+ @media (max-width: 640px) {
336
+ .topbar { display:grid; grid-template-columns:1fr; align-items:start; padding:14px 16px; }
337
+ .top-actions { width:100%; display:grid; grid-template-columns:repeat(3,minmax(0,1fr)); }
338
+ .button { width:100%; min-width:0; padding:0 8px; overflow:hidden; text-overflow:ellipsis; }
339
+ .content { padding:22px 16px; }
340
+ h1 { font-size:27px; line-height:1.1; }
341
+ .stats { grid-template-columns:repeat(2,minmax(0,1fr)); }
342
+ .root-grid { grid-template-columns:1fr; }
343
+ .event { grid-template-columns:46px minmax(0,1fr); padding:10px; }
344
+ .step-index { width:38px; height:38px; }
345
+ .event-grid { grid-template-columns:1fr; }
346
+ }
333
347
  </style>
334
348
  </head>
335
349
  <body>
@@ -358,9 +372,9 @@ _INDEX_HTML = """<!doctype html>
358
372
  <div class="brand-sub" id="trace-count">Loading traces</div>
359
373
  </div>
360
374
  <div class="top-actions">
361
- <span class="button">Analyze</span>
362
- <span class="button">Export Bundle</span>
363
- <span class="button primary">Open Error Hub</span>
375
+ <button class="button" id="analyze-btn" type="button">Analyze</button>
376
+ <button class="button" id="export-btn" type="button">Bundle</button>
377
+ <button class="button primary" id="hub-btn" type="button">Hub</button>
364
378
  </div>
365
379
  </div>
366
380
  <div class="content" id="detail">
@@ -370,6 +384,8 @@ _INDEX_HTML = """<!doctype html>
370
384
  </div>
371
385
  <script>
372
386
  const BOOTSTRAP = __BOOTSTRAP_JSON__;
387
+ let CURRENT_TRACE_ID = null;
388
+ let CURRENT_TRACE_DATA = null;
373
389
  async function api(path) {
374
390
  const r = await fetch(path);
375
391
  if (!r.ok) throw new Error('HTTP ' + r.status);
@@ -430,9 +446,11 @@ function renderTraceList(traceIds, selectedId) {
430
446
  async function selectTrace(tid, li) {
431
447
  document.querySelectorAll('.run').forEach(el => el.classList.remove('active'));
432
448
  li.classList.add('active');
449
+ CURRENT_TRACE_ID = tid;
433
450
  document.getElementById('detail').innerHTML = '<div class="empty">Loading trace...</div>';
434
451
  try {
435
452
  const data = await api('/api/v1/traces/' + encodeURIComponent(tid));
453
+ CURRENT_TRACE_DATA = data;
436
454
  renderTrace(data.trajectory, data.report);
437
455
  } catch (e) {
438
456
  document.getElementById('detail').innerHTML = '<div class="empty">' + escapeHtml(e) + '</div>';
@@ -485,7 +503,7 @@ function renderTrace(traj, report) {
485
503
  for (const f of findings) html += renderFinding(f);
486
504
  html += '</div></div></div>';
487
505
 
488
- html += '<div class="panel"><div class="panel-head"><div class="panel-title">Use Case Flow</div><span class="chip cyan">Error Hub</span></div><div class="panel-body"><div class="flow">';
506
+ html += '<div class="panel" id="error-hub-flow"><div class="panel-head"><div class="panel-title">Use Case Flow</div><span class="chip cyan">Error Hub</span></div><div class="panel-body"><div class="flow">';
489
507
  html += flow(1, 'Capture trajectory from the running agent with the lightweight recorder or adapter.');
490
508
  html += flow(2, 'Diagnose the trace, localize the likely root cause, and generate recovery suggestions.');
491
509
  html += flow(3, 'Scrub secrets and PII, package a reproducible error bundle, and publish to Git or Hugging Face.');
@@ -566,6 +584,32 @@ function renderEvent(ev, isRoot, finding) {
566
584
  html += '</div></div></div>';
567
585
  return html;
568
586
  }
587
+ function downloadJson(filename, value) {
588
+ const blob = new Blob([JSON.stringify(value, null, 2)], {type: 'application/json'});
589
+ const url = URL.createObjectURL(blob);
590
+ const a = document.createElement('a');
591
+ a.href = url;
592
+ a.download = filename;
593
+ document.body.appendChild(a);
594
+ a.click();
595
+ a.remove();
596
+ URL.revokeObjectURL(url);
597
+ }
598
+ function bindTopActions() {
599
+ document.getElementById('analyze-btn').onclick = () => {
600
+ const active = document.querySelector('.run.active');
601
+ if (CURRENT_TRACE_ID && active) selectTrace(CURRENT_TRACE_ID, active);
602
+ };
603
+ document.getElementById('export-btn').onclick = () => {
604
+ if (!CURRENT_TRACE_DATA) return;
605
+ const name = (CURRENT_TRACE_ID || 'trace') + '.agentdebugx.report.json';
606
+ downloadJson(name, CURRENT_TRACE_DATA);
607
+ };
608
+ document.getElementById('hub-btn').onclick = () => {
609
+ const flow = document.getElementById('error-hub-flow');
610
+ if (flow) flow.scrollIntoView({behavior: 'smooth', block: 'start'});
611
+ };
612
+ }
569
613
  function field(label, value, isError) {
570
614
  return '<div class="field ' + (isError ? 'error' : '') + '"><div class="field-label">' + escapeHtml(label) + '</div><div class="field-value">' + escapeHtml(value || '-') + '</div></div>';
571
615
  }
@@ -585,11 +629,14 @@ if (BOOTSTRAP && BOOTSTRAP.traces) {
585
629
  const selected = BOOTSTRAP.selected ? BOOTSTRAP.selected.trajectory.trace_id : null;
586
630
  renderTraceList(BOOTSTRAP.traces, selected);
587
631
  if (BOOTSTRAP.selected) {
632
+ CURRENT_TRACE_ID = selected;
633
+ CURRENT_TRACE_DATA = BOOTSTRAP.selected;
588
634
  renderTrace(BOOTSTRAP.selected.trajectory, BOOTSTRAP.selected.report);
589
635
  } else {
590
636
  document.getElementById('detail').innerHTML = '<div class="empty">No traces in store.</div>';
591
637
  }
592
638
  }
639
+ bindTopActions();
593
640
  loadTraceList(!(BOOTSTRAP && BOOTSTRAP.selected));
594
641
  </script>
595
642
  </body>
File without changes
File without changes
File without changes