agentdebugx 0.2.5__tar.gz → 0.2.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/PKG-INFO +1 -1
  2. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/docs/23_status_v0_2.md +35 -4
  3. agentdebugx-0.2.7/docs/benchmarks/who_when_v0_2_6_leaderboard.md +74 -0
  4. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/pyproject.toml +1 -1
  5. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/src/agentdebug/__init__.py +3 -1
  6. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/src/agentdebug/attribution.py +186 -1
  7. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/src/agentdebug/judges.py +23 -17
  8. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/src/agentdebug/ui/server.py +52 -5
  9. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/LICENSE +0 -0
  10. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/README.md +0 -0
  11. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/docs/00_overview.md +0 -0
  12. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/docs/01_literature_survey.md +0 -0
  13. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/docs/02_architecture.md +0 -0
  14. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/docs/03_taxonomy.md +0 -0
  15. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/docs/04_trace_schema.md +0 -0
  16. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/docs/05_adapters.md +0 -0
  17. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/docs/06_detectors.md +0 -0
  18. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/docs/07_attribution.md +0 -0
  19. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/docs/08_recovery.md +0 -0
  20. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/docs/09_error_database.md +0 -0
  21. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/docs/10_taxonomy_induction.md +0 -0
  22. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/docs/11_multimodal.md +0 -0
  23. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/docs/12_ui_dashboard.md +0 -0
  24. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/docs/13_class_design.md +0 -0
  25. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/docs/14_api_reference.md +0 -0
  26. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/docs/15_roadmap.md +0 -0
  27. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/docs/16_governance.md +0 -0
  28. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/docs/17_claude_code_design_patterns.md +0 -0
  29. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/docs/18_comparison_codex_vs_design.md +0 -0
  30. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/docs/19_error_hub.md +0 -0
  31. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/docs/20_deep_debug.md +0 -0
  32. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/docs/21_integrations.md +0 -0
  33. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/docs/22_industry_track_paper_eval_plan.md +0 -0
  34. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/docs/ERROR_TAXONOMY.md +0 -0
  35. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/docs/OPEN_SOURCE_DEVELOPMENT_PLAN.md +0 -0
  36. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/docs/README.md +0 -0
  37. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/docs/RESEARCH_SURVEY.md +0 -0
  38. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/docs/benchmarks/e2e_v0_2_3.md +0 -0
  39. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/docs/benchmarks/e2e_v0_2_4.md +0 -0
  40. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/docs/benchmarks/v0_1_smoke.json +0 -0
  41. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/docs/benchmarks/v0_1_smoke.md +0 -0
  42. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/src/agentdebug/adapters/__init__.py +0 -0
  43. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/src/agentdebug/adapters/base.py +0 -0
  44. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/src/agentdebug/adapters/crewai.py +0 -0
  45. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/src/agentdebug/adapters/langgraph.py +0 -0
  46. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/src/agentdebug/adapters/otel.py +0 -0
  47. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/src/agentdebug/adapters/raw.py +0 -0
  48. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/src/agentdebug/analyzers.py +0 -0
  49. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/src/agentdebug/cli.py +0 -0
  50. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/src/agentdebug/deep.py +0 -0
  51. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/src/agentdebug/detectors.py +0 -0
  52. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/src/agentdebug/events.py +0 -0
  53. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/src/agentdebug/hub/__init__.py +0 -0
  54. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/src/agentdebug/hub/backend_base.py +0 -0
  55. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/src/agentdebug/hub/backends.py +0 -0
  56. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/src/agentdebug/hub/bundle.py +0 -0
  57. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/src/agentdebug/hub/scrub.py +0 -0
  58. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/src/agentdebug/instrumentation.py +0 -0
  59. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/src/agentdebug/integrations/__init__.py +0 -0
  60. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/src/agentdebug/integrations/claude_skill.py +0 -0
  61. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/src/agentdebug/integrations/openhands.py +0 -0
  62. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/src/agentdebug/llm.py +0 -0
  63. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/src/agentdebug/models.py +0 -0
  64. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/src/agentdebug/recorder.py +0 -0
  65. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/src/agentdebug/recovery.py +0 -0
  66. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/src/agentdebug/storage.py +0 -0
  67. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/src/agentdebug/taxonomy.py +0 -0
  68. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/src/agentdebug/traceback.py +0 -0
  69. {agentdebugx-0.2.5 → agentdebugx-0.2.7}/src/agentdebug/ui/__init__.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: agentdebugx
- Version: 0.2.5
+ Version: 0.2.7
  Summary: Portable error analysis, tracing, and recovery framework for agentic AI systems. Import as `agentdebug`.
  License: MIT
  License-File: LICENSE
@@ -21,6 +21,7 @@ the forward-looking plan; this doc is the rear-view mirror.
  | Attribution | `agentdebug.attribution.AllAtOnceAttributor` | ✅ stable | mocked LLM + fallback |
  | Attribution | `agentdebug.attribution.StepByStepAttributor` | ✅ **new 0.2.2** | scripted-LLM + fallback |
  | Attribution | `agentdebug.attribution.BinarySearchAttributor` | ✅ **new 0.2.3** | oracle-LLM logarithmic convergence + fallback + render elision |
+ | Attribution | `agentdebug.attribution.CounterfactualAttributor` | ✅ **new 0.2.7** | scripted-rescue-prob ranking + candidate selection priority (findings → errors → tail) + dual fallback (no candidates / silent LLM) |
  | Recovery | `agentdebug.recovery.ReflexionSuggestion` | ✅ stable | per-finding + empty |
  | Recovery | `agentdebug.recovery.CriticRecoverer` + `VerifierSpec` registry | ✅ **new 0.2.3** | 5 family-matched verifier templates; dedup + custom-override |
  | DeepDebug | `agentdebug.deep.DeepDebugAnalyzer` | ✅ stable | full loop + silent LLM |
@@ -47,7 +48,7 @@ across 32 source files.
  | [06_detectors.md](./06_detectors.md) | `trajectory_perplexity` (TrajAD) | needs token-level LM perplexity API or embedding model + baseline calibration | v0.3 |
  | [06_detectors.md](./06_detectors.md) | `topic_drift` (embedding cosine) | needs embedding client; consider reusing `OpenAICompatClient` `/embeddings` | v0.3 |
  | [06_detectors.md](./06_detectors.md) | LTL spec monitors | requires user-supplied spec or LLM-synthesized monitors; gated on RV research | v1.2 |
- | [07_attribution.md](./07_attribution.md) | `CounterfactualAttributor` | requires re-rolling agent actions; framework-replay dependent | v0.3 |
+ | [07_attribution.md](./07_attribution.md) | `CounterfactualAttributor` — *real* replay variant | true re-rollout requires a framework-specific replay surface; the v0.2.7 LLM-simulated variant ships now, and the real-replay variant is gated on adapter support (LangGraph checkpointer / OpenHands rewind) | v0.4 |
  | [07_attribution.md](./07_attribution.md) | `SBFLAttributor` (Tarantula/Ochiai) | needs corpus of passing + failing traces of same task; gated on Hub adoption | v0.4 |
  | [07_attribution.md](./07_attribution.md) | `DeltaDebugAttributor` (Zeller) | same replay constraint | v0.3 |
  | [07_attribution.md](./07_attribution.md) | `EnsembleAttributor` | trivial once Counterfactual lands; awaits Counterfactual | v0.3 |
@@ -82,6 +83,26 @@ The audit found one real bug and a handful of test gaps:
  5. **`recovery.ReflexionSuggestion`** had only an indirect test from DeepDebug
     examples; now has direct happy + empty tests.
  
+ ## 3.7 Judge hardening (0.2.6)
+ 
+ A v0.2.5 Who&When 5-trace live run had `llm_judge_root.agent_match=0.00`
+ because the judge truncated mid-array on long multi-agent debate
+ transcripts. Three changes in 0.2.6 lifted that to **0.40** on the same
+ sample (same model, same traces):
+ 
+ 1. `LLMJudgeAnalyzer.max_tokens` default **4096 → 8192** — leaves room for
+    thinking-model reasoning tokens before the JSON object starts.
+ 2. `LLMJudgeAnalyzer.max_findings_per_chunk` parameter (default 6) — the
+    system prompt now asks the model to cap its findings array, forcing it
+    to close the JSON even when many candidates are visible.
+ 3. The system prompt now has explicit "CRITICAL OUTPUT RULES" — output ONLY
+    JSON, no markdown fences, no newlines in string values, complete the
+    array.
+ 
+ Numbers: see [docs/benchmarks/who_when_v0_2_6_leaderboard.md](./benchmarks/who_when_v0_2_6_leaderboard.md).
+ The same trick worked for `BinarySearchAttributor` (shipped in 0.2.4); apply it to
+ the remaining LLM-using analyzers as more thinking models hit this failure mode.
+ 
  ## 3.6 Real-usage E2E (live Gemini)
  
  Beyond unit tests, `scripts/e2e_real_usage.py` builds three realistic failing
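
For reference, a minimal sketch of wiring up the hardened judge described in 3.7, using only the constructor parameters visible in this diff (the `llm` argument is a placeholder for whatever satisfies the package's LLM client protocol, and the import path assumes the file layout shown above):

```python
from agentdebug.judges import LLMJudgeAnalyzer

def make_hardened_judge(llm):
    # `llm` is assumed to satisfy the package's LLM client protocol
    # (see src/agentdebug/llm.py); it is not constructed here.
    return LLMJudgeAnalyzer(
        llm,
        max_tokens=8192,           # 0.2.6 default: headroom for reasoning tokens
        max_findings_per_chunk=6,  # cap injected into the system prompt
    )
```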
@@ -148,6 +169,16 @@ Before v0.3 ships, this doc should record green checkmarks for:
      into `AgentEvent`s. Conformance test mocks the bus and verifies
      every documented event mapping plus the version-skew degradation
      path. `examples/crewai_demo.py` shows a working two-agent crew.
- - [ ] HuggingFace Hub round-trip live test (gated on `HF_TOKEN`).
- - [ ] Bench harness extended with one published-benchmark loader (Who&When
-   is the obvious first target we already cite its method).
+ - [x] **HuggingFace Hub round-trip live test** shipped in 0.2.6 as
+   `tests/test_hub_huggingface_live.py`. Gated on `HF_TOKEN` +
+   `AGENTDEBUG_HF_LIVE=1` so it never runs in default CI. Creates the
+   dataset repo if missing, pushes a bundle, lists, pulls back, and verifies
+   that the trajectory round-trips bit-for-bit. Live-validated against
+   `KunlunZhu/agentdebugx-live-test`.
+ - [x] **Bench harness with Who&When loader** — `experiments/prepare_who_when.py`
+   ingests 184 Algorithm-Generated + Hand-Crafted traces (4092 events) and
+   stores labels separately. `experiments/run_who_when_eval.py` runs all
+   4 attributors + DeepDebug against gold labels and reports agent_match,
+   exact_step, and near_step. Live-Gemini 5-trace validation is captured at
+   [docs/benchmarks/who_when_v0_2_6_leaderboard.md](./benchmarks/who_when_v0_2_6_leaderboard.md).
+   The headline 184-trace run is deferred (~6h / ~$5-10 on a frontier model).
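
The live test body itself is not part of this diff; a minimal pytest sketch of the double environment-variable gate it describes (test name and body are illustrative, not the shipped file):

```python
import os

import pytest

_LIVE = os.getenv("AGENTDEBUG_HF_LIVE") == "1" and bool(os.getenv("HF_TOKEN"))

@pytest.mark.skipif(not _LIVE, reason="set HF_TOKEN and AGENTDEBUG_HF_LIVE=1 to run live")
def test_hub_round_trip():
    # Create the dataset repo if missing, push a bundle, list, pull it
    # back, and compare bytes -- the shipped test does this against
    # KunlunZhu/agentdebugx-live-test.
    ...
```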
@@ -0,0 +1,74 @@
+ # Who&When — 5-trace Live Leaderboard (v0.2.6, gemini-3-flash)
+ 
+ A tiny validation sample drawn from `data/who_when/processed/labels.jsonl`
+ (the first 5 algorithm-generated traces). **Not a publishable benchmark** — the
+ full benchmark requires the 184-trace dataset + a frontier model and is
+ deferred for cost reasons. This run exists to verify that the analysis stack
+ produces sensibly shaped numbers and to surface regressions early.
+ 
+ ## Aggregate (per attribution method)
+ 
+ | Method | agent_match | exact_step | near_step | both_near | DeepDebug rounds |
+ |---|---:|---:|---:|---:|---:|
+ | `heuristic` (rule baseline) | 0.20 | 0.00 | 0.20 | 0.20 | n/a |
+ | `llm_judge_root` (judge's root_cause field) | **0.40** | 0.00 | **0.20** | **0.20** | n/a |
+ | `all_at_once` (Who&When method 1) | 0.20 | 0.00 | 0.00 | 0.00 | n/a |
+ | `step_by_step` (Who&When method 2) | **0.40** | 0.00 | **0.20** | **0.20** | n/a |
+ | `deep_debug_root` (DeepDebug refined root) | 0.20 | 0.00 | 0.20 | 0.00 | 6 / trace |
+ 
+ ## What changed in 0.2.6 vs 0.2.5
+ 
+ Same 5 traces, same model:
+ 
+ | Method | 0.2.5 agent_match | 0.2.6 agent_match | Δ |
+ |---|---:|---:|---:|
+ | `heuristic` | 0.20 | 0.20 | — |
+ | `llm_judge_root` | 0.00 | **0.40** | +0.40 |
+ | `all_at_once` | 0.00 | 0.20 | +0.20 |
+ | `step_by_step` | 0.00 | **0.40** | +0.40 |
+ 
+ The driver was the v0.2.6 judge prompt hardening: the `max_tokens` default
+ raised from 4096 to 8192, an explicit `max_findings_per_chunk=6` cap surfaced
+ through the system prompt, and a "CRITICAL OUTPUT RULES" header (output ONLY
+ JSON, no markdown, no newlines in strings, complete the array). Before the
+ hardening, the judge truncated mid-array on Who&When debate transcripts
+ and returned no findings; after it, the structured root_cause is populated.
+ 
+ ## Honest caveats
+ 
+ * n=5; per-method standard error is ±0.22 — these absolute numbers should
+   not be over-interpreted. The 0.40-vs-0.00 jump for two methods is the
+   signal worth reporting; everything else is noise.
+ * `deep_debug_root` underperformed `step_by_step` on this sample. The
+   refine round on 7-event traces tends to converge on the *visible*
+   failure rather than the *causal* root (a known Who&When difficulty —
+   manifestation vs. root cause).
+ * No method beats `near_step=0.20` on this sample. Step localization
+   remains hard, matching the published Who&When ceiling (~14% step accuracy
+   on 127 traces with frontier models).
+ 
+ ## Reproducing
+ 
+ ```bash
+ # Prepare data (once)
+ PYTHONPATH=src python experiments/prepare_who_when.py
+ 
+ # Set live LLM creds (any OpenAI-compatible endpoint works)
+ export AGENTDEBUG_LLM_BASE_URL=...
+ export AGENTDEBUG_LLM_API_KEY=...
+ export AGENTDEBUG_LLM_MODEL=gemini-3-flash
+ 
+ # Without DeepDebug (~1 min)
+ PYTHONPATH=src python experiments/run_who_when_eval.py \
+   --limit 5 --live-openai \
+   --out-dir experiments/runs/who_when_eval_subset
+ 
+ # With DeepDebug (~5 min)
+ PYTHONPATH=src python experiments/run_who_when_eval.py \
+   --limit 5 --live-openai --deep \
+   --out-dir experiments/runs/who_when_eval_subset_deep
+ ```
+ 
+ The headline benchmark (184 traces × 5 methods × DeepDebug) would take
+ ~6 hours and ~$5-10 in API cost on a frontier model. Run it once before
+ paper submission; do not run on every iteration.
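
For readers reproducing the tables above, a sketch of how the per-method aggregates could be computed; the metric definitions here (agent equality, exact index match, index within ±1) are assumptions about what `run_who_when_eval.py` reports, not a transcript of it:

```python
def aggregate(preds, golds, near=1):
    """Score parallel lists of {'agent': str, 'step': int} predictions/labels."""
    n = len(golds)
    pairs = list(zip(preds, golds))
    agent_match = sum(p["agent"] == g["agent"] for p, g in pairs) / n
    exact_step = sum(p["step"] == g["step"] for p, g in pairs) / n
    near_step = sum(abs(p["step"] - g["step"]) <= near for p, g in pairs) / n
    # both_near: right agent AND step within the near window.
    both_near = sum(
        p["agent"] == g["agent"] and abs(p["step"] - g["step"]) <= near
        for p, g in pairs
    ) / n
    return {"agent_match": agent_match, "exact_step": exact_step,
            "near_step": near_step, "both_near": both_near}
```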
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "agentdebugx"
- version = "0.2.5"
+ version = "0.2.7"
  description = "Portable error analysis, tracing, and recovery framework for agentic AI systems. Import as `agentdebug`."
  authors = ["ULab @ UIUC <ulab@illinois.edu>"]
  license = "MIT"
@@ -15,6 +15,7 @@ from agentdebug.attribution import (
      Attributor,
      BinarySearchAttributor,
      Blame,
+     CounterfactualAttributor,
      HeuristicAttributor,
      StepByStepAttributor,
  )
@@ -63,6 +64,7 @@ __all__ = [
      'BusEvent',
      'BinarySearchAttributor',
      'CascadeFrame',
+     'CounterfactualAttributor',
      'CriticRecoverer',
      'DEFAULT_VERIFIERS',
      'Detector',
@@ -96,4 +98,4 @@ __all__ = [
      'get_failure_mode',
  ]
  
- __version__ = '0.2.5'
+ __version__ = '0.2.7'
@@ -566,8 +566,193 @@ def _EVENT_ELLIPSIS(count: int) -> _EllipsisEvent:
      return _EllipsisEvent(count=count)
  
  
+ _COUNTERFACTUAL_SYSTEM_PROMPT = """You are AgentDebugX-Attributor running an
+ LLM-simulated counterfactual replay (AgenTracer-style, arXiv:2509.03312).
+ 
+ You will be given the goal, the full trajectory, and ONE CANDIDATE STEP. Your
+ job is to estimate whether the agent would have succeeded if THAT step had
+ been done correctly — leaving everything else the same. This isolates the
+ step's causal contribution to the failure.
+ 
+ CRITICAL OUTPUT RULES (these maximize the chance your reply parses):
+ 1. Output ONLY a JSON object. No prose before/after. No markdown fences.
+ 2. Keep "rationale" to ONE short sentence (<= 200 chars).
+ 3. Do NOT include newlines inside string values.
+ 4. Emit the JSON object COMPLETE.
+ 
+ Schema:
+ {
+   "rescue_probability": <0..1>,
+   "confidence": <0..1>,
+   "rationale": "<short>",
+   "would_block_downstream_failures": true | false
+ }
+ 
+ Higher rescue_probability = correcting this step would more likely have
+ rescued the run; this step is therefore more responsible for the failure.
+ """
+ 
+ 
+ class CounterfactualAttributor:
+     """LLM-simulated counterfactual replay.
+ 
+     For each of K candidate steps (top-K from prior findings, or
+     error-bearing events, or the tail of the trajectory) ask the LLM:
+     "if this step had been correct, would the rest of the trajectory still
+     fail?" Steps with the highest rescue probability become the top blame
+     hypotheses. Costs O(K) LLM calls — comparable to AllAtOnce, with a
+     stronger causal claim per probe.
+ 
+     This is a *simulated* counterfactual, not a real re-rollout — strictly
+     weaker than AgenTracer's actual replay, but framework-independent and
+     runnable today against any LLM. When the underlying framework gains a
+     real replay surface (LangGraph checkpointer, OpenHands rewind), wire
+     that in as an alternative ``replay_fn`` and the algorithm carries over.
+     """
+ 
+     id = 'counterfactual'
+ 
+     def __init__(
+         self,
+         llm: LLMClient,
+         *,
+         max_candidates: int = 5,
+         max_tokens: int = 2048,
+         fallback: Optional[Attributor] = None,
+     ) -> None:
+         self.llm = llm
+         self.max_candidates = max_candidates
+         self.max_tokens = max_tokens
+         self.fallback: Attributor = fallback or HeuristicAttributor()
+ 
+     def attribute(
+         self,
+         trajectory: AgentTrajectory,
+         findings: List[FailureFinding],
+     ) -> AttributionResult:
+         candidates = self._pick_candidates(trajectory, findings)
+         if not candidates:
+             return self.fallback.attribute(trajectory, findings)
+         ranked: List[tuple[AgentEvent, Dict[str, Any]]] = []
+         for evt in candidates:
+             verdict = self._ask_counterfactual(trajectory, evt)
+             if verdict is None:
+                 continue
+             ranked.append((evt, verdict))
+         if not ranked:
+             return self.fallback.attribute(trajectory, findings)
+         # Sort by rescue_probability desc, tie-break by confidence.
+         ranked.sort(
+             key=lambda r: (
+                 -self._coerce_float(r[1].get('rescue_probability'), 0.0),
+                 -self._coerce_float(r[1].get('confidence'), 0.0),
+             )
+         )
+         hypotheses: List[Blame] = []
+         for evt, verdict in ranked:
+             hypotheses.append(Blame(
+                 span_id=evt.event_id,
+                 step_index=evt.step_index,
+                 agent_name=evt.agent_name,
+                 confidence=self._coerce_float(verdict.get('rescue_probability'), 0.0),
+                 rationale=(
+                     str(verdict.get('rationale') or 'no rationale')
+                     + f' [rescue_probability={verdict.get("rescue_probability")}]'
+                 ),
+                 evidence=[
+                     f'event_id={evt.event_id}',
+                     f'step={evt.step_index}',
+                 ],
+                 sources=[self.id],
+             ))
+         return AttributionResult(
+             method=self.id,
+             hypotheses=hypotheses,
+             raw={'candidates_probed': len(ranked)},
+         )
+ 
+     def _pick_candidates(
+         self,
+         trajectory: AgentTrajectory,
+         findings: List[FailureFinding],
+     ) -> List[AgentEvent]:
+         events_by_id = {e.event_id: e for e in trajectory.events}
+         candidates: List[AgentEvent] = []
+         seen: set[str] = set()
+         # 1. Prior findings (the judge already nominated suspects).
+         for f in findings:
+             evt = events_by_id.get(f.event_id) if f.event_id else None
+             if evt is not None and evt.event_id not in seen:
+                 candidates.append(evt)
+                 seen.add(evt.event_id)
+                 if len(candidates) >= self.max_candidates:
+                     return candidates
+         # 2. Events that recorded an error directly.
+         for evt in trajectory.events:
+             if evt.error and evt.event_id not in seen:
+                 candidates.append(evt)
+                 seen.add(evt.event_id)
+                 if len(candidates) >= self.max_candidates:
+                     return candidates
+         # 3. Fallback: tail of the trajectory (failure most often manifests there).
+         for evt in reversed(trajectory.events):
+             if evt.event_id not in seen:
+                 candidates.append(evt)
+                 seen.add(evt.event_id)
+                 if len(candidates) >= self.max_candidates:
+                     return candidates
+         return candidates
+ 
+     def _ask_counterfactual(
+         self, trajectory: AgentTrajectory, candidate: AgentEvent,
+     ) -> Optional[Dict[str, Any]]:
+         events_doc = '\n'.join(
+             f'event_id={e.event_id} step={e.step_index} agent={e.agent_name} '
+             f'type={getattr(e.event_type, "value", e.event_type)} '
+             f'output={str(e.output)[:200]} error={str(e.error)[:200]}'
+             for e in trajectory.events
+         )
+         user = (
+             f'GOAL: {trajectory.goal!r}\n'
+             f'FRAMEWORK: {trajectory.framework!r}\n\n'
+             f'FULL TRAJECTORY:\n{events_doc}\n\n'
+             f'CANDIDATE STEP TO COUNTERFACTUALLY CORRECT:\n'
+             f'  event_id={candidate.event_id}\n'
+             f'  step={candidate.step_index} agent={candidate.agent_name}\n'
+             f'  module={candidate.module}\n'
+             f'  input={str(candidate.input)[:300]}\n'
+             f'  output={str(candidate.output)[:300]}\n'
+             f'  error={str(candidate.error)[:300]}\n\n'
+             f'Question: if this step had been DONE CORRECTLY, what is the '
+             f'probability the run would have succeeded?'
+         )
+         try:
+             result = self.llm.complete(
+                 messages=[
+                     {'role': 'system', 'content': _COUNTERFACTUAL_SYSTEM_PROMPT},
+                     {'role': 'user', 'content': user},
+                 ],
+                 max_tokens=self.max_tokens,
+             )
+         except Exception as exc:  # pragma: no cover
+             LOG.warning('counterfactual probe failed at event=%s: %s',
+                         candidate.event_id, exc)
+             return None
+         parsed = extract_json_block(result.text)
+         if parsed is None:
+             return None
+         return cast(Dict[str, Any], parsed)
+ 
+     @staticmethod
+     def _coerce_float(value: Any, default: float) -> float:
+         try:
+             return float(value)
+         except (TypeError, ValueError):
+             return default
+ 
+ 
  __all__ = [
      'Attributor', 'Blame', 'AttributionResult',
      'HeuristicAttributor', 'AllAtOnceAttributor', 'StepByStepAttributor',
-     'BinarySearchAttributor',
+     'BinarySearchAttributor', 'CounterfactualAttributor',
  ]
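
A usage sketch for the new attributor, using only names visible in the hunk above; the `llm`, `trajectory`, and `findings` arguments are assumed to come from the package's LLM client, recorder, and a prior judge pass respectively:

```python
from agentdebug.attribution import CounterfactualAttributor

def rank_blame(llm, trajectory, findings):
    # One probe per candidate step, selected findings -> errors -> tail;
    # falls back to HeuristicAttributor when no candidates exist or the
    # LLM returns nothing parseable.
    attributor = CounterfactualAttributor(llm, max_candidates=5)
    result = attributor.attribute(trajectory, findings)
    for blame in result.hypotheses[:3]:
        print(blame.step_index, blame.agent_name, blame.confidence, blame.rationale)
    return result
```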
@@ -39,21 +39,18 @@ the allowed failure mode codes. Be conservative — only flag steps where the
  evidence in the event payload supports the label. If the trajectory contains no
  failure, return an empty findings list.
  
- Respond ONLY with a JSON object matching this schema (no prose, no markdown):
+ CRITICAL OUTPUT RULES (these maximize the chance your reply parses):
+ 1. Output ONLY a JSON object. No prose before/after. No markdown fences.
+ 2. Cap the findings array at {max_findings} entries — pick the most important.
+ 3. Keep each "evidence" entry under 120 characters; keep each "rationale" /
+    "summary" under 200 characters.
+ 4. Do NOT include newlines inside string values.
+ 5. Emit the JSON object COMPLETE — never stop mid-key or mid-array.
  
- {
-   "findings": [
-     {
-       "event_id": "<event_id from the input>",
-       "step_index": <int or null>,
-       "agent_name": "<agent_name from the input>",
-       "failure_mode_id": "<one of the allowed codes>",
-       "confidence": <float between 0 and 1>,
-       "evidence": ["<short quote or summary of the supporting payload>"]
-     }
-   ],
-   "summary": "<one-sentence diagnosis or 'No failure detected.'>"
- }
+ Schema (compact — fields in this order):
+ {{"findings":[{{"event_id":"...", "step_index":N|null, "agent_name":"...",
+ "failure_mode_id":"...", "confidence":0..1, "evidence":["..."]}}, ...],
+ "summary":"<short>"}}
  """
  
  
@@ -66,15 +63,21 @@ class LLMJudgeAnalyzer:
          *,
          max_events_per_call: int = 80,
          max_evidence_chars: int = 300,
-         max_tokens: int = 4096,
+         max_tokens: int = 8192,
+         max_findings_per_chunk: int = 6,
      ) -> None:
          self.llm = llm
          self.max_events_per_call = max_events_per_call
          self.max_evidence_chars = max_evidence_chars
          # NOTE: thinking models (Gemini 2.x/3.x, o-series) spend a substantial
          # fraction of `max_tokens` on reasoning tokens before any text is
-         # emitted. 4096 is the safe default; bump higher for long traces.
+         # emitted. 8192 is the safe default after the v0.2.6 Who&When debate-
+         # trace observation that 4096 truncated mid-array on long traces.
          self.max_tokens = max_tokens
+         # The system prompt asks the model to cap its findings array so the
+         # JSON closes even when many candidate failures exist; the prompt's
+         # {max_findings} placeholder is filled from this value.
+         self.max_findings_per_chunk = max_findings_per_chunk
  
      def analyze(self, trajectory: AgentTrajectory) -> DiagnosticReport:
          events = trajectory.events
@@ -121,8 +124,11 @@
          self, trajectory: AgentTrajectory, chunk: List[AgentEvent]
      ) -> tuple[List[FailureFinding], str]:
          user = self._render_user_prompt(trajectory, chunk)
+         # Inject the max_findings cap into the system prompt at format time
+         # so it can be tuned per call without forking the prompt.
+         system = _SYSTEM_PROMPT.format(max_findings=self.max_findings_per_chunk)
          messages = [
-             {'role': 'system', 'content': _SYSTEM_PROMPT},
+             {'role': 'system', 'content': system},
              {'role': 'user', 'content': user},
          ]
          result = self.llm.complete(messages=messages, max_tokens=self.max_tokens)
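
One detail worth noting about the prompt rewrite above: because the schema now lives in a `str.format` template, literal JSON braces must be doubled while `{max_findings}` stays single. A standalone illustration:

```python
# Doubled braces survive .format(); single-brace fields get substituted.
template = 'Cap the findings array at {max_findings}: {{"findings": []}}'
print(template.format(max_findings=6))
# -> Cap the findings array at 6: {"findings": []}
```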
@@ -211,8 +211,10 @@ _INDEX_HTML = """<!doctype html>
  .button {
    border:1px solid #373b3a; border-radius:8px; background:#1a1c1c; color:var(--fg);
    height:32px; padding:0 11px; font-size:12px; display:inline-flex;
-   align-items:center; gap:7px;
+   align-items:center; justify-content:center; gap:7px; cursor:pointer;
+   font-family:inherit; white-space:nowrap;
  }
+ .button:hover { border-color:#4b5250; background:#202323; }
  .button.primary { border-color:#356568; color:#d8fdff; background:#173033; }
  .content { padding:22px; max-width:1440px; margin:0 auto; }
  .hero {
@@ -330,6 +332,18 @@ _INDEX_HTML = """<!doctype html>
    .topbar { position:static; }
    .trace-legend, .trace-pair { grid-template-columns:1fr; }
  }
+ @media (max-width: 640px) {
+   .topbar { display:grid; grid-template-columns:1fr; align-items:start; padding:14px 16px; }
+   .top-actions { width:100%; display:grid; grid-template-columns:repeat(3,minmax(0,1fr)); }
+   .button { width:100%; min-width:0; padding:0 8px; overflow:hidden; text-overflow:ellipsis; }
+   .content { padding:22px 16px; }
+   h1 { font-size:27px; line-height:1.1; }
+   .stats { grid-template-columns:repeat(2,minmax(0,1fr)); }
+   .root-grid { grid-template-columns:1fr; }
+   .event { grid-template-columns:46px minmax(0,1fr); padding:10px; }
+   .step-index { width:38px; height:38px; }
+   .event-grid { grid-template-columns:1fr; }
+ }
  </style>
  </head>
  <body>
@@ -358,9 +372,9 @@ _INDEX_HTML = """<!doctype html>
      <div class="brand-sub" id="trace-count">Loading traces</div>
    </div>
    <div class="top-actions">
-     <span class="button">Analyze</span>
-     <span class="button">Export Bundle</span>
-     <span class="button primary">Open Error Hub</span>
+     <button class="button" id="analyze-btn" type="button">Analyze</button>
+     <button class="button" id="export-btn" type="button">Bundle</button>
+     <button class="button primary" id="hub-btn" type="button">Hub</button>
    </div>
  </div>
  <div class="content" id="detail">
@@ -370,6 +384,8 @@ _INDEX_HTML = """<!doctype html>
  </div>
  <script>
  const BOOTSTRAP = __BOOTSTRAP_JSON__;
+ let CURRENT_TRACE_ID = null;
+ let CURRENT_TRACE_DATA = null;
  async function api(path) {
    const r = await fetch(path);
    if (!r.ok) throw new Error('HTTP ' + r.status);
@@ -430,9 +446,11 @@ function renderTraceList(traceIds, selectedId) {
  async function selectTrace(tid, li) {
    document.querySelectorAll('.run').forEach(el => el.classList.remove('active'));
    li.classList.add('active');
+   CURRENT_TRACE_ID = tid;
    document.getElementById('detail').innerHTML = '<div class="empty">Loading trace...</div>';
    try {
      const data = await api('/api/v1/traces/' + encodeURIComponent(tid));
+     CURRENT_TRACE_DATA = data;
      renderTrace(data.trajectory, data.report);
    } catch (e) {
      document.getElementById('detail').innerHTML = '<div class="empty">' + escapeHtml(e) + '</div>';
@@ -485,7 +503,7 @@ function renderTrace(traj, report) {
    for (const f of findings) html += renderFinding(f);
    html += '</div></div></div>';
  
-   html += '<div class="panel"><div class="panel-head"><div class="panel-title">Use Case Flow</div><span class="chip cyan">Error Hub</span></div><div class="panel-body"><div class="flow">';
+   html += '<div class="panel" id="error-hub-flow"><div class="panel-head"><div class="panel-title">Use Case Flow</div><span class="chip cyan">Error Hub</span></div><div class="panel-body"><div class="flow">';
    html += flow(1, 'Capture trajectory from the running agent with the lightweight recorder or adapter.');
    html += flow(2, 'Diagnose the trace, localize the likely root cause, and generate recovery suggestions.');
    html += flow(3, 'Scrub secrets and PII, package a reproducible error bundle, and publish to Git or Hugging Face.');
@@ -566,6 +584,32 @@ function renderEvent(ev, isRoot, finding) {
    html += '</div></div></div>';
    return html;
  }
+ function downloadJson(filename, value) {
+   const blob = new Blob([JSON.stringify(value, null, 2)], {type: 'application/json'});
+   const url = URL.createObjectURL(blob);
+   const a = document.createElement('a');
+   a.href = url;
+   a.download = filename;
+   document.body.appendChild(a);
+   a.click();
+   a.remove();
+   URL.revokeObjectURL(url);
+ }
+ function bindTopActions() {
+   document.getElementById('analyze-btn').onclick = () => {
+     const active = document.querySelector('.run.active');
+     if (CURRENT_TRACE_ID && active) selectTrace(CURRENT_TRACE_ID, active);
+   };
+   document.getElementById('export-btn').onclick = () => {
+     if (!CURRENT_TRACE_DATA) return;
+     const name = (CURRENT_TRACE_ID || 'trace') + '.agentdebugx.report.json';
+     downloadJson(name, CURRENT_TRACE_DATA);
+   };
+   document.getElementById('hub-btn').onclick = () => {
+     const flow = document.getElementById('error-hub-flow');
+     if (flow) flow.scrollIntoView({behavior: 'smooth', block: 'start'});
+   };
+ }
  function field(label, value, isError) {
    return '<div class="field ' + (isError ? 'error' : '') + '"><div class="field-label">' + escapeHtml(label) + '</div><div class="field-value">' + escapeHtml(value || '-') + '</div></div>';
  }
@@ -585,11 +629,14 @@ if (BOOTSTRAP && BOOTSTRAP.traces) {
    const selected = BOOTSTRAP.selected ? BOOTSTRAP.selected.trajectory.trace_id : null;
    renderTraceList(BOOTSTRAP.traces, selected);
    if (BOOTSTRAP.selected) {
+     CURRENT_TRACE_ID = selected;
+     CURRENT_TRACE_DATA = BOOTSTRAP.selected;
      renderTrace(BOOTSTRAP.selected.trajectory, BOOTSTRAP.selected.report);
    } else {
      document.getElementById('detail').innerHTML = '<div class="empty">No traces in store.</div>';
    }
  }
+ bindTopActions();
  loadTraceList(!(BOOTSTRAP && BOOTSTRAP.selected));
  </script>
  </body>