agentdebugx 0.2.6__tar.gz → 0.2.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/PKG-INFO +1 -1
  2. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/23_status_v0_2.md +2 -1
  3. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/pyproject.toml +1 -1
  4. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/__init__.py +3 -1
  5. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/attribution.py +186 -1
  6. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/LICENSE +0 -0
  7. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/README.md +0 -0
  8. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/00_overview.md +0 -0
  9. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/01_literature_survey.md +0 -0
  10. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/02_architecture.md +0 -0
  11. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/03_taxonomy.md +0 -0
  12. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/04_trace_schema.md +0 -0
  13. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/05_adapters.md +0 -0
  14. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/06_detectors.md +0 -0
  15. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/07_attribution.md +0 -0
  16. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/08_recovery.md +0 -0
  17. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/09_error_database.md +0 -0
  18. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/10_taxonomy_induction.md +0 -0
  19. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/11_multimodal.md +0 -0
  20. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/12_ui_dashboard.md +0 -0
  21. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/13_class_design.md +0 -0
  22. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/14_api_reference.md +0 -0
  23. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/15_roadmap.md +0 -0
  24. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/16_governance.md +0 -0
  25. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/17_claude_code_design_patterns.md +0 -0
  26. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/18_comparison_codex_vs_design.md +0 -0
  27. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/19_error_hub.md +0 -0
  28. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/20_deep_debug.md +0 -0
  29. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/21_integrations.md +0 -0
  30. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/22_industry_track_paper_eval_plan.md +0 -0
  31. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/ERROR_TAXONOMY.md +0 -0
  32. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/OPEN_SOURCE_DEVELOPMENT_PLAN.md +0 -0
  33. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/README.md +0 -0
  34. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/RESEARCH_SURVEY.md +0 -0
  35. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/benchmarks/e2e_v0_2_3.md +0 -0
  36. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/benchmarks/e2e_v0_2_4.md +0 -0
  37. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/benchmarks/v0_1_smoke.json +0 -0
  38. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/benchmarks/v0_1_smoke.md +0 -0
  39. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/benchmarks/who_when_v0_2_6_leaderboard.md +0 -0
  40. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/adapters/__init__.py +0 -0
  41. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/adapters/base.py +0 -0
  42. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/adapters/crewai.py +0 -0
  43. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/adapters/langgraph.py +0 -0
  44. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/adapters/otel.py +0 -0
  45. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/adapters/raw.py +0 -0
  46. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/analyzers.py +0 -0
  47. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/cli.py +0 -0
  48. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/deep.py +0 -0
  49. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/detectors.py +0 -0
  50. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/events.py +0 -0
  51. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/hub/__init__.py +0 -0
  52. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/hub/backend_base.py +0 -0
  53. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/hub/backends.py +0 -0
  54. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/hub/bundle.py +0 -0
  55. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/hub/scrub.py +0 -0
  56. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/instrumentation.py +0 -0
  57. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/integrations/__init__.py +0 -0
  58. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/integrations/claude_skill.py +0 -0
  59. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/integrations/openhands.py +0 -0
  60. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/judges.py +0 -0
  61. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/llm.py +0 -0
  62. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/models.py +0 -0
  63. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/recorder.py +0 -0
  64. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/recovery.py +0 -0
  65. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/storage.py +0 -0
  66. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/taxonomy.py +0 -0
  67. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/traceback.py +0 -0
  68. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/ui/__init__.py +0 -0
  69. {agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/ui/server.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: agentdebugx
3
- Version: 0.2.6
3
+ Version: 0.2.7
4
4
  Summary: Portable error analysis, tracing, and recovery framework for agentic AI systems. Import as `agentdebug`.
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -21,6 +21,7 @@ the forward-looking plan; this doc is the rear-view mirror.
21
21
  | Attribution | `agentdebug.attribution.AllAtOnceAttributor` | ✅ stable | mocked LLM + fallback |
22
22
  | Attribution | `agentdebug.attribution.StepByStepAttributor` | ✅ **new 0.2.2** | scripted-LLM + fallback |
23
23
  | Attribution | `agentdebug.attribution.BinarySearchAttributor` | ✅ **new 0.2.3** | oracle-LLM logarithmic convergence + fallback + render elision |
24
+ | Attribution | `agentdebug.attribution.CounterfactualAttributor` | ✅ **new 0.2.7** | scripted-rescue-prob ranking + candidate selection priority (findings → errors → tail) + dual fallback (no candidates / silent LLM) |
24
25
  | Recovery | `agentdebug.recovery.ReflexionSuggestion` | ✅ stable | per-finding + empty |
25
26
  | Recovery | `agentdebug.recovery.CriticRecoverer` + `VerifierSpec` registry | ✅ **new 0.2.3** | 5 family-matched verifier templates; dedup + custom-override |
26
27
  | DeepDebug | `agentdebug.deep.DeepDebugAnalyzer` | ✅ stable | full loop + silent LLM |
@@ -47,7 +48,7 @@ across 32 source files.
47
48
  | [06_detectors.md](./06_detectors.md) | `trajectory_perplexity` (TrajAD) | needs token-level LM perplexity API or embedding model + baseline calibration | v0.3 |
48
49
  | [06_detectors.md](./06_detectors.md) | `topic_drift` (embedding cosine) | needs embedding client; consider reusing `OpenAICompatClient` `/embeddings` | v0.3 |
49
50
  | [06_detectors.md](./06_detectors.md) | LTL spec monitors | requires user-supplied spec or LLM-synthesized monitors; gated on RV research | v1.2 |
50
- | [07_attribution.md](./07_attribution.md) | `CounterfactualAttributor` | requires re-rolling agent actions; framework-replay dependent | v0.3 |
51
+ | [07_attribution.md](./07_attribution.md) | `CounterfactualAttributor` — *real* replay variant | true re-rollout requires framework-specific replay surface; the v0.2.7 LLM-simulated variant ships now, the real-replay variant is gated on adapter support (LangGraph checkpointer / OpenHands rewind) | v0.4 |
51
52
  | [07_attribution.md](./07_attribution.md) | `SBFLAttributor` (Tarantula/Ochiai) | needs corpus of passing + failing traces of same task; gated on Hub adoption | v0.4 |
52
53
  | [07_attribution.md](./07_attribution.md) | `DeltaDebugAttributor` (Zeller) | same replay constraint | v0.3 |
53
54
  | [07_attribution.md](./07_attribution.md) | `EnsembleAttributor` | trivial once Counterfactual lands; awaits Counterfactual | v0.3 |
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "agentdebugx"
3
- version = "0.2.6"
3
+ version = "0.2.7"
4
4
  description = "Portable error analysis, tracing, and recovery framework for agentic AI systems. Import as `agentdebug`."
5
5
  authors = ["ULab @ UIUC <ulab@illinois.edu>"]
6
6
  license = "MIT"
@@ -15,6 +15,7 @@ from agentdebug.attribution import (
15
15
  Attributor,
16
16
  BinarySearchAttributor,
17
17
  Blame,
18
+ CounterfactualAttributor,
18
19
  HeuristicAttributor,
19
20
  StepByStepAttributor,
20
21
  )
@@ -63,6 +64,7 @@ __all__ = [
63
64
  'BusEvent',
64
65
  'BinarySearchAttributor',
65
66
  'CascadeFrame',
67
+ 'CounterfactualAttributor',
66
68
  'CriticRecoverer',
67
69
  'DEFAULT_VERIFIERS',
68
70
  'Detector',
@@ -96,4 +98,4 @@ __all__ = [
96
98
  'get_failure_mode',
97
99
  ]
98
100
 
99
- __version__ = '0.2.6'
101
+ __version__ = '0.2.7'
@@ -566,8 +566,193 @@ def _EVENT_ELLIPSIS(count: int) -> _EllipsisEvent:
566
566
  return _EllipsisEvent(count=count)
567
567
 
568
568
 
569
+ _COUNTERFACTUAL_SYSTEM_PROMPT = """You are AgentDebugX-Attributor running an
570
+ LLM-simulated counterfactual replay (AgenTracer-style, arXiv:2509.03312).
571
+
572
+ You will be given the goal, the full trajectory, and ONE CANDIDATE STEP. Your
573
+ job is to estimate whether the agent would have succeeded if THAT step had
574
+ been done correctly — leaving everything else the same. This isolates the
575
+ step's causal contribution to the failure.
576
+
577
+ CRITICAL OUTPUT RULES (these maximize the chance your reply parses):
578
+ 1. Output ONLY a JSON object. No prose before/after. No markdown fences.
579
+ 2. Keep "rationale" to ONE short sentence (<= 200 chars).
580
+ 3. Do NOT include newlines inside string values.
581
+ 4. Emit the JSON object COMPLETE.
582
+
583
+ Schema:
584
+ {
585
+ "rescue_probability": <0..1>,
586
+ "confidence": <0..1>,
587
+ "rationale": "<short>",
588
+ "would_block_downstream_failures": true | false
589
+ }
590
+
591
+ Higher rescue_probability = correcting this step would more likely have
592
+ rescued the run; this step is therefore more responsible for the failure.
593
+ """
594
+
595
+
596
class CounterfactualAttributor:
    """Blame steps by LLM-simulated counterfactual replay.

    Up to ``max_candidates`` steps are selected (events named by prior
    findings first, then events that recorded an error, then the tail of the
    trajectory). For each one the LLM is asked: "if this step had been done
    correctly, would the run have succeeded?" Candidates are ranked by the
    returned ``rescue_probability`` (ties broken by ``confidence``) and
    emitted as blame hypotheses. Cost is at most one LLM call per candidate.

    This is a *simulated* counterfactual, not a real re-rollout: nothing is
    re-executed, the LLM only estimates the outcome. When no candidate can
    be selected, or no probe yields a usable JSON object, the ``fallback``
    attributor's result is returned instead.

    Args:
        llm: Client exposing ``complete(messages=..., max_tokens=...)`` whose
            result has a ``text`` attribute.
        max_candidates: Upper bound on the number of steps probed. Values
            below 1 select nothing, so the fallback is used.
        max_tokens: Token budget forwarded to every ``llm.complete`` call.
        fallback: Attributor used when probing produces nothing; defaults to
            a ``HeuristicAttributor``.
    """

    id = 'counterfactual'

    def __init__(
        self,
        llm: LLMClient,
        *,
        max_candidates: int = 5,
        max_tokens: int = 2048,
        fallback: Optional[Attributor] = None,
    ) -> None:
        self.llm = llm
        self.max_candidates = max_candidates
        self.max_tokens = max_tokens
        # Compare against None explicitly so that a caller-supplied fallback
        # that happens to be falsy is not silently replaced.
        self.fallback: Attributor = (
            fallback if fallback is not None else HeuristicAttributor()
        )

    def attribute(
        self,
        trajectory: AgentTrajectory,
        findings: List[FailureFinding],
    ) -> AttributionResult:
        """Probe the selected candidates and return them ranked by blame.

        Returns the fallback attributor's result when there are no candidates
        or when every probe failed / returned no JSON object.
        """
        candidates = self._pick_candidates(trajectory, findings)
        if not candidates:
            return self.fallback.attribute(trajectory, findings)

        # The trajectory rendering is identical for every probe: build it once.
        events_doc = self._render_events(trajectory)

        # (rescue score, confidence score, event, raw verdict)
        scored: List[tuple[float, float, AgentEvent, Dict[str, Any]]] = []
        for evt in candidates:
            verdict = self._ask_counterfactual(
                trajectory, evt, events_doc=events_doc,
            )
            if verdict is None:
                continue
            scored.append((
                self._probability(verdict.get('rescue_probability')),
                self._probability(verdict.get('confidence')),
                evt,
                verdict,
            ))
        if not scored:
            return self.fallback.attribute(trajectory, findings)

        # Highest rescue probability first, tie-break on confidence. The sort
        # is stable, so full ties keep candidate-selection order.
        scored.sort(key=lambda item: (-item[0], -item[1]))

        hypotheses: List[Blame] = [
            Blame(
                span_id=evt.event_id,
                step_index=evt.step_index,
                agent_name=evt.agent_name,
                confidence=rescue,
                rationale=(
                    str(verdict.get('rationale') or 'no rationale')
                    + f' [rescue_probability={verdict.get("rescue_probability")}]'
                ),
                evidence=[
                    f'event_id={evt.event_id}',
                    f'step={evt.step_index}',
                ],
                sources=[self.id],
            )
            for rescue, _confidence, evt, verdict in scored
        ]
        return AttributionResult(
            method=self.id,
            hypotheses=hypotheses,
            raw={'candidates_probed': len(scored)},
        )

    def _pick_candidates(
        self,
        trajectory: AgentTrajectory,
        findings: List[FailureFinding],
    ) -> List[AgentEvent]:
        """Select up to ``max_candidates`` distinct events to probe.

        Priority order: events referenced by ``findings``, then events with a
        truthy ``error``, then remaining events from the end of the
        trajectory backwards. Events are de-duplicated by ``event_id``.
        """
        limit = self.max_candidates
        if limit <= 0:
            # A non-positive budget means "probe nothing"; without this guard
            # the post-append cap check would still let one candidate through.
            return []

        events_by_id = {e.event_id: e for e in trajectory.events}
        sources = (
            # 1. Prior findings (suspects that were already nominated).
            (events_by_id.get(f.event_id) for f in findings if f.event_id),
            # 2. Events that recorded an error directly.
            (e for e in trajectory.events if e.error),
            # 3. Tail of the trajectory, newest first.
            reversed(trajectory.events),
        )

        picked: List[AgentEvent] = []
        seen: set[str] = set()
        for source in sources:
            for evt in source:
                if evt is None or evt.event_id in seen:
                    continue
                picked.append(evt)
                seen.add(evt.event_id)
                if len(picked) >= limit:
                    return picked
        return picked

    @staticmethod
    def _render_events(trajectory: AgentTrajectory) -> str:
        """Render every event as one line of the FULL TRAJECTORY prompt section."""
        return '\n'.join(
            f'event_id={e.event_id} step={e.step_index} agent={e.agent_name} '
            f'type={getattr(e.event_type, "value", e.event_type)} '
            f'output={str(e.output)[:200]} error={str(e.error)[:200]}'
            for e in trajectory.events
        )

    def _ask_counterfactual(
        self,
        trajectory: AgentTrajectory,
        candidate: AgentEvent,
        *,
        events_doc: Optional[str] = None,
    ) -> Optional[Dict[str, Any]]:
        """Run one counterfactual probe for ``candidate``.

        Args:
            trajectory: The run being analysed.
            candidate: The step to hypothetically correct.
            events_doc: Pre-rendered trajectory text; rendered here when
                omitted.

        Returns:
            The JSON object extracted from the reply, or ``None`` when the
            LLM call raised or the reply did not contain a JSON object.
        """
        if events_doc is None:
            events_doc = self._render_events(trajectory)
        user = (
            f'GOAL: {trajectory.goal!r}\n'
            f'FRAMEWORK: {trajectory.framework!r}\n\n'
            f'FULL TRAJECTORY:\n{events_doc}\n\n'
            f'CANDIDATE STEP TO COUNTERFACTUALLY CORRECT:\n'
            f'  event_id={candidate.event_id}\n'
            f'  step={candidate.step_index} agent={candidate.agent_name}\n'
            f'  module={candidate.module}\n'
            f'  input={str(candidate.input)[:300]}\n'
            f'  output={str(candidate.output)[:300]}\n'
            f'  error={str(candidate.error)[:300]}\n\n'
            f'Question: if this step had been DONE CORRECTLY, what is the '
            f'probability the run would have succeeded?'
        )
        try:
            result = self.llm.complete(
                messages=[
                    {'role': 'system', 'content': _COUNTERFACTUAL_SYSTEM_PROMPT},
                    {'role': 'user', 'content': user},
                ],
                max_tokens=self.max_tokens,
            )
        except Exception as exc:  # pragma: no cover
            # Best effort: a failed probe drops this candidate, not the run.
            LOG.warning('counterfactual probe failed at event=%s: %s',
                        candidate.event_id, exc)
            return None
        parsed = extract_json_block(result.text)
        # Only a JSON *object* is usable downstream (callers use ``.get``);
        # an array or scalar reply is treated like an unparsable one.
        if not isinstance(parsed, dict):
            return None
        return parsed

    @staticmethod
    def _coerce_float(value: Any, default: float) -> float:
        """Convert ``value`` to a float, returning ``default`` if unusable.

        ``default`` is returned when the value cannot be converted, overflows,
        or is NaN (which would make sort comparisons meaningless).
        """
        try:
            number = float(value)
        except (TypeError, ValueError, OverflowError):
            return default
        if number != number:  # NaN is the only float unequal to itself
            return default
        return number

    @classmethod
    def _probability(cls, value: Any) -> float:
        """Coerce ``value`` to a float clamped to [0, 1]; unusable input is 0.0."""
        return min(1.0, max(0.0, cls._coerce_float(value, 0.0)))
752
+
753
+
569
754
  __all__ = [
570
755
  'Attributor', 'Blame', 'AttributionResult',
571
756
  'HeuristicAttributor', 'AllAtOnceAttributor', 'StepByStepAttributor',
572
- 'BinarySearchAttributor',
757
+ 'BinarySearchAttributor', 'CounterfactualAttributor',
573
758
  ]
File without changes
File without changes
File without changes