PyPI - agentdebugx - Versions diffs - 0.2.6__tar.gz → 0.2.7__tar.gz - Mend

agentdebugx 0.2.6__tar.gz → 0.2.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (69) hide show

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: agentdebugx
-Version: 0.2.6
+Version: 0.2.7
 Summary: Portable error analysis, tracing, and recovery framework for agentic AI systems. Import as `agentdebug`.
 License: MIT
 License-File: LICENSE

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/23_status_v0_2.md RENAMED Viewed

@@ -21,6 +21,7 @@ the forward-looking plan; this doc is the rear-view mirror.
 | Attribution | `agentdebug.attribution.AllAtOnceAttributor` | ✅ stable | mocked LLM + fallback |
 | Attribution | `agentdebug.attribution.StepByStepAttributor` | ✅ **new 0.2.2** | scripted-LLM + fallback |
 | Attribution | `agentdebug.attribution.BinarySearchAttributor` | ✅ **new 0.2.3** | oracle-LLM logarithmic convergence + fallback + render elision |
+| Attribution | `agentdebug.attribution.CounterfactualAttributor` | ✅ **new 0.2.7** | scripted-rescue-prob ranking + candidate selection priority (findings → errors → tail) + dual fallback (no candidates / silent LLM) |
 | Recovery | `agentdebug.recovery.ReflexionSuggestion` | ✅ stable | per-finding + empty |
 | Recovery | `agentdebug.recovery.CriticRecoverer` + `VerifierSpec` registry | ✅ **new 0.2.3** | 5 family-matched verifier templates; dedup + custom-override |
 | DeepDebug | `agentdebug.deep.DeepDebugAnalyzer` | ✅ stable | full loop + silent LLM |
@@ -47,7 +48,7 @@ across 32 source files.
 | [06_detectors.md](./06_detectors.md) | `trajectory_perplexity` (TrajAD) | needs token-level LM perplexity API or embedding model + baseline calibration | v0.3 |
 | [06_detectors.md](./06_detectors.md) | `topic_drift` (embedding cosine) | needs embedding client; consider reusing `OpenAICompatClient` `/embeddings` | v0.3 |
 | [06_detectors.md](./06_detectors.md) | LTL spec monitors | requires user-supplied spec or LLM-synthesized monitors; gated on RV research | v1.2 |
-| [07_attribution.md](./07_attribution.md) | `CounterfactualAttributor` | requires re-rolling agent actions; framework-replay dependent | v0.3 |
+| [07_attribution.md](./07_attribution.md) | `CounterfactualAttributor` — *real* replay variant | true re-rollout requires framework-specific replay surface; the v0.2.7 LLM-simulated variant ships now, the real-replay variant is gated on adapter support (LangGraph checkpointer / OpenHands rewind) | v0.4 |
 | [07_attribution.md](./07_attribution.md) | `SBFLAttributor` (Tarantula/Ochiai) | needs corpus of passing + failing traces of same task; gated on Hub adoption | v0.4 |
 | [07_attribution.md](./07_attribution.md) | `DeltaDebugAttributor` (Zeller) | same replay constraint | v0.3 |
 | [07_attribution.md](./07_attribution.md) | `EnsembleAttributor` | trivial once Counterfactual lands; awaits Counterfactual | v0.3 |

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "agentdebugx"
-version = "0.2.6"
+version = "0.2.7"
 description = "Portable error analysis, tracing, and recovery framework for agentic AI systems. Import as `agentdebug`."
 authors = ["ULab @ UIUC <ulab@illinois.edu>"]
 license = "MIT"

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/__init__.py RENAMED Viewed

@@ -15,6 +15,7 @@ from agentdebug.attribution import (
     Attributor,
     BinarySearchAttributor,
     Blame,
+    CounterfactualAttributor,
     HeuristicAttributor,
     StepByStepAttributor,
 )
@@ -63,6 +64,7 @@ __all__ = [
     'BusEvent',
     'BinarySearchAttributor',
     'CascadeFrame',
+    'CounterfactualAttributor',
     'CriticRecoverer',
     'DEFAULT_VERIFIERS',
     'Detector',
@@ -96,4 +98,4 @@ __all__ = [
     'get_failure_mode',
 ]
-__version__ = '0.2.6'
+__version__ = '0.2.7'

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/attribution.py RENAMED Viewed

@@ -566,8 +566,193 @@ def _EVENT_ELLIPSIS(count: int) -> _EllipsisEvent:
     return _EllipsisEvent(count=count)
+_COUNTERFACTUAL_SYSTEM_PROMPT = """You are AgentDebugX-Attributor running an
+LLM-simulated counterfactual replay (AgenTracer-style, arXiv:2509.03312).
+You will be given the goal, the full trajectory, and ONE CANDIDATE STEP. Your
+job is to estimate whether the agent would have succeeded if THAT step had
+been done correctly — leaving everything else the same. This isolates the
+step's causal contribution to the failure.
+CRITICAL OUTPUT RULES (these maximize the chance your reply parses):
+1. Output ONLY a JSON object. No prose before/after. No markdown fences.
+2. Keep "rationale" to ONE short sentence (<= 200 chars).
+3. Do NOT include newlines inside string values.
+4. Emit the JSON object COMPLETE.
+Schema:
+{
+  "rescue_probability": <0..1>,
+  "confidence": <0..1>,
+  "rationale": "<short>",
+  "would_block_downstream_failures": true | false
+}
+Higher rescue_probability = correcting this step would more likely have
+rescued the run; this step is therefore more responsible for the failure.
+"""
+class CounterfactualAttributor:
+    """LLM-simulated counterfactual replay.
+    For each of K candidate steps (top-K from prior findings, or
+    error-bearing events, or the tail of the trajectory) ask the LLM:
+    "if this step had been correct, would the rest of the trajectory still
+    fail?" Steps with the highest rescue-probability become the top blame
+    hypotheses. Costs O(K) LLM calls — comparable to AllAtOnce, with a
+    stronger causal claim per probe.
+    This is *simulated* counterfactual, not real re-rollout — strictly
+    weaker than AgenTracer's actual replay, but framework-independent and
+    runnable today against any LLM. When the underlying framework gains a
+    real replay surface (LangGraph checkpointer, OpenHands rewind), wire
+    that in as an alternative ``replay_fn`` and the algorithm carries over.
+    """
+    id = 'counterfactual'
+    def __init__(
+        self,
+        llm: LLMClient,
+        *,
+        max_candidates: int = 5,
+        max_tokens: int = 2048,
+        fallback: Optional[Attributor] = None,
+    ) -> None:
+        self.llm = llm
+        self.max_candidates = max_candidates
+        self.max_tokens = max_tokens
+        self.fallback: Attributor = fallback or HeuristicAttributor()
+    def attribute(
+        self,
+        trajectory: AgentTrajectory,
+        findings: List[FailureFinding],
+    ) -> AttributionResult:
+        candidates = self._pick_candidates(trajectory, findings)
+        if not candidates:
+            return self.fallback.attribute(trajectory, findings)
+        ranked: List[tuple[AgentEvent, Dict[str, Any]]] = []
+        for evt in candidates:
+            verdict = self._ask_counterfactual(trajectory, evt)
+            if verdict is None:
+                continue
+            ranked.append((evt, verdict))
+        if not ranked:
+            return self.fallback.attribute(trajectory, findings)
+        # Sort by rescue_probability desc, tie-break by confidence.
+        ranked.sort(
+            key=lambda r: (
+                -self._coerce_float(r[1].get('rescue_probability'), 0.0),
+                -self._coerce_float(r[1].get('confidence'), 0.0),
+            )
+        )
+        hypotheses: List[Blame] = []
+        for evt, verdict in ranked:
+            hypotheses.append(Blame(
+                span_id=evt.event_id,
+                step_index=evt.step_index,
+                agent_name=evt.agent_name,
+                confidence=self._coerce_float(verdict.get('rescue_probability'), 0.0),
+                rationale=(
+                    str(verdict.get('rationale') or 'no rationale')
+                    + f' [rescue_probability={verdict.get("rescue_probability")}]'
+                ),
+                evidence=[
+                    f'event_id={evt.event_id}',
+                    f'step={evt.step_index}',
+                ],
+                sources=[self.id],
+            ))
+        return AttributionResult(
+            method=self.id,
+            hypotheses=hypotheses,
+            raw={'candidates_probed': len(ranked)},
+        )
+    def _pick_candidates(
+        self,
+        trajectory: AgentTrajectory,
+        findings: List[FailureFinding],
+    ) -> List[AgentEvent]:
+        events_by_id = {e.event_id: e for e in trajectory.events}
+        candidates: List[AgentEvent] = []
+        seen: set[str] = set()
+        # 1. Prior findings (the judge already nominated suspects).
+        for f in findings:
+            evt = events_by_id.get(f.event_id) if f.event_id else None
+            if evt is not None and evt.event_id not in seen:
+                candidates.append(evt)
+                seen.add(evt.event_id)
+                if len(candidates) >= self.max_candidates:
+                    return candidates
+        # 2. Events that recorded an error directly.
+        for evt in trajectory.events:
+            if evt.error and evt.event_id not in seen:
+                candidates.append(evt)
+                seen.add(evt.event_id)
+                if len(candidates) >= self.max_candidates:
+                    return candidates
+        # 3. Fallback: tail of the trajectory (failure most often manifests there).
+        for evt in reversed(trajectory.events):
+            if evt.event_id not in seen:
+                candidates.append(evt)
+                seen.add(evt.event_id)
+                if len(candidates) >= self.max_candidates:
+                    return candidates
+        return candidates
+    def _ask_counterfactual(
+        self, trajectory: AgentTrajectory, candidate: AgentEvent,
+    ) -> Optional[Dict[str, Any]]:
+        events_doc = '\n'.join(
+            f'event_id={e.event_id} step={e.step_index} agent={e.agent_name} '
+            f'type={getattr(e.event_type, "value", e.event_type)} '
+            f'output={str(e.output)[:200]} error={str(e.error)[:200]}'
+            for e in trajectory.events
+        )
+        user = (
+            f'GOAL: {trajectory.goal!r}\n'
+            f'FRAMEWORK: {trajectory.framework!r}\n\n'
+            f'FULL TRAJECTORY:\n{events_doc}\n\n'
+            f'CANDIDATE STEP TO COUNTERFACTUALLY CORRECT:\n'
+            f'  event_id={candidate.event_id}\n'
+            f'  step={candidate.step_index} agent={candidate.agent_name}\n'
+            f'  module={candidate.module}\n'
+            f'  input={str(candidate.input)[:300]}\n'
+            f'  output={str(candidate.output)[:300]}\n'
+            f'  error={str(candidate.error)[:300]}\n\n'
+            f'Question: if this step had been DONE CORRECTLY, what is the '
+            f'probability the run would have succeeded?'
+        )
+        try:
+            result = self.llm.complete(
+                messages=[
+                    {'role': 'system', 'content': _COUNTERFACTUAL_SYSTEM_PROMPT},
+                    {'role': 'user', 'content': user},
+                ],
+                max_tokens=self.max_tokens,
+            )
+        except Exception as exc:  # pragma: no cover
+            LOG.warning('counterfactual probe failed at event=%s: %s',
+                        candidate.event_id, exc)
+            return None
+        parsed = extract_json_block(result.text)
+        if parsed is None:
+            return None
+        return cast(Dict[str, Any], parsed)
+    @staticmethod
+    def _coerce_float(value: Any, default: float) -> float:
+        try:
+            return float(value)
+        except (TypeError, ValueError):
+            return default
 __all__ = [
     'Attributor', 'Blame', 'AttributionResult',
     'HeuristicAttributor', 'AllAtOnceAttributor', 'StepByStepAttributor',
-    'BinarySearchAttributor',
+    'BinarySearchAttributor', 'CounterfactualAttributor',
 ]

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/LICENSE RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/README.md RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/00_overview.md RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/01_literature_survey.md RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/02_architecture.md RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/03_taxonomy.md RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/04_trace_schema.md RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/05_adapters.md RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/06_detectors.md RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/07_attribution.md RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/08_recovery.md RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/09_error_database.md RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/10_taxonomy_induction.md RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/11_multimodal.md RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/12_ui_dashboard.md RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/13_class_design.md RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/14_api_reference.md RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/15_roadmap.md RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/16_governance.md RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/17_claude_code_design_patterns.md RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/18_comparison_codex_vs_design.md RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/19_error_hub.md RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/20_deep_debug.md RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/21_integrations.md RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/22_industry_track_paper_eval_plan.md RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/ERROR_TAXONOMY.md RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/OPEN_SOURCE_DEVELOPMENT_PLAN.md RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/README.md RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/RESEARCH_SURVEY.md RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/benchmarks/e2e_v0_2_3.md RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/benchmarks/e2e_v0_2_4.md RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/benchmarks/v0_1_smoke.json RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/benchmarks/v0_1_smoke.md RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/docs/benchmarks/who_when_v0_2_6_leaderboard.md RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/adapters/__init__.py RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/adapters/base.py RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/adapters/crewai.py RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/adapters/langgraph.py RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/adapters/otel.py RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/adapters/raw.py RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/analyzers.py RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/cli.py RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/deep.py RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/detectors.py RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/events.py RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/hub/__init__.py RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/hub/backend_base.py RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/hub/backends.py RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/hub/bundle.py RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/hub/scrub.py RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/instrumentation.py RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/integrations/__init__.py RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/integrations/claude_skill.py RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/integrations/openhands.py RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/judges.py RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/llm.py RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/models.py RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/recorder.py RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/recovery.py RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/storage.py RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/taxonomy.py RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/traceback.py RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/ui/__init__.py RENAMED Viewed

File without changes

{agentdebugx-0.2.6 → agentdebugx-0.2.7}/src/agentdebug/ui/server.py RENAMED Viewed

File without changes

agentdebugx 0.2.6__tar.gz → 0.2.7__tar.gz

agentdebugx 0.2.6__tar.gz → 0.2.7__tar.gz