agentdebugx 0.2.2__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/PKG-INFO +1 -1
  2. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/23_status_v0_2.md +16 -9
  3. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/pyproject.toml +1 -1
  4. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/__init__.py +14 -2
  5. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/attribution.py +171 -0
  6. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/models.py +12 -0
  7. agentdebugx-0.2.3/src/agentdebug/recovery.py +314 -0
  8. agentdebugx-0.2.2/src/agentdebug/recovery.py +0 -113
  9. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/LICENSE +0 -0
  10. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/README.md +0 -0
  11. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/00_overview.md +0 -0
  12. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/01_literature_survey.md +0 -0
  13. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/02_architecture.md +0 -0
  14. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/03_taxonomy.md +0 -0
  15. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/04_trace_schema.md +0 -0
  16. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/05_adapters.md +0 -0
  17. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/06_detectors.md +0 -0
  18. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/07_attribution.md +0 -0
  19. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/08_recovery.md +0 -0
  20. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/09_error_database.md +0 -0
  21. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/10_taxonomy_induction.md +0 -0
  22. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/11_multimodal.md +0 -0
  23. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/12_ui_dashboard.md +0 -0
  24. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/13_class_design.md +0 -0
  25. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/14_api_reference.md +0 -0
  26. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/15_roadmap.md +0 -0
  27. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/16_governance.md +0 -0
  28. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/17_claude_code_design_patterns.md +0 -0
  29. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/18_comparison_codex_vs_design.md +0 -0
  30. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/19_error_hub.md +0 -0
  31. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/20_deep_debug.md +0 -0
  32. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/21_integrations.md +0 -0
  33. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/22_industry_track_paper_eval_plan.md +0 -0
  34. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/ERROR_TAXONOMY.md +0 -0
  35. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/OPEN_SOURCE_DEVELOPMENT_PLAN.md +0 -0
  36. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/README.md +0 -0
  37. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/RESEARCH_SURVEY.md +0 -0
  38. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/benchmarks/v0_1_smoke.json +0 -0
  39. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/benchmarks/v0_1_smoke.md +0 -0
  40. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/adapters/__init__.py +0 -0
  41. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/adapters/base.py +0 -0
  42. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/adapters/langgraph.py +0 -0
  43. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/adapters/otel.py +0 -0
  44. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/adapters/raw.py +0 -0
  45. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/analyzers.py +0 -0
  46. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/cli.py +0 -0
  47. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/deep.py +0 -0
  48. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/detectors.py +0 -0
  49. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/events.py +0 -0
  50. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/hub/__init__.py +0 -0
  51. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/hub/backend_base.py +0 -0
  52. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/hub/backends.py +0 -0
  53. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/hub/bundle.py +0 -0
  54. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/hub/scrub.py +0 -0
  55. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/instrumentation.py +0 -0
  56. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/integrations/__init__.py +0 -0
  57. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/integrations/claude_skill.py +0 -0
  58. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/integrations/openhands.py +0 -0
  59. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/judges.py +0 -0
  60. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/llm.py +0 -0
  61. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/recorder.py +0 -0
  62. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/storage.py +0 -0
  63. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/taxonomy.py +0 -0
  64. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/traceback.py +0 -0
  65. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/ui/__init__.py +0 -0
  66. {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/ui/server.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: agentdebugx
3
- Version: 0.2.2
3
+ Version: 0.2.3
4
4
  Summary: Portable error analysis, tracing, and recovery framework for agentic AI systems. Import as `agentdebug`.
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -1,4 +1,4 @@
1
- # 23 — Capability + Test Coverage Status (v0.2.2)
1
+ # 23 — Capability + Test Coverage Status (v0.2.3)
2
2
 
3
3
  A live audit of what's implemented, what's tested, and what's specced but
4
4
  not yet built. Pair this with [docs/15_roadmap.md](./15_roadmap.md), which is
@@ -20,7 +20,9 @@ the forward-looking plan; this doc is the rear-view mirror.
20
20
  | Attribution | `agentdebug.attribution.HeuristicAttributor` | ✅ stable | first-finding + tiebreak |
21
21
  | Attribution | `agentdebug.attribution.AllAtOnceAttributor` | ✅ stable | mocked LLM + fallback |
22
22
  | Attribution | `agentdebug.attribution.StepByStepAttributor` | ✅ **new 0.2.2** | scripted-LLM + fallback |
23
+ | Attribution | `agentdebug.attribution.BinarySearchAttributor` | ✅ **new 0.2.3** | oracle-LLM logarithmic convergence + fallback + render elision |
23
24
  | Recovery | `agentdebug.recovery.ReflexionSuggestion` | ✅ stable | per-finding + empty |
25
+ | Recovery | `agentdebug.recovery.CriticRecoverer` + `VerifierSpec` registry | ✅ **new 0.2.3** | 5 family-matched verifier templates; dedup + custom-override |
24
26
  | DeepDebug | `agentdebug.deep.DeepDebugAnalyzer` | ✅ stable | full loop + silent LLM |
25
27
  | Cascade view | `agentdebug.traceback.format_traceback` | ✅ stable | cascade + step-order + ANSI + empty |
26
28
  | Detectors | `agentdebug.detectors.RepeatedToolCall / RepeatedState / StepCountLimit` | ✅ **new 0.2.2** | threshold + window + budget |
@@ -45,13 +47,11 @@ across 32 source files.
45
47
  | [06_detectors.md](./06_detectors.md) | `trajectory_perplexity` (TrajAD) | needs token-level LM perplexity API or embedding model + baseline calibration | v0.3 |
46
48
  | [06_detectors.md](./06_detectors.md) | `topic_drift` (embedding cosine) | needs embedding client; consider reusing `OpenAICompatClient` `/embeddings` | v0.3 |
47
49
  | [06_detectors.md](./06_detectors.md) | LTL spec monitors | requires user-supplied spec or LLM-synthesized monitors; gated on RV research | v1.2 |
48
- | [07_attribution.md](./07_attribution.md) | `BinarySearchAttributor` (ddmin) | requires replayable environment; few frameworks expose it | v0.3 |
49
- | [07_attribution.md](./07_attribution.md) | `CounterfactualAttributor` | requires re-rolling agent actions; same replay constraint | v0.3 |
50
+ | [07_attribution.md](./07_attribution.md) | `CounterfactualAttributor` | requires re-rolling agent actions; framework-replay dependent | v0.3 |
50
51
  | [07_attribution.md](./07_attribution.md) | `SBFLAttributor` (Tarantula/Ochiai) | needs corpus of passing + failing traces of same task; gated on Hub adoption | v0.4 |
51
52
  | [07_attribution.md](./07_attribution.md) | `DeltaDebugAttributor` (Zeller) | same replay constraint | v0.3 |
52
- | [07_attribution.md](./07_attribution.md) | `EnsembleAttributor` | trivial once 2+ heavy backends ship; awaits BinarySearch/Counterfactual | v0.3 |
53
+ | [07_attribution.md](./07_attribution.md) | `EnsembleAttributor` | trivial once Counterfactual lands; awaits Counterfactual | v0.3 |
53
54
  | [08_recovery.md](./08_recovery.md) | `SelfRefineLoop` | small but needs a generator-critic-refiner orchestration | v0.3 |
54
- | [08_recovery.md](./08_recovery.md) | `CriticRecoverer` | needs a verifier registry (search, code-exec, type-check) | v0.3 |
55
55
  | [08_recovery.md](./08_recovery.md) | `AutoManualRules` | needs persistent project manual + injection into next-run prompts | v0.3 |
56
56
  | [08_recovery.md](./08_recovery.md) | `LangGraphRewind` | depends on LangGraph checkpointer; ships when we have a real LangGraph user | v0.3 |
57
57
  | [08_recovery.md](./08_recovery.md) | `SagaRollback` | needs compensation registry on tool definitions; new schema | v0.3 |
@@ -97,10 +97,17 @@ remaining gaps are deliberate:
97
97
 
98
98
  Before v0.3 ships, this doc should record green checkmarks for:
99
99
 
100
- - [ ] One replayable counterfactual attributor (`BinarySearchAttributor` is
101
- the cheapest entry).
102
- - [ ] One tool-grounded recovery strategy (`CriticRecoverer`) wired against
103
- a `Verifier` Protocol.
100
+ - [x] **Logarithmic-cost attributor** (`BinarySearchAttributor`) shipped in
101
+ 0.2.3 — Who&When method 3, O(log N) LLM calls; bisects the trajectory
102
+ via prefix evaluation. **Note:** this is not yet a "replayable
103
+ counterfactual" attributor; it predicts whether the failure has
104
+ already occurred from the prefix without re-rolling the agent. True
105
+ counterfactual replay is still v0.3.
106
+ - [x] **Tool-grounded recovery strategy** (`CriticRecoverer` + `VerifierSpec`
107
+ registry) shipped in 0.2.3 — pattern-matches failure modes against 5
108
+ default verifier templates (JSON-schema guard, final-state check,
109
+ tool-result type-check, handoff contract, loop-detector guard) and
110
+ emits per-finding `FixProposal` with rationale + suggested code.
104
111
  - [ ] One additional framework adapter that goes through the full conformance
105
112
  suite (CrewAI is the most-requested).
106
113
  - [ ] HuggingFace Hub round-trip live test (gated on `HF_TOKEN`).
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "agentdebugx"
3
- version = "0.2.2"
3
+ version = "0.2.3"
4
4
  description = "Portable error analysis, tracing, and recovery framework for agentic AI systems. Import as `agentdebug`."
5
5
  authors = ["ULab @ UIUC <ulab@illinois.edu>"]
6
6
  license = "MIT"
@@ -13,6 +13,7 @@ from agentdebug.attribution import (
13
13
  AllAtOnceAttributor,
14
14
  AttributionResult,
15
15
  Attributor,
16
+ BinarySearchAttributor,
16
17
  Blame,
17
18
  HeuristicAttributor,
18
19
  StepByStepAttributor,
@@ -38,7 +39,14 @@ from agentdebug.models import (
38
39
  Modality,
39
40
  )
40
41
  from agentdebug.recorder import AgentDebug, TraceSession
41
- from agentdebug.recovery import FixProposal, Recoverer, ReflexionSuggestion
42
+ from agentdebug.recovery import (
43
+ DEFAULT_VERIFIERS,
44
+ CriticRecoverer,
45
+ FixProposal,
46
+ Recoverer,
47
+ ReflexionSuggestion,
48
+ VerifierSpec,
49
+ )
42
50
  from agentdebug.traceback import CascadeFrame, build_cascade, format_traceback
43
51
  from agentdebug.storage import JsonlTraceStore, SQLiteTraceStore
44
52
  from agentdebug.taxonomy import SEED_FAILURE_MODES, get_failure_mode
@@ -53,13 +61,17 @@ __all__ = [
53
61
  'Attributor',
54
62
  'Blame',
55
63
  'BusEvent',
64
+ 'BinarySearchAttributor',
56
65
  'CascadeFrame',
66
+ 'CriticRecoverer',
67
+ 'DEFAULT_VERIFIERS',
57
68
  'Detector',
58
69
  'DetectorConfig',
59
70
  'RepeatedStateDetector',
60
71
  'RepeatedToolCallDetector',
61
72
  'StepByStepAttributor',
62
73
  'StepCountLimitDetector',
74
+ 'VerifierSpec',
63
75
  'build_cascade',
64
76
  'default_detectors',
65
77
  'format_traceback',
@@ -84,4 +96,4 @@ __all__ = [
84
96
  'get_failure_mode',
85
97
  ]
86
98
 
87
- __version__ = '0.2.2'
99
+ __version__ = '0.2.3'
@@ -21,6 +21,10 @@ import logging
21
21
  from dataclasses import dataclass, field
22
22
  from typing import Any, Dict, List, Optional, Protocol, cast
23
23
 
24
+
25
+ # NOTE: BinarySearchAttributor._render_event references _EllipsisEvent, which
26
+ # is defined near the end of this module and is only resolved at call time.
27
+
24
28
  from agentdebug.llm import LLMClient, extract_json_block
25
29
  from agentdebug.models import AgentEvent, AgentTrajectory, FailureFinding, new_id
26
30
 
@@ -227,6 +231,27 @@ class AllAtOnceAttributor:
227
231
  return [str(value)]
228
232
 
229
233
 
234
+ _BISECT_SYSTEM_PROMPT = """You are AgentDebugX-Attributor running the
235
+ Who&When "Binary-Search" attribution method (arXiv:2505.00212). You will be
236
+ shown a PREFIX of a failed agent trajectory truncated to its first N events.
237
+
238
+ Decide whether the failure has ALREADY occurred within this prefix, i.e.,
239
+ whether the trajectory is unrecoverable as of the last event shown.
240
+
241
+ Respond ONLY with a JSON object (no prose, no markdown):
242
+
243
+ {
244
+ "failure_already_happened": true | false,
245
+ "confidence": <float in [0,1]>,
246
+ "rationale": "<one or two sentences>",
247
+ "decisive_event_id": "<event_id or null>"
248
+ }
249
+
250
+ Be conservative: only return true when you can point to evidence in the
251
+ prefix that the agent has already taken (or omitted) the decisive step.
252
+ """
253
+
254
+
230
255
  _STEP_SYSTEM_PROMPT = """You are AgentDebugX-Attributor, scanning a failed
231
256
  agent trajectory one step at a time (the Who&When "Step-by-Step" method,
232
257
  arXiv:2505.00212).
@@ -393,7 +418,153 @@ class StepByStepAttributor:
393
418
  return [str(value)]
394
419
 
395
420
 
421
class BinarySearchAttributor:
    """LLM-based attributor implementing Who&When's Binary-Search method.

    Bisects the trajectory and asks the LLM whether the failure has already
    occurred in each prefix. Costs O(log N) LLM calls vs StepByStep's O(N).

    The contract:

    * Pre-condition: the trajectory is known to have failed overall.
    * Loop invariant: ``failure_already_happened`` is assumed False for the
      prefix of length ``lo`` and True for the prefix of length ``hi``. The
      decisive step lives in ``(lo, hi]``.
    * Termination: ``hi - lo == 1``; ``hi - 1`` is the decisive index.

    Returns the event at the decisive index as the primary Blame hypothesis.
    Falls back to the configured ``fallback`` attributor when the trajectory
    is empty or any LLM response is uninterpretable (call failed, no JSON
    object, or no usable ``failure_already_happened`` value).
    """

    id = 'binary_search'

    def __init__(
        self,
        llm: LLMClient,
        *,
        fallback: Optional[Attributor] = None,
        max_tokens: int = 1024,
        context_window: int = 6,
    ) -> None:
        """Store the LLM client and rendering/cost knobs.

        Args:
            llm: client used for every prefix probe.
            fallback: attributor used when bisection cannot proceed;
                defaults to ``HeuristicAttributor()``.
            max_tokens: ``max_tokens`` forwarded to each ``llm.complete`` call.
            context_window: number of events kept at the head and at the
                tail of a rendered prefix; the middle is elided.
        """
        self.llm = llm
        self.fallback: Attributor = fallback or HeuristicAttributor()
        self.max_tokens = max_tokens
        # When formatting a prefix into the LLM prompt we only keep this many
        # events at the head + this many at the tail; the middle is elided.
        # Keeps cost bounded for very long trajectories.
        self.context_window = context_window

    def attribute(
        self,
        trajectory: AgentTrajectory,
        findings: List[FailureFinding],
    ) -> AttributionResult:
        """Locate the decisive event by bisecting over prefix lengths.

        Args:
            trajectory: the failed trajectory to analyse.
            findings: detector findings; only forwarded to the fallback.

        Returns:
            An ``AttributionResult`` with a single ``Blame`` hypothesis, or
            whatever the fallback attributor returns when bisection is not
            possible.
        """
        n = len(trajectory.events)
        if n == 0:
            return self.fallback.attribute(trajectory, findings)
        lo, hi = 0, n
        probe_count = 0
        # Safety cap of ceil(log2(n)) + 2 probes. ``(m - 1).bit_length()`` is
        # exactly ceil(log2(m)) for m >= 2 and avoids float rounding.
        max_probes = (max(n, 2) - 1).bit_length() + 2
        while hi - lo > 1 and probe_count < max_probes:
            mid = (lo + hi) // 2
            probe_count += 1
            already = self._verdict_flag(self._probe(trajectory, mid))
            if already is None:
                # Uninterpretable response: fall back rather than guess.
                return self.fallback.attribute(trajectory, findings)
            if already:
                hi = mid
            else:
                lo = mid
        decisive_index = hi - 1
        decisive = trajectory.events[decisive_index]
        return AttributionResult(
            method=self.id,
            hypotheses=[Blame(
                span_id=decisive.event_id,
                step_index=decisive.step_index,
                agent_name=decisive.agent_name,
                confidence=0.6 + 0.1 * min(probe_count, 4),
                rationale=(
                    f'Binary search located the decisive step within '
                    f'{probe_count} probes over {n} events.'
                ),
                evidence=[
                    f'event_id={decisive.event_id}',
                    f'step={decisive.step_index}',
                ],
                sources=[self.id],
            )],
            raw={'probe_count': probe_count, 'trajectory_len': n},
        )

    @staticmethod
    def _verdict_flag(verdict: Optional[Dict[str, Any]]) -> Optional[bool]:
        """Extract ``failure_already_happened`` as a strict boolean.

        Returns ``None`` when the verdict is missing or the value is neither
        a bool nor a recognisable "true"/"false" string, so that a string
        such as ``"false"`` is never coerced to ``True`` by truthiness.
        """
        if verdict is None:
            return None
        value = verdict.get('failure_already_happened')
        if isinstance(value, bool):
            return value
        if isinstance(value, str):
            lowered = value.strip().lower()
            if lowered == 'true':
                return True
            if lowered == 'false':
                return False
        return None

    def _probe(
        self, trajectory: AgentTrajectory, prefix_len: int
    ) -> Optional[Dict[str, Any]]:
        """Ask the LLM about the first ``prefix_len`` events.

        Returns the parsed JSON object, or ``None`` when the call raised or
        the reply did not contain a JSON object.
        """
        prefix = trajectory.prefix(prefix_len)
        # Render prefix with head + tail elision so long prefixes stay cheap.
        events_doc = self._render_prefix(prefix)
        user = (
            f'GOAL: {trajectory.goal!r}\n'
            f'FRAMEWORK: {trajectory.framework!r}\n\n'
            f'PREFIX (events 1..{prefix_len} of {len(trajectory.events)}):\n'
            f'{events_doc}'
        )
        try:
            result = self.llm.complete(
                messages=[
                    {'role': 'system', 'content': _BISECT_SYSTEM_PROMPT},
                    {'role': 'user', 'content': user},
                ],
                max_tokens=self.max_tokens,
            )
        except Exception as exc:  # pragma: no cover
            LOG.warning('binary_search probe at prefix_len=%s failed: %s',
                        prefix_len, exc)
            return None
        parsed = extract_json_block(result.text)
        # A bare list/number/string is not a usable verdict object.
        if not isinstance(parsed, dict):
            return None
        return parsed

    def _render_prefix(self, prefix: AgentTrajectory) -> str:
        """Render ``prefix`` one event per line, eliding the middle if long."""
        events = prefix.events
        # Clamp so a zero/negative window cannot turn ``events[-0:]`` into
        # "the whole list" and duplicate every event after the ellipsis.
        window = max(0, self.context_window)
        total = len(events)
        if total <= 2 * window:
            view: List[Any] = list(events)
        else:
            head = list(events[:window])
            tail = list(events[total - window:])
            elided = total - 2 * window
            view = head + [_EllipsisEvent(count=elided)] + tail
        return '\n'.join(self._render_event(e) for e in view)

    @staticmethod
    def _render_event(event: Any) -> str:
        """Format one event (or an elision marker) as a single prompt line."""
        if isinstance(event, _EllipsisEvent):
            return f'... ({event.count} events elided) ...'
        return (
            f'event_id={event.event_id} step={event.step_index} '
            f'agent={event.agent_name} '
            f'type={getattr(event.event_type, "value", event.event_type)} '
            f'output={str(event.output)[:200]} '
            f'error={str(event.error)[:200]}'
        )
555
+
556
+
557
@dataclass
class _EllipsisEvent:
    """Marker standing in for a run of events omitted from a rendered prefix."""

    # How many real events this marker replaces.
    count: int


def _EVENT_ELLIPSIS(count: int) -> _EllipsisEvent:
    """Build the elision marker representing ``count`` omitted events."""
    marker = _EllipsisEvent(count)
    return marker
564
+
565
+
396
566
  __all__ = [
397
567
  'Attributor', 'Blame', 'AttributionResult',
398
568
  'HeuristicAttributor', 'AllAtOnceAttributor', 'StepByStepAttributor',
569
+ 'BinarySearchAttributor',
399
570
  ]
@@ -102,6 +102,18 @@ class AgentTrajectory(BaseModel):
102
102
  self.events.append(event)
103
103
  return event
104
104
 
105
def prefix(self, n: int) -> 'AgentTrajectory':
    """Return a copy of this trajectory holding only its first ``n`` events.

    Used by replay/attribution backends (Binary-Search, Delta-Debugging)
    that need to ask "would the trajectory still fail if it had stopped
    at step k?" A negative ``n`` is treated as 0.

    The copy is SHALLOW: it gets its own ``events`` list, so adding or
    removing events on the copy leaves the original list untouched, but the
    event objects themselves are shared with the original.
    """
    keep = n if n > 0 else 0
    # ``model_copy`` is used when the model provides it; otherwise ``copy``.
    if hasattr(self, 'model_copy'):
        clone = self.model_copy(deep=False)
    else:
        clone = self.copy(deep=False)
    clone.events = list(self.events[:keep])
    return clone
116
+
105
117
 
106
118
  class FailureMode(BaseModel):
107
119
  """A seed or generated taxonomy node."""
@@ -0,0 +1,314 @@
1
+ """Lightweight recovery suggestions.
2
+
3
+ v0.1 ships ``ReflexionSuggestion`` — a *suggest-only* recovery generator that
4
+ produces a structured retry-prompt artifact based on Reflexion (Shinn et al.,
5
+ NeurIPS 2023, arXiv:2303.11366). v0.2.3 adds ``CriticRecoverer``, a suggest-only
6
+ take on CRITIC. Heavier strategies (Self-Refine loop, Saga rollback, MCTS) are
7
+ deferred per the roadmap and will land behind the same :class:`Recoverer` protocol.
8
+
9
+ By design, **nothing here re-executes the agent** — recovery proposals are
10
+ artifacts to be surfaced (CLI/UI/PR comment) or fed back into the next run.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from dataclasses import dataclass, field
16
+ from typing import List, Optional, Protocol, Tuple
17
+
18
+ from agentdebug.models import (
19
+ AgentTrajectory,
20
+ DiagnosticReport,
21
+ FailureFinding,
22
+ new_id,
23
+ )
24
+
25
+
26
@dataclass
class FixProposal:
    """A single suggest-only recovery artifact produced by a recoverer."""

    # Identifier of this proposal.
    proposal_id: str
    # ``id`` of the recoverer that produced the proposal.
    recoverer_id: str
    # Event the proposal refers to, when one is known.
    target_event_id: Optional[str]
    # One-line headline for the proposal.
    summary: str
    # Why this proposal is being made.
    rationale: str
    # Confidence score attached by the recoverer.
    confidence: float
    # Full text of the suggestion.
    suggestion_text: str
    # Declared side effects; each instance gets its own fresh list.
    side_effects: List[str] = field(default_factory=list)
    # Whether a human must approve before the proposal is acted on.
    requires_human_approval: bool = False
37
+
38
+
39
class Recoverer(Protocol):
    """Structural interface shared by every recovery-suggestion backend."""

    # Identifier of the recoverer.
    id: str

    def suggest(
        self, trajectory: AgentTrajectory, report: DiagnosticReport
    ) -> List[FixProposal]:
        """Return fix proposals for ``trajectory`` given its diagnostic ``report``."""
        ...
48
+
49
+
50
class ReflexionSuggestion:
    """Produce one Reflexion-style retry reflection for each finding.

    The result is text only: it may be appended to the agent's next system
    prompt, written to a project ``MANUAL.md``, or shown in the Console.
    Nothing is applied automatically.
    """

    id = 'reflexion'

    def suggest(
        self,
        trajectory: AgentTrajectory,
        report: DiagnosticReport,
    ) -> List[FixProposal]:
        """Return one proposal per finding in ``report`` (empty if none)."""
        findings = report.findings
        if not findings:
            return []
        return [self._build_proposal(trajectory, item) for item in findings]

    def _build_proposal(
        self, trajectory: AgentTrajectory, finding: FailureFinding
    ) -> FixProposal:
        """Turn a single finding into a textual retry hint."""
        mode = finding.failure_mode
        goal = trajectory.goal or '(no goal recorded)'
        framework = trajectory.framework or '(framework not declared)'

        evidence_lines = [f' - {e}' for e in finding.evidence]
        evidence_block = '\n'.join(evidence_lines) or ' (none)'

        # Prefer the finding's own suggestion, then the first template of the
        # failure mode, then a generic instruction.
        hint = finding.suggestion
        if not hint:
            if mode.suggestion_templates:
                hint = mode.suggestion_templates[0]
            else:
                hint = 'Inspect the offending step and constrain the agent at that point.'

        sections = [
            f'Task: {goal}',
            f'Framework: {framework}',
            f'Observed failure mode: {mode.mode_id} ({mode.name})',
            f'Located at agent={finding.agent_name}, step={finding.step_index}, '
            f'event_id={finding.event_id}',
            f'Evidence:\n{evidence_block}',
            f'Next time, do the following:\n {hint}',
        ]
        reflection = '\n'.join(sections) + '\n'

        headline = (
            f'Reflexion retry hint for {mode.mode_id} '
            f'at step {finding.step_index}'
        )
        why = (
            'Reflexion (Shinn et al., NeurIPS 2023) converts a failure '
            'into a verbal hint appended to next attempt.'
        )
        # Keep the reported confidence inside [0.1, 0.9].
        bounded = min(0.9, max(0.1, finding.confidence))
        return FixProposal(
            proposal_id=new_id('fix'),
            recoverer_id=self.id,
            target_event_id=finding.event_id,
            summary=headline,
            rationale=why,
            confidence=bounded,
            suggestion_text=reflection,
            side_effects=['memory.write'],
            requires_human_approval=False,
        )
111
+
112
+
113
@dataclass(frozen=True)
class VerifierSpec:
    """Template for a tool-grounded verifier that could have caught one
    family of failures.

    :class:`CriticRecoverer` uses these templates to recommend (not run)
    adding a verifier between the failing step and the next side-effect.
    """

    id: str
    description: str
    matches_families: tuple[str, ...]  # e.g. ('action',)
    matches_mode_prefixes: tuple[str, ...]  # e.g. ('action.format_error', 'action.parameter_error')
    suggested_code: str  # short snippet showing how to add the guard
    rationale: str

    def matches(self, finding: 'FailureFinding') -> bool:
        """Return True when this verifier applies to ``finding``.

        A finding matches if its failure-mode family is listed in
        ``matches_families``, or if its mode id starts with any entry of
        ``matches_mode_prefixes``.
        """
        if finding.failure_mode.family in self.matches_families:
            return True
        for candidate in self.matches_mode_prefixes:
            if finding.failure_mode.mode_id.startswith(candidate):
                return True
        return False
136
+
137
+
138
+ DEFAULT_VERIFIERS: List[VerifierSpec] = [
139
+ VerifierSpec(
140
+ id='json_schema_guard',
141
+ description='Validate tool arguments against the tool JSON schema before execution.',
142
+ matches_families=('action',),
143
+ matches_mode_prefixes=('action.format_error', 'action.parameter_error'),
144
+ suggested_code=(
145
+ 'from jsonschema import validate, ValidationError\n'
146
+ 'try:\n'
147
+ ' validate(instance=tool_args, schema=tool.schema)\n'
148
+ 'except ValidationError as exc:\n'
149
+ ' return handle_arg_error(exc, tool=tool, args=tool_args)\n'
150
+ ),
151
+ rationale=(
152
+ 'CRITIC (Gou et al., ICLR 2024): a tool-interactive verifier '
153
+ 'catches malformed/missing-argument failures before they hit '
154
+ 'the downstream API.'
155
+ ),
156
+ ),
157
+ VerifierSpec(
158
+ id='final_state_check',
159
+ description='Independent final-state verifier confirms the task is satisfied before terminating.',
160
+ matches_families=('verification', 'reflection'),
161
+ matches_mode_prefixes=('verification.', 'reflection.progress_misjudge'),
162
+ suggested_code=(
163
+ 'def verify_task_complete(goal: str, final_state: dict) -> bool:\n'
164
+ ' # Check every explicit success criterion; do NOT trust the\n'
165
+ ' # acting agent\'s self-report.\n'
166
+ ' return all(criterion(final_state) for criterion in success_criteria(goal))\n'
167
+ '\n'
168
+ 'if not verify_task_complete(goal, state):\n'
169
+ ' trigger_recovery_planning(reason="task not verified complete")\n'
170
+ ),
171
+ rationale=(
172
+ 'MAST (Cemri et al., 2025) shows premature termination + missing '
173
+ 'task validation are dominant multi-agent failure modes. A '
174
+ 'verifier that does not trust the acting agent is the standard '
175
+ 'fix.'
176
+ ),
177
+ ),
178
+ VerifierSpec(
179
+ id='tool_result_typecheck',
180
+ description='Type-check the tool result and require explicit handling of None / empty.',
181
+ matches_families=('action', 'system'),
182
+ matches_mode_prefixes=('action.wrong_tool', 'system.tool_execution_error'),
183
+ suggested_code=(
184
+ 'result = tool.run(args)\n'
185
+ 'if result is None or result == {}:\n'
186
+ ' return handle_empty_result(tool=tool, args=args)\n'
187
+ 'if not isinstance(result, tool.expected_return_type):\n'
188
+ ' return handle_unexpected_type(result, tool=tool)\n'
189
+ ),
190
+ rationale=(
191
+ 'Tool outputs that the agent does not branch on (None, empty '
192
+ 'list, unexpected type) propagate as "agent hallucinated a '
193
+ 'fact" downstream. Force the agent to handle them at the call '
194
+ 'site.'
195
+ ),
196
+ ),
197
+ VerifierSpec(
198
+ id='handoff_context_contract',
199
+ description='Require the receiving agent to restate critical constraints before proceeding.',
200
+ matches_families=('multiagent',),
201
+ matches_mode_prefixes=('multiagent.handoff_loss',),
202
+ suggested_code=(
203
+ 'def handoff(payload: HandoffPayload, to_agent: Agent) -> None:\n'
204
+ ' received = to_agent.read(payload)\n'
205
+ ' if not received.restates(payload.constraints):\n'
206
+ ' raise HandoffContractError(\n'
207
+ ' "receiver did not restate constraints"\n'
208
+ ' )\n'
209
+ ),
210
+ rationale=(
211
+ 'Who&When (Zhang et al., 2025): handoff context loss is the '
212
+ 'most common decisive multi-agent failure step. Typed handoff '
213
+ 'payloads + receiver restating prevent silent dropping.'
214
+ ),
215
+ ),
216
+ VerifierSpec(
217
+ id='loop_detector_guard',
218
+ description='Detect repeated tool calls / no-progress windows and trigger replan.',
219
+ matches_families=('planning',),
220
+ matches_mode_prefixes=('planning.inefficient_plan',),
221
+ suggested_code=(
222
+ 'from agentdebug.detectors import RepeatedToolCallDetector\n'
223
+ 'detector = RepeatedToolCallDetector(threshold=3)\n'
224
+ 'if detector.detect(current_trajectory):\n'
225
+ ' return replan(reason="loop detected")\n'
226
+ ),
227
+ rationale=(
228
+ 'Loops are the canonical inefficient-plan failure. AgentDebugX '
229
+ 'ships an in-process detector you can call between steps.'
230
+ ),
231
+ ),
232
+ ]
233
+
234
+
235
class CriticRecoverer:
    """Tool-grounded recovery suggestions modeled on CRITIC (arXiv:2305.11738).

    Unlike the paper's CRITIC loop (which re-runs the agent against a
    verifier until pass), this recoverer is suggest-only: for each finding
    it matches the failure mode against a registry of :class:`VerifierSpec`
    templates and emits a :class:`FixProposal` with the suggested verifier
    code + rationale. The user decides whether to add the verifier.

    Pair with :class:`ReflexionSuggestion` for full coverage: Reflexion
    tells the agent what went wrong; CriticRecoverer tells the developer
    what guard to add.
    """

    id = 'critic'

    def __init__(
        self,
        verifiers: Optional[List[VerifierSpec]] = None,
    ) -> None:
        """Store a private copy of the verifier registry.

        Args:
            verifiers: templates to match against; when ``None`` a copy of
                ``DEFAULT_VERIFIERS`` is used. An explicit empty list is
                respected and yields no proposals.
        """
        self.verifiers = list(verifiers) if verifiers is not None else list(DEFAULT_VERIFIERS)

    def suggest(
        self,
        trajectory: AgentTrajectory,
        report: DiagnosticReport,
    ) -> List[FixProposal]:
        """Return one proposal per distinct (finding location, verifier) match.

        Args:
            trajectory: accepted for the recoverer interface; not read here.
            report: diagnostic report whose ``findings`` are matched.

        Returns:
            Proposals in finding order, then registry order. Empty when the
            report has no findings or nothing matches.
        """
        if not report.findings:
            return []
        proposals: List[FixProposal] = []
        seen: set = set()
        for finding in report.findings:
            for verifier in self.verifiers:
                if not verifier.matches(finding):
                    continue
                key = self._dedup_key(finding, verifier)
                if key in seen:
                    continue
                seen.add(key)
                proposals.append(self._build(finding, verifier))
        return proposals

    @staticmethod
    def _dedup_key(finding: FailureFinding, verifier: VerifierSpec) -> tuple:
        """Build the de-duplication key for a finding/verifier pair.

        Findings that carry an ``event_id`` are de-duplicated per event, as
        before. Findings without one are keyed on mode, step and agent
        instead, so unrelated id-less findings are no longer collapsed into
        a single ``''`` bucket and silently dropped.
        """
        if finding.event_id:
            return (finding.event_id, verifier.id)
        return (
            '',
            finding.failure_mode.mode_id,
            finding.step_index,
            finding.agent_name,
            verifier.id,
        )

    def _build(
        self, finding: FailureFinding, verifier: VerifierSpec,
    ) -> FixProposal:
        """Render the proposal recommending ``verifier`` for ``finding``."""
        summary = (
            f'Add {verifier.id} before {finding.failure_mode.mode_id} '
            f'(step {finding.step_index}, agent {finding.agent_name})'
        )
        text = (
            f'Failure: {finding.failure_mode.mode_id} '
            f'({finding.failure_mode.name})\n'
            f'Located at agent={finding.agent_name}, step={finding.step_index}, '
            f'event_id={finding.event_id}\n\n'
            f'Recommended verifier: {verifier.id}\n'
            f'Rationale: {verifier.rationale}\n\n'
            f'Suggested code:\n'
            f'```python\n{verifier.suggested_code}\n```\n'
        )
        return FixProposal(
            proposal_id=new_id('fix'),
            recoverer_id=self.id,
            target_event_id=finding.event_id,
            summary=summary,
            rationale=verifier.rationale,
            # Keep the reported confidence inside [0.3, 0.85].
            confidence=max(0.3, min(0.85, finding.confidence)),
            suggestion_text=text,
            side_effects=[],
            requires_human_approval=False,
        )
305
+
306
+
307
+ __all__ = [
308
+ 'CriticRecoverer',
309
+ 'DEFAULT_VERIFIERS',
310
+ 'FixProposal',
311
+ 'Recoverer',
312
+ 'ReflexionSuggestion',
313
+ 'VerifierSpec',
314
+ ]
@@ -1,113 +0,0 @@
1
- """Lightweight recovery suggestions.
2
-
3
- v0.1 ships ``ReflexionSuggestion`` — a *suggest-only* recovery generator that
4
- produces a structured retry-prompt artifact based on Reflexion (Shinn et al.,
5
- NeurIPS 2023, arXiv:2303.11366). Heavier strategies (Self-Refine loop, CRITIC,
6
- Saga rollback, MCTS) are deferred per the roadmap and will land behind the same
7
- :class:`Recoverer` protocol.
8
-
9
- By design, **nothing here re-executes the agent** — recovery proposals are
10
- artifacts to be surfaced (CLI/UI/PR comment) or fed back into the next run.
11
- """
12
-
13
- from __future__ import annotations
14
-
15
- from dataclasses import dataclass, field
16
- from typing import List, Optional, Protocol
17
-
18
- from agentdebug.models import (
19
- AgentTrajectory,
20
- DiagnosticReport,
21
- FailureFinding,
22
- new_id,
23
- )
24
-
25
-
26
- @dataclass
27
- class FixProposal:
28
- proposal_id: str
29
- recoverer_id: str
30
- target_event_id: Optional[str]
31
- summary: str
32
- rationale: str
33
- confidence: float
34
- suggestion_text: str
35
- side_effects: List[str] = field(default_factory=list)
36
- requires_human_approval: bool = False
37
-
38
-
39
- class Recoverer(Protocol):
40
- id: str
41
-
42
- def suggest(
43
- self,
44
- trajectory: AgentTrajectory,
45
- report: DiagnosticReport,
46
- ) -> List[FixProposal]:
47
- ...
48
-
49
-
50
- class ReflexionSuggestion:
51
- """Emit a Reflexion-style retry reflection per finding.
52
-
53
- The output is purely textual — it can be appended to the agent's next
54
- system prompt, written to a project ``MANUAL.md``, or surfaced in the
55
- Console. There is no auto-apply.
56
- """
57
-
58
- id = 'reflexion'
59
-
60
- def suggest(
61
- self,
62
- trajectory: AgentTrajectory,
63
- report: DiagnosticReport,
64
- ) -> List[FixProposal]:
65
- if not report.findings:
66
- return []
67
- proposals: List[FixProposal] = []
68
- for finding in report.findings:
69
- proposals.append(self._build_proposal(trajectory, finding))
70
- return proposals
71
-
72
- def _build_proposal(
73
- self, trajectory: AgentTrajectory, finding: FailureFinding
74
- ) -> FixProposal:
75
- goal = trajectory.goal or '(no goal recorded)'
76
- framework = trajectory.framework or '(framework not declared)'
77
- evidence_block = '\n'.join(f' - {e}' for e in finding.evidence) or ' (none)'
78
- suggestion_template = (
79
- finding.suggestion
80
- or (finding.failure_mode.suggestion_templates[0]
81
- if finding.failure_mode.suggestion_templates
82
- else 'Inspect the offending step and constrain the agent at that point.')
83
- )
84
- reflection = (
85
- f'Task: {goal}\n'
86
- f'Framework: {framework}\n'
87
- f'Observed failure mode: {finding.failure_mode.mode_id} '
88
- f'({finding.failure_mode.name})\n'
89
- f'Located at agent={finding.agent_name}, step={finding.step_index}, '
90
- f'event_id={finding.event_id}\n'
91
- f'Evidence:\n{evidence_block}\n'
92
- f'Next time, do the following:\n {suggestion_template}\n'
93
- )
94
- return FixProposal(
95
- proposal_id=new_id('fix'),
96
- recoverer_id=self.id,
97
- target_event_id=finding.event_id,
98
- summary=(
99
- f'Reflexion retry hint for {finding.failure_mode.mode_id} '
100
- f'at step {finding.step_index}'
101
- ),
102
- rationale=(
103
- 'Reflexion (Shinn et al., NeurIPS 2023) converts a failure '
104
- 'into a verbal hint appended to next attempt.'
105
- ),
106
- confidence=min(0.9, max(0.1, finding.confidence)),
107
- suggestion_text=reflection,
108
- side_effects=['memory.write'],
109
- requires_human_approval=False,
110
- )
111
-
112
-
113
- __all__ = ['Recoverer', 'FixProposal', 'ReflexionSuggestion']
File without changes
File without changes
File without changes