agentdebugx 0.2.2__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/PKG-INFO +1 -1
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/23_status_v0_2.md +16 -9
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/pyproject.toml +1 -1
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/__init__.py +14 -2
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/attribution.py +171 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/models.py +12 -0
- agentdebugx-0.2.3/src/agentdebug/recovery.py +314 -0
- agentdebugx-0.2.2/src/agentdebug/recovery.py +0 -113
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/LICENSE +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/README.md +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/00_overview.md +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/01_literature_survey.md +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/02_architecture.md +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/03_taxonomy.md +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/04_trace_schema.md +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/05_adapters.md +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/06_detectors.md +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/07_attribution.md +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/08_recovery.md +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/09_error_database.md +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/10_taxonomy_induction.md +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/11_multimodal.md +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/12_ui_dashboard.md +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/13_class_design.md +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/14_api_reference.md +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/15_roadmap.md +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/16_governance.md +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/17_claude_code_design_patterns.md +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/18_comparison_codex_vs_design.md +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/19_error_hub.md +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/20_deep_debug.md +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/21_integrations.md +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/22_industry_track_paper_eval_plan.md +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/ERROR_TAXONOMY.md +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/OPEN_SOURCE_DEVELOPMENT_PLAN.md +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/README.md +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/RESEARCH_SURVEY.md +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/benchmarks/v0_1_smoke.json +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/docs/benchmarks/v0_1_smoke.md +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/adapters/__init__.py +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/adapters/base.py +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/adapters/langgraph.py +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/adapters/otel.py +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/adapters/raw.py +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/analyzers.py +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/cli.py +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/deep.py +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/detectors.py +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/events.py +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/hub/__init__.py +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/hub/backend_base.py +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/hub/backends.py +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/hub/bundle.py +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/hub/scrub.py +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/instrumentation.py +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/integrations/__init__.py +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/integrations/claude_skill.py +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/integrations/openhands.py +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/judges.py +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/llm.py +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/recorder.py +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/storage.py +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/taxonomy.py +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/traceback.py +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/ui/__init__.py +0 -0
- {agentdebugx-0.2.2 → agentdebugx-0.2.3}/src/agentdebug/ui/server.py +0 -0
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# 23 — Capability + Test Coverage Status (v0.2.2)
|
|
1
|
+
# 23 — Capability + Test Coverage Status (v0.2.3)
|
|
2
2
|
|
|
3
3
|
A live audit of what's implemented, what's tested, and what's specced but
|
|
4
4
|
not yet built. Pair this with [docs/15_roadmap.md](./15_roadmap.md), which is
|
|
@@ -20,7 +20,9 @@ the forward-looking plan; this doc is the rear-view mirror.
|
|
|
20
20
|
| Attribution | `agentdebug.attribution.HeuristicAttributor` | ✅ stable | first-finding + tiebreak |
|
|
21
21
|
| Attribution | `agentdebug.attribution.AllAtOnceAttributor` | ✅ stable | mocked LLM + fallback |
|
|
22
22
|
| Attribution | `agentdebug.attribution.StepByStepAttributor` | ✅ **new 0.2.2** | scripted-LLM + fallback |
|
|
23
|
+
| Attribution | `agentdebug.attribution.BinarySearchAttributor` | ✅ **new 0.2.3** | oracle-LLM logarithmic convergence + fallback + render elision |
|
|
23
24
|
| Recovery | `agentdebug.recovery.ReflexionSuggestion` | ✅ stable | per-finding + empty |
|
|
25
|
+
| Recovery | `agentdebug.recovery.CriticRecoverer` + `VerifierSpec` registry | ✅ **new 0.2.3** | 5 family-matched verifier templates; dedup + custom-override |
|
|
24
26
|
| DeepDebug | `agentdebug.deep.DeepDebugAnalyzer` | ✅ stable | full loop + silent LLM |
|
|
25
27
|
| Cascade view | `agentdebug.traceback.format_traceback` | ✅ stable | cascade + step-order + ANSI + empty |
|
|
26
28
|
| Detectors | `agentdebug.detectors.RepeatedToolCall / RepeatedState / StepCountLimit` | ✅ **new 0.2.2** | threshold + window + budget |
|
|
@@ -45,13 +47,11 @@ across 32 source files.
|
|
|
45
47
|
| [06_detectors.md](./06_detectors.md) | `trajectory_perplexity` (TrajAD) | needs token-level LM perplexity API or embedding model + baseline calibration | v0.3 |
|
|
46
48
|
| [06_detectors.md](./06_detectors.md) | `topic_drift` (embedding cosine) | needs embedding client; consider reusing `OpenAICompatClient` `/embeddings` | v0.3 |
|
|
47
49
|
| [06_detectors.md](./06_detectors.md) | LTL spec monitors | requires user-supplied spec or LLM-synthesized monitors; gated on RV research | v1.2 |
|
|
48
|
-
| [07_attribution.md](./07_attribution.md) | `
|
|
49
|
-
| [07_attribution.md](./07_attribution.md) | `CounterfactualAttributor` | requires re-rolling agent actions; same replay constraint | v0.3 |
|
|
50
|
+
| [07_attribution.md](./07_attribution.md) | `CounterfactualAttributor` | requires re-rolling agent actions; framework-replay dependent | v0.3 |
|
|
50
51
|
| [07_attribution.md](./07_attribution.md) | `SBFLAttributor` (Tarantula/Ochiai) | needs corpus of passing + failing traces of same task; gated on Hub adoption | v0.4 |
|
|
51
52
|
| [07_attribution.md](./07_attribution.md) | `DeltaDebugAttributor` (Zeller) | same replay constraint | v0.3 |
|
|
52
|
-
| [07_attribution.md](./07_attribution.md) | `EnsembleAttributor` | trivial once
|
|
53
|
+
| [07_attribution.md](./07_attribution.md) | `EnsembleAttributor` | trivial once Counterfactual lands; awaits Counterfactual | v0.3 |
|
|
53
54
|
| [08_recovery.md](./08_recovery.md) | `SelfRefineLoop` | small but needs a generator-critic-refiner orchestration | v0.3 |
|
|
54
|
-
| [08_recovery.md](./08_recovery.md) | `CriticRecoverer` | needs a verifier registry (search, code-exec, type-check) | v0.3 |
|
|
55
55
|
| [08_recovery.md](./08_recovery.md) | `AutoManualRules` | needs persistent project manual + injection into next-run prompts | v0.3 |
|
|
56
56
|
| [08_recovery.md](./08_recovery.md) | `LangGraphRewind` | depends on LangGraph checkpointer; ships when we have a real LangGraph user | v0.3 |
|
|
57
57
|
| [08_recovery.md](./08_recovery.md) | `SagaRollback` | needs compensation registry on tool definitions; new schema | v0.3 |
|
|
@@ -97,10 +97,17 @@ remaining gaps are deliberate:
|
|
|
97
97
|
|
|
98
98
|
Before v0.3 ships, this doc should record green checkmarks for:
|
|
99
99
|
|
|
100
|
-
- [
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
100
|
+
- [x] **Logarithmic-cost attributor** (`BinarySearchAttributor`) shipped in
|
|
101
|
+
0.2.3 — Who&When method 3, O(log N) LLM calls, bisects the trajectory
|
|
102
|
+
via prefix evaluation. **Note:** this is not yet a "replayable
|
|
103
|
+
counterfactual" attributor; it predicts whether the failure has
|
|
104
|
+
already occurred from the prefix without re-rolling the agent. True
|
|
105
|
+
counterfactual replay is still v0.3.
|
|
106
|
+
- [x] **Tool-grounded recovery strategy** (`CriticRecoverer` + `VerifierSpec`
|
|
107
|
+
registry) shipped in 0.2.3 — pattern-matches failure modes against 5
|
|
108
|
+
default verifier templates (JSON-schema guard, final-state check,
|
|
109
|
+
tool-result type-check, handoff contract, loop-detector guard) and
|
|
110
|
+
emits per-finding `FixProposal` with rationale + suggested code.
|
|
104
111
|
- [ ] One additional framework adapter that goes through the full conformance
|
|
105
112
|
suite (CrewAI is the most-requested).
|
|
106
113
|
- [ ] HuggingFace Hub round-trip live test (gated on `HF_TOKEN`).
|
|
@@ -13,6 +13,7 @@ from agentdebug.attribution import (
|
|
|
13
13
|
AllAtOnceAttributor,
|
|
14
14
|
AttributionResult,
|
|
15
15
|
Attributor,
|
|
16
|
+
BinarySearchAttributor,
|
|
16
17
|
Blame,
|
|
17
18
|
HeuristicAttributor,
|
|
18
19
|
StepByStepAttributor,
|
|
@@ -38,7 +39,14 @@ from agentdebug.models import (
|
|
|
38
39
|
Modality,
|
|
39
40
|
)
|
|
40
41
|
from agentdebug.recorder import AgentDebug, TraceSession
|
|
41
|
-
from agentdebug.recovery import
|
|
42
|
+
from agentdebug.recovery import (
|
|
43
|
+
DEFAULT_VERIFIERS,
|
|
44
|
+
CriticRecoverer,
|
|
45
|
+
FixProposal,
|
|
46
|
+
Recoverer,
|
|
47
|
+
ReflexionSuggestion,
|
|
48
|
+
VerifierSpec,
|
|
49
|
+
)
|
|
42
50
|
from agentdebug.traceback import CascadeFrame, build_cascade, format_traceback
|
|
43
51
|
from agentdebug.storage import JsonlTraceStore, SQLiteTraceStore
|
|
44
52
|
from agentdebug.taxonomy import SEED_FAILURE_MODES, get_failure_mode
|
|
@@ -53,13 +61,17 @@ __all__ = [
|
|
|
53
61
|
'Attributor',
|
|
54
62
|
'Blame',
|
|
55
63
|
'BusEvent',
|
|
64
|
+
'BinarySearchAttributor',
|
|
56
65
|
'CascadeFrame',
|
|
66
|
+
'CriticRecoverer',
|
|
67
|
+
'DEFAULT_VERIFIERS',
|
|
57
68
|
'Detector',
|
|
58
69
|
'DetectorConfig',
|
|
59
70
|
'RepeatedStateDetector',
|
|
60
71
|
'RepeatedToolCallDetector',
|
|
61
72
|
'StepByStepAttributor',
|
|
62
73
|
'StepCountLimitDetector',
|
|
74
|
+
'VerifierSpec',
|
|
63
75
|
'build_cascade',
|
|
64
76
|
'default_detectors',
|
|
65
77
|
'format_traceback',
|
|
@@ -84,4 +96,4 @@ __all__ = [
|
|
|
84
96
|
'get_failure_mode',
|
|
85
97
|
]
|
|
86
98
|
|
|
87
|
-
__version__ = '0.2.
|
|
99
|
+
__version__ = '0.2.3'
|
|
@@ -21,6 +21,10 @@ import logging
|
|
|
21
21
|
from dataclasses import dataclass, field
|
|
22
22
|
from typing import Any, Dict, List, Optional, Protocol, cast
|
|
23
23
|
|
|
24
|
+
|
|
25
|
+
# No forward declaration is needed: BinarySearchAttributor._render_event resolves _EllipsisEvent at call time
|
|
26
|
+
# from the helper render path; defined later in the module.
|
|
27
|
+
|
|
24
28
|
from agentdebug.llm import LLMClient, extract_json_block
|
|
25
29
|
from agentdebug.models import AgentEvent, AgentTrajectory, FailureFinding, new_id
|
|
26
30
|
|
|
@@ -227,6 +231,27 @@ class AllAtOnceAttributor:
|
|
|
227
231
|
return [str(value)]
|
|
228
232
|
|
|
229
233
|
|
|
234
|
+
_BISECT_SYSTEM_PROMPT = """You are AgentDebugX-Attributor running the
|
|
235
|
+
Who&When "Binary-Search" attribution method (arXiv:2505.00212). You will be
|
|
236
|
+
shown a PREFIX of a failed agent trajectory truncated to its first N events.
|
|
237
|
+
|
|
238
|
+
Decide whether the failure has ALREADY occurred within this prefix, i.e.,
|
|
239
|
+
whether the trajectory is unrecoverable as of the last event shown.
|
|
240
|
+
|
|
241
|
+
Respond ONLY with a JSON object (no prose, no markdown):
|
|
242
|
+
|
|
243
|
+
{
|
|
244
|
+
"failure_already_happened": true | false,
|
|
245
|
+
"confidence": <float in [0,1]>,
|
|
246
|
+
"rationale": "<one or two sentences>",
|
|
247
|
+
"decisive_event_id": "<event_id or null>"
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
Be conservative: only return true when you can point to evidence in the
|
|
251
|
+
prefix that the agent has already taken (or omitted) the decisive step.
|
|
252
|
+
"""
|
|
253
|
+
|
|
254
|
+
|
|
230
255
|
_STEP_SYSTEM_PROMPT = """You are AgentDebugX-Attributor, scanning a failed
|
|
231
256
|
agent trajectory one step at a time (the Who&When "Step-by-Step" method,
|
|
232
257
|
arXiv:2505.00212).
|
|
@@ -393,7 +418,153 @@ class StepByStepAttributor:
|
|
|
393
418
|
return [str(value)]
|
|
394
419
|
|
|
395
420
|
|
|
421
|
+
class BinarySearchAttributor:
    """LLM-based attributor implementing Who&When's Binary-Search method.

    Bisects the trajectory and asks the LLM whether the failure has already
    occurred in each prefix. Costs O(log N) LLM calls vs StepByStep's O(N).

    The contract:

    * Pre-condition: the trajectory is known to have failed overall.
    * Loop invariant: ``failure_already_happened`` is False at ``lo`` and
      True at ``hi``. The decisive step lives in ``(lo, hi]``.
    * Termination: ``hi - lo == 1``; ``hi - 1`` is the decisive index.

    Returns the event at the decisive index as the primary Blame hypothesis.
    Falls back to the configured ``fallback`` attributor when the trajectory
    is empty or an LLM response is uninterpretable (the call raised, no JSON
    object came back, or ``failure_already_happened`` is missing/unusable).
    """

    id = 'binary_search'

    def __init__(
        self,
        llm: LLMClient,
        *,
        fallback: Optional[Attributor] = None,
        max_tokens: int = 1024,
        context_window: int = 6,
    ) -> None:
        """Store the LLM client and the rendering/cost knobs.

        Args:
            llm: Client whose ``complete(messages=..., max_tokens=...)`` is
                called once per probe.
            fallback: Attributor used when bisection cannot proceed;
                defaults to ``HeuristicAttributor()``.
            max_tokens: Passed through to every ``llm.complete`` call.
            context_window: Number of events kept at the head and at the
                tail of a rendered prefix; the middle is elided. Values
                below zero are treated as zero.
        """
        self.llm = llm
        self.fallback: Attributor = fallback or HeuristicAttributor()
        self.max_tokens = max_tokens
        # When formatting a prefix into the LLM prompt we only keep this many
        # events at the head + this many at the tail; the middle is elided.
        # Keeps cost bounded for very long trajectories.
        self.context_window = context_window

    def attribute(
        self,
        trajectory: AgentTrajectory,
        findings: List[FailureFinding],
    ) -> AttributionResult:
        """Bisect ``trajectory`` and blame the event at the decisive index.

        Returns the fallback attributor's result when the trajectory has no
        events or any probe yields an uninterpretable verdict.
        """
        n = len(trajectory.events)
        if n == 0:
            return self.fallback.attribute(trajectory, findings)
        lo, hi = 0, n
        probe_count = 0
        # Sanity: cap probes at ceil(log2(n)) + 2 to bound cost in
        # pathological cases. ``(k - 1).bit_length()`` is exactly
        # ceil(log2(k)) for k >= 1, with no float rounding.
        max_probes = max(1, (max(n, 2) - 1).bit_length() + 2)
        while hi - lo > 1 and probe_count < max_probes:
            mid = (lo + hi) // 2
            probe_count += 1
            verdict = self._probe(trajectory, mid)
            already = (
                None if verdict is None
                else self._coerce_flag(verdict.get('failure_already_happened'))
            )
            if already is None:
                # Uninterpretable response: fall back rather than guess.
                return self.fallback.attribute(trajectory, findings)
            if already:
                hi = mid
            else:
                lo = mid
        decisive_index = hi - 1
        decisive = trajectory.events[decisive_index]
        return AttributionResult(
            method=self.id,
            hypotheses=[Blame(
                span_id=decisive.event_id,
                step_index=decisive.step_index,
                agent_name=decisive.agent_name,
                confidence=0.6 + 0.1 * min(probe_count, 4),
                rationale=(
                    f'Binary search located the decisive step within '
                    f'{probe_count} probes over {n} events.'
                ),
                evidence=[
                    f'event_id={decisive.event_id}',
                    f'step={decisive.step_index}',
                ],
                sources=[self.id],
            )],
            raw={'probe_count': probe_count, 'trajectory_len': n},
        )

    @staticmethod
    def _coerce_flag(value: Any) -> Optional[bool]:
        """Normalize the LLM's ``failure_already_happened`` value.

        Returns the boolean for real booleans, 0/1 numbers and the strings
        ``true``/``false``/``yes``/``no`` (case-insensitive); returns None
        for anything else so the caller can fall back. A plain ``bool()``
        would read the string ``"false"`` as True.
        """
        if isinstance(value, bool):
            return value
        if isinstance(value, (int, float)) and value in (0, 1):
            return bool(value)
        if isinstance(value, str):
            token = value.strip().lower()
            if token in ('true', 'yes'):
                return True
            if token in ('false', 'no'):
                return False
        return None

    def _probe(
        self, trajectory: AgentTrajectory, prefix_len: int
    ) -> Optional[Dict[str, Any]]:
        """Ask the LLM about the first ``prefix_len`` events.

        Returns the parsed JSON object, or None when the call raises or the
        reply does not contain a JSON object.
        """
        prefix = trajectory.prefix(prefix_len)
        # Render prefix with head + tail elision so long prefixes stay cheap.
        events_doc = self._render_prefix(prefix)
        user = (
            f'GOAL: {trajectory.goal!r}\n'
            f'FRAMEWORK: {trajectory.framework!r}\n\n'
            f'PREFIX (events 1..{prefix_len} of {len(trajectory.events)}):\n'
            f'{events_doc}'
        )
        try:
            result = self.llm.complete(
                messages=[
                    {'role': 'system', 'content': _BISECT_SYSTEM_PROMPT},
                    {'role': 'user', 'content': user},
                ],
                max_tokens=self.max_tokens,
            )
        except Exception as exc:  # pragma: no cover
            LOG.warning('binary_search probe at prefix_len=%s failed: %s',
                        prefix_len, exc)
            return None
        parsed = extract_json_block(result.text)
        # Anything that is not a JSON object (None, list, scalar) cannot carry
        # the verdict keys, so treat it as uninterpretable.
        if not isinstance(parsed, dict):
            return None
        return cast(Dict[str, Any], parsed)

    def _render_prefix(self, prefix: AgentTrajectory) -> str:
        """Render ``prefix.events`` one per line, eliding the middle.

        At most ``context_window`` events are shown at each end; when events
        are dropped a single ``... (K events elided) ...`` line marks the gap.
        """
        events = prefix.events
        window = max(0, self.context_window)
        if len(events) <= 2 * window:
            view = events
        else:
            head = list(events[:window])
            # Index from the front: ``events[-0:]`` would be the whole list.
            tail = list(events[len(events) - window:])
            elided = len(events) - 2 * window
            view = head + [_EVENT_ELLIPSIS(elided)] + tail
        return '\n'.join(self._render_event(e) for e in view)

    @staticmethod
    def _render_event(event: Any) -> str:
        """Format one event (or an elision marker) as a single prompt line."""
        if isinstance(event, _EllipsisEvent):
            return f'... ({event.count} events elided) ...'
        return (
            f'event_id={event.event_id} step={event.step_index} '
            f'agent={event.agent_name} '
            f'type={getattr(event.event_type, "value", event.event_type)} '
            f'output={str(event.output)[:200]} '
            f'error={str(event.error)[:200]}'
        )


@dataclass
class _EllipsisEvent:
    """Placeholder standing in for ``count`` events elided from a render."""

    count: int


def _EVENT_ELLIPSIS(count: int) -> _EllipsisEvent:
    """Build the elision marker for ``count`` skipped events."""
    return _EllipsisEvent(count=count)
|
|
564
|
+
|
|
565
|
+
|
|
396
566
|
__all__ = [
|
|
397
567
|
'Attributor', 'Blame', 'AttributionResult',
|
|
398
568
|
'HeuristicAttributor', 'AllAtOnceAttributor', 'StepByStepAttributor',
|
|
569
|
+
'BinarySearchAttributor',
|
|
399
570
|
]
|
|
@@ -102,6 +102,18 @@ class AgentTrajectory(BaseModel):
|
|
|
102
102
|
self.events.append(event)
|
|
103
103
|
return event
|
|
104
104
|
|
|
105
|
+
def prefix(self, n: int) -> 'AgentTrajectory':
    """Return a new trajectory holding only the first ``n`` events.

    Intended for replay/attribution backends (Binary-Search,
    Delta-Debugging) that ask "would the trajectory still fail if it had
    stopped at step k?". The result is a shallow copy of this object with
    a fresh ``events`` list, so adding or removing events on the copy
    leaves the original's list untouched. A negative ``n`` yields an
    empty event list.
    """
    keep = n if n > 0 else 0
    # Use ``model_copy`` when the object provides it, else ``copy``.
    if hasattr(self, 'model_copy'):
        clone = self.model_copy(deep=False)
    else:
        clone = self.copy(deep=False)
    clone.events = list(self.events[:keep])
    return clone
|
|
116
|
+
|
|
105
117
|
|
|
106
118
|
class FailureMode(BaseModel):
|
|
107
119
|
"""A seed or generated taxonomy node."""
|
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
"""Lightweight recovery suggestions.
|
|
2
|
+
|
|
3
|
+
v0.1 ships ``ReflexionSuggestion`` — a *suggest-only* recovery generator that
|
|
4
|
+
produces a structured retry-prompt artifact based on Reflexion (Shinn et al.,
|
|
5
|
+
NeurIPS 2023, arXiv:2303.11366). Heavier strategies (Self-Refine loop, CRITIC,
|
|
6
|
+
Saga rollback, MCTS) are deferred per the roadmap and will land behind the same
|
|
7
|
+
:class:`Recoverer` protocol.
|
|
8
|
+
|
|
9
|
+
By design, **nothing here re-executes the agent** — recovery proposals are
|
|
10
|
+
artifacts to be surfaced (CLI/UI/PR comment) or fed back into the next run.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from dataclasses import dataclass, field
|
|
16
|
+
from typing import List, Optional, Protocol, Tuple
|
|
17
|
+
|
|
18
|
+
from agentdebug.models import (
|
|
19
|
+
AgentTrajectory,
|
|
20
|
+
DiagnosticReport,
|
|
21
|
+
FailureFinding,
|
|
22
|
+
new_id,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class FixProposal:
    """One suggest-only recovery artifact produced by a recoverer.

    Nothing in this record is executed; it is data to be surfaced or fed
    into a later run.
    """

    # Unique id; the recoverers in this module generate it with new_id('fix').
    proposal_id: str
    # ``id`` of the recoverer that produced the proposal.
    recoverer_id: str
    # Event the proposal is anchored to; None when the finding has no event id.
    target_event_id: Optional[str]
    # One-line human-readable headline.
    summary: str
    # Why this kind of fix applies.
    rationale: str
    # Score derived from the finding's confidence, clamped by each recoverer.
    confidence: float
    # Full text of the suggestion (retry hint, verifier snippet, ...).
    suggestion_text: str
    # Labels for effects of applying the proposal, e.g. 'memory.write'.
    side_effects: List[str] = field(default_factory=list)
    # Whether a human must sign off before the proposal is applied.
    requires_human_approval: bool = False
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class Recoverer(Protocol):
    """Structural interface implemented by the recovery strategies here.

    Any object with an ``id`` string and a matching ``suggest`` method
    satisfies the protocol; no inheritance is required.
    """

    # Short identifier; the recoverers in this module copy it into
    # FixProposal.recoverer_id.
    id: str

    def suggest(
        self,
        trajectory: AgentTrajectory,
        report: DiagnosticReport,
    ) -> List[FixProposal]:
        """Return fix proposals for the findings carried by ``report``."""
        ...
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class ReflexionSuggestion:
    """Produce one Reflexion-style retry hint for every finding.

    Each hint is plain text: it may be appended to the agent's next system
    prompt, stored in a project ``MANUAL.md``, or shown in the Console.
    Nothing is applied automatically.
    """

    id = 'reflexion'

    def suggest(
        self,
        trajectory: AgentTrajectory,
        report: DiagnosticReport,
    ) -> List[FixProposal]:
        """Return a proposal per finding, or an empty list without findings."""
        return [
            self._build_proposal(trajectory, item)
            for item in (report.findings or [])
        ]

    def _build_proposal(
        self, trajectory: AgentTrajectory, finding: FailureFinding
    ) -> FixProposal:
        """Assemble the reflection text and wrap it in a ``FixProposal``."""
        mode = finding.failure_mode

        # Prefer the finding's own suggestion, then the mode's first
        # template, then a generic instruction.
        if finding.suggestion:
            advice = finding.suggestion
        elif mode.suggestion_templates:
            advice = mode.suggestion_templates[0]
        else:
            advice = 'Inspect the offending step and constrain the agent at that point.'

        evidence_lines = '\n'.join(f' - {e}' for e in finding.evidence)
        if not evidence_lines:
            evidence_lines = ' (none)'

        task = trajectory.goal or '(no goal recorded)'
        stack = trajectory.framework or '(framework not declared)'

        parts = [
            f'Task: {task}\n',
            f'Framework: {stack}\n',
            f'Observed failure mode: {mode.mode_id} ',
            f'({mode.name})\n',
            f'Located at agent={finding.agent_name}, step={finding.step_index}, ',
            f'event_id={finding.event_id}\n',
            f'Evidence:\n{evidence_lines}\n',
            f'Next time, do the following:\n {advice}\n',
        ]
        headline = (
            f'Reflexion retry hint for {mode.mode_id} '
            f'at step {finding.step_index}'
        )
        why = (
            'Reflexion (Shinn et al., NeurIPS 2023) converts a failure '
            'into a verbal hint appended to next attempt.'
        )
        return FixProposal(
            proposal_id=new_id('fix'),
            recoverer_id=self.id,
            target_event_id=finding.event_id,
            summary=headline,
            rationale=why,
            # Keep the score inside [0.1, 0.9].
            confidence=min(0.9, max(0.1, finding.confidence)),
            suggestion_text=''.join(parts),
            side_effects=['memory.write'],
            requires_human_approval=False,
        )
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
@dataclass(frozen=True)
class VerifierSpec:
    """Template for a tool-grounded verifier tied to a family of failures.

    :class:`CriticRecoverer` uses these specs to recommend (never run) a
    verifier to insert between the failing step and the next side-effect.
    """

    id: str
    description: str
    matches_families: tuple[str, ...]  # e.g. ('action',)
    matches_mode_prefixes: tuple[str, ...]  # e.g. ('action.format_error', 'action.parameter_error')
    suggested_code: str  # short snippet showing how to add the guard
    rationale: str

    def matches(self, finding: 'FailureFinding') -> bool:
        """Tell whether this spec applies to ``finding``.

        A spec applies when the finding's failure-mode family is listed in
        ``matches_families`` or its mode id starts with any entry of
        ``matches_mode_prefixes``.
        """
        mode = finding.failure_mode
        if mode.family not in self.matches_families:
            for candidate in self.matches_mode_prefixes:
                if mode.mode_id.startswith(candidate):
                    return True
            return False
        return True
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
DEFAULT_VERIFIERS: List[VerifierSpec] = [
|
|
139
|
+
VerifierSpec(
|
|
140
|
+
id='json_schema_guard',
|
|
141
|
+
description='Validate tool arguments against the tool JSON schema before execution.',
|
|
142
|
+
matches_families=('action',),
|
|
143
|
+
matches_mode_prefixes=('action.format_error', 'action.parameter_error'),
|
|
144
|
+
suggested_code=(
|
|
145
|
+
'from jsonschema import validate, ValidationError\n'
|
|
146
|
+
'try:\n'
|
|
147
|
+
' validate(instance=tool_args, schema=tool.schema)\n'
|
|
148
|
+
'except ValidationError as exc:\n'
|
|
149
|
+
' return handle_arg_error(exc, tool=tool, args=tool_args)\n'
|
|
150
|
+
),
|
|
151
|
+
rationale=(
|
|
152
|
+
'CRITIC (Gou et al., ICLR 2024): a tool-interactive verifier '
|
|
153
|
+
'catches malformed/missing-argument failures before they hit '
|
|
154
|
+
'the downstream API.'
|
|
155
|
+
),
|
|
156
|
+
),
|
|
157
|
+
VerifierSpec(
|
|
158
|
+
id='final_state_check',
|
|
159
|
+
description='Independent final-state verifier confirms the task is satisfied before terminating.',
|
|
160
|
+
matches_families=('verification', 'reflection'),
|
|
161
|
+
matches_mode_prefixes=('verification.', 'reflection.progress_misjudge'),
|
|
162
|
+
suggested_code=(
|
|
163
|
+
'def verify_task_complete(goal: str, final_state: dict) -> bool:\n'
|
|
164
|
+
' # Check every explicit success criterion; do NOT trust the\n'
|
|
165
|
+
' # acting agent\'s self-report.\n'
|
|
166
|
+
' return all(criterion(final_state) for criterion in success_criteria(goal))\n'
|
|
167
|
+
'\n'
|
|
168
|
+
'if not verify_task_complete(goal, state):\n'
|
|
169
|
+
' trigger_recovery_planning(reason="task not verified complete")\n'
|
|
170
|
+
),
|
|
171
|
+
rationale=(
|
|
172
|
+
'MAST (Cemri et al., 2025) shows premature termination + missing '
|
|
173
|
+
'task validation are dominant multi-agent failure modes. A '
|
|
174
|
+
'verifier that does not trust the acting agent is the standard '
|
|
175
|
+
'fix.'
|
|
176
|
+
),
|
|
177
|
+
),
|
|
178
|
+
VerifierSpec(
|
|
179
|
+
id='tool_result_typecheck',
|
|
180
|
+
description='Type-check the tool result and require explicit handling of None / empty.',
|
|
181
|
+
matches_families=('action', 'system'),
|
|
182
|
+
matches_mode_prefixes=('action.wrong_tool', 'system.tool_execution_error'),
|
|
183
|
+
suggested_code=(
|
|
184
|
+
'result = tool.run(args)\n'
|
|
185
|
+
'if result is None or result == {}:\n'
|
|
186
|
+
' return handle_empty_result(tool=tool, args=args)\n'
|
|
187
|
+
'if not isinstance(result, tool.expected_return_type):\n'
|
|
188
|
+
' return handle_unexpected_type(result, tool=tool)\n'
|
|
189
|
+
),
|
|
190
|
+
rationale=(
|
|
191
|
+
'Tool outputs that the agent does not branch on (None, empty '
|
|
192
|
+
'list, unexpected type) propagate as "agent hallucinated a '
|
|
193
|
+
'fact" downstream. Force the agent to handle them at the call '
|
|
194
|
+
'site.'
|
|
195
|
+
),
|
|
196
|
+
),
|
|
197
|
+
VerifierSpec(
|
|
198
|
+
id='handoff_context_contract',
|
|
199
|
+
description='Require the receiving agent to restate critical constraints before proceeding.',
|
|
200
|
+
matches_families=('multiagent',),
|
|
201
|
+
matches_mode_prefixes=('multiagent.handoff_loss',),
|
|
202
|
+
suggested_code=(
|
|
203
|
+
'def handoff(payload: HandoffPayload, to_agent: Agent) -> None:\n'
|
|
204
|
+
' received = to_agent.read(payload)\n'
|
|
205
|
+
' if not received.restates(payload.constraints):\n'
|
|
206
|
+
' raise HandoffContractError(\n'
|
|
207
|
+
' "receiver did not restate constraints"\n'
|
|
208
|
+
' )\n'
|
|
209
|
+
),
|
|
210
|
+
rationale=(
|
|
211
|
+
'Who&When (Zhang et al., 2025): handoff context loss is the '
|
|
212
|
+
'most common decisive multi-agent failure step. Typed handoff '
|
|
213
|
+
'payloads + receiver restating prevent silent dropping.'
|
|
214
|
+
),
|
|
215
|
+
),
|
|
216
|
+
VerifierSpec(
|
|
217
|
+
id='loop_detector_guard',
|
|
218
|
+
description='Detect repeated tool calls / no-progress windows and trigger replan.',
|
|
219
|
+
matches_families=('planning',),
|
|
220
|
+
matches_mode_prefixes=('planning.inefficient_plan',),
|
|
221
|
+
suggested_code=(
|
|
222
|
+
'from agentdebug.detectors import RepeatedToolCallDetector\n'
|
|
223
|
+
'detector = RepeatedToolCallDetector(threshold=3)\n'
|
|
224
|
+
'if detector.detect(current_trajectory):\n'
|
|
225
|
+
' return replan(reason="loop detected")\n'
|
|
226
|
+
),
|
|
227
|
+
rationale=(
|
|
228
|
+
'Loops are the canonical inefficient-plan failure. AgentDebugX '
|
|
229
|
+
'ships an in-process detector you can call between steps.'
|
|
230
|
+
),
|
|
231
|
+
),
|
|
232
|
+
]
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
class CriticRecoverer:
    """Tool-grounded recovery suggestions modeled on CRITIC (arXiv:2305.11738).

    Unlike the paper's CRITIC loop (which re-runs the agent against a
    verifier until pass), this recoverer is suggest-only: for each finding
    it matches the failure mode against a registry of :class:`VerifierSpec`
    templates and emits a :class:`FixProposal` with the suggested verifier
    code + rationale. The user decides whether to add the verifier.

    Pair with :class:`ReflexionSuggestion` for full coverage: Reflexion
    tells the agent what went wrong; CriticRecoverer tells the developer
    what guard to add.
    """

    id = 'critic'

    def __init__(
        self,
        verifiers: Optional[List[VerifierSpec]] = None,
    ) -> None:
        """Copy ``verifiers`` (or ``DEFAULT_VERIFIERS`` when None) as the registry."""
        self.verifiers = list(verifiers) if verifiers is not None else list(DEFAULT_VERIFIERS)

    def suggest(
        self,
        trajectory: AgentTrajectory,
        report: DiagnosticReport,
    ) -> List[FixProposal]:
        """Return one proposal per distinct (finding location, verifier) match.

        Findings are visited in report order and verifiers in registry
        order; a verifier already proposed for the same location is not
        proposed again. Returns an empty list when there are no findings.
        """
        if not report.findings:
            return []
        proposals: List[FixProposal] = []
        seen: set[tuple[str, str]] = set()  # (location anchor, verifier_id)
        for finding in report.findings:
            for verifier in self.verifiers:
                if not verifier.matches(finding):
                    continue
                key = self._dedup_key(finding, verifier)
                if key in seen:
                    continue
                seen.add(key)
                proposals.append(self._build(finding, verifier))
        return proposals

    @staticmethod
    def _dedup_key(finding: FailureFinding, verifier: VerifierSpec) -> tuple[str, str]:
        """Return the key that identifies a (location, verifier) pair.

        The event id anchors the location when present. Findings without an
        event id are anchored by step index and agent name instead, so that
        unrelated findings at different steps are not collapsed into one
        bucket and silently dropped.
        """
        anchor = finding.event_id or (
            f'@step={finding.step_index}/agent={finding.agent_name}'
        )
        return (anchor, verifier.id)

    def _build(
        self, finding: FailureFinding, verifier: VerifierSpec,
    ) -> FixProposal:
        """Render the proposal recommending ``verifier`` for ``finding``."""
        summary = (
            f'Add {verifier.id} before {finding.failure_mode.mode_id} '
            f'(step {finding.step_index}, agent {finding.agent_name})'
        )
        text = (
            f'Failure: {finding.failure_mode.mode_id} '
            f'({finding.failure_mode.name})\n'
            f'Located at agent={finding.agent_name}, step={finding.step_index}, '
            f'event_id={finding.event_id}\n\n'
            f'Recommended verifier: {verifier.id}\n'
            f'Rationale: {verifier.rationale}\n\n'
            f'Suggested code:\n'
            f'```python\n{verifier.suggested_code}\n```\n'
        )
        return FixProposal(
            proposal_id=new_id('fix'),
            recoverer_id=self.id,
            target_event_id=finding.event_id,
            summary=summary,
            rationale=verifier.rationale,
            # Keep the score inside [0.3, 0.85].
            confidence=max(0.3, min(0.85, finding.confidence)),
            suggestion_text=text,
            side_effects=[],
            requires_human_approval=False,
        )
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
__all__ = [
|
|
308
|
+
'CriticRecoverer',
|
|
309
|
+
'DEFAULT_VERIFIERS',
|
|
310
|
+
'FixProposal',
|
|
311
|
+
'Recoverer',
|
|
312
|
+
'ReflexionSuggestion',
|
|
313
|
+
'VerifierSpec',
|
|
314
|
+
]
|
|
@@ -1,113 +0,0 @@
|
|
|
1
|
-
"""Lightweight recovery suggestions.
|
|
2
|
-
|
|
3
|
-
v0.1 ships ``ReflexionSuggestion`` — a *suggest-only* recovery generator that
|
|
4
|
-
produces a structured retry-prompt artifact based on Reflexion (Shinn et al.,
|
|
5
|
-
NeurIPS 2023, arXiv:2303.11366). Heavier strategies (Self-Refine loop, CRITIC,
|
|
6
|
-
Saga rollback, MCTS) are deferred per the roadmap and will land behind the same
|
|
7
|
-
:class:`Recoverer` protocol.
|
|
8
|
-
|
|
9
|
-
By design, **nothing here re-executes the agent** — recovery proposals are
|
|
10
|
-
artifacts to be surfaced (CLI/UI/PR comment) or fed back into the next run.
|
|
11
|
-
"""
|
|
12
|
-
|
|
13
|
-
from __future__ import annotations
|
|
14
|
-
|
|
15
|
-
from dataclasses import dataclass, field
|
|
16
|
-
from typing import List, Optional, Protocol
|
|
17
|
-
|
|
18
|
-
from agentdebug.models import (
|
|
19
|
-
AgentTrajectory,
|
|
20
|
-
DiagnosticReport,
|
|
21
|
-
FailureFinding,
|
|
22
|
-
new_id,
|
|
23
|
-
)
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
@dataclass
class FixProposal:
    """A single recovery suggestion produced by a recoverer.

    Plain data record; it carries the suggestion and its metadata and has
    no behaviour of its own.
    """

    # Identifier of this proposal.
    proposal_id: str
    # ``id`` of the recoverer that produced the proposal.
    recoverer_id: str
    # Event the proposal targets; ``None`` is allowed.
    target_event_id: Optional[str]
    # Short headline for the proposal.
    summary: str
    # Why the suggestion is being made.
    rationale: str
    # Confidence score attached by the recoverer.
    confidence: float
    # Full text of the suggestion.
    suggestion_text: str
    # Declared side effects; each instance gets its own fresh list.
    side_effects: List[str] = field(default_factory=list)
    # Whether a human must approve the proposal; off by default.
    requires_human_approval: bool = False
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
class Recoverer(Protocol):
    """Structural interface shared by recovery strategies.

    An implementation exposes a string ``id`` and a ``suggest`` method that
    turns a trajectory and its diagnostic report into fix proposals.
    """

    # Identifier of the recoverer.
    id: str

    def suggest(
        self,
        trajectory: AgentTrajectory,
        report: DiagnosticReport,
    ) -> List[FixProposal]:
        """Return the fix proposals for ``report`` on ``trajectory``."""
        ...
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
class ReflexionSuggestion:
    """Produce one Reflexion-style retry hint for every finding.

    The result is text only: it can be appended to the agent's next system
    prompt, written to a project ``MANUAL.md``, or shown in the Console.
    Nothing is applied automatically.
    """

    id = 'reflexion'

    def suggest(
        self,
        trajectory: AgentTrajectory,
        report: DiagnosticReport,
    ) -> List[FixProposal]:
        """Return one proposal per finding, in report order.

        Args:
            trajectory: Trajectory the report was produced for; its goal
                and framework are quoted in each reflection.
            report: Diagnostic report whose ``findings`` are converted.

        Returns:
            A list of proposals, empty when the report has no findings.
        """
        if not report.findings:
            return []
        return [
            self._build_proposal(trajectory, finding)
            for finding in report.findings
        ]

    def _build_proposal(
        self, trajectory: AgentTrajectory, finding: FailureFinding
    ) -> FixProposal:
        """Render a single finding as a textual retry reflection."""
        mode = finding.failure_mode
        goal = trajectory.goal or '(no goal recorded)'
        framework = trajectory.framework or '(framework not declared)'

        evidence_lines = [f' - {e}' for e in finding.evidence]
        evidence_block = '\n'.join(evidence_lines) or ' (none)'

        # Prefer the finding's own suggestion, then the failure mode's first
        # template, then a generic fallback.
        hint = finding.suggestion
        if not hint:
            templates = mode.suggestion_templates
            if templates:
                hint = templates[0]
            else:
                hint = 'Inspect the offending step and constrain the agent at that point.'

        # Joined with newlines; the trailing '' yields the final newline.
        reflection = '\n'.join([
            f'Task: {goal}',
            f'Framework: {framework}',
            f'Observed failure mode: {mode.mode_id} ({mode.name})',
            f'Located at agent={finding.agent_name}, step={finding.step_index}, '
            f'event_id={finding.event_id}',
            'Evidence:',
            f'{evidence_block}',
            'Next time, do the following:',
            f' {hint}',
            '',
        ])

        headline = (
            f'Reflexion retry hint for {mode.mode_id} '
            f'at step {finding.step_index}'
        )
        # Clamp the finding's confidence into the [0.1, 0.9] band.
        floored = max(0.1, finding.confidence)
        clamped = min(0.9, floored)
        return FixProposal(
            proposal_id=new_id('fix'),
            recoverer_id=self.id,
            target_event_id=finding.event_id,
            summary=headline,
            rationale=(
                'Reflexion (Shinn et al., NeurIPS 2023) converts a failure '
                'into a verbal hint appended to next attempt.'
            ),
            confidence=clamped,
            suggestion_text=reflection,
            side_effects=['memory.write'],
            requires_human_approval=False,
        )
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
__all__ = ['Recoverer', 'FixProposal', 'ReflexionSuggestion']
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|