agentdebugx 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentdebug/__init__.py +65 -0
- agentdebug/adapters/__init__.py +10 -0
- agentdebug/adapters/base.py +22 -0
- agentdebug/adapters/langgraph.py +261 -0
- agentdebug/adapters/otel.py +151 -0
- agentdebug/adapters/raw.py +134 -0
- agentdebug/analyzers.py +152 -0
- agentdebug/attribution.py +230 -0
- agentdebug/cli.py +272 -0
- agentdebug/events.py +114 -0
- agentdebug/instrumentation.py +57 -0
- agentdebug/judges.py +258 -0
- agentdebug/llm.py +165 -0
- agentdebug/models.py +169 -0
- agentdebug/recorder.py +183 -0
- agentdebug/recovery.py +113 -0
- agentdebug/storage.py +167 -0
- agentdebug/taxonomy.py +271 -0
- agentdebug/ui/__init__.py +14 -0
- agentdebug/ui/server.py +260 -0
- agentdebugx-0.1.0.dist-info/METADATA +217 -0
- agentdebugx-0.1.0.dist-info/RECORD +25 -0
- agentdebugx-0.1.0.dist-info/WHEEL +4 -0
- agentdebugx-0.1.0.dist-info/entry_points.txt +3 -0
- agentdebugx-0.1.0.dist-info/licenses/LICENSE +21 -0
agentdebug/analyzers.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"""Trajectory analyzers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Iterable, List, Optional, Tuple
|
|
6
|
+
|
|
7
|
+
from agentdebug.models import (
|
|
8
|
+
AgentEvent,
|
|
9
|
+
AgentTrajectory,
|
|
10
|
+
DiagnosticReport,
|
|
11
|
+
EventType,
|
|
12
|
+
FailureFinding,
|
|
13
|
+
FailureMode,
|
|
14
|
+
)
|
|
15
|
+
from agentdebug.taxonomy import SEED_FAILURE_MODES
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class HeuristicAnalyzer:
    """A deterministic baseline analyzer.

    Scans every event in a trajectory for error/keyword/status signals, maps
    each match to a seed failure mode, and nominates the earliest finding
    (confidence tie-broken) as the likely root cause.

    This is intentionally simple: it gives projects an immediate local signal
    while future LLM judges, constraint synthesis, and learned attribution
    models share the same output schema.
    """

    # Ordered keyword rules: (needles, seed failure-mode id, confidence,
    # evidence note). First match wins, so keep specific signals ahead of
    # weaker catch-alls. Centralizing them here avoids a long if-chain and
    # keeps mode ids / scores reviewable in one place.
    _KEYWORD_RULES: Tuple[Tuple[Tuple[str, ...], str, float, str], ...] = (
        (('context length', 'rate limit', 'token limit'),
         'system.llm_limit', 0.82, 'limit signal in event payload'),
        (('json', 'schema', 'malformed', 'parse error'),
         'action.format_error', 0.78, 'format/schema signal in event payload'),
        (('missing parameter', 'invalid parameter', 'argument'),
         'action.parameter_error', 0.76, 'parameter signal in event payload'),
        (('unknown tool', 'wrong tool', 'tool mismatch'),
         'action.wrong_tool', 0.8, 'tool-selection signal in event payload'),
        (('loop', 'repeated', 'no progress', 'max steps'),
         'planning.inefficient_plan', 0.67, 'loop/progress signal in event payload'),
        (('policy violation', 'constraint ignored', 'must not'),
         'planning.constraint_ignorance', 0.74, 'constraint signal in event payload'),
        (('premature', 'early termination', 'not complete'),
         'verification.premature_stop', 0.71, 'termination signal in event payload'),
        (('handoff', 'lost context', 'missing context'),
         'multiagent.handoff_loss', 0.7, 'handoff/context signal in event payload'),
        (('screenshot', 'ocr', 'visual', 'ui element'),
         'multimodal.perception_error', 0.62, 'multimodal perception signal in event payload'),
    )

    def analyze(self, trajectory: AgentTrajectory) -> DiagnosticReport:
        """Produce a diagnostic report for one trajectory.

        Args:
            trajectory: The recorded agent run to inspect.

        Returns:
            A :class:`DiagnosticReport` with one finding per matched event,
            deduplicated suggestions, and root-cause fields populated when at
            least one finding exists.
        """
        findings = [
            finding
            for event in trajectory.events
            for finding in self._event_findings(event)
        ]
        root = self._select_root_cause(findings)
        suggestions = self._dedupe(
            finding.suggestion for finding in findings if finding.suggestion is not None
        )

        report = DiagnosticReport(
            trace_id=trajectory.trace_id,
            task_id=trajectory.task_id,
            findings=findings,
            suggestions=suggestions,
            metadata={'analyzer': self.__class__.__name__},
        )
        if root is not None:
            report.root_cause_event_id = root.event_id
            report.root_cause_agent = root.agent_name
            report.root_cause_step_index = root.step_index
            report.summary = (
                f'Likely root cause: {root.failure_mode.name}'
                f' in {root.agent_name or "unknown agent"}'
                f' at step {root.step_index}.'
            )
        return report

    def _event_findings(self, event: AgentEvent) -> List[FailureFinding]:
        """Return zero or one finding for a single event."""
        text = self._event_text(event)
        matched = self._match_failure_mode(event, text)
        if matched is None:
            return []

        failure_mode, confidence, evidence = matched
        return [
            FailureFinding(
                failure_mode=failure_mode,
                event_id=event.event_id,
                agent_name=event.agent_name,
                step_index=event.step_index,
                confidence=confidence,
                evidence=evidence,
                suggestion=self._suggestion(failure_mode),
                metadata={'event_type': self._event_type_value(event.event_type)},
            )
        ]

    def _match_failure_mode(
        self, event: AgentEvent, text: str
    ) -> Optional[Tuple[FailureMode, float, List[str]]]:
        """Match an event against the heuristics, in priority order.

        Explicit errors win over keyword matches, which win over a generic
        failed-status fallback read from ``event.metadata``.
        """
        lowered = text.lower()
        if event.error:
            # Tool-phase errors get a more specific mode (and higher
            # confidence) than generic environment failures.
            if event.event_type in {EventType.TOOL_CALL, EventType.TOOL_RESULT}:
                return self._result('system.tool_execution_error', 0.86, [event.error])
            return self._result('system.environment_error', 0.72, [event.error])

        for needles, mode_id, confidence, note in self._KEYWORD_RULES:
            if self._contains(lowered, needles):
                return self._result(mode_id, confidence, [note])

        status = str(event.metadata.get('status', '')).lower()
        if status in {'failed', 'failure', 'error'}:
            return self._result('system.environment_error', 0.6, ['metadata status marks event as failed'])
        return None

    def _result(
        self, mode_id: str, confidence: float, evidence: List[str]
    ) -> Tuple[FailureMode, float, List[str]]:
        """Look up a seed failure mode and bundle it with score and evidence."""
        return SEED_FAILURE_MODES[mode_id], confidence, evidence

    def _select_root_cause(self, findings: List[FailureFinding]) -> Optional[FailureFinding]:
        """Pick the earliest finding; break step ties by higher confidence.

        Findings with an unknown step index sort last. ``min`` returns the
        first minimal element, so this matches the previous stable
        sort-then-index behaviour in O(n) instead of O(n log n).
        """
        if not findings:
            return None
        return min(
            findings,
            key=lambda finding: (
                finding.step_index is None,
                finding.step_index if finding.step_index is not None else 10**9,
                -finding.confidence,
            ),
        )

    def _event_text(self, event: AgentEvent) -> str:
        """Flatten the searchable portions of an event into one string."""
        parts = [
            self._event_type_value(event.event_type),
            event.module or '',
            str(event.input or ''),
            str(event.output or ''),
            event.error or '',
            str(event.metadata),
        ]
        return '\n'.join(parts)

    def _suggestion(self, failure_mode: FailureMode) -> Optional[str]:
        """Return the mode's first suggestion template, if it has any."""
        if not failure_mode.suggestion_templates:
            return None
        return str(failure_mode.suggestion_templates[0])

    def _contains(self, text: str, needles: Iterable[str]) -> bool:
        """True if any needle occurs as a substring of ``text``."""
        return any(needle in text for needle in needles)

    def _dedupe(self, values: Iterable[str]) -> List[str]:
        """Drop duplicates while preserving first-seen order."""
        return list(dict.fromkeys(values))

    def _event_type_value(self, event_type: EventType) -> str:
        """Return the enum's value (or the raw object) as a string."""
        value = getattr(event_type, 'value', event_type)
        return str(value)
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
"""Failure attribution.
|
|
2
|
+
|
|
3
|
+
v0.1 ships two backends, both behind the same :class:`Attributor` protocol:
|
|
4
|
+
|
|
5
|
+
* :class:`HeuristicAttributor` — uses the earliest finding (with confidence
|
|
6
|
+
tie-break) from a ``DiagnosticReport`` to derive blame. Zero-cost; always
|
|
7
|
+
available; matches what :class:`agentdebug.analyzers.HeuristicAnalyzer` does
|
|
8
|
+
internally today.
|
|
9
|
+
* :class:`AllAtOnceAttributor` — feeds the full trajectory + findings to an
|
|
10
|
+
LLM and asks for a single blame hypothesis. Uses the Who&When "All-at-Once"
|
|
11
|
+
method from arXiv:2505.00212.
|
|
12
|
+
|
|
13
|
+
Both produce an :class:`AttributionResult` carrying a list of :class:`Blame`
|
|
14
|
+
hypotheses with confidence, rationale, and source attribution. Honest UX: we
|
|
15
|
+
always return ranked hypotheses, never single-point claims.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import logging
|
|
21
|
+
from dataclasses import dataclass, field
|
|
22
|
+
from typing import Any, Dict, List, Optional, Protocol
|
|
23
|
+
|
|
24
|
+
from agentdebug.llm import LLMClient, extract_json_block
|
|
25
|
+
from agentdebug.models import AgentTrajectory, FailureFinding, new_id
|
|
26
|
+
|
|
27
|
+
LOG = logging.getLogger('agentdebug.attribution')
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass
class Blame:
    """A single root-cause hypothesis for a failed trajectory.

    Produced by an attributor; one or more hypotheses are ranked inside an
    :class:`AttributionResult`.
    """

    span_id: Optional[str]      # event_id of the blamed event, if known
    step_index: Optional[int]   # step position of the blamed event, if known
    agent_name: Optional[str]   # agent held responsible, if identifiable
    confidence: float           # attributor's confidence in this hypothesis
    rationale: str              # human-readable justification for the blame
    evidence: List[str] = field(default_factory=list)   # supporting quotes/snippets
    sources: List[str] = field(default_factory=list)    # ids of attributors backing this blame
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class AttributionResult:
    """Ranked blame hypotheses returned by an attributor."""

    method: str                 # id of the attributor that produced this result
    hypotheses: List[Blame]     # ranked hypotheses; empty when there is nothing to blame
    elapsed_ms: int = 0         # intended wall-clock cost in ms; not populated by the v0.1 attributors
    raw: Dict[str, Any] = field(default_factory=dict)   # backend-specific extras (e.g. finding_count)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class Attributor(Protocol):
    """Structural interface shared by all attribution backends."""

    # Short stable identifier (e.g. 'heuristic', 'all_at_once'); recorded in
    # AttributionResult.method and Blame.sources.
    id: str

    def attribute(
        self,
        trajectory: AgentTrajectory,
        findings: List[FailureFinding],
    ) -> AttributionResult:
        """Derive ranked blame hypotheses from a trajectory and its findings."""
        ...
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class HeuristicAttributor:
    """Cheap, model-free fallback that picks the earliest highest-confidence finding."""

    id = 'heuristic'

    def attribute(
        self,
        trajectory: AgentTrajectory,
        findings: List[FailureFinding],
    ) -> AttributionResult:
        """Blame the earliest finding, preferring known steps and higher confidence."""
        if not findings:
            return AttributionResult(method=self.id, hypotheses=[])

        def _priority(candidate: FailureFinding):
            # Unknown steps sort last; within a step, higher confidence wins.
            missing_step = candidate.step_index is None
            step = 10**9 if missing_step else candidate.step_index
            return (missing_step, step, -candidate.confidence)

        # min() with this key is equivalent to the stable sort-and-take-first
        # approach: both return the first minimal finding.
        primary = min(findings, key=_priority)
        rationale = (
            f'Earliest finding with non-trivial confidence: '
            f'{primary.failure_mode.name}'
        )
        blame = Blame(
            span_id=primary.event_id,
            step_index=primary.step_index,
            agent_name=primary.agent_name,
            confidence=primary.confidence,
            rationale=rationale,
            evidence=list(primary.evidence),
            sources=[self.id],
        )
        return AttributionResult(method=self.id, hypotheses=[blame])
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
_ATTR_SYSTEM_PROMPT = """You are AgentDebugX-Attributor. Given a failed agent
|
|
101
|
+
trajectory and the list of step-level failure findings already produced by a
|
|
102
|
+
judge, identify the single MOST RESPONSIBLE step (the decisive root cause).
|
|
103
|
+
|
|
104
|
+
Respond ONLY with a JSON object matching this schema (no prose, no markdown):
|
|
105
|
+
|
|
106
|
+
{
|
|
107
|
+
"span_id": "<event_id from the input or null>",
|
|
108
|
+
"step_index": <int or null>,
|
|
109
|
+
"agent_name": "<agent_name from the input or null>",
|
|
110
|
+
"confidence": <float between 0 and 1>,
|
|
111
|
+
"rationale": "<one or two sentences justifying the choice>",
|
|
112
|
+
"evidence": ["<short quoted evidence>", ...]
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
If the trajectory does not appear to have failed, return all fields as null and
|
|
116
|
+
confidence 0.
|
|
117
|
+
"""
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
class AllAtOnceAttributor:
    """LLM-based attributor mirroring Who&When's All-at-Once method.

    Sends the goal, the judge findings, and a compact event log to the LLM in
    a single call and parses one blame hypothesis out of its JSON reply. Any
    LLM or parse failure degrades gracefully to ``fallback`` (a
    :class:`HeuristicAttributor` by default), so callers always receive an
    :class:`AttributionResult`.
    """

    id = 'all_at_once'

    def __init__(
        self,
        llm: LLMClient,
        *,
        fallback: Optional[Attributor] = None,
        max_findings: int = 20,
        max_tokens: int = 2048,
    ) -> None:
        """
        Args:
            llm: Chat-completion client used for the attribution call.
            fallback: Attributor used when the LLM call or JSON parse fails;
                defaults to :class:`HeuristicAttributor`.
            max_findings: Cap on how many findings go into the prompt.
            max_tokens: Completion budget forwarded to ``llm.complete``.
        """
        self.llm = llm
        self.fallback: Attributor = fallback or HeuristicAttributor()
        self.max_findings = max_findings
        self.max_tokens = max_tokens

    def attribute(
        self,
        trajectory: AgentTrajectory,
        findings: List[FailureFinding],
    ) -> AttributionResult:
        """Ask the LLM for a single blame hypothesis; fall back on failure."""
        prompt_user = self._render_prompt(trajectory, findings[: self.max_findings])
        messages = [
            {'role': 'system', 'content': _ATTR_SYSTEM_PROMPT},
            {'role': 'user', 'content': prompt_user},
        ]
        try:
            result = self.llm.complete(messages=messages, max_tokens=self.max_tokens)
        except Exception as exc:  # pragma: no cover - defensive
            LOG.warning('LLM attribution failed; falling back: %s', exc)
            return self.fallback.attribute(trajectory, findings)
        parsed = extract_json_block(result.text)
        if not parsed:
            LOG.info('LLM attributor returned no JSON; falling back')
            return self.fallback.attribute(trajectory, findings)
        blame = Blame(
            span_id=self._coerce_str(parsed.get('span_id')),
            step_index=self._coerce_int(parsed.get('step_index')),
            agent_name=self._coerce_str(parsed.get('agent_name')),
            confidence=self._coerce_float(parsed.get('confidence'), default=0.0),
            rationale=str(parsed.get('rationale') or ''),
            evidence=self._coerce_str_list(parsed.get('evidence')),
            sources=[self.id],
        )
        return AttributionResult(
            method=self.id,
            hypotheses=[blame],
            raw={'finding_count': len(findings)},
        )

    def _render_prompt(
        self,
        trajectory: AgentTrajectory,
        findings: List[FailureFinding],
    ) -> str:
        """Render the user message: goal, framework, findings, event log."""
        findings_doc = '\n'.join(self._render_finding(f) for f in findings) or '(none)'
        # Output/error fields are truncated to 200 chars to bound prompt size.
        events_doc = '\n'.join(
            f'event_id={e.event_id} step={e.step_index} agent={e.agent_name} '
            f'type={getattr(e.event_type, "value", e.event_type)} '
            f'output={str(e.output)[:200]} error={str(e.error)[:200]}'
            for e in trajectory.events
        )
        return (
            f'GOAL: {trajectory.goal!r}\n'
            f'FRAMEWORK: {trajectory.framework!r}\n\n'
            f'JUDGE FINDINGS:\n{findings_doc}\n\n'
            f'EVENTS:\n{events_doc}\n'
        )

    def _render_finding(self, finding: FailureFinding) -> str:
        """One-line summary of a judge finding for the prompt."""
        return (
            f'- mode={finding.failure_mode.mode_id} '
            f'agent={finding.agent_name} step={finding.step_index} '
            f'confidence={finding.confidence:.2f} '
            f'evidence={"; ".join(finding.evidence)[:200]}'
        )

    @staticmethod
    def _coerce_str(value: Any) -> Optional[str]:
        """Stringify a value, preserving ``None``."""
        if value is None:
            return None
        return str(value)

    @staticmethod
    def _coerce_int(value: Any) -> Optional[int]:
        """Parse an int; ``None`` for missing or unparseable values."""
        if value is None:
            return None
        try:
            return int(value)
        except (TypeError, ValueError):
            return None

    @staticmethod
    def _coerce_float(value: Any, *, default: float) -> float:
        """Parse a confidence value, clamped into [0.0, 1.0].

        The prompt asks for a float between 0 and 1, but LLM output is not
        guaranteed to honour that, so out-of-range values are clamped rather
        than passed through to downstream ranking.
        """
        try:
            parsed = float(value)
        except (TypeError, ValueError):
            return default
        return min(1.0, max(0.0, parsed))

    @staticmethod
    def _coerce_str_list(value: Any) -> List[str]:
        """Normalize evidence into a list of strings."""
        if value is None:
            return []
        if isinstance(value, list):
            return [str(v) for v in value]
        return [str(value)]
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
__all__ = ['Attributor', 'Blame', 'AttributionResult', 'HeuristicAttributor', 'AllAtOnceAttributor']
|
agentdebug/cli.py
ADDED
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
"""Command line entry points.
|
|
2
|
+
|
|
3
|
+
v0.1 commands:
|
|
4
|
+
|
|
5
|
+
* ``agentdebug analyze`` — run an analyzer on a trajectory JSON file.
|
|
6
|
+
* ``agentdebug list`` — list trace IDs in a store.
|
|
7
|
+
* ``agentdebug show`` — pretty-print a trajectory by ID.
|
|
8
|
+
* ``agentdebug judge`` — run :class:`LLMJudgeAnalyzer` against a stored trajectory.
|
|
9
|
+
* ``agentdebug doctor`` — report which adapters/integrations are available.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import argparse
|
|
15
|
+
import os
|
|
16
|
+
import sys
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Optional, Sequence
|
|
19
|
+
|
|
20
|
+
from agentdebug.analyzers import HeuristicAnalyzer
|
|
21
|
+
from agentdebug.attribution import AllAtOnceAttributor, HeuristicAttributor
|
|
22
|
+
from agentdebug.models import DiagnosticReport, model_to_json, trajectory_from_json
|
|
23
|
+
from agentdebug.recovery import FixProposal, ReflexionSuggestion
|
|
24
|
+
from agentdebug.storage import JsonlTraceStore, SQLiteTraceStore, TraceStore
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def main(argv: Optional[Sequence[str]] = None) -> int:
    """CLI entry point: build the argument parser and dispatch a subcommand.

    Args:
        argv: Argument list (useful for tests); ``None`` means ``sys.argv[1:]``.

    Returns:
        Process exit code; 0 on success, small positive codes from the
        subcommand handlers on error.
    """
    parser = argparse.ArgumentParser(prog='agentdebug')
    sub = parser.add_subparsers(dest='command', required=True)

    # analyze: heuristic analysis of a trajectory file on disk.
    p_analyze = sub.add_parser('analyze', help='Analyze a trajectory JSON file')
    p_analyze.add_argument('trajectory', help='Path to an AgentTrajectory JSON file')
    p_analyze.add_argument('--out', help='Optional output path for the report')
    p_analyze.add_argument(
        '--suggest',
        action='store_true',
        help='Also emit Reflexion-style retry suggestions for each finding',
    )

    # list / show: read-only inspection of a trace store.
    p_list = sub.add_parser('list', help='List trace IDs in a store')
    _add_store_args(p_list)

    p_show = sub.add_parser('show', help='Print a stored trajectory as JSON')
    _add_store_args(p_show)
    p_show.add_argument('trace_id', help='Trace ID to print')

    # judge: LLM-backed analysis; model/credentials come from flags or the
    # AGENTDEBUG_LLM_* environment variables.
    p_judge = sub.add_parser(
        'judge',
        help='Run the LLM judge against a trajectory JSON file or stored trace',
    )
    p_judge.add_argument(
        'target',
        help='Path to a trajectory JSON file, or a trace_id when --store is set',
    )
    _add_store_args(p_judge, required=False)
    p_judge.add_argument(
        '--model',
        default=os.environ.get('AGENTDEBUG_LLM_MODEL', 'gemini-3-flash'),
    )
    p_judge.add_argument('--base-url', dest='base_url')
    p_judge.add_argument('--api-key', dest='api_key')
    p_judge.add_argument('--out', help='Optional output path for the report')
    p_judge.add_argument(
        '--attribute',
        action='store_true',
        help='Also run AllAtOnceAttributor on the judge findings',
    )

    sub.add_parser('doctor', help='Report adapter and integration availability')

    # serve: local dashboard; a store is mandatory here.
    p_serve = sub.add_parser('serve', help='Run the local FastAPI dashboard')
    _add_store_args(p_serve, required=True)
    p_serve.add_argument('--host', default='127.0.0.1')
    p_serve.add_argument('--port', type=int, default=7777)

    args = parser.parse_args(argv)
    if args.command == 'analyze':
        return _cmd_analyze(args)
    if args.command == 'list':
        return _cmd_list(args)
    if args.command == 'show':
        return _cmd_show(args)
    if args.command == 'judge':
        return _cmd_judge(args)
    if args.command == 'doctor':
        return _cmd_doctor()
    if args.command == 'serve':
        return _cmd_serve(args)
    # Unreachable with required=True subparsers; kept as a safe default.
    return 1
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# ---------------- subcommands ----------------
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _cmd_analyze(args: argparse.Namespace) -> int:
    """Run the heuristic analyzer over a trajectory JSON file; always exits 0."""
    raw_text = Path(args.trajectory).read_text(encoding='utf-8')
    trajectory = trajectory_from_json(raw_text)
    analyzer = HeuristicAnalyzer()
    report = analyzer.analyze(trajectory)
    rendered = model_to_json(report, indent=2)
    if args.suggest:
        # Optionally append Reflexion-style retry advice below the JSON.
        proposals = ReflexionSuggestion().suggest(trajectory, report)
        rendered = _augment_with_suggestions(rendered, report, proposals)
    _emit(rendered, args.out)
    return 0
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _cmd_list(args: argparse.Namespace) -> int:
    """Print every trace ID in the configured store, one per line."""
    trace_store = _resolve_store(args)
    if trace_store is None:
        print('No store configured. Use --store-sqlite or --store-jsonl.', file=sys.stderr)
        return 2
    for tid in trace_store.list_traces():
        print(tid)
    return 0
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _cmd_show(args: argparse.Namespace) -> int:
    """Pretty-print one stored trajectory as indented JSON."""
    trace_store = _resolve_store(args)
    if trace_store is None:
        print('No store configured. Use --store-sqlite or --store-jsonl.', file=sys.stderr)
        return 2
    loaded = trace_store.load_trajectory(args.trace_id)
    if loaded is None:
        print(f'Unknown trace_id: {args.trace_id}', file=sys.stderr)
        return 3
    print(model_to_json(loaded, indent=2))
    return 0
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _cmd_judge(args: argparse.Namespace) -> int:
    """Run the LLM judge (and optionally attribution) over one trajectory.

    ``args.target`` is tried as a file path first, then as a trace_id in the
    configured store. Exit codes: 0 success, 2 target not found and no store,
    3 unknown trace_id, 4 missing LLM credentials.
    """
    # Imported lazily so the rest of the CLI works without judge dependencies.
    from agentdebug.judges import LLMJudgeAnalyzer
    from agentdebug.llm import OpenAICompatClient

    # Load trajectory: file path first, fall back to store lookup.
    trajectory_path = Path(args.target)
    if trajectory_path.exists():
        trajectory = trajectory_from_json(
            trajectory_path.read_text(encoding='utf-8')
        )
    else:
        store = _resolve_store(args)
        if store is None:
            print(
                f'Could not find {args.target!r} on disk and no store configured.',
                file=sys.stderr,
            )
            return 2
        loaded = store.load_trajectory(args.target)
        if loaded is None:
            print(f'Unknown trace_id: {args.target}', file=sys.stderr)
            return 3
        trajectory = loaded

    # CLI flags win; environment variables are the fallback.
    base_url = args.base_url or os.environ.get('AGENTDEBUG_LLM_BASE_URL')
    api_key = args.api_key or os.environ.get('AGENTDEBUG_LLM_API_KEY')
    if not base_url or not api_key:
        print(
            'LLM judge requires --base-url and --api-key (or '
            'AGENTDEBUG_LLM_BASE_URL / AGENTDEBUG_LLM_API_KEY).',
            file=sys.stderr,
        )
        return 4

    llm = OpenAICompatClient(base_url=base_url, api_key=api_key, model=args.model)
    report = LLMJudgeAnalyzer(llm=llm).analyze(trajectory)
    rendered = model_to_json(report, indent=2)
    if args.attribute:
        # Reuse the same LLM client for the attribution pass.
        blame = AllAtOnceAttributor(llm=llm).attribute(trajectory, report.findings)
        rendered = _augment_with_blame(rendered, blame)
    _emit(rendered, args.out)
    return 0
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def _cmd_serve(args: argparse.Namespace) -> int:
    """Launch the local dashboard backed by the configured store."""
    trace_store = _resolve_store(args)
    if trace_store is None:
        print('serve requires --store-sqlite or --store-jsonl.', file=sys.stderr)
        return 2
    try:
        # Imported lazily: the UI extra (FastAPI) may not be installed.
        from agentdebug.ui import serve

        serve(trace_store, host=args.host, port=args.port)
    except ImportError as exc:
        print(str(exc), file=sys.stderr)
        return 5
    return 0
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def _cmd_doctor() -> int:
    """Probe each adapter and print a one-line availability report."""
    results = []
    # Optional adapters: an import or instrument failure becomes a ✗ row
    # instead of crashing the command.
    try:
        from agentdebug.adapters.langgraph import LangGraphAdapter

        results.append(LangGraphAdapter().instrument(_dummy_debugger()))
    except Exception as exc:  # pragma: no cover - defensive
        results.append(_status('langgraph', False, str(exc)))
    try:
        from agentdebug.adapters.otel import OTelExportAdapter

        results.append(OTelExportAdapter().instrument(_dummy_debugger()))
    except Exception as exc:  # pragma: no cover - defensive
        results.append(_status('otel', False, str(exc)))
    # The raw-loop adapter is dependency-free, so no guard is needed.
    from agentdebug.adapters.raw import RawLoopAdapter

    results.append(RawLoopAdapter().instrument(_dummy_debugger()))
    for status in results:
        mark = '✓' if status.implemented else '✗'
        print(f' {mark} {status.framework:<10} {status.notes}')
    return 0
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
# ---------------- helpers ----------------
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def _add_store_args(p: argparse.ArgumentParser, *, required: bool = False) -> None:
    """Attach the mutually-exclusive ``--store-sqlite`` / ``--store-jsonl`` flags."""
    store_group = p.add_mutually_exclusive_group(required=required)
    store_group.add_argument('--store-sqlite', help='Path to a SQLite trace store')
    store_group.add_argument('--store-jsonl', help='Path to a JSONL trace store')
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def _resolve_store(args: argparse.Namespace) -> Optional[TraceStore]:
    """Build a trace store from CLI flags; ``None`` when neither flag is set."""
    sqlite_path = getattr(args, 'store_sqlite', None)
    if sqlite_path:
        return SQLiteTraceStore(sqlite_path)
    jsonl_path = getattr(args, 'store_jsonl', None)
    if jsonl_path:
        return JsonlTraceStore(jsonl_path)
    return None
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def _emit(rendered: str, out_path: Optional[str]) -> None:
    """Write ``rendered`` to stdout, or to ``out_path`` (creating parent dirs)."""
    if out_path is None:
        print(rendered)
        return
    destination = Path(out_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    # Files get a trailing newline; stdout already gets one from print().
    destination.write_text(rendered + '\n', encoding='utf-8')
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def _augment_with_suggestions(
    rendered: str, report: DiagnosticReport, proposals: list[FixProposal]
) -> str:
    """Append a plain-text Reflexion section after the JSON report.

    Returns ``rendered`` unchanged when there are no proposals.
    """
    blocks = [
        f'-- proposal {p.proposal_id} ({p.recoverer_id}) --\n{p.suggestion_text}'
        for p in proposals
    ]
    if not blocks:
        return rendered
    return rendered + '\n\n# === Reflexion suggestions ===\n' + '\n'.join(blocks)
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def _augment_with_blame(rendered: str, blame_result: object) -> str:
    """Append the attribution result's repr under a section header."""
    header = '\n\n# === Attribution ===\n'
    return f'{rendered}{header}{blame_result!r}'
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def _status(framework: str, implemented: bool, notes: str) -> object:
    """Build an AdapterStatus row without importing adapters at module load."""
    from agentdebug.adapters.base import AdapterStatus

    status = AdapterStatus(framework=framework, implemented=implemented, notes=notes)
    return status
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def _dummy_debugger() -> object:
    """Create a throwaway AgentDebug backed by a scratch JSONL store.

    Used only by ``doctor`` so adapters can be probed without real tracing.
    """
    from agentdebug.recorder import AgentDebug
    from agentdebug.storage import JsonlTraceStore

    scratch_store = JsonlTraceStore('.agentdebug/_doctor.jsonl')
    return AgentDebug(store=scratch_store)
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
if __name__ == '__main__':
|
|
272
|
+
raise SystemExit(main())
|