agentdebugx-0.1.0-py3-none-any.whl
- agentdebug/__init__.py +65 -0
- agentdebug/adapters/__init__.py +10 -0
- agentdebug/adapters/base.py +22 -0
- agentdebug/adapters/langgraph.py +261 -0
- agentdebug/adapters/otel.py +151 -0
- agentdebug/adapters/raw.py +134 -0
- agentdebug/analyzers.py +152 -0
- agentdebug/attribution.py +230 -0
- agentdebug/cli.py +272 -0
- agentdebug/events.py +114 -0
- agentdebug/instrumentation.py +57 -0
- agentdebug/judges.py +258 -0
- agentdebug/llm.py +165 -0
- agentdebug/models.py +169 -0
- agentdebug/recorder.py +183 -0
- agentdebug/recovery.py +113 -0
- agentdebug/storage.py +167 -0
- agentdebug/taxonomy.py +271 -0
- agentdebug/ui/__init__.py +14 -0
- agentdebug/ui/server.py +260 -0
- agentdebugx-0.1.0.dist-info/METADATA +217 -0
- agentdebugx-0.1.0.dist-info/RECORD +25 -0
- agentdebugx-0.1.0.dist-info/WHEEL +4 -0
- agentdebugx-0.1.0.dist-info/entry_points.txt +3 -0
- agentdebugx-0.1.0.dist-info/licenses/LICENSE +21 -0
agentdebug/events.py
ADDED
@@ -0,0 +1,114 @@
"""In-process pub/sub event bus for live trace introspection.

The bus is intentionally simple: synchronous fan-out, a per-subscriber error
budget, and a guarantee that ``publish`` never raises. Detectors that need to
run on a streaming trace subscribe here; offline detectors continue to walk
the ``AgentTrajectory`` directly.
"""

from __future__ import annotations

import logging
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Any, Callable, DefaultDict, Dict, List, Optional
from uuid import uuid4

from agentdebug.models import AgentEvent, AgentTrajectory, DiagnosticReport

LOG = logging.getLogger('agentdebug.events')

EventHandler = Callable[['BusEvent'], None]


@dataclass(frozen=True)
class BusEvent:
    """Wrapper passed to bus subscribers.

    The ``kind`` field is one of: ``trace.start``, ``trace.event``,
    ``trace.end``, ``analysis.report``.
    """

    kind: str
    trajectory: Optional[AgentTrajectory] = None
    event: Optional[AgentEvent] = None
    report: Optional[DiagnosticReport] = None
    payload: Dict[str, Any] = field(default_factory=dict)


@dataclass
class EventSubscription:
    """Returned by :meth:`EventBus.subscribe`; call ``.unsubscribe()`` to detach."""

    id: str
    kind: str
    handler: EventHandler
    _bus: 'EventBus'

    def unsubscribe(self) -> None:
        self._bus._remove(self)


class EventBus:
    """A tiny synchronous pub/sub bus.

    The bus never raises during ``publish`` — handler exceptions are logged and
    a per-handler error counter is incremented. Misbehaving handlers are
    auto-detached after ``error_budget`` consecutive failures so a broken
    detector cannot wedge a live agent.
    """

    KIND_ANY = '*'

    def __init__(self, *, error_budget: int = 5) -> None:
        self._subs: DefaultDict[str, List[EventSubscription]] = defaultdict(list)
        self._error_counts: Dict[str, int] = {}
        self._error_budget = error_budget

    def subscribe(self, kind: str, handler: EventHandler) -> EventSubscription:
        sub = EventSubscription(
            id=f'sub_{uuid4().hex}',
            kind=kind,
            handler=handler,
            _bus=self,
        )
        self._subs[kind].append(sub)
        self._error_counts[sub.id] = 0
        return sub

    def publish(self, evt: BusEvent) -> None:
        # Fan-out to exact-kind subscribers AND wildcard subscribers.
        for sub in list(self._subs.get(evt.kind, ())):
            self._invoke(sub, evt)
        for sub in list(self._subs.get(self.KIND_ANY, ())):
            self._invoke(sub, evt)

    def subscribers(self, kind: str) -> int:
        return len(self._subs.get(kind, ()))

    def _invoke(self, sub: EventSubscription, evt: BusEvent) -> None:
        try:
            sub.handler(evt)
            self._error_counts[sub.id] = 0
        except Exception as exc:  # pragma: no cover - defensive only
            LOG.warning(
                'event handler %s raised on kind=%s: %s', sub.id, evt.kind, exc
            )
            self._error_counts[sub.id] = self._error_counts.get(sub.id, 0) + 1
            if self._error_counts[sub.id] >= self._error_budget:
                LOG.warning(
                    'detaching subscription %s after %d failures',
                    sub.id,
                    self._error_budget,
                )
                self._remove(sub)

    def _remove(self, sub: EventSubscription) -> None:
        bucket = self._subs.get(sub.kind, [])
        self._subs[sub.kind] = [s for s in bucket if s.id != sub.id]
        self._error_counts.pop(sub.id, None)


# A process-global default bus. Tracers that don't take an explicit bus argument
# use this one; tests construct their own.
DEFAULT_BUS = EventBus()
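Usage sketch (not part of the wheel): the bus above is self-contained, so a short example shows the full lifecycle. Everything below uses only names defined in events.py; the kinds and payload values are illustrative.

from agentdebug.events import BusEvent, EventBus

bus = EventBus(error_budget=3)

def on_any(evt: BusEvent) -> None:
    # '*' subscribers see every publication, whatever its kind.
    print(f'saw kind={evt.kind} payload={evt.payload}')

sub = bus.subscribe(EventBus.KIND_ANY, on_any)
bus.publish(BusEvent(kind='trace.start', payload={'trace_id': 'demo'}))
bus.publish(BusEvent(kind='trace.end'))
sub.unsubscribe()
assert bus.subscribers(EventBus.KIND_ANY) == 0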
agentdebug/instrumentation.py
ADDED
@@ -0,0 +1,57 @@
"""Small helpers for instrumenting plain Python functions."""

from __future__ import annotations

import time
from functools import wraps
from typing import Any, Callable, Optional, TypeVar, cast

from agentdebug.models import EventType
from agentdebug.recorder import TraceSession

F = TypeVar('F', bound=Callable[..., Any])


def traced_tool(
    session: TraceSession,
    agent_name: str = 'tool',
    tool_name: Optional[str] = None,
) -> Callable[[F], F]:
    """Record a function invocation as tool.call/tool.result events."""

    def decorator(func: F) -> F:
        name = tool_name or func.__name__

        @wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            started = time.perf_counter()
            call_event = session.record(
                EventType.TOOL_CALL,
                agent_name=agent_name,
                input={'tool': name, 'args': repr(args), 'kwargs': repr(kwargs)},
            )
            try:
                result = func(*args, **kwargs)
            except Exception as exc:
                session.record(
                    EventType.TOOL_RESULT,
                    agent_name=agent_name,
                    parent_event_id=call_event.event_id,
                    error=str(exc),
                    duration_ms=(time.perf_counter() - started) * 1000,
                    tool=name,
                )
                raise
            session.record(
                EventType.TOOL_RESULT,
                agent_name=agent_name,
                parent_event_id=call_event.event_id,
                output=result,
                duration_ms=(time.perf_counter() - started) * 1000,
                tool=name,
            )
            return result

        return cast(F, wrapper)

    return decorator
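A hedged sketch of wiring the decorator above: obtaining a TraceSession is recorder.py's job and that file is not shown here, so `session` is left as a placeholder rather than a real constructor call.

from agentdebug.instrumentation import traced_tool
from agentdebug.recorder import TraceSession

session: TraceSession = ...  # however your recorder setup hands one out

@traced_tool(session, agent_name='researcher', tool_name='web_search')
def web_search(query: str) -> str:
    return f'results for {query!r}'

web_search('agent tracing')  # records tool.call, then tool.result with duration_ms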
agentdebug/judges.py
ADDED
@@ -0,0 +1,258 @@
"""LLM-based judge analyzer.

Walks an ``AgentTrajectory`` and asks an LLM to label step-level failures
against the 19 seed modes shipped in :mod:`agentdebug.taxonomy`. The judge is
intentionally constrained to the existing taxonomy so the rest of the pipeline
(``DiagnosticReport``, recovery, attribution) doesn't need a parallel label
space.
"""

from __future__ import annotations

import logging
from typing import Any, Dict, List, Optional, Sequence

from agentdebug.llm import LLMClient, extract_json_block
from agentdebug.models import (
    AgentEvent,
    AgentTrajectory,
    DiagnosticReport,
    EventType,
    FailureFinding,
    FailureMode,
    new_id,
)
from agentdebug.taxonomy import SEED_FAILURE_MODES

LOG = logging.getLogger('agentdebug.judges')

_SYSTEM_PROMPT = """You are AgentDebugX-Judge, an expert at diagnosing failures
in LLM agent trajectories.

You will be given:
* the agent's goal,
* a list of ALLOWED failure mode codes with descriptions,
* the chronological events of one agent run.

Your job: identify each step where a failure occurred and label it with ONE of
the allowed failure mode codes. Be conservative — only flag steps where the
evidence in the event payload supports the label. If the trajectory contains no
failure, return an empty findings list.

Respond ONLY with a JSON object matching this schema (no prose, no markdown):

{
  "findings": [
    {
      "event_id": "<event_id from the input>",
      "step_index": <int or null>,
      "agent_name": "<agent_name from the input>",
      "failure_mode_id": "<one of the allowed codes>",
      "confidence": <float between 0 and 1>,
      "evidence": ["<short quote or summary of the supporting payload>"]
    }
  ],
  "summary": "<one-sentence diagnosis or 'No failure detected.'>"
}
"""


class LLMJudgeAnalyzer:
    """LLM-as-judge analyzer constrained to the 19 seed failure modes."""

    def __init__(
        self,
        llm: LLMClient,
        *,
        max_events_per_call: int = 80,
        max_evidence_chars: int = 300,
        max_tokens: int = 4096,
    ) -> None:
        self.llm = llm
        self.max_events_per_call = max_events_per_call
        self.max_evidence_chars = max_evidence_chars
        # NOTE: thinking models (Gemini 2.x/3.x, o-series) spend a substantial
        # fraction of `max_tokens` on reasoning tokens before any text is
        # emitted. 4096 is the safe default; bump higher for long traces.
        self.max_tokens = max_tokens

    def analyze(self, trajectory: AgentTrajectory) -> DiagnosticReport:
        events = trajectory.events
        findings: List[FailureFinding] = []
        summary_parts: List[str] = []
        for chunk in self._chunk_events(events):
            findings_chunk, summary = self._judge_chunk(trajectory, chunk)
            findings.extend(findings_chunk)
            if summary:
                summary_parts.append(summary)
        root = self._select_root(findings)
        report = DiagnosticReport(
            trace_id=trajectory.trace_id,
            task_id=trajectory.task_id,
            findings=findings,
            suggestions=self._collect_suggestions(findings),
            metadata={'analyzer': self.__class__.__name__, 'model': self.llm.model},
        )
        report.summary = (
            ' '.join(s for s in summary_parts if s)
            or (
                f'Likely root cause: {root.failure_mode.name}'
                f' in {root.agent_name or "unknown agent"}'
                f' at step {root.step_index}.'
                if root
                else 'No failure was detected.'
            )
        )
        if root is not None:
            report.root_cause_event_id = root.event_id
            report.root_cause_agent = root.agent_name
            report.root_cause_step_index = root.step_index
        return report

    def _chunk_events(self, events: Sequence[AgentEvent]) -> List[List[AgentEvent]]:
        if not events:
            return []
        return [
            list(events[i : i + self.max_events_per_call])
            for i in range(0, len(events), self.max_events_per_call)
        ]

    def _judge_chunk(
        self, trajectory: AgentTrajectory, chunk: List[AgentEvent]
    ) -> tuple[List[FailureFinding], str]:
        user = self._render_user_prompt(trajectory, chunk)
        messages = [
            {'role': 'system', 'content': _SYSTEM_PROMPT},
            {'role': 'user', 'content': user},
        ]
        result = self.llm.complete(messages=messages, max_tokens=self.max_tokens)
        parsed = extract_json_block(result.text)
        if not parsed:
            LOG.warning('LLM judge returned no JSON; raw=%r', result.text[:300])
            return [], ''
        raw_findings = parsed.get('findings') or []
        summary = str(parsed.get('summary') or '')
        findings: List[FailureFinding] = []
        for raw in raw_findings:
            if not isinstance(raw, dict):
                continue
            mode_id = str(raw.get('failure_mode_id') or '')
            failure_mode = SEED_FAILURE_MODES.get(mode_id)
            if failure_mode is None:
                LOG.debug('skipping unknown failure_mode_id from judge: %s', mode_id)
                continue
            findings.append(
                FailureFinding(
                    finding_id=new_id('finding'),
                    failure_mode=failure_mode,
                    event_id=self._coerce_str(raw.get('event_id')),
                    agent_name=self._coerce_str(raw.get('agent_name')),
                    step_index=self._coerce_int(raw.get('step_index')),
                    confidence=self._coerce_float(raw.get('confidence'), default=0.5),
                    evidence=self._coerce_str_list(raw.get('evidence')),
                    suggestion=self._suggestion(failure_mode),
                    metadata={'source': 'llm_judge'},
                )
            )
        return findings, summary

    def _render_user_prompt(
        self, trajectory: AgentTrajectory, chunk: List[AgentEvent]
    ) -> str:
        modes_doc = '\n'.join(
            f'- {mode_id}: {mode.description}'
            for mode_id, mode in SEED_FAILURE_MODES.items()
        )
        events_doc = '\n'.join(self._render_event(evt) for evt in chunk)
        return (
            f'GOAL: {trajectory.goal!r}\n'
            f'FRAMEWORK: {trajectory.framework!r}\n\n'
            f'ALLOWED FAILURE MODES:\n{modes_doc}\n\n'
            f'EVENTS:\n{events_doc}\n'
        )

    def _render_event(self, event: AgentEvent) -> str:
        def shorten(value: Any) -> str:
            text = '' if value is None else str(value)
            if len(text) > self.max_evidence_chars:
                text = text[: self.max_evidence_chars] + '…'
            return text

        return (
            f'event_id={event.event_id} '
            f'type={self._event_type_value(event.event_type)} '
            f'agent={event.agent_name} '
            f'module={event.module} step={event.step_index} '
            f'input={shorten(event.input)} '
            f'output={shorten(event.output)} '
            f'error={shorten(event.error)} '
            f'metadata={shorten(event.metadata)}'
        )

    def _select_root(
        self, findings: List[FailureFinding]
    ) -> Optional[FailureFinding]:
        if not findings:
            return None
        return sorted(
            findings,
            key=lambda finding: (
                finding.step_index is None,
                finding.step_index if finding.step_index is not None else 10**9,
                -finding.confidence,
            ),
        )[0]

    def _collect_suggestions(self, findings: List[FailureFinding]) -> List[str]:
        seen = set()
        out: List[str] = []
        for f in findings:
            if f.suggestion and f.suggestion not in seen:
                seen.add(f.suggestion)
                out.append(f.suggestion)
        return out

    def _suggestion(self, failure_mode: FailureMode) -> Optional[str]:
        if not failure_mode.suggestion_templates:
            return None
        return str(failure_mode.suggestion_templates[0])

    @staticmethod
    def _coerce_str(value: Any) -> Optional[str]:
        if value is None:
            return None
        return str(value)

    @staticmethod
    def _coerce_int(value: Any) -> Optional[int]:
        if value is None:
            return None
        try:
            return int(value)
        except (TypeError, ValueError):
            return None

    @staticmethod
    def _coerce_float(value: Any, *, default: float) -> float:
        try:
            return float(value)
        except (TypeError, ValueError):
            return default

    @staticmethod
    def _coerce_str_list(value: Any) -> List[str]:
        if value is None:
            return []
        if isinstance(value, list):
            return [str(v) for v in value]
        return [str(value)]

    @staticmethod
    def _event_type_value(event_type: EventType) -> str:
        value = getattr(event_type, 'value', event_type)
        if isinstance(value, str):
            return value
        return str(value)


__all__ = ['LLMJudgeAnalyzer']
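An end-to-end sketch of running the judge. `LLMJudgeAnalyzer` and `OpenAICompatClient.from_env` are real names from this wheel; `load_trajectory` is a hypothetical stand-in for however you obtain an AgentTrajectory (e.g. via agentdebug.storage, not shown here).

from agentdebug.judges import LLMJudgeAnalyzer
from agentdebug.llm import OpenAICompatClient

llm = OpenAICompatClient.from_env()  # reads AGENTDEBUG_LLM_BASE_URL / _API_KEY / _MODEL
judge = LLMJudgeAnalyzer(llm, max_events_per_call=40, max_tokens=8192)

trajectory = load_trajectory('trace_123')  # hypothetical loader
report = judge.analyze(trajectory)
print(report.summary)
for finding in report.findings:
    print(finding.failure_mode.name, finding.confidence, finding.evidence)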
agentdebug/llm.py
ADDED
@@ -0,0 +1,165 @@
"""Thin LLM client abstraction.

AgentDebugX needs an LLM for the judge analyzer and for the All-at-Once
attributor. We use an OpenAI-compatible chat-completions interface so users can
point us at OpenAI, Anthropic via LiteLLM, a user-supplied Gemini gateway, or
a local vLLM/Ollama deployment.

The implementation deliberately avoids depending on the ``openai`` Python SDK:
a single ``httpx`` POST keeps the install lightweight and lets users target any
``/v1/chat/completions``-compatible URL.
"""

from __future__ import annotations

import json
import logging
import os
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Protocol

import httpx

LOG = logging.getLogger('agentdebug.llm')


@dataclass
class CompletionResult:
    text: str
    raw: Dict[str, Any]


class LLMClient(Protocol):
    model: str

    def complete(
        self,
        messages: List[Dict[str, Any]],
        *,
        response_format: Optional[Dict[str, Any]] = None,
        temperature: float = 0.0,
        max_tokens: int = 2048,
        timeout: float = 60.0,
    ) -> CompletionResult:
        ...


class OpenAICompatClient:
    """OpenAI-compatible chat completions client.

    Works against:

    * OpenAI (``base_url='https://api.openai.com/v1'``)
    * LiteLLM proxy (any ``/v1`` URL)
    * The Gemini gateway used in this repo:
      ``https://compliant-wagner-simulations-coaches.trycloudflare.com/v1``
      with ``model='gemini-3-flash'``
    * vLLM / Ollama with their OpenAI-compat servers
    """

    def __init__(
        self,
        *,
        base_url: str,
        api_key: str,
        model: str,
        default_max_tokens: int = 2048,
        timeout: float = 60.0,
    ) -> None:
        self.base_url = base_url.rstrip('/')
        self.api_key = api_key
        self.model = model
        self.default_max_tokens = default_max_tokens
        self.timeout = timeout

    @classmethod
    def from_env(
        cls,
        *,
        env_prefix: str = 'AGENTDEBUG_LLM',
        model: Optional[str] = None,
    ) -> 'OpenAICompatClient':
        """Construct from environment variables.

        Reads ``<PREFIX>_BASE_URL``, ``<PREFIX>_API_KEY``, ``<PREFIX>_MODEL``.
        """
        base_url = os.environ[f'{env_prefix}_BASE_URL']
        api_key = os.environ[f'{env_prefix}_API_KEY']
        model_id: str = (
            model if model is not None
            else os.environ.get(f'{env_prefix}_MODEL', 'gpt-4o-mini')
        )
        return cls(base_url=base_url, api_key=api_key, model=model_id)

    def complete(
        self,
        messages: List[Dict[str, Any]],
        *,
        response_format: Optional[Dict[str, Any]] = None,
        temperature: float = 0.0,
        max_tokens: Optional[int] = None,
        timeout: Optional[float] = None,
    ) -> CompletionResult:
        body: Dict[str, Any] = {
            'model': self.model,
            'messages': messages,
            'temperature': temperature,
            'max_tokens': max_tokens or self.default_max_tokens,
        }
        if response_format is not None:
            body['response_format'] = response_format
        url = f'{self.base_url}/chat/completions'
        headers = {
            'Authorization': f'Bearer {self.api_key}',
            'Content-Type': 'application/json',
        }
        resp = httpx.post(
            url, headers=headers, json=body, timeout=timeout or self.timeout
        )
        resp.raise_for_status()
        data: Dict[str, Any] = resp.json()
        choice = (data.get('choices') or [{}])[0]
        message = choice.get('message') or {}
        text = message.get('content') or ''
        if not text:
            LOG.warning(
                'empty content from %s (model=%s, finish_reason=%s, usage=%s)',
                url,
                self.model,
                choice.get('finish_reason'),
                data.get('usage'),
            )
        return CompletionResult(text=text, raw=data)


def extract_json_block(text: str) -> Optional[Dict[str, Any]]:
    """Extract the first top-level JSON object from a possibly-fenced response."""
    if not text:
        return None
    # Strip code fences if present.
    cleaned = text.strip()
    if cleaned.startswith('```'):
        # remove ``` or ```json prefix and trailing ```
        cleaned = cleaned.split('\n', 1)[1] if '\n' in cleaned else cleaned[3:]
        if cleaned.endswith('```'):
            cleaned = cleaned[: -len('```')]
        cleaned = cleaned.strip()
    # Try strict parsing first.
    try:
        parsed = json.loads(cleaned)
        if isinstance(parsed, dict):
            return parsed
    except json.JSONDecodeError:
        pass
    # Fallback: greedy slice between the first { and the last }.
    start = cleaned.find('{')
    end = cleaned.rfind('}')
    if start == -1 or end == -1 or end <= start:
        return None
    try:
        parsed = json.loads(cleaned[start : end + 1])
        if isinstance(parsed, dict):
            return parsed
    except json.JSONDecodeError:
        return None
    return None
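A quick sketch of extract_json_block's fallback behavior, runnable without any network access: fenced, prose-wrapped, and non-JSON responses all behave as the docstring promises.

from agentdebug.llm import extract_json_block

fenced = '```json\n{"findings": [], "summary": "No failure detected."}\n```'
wrapped = 'Sure! Here is the JSON: {"findings": [], "summary": "ok"} Hope that helps.'

assert extract_json_block(fenced) == {'findings': [], 'summary': 'No failure detected.'}
assert extract_json_block(wrapped) == {'findings': [], 'summary': 'ok'}  # greedy {...} slice
assert extract_json_block('no json here') is None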