agentdebugx 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
agentdebug/events.py ADDED
@@ -0,0 +1,114 @@
1
+ """In-process pub/sub event bus for live trace introspection.
2
+
3
+ The bus is intentionally simple: synchronous fan-out, bounded queue per
4
+ subscriber, and never raises out of a publish call. Detectors that need to run
5
+ on a streaming trace subscribe here; offline detectors continue to walk the
6
+ ``AgentTrajectory`` directly.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import logging
12
+ from collections import defaultdict
13
+ from dataclasses import dataclass, field
14
+ from typing import Any, Callable, DefaultDict, Dict, List, Optional
15
+ from uuid import uuid4
16
+
17
+ from agentdebug.models import AgentEvent, AgentTrajectory, DiagnosticReport
18
+
19
LOG = logging.getLogger('agentdebug.events')

# Signature every bus subscriber must implement: receives each matching
# BusEvent and returns nothing. Handler exceptions are contained by
# EventBus._invoke rather than propagating out of publish().
EventHandler = Callable[['BusEvent'], None]
22
+
23
+
24
@dataclass(frozen=True)
class BusEvent:
    """Wrapper passed to bus subscribers.

    The ``kind`` field is one of: ``trace.start``, ``trace.event``,
    ``trace.end``, ``analysis.report``.
    """

    # Dispatch key; EventBus matches subscribers on this (or on '*').
    kind: str
    # Optional attachments — publishers fill whichever apply to `kind`.
    trajectory: Optional[AgentTrajectory] = None
    event: Optional[AgentEvent] = None
    report: Optional[DiagnosticReport] = None
    # Free-form extras; defaults to a fresh dict per event.
    payload: Dict[str, Any] = field(default_factory=dict)
37
+
38
+
39
@dataclass
class EventSubscription:
    """Returned by :meth:`EventBus.subscribe`; call ``.unsubscribe()`` to detach."""

    # Unique handle ('sub_<hex>'), assigned by EventBus.subscribe.
    id: str
    # Event kind this subscription listens for ('*' matches every kind).
    kind: str
    # Callback invoked with each matching BusEvent.
    handler: EventHandler
    # Owning bus; kept only so unsubscribe() can detach.
    _bus: 'EventBus'

    def unsubscribe(self) -> None:
        """Detach this subscription from its bus (safe to call repeatedly)."""
        self._bus._remove(self)
50
+
51
+
52
class EventBus:
    """A tiny synchronous pub/sub bus.

    The bus never raises during ``publish`` — handler exceptions are logged and
    a per-handler error counter is incremented. Misbehaving handlers are
    auto-detached after ``error_budget`` consecutive failures so a broken
    detector cannot wedge a live agent.
    """

    # Subscription kind that receives every published event.
    KIND_ANY = '*'

    def __init__(self, *, error_budget: int = 5) -> None:
        # kind -> subscriptions registered for exactly that kind.
        self._subs: DefaultDict[str, List[EventSubscription]] = defaultdict(list)
        # subscription id -> consecutive failure count (reset on success).
        self._error_counts: Dict[str, int] = {}
        self._error_budget = error_budget

    def subscribe(self, kind: str, handler: EventHandler) -> EventSubscription:
        """Attach *handler* for *kind* (or ``KIND_ANY``); returns a handle."""
        sub = EventSubscription(
            id=f'sub_{uuid4().hex}',
            kind=kind,
            handler=handler,
            _bus=self,
        )
        self._subs[kind].append(sub)
        self._error_counts[sub.id] = 0
        return sub

    def publish(self, evt: BusEvent) -> None:
        """Synchronously fan *evt* out to matching subscribers; never raises."""
        # Snapshot each bucket so handlers may (un)subscribe during delivery.
        for sub in list(self._subs.get(evt.kind, ())):
            self._invoke(sub, evt)
        # Wildcard subscribers see everything. Skip the second pass when the
        # published kind is itself KIND_ANY — previously the wildcard bucket
        # was walked twice in that case, delivering the event twice.
        if evt.kind != self.KIND_ANY:
            for sub in list(self._subs.get(self.KIND_ANY, ())):
                self._invoke(sub, evt)

    def subscribers(self, kind: str) -> int:
        """Number of subscriptions currently registered for *kind*."""
        return len(self._subs.get(kind, ()))

    def _invoke(self, sub: EventSubscription, evt: BusEvent) -> None:
        """Call one handler, containing failures and enforcing the budget."""
        try:
            sub.handler(evt)
        except Exception as exc:  # pragma: no cover - defensive only
            LOG.warning(
                'event handler %s raised on kind=%s: %s', sub.id, evt.kind, exc
            )
            self._error_counts[sub.id] = self._error_counts.get(sub.id, 0) + 1
            if self._error_counts[sub.id] >= self._error_budget:
                LOG.warning(
                    'detaching subscription %s after %d failures',
                    sub.id,
                    self._error_budget,
                )
                self._remove(sub)
        else:
            # Reset the consecutive-failure counter — but only if the
            # subscription is still registered. The old unconditional write
            # re-created (leaked) the counter entry for a subscription that
            # detached itself earlier in this publish pass.
            if sub.id in self._error_counts:
                self._error_counts[sub.id] = 0

    def _remove(self, sub: EventSubscription) -> None:
        """Drop *sub* from its bucket and forget its error counter (idempotent)."""
        bucket = self._subs.get(sub.kind, [])
        self._subs[sub.kind] = [s for s in bucket if s.id != sub.id]
        self._error_counts.pop(sub.id, None)
110
+
111
+
112
# A process-global default bus. Tracers that don't take an explicit bus argument
# use this one; tests construct their own.
DEFAULT_BUS = EventBus()
@@ -0,0 +1,57 @@
1
+ """Small helpers for instrumenting plain Python functions."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import time
6
+ from functools import wraps
7
+ from typing import Any, Callable, Optional, TypeVar, cast
8
+
9
+ from agentdebug.models import EventType
10
+ from agentdebug.recorder import TraceSession
11
+
12
+ F = TypeVar('F', bound=Callable[..., Any])
13
+
14
+
15
def traced_tool(
    session: TraceSession,
    agent_name: str = 'tool',
    tool_name: Optional[str] = None,
) -> Callable[[F], F]:
    """Record a function invocation as tool.call/tool.result events.

    Each call of the wrapped function emits a TOOL_CALL event first, then a
    matching TOOL_RESULT carrying either the return value or the stringified
    exception (which is re-raised). Both result events link back to the call
    event via ``parent_event_id`` and carry the elapsed wall time.
    """

    def decorator(func: F) -> F:
        label = tool_name or func.__name__

        @wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            started = time.perf_counter()

            def elapsed_ms() -> float:
                # Milliseconds since the wrapped call began; sampled at each
                # record point so error/success paths report their own timing.
                return (time.perf_counter() - started) * 1000

            call_event = session.record(
                EventType.TOOL_CALL,
                agent_name=agent_name,
                input={'tool': label, 'args': repr(args), 'kwargs': repr(kwargs)},
            )
            shared = dict(
                agent_name=agent_name,
                parent_event_id=call_event.event_id,
                tool=label,
            )
            try:
                result = func(*args, **kwargs)
            except Exception as exc:
                session.record(
                    EventType.TOOL_RESULT,
                    error=str(exc),
                    duration_ms=elapsed_ms(),
                    **shared,
                )
                raise
            session.record(
                EventType.TOOL_RESULT,
                output=result,
                duration_ms=elapsed_ms(),
                **shared,
            )
            return result

        return cast(F, wrapper)

    return decorator
agentdebug/judges.py ADDED
@@ -0,0 +1,258 @@
1
+ """LLM-based judge analyzer.
2
+
3
+ Walks an ``AgentTrajectory`` and asks an LLM to label step-level failures
4
+ against the 19 seed modes shipped in :mod:`agentdebug.taxonomy`. The judge is
5
+ intentionally schemed to the existing taxonomy so the rest of the pipeline
6
+ (``DiagnosticReport``, recovery, attribution) doesn't need a parallel label
7
+ space.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import logging
13
+ from typing import Any, Dict, List, Optional, Sequence
14
+
15
+ from agentdebug.llm import LLMClient, extract_json_block
16
+ from agentdebug.models import (
17
+ AgentEvent,
18
+ AgentTrajectory,
19
+ DiagnosticReport,
20
+ EventType,
21
+ FailureFinding,
22
+ FailureMode,
23
+ new_id,
24
+ )
25
+ from agentdebug.taxonomy import SEED_FAILURE_MODES
26
+
27
LOG = logging.getLogger('agentdebug.judges')

# System prompt for every judge call. The allowed failure-mode codes and the
# rendered trajectory events go into the *user* message (built by
# LLMJudgeAnalyzer._render_user_prompt); the model must answer with only the
# JSON schema below.
_SYSTEM_PROMPT = """You are AgentDebugX-Judge, an expert at diagnosing failures
in LLM agent trajectories.

You will be given:
* the agent's goal,
* a list of ALLOWED failure mode codes with descriptions,
* the chronological events of one agent run.

Your job: identify each step where a failure occurred and label it with ONE of
the allowed failure mode codes. Be conservative — only flag steps where the
evidence in the event payload supports the label. If the trajectory contains no
failure, return an empty findings list.

Respond ONLY with a JSON object matching this schema (no prose, no markdown):

{
  "findings": [
    {
      "event_id": "<event_id from the input>",
      "step_index": <int or null>,
      "agent_name": "<agent_name from the input>",
      "failure_mode_id": "<one of the allowed codes>",
      "confidence": <float between 0 and 1>,
      "evidence": ["<short quote or summary of the supporting payload>"]
    }
  ],
  "summary": "<one-sentence diagnosis or 'No failure detected.'>"
}
"""
58
+
59
+
60
class LLMJudgeAnalyzer:
    """LLM-as-judge analyzer schemed to the 19 seed failure modes.

    Chunks a trajectory's events, asks the LLM to label failures against
    ``SEED_FAILURE_MODES``, and folds the results into a ``DiagnosticReport``.
    """

    def __init__(
        self,
        llm: LLMClient,
        *,
        max_events_per_call: int = 80,
        max_evidence_chars: int = 300,
        max_tokens: int = 4096,
    ) -> None:
        """Create a judge.

        Args:
            llm: Chat-completion client used for every judging call.
            max_events_per_call: Events per LLM request; longer traces are
                split into consecutive chunks.
            max_evidence_chars: Truncation limit for each rendered event field.
            max_tokens: Completion budget passed to the LLM on each call.
        """
        self.llm = llm
        self.max_events_per_call = max_events_per_call
        self.max_evidence_chars = max_evidence_chars
        # NOTE: thinking models (Gemini 2.x/3.x, o-series) spend a substantial
        # fraction of `max_tokens` on reasoning tokens before any text is
        # emitted. 4096 is the safe default; bump higher for long traces.
        self.max_tokens = max_tokens

    def analyze(self, trajectory: AgentTrajectory) -> DiagnosticReport:
        """Judge *trajectory* and return a populated :class:`DiagnosticReport`.

        Events are judged chunk by chunk; all findings are merged, then the
        earliest (and, on ties, most confident) finding is promoted to root
        cause.
        """
        events = trajectory.events
        findings: List[FailureFinding] = []
        summary_parts: List[str] = []
        for chunk in self._chunk_events(events):
            findings_chunk, summary = self._judge_chunk(trajectory, chunk)
            findings.extend(findings_chunk)
            if summary:
                summary_parts.append(summary)
        root = self._select_root(findings)
        report = DiagnosticReport(
            trace_id=trajectory.trace_id,
            task_id=trajectory.task_id,
            findings=findings,
            suggestions=self._collect_suggestions(findings),
            metadata={'analyzer': self.__class__.__name__, 'model': self.llm.model},
        )
        # Prefer the judge's own per-chunk summaries; fall back to a synthetic
        # sentence derived from the selected root cause (or a clean bill).
        report.summary = (
            ' '.join(s for s in summary_parts if s)
            or (
                f'Likely root cause: {root.failure_mode.name}'
                f' in {root.agent_name or "unknown agent"}'
                f' at step {root.step_index}.'
                if root
                else 'No failure was detected.'
            )
        )
        if root is not None:
            report.root_cause_event_id = root.event_id
            report.root_cause_agent = root.agent_name
            report.root_cause_step_index = root.step_index
        return report

    def _chunk_events(self, events: Sequence[AgentEvent]) -> List[List[AgentEvent]]:
        """Split *events* into consecutive chunks of ``max_events_per_call``."""
        if not events:
            return []
        return [
            list(events[i : i + self.max_events_per_call])
            for i in range(0, len(events), self.max_events_per_call)
        ]

    def _judge_chunk(
        self, trajectory: AgentTrajectory, chunk: List[AgentEvent]
    ) -> tuple[List[FailureFinding], str]:
        """Run one judge call over *chunk*; returns ``(findings, summary)``.

        Returns ``([], '')`` when the model's reply contains no parseable
        JSON. Findings whose ``failure_mode_id`` is not a known seed mode are
        dropped rather than guessed.
        """
        user = self._render_user_prompt(trajectory, chunk)
        messages = [
            {'role': 'system', 'content': _SYSTEM_PROMPT},
            {'role': 'user', 'content': user},
        ]
        result = self.llm.complete(messages=messages, max_tokens=self.max_tokens)
        parsed = extract_json_block(result.text)
        if not parsed:
            LOG.warning('LLM judge returned no JSON; raw=%r', result.text[:300])
            return [], ''
        raw_findings = parsed.get('findings') or []
        summary = str(parsed.get('summary') or '')
        findings: List[FailureFinding] = []
        for raw in raw_findings:
            if not isinstance(raw, dict):
                continue
            mode_id = str(raw.get('failure_mode_id') or '')
            failure_mode = SEED_FAILURE_MODES.get(mode_id)
            if failure_mode is None:
                # Hallucinated or misspelled code from the judge: skip it.
                LOG.debug('skipping unknown failure_mode_id from judge: %s', mode_id)
                continue
            findings.append(
                FailureFinding(
                    finding_id=new_id('finding'),
                    failure_mode=failure_mode,
                    event_id=self._coerce_str(raw.get('event_id')),
                    agent_name=self._coerce_str(raw.get('agent_name')),
                    step_index=self._coerce_int(raw.get('step_index')),
                    confidence=self._coerce_float(raw.get('confidence'), default=0.5),
                    evidence=self._coerce_str_list(raw.get('evidence')),
                    suggestion=self._suggestion(failure_mode),
                    metadata={'source': 'llm_judge'},
                )
            )
        return findings, summary

    def _render_user_prompt(
        self, trajectory: AgentTrajectory, chunk: List[AgentEvent]
    ) -> str:
        """Render goal, allowed failure modes, and the event chunk as text."""
        modes_doc = '\n'.join(
            f'- {mode_id}: {mode.description}'
            for mode_id, mode in SEED_FAILURE_MODES.items()
        )
        events_doc = '\n'.join(self._render_event(evt) for evt in chunk)
        return (
            f'GOAL: {trajectory.goal!r}\n'
            f'FRAMEWORK: {trajectory.framework!r}\n\n'
            f'ALLOWED FAILURE MODES:\n{modes_doc}\n\n'
            f'EVENTS:\n{events_doc}\n'
        )

    def _render_event(self, event: AgentEvent) -> str:
        """One-line rendering of *event*, truncating long payload fields."""

        def shorten(value: Any) -> str:
            # Clamp each field to max_evidence_chars so one verbose payload
            # cannot blow the prompt budget.
            text = '' if value is None else str(value)
            if len(text) > self.max_evidence_chars:
                text = text[: self.max_evidence_chars] + '…'
            return text

        return (
            f'event_id={event.event_id} '
            f'type={self._event_type_value(event.event_type)} '
            f'agent={event.agent_name} '
            f'module={event.module} step={event.step_index} '
            f'input={shorten(event.input)} '
            f'output={shorten(event.output)} '
            f'error={shorten(event.error)} '
            f'metadata={shorten(event.metadata)}'
        )

    def _select_root(
        self, findings: List[FailureFinding]
    ) -> Optional[FailureFinding]:
        """Pick the root-cause candidate: lowest step index, then highest
        confidence; findings without a step index sort after all indexed ones.
        """
        if not findings:
            return None
        return sorted(
            findings,
            key=lambda finding: (
                finding.step_index is None,
                finding.step_index if finding.step_index is not None else 10**9,
                -finding.confidence,
            ),
        )[0]

    def _collect_suggestions(self, findings: List[FailureFinding]) -> List[str]:
        """Unique, order-preserving list of the findings' suggestions."""
        seen = set()
        out: List[str] = []
        for f in findings:
            if f.suggestion and f.suggestion not in seen:
                seen.add(f.suggestion)
                out.append(f.suggestion)
        return out

    def _suggestion(self, failure_mode: FailureMode) -> Optional[str]:
        """First suggestion template of *failure_mode*, or ``None``."""
        if not failure_mode.suggestion_templates:
            return None
        return str(failure_mode.suggestion_templates[0])

    @staticmethod
    def _coerce_str(value: Any) -> Optional[str]:
        """``None`` passes through; anything else is stringified."""
        if value is None:
            return None
        return str(value)

    @staticmethod
    def _coerce_int(value: Any) -> Optional[int]:
        """Best-effort int conversion; ``None`` on failure or missing value."""
        if value is None:
            return None
        try:
            return int(value)
        except (TypeError, ValueError):
            return None

    @staticmethod
    def _coerce_float(value: Any, *, default: float) -> float:
        """Best-effort float conversion, falling back to *default*."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return default

    @staticmethod
    def _coerce_str_list(value: Any) -> List[str]:
        """Normalize a scalar-or-list JSON value into a list of strings."""
        if value is None:
            return []
        if isinstance(value, list):
            return [str(v) for v in value]
        return [str(value)]

    @staticmethod
    def _event_type_value(event_type: EventType) -> str:
        """Enum ``.value`` when available, else the stringified raw value."""
        value = getattr(event_type, 'value', event_type)
        if isinstance(value, str):
            return value
        return str(value)
256
+
257
+
258
# Explicit public API: only the analyzer class is exported.
__all__ = ['LLMJudgeAnalyzer']
agentdebug/llm.py ADDED
@@ -0,0 +1,165 @@
1
+ """Thin LLM client abstraction.
2
+
3
+ AgentDebugX needs an LLM for the judge analyzer and for the All-at-Once
4
+ attributor. We use an OpenAI-compatible chat-completions interface so users can
5
+ point us at OpenAI, Anthropic via LiteLLM, the Gemini endpoint they hand us, or
6
+ a local vLLM/Ollama deployment.
7
+
8
+ The implementation deliberately avoids depending on the ``openai`` Python SDK:
9
+ a single ``httpx`` POST keeps the install lightweight and lets users target any
10
+ ``/v1/chat/completions``-compatible URL.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import json
16
+ import logging
17
+ import os
18
+ from dataclasses import dataclass
19
+ from typing import Any, Dict, List, Optional, Protocol
20
+
21
+ import httpx
22
+
23
+ LOG = logging.getLogger('agentdebug.llm')
24
+
25
+
26
@dataclass
class CompletionResult:
    """Return value of :meth:`LLMClient.complete`."""

    # Assistant text of the first choice ('' when the model emitted none).
    text: str
    # Full decoded JSON response body (choices, usage, finish_reason, ...).
    raw: Dict[str, Any]
30
+
31
+
32
class LLMClient(Protocol):
    """Structural interface for chat-completion backends.

    Anything with a ``model`` attribute and a ``complete`` method of this
    shape works — the judge and attributor depend on nothing else.
    """

    # Model identifier; surfaced in report metadata by callers.
    model: str

    def complete(
        self,
        messages: List[Dict[str, Any]],
        *,
        response_format: Optional[Dict[str, Any]] = None,
        temperature: float = 0.0,
        # ``None`` means "use the client's own default". Previously these were
        # declared as concrete ``int = 2048`` / ``float = 60.0``, which did not
        # match ``OpenAICompatClient.complete`` (Optional[...] = None), so the
        # shipped client failed to structurally satisfy its own Protocol.
        max_tokens: Optional[int] = None,
        timeout: Optional[float] = None,
    ) -> CompletionResult:
        """Run one chat completion over *messages* and return the result."""
        ...
45
+
46
+
47
class OpenAICompatClient:
    """OpenAI-compatible chat completions client.

    Works against:

    * OpenAI (``base_url='https://api.openai.com/v1'``)
    * LiteLLM proxy (any ``/v1`` URL)
    * The Gemini gateway used in this repo:
      ``https://compliant-wagner-simulations-coaches.trycloudflare.com/v1``
      with ``model='gemini-3-flash'``
    * vLLM / Ollama with their OpenAI-compat servers
    """

    def __init__(
        self,
        *,
        base_url: str,
        api_key: str,
        model: str,
        default_max_tokens: int = 2048,
        timeout: float = 60.0,
    ) -> None:
        # Normalize away a trailing slash so URL joins stay clean.
        self.base_url = base_url.rstrip('/')
        self.api_key = api_key
        self.model = model
        self.default_max_tokens = default_max_tokens
        self.timeout = timeout

    @classmethod
    def from_env(
        cls,
        *,
        env_prefix: str = 'AGENTDEBUG_LLM',
        model: Optional[str] = None,
    ) -> 'OpenAICompatClient':
        """Construct from environment variables.

        Reads ``<PREFIX>_BASE_URL``, ``<PREFIX>_API_KEY``, ``<PREFIX>_MODEL``.
        BASE_URL and API_KEY are required (KeyError when absent); MODEL falls
        back to ``'gpt-4o-mini'`` unless *model* is given explicitly.
        """
        env = os.environ
        chosen_model = model
        if chosen_model is None:
            chosen_model = env.get(f'{env_prefix}_MODEL', 'gpt-4o-mini')
        return cls(
            base_url=env[f'{env_prefix}_BASE_URL'],
            api_key=env[f'{env_prefix}_API_KEY'],
            model=chosen_model,
        )

    def complete(
        self,
        messages: List[Dict[str, Any]],
        *,
        response_format: Optional[Dict[str, Any]] = None,
        temperature: float = 0.0,
        max_tokens: Optional[int] = None,
        timeout: Optional[float] = None,
    ) -> CompletionResult:
        """POST one chat completion and return the first choice's text."""
        payload: Dict[str, Any] = {
            'model': self.model,
            'messages': messages,
            'temperature': temperature,
            'max_tokens': max_tokens or self.default_max_tokens,
        }
        if response_format is not None:
            payload['response_format'] = response_format
        endpoint = f'{self.base_url}/chat/completions'
        response = httpx.post(
            endpoint,
            headers={
                'Authorization': f'Bearer {self.api_key}',
                'Content-Type': 'application/json',
            },
            json=payload,
            timeout=timeout or self.timeout,
        )
        response.raise_for_status()
        data: Dict[str, Any] = response.json()
        first_choice = (data.get('choices') or [{}])[0]
        message = first_choice.get('message') or {}
        text = message.get('content') or ''
        if not text:
            # Surface enough context (finish_reason, usage) to debug empty
            # completions without dumping the whole response.
            LOG.warning(
                'empty content from %s (model=%s, finish_reason=%s, usage=%s)',
                endpoint,
                self.model,
                first_choice.get('finish_reason'),
                data.get('usage'),
            )
        return CompletionResult(text=text, raw=data)
133
+
134
+
135
def extract_json_block(text: str) -> Optional[Dict[str, Any]]:
    """Extract the first top-level JSON object from a possibly-fenced response.

    Handles three progressively messier shapes:

    1. the whole (fence-stripped) text is one JSON object;
    2. a JSON object embedded in surrounding prose or followed by more JSON;
    3. nothing parseable — returns ``None``.

    Returns the parsed ``dict``, or ``None`` when no JSON object is found.
    """
    if not text:
        return None
    # Strip a leading ``` / ```json fence and the matching trailing fence.
    cleaned = text.strip()
    if cleaned.startswith('```'):
        # remove ``` or ```json prefix and trailing ```
        cleaned = cleaned.split('\n', 1)[1] if '\n' in cleaned else cleaned[3:]
        if cleaned.endswith('```'):
            cleaned = cleaned[: -len('```')]
        cleaned = cleaned.strip()
    # Fast path: the whole payload is valid JSON.
    try:
        parsed = json.loads(cleaned)
        if isinstance(parsed, dict):
            return parsed
    except json.JSONDecodeError:
        pass
    # Fallback: let the decoder consume the first complete object starting at
    # each '{'. Unlike the old first-'{'-to-last-'}' greedy slice, this still
    # succeeds when prose containing '}' or a second JSON object trails the
    # object we want (the slice made json.loads fail and returned None).
    decoder = json.JSONDecoder()
    pos = cleaned.find('{')
    while pos != -1:
        try:
            candidate, _ = decoder.raw_decode(cleaned, pos)
        except json.JSONDecodeError:
            pos = cleaned.find('{', pos + 1)
            continue
        # A parse starting at '{' always yields an object, but keep the
        # guard for safety.
        if isinstance(candidate, dict):
            return candidate
        return None
    return None