debugerai 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
debugai/tracing.py ADDED
@@ -0,0 +1,283 @@
1
+ """Native observability — traces, spans, sessions, scores (Langfuse-style).
2
+
3
+ A `Trace` is one request through an LLM app; it holds nested `Span`s
4
+ (retrieval, generation, …), `Score`s (DebugAI's diagnosis + evals), and rolled-up
5
+ latency / token / cost. Traces can be grouped into a session (a conversation).
6
+
7
+ tracer = Tracer(sink=store.add_trace)
8
+ with tracer.trace("support.answer", session_id="s1") as t:
9
+ with t.span("retrieval", kind="retrieval") as s:
10
+ s.output = chunks
11
+ with t.span("generation", kind="generation", model="claude-haiku-4-5") as s:
12
+ s.output = answer
13
+ s.set_usage(prompt=120, completion=30)
14
+ t.add_score("confidence", 0.95)
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import contextlib
20
+ import time
21
+ import uuid
22
+ from dataclasses import asdict, dataclass, field
23
+ from typing import Any, Callable
24
+
25
# USD per 1M tokens (input, output). Keys are model-name prefixes; the LONGEST
# matching prefix wins, so "gpt-4.1-mini" is priced by its own entry rather than
# by the shorter "gpt-4.1". Unknown models → 0 cost.
MODEL_PRICES: dict[str, tuple[float, float]] = {
    # ── Anthropic ─────────────────────────────────────────────────
    "claude-opus-4-8": (15.0, 75.0),
    "claude-sonnet-4-6": (3.0, 15.0),
    "claude-haiku-4-5": (0.80, 4.0),
    "claude-3-5-sonnet": (3.0, 15.0),
    "claude-3-5-haiku": (0.80, 4.0),
    "claude-3-opus": (15.0, 75.0),
    "claude-3-sonnet": (3.0, 15.0),
    "claude-3-haiku": (0.25, 1.25),
    # ── OpenAI ────────────────────────────────────────────────────
    "gpt-4o-mini": (0.15, 0.60),
    "gpt-4o": (2.50, 10.0),
    "gpt-4.1": (2.0, 8.0),
    "gpt-4.1-mini": (0.40, 1.60),
    "gpt-4-turbo": (10.0, 30.0),
    "gpt-4": (30.0, 60.0),
    "gpt-3.5-turbo": (0.50, 1.50),
    "o1": (15.0, 60.0),
    "o1-mini": (3.0, 12.0),
    "o3": (10.0, 40.0),
    "o3-mini": (1.10, 4.40),
    "o4-mini": (1.10, 4.40),
    # ── Google Gemini ──────────────────────────────────────────────
    "gemini-2.0-flash": (0.10, 0.40),
    "gemini-2.0-pro": (1.25, 5.0),
    "gemini-1.5-flash": (0.075, 0.30),
    "gemini-1.5-pro": (1.25, 5.0),
    "gemini-1.0-pro": (0.50, 1.50),
    # ── Mistral AI ────────────────────────────────────────────────
    "mistral-large": (2.0, 6.0),
    "mistral-small": (0.20, 0.60),
    "mistral-nemo": (0.15, 0.15),
    "codestral": (0.20, 0.60),
    # ── Groq (fast inference — pricing per 1M tokens) ─────────────
    "groq/llama-3.3-70b": (0.59, 0.79),
    "groq/llama-3.1-8b": (0.05, 0.08),
    "groq/mixtral-8x7b": (0.24, 0.24),
    "groq/gemma2-9b": (0.20, 0.20),
    # ── Together AI ───────────────────────────────────────────────
    "together/llama-3.3-70b": (0.88, 0.88),
    "together/qwen2.5-72b": (1.20, 1.20),
    # ── Cohere ────────────────────────────────────────────────────
    "command-r-plus": (2.50, 10.0),
    "command-r": (0.15, 0.60),
    "command-": (1.0, 2.0),  # prefix for older Command models
    # ── Local / Ollama — no API cost ──────────────────────────────
    # All ollama/* and local model prefixes return 0 cost.
    "ollama/": (0.0, 0.0),
    "qwen": (0.0, 0.0),
    "llama": (0.0, 0.0),
    "phi": (0.0, 0.0),
    "deepseek": (0.0, 0.0),
    "codellama": (0.0, 0.0),
    "gemma": (0.0, 0.0),
    "mixtral": (0.0, 0.0),  # local via Ollama (hosted Mixtral on Groq has price above)
    "vicuna": (0.0, 0.0),
}


def estimate_cost(model: str | None, prompt_tokens: int, completion_tokens: int,
                  extra_prices: dict | None = None) -> float:
    """Estimate the USD cost of one model call.

    Prices are looked up by model-name prefix. Among all prefixes that match
    ``model`` the longest (most specific) one is used, so the result does not
    depend on the insertion order of the price table.

    Args:
        model: Model identifier; ``None`` or ``""`` yields ``0.0``.
        prompt_tokens: Number of input tokens.
        completion_tokens: Number of output tokens.
        extra_prices: Optional ``{prefix: (input_price, output_price)}`` overrides
            (from DebugAIConfig.model_prices), in USD per 1M tokens. An entry
            whose key equals a built-in key replaces the built-in price; a new,
            more specific key beats any shorter built-in prefix.

    Returns:
        The estimated cost rounded to 6 decimal places, or ``0.0`` when no
        prefix matches.
    """
    if not model:
        return 0.0

    # Merge so that user entries replace built-in entries with the same key.
    table = {**MODEL_PRICES, **extra_prices} if extra_prices else MODEL_PRICES

    # Most specific prefix wins; first-match-in-dict-order would let "o1"
    # shadow "o1-mini" and let built-ins shadow longer user-supplied prefixes.
    best_prefix = max(
        (prefix for prefix in table if model.startswith(prefix)),
        key=len,
        default=None,
    )
    if best_prefix is None:
        return 0.0

    price = table[best_prefix]
    return round((prompt_tokens * price[0] + completion_tokens * price[1]) / 1_000_000, 6)
104
+
105
+
106
+ def _now_ms() -> float:
107
+ return time.time() * 1000.0
108
+
109
+
110
@dataclass
class Score:
    """A named evaluation result that can be attached to a trace.

    ``data_type`` labels how ``value`` should be read and ``source`` records
    who produced the score (defaults to ``"debugai"``).
    """

    name: str
    value: float | str | bool
    data_type: str = "numeric"  # numeric | categorical | boolean
    comment: str = ""
    source: str = "debugai"

    def to_dict(self) -> dict:
        """Return every field of this score as a plain ``dict``."""
        payload = asdict(self)
        return payload
120
+
121
+
122
@dataclass
class Span:
    """One timed step of a trace (retrieval, generation, tool call, …).

    ``start_ms`` is stamped at construction; ``end_ms`` stays ``None`` until
    :meth:`end` is called. Token counts default to zero and are set with
    :meth:`set_usage`.
    """

    name: str
    kind: str = "span"  # retrieval | generation | tool | span
    start_ms: float = field(default_factory=_now_ms)
    end_ms: float | None = None
    input: Any = None
    output: Any = None
    model: str | None = None
    prompt_tokens: int = 0
    completion_tokens: int = 0
    metadata: dict = field(default_factory=dict)
    # Hidden from repr; not read by any method of this class.
    _t0: float = field(default=0.0, repr=False)

    def set_usage(self, prompt: int = 0, completion: int = 0) -> None:
        """Overwrite the prompt and completion token counts."""
        self.prompt_tokens = prompt
        self.completion_tokens = completion

    def end(self) -> None:
        """Stamp ``end_ms`` with the current time; later calls are no-ops."""
        if self.end_ms is not None:
            return
        self.end_ms = _now_ms()

    @property
    def duration_ms(self) -> float:
        """Elapsed milliseconds rounded to 2 places, or ``0.0`` while still open."""
        finished_at = self.end_ms
        return 0.0 if finished_at is None else round(finished_at - self.start_ms, 2)

    def to_dict(self) -> dict:
        """Serialize the span; ``input`` and ``output`` are passed through ``_trim``."""
        payload = {
            "name": self.name,
            "kind": self.kind,
            "start_ms": self.start_ms,
            "end_ms": self.end_ms,
            "duration_ms": self.duration_ms,
            "input": _trim(self.input),
            "output": _trim(self.output),
            "model": self.model,
            "prompt_tokens": self.prompt_tokens,
            "completion_tokens": self.completion_tokens,
            "metadata": self.metadata,
        }
        return payload
160
+
161
+
162
@dataclass
class Trace:
    """One request through an LLM app: its spans, scores and rolled-up totals.

    Token counts, cost and duration are derived on demand from ``spans``;
    nothing is cached, so they reflect whatever spans are present when read.
    """

    name: str
    id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])
    session_id: str | None = None
    start_ms: float = field(default_factory=_now_ms)
    end_ms: float | None = None
    spans: list[Span] = field(default_factory=list)
    scores: list[Score] = field(default_factory=list)
    metadata: dict = field(default_factory=dict)
    status: str = "ok"  # ok | failing | error
    model: str | None = None
    diagnosis: dict | None = None
    timestamp: str | None = None

    # --- building ---
    @contextlib.contextmanager
    def span(self, name: str, kind: str = "span", model: str | None = None):
        """Open a span, yield it, and record it on exit.

        The span is ended and appended to ``spans`` even when the ``with``
        body raises; the exception itself still propagates.
        """
        current = Span(name=name, kind=kind, model=model)
        try:
            yield current
        finally:
            current.end()
            self.spans.append(current)

    def add_span(self, span: Span) -> None:
        """Append an externally built span, ending it first if it is still open."""
        still_open = span.end_ms is None
        if still_open:
            span.end()
        self.spans.append(span)

    def add_score(self, name: str, value, data_type: str = "numeric", comment: str = "") -> None:
        """Build a ``Score`` from the arguments and append it to ``scores``."""
        score = Score(name=name, value=value, data_type=data_type, comment=comment)
        self.scores.append(score)

    def end(self) -> None:
        """Stamp ``end_ms`` with the current time; later calls are no-ops."""
        if self.end_ms is not None:
            return
        self.end_ms = _now_ms()

    # --- rollups ---
    @property
    def duration_ms(self) -> float:
        """Elapsed milliseconds, rounded to 2 places.

        While the trace is still open, the latest span end (or span start, for
        a span without an end) stands in for the trace end; with no spans the
        result is ``0.0``.
        """
        if self.end_ms is not None:
            return round(self.end_ms - self.start_ms, 2)
        span_marks = [s.end_ms or s.start_ms for s in self.spans]
        latest = max(span_marks) if span_marks else self.start_ms
        return round(latest - self.start_ms, 2)

    @property
    def prompt_tokens(self) -> int:
        """Total prompt tokens across all spans."""
        return sum([s.prompt_tokens for s in self.spans])

    @property
    def completion_tokens(self) -> int:
        """Total completion tokens across all spans."""
        return sum([s.completion_tokens for s in self.spans])

    @property
    def total_tokens(self) -> int:
        """Prompt plus completion tokens."""
        return self.prompt_tokens + self.completion_tokens

    @property
    def cost_usd(self) -> float:
        """Summed per-span cost estimate, rounded to 6 places.

        A span without its own ``model`` is priced with the trace-level model.
        """
        running = 0.0
        for item in self.spans:
            priced_model = item.model or self.model
            running += estimate_cost(priced_model, item.prompt_tokens, item.completion_tokens)
        return round(running, 6)

    def to_dict(self) -> dict:
        """Serialize the trace, its rollups, spans and scores to a plain ``dict``."""
        payload = {
            "id": self.id,
            "name": self.name,
            "session_id": self.session_id,
            "timestamp": self.timestamp,
            "status": self.status,
            "model": self.model,
            "start_ms": self.start_ms,
            "end_ms": self.end_ms,
            "duration_ms": self.duration_ms,
            "prompt_tokens": self.prompt_tokens,
            "completion_tokens": self.completion_tokens,
            "total_tokens": self.total_tokens,
            "cost_usd": self.cost_usd,
            "spans": [s.to_dict() for s in self.spans],
            "scores": [s.to_dict() for s in self.scores],
            "metadata": self.metadata,
            "diagnosis": self.diagnosis,
        }
        return payload
237
+
238
+
239
def scores_from_diagnosis(diagnosis: dict) -> list[Score]:
    """Translate a diagnosis dict into Langfuse-style scores.

    Returns an empty list for an empty/``None`` diagnosis. Otherwise the list
    always starts with a boolean ``healthy`` score; when the diagnosis is not
    healthy and has a non-empty ``primary`` entry, a categorical ``failure``
    score and a numeric ``confidence`` score (commented with the severity)
    follow. The scores are only returned, not attached to any trace.
    """
    if not diagnosis:
        return []

    is_healthy = bool(diagnosis.get("healthy"))
    result = [Score(name="healthy", value=is_healthy, data_type="boolean")]

    primary = diagnosis.get("primary") or {}
    if is_healthy or not primary:
        return result

    failure_score = Score(
        name="failure",
        value=primary.get("failure", "unknown"),
        data_type="categorical",
    )
    confidence_score = Score(
        name="confidence",
        value=primary.get("confidence", 0.0),
        data_type="numeric",
        comment=primary.get("severity", ""),
    )
    result.append(failure_score)
    result.append(confidence_score)
    return result
252
+
253
+
254
def status_from_diagnosis(diagnosis: dict) -> str:
    """Map a diagnosis to a trace status.

    Returns ``"failing"`` only for a non-empty diagnosis whose ``healthy``
    entry is missing or falsy; everything else (including an empty or
    ``None`` diagnosis) is ``"ok"``.
    """
    if diagnosis and not diagnosis.get("healthy"):
        return "failing"
    return "ok"
258
+
259
+
260
+ def _trim(value: Any, limit: int = 600) -> Any:
261
+ if isinstance(value, str) and len(value) > limit:
262
+ return value[:limit] + "…"
263
+ if isinstance(value, list):
264
+ return [_trim(v, limit) for v in value]
265
+ return value
266
+
267
+
268
class Tracer:
    """Creates traces and hands finished ones to a sink callback."""

    def __init__(self, sink: Callable[[Trace], None] | None = None):
        """Store the optional ``sink`` that receives each finished trace."""
        self.sink = sink

    @contextlib.contextmanager
    def trace(self, name: str, session_id: str | None = None, model: str | None = None,
              metadata: dict | None = None):
        """Open a trace, yield it, then end it and pass it to the sink.

        Args:
            name: Trace name.
            session_id: Optional session (conversation) the trace belongs to.
            model: Optional trace-level model name.
            metadata: Optional metadata dict; an empty dict is used when omitted.

        Yields:
            The new ``Trace``.

        If the ``with`` body raises, the trace's ``status`` is set to
        ``"error"`` before it is ended and delivered, and the exception is
        re-raised unchanged. Previously such a trace reached the sink with
        status ``"ok"``.
        """
        t = Trace(name=name, session_id=session_id, model=model, metadata=metadata or {})
        try:
            yield t
        except Exception:
            # Record the failure so the sink does not store a crashed request as healthy.
            t.status = "error"
            raise
        finally:
            t.end()
            if self.sink is not None:
                self.sink(t)