traccia 0.1.2-py3-none-any.whl → 0.1.6-py3-none-any.whl
This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between these versions as they appear in the public registry.
- traccia/__init__.py +73 -0
- traccia/auto.py +748 -0
- traccia/auto_instrumentation.py +74 -0
- traccia/cli.py +349 -0
- traccia/config.py +699 -0
- traccia/context/__init__.py +33 -0
- traccia/context/context.py +67 -0
- traccia/context/propagators.py +283 -0
- traccia/errors.py +48 -0
- traccia/exporter/__init__.py +8 -0
- traccia/exporter/console_exporter.py +31 -0
- traccia/exporter/file_exporter.py +178 -0
- traccia/exporter/http_exporter.py +214 -0
- traccia/exporter/otlp_exporter.py +190 -0
- traccia/instrumentation/__init__.py +26 -0
- traccia/instrumentation/anthropic.py +92 -0
- traccia/instrumentation/decorator.py +263 -0
- traccia/instrumentation/fastapi.py +38 -0
- traccia/instrumentation/http_client.py +21 -0
- traccia/instrumentation/http_server.py +25 -0
- traccia/instrumentation/openai.py +358 -0
- traccia/instrumentation/requests.py +68 -0
- traccia/integrations/__init__.py +39 -0
- traccia/integrations/langchain/__init__.py +14 -0
- traccia/integrations/langchain/callback.py +418 -0
- traccia/integrations/langchain/utils.py +129 -0
- traccia/integrations/openai_agents/__init__.py +73 -0
- traccia/integrations/openai_agents/processor.py +262 -0
- traccia/pricing_config.py +58 -0
- traccia/processors/__init__.py +35 -0
- traccia/processors/agent_enricher.py +159 -0
- traccia/processors/batch_processor.py +140 -0
- traccia/processors/cost_engine.py +71 -0
- traccia/processors/cost_processor.py +70 -0
- traccia/processors/drop_policy.py +44 -0
- traccia/processors/logging_processor.py +31 -0
- traccia/processors/rate_limiter.py +223 -0
- traccia/processors/sampler.py +22 -0
- traccia/processors/token_counter.py +216 -0
- traccia/runtime_config.py +127 -0
- traccia/tracer/__init__.py +15 -0
- traccia/tracer/otel_adapter.py +577 -0
- traccia/tracer/otel_utils.py +24 -0
- traccia/tracer/provider.py +155 -0
- traccia/tracer/span.py +286 -0
- traccia/tracer/span_context.py +16 -0
- traccia/tracer/tracer.py +243 -0
- traccia/utils/__init__.py +19 -0
- traccia/utils/helpers.py +95 -0
- {traccia-0.1.2.dist-info → traccia-0.1.6.dist-info}/METADATA +72 -15
- traccia-0.1.6.dist-info/RECORD +55 -0
- traccia-0.1.6.dist-info/top_level.txt +1 -0
- traccia-0.1.2.dist-info/RECORD +0 -6
- traccia-0.1.2.dist-info/top_level.txt +0 -1
- {traccia-0.1.2.dist-info → traccia-0.1.6.dist-info}/WHEEL +0 -0
- {traccia-0.1.2.dist-info → traccia-0.1.6.dist-info}/entry_points.txt +0 -0
- {traccia-0.1.2.dist-info → traccia-0.1.6.dist-info}/licenses/LICENSE +0 -0
traccia/processors/cost_engine.py
@@ -0,0 +1,71 @@
+"""Cost calculation based on model pricing and token usage."""
+
+from __future__ import annotations
+
+from typing import Dict, Optional, Tuple
+
+DEFAULT_PRICING: Dict[str, Dict[str, float]] = {
+    # prices per 1k tokens
+    "gpt-4": {"prompt": 0.03, "completion": 0.06},
+    "gpt-4o": {"prompt": 0.005, "completion": 0.015},
+    "gpt-3.5-turbo": {"prompt": 0.0015, "completion": 0.002},
+    "claude-3-opus": {"prompt": 0.015, "completion": 0.075},
+    "claude-3-sonnet": {"prompt": 0.003, "completion": 0.015},
+    # Add other models via pricing overrides:
+    # - env: AGENT_DASHBOARD_PRICING_JSON='{"model": {"prompt": x, "completion": y}}'
+    # - start_tracing(pricing_override={...})
+}
+
+def _lookup_price(model: str, table: Dict[str, Dict[str, float]]) -> Optional[Tuple[str, Dict[str, float]]]:
+    """
+    Return (matched_key, price_dict) for a given model name.
+
+    Supports exact matches and prefix matches for version-suffixed model names:
+    e.g. "claude-3-opus-20240229" -> "claude-3-opus"
+         "gpt-4o-2024-08-06" -> "gpt-4o"
+    """
+    if not model:
+        return None
+    m = str(model).strip()
+    if not m:
+        return None
+    # exact (case sensitive + lower)
+    if m in table:
+        return m, table[m]
+    ml = m.lower()
+    if ml in table:
+        return ml, table[ml]
+    # prefix match (longest key wins)
+    for key in sorted(table.keys(), key=len, reverse=True):
+        if ml.startswith(key.lower()):
+            return key, table[key]
+    return None
+
+
+def match_pricing_model_key(
+    model: str, pricing_table: Optional[Dict[str, Dict[str, float]]] = None
+) -> Optional[str]:
+    """Return the pricing table key that would be used for `model`, if any."""
+    table = pricing_table or DEFAULT_PRICING
+    matched = _lookup_price(model, table)
+    if not matched:
+        return None
+    key, _ = matched
+    return key
+
+
+def compute_cost(
+    model: str,
+    prompt_tokens: int,
+    completion_tokens: int,
+    pricing_table: Optional[Dict[str, Dict[str, float]]] = None,
+) -> Optional[float]:
+    table = pricing_table or DEFAULT_PRICING
+    matched = _lookup_price(model, table)
+    if not matched:
+        return None
+    _, price = matched
+    prompt_cost = (prompt_tokens / 1000.0) * price.get("prompt", 0.0)
+    completion_cost = (completion_tokens / 1000.0) * price.get("completion", 0.0)
+    return round(prompt_cost + completion_cost, 6)
+
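
As a quick orientation to the new pricing helpers, here is a minimal usage sketch (editorial example, not part of the package; the expected values follow directly from DEFAULT_PRICING above):

# Sketch: exercising compute_cost / match_pricing_model_key from cost_engine.py.
from traccia.processors.cost_engine import compute_cost, match_pricing_model_key

# Version-suffixed names fall back to the longest matching prefix key.
print(match_pricing_model_key("gpt-4o-2024-08-06"))       # "gpt-4o"
print(match_pricing_model_key("claude-3-opus-20240229"))  # "claude-3-opus"

# Prices are per 1k tokens: 1.5 * 0.005 + 0.5 * 0.015 = 0.015
print(compute_cost("gpt-4o-2024-08-06", prompt_tokens=1500, completion_tokens=500))  # 0.015

# Unknown models yield None rather than a guessed cost.
print(compute_cost("my-local-model", 100, 100))  # None
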
traccia/processors/cost_processor.py
@@ -0,0 +1,70 @@
+"""Span processor that annotates spans with cost based on token usage and pricing."""
+
+from __future__ import annotations
+
+from typing import Dict, Optional
+
+from traccia.processors.cost_engine import compute_cost, match_pricing_model_key
+from traccia.tracer.provider import SpanProcessor
+
+
+class CostAnnotatingProcessor(SpanProcessor):
+    """
+    Adds `llm.cost.usd` to spans when token usage and model info are available.
+
+    Expects spans to carry:
+    - llm.model (model name)
+    - llm.usage.prompt_tokens
+    - llm.usage.completion_tokens
+    """
+
+    def __init__(
+        self,
+        pricing_table: Optional[Dict[str, Dict[str, float]]] = None,
+        *,
+        pricing_source: str = "default",
+    ) -> None:
+        self.pricing_table = pricing_table or {}
+        self.pricing_source = pricing_source
+
+    def on_end(self, span) -> None:
+        if "llm.cost.usd" in (span.attributes or {}):
+            return
+        model = span.attributes.get("llm.model")
+        prompt = span.attributes.get("llm.usage.prompt_tokens")
+        completion = span.attributes.get("llm.usage.completion_tokens")
+        # Anthropic-style names (also supported)
+        if prompt is None:
+            prompt = span.attributes.get("llm.usage.input_tokens")
+        if completion is None:
+            completion = span.attributes.get("llm.usage.output_tokens")
+        if not model or prompt is None or completion is None:
+            return
+        cost = compute_cost(
+            model,
+            int(prompt),
+            int(completion),
+            pricing_table=self.pricing_table,
+        )
+        if cost is not None:
+            span.set_attribute("llm.cost.usd", cost)
+            # Provenance for downstream analysis.
+            span.set_attribute("llm.cost.source", span.attributes.get("llm.usage.source", "unknown"))
+            span.set_attribute("llm.pricing.source", self.pricing_source)
+            key = match_pricing_model_key(model, self.pricing_table)
+            if key:
+                span.set_attribute("llm.pricing.model_key", key)
+
+    def shutdown(self) -> None:
+        return None
+
+    def force_flush(self, timeout: Optional[float] = None) -> None:
+        return None
+
+    def update_pricing_table(
+        self, pricing_table: Dict[str, Dict[str, float]], pricing_source: Optional[str] = None
+    ) -> None:
+        self.pricing_table = pricing_table
+        if pricing_source:
+            self.pricing_source = pricing_source
+
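
A minimal sketch of what the new processor writes onto a span (editorial example). The _StubSpan class below is a hypothetical stand-in exposing only the two members on_end touches (attributes and set_attribute); real spans come from traccia.tracer.span:

# Sketch: CostAnnotatingProcessor annotating a stub span with cost attributes.
from traccia.processors.cost_processor import CostAnnotatingProcessor

class _StubSpan:
    def __init__(self, attributes):
        self.attributes = attributes

    def set_attribute(self, key, value):
        self.attributes[key] = value

span = _StubSpan({
    "llm.model": "gpt-4o",
    "llm.usage.prompt_tokens": 1200,
    "llm.usage.completion_tokens": 300,
    "llm.usage.source": "provider_usage",
})
CostAnnotatingProcessor(pricing_source="default").on_end(span)
print(span.attributes["llm.cost.usd"])            # 0.0105  (1.2 * 0.005 + 0.3 * 0.015)
print(span.attributes["llm.cost.source"])         # "provider_usage"
print(span.attributes["llm.pricing.model_key"])   # "gpt-4o"
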
traccia/processors/drop_policy.py
@@ -0,0 +1,44 @@
+"""Queue overflow handling strategies for span buffering."""
+
+from collections import deque
+from typing import Deque
+
+from traccia.tracer.span import Span
+
+
+class DropPolicy:
+    """Base policy deciding how to handle span queue overflow."""
+
+    def handle(self, queue: Deque[Span], span: Span, max_size: int) -> bool:
+        """
+        Apply the drop policy.
+
+        Returns True if the span was enqueued, False if it was dropped.
+        """
+        raise NotImplementedError
+
+
+class DropOldestPolicy(DropPolicy):
+    """Drop the oldest span to make room for a new one."""
+
+    def handle(self, queue: Deque[Span], span: Span, max_size: int) -> bool:
+        if len(queue) >= max_size and queue:
+            queue.popleft()
+        if len(queue) < max_size:
+            queue.append(span)
+            return True
+        return False
+
+
+class DropNewestPolicy(DropPolicy):
+    """Drop the incoming span if the queue is full."""
+
+    def handle(self, queue: Deque[Span], span: Span, max_size: int) -> bool:
+        if len(queue) < max_size:
+            queue.append(span)
+            return True
+        return False
+
+
+DEFAULT_DROP_POLICY = DropOldestPolicy()
+
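
A short sketch of the two overflow strategies against a bounded deque (editorial example; handle() only needs append/popleft, so plain ints stand in for Span objects):

# Sketch: DropOldestPolicy evicts from the front, DropNewestPolicy rejects the newcomer.
from collections import deque
from traccia.processors.drop_policy import DropOldestPolicy, DropNewestPolicy

q = deque([1, 2, 3])
DropOldestPolicy().handle(q, 4, max_size=3)   # evicts 1, enqueues 4
print(list(q))                                # [2, 3, 4]

q = deque([1, 2, 3])
accepted = DropNewestPolicy().handle(q, 4, max_size=3)
print(accepted, list(q))                      # False [1, 2, 3] -- the new span is dropped
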
traccia/processors/logging_processor.py
@@ -0,0 +1,31 @@
+"""Span processor that logs spans when they end."""
+
+from __future__ import annotations
+
+import logging
+from typing import Optional
+
+from traccia.tracer.provider import SpanProcessor
+
+
+class LoggingSpanProcessor(SpanProcessor):
+    """Logs span summary on end using the standard logging module."""
+
+    def __init__(self, logger: Optional[logging.Logger] = None) -> None:
+        self.logger = logger or logging.getLogger("traccia.traces")
+
+    def on_end(self, span) -> None:
+        attrs = span.attributes or {}
+        msg = (
+            f"[trace] name={span.name} trace_id={span.context.trace_id} "
+            f"span_id={span.context.span_id} status={span.status.name} "
+            f"duration_ns={span.duration_ns} attrs={attrs}"
+        )
+        self.logger.info(msg)
+
+    def shutdown(self) -> None:
+        return None
+
+    def force_flush(self, timeout: Optional[float] = None) -> None:
+        return None
+
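
A small sketch of the log line this processor produces, using SimpleNamespace stand-ins for the span fields it reads (editorial example, not part of the package; real spans come from traccia.tracer.span):

# Sketch: LoggingSpanProcessor formatting a span summary via the stdlib logger.
import logging
from types import SimpleNamespace
from traccia.processors.logging_processor import LoggingSpanProcessor

logging.basicConfig(level=logging.INFO)
span = SimpleNamespace(
    name="llm.call",
    context=SimpleNamespace(trace_id="abc123", span_id="def456"),
    status=SimpleNamespace(name="OK"),
    duration_ns=1_200_000,
    attributes={"llm.model": "gpt-4o"},
)
LoggingSpanProcessor().on_end(span)
# INFO:traccia.traces:[trace] name=llm.call trace_id=abc123 span_id=def456 status=OK duration_ns=1200000 attrs={'llm.model': 'gpt-4o'}
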
traccia/processors/rate_limiter.py
@@ -0,0 +1,223 @@
+"""Rate limiting processor for span export."""
+
+from __future__ import annotations
+
+import logging
+import threading
+import time
+from collections import deque
+from typing import Optional
+
+from opentelemetry.sdk.trace import ReadableSpan
+
+from traccia.errors import RateLimitError
+
+logger = logging.getLogger(__name__)
+
+
+class RateLimiter:
+    """
+    Token bucket rate limiter with hybrid blocking/dropping behavior.
+
+    Features:
+    - Token bucket algorithm for smooth rate limiting
+    - Short blocking period before dropping spans
+    - Detailed logging of dropped spans
+    - Thread-safe implementation
+    """
+
+    def __init__(
+        self,
+        max_spans_per_second: Optional[float] = None,
+        max_block_ms: int = 100,
+    ):
+        """
+        Initialize rate limiter.
+
+        Args:
+            max_spans_per_second: Maximum spans per second (None = unlimited)
+            max_block_ms: Maximum milliseconds to block before dropping
+        """
+        self.max_spans_per_second = max_spans_per_second
+        self.max_block_ms = max_block_ms
+        self.enabled = max_spans_per_second is not None and max_spans_per_second > 0
+
+        # Token bucket state
+        self._tokens: float = max_spans_per_second or 0
+        self._max_tokens: float = max_spans_per_second or 0
+        self._last_refill_time: float = time.time()
+        self._lock = threading.Lock()
+
+        # Stats
+        self._total_spans = 0
+        self._dropped_spans = 0
+        self._blocked_spans = 0
+
+        # Recent timestamps for sliding window (backup)
+        self._recent_timestamps: deque = deque()
+        self._window_seconds = 1.0
+
+    def acquire(self, span: Optional[ReadableSpan] = None) -> bool:
+        """
+        Try to acquire permission to process a span.
+
+        Returns True if span should be processed, False if it should be dropped.
+
+        Behavior:
+        1. If unlimited (disabled), always return True
+        2. Try to acquire a token immediately
+        3. If no token, block for up to max_block_ms
+        4. If still no token after blocking, drop and return False
+
+        Args:
+            span: Optional span for logging purposes
+
+        Returns:
+            True if span should be processed, False if dropped
+        """
+        if not self.enabled:
+            return True
+
+        self._total_spans += 1
+
+        with self._lock:
+            # Refill tokens based on elapsed time
+            self._refill_tokens()
+
+            # Try to acquire immediately
+            if self._tokens >= 1.0:
+                self._tokens -= 1.0
+                return True
+
+            # No tokens available, try blocking
+            if self.max_block_ms > 0:
+                block_start = time.time()
+                blocked_ms = 0
+
+                while blocked_ms < self.max_block_ms:
+                    # Release lock briefly to allow other threads
+                    self._lock.release()
+                    time.sleep(0.001)  # Sleep 1ms
+                    self._lock.acquire()
+
+                    # Refill and try again
+                    self._refill_tokens()
+                    if self._tokens >= 1.0:
+                        self._tokens -= 1.0
+                        self._blocked_spans += 1
+                        return True
+
+                    blocked_ms = (time.time() - block_start) * 1000
+
+            # Still no tokens after blocking - drop the span
+            self._dropped_spans += 1
+
+            # Log dropped span
+            span_name = span.name if span else "unknown"
+            logger.warning(
+                f"Rate limit exceeded - dropping span '{span_name}'. "
+                f"Total dropped: {self._dropped_spans}/{self._total_spans} "
+                f"({self._dropped_spans / self._total_spans * 100:.1f}%)"
+            )
+
+            return False
+
+    def _refill_tokens(self) -> None:
+        """Refill tokens based on elapsed time (token bucket algorithm)."""
+        now = time.time()
+        elapsed = now - self._last_refill_time
+
+        if elapsed > 0:
+            # Add tokens based on rate and elapsed time
+            new_tokens = elapsed * self.max_spans_per_second
+            self._tokens = min(self._max_tokens, self._tokens + new_tokens)
+            self._last_refill_time = now
+
+    def get_stats(self) -> dict:
+        """Get rate limiting statistics."""
+        with self._lock:
+            drop_rate = (self._dropped_spans / self._total_spans * 100) if self._total_spans > 0 else 0
+            return {
+                "enabled": self.enabled,
+                "max_spans_per_second": self.max_spans_per_second,
+                "total_spans": self._total_spans,
+                "dropped_spans": self._dropped_spans,
+                "blocked_spans": self._blocked_spans,
+                "drop_rate_percent": round(drop_rate, 2),
+                "current_tokens": round(self._tokens, 2),
+            }
+
+    def reset_stats(self) -> None:
+        """Reset statistics counters."""
+        with self._lock:
+            self._total_spans = 0
+            self._dropped_spans = 0
+            self._blocked_spans = 0
+
+
+class RateLimitingSpanProcessor:
+    """
+    Span processor that enforces rate limiting before passing to next processor.
+
+    This should be added early in the processor chain to drop spans before
+    they consume resources in downstream processors.
+    """
+
+    def __init__(
+        self,
+        next_processor,
+        max_spans_per_second: Optional[float] = None,
+        max_block_ms: int = 100,
+    ):
+        """
+        Initialize rate limiting processor.
+
+        Args:
+            next_processor: Next processor in the chain
+            max_spans_per_second: Maximum spans per second (None = unlimited)
+            max_block_ms: Maximum milliseconds to block before dropping
+        """
+        self.next_processor = next_processor
+        self.rate_limiter = RateLimiter(
+            max_spans_per_second=max_spans_per_second,
+            max_block_ms=max_block_ms,
+        )
+
+    def on_start(self, span, parent_context=None):
+        """Called when span starts - pass through to next processor."""
+        if self.next_processor and hasattr(self.next_processor, 'on_start'):
+            self.next_processor.on_start(span, parent_context)
+
+    def on_end(self, span):
+        """
+        Called when span ends - check rate limit before passing to next processor.
+
+        If rate limit is exceeded, span is dropped and not passed to next processor.
+        """
+        # Check rate limit
+        if not self.rate_limiter.acquire(span):
+            # Span dropped - don't pass to next processor
+            return
+
+        # Pass to next processor
+        if self.next_processor and hasattr(self.next_processor, 'on_end'):
+            self.next_processor.on_end(span)
+
+    def shutdown(self):
+        """Shutdown processor and log final stats."""
+        stats = self.rate_limiter.get_stats()
+        if stats["enabled"] and stats["dropped_spans"] > 0:
+            logger.info(
+                f"Rate limiter shutdown. Final stats: "
+                f"{stats['dropped_spans']}/{stats['total_spans']} spans dropped "
+                f"({stats['drop_rate_percent']}%)"
+            )

+        if self.next_processor and hasattr(self.next_processor, 'shutdown'):
+            self.next_processor.shutdown()
+
+    def force_flush(self, timeout_millis: int = 30000):
+        """Force flush - pass through to next processor."""
+        if self.next_processor and hasattr(self.next_processor, 'force_flush'):
+            return self.next_processor.force_flush(timeout_millis)
+        return True
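
A minimal sketch of the token-bucket behavior in isolation (editorial example). With max_spans_per_second=2 the bucket starts with two tokens, so a third immediate acquire() finds nothing left and, with blocking disabled, is dropped:

# Sketch: RateLimiter in isolation; RateLimitingSpanProcessor applies the same
# gate in on_end() before delegating to the next processor in the chain.
from traccia.processors.rate_limiter import RateLimiter

limiter = RateLimiter(max_spans_per_second=2, max_block_ms=0)
print(limiter.acquire())                      # True  (token 1)
print(limiter.acquire())                      # True  (token 2)
print(limiter.acquire())                      # False (bucket empty, no blocking -> dropped, with a warning log)
print(limiter.get_stats()["dropped_spans"])   # 1
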
traccia/processors/sampler.py
@@ -0,0 +1,22 @@
+"""Sampling decisions for traces."""
+
+import random
+from dataclasses import dataclass
+
+
+@dataclass
+class SamplingResult:
+    sampled: bool
+
+
+class Sampler:
+    """Head-based sampler using a fixed probability."""
+
+    def __init__(self, sample_rate: float = 1.0) -> None:
+        if not 0.0 <= sample_rate <= 1.0:
+            raise ValueError("sample_rate must be between 0.0 and 1.0")
+        self.sample_rate = sample_rate
+
+    def should_sample(self) -> SamplingResult:
+        return SamplingResult(sampled=random.random() <= self.sample_rate)
+
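
A short sketch of the sampler's behavior (editorial example; the fractional case is probabilistic, so treat the printed value as an expectation):

# Sketch: head-based sampling decisions from Sampler.
from traccia.processors.sampler import Sampler

print(Sampler(1.0).should_sample().sampled)   # True (every trace kept)
kept = sum(Sampler(0.25).should_sample().sampled for _ in range(10_000))
print(kept)                                   # roughly 2500

# Out-of-range rates are rejected up front:
try:
    Sampler(1.5)
except ValueError as exc:
    print(exc)                                # sample_rate must be between 0.0 and 1.0
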
traccia/processors/token_counter.py
@@ -0,0 +1,216 @@
+"""Token counting utilities and processor for spans with LLM usage.
+
+Best practice:
+- Prefer provider-reported usage tokens when available.
+- Otherwise, estimate with the vendor tokenizer when available (tiktoken for
+  OpenAI) and record the estimate source on the span.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Dict, Optional, Tuple
+
+from traccia.tracer.provider import SpanProcessor
+
+try:  # optional dependency for accurate counting
+    import tiktoken  # type: ignore
+except Exception:  # pragma: no cover
+    tiktoken = None  # fallback to heuristic
+
+
+MODEL_TO_ENCODING = {
+    # OpenAI mappings (approximate; kept current as of gpt-4o family)
+    "gpt-4o": "o200k_base",
+    "gpt-4o-mini": "o200k_base",
+    "gpt-4": "cl100k_base",
+    "gpt-3.5-turbo": "cl100k_base",
+}
+
+
+def _encoding_for_model(model: Optional[str]):
+    if tiktoken is None:
+        return None
+    if not model:
+        return None
+    m = str(model)
+    # First try tiktoken's model registry (best when available).
+    try:
+        return tiktoken.encoding_for_model(m)
+    except Exception:
+        pass
+    # Then try our explicit mapping, supporting version-suffixed models by prefix.
+    encoding_name = MODEL_TO_ENCODING.get(m)
+    if encoding_name is None:
+        for key in sorted(MODEL_TO_ENCODING.keys(), key=len, reverse=True):
+            if m.startswith(key):
+                encoding_name = MODEL_TO_ENCODING[key]
+                break
+    if encoding_name:
+        try:
+            return tiktoken.get_encoding(encoding_name)
+        except Exception:
+            return None
+    return None
+
+
+def _count_with_tiktoken(text: str, model: Optional[str]) -> Optional[int]:
+    if tiktoken is None or not text:
+        return None
+    encoding = _encoding_for_model(model)
+    if encoding is None:
+        return None
+    try:
+        return len(encoding.encode(text))
+    except Exception:
+        return None
+
+
+def estimate_tokens_from_text(text: str, model: Optional[str] = None) -> int:
+    """
+    Estimate tokens. Prefer model-accurate count via tiktoken when available,
+    otherwise fall back to a rough whitespace split.
+    """
+    if not text:
+        return 0
+    exact = _count_with_tiktoken(text, model)
+    if exact is not None:
+        return exact
+    return len(text.split())
+
+
+def estimate_tokens_from_text_with_source(
+    text: str, model: Optional[str] = None
+) -> Tuple[int, str]:
+    """
+    Return (token_count, source) where source is:
+    - "estimated.tiktoken"
+    - "estimated.heuristic"
+    """
+    if not text:
+        return 0, "estimated.heuristic"
+    exact = _count_with_tiktoken(text, model)
+    if exact is not None:
+        return exact, "estimated.tiktoken"
+    return len(text.split()), "estimated.heuristic"
+
+
+def _openai_chat_overhead(model: Optional[str]) -> Tuple[int, int, int]:
+    """
+    Return (tokens_per_message, tokens_per_name, tokens_for_reply).
+
+    These constants are model-dependent in OpenAI's chat format. For estimation
+    we use a reasonable default that is close for many modern chat models.
+    """
+    # Defaults (works reasonably for gpt-4/4o families as an estimate)
+    return 3, 1, 3
+
+
+def estimate_openai_chat_prompt_tokens_with_source(
+    messages: Any, model: Optional[str] = None
+) -> Optional[Tuple[int, str]]:
+    """
+    Estimate prompt tokens from a list of chat messages.
+
+    This is best-effort and should be treated as an estimate unless provider
+    usage is available.
+    """
+    if not isinstance(messages, (list, tuple)) or not messages:
+        return None
+    if tiktoken is None:
+        # Heuristic fallback: count whitespace tokens across role/content.
+        parts = []
+        for msg in list(messages)[:50]:
+            if not isinstance(msg, dict):
+                continue
+            role = msg.get("role") or ""
+            content = msg.get("content") or ""
+            if not isinstance(content, str):
+                content = str(content)
+            parts.append(f"{role} {content}".strip())
+        text = "\n".join([p for p in parts if p])
+        return (len(text.split()), "estimated.chat_heuristic")
+    try:
+        encoding = _encoding_for_model(model) or tiktoken.get_encoding("cl100k_base")
+    except Exception:
+        return None
+
+    tokens_per_message, tokens_per_name, tokens_for_reply = _openai_chat_overhead(model)
+    total = 0
+    for msg in list(messages)[:50]:
+        if not isinstance(msg, dict):
+            continue
+        total += tokens_per_message
+        role = msg.get("role") or ""
+        name = msg.get("name")
+        content = msg.get("content") or ""
+        if not isinstance(content, str):
+            content = str(content)
+        total += len(encoding.encode(str(role)))
+        total += len(encoding.encode(content))
+        if name:
+            total += tokens_per_name
+            total += len(encoding.encode(str(name)))
+    total += tokens_for_reply
+    return total, "estimated.tiktoken_chat"
+
+
+class TokenCountingProcessor(SpanProcessor):
+    """
+    A processor that infers token counts when not provided by the LLM response.
+    It prefers a model-specific tokenizer (tiktoken) when available.
+    """
+
+    def on_end(self, span) -> None:
+        prompt = span.attributes.get("llm.prompt")
+        completion = span.attributes.get("llm.completion")
+        model = span.attributes.get("llm.model")
+        openai_messages = span.attributes.get("llm.openai.messages")
+
+        wrote_any = False
+        wrote_prompt = False
+        wrote_completion = False
+
+        if "llm.usage.prompt_tokens" not in span.attributes:
+            # Prefer chat-structure estimation when available.
+            est = estimate_openai_chat_prompt_tokens_with_source(openai_messages, model)
+            if est is not None:
+                count, source = est
+                span.set_attribute("llm.usage.prompt_tokens", count)
+                span.set_attribute("llm.usage.prompt_source", source)
+                wrote_any = True
+                wrote_prompt = True
+            elif isinstance(prompt, str):
+                count, source = estimate_tokens_from_text_with_source(prompt, model)
+                span.set_attribute("llm.usage.prompt_tokens", count)
+                span.set_attribute("llm.usage.prompt_source", source)
+                wrote_any = True
+                wrote_prompt = True
+
+        if "llm.usage.completion_tokens" not in span.attributes and isinstance(completion, str):
+            count, source = estimate_tokens_from_text_with_source(completion, model)
+            span.set_attribute("llm.usage.completion_tokens", count)
+            span.set_attribute("llm.usage.completion_source", source)
+            wrote_any = True
+            wrote_completion = True
+
+        # Synthesize overall usage source if not provided by instrumentation.
+        if wrote_any and "llm.usage.source" not in span.attributes:
+            ps = span.attributes.get("llm.usage.prompt_source")
+            cs = span.attributes.get("llm.usage.completion_source")
+            if ps and cs and ps == cs:
+                span.set_attribute("llm.usage.source", ps)
+            elif ps or cs:
+                span.set_attribute("llm.usage.source", "mixed")
+
+        # If provider already marked usage as provider_usage, and we filled any missing
+        # fields, mark it as mixed.
+        if wrote_any and span.attributes.get("llm.usage.source") == "provider_usage":
+            if wrote_prompt or wrote_completion:
+                span.set_attribute("llm.usage.source", "mixed")
+
+    def shutdown(self) -> None:
+        return None
+
+    def force_flush(self, timeout: Optional[float] = None) -> None:
+        return None
+
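
A brief sketch of the estimation helpers and the source labels they report (editorial example; exact counts depend on whether tiktoken is installed in the environment):

# Sketch: token estimation with provenance labels from token_counter.py.
from traccia.processors.token_counter import (
    estimate_tokens_from_text_with_source,
    estimate_openai_chat_prompt_tokens_with_source,
)

count, source = estimate_tokens_from_text_with_source("The quick brown fox", model="gpt-4o")
print(count, source)   # e.g. 4 "estimated.tiktoken", or a whitespace count with "estimated.heuristic"

messages = [
    {"role": "system", "content": "You are a terse assistant."},
    {"role": "user", "content": "Summarize the release notes."},
]
print(estimate_openai_chat_prompt_tokens_with_source(messages, model="gpt-4o"))
# e.g. (N, "estimated.tiktoken_chat"), or (N, "estimated.chat_heuristic") without tiktoken
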