judgeval 0.16.0__py3-none-any.whl → 0.16.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/api/api_types.py +2 -1
- judgeval/data/judgment_types.py +2 -1
- judgeval/logger.py +1 -1
- judgeval/tracer/__init__.py +10 -7
- judgeval/tracer/keys.py +7 -3
- judgeval/tracer/llm/__init__.py +2 -1259
- judgeval/tracer/llm/config.py +110 -0
- judgeval/tracer/llm/constants.py +10 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +611 -0
- judgeval/tracer/llm/llm_google/__init__.py +0 -0
- judgeval/tracer/llm/llm_google/config.py +24 -0
- judgeval/tracer/llm/llm_google/wrapper.py +426 -0
- judgeval/tracer/llm/llm_groq/__init__.py +0 -0
- judgeval/tracer/llm/llm_groq/config.py +23 -0
- judgeval/tracer/llm/llm_groq/wrapper.py +477 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +637 -0
- judgeval/tracer/llm/llm_together/__init__.py +0 -0
- judgeval/tracer/llm/llm_together/config.py +23 -0
- judgeval/tracer/llm/llm_together/wrapper.py +478 -0
- judgeval/tracer/llm/providers.py +5 -5
- judgeval/tracer/processors/__init__.py +1 -1
- judgeval/trainer/console.py +1 -1
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +21 -0
- judgeval/utils/{decorators.py → decorators/use_once.py} +0 -11
- judgeval/utils/meta.py +1 -1
- judgeval/utils/version_check.py +1 -1
- judgeval/version.py +1 -1
- {judgeval-0.16.0.dist-info → judgeval-0.16.1.dist-info}/METADATA +1 -1
- {judgeval-0.16.0.dist-info → judgeval-0.16.1.dist-info}/RECORD +37 -23
- judgeval/tracer/llm/google/__init__.py +0 -21
- judgeval/tracer/llm/groq/__init__.py +0 -20
- judgeval/tracer/llm/together/__init__.py +0 -20
- /judgeval/tracer/llm/{anthropic/__init__.py → llm_anthropic/config.py} +0 -0
- /judgeval/tracer/llm/{openai/__init__.py → llm_openai/config.py} +0 -0
- {judgeval-0.16.0.dist-info → judgeval-0.16.1.dist-info}/WHEEL +0 -0
- {judgeval-0.16.0.dist-info → judgeval-0.16.1.dist-info}/entry_points.txt +0 -0
- {judgeval-0.16.0.dist-info → judgeval-0.16.1.dist-info}/licenses/LICENSE.md +0 -0
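
Taken together, the file list shows the shape of the 0.16.1 refactor: the single 1,264-line judgeval/tracer/llm/__init__.py is split into per-provider wrapper packages (llm_openai, llm_anthropic, llm_google, llm_groq, llm_together) plus a shared judgeval/tracer/llm/config.py that keeps the public _detect_provider / wrap_provider entry points. Below is a minimal sketch of what such a dispatch layer could look like; only _detect_provider and wrap_provider are confirmed by the diff that follows, and the per-provider wrap_* helper names are illustrative assumptions, not the actual 0.16.1 API.

# Hypothetical sketch of a provider-dispatch config module for the new layout.
# The wrap_openai_client / wrap_anthropic_client names are assumptions for
# illustration; the real config.py in 0.16.1 may be organized differently.
from __future__ import annotations
from typing import Any


def _detect_provider(client: Any) -> str:
    # judgeval's real detection uses isinstance checks against each optionally
    # installed SDK (see the removed __init__.py below); simplified here to a
    # class-name match so the sketch stays self-contained.
    name = type(client).__name__.lower()
    for provider in ("openai", "anthropic", "together", "groq"):
        if provider in name:
            return provider
    if "client" in name:  # google-genai exposes Client / AsyncClient
        return "google"
    return "default"


def wrap_provider(tracer: Any, client: Any) -> Any:
    """Route the client to its provider-specific wrapper and return it."""
    provider = _detect_provider(client)
    if provider == "openai":
        from judgeval.tracer.llm.llm_openai.wrapper import wrap_openai_client  # assumed name
        return wrap_openai_client(tracer, client)
    if provider == "anthropic":
        from judgeval.tracer.llm.llm_anthropic.wrapper import wrap_anthropic_client  # assumed name
        return wrap_anthropic_client(tracer, client)
    # ... llm_google, llm_groq, and llm_together would be handled the same way ...
    return client  # unknown providers pass through unwrapped
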
judgeval/tracer/llm/__init__.py
CHANGED
@@ -1,1264 +1,7 @@

The module drops from 1,264 lines to 7. The removed code is the previously inlined provider-wrapping implementation, including:

- the provider-type enum (openai / anthropic / together / google / groq / default) and _detect_provider, which ran isinstance checks against each optionally installed SDK
- the cost_per_token wrapper around litellm.cost_calculator.cost_per_token
- per-provider streaming helpers: _extract_*_content, _extract_*_chunk_usage, and _extract_*_tokens for OpenAI, Anthropic, Together, and Groq
- per-provider response formatters: _format_openai_output, _format_anthropic_output, _format_together_output, _format_google_output, _format_groq_output, dispatched by _format_output_data
- the stream wrappers _TracedGeneratorBase, TracedGenerator, TracedAsyncGenerator, TracedSyncContextManager, and TracedAsyncContextManager, which accumulate streamed content onto the span and finalize it on StopIteration / StopAsyncIteration
- usage handling: _extract_chunk_usage, _extract_usage_tokens, _process_usage_data, _set_usage_attributes, and _create_usage (which adds the together_ai/ and groq/ model prefixes for cost calculation)
- wrap_provider itself, which monkey-patched chat.completions.create, responses.create, beta.chat.completions.parse, messages.create, messages.stream, and models.generate_content on the detected client

Per the file list above, this logic now lives in judgeval/tracer/llm/config.py and the new llm_openai, llm_anthropic, llm_google, llm_groq, and llm_together packages. What remains in __init__.py is a thin re-export:

 from __future__ import annotations


+from .config import _detect_provider, wrap_provider


+__all__ = ["_detect_provider", "wrap_provider"]
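
For reference, the entry point re-exported here keeps the call shape shown in the removed code (wrap_provider(tracer: Tracer, client: ApiClient) -> ApiClient, returning the same client with its methods patched). A minimal usage sketch, assuming the OpenAI SDK is installed; the Tracer constructor arguments are an assumption, not taken from this diff.

# Minimal sketch: tracing an OpenAI client through judgeval's wrap_provider.
from openai import OpenAI

from judgeval.tracer import Tracer
from judgeval.tracer.llm import wrap_provider

tracer = Tracer(project_name="demo")       # assumed constructor arguments
client = wrap_provider(tracer, OpenAI())   # returns the same client, now traced

# Subsequent calls go through the patched create method and emit LLM spans.
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "hello"}],
)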