judgeval 0.15.0__py3-none-any.whl → 0.16.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. judgeval/api/__init__.py +4 -18
  2. judgeval/api/api_types.py +18 -2
  3. judgeval/data/judgment_types.py +18 -2
  4. judgeval/logger.py +1 -1
  5. judgeval/tracer/__init__.py +10 -7
  6. judgeval/tracer/keys.py +7 -3
  7. judgeval/tracer/llm/__init__.py +2 -1227
  8. judgeval/tracer/llm/config.py +110 -0
  9. judgeval/tracer/llm/constants.py +10 -0
  10. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  11. judgeval/tracer/llm/llm_anthropic/wrapper.py +611 -0
  12. judgeval/tracer/llm/llm_google/__init__.py +0 -0
  13. judgeval/tracer/llm/llm_google/config.py +24 -0
  14. judgeval/tracer/llm/llm_google/wrapper.py +426 -0
  15. judgeval/tracer/llm/llm_groq/__init__.py +0 -0
  16. judgeval/tracer/llm/llm_groq/config.py +23 -0
  17. judgeval/tracer/llm/llm_groq/wrapper.py +477 -0
  18. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  19. judgeval/tracer/llm/llm_openai/wrapper.py +637 -0
  20. judgeval/tracer/llm/llm_together/__init__.py +0 -0
  21. judgeval/tracer/llm/llm_together/config.py +23 -0
  22. judgeval/tracer/llm/llm_together/wrapper.py +478 -0
  23. judgeval/tracer/llm/providers.py +5 -5
  24. judgeval/tracer/processors/__init__.py +1 -1
  25. judgeval/trainer/console.py +1 -1
  26. judgeval/utils/decorators/__init__.py +0 -0
  27. judgeval/utils/decorators/dont_throw.py +21 -0
  28. judgeval/utils/{decorators.py → decorators/use_once.py} +0 -11
  29. judgeval/utils/meta.py +1 -1
  30. judgeval/utils/version_check.py +1 -1
  31. judgeval/version.py +1 -1
  32. judgeval-0.16.1.dist-info/METADATA +266 -0
  33. {judgeval-0.15.0.dist-info → judgeval-0.16.1.dist-info}/RECORD +38 -24
  34. judgeval/tracer/llm/google/__init__.py +0 -21
  35. judgeval/tracer/llm/groq/__init__.py +0 -20
  36. judgeval/tracer/llm/together/__init__.py +0 -20
  37. judgeval-0.15.0.dist-info/METADATA +0 -158
  38. /judgeval/tracer/llm/{anthropic/__init__.py → llm_anthropic/config.py} +0 -0
  39. /judgeval/tracer/llm/{openai/__init__.py → llm_openai/config.py} +0 -0
  40. {judgeval-0.15.0.dist-info → judgeval-0.16.1.dist-info}/WHEEL +0 -0
  41. {judgeval-0.15.0.dist-info → judgeval-0.16.1.dist-info}/entry_points.txt +0 -0
  42. {judgeval-0.15.0.dist-info → judgeval-0.16.1.dist-info}/licenses/LICENSE.md +0 -0
judgeval/tracer/llm/__init__.py
@@ -1,1232 +1,7 @@
  from __future__ import annotations
- import functools
- from typing import (
-     Tuple,
-     Optional,
-     Any,
-     TYPE_CHECKING,
-     Union,
-     AsyncGenerator,
-     Generator,
-     Iterator,
-     AsyncIterator,
- )
- from functools import wraps
- from enum import Enum
- from judgeval.data.trace import TraceUsage
- from judgeval.logger import judgeval_logger
- from litellm.cost_calculator import cost_per_token as _original_cost_per_token
- from opentelemetry.trace import Span

- from judgeval.tracer.llm.providers import (
-     HAS_OPENAI,
-     HAS_TOGETHER,
-     HAS_ANTHROPIC,
-     HAS_GOOGLE_GENAI,
-     HAS_GROQ,
-     ApiClient,
- )
- from judgeval.tracer.managers import sync_span_context, async_span_context
- from judgeval.tracer.keys import AttributeKeys
- from judgeval.utils.serialize import safe_serialize
- from judgeval.tracer.utils import set_span_attribute

- if TYPE_CHECKING:
-     from judgeval.tracer import Tracer
+ from .config import _detect_provider, wrap_provider


- class ProviderType(Enum):
-     """Enum for different LLM provider types."""
-
-     OPENAI = "openai"
-     ANTHROPIC = "anthropic"
-     TOGETHER = "together"
-     GOOGLE = "google"
-     GROQ = "groq"
-     DEFAULT = "default"
-
-
- @wraps(_original_cost_per_token)
- def cost_per_token(
-     *args: Any, **kwargs: Any
- ) -> Tuple[Optional[float], Optional[float]]:
-     try:
-         prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = (
-             _original_cost_per_token(*args, **kwargs)
-         )
-         if (
-             prompt_tokens_cost_usd_dollar == 0
-             and completion_tokens_cost_usd_dollar == 0
-         ):
-             judgeval_logger.warning("LiteLLM returned a total of 0 for cost per token")
-         return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
-     except Exception as e:
-         judgeval_logger.warning(f"Error calculating cost per token: {e}")
-         return None, None
-
-
- def _detect_provider(client: ApiClient) -> ProviderType:
-     """Detect the provider type of the client once to avoid repeated isinstance checks."""
-     if HAS_OPENAI:
-         from judgeval.tracer.llm.providers import openai_OpenAI, openai_AsyncOpenAI
-
-         assert openai_OpenAI is not None, "OpenAI client not found"
-         assert openai_AsyncOpenAI is not None, "OpenAI async client not found"
-         if isinstance(client, (openai_OpenAI, openai_AsyncOpenAI)):
-             return ProviderType.OPENAI
-
-     if HAS_ANTHROPIC:
-         from judgeval.tracer.llm.providers import (
-             anthropic_Anthropic,
-             anthropic_AsyncAnthropic,
-         )
-
-         assert anthropic_Anthropic is not None, "Anthropic client not found"
-         assert anthropic_AsyncAnthropic is not None, "Anthropic async client not found"
-         if isinstance(client, (anthropic_Anthropic, anthropic_AsyncAnthropic)):
-             return ProviderType.ANTHROPIC
-
-     if HAS_TOGETHER:
-         from judgeval.tracer.llm.providers import (
-             together_Together,
-             together_AsyncTogether,
-         )
-
-         assert together_Together is not None, "Together client not found"
-         assert together_AsyncTogether is not None, "Together async client not found"
-         if isinstance(client, (together_Together, together_AsyncTogether)):
-             return ProviderType.TOGETHER
-
-     if HAS_GOOGLE_GENAI:
-         from judgeval.tracer.llm.providers import (
-             google_genai_Client,
-             google_genai_AsyncClient,
-         )
-
-         assert google_genai_Client is not None, "Google GenAI client not found"
-         assert google_genai_AsyncClient is not None, (
-             "Google GenAI async client not found"
-         )
-         if isinstance(client, (google_genai_Client, google_genai_AsyncClient)):
-             return ProviderType.GOOGLE
-
-     if HAS_GROQ:
-         from judgeval.tracer.llm.providers import groq_Groq, groq_AsyncGroq
-
-         assert groq_Groq is not None, "Groq client not found"
-         assert groq_AsyncGroq is not None, "Groq async client not found"
-         if isinstance(client, (groq_Groq, groq_AsyncGroq)):
-             return ProviderType.GROQ
-
-     return ProviderType.DEFAULT
-
-
- # Provider-specific content extraction handlers
- def _extract_openai_content(chunk) -> str:
-     """Extract content from OpenAI streaming chunk."""
-     if (
-         hasattr(chunk, "choices")
-         and chunk.choices
-         and hasattr(chunk.choices[0], "delta")
-     ):
-         delta_content = getattr(chunk.choices[0].delta, "content", None)
-         if delta_content:
-             return delta_content
-     return ""
-
-
- def _extract_anthropic_content(chunk) -> str:
-     """Extract content from Anthropic streaming chunk."""
-     if hasattr(chunk, "type") and chunk.type == "content_block_delta":
-         if hasattr(chunk, "delta") and hasattr(chunk.delta, "text"):
-             return chunk.delta.text or ""
-     elif hasattr(chunk, "delta") and hasattr(chunk.delta, "text"):
-         return chunk.delta.text or ""
-     elif hasattr(chunk, "text"):
-         return chunk.text or ""
-     return ""
-
-
- def _extract_together_content(chunk) -> str:
-     """Extract content from Together streaming chunk."""
-     if hasattr(chunk, "choices") and chunk.choices:
-         choice = chunk.choices[0]
-         if hasattr(choice, "delta") and hasattr(choice.delta, "content"):
-             return choice.delta.content or ""
-     return ""
-
-
- def _extract_groq_content(chunk) -> str:
-     """Extract content from Groq streaming chunk."""
-     if hasattr(chunk, "choices") and chunk.choices:
-         choice = chunk.choices[0]
-         if hasattr(choice, "delta") and hasattr(choice.delta, "content"):
-             return choice.delta.content or ""
-     return ""
-
-
- # Provider-specific chunk usage extraction handlers
- def _extract_openai_chunk_usage(chunk) -> Any:
-     """Extract usage data from OpenAI streaming chunk."""
-     if hasattr(chunk, "usage") and chunk.usage:
-         return chunk.usage
-     return None
-
-
- def _extract_anthropic_chunk_usage(chunk) -> Any:
-     """Extract usage data from Anthropic streaming chunk."""
-     if hasattr(chunk, "type"):
-         if chunk.type == "message_start":
-             if hasattr(chunk, "message") and hasattr(chunk.message, "usage"):
-                 return chunk.message.usage
-         elif chunk.type == "message_delta":
-             if hasattr(chunk, "usage"):
-                 return chunk.usage
-         elif chunk.type == "message_stop":
-             if hasattr(chunk, "usage"):
-                 return chunk.usage
-     return None
-
-
- def _extract_together_chunk_usage(chunk) -> Any:
-     """Extract usage data from Together streaming chunk."""
-     if hasattr(chunk, "usage") and chunk.usage:
-         return chunk.usage
-     return None
-
-
- def _extract_groq_chunk_usage(chunk) -> Any:
-     """Extract usage data from Groq streaming chunk."""
-     # Groq provides usage data in the last chunk when stream_options={"include_usage": True} is used
-     if hasattr(chunk, "usage") and chunk.usage:
-         return chunk.usage
-     return None
-
-
- # Provider-specific token extraction handlers
- def _extract_openai_tokens(usage_data) -> tuple[int, int, int, int]:
-     """Extract token counts from OpenAI usage data."""
-     prompt_tokens = (
-         usage_data.prompt_tokens
-         if hasattr(usage_data, "prompt_tokens") and usage_data.prompt_tokens is not None
-         else 0
-     )
-     completion_tokens = (
-         usage_data.completion_tokens
-         if hasattr(usage_data, "completion_tokens")
-         and usage_data.completion_tokens is not None
-         else 0
-     )
-     return prompt_tokens, completion_tokens, 0, 0
-
-
- def _extract_anthropic_tokens(usage_data) -> tuple[int, int, int, int]:
-     """Extract token counts from Anthropic usage data."""
-     prompt_tokens = (
-         usage_data.input_tokens
-         if hasattr(usage_data, "input_tokens") and usage_data.input_tokens is not None
-         else 0
-     )
-     completion_tokens = (
-         usage_data.output_tokens
-         if hasattr(usage_data, "output_tokens") and usage_data.output_tokens is not None
-         else 0
-     )
-     cache_read_input_tokens = (
-         usage_data.cache_read_input_tokens
-         if hasattr(usage_data, "cache_read_input_tokens")
-         and usage_data.cache_read_input_tokens is not None
-         else 0
-     )
-     cache_creation_input_tokens = (
-         usage_data.cache_creation_input_tokens
-         if hasattr(usage_data, "cache_creation_input_tokens")
-         and usage_data.cache_creation_input_tokens is not None
-         else 0
-     )
-     return (
-         prompt_tokens,
-         completion_tokens,
-         cache_read_input_tokens,
-         cache_creation_input_tokens,
-     )
-
-
- def _extract_together_tokens(usage_data) -> tuple[int, int, int, int]:
-     """Extract token counts from Together usage data."""
-     prompt_tokens = (
-         usage_data.prompt_tokens
-         if hasattr(usage_data, "prompt_tokens") and usage_data.prompt_tokens is not None
-         else 0
-     )
-     completion_tokens = (
-         usage_data.completion_tokens
-         if hasattr(usage_data, "completion_tokens")
-         and usage_data.completion_tokens is not None
-         else 0
-     )
-     return prompt_tokens, completion_tokens, 0, 0
-
-
- def _extract_groq_tokens(usage_data) -> tuple[int, int, int, int]:
-     """Extract token counts from Groq usage data."""
-     prompt_tokens = (
-         usage_data.prompt_tokens
-         if hasattr(usage_data, "prompt_tokens") and usage_data.prompt_tokens is not None
-         else 0
-     )
-     completion_tokens = (
-         usage_data.completion_tokens
-         if hasattr(usage_data, "completion_tokens")
-         and usage_data.completion_tokens is not None
-         else 0
-     )
-     # Extract cached tokens from prompt_tokens_details.cached_tokens
-     cache_read_input_tokens = 0
-     if (
-         hasattr(usage_data, "prompt_tokens_details")
-         and usage_data.prompt_tokens_details
-     ):
-         if (
-             hasattr(usage_data.prompt_tokens_details, "cached_tokens")
-             and usage_data.prompt_tokens_details.cached_tokens is not None
-         ):
-             cache_read_input_tokens = usage_data.prompt_tokens_details.cached_tokens
-
-     return prompt_tokens, completion_tokens, cache_read_input_tokens, 0
-
-
- # Provider-specific output formatting handlers
- def _format_openai_output(response: Any) -> tuple[Optional[str], Optional[TraceUsage]]:
-     """Format output data from OpenAI response."""
-     from judgeval.tracer.llm.providers import (
-         openai_ChatCompletion,
-         openai_Response,
-         openai_ParsedChatCompletion,
-     )
-
-     model_name = None
-     message_content = None
-     prompt_tokens = 0
-     completion_tokens = 0
-     cache_read_input_tokens = 0
-     cache_creation_input_tokens = 0
-
-     if openai_ChatCompletion and isinstance(response, openai_ChatCompletion):
-         model_name = response.model or ""
-         prompt_tokens = (
-             response.usage.prompt_tokens
-             if response.usage and response.usage.prompt_tokens is not None
-             else 0
-         )
-         completion_tokens = (
-             response.usage.completion_tokens
-             if response.usage and response.usage.completion_tokens is not None
-             else 0
-         )
-         cache_read_input_tokens = (
-             response.usage.prompt_tokens_details.cached_tokens
-             if response.usage
-             and response.usage.prompt_tokens_details
-             and response.usage.prompt_tokens_details.cached_tokens is not None
-             else 0
-         )
-
-         if openai_ParsedChatCompletion and isinstance(
-             response, openai_ParsedChatCompletion
-         ):
-             message_content = response.choices[0].message.parsed
-         else:
-             message_content = response.choices[0].message.content
-     elif openai_Response and isinstance(response, openai_Response):
-         model_name = response.model or ""
-         prompt_tokens = (
-             response.usage.input_tokens
-             if response.usage and response.usage.input_tokens is not None
-             else 0
-         )
-         completion_tokens = (
-             response.usage.output_tokens
-             if response.usage and response.usage.output_tokens is not None
-             else 0
-         )
-         cache_read_input_tokens = (
-             response.usage.input_tokens_details.cached_tokens
-             if response.usage
-             and response.usage.input_tokens_details
-             and response.usage.input_tokens_details.cached_tokens is not None
-             else 0
-         )
-         output0 = response.output[0]
-         if (
-             hasattr(output0, "content")
-             and output0.content
-             and hasattr(output0.content, "__iter__")
-         ):
-             message_content = "".join(
-                 seg.text for seg in output0.content if hasattr(seg, "text") and seg.text
-             )
-
-     if model_name:
-         return message_content, _create_usage(
-             model_name,
-             prompt_tokens,
-             completion_tokens,
-             cache_read_input_tokens,
-             cache_creation_input_tokens,
-         )
-
-     return None, None
-
-
- def _format_anthropic_output(
-     response: Any,
- ) -> tuple[Optional[str], Optional[TraceUsage]]:
-     """Format output data from Anthropic response."""
-     model_name = getattr(response, "model", "") or ""
-     usage = getattr(response, "usage", None)
-     prompt_tokens = (
-         usage.input_tokens
-         if usage and hasattr(usage, "input_tokens") and usage.input_tokens is not None
-         else 0
-     )
-     completion_tokens = (
-         usage.output_tokens
-         if usage and hasattr(usage, "output_tokens") and usage.output_tokens is not None
-         else 0
-     )
-     cache_read_input_tokens = (
-         usage.cache_read_input_tokens
-         if usage
-         and hasattr(usage, "cache_read_input_tokens")
-         and usage.cache_read_input_tokens is not None
-         else 0
-     )
-     cache_creation_input_tokens = (
-         usage.cache_creation_input_tokens
-         if usage
-         and hasattr(usage, "cache_creation_input_tokens")
-         and usage.cache_creation_input_tokens is not None
-         else 0
-     )
-     message_content = response.content[0].text if hasattr(response, "content") else None
-
-     if model_name:
-         return message_content, _create_usage(
-             model_name,
-             prompt_tokens,
-             completion_tokens,
-             cache_read_input_tokens,
-             cache_creation_input_tokens,
-         )
-
-     return None, None
-
-
- def _format_together_output(
-     response: Any,
- ) -> tuple[Optional[str], Optional[TraceUsage]]:
-     """Format output data from Together response."""
-     model_name = (response.model or "") if hasattr(response, "model") else ""
-     prompt_tokens = (
-         response.usage.prompt_tokens
-         if hasattr(response.usage, "prompt_tokens")
-         and response.usage.prompt_tokens is not None
-         else 0
-     )
-     completion_tokens = (
-         response.usage.completion_tokens
-         if hasattr(response.usage, "completion_tokens")
-         and response.usage.completion_tokens is not None
-         else 0
-     )
-     message_content = (
-         response.choices[0].message.content if hasattr(response, "choices") else None
-     )
-
-     if model_name:
-         model_name = "together_ai/" + model_name
-         return message_content, _create_usage(
-             model_name,
-             prompt_tokens,
-             completion_tokens,
-             0,
-             0,
-         )
-
-     return None, None
-
-
- def _format_google_output(response: Any) -> tuple[Optional[str], Optional[TraceUsage]]:
-     """Format output data from Google GenAI response."""
-     model_name = getattr(response, "model_version", "") or ""
-     usage_metadata = getattr(response, "usage_metadata", None)
-     prompt_tokens = (
-         usage_metadata.prompt_token_count
-         if usage_metadata
-         and hasattr(usage_metadata, "prompt_token_count")
-         and usage_metadata.prompt_token_count is not None
-         else 0
-     )
-     completion_tokens = (
-         usage_metadata.candidates_token_count
-         if usage_metadata
-         and hasattr(usage_metadata, "candidates_token_count")
-         and usage_metadata.candidates_token_count is not None
-         else 0
-     )
-     message_content = (
-         response.candidates[0].content.parts[0].text
-         if hasattr(response, "candidates")
-         else None
-     )
-
-     cache_read_input_tokens = 0
-     if usage_metadata and hasattr(usage_metadata, "cached_content_token_count"):
-         cache_read_input_tokens = usage_metadata.cached_content_token_count or 0
-
-     if model_name:
-         return message_content, _create_usage(
-             model_name,
-             prompt_tokens,
-             completion_tokens,
-             cache_read_input_tokens,
-             0,
-         )
-
-     return None, None
-
-
- def _format_groq_output(response: Any) -> tuple[Optional[str], Optional[TraceUsage]]:
-     """Format output data from Groq response."""
-     model_name = (response.model or "") if hasattr(response, "model") else ""
-     prompt_tokens = (
-         response.usage.prompt_tokens
-         if hasattr(response.usage, "prompt_tokens")
-         and response.usage.prompt_tokens is not None
-         else 0
-     )
-     completion_tokens = (
-         response.usage.completion_tokens
-         if hasattr(response.usage, "completion_tokens")
-         and response.usage.completion_tokens is not None
-         else 0
-     )
-     # Extract cached tokens from prompt_tokens_details.cached_tokens
-     cache_read_input_tokens = 0
-     if (
-         hasattr(response, "usage")
-         and response.usage
-         and hasattr(response.usage, "prompt_tokens_details")
-         and response.usage.prompt_tokens_details
-     ):
-         if (
-             hasattr(response.usage.prompt_tokens_details, "cached_tokens")
-             and response.usage.prompt_tokens_details.cached_tokens is not None
-         ):
-             cache_read_input_tokens = response.usage.prompt_tokens_details.cached_tokens
-
-     message_content = (
-         response.choices[0].message.content if hasattr(response, "choices") else None
-     )
-
-     if model_name:
-         model_name = "groq/" + model_name
-         return message_content, _create_usage(
-             model_name,
-             prompt_tokens,
-             completion_tokens,
-             cache_read_input_tokens,
-             0,
-         )
-
-     return None, None
-
-
- class _TracedGeneratorBase:
-     """Base class with common logic for parsing stream chunks."""
-
-     __slots__ = (
-         "tracer",
-         "client",
-         "span",
-         "accumulated_content",
-         "model_name",
-         "provider_type",
-     )
-
-     tracer: Tracer
-     client: ApiClient
-     span: Span
-     accumulated_content: str
-     model_name: str
-     provider_type: ProviderType
-
-     def __init__(self, tracer: Tracer, client: ApiClient, span: Span, model_name: str):
-         """Initialize the base traced generator.
-
-         Args:
-             tracer: The tracer instance
-             client: The API client
-             span: The OpenTelemetry span
-             model_name: The model name (empty string default allows fallback to usage_data.model)
-         """
-         self.tracer = tracer
-         self.client = client
-         self.span = span
-         self.accumulated_content = ""
-         self.model_name = model_name
-         self.provider_type = _detect_provider(client)
-
-     def _extract_content(self, chunk) -> str:
-         """Extract content from streaming chunk based on provider."""
-         if self.provider_type == ProviderType.OPENAI:
-             return _extract_openai_content(chunk)
-         elif self.provider_type == ProviderType.ANTHROPIC:
-             return _extract_anthropic_content(chunk)
-         elif self.provider_type == ProviderType.TOGETHER:
-             return _extract_together_content(chunk)
-         elif self.provider_type == ProviderType.GROQ:
-             return _extract_groq_content(chunk)
-         else:
-             # Default case - assume OpenAI-compatible for unknown providers
-             return _extract_openai_content(chunk)
-
-     def _process_chunk_usage(self, chunk):
-         """Process usage data from streaming chunks based on provider."""
-         usage_data = _extract_chunk_usage(self.client, chunk)
-         if usage_data:
-             _process_usage_data(
-                 self.span, usage_data, self.tracer, self.client, self.model_name
-             )
-
-     def __del__(self):
-         """
-         Fallback cleanup for unclosed spans. This is a safety mechanism only - spans
-         should normally be finalized in StopIteration/StopAsyncIteration handlers.
-
-         Note: __del__ is not guaranteed to be called in all situations (e.g., reference
-         cycles, program exit), so this should not be relied upon as the primary cleanup
-         mechanism. The primary finalization happens in the iterator protocol methods.
-         """
-         if self.span:
-             try:
-                 self._finalize_span()
-             except Exception as e:
-                 judgeval_logger.warning(
-                     f"Error during span finalization in __del__: {e}"
-                 )
-
-     def _finalize_span(self):
-         """Finalize the span by setting completion content and ending it."""
-         if self.span:
-             set_span_attribute(
-                 self.span, AttributeKeys.GEN_AI_COMPLETION, self.accumulated_content
-             )
-             self.span.end()
-             self.span = None
-
-
- class TracedGenerator(_TracedGeneratorBase):
-     """Generator wrapper that adds OpenTelemetry tracing without consuming the stream."""
-
-     __slots__ = ("generator",)
-
-     generator: Union[Generator[Any, None, None], Iterator[Any]]
-
-     def __init__(
-         self,
-         tracer: Tracer,
-         generator: Union[Generator[Any, None, None], Iterator[Any]],
-         client: ApiClient,
-         span: Span,
-         model_name: str,
-     ):
-         super().__init__(tracer, client, span, model_name)
-         self.generator = generator
-
-     def __iter__(self):
-         return self
-
-     def __next__(self):
-         try:
-             chunk = next(self.generator)
-
-             content = self._extract_content(chunk)
-             if content:
-                 self.accumulated_content += content
-             self._process_chunk_usage(chunk)
-
-             return chunk
-
-         except StopIteration:
-             self._finalize_span()
-             raise
-         except Exception as e:
-             if self.span:
-                 self.span.record_exception(e)
-                 self.span.end()
-             raise
-
-
- class TracedAsyncGenerator(_TracedGeneratorBase):
-     """Async generator wrapper that adds OpenTelemetry tracing without consuming the stream."""
-
-     __slots__ = ("async_generator",)
-
-     async_generator: Union[AsyncGenerator[Any, None], AsyncIterator[Any]]
-
-     def __init__(
-         self,
-         tracer: Tracer,
-         async_generator: Union[AsyncGenerator[Any, None], AsyncIterator[Any]],
-         client: ApiClient,
-         span: Span,
-         model_name: str,
-     ):
-         super().__init__(tracer, client, span, model_name)
-         self.async_generator = async_generator
-
-     def __aiter__(self):
-         return self
-
-     async def __anext__(self):
-         try:
-             chunk = await self.async_generator.__anext__()
-
-             content = self._extract_content(chunk)
-             if content:
-                 self.accumulated_content += content
-
-             self._process_chunk_usage(chunk)
-
-             return chunk
-
-         except StopAsyncIteration:
-             self._finalize_span()
-             raise
-         except Exception as e:
-             if self.span:
-                 self.span.record_exception(e)
-                 self.span.end()
-             raise
-
-
- class TracedSyncContextManager:
-     """Sync context manager wrapper for streaming methods."""
-
-     def __init__(
-         self,
-         tracer: Tracer,
-         context_manager: Any,
-         client: ApiClient,
-         span: Span,
-         model_name: str,
-     ):
-         self.tracer = tracer
-         self.context_manager = context_manager
-         self.client = client
-         self.span = span
-         self.stream: Optional[Any] = None
-         self.model_name = model_name
-
-     def __enter__(self):
-         self.stream = self.context_manager.__enter__()
-         return TracedGenerator(
-             self.tracer, self.stream, self.client, self.span, self.model_name
-         )
-
-     def __exit__(self, exc_type, exc_val, exc_tb):
-         return self.context_manager.__exit__(exc_type, exc_val, exc_tb)
-
-     def __del__(self):
-         """Cleanup span if not properly closed."""
-         if self.span:
-             try:
-                 self.span.end()
-             except Exception:
-                 pass
-
-
- class TracedAsyncContextManager:
-     """Async context manager wrapper for streaming methods."""
-
-     def __init__(
-         self,
-         tracer: Tracer,
-         context_manager: Any,
-         client: ApiClient,
-         span: Span,
-         model_name: str,
-     ):
-         self.tracer = tracer
-         self.context_manager = context_manager
-         self.client = client
-         self.span = span
-         self.stream: Optional[Any] = None
-         self.model_name = model_name
-
-     async def __aenter__(self):
-         self.stream = await self.context_manager.__aenter__()
-         return TracedAsyncGenerator(
-             self.tracer, self.stream, self.client, self.span, self.model_name
-         )
-
-     async def __aexit__(self, exc_type, exc_val, exc_tb):
-         return await self.context_manager.__aexit__(exc_type, exc_val, exc_tb)
-
-     def __del__(self):
-         """Cleanup span if not properly closed."""
-         if self.span:
-             try:
-                 self.span.end()
-             except Exception:
-                 pass
-
-
- def _extract_chunk_usage(client: ApiClient, chunk) -> Any:
-     """Extract usage data from streaming chunks based on provider."""
-     provider_type = _detect_provider(client)
-
-     if provider_type == ProviderType.OPENAI:
-         return _extract_openai_chunk_usage(chunk)
-     elif provider_type == ProviderType.ANTHROPIC:
-         return _extract_anthropic_chunk_usage(chunk)
-     elif provider_type == ProviderType.TOGETHER:
-         return _extract_together_chunk_usage(chunk)
-     elif provider_type == ProviderType.GROQ:
-         return _extract_groq_chunk_usage(chunk)
-     else:
-         # Default case - assume OpenAI-compatible for unknown providers
-         return _extract_openai_chunk_usage(chunk)
-
-
- def _extract_usage_tokens(client: ApiClient, usage_data) -> tuple[int, int, int, int]:
-     """Extract token counts from usage data based on provider."""
-     provider_type = _detect_provider(client)
-
-     if provider_type == ProviderType.OPENAI:
-         return _extract_openai_tokens(usage_data)
-     elif provider_type == ProviderType.ANTHROPIC:
-         return _extract_anthropic_tokens(usage_data)
-     elif provider_type == ProviderType.TOGETHER:
-         return _extract_together_tokens(usage_data)
-     elif provider_type == ProviderType.GROQ:
-         return _extract_groq_tokens(usage_data)
-     else:
-         # Default case - assume OpenAI-compatible for unknown providers
-         return _extract_openai_tokens(usage_data)
-
-
- def _process_usage_data(
-     span, usage_data, tracer: Tracer, client: ApiClient, model_name: str
- ):
-     """Process usage data and set span attributes."""
-     (
-         prompt_tokens,
-         completion_tokens,
-         cache_read_input_tokens,
-         cache_creation_input_tokens,
-     ) = _extract_usage_tokens(client, usage_data)
-
-     if prompt_tokens or completion_tokens:
-         final_model_name = getattr(usage_data, "model", None) or model_name
-
-         # Add provider prefixes for cost calculation
-         provider_type = _detect_provider(client)
-         if (
-             provider_type == ProviderType.TOGETHER
-             and final_model_name
-             and not final_model_name.startswith("together_ai/")
-         ):
-             final_model_name = "together_ai/" + final_model_name
-         elif (
-             provider_type == ProviderType.GROQ
-             and final_model_name
-             and not final_model_name.startswith("groq/")
-         ):
-             final_model_name = "groq/" + final_model_name
-
-         usage = _create_usage(
-             final_model_name,
-             prompt_tokens,
-             completion_tokens,
-             cache_read_input_tokens,
-             cache_creation_input_tokens,
-         )
-         _set_usage_attributes(span, usage, tracer)
-
-
- def _set_usage_attributes(span, usage: TraceUsage, tracer: Tracer):
-     """Set usage attributes on the span for non-streaming responses."""
-
-     set_span_attribute(span, AttributeKeys.GEN_AI_RESPONSE_MODEL, usage.model_name)
-     set_span_attribute(
-         span, AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS, usage.prompt_tokens
-     )
-     set_span_attribute(
-         span, AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS, usage.completion_tokens
-     )
-     set_span_attribute(
-         span, AttributeKeys.GEN_AI_USAGE_COMPLETION_TOKENS, usage.completion_tokens
-     )
-     set_span_attribute(
-         span, AttributeKeys.GEN_AI_USAGE_TOTAL_COST, usage.total_cost_usd
-     )
-
-
- def wrap_provider(tracer: Tracer, client: ApiClient) -> ApiClient:
-     """
-     Wraps an API client to add tracing capabilities.
-     Supports OpenAI, Together, Anthropic, Google GenAI, and Groq clients.
-     """
-
-     def wrapped(function, span_name):
-         @functools.wraps(function)
-         def wrapper(*args, **kwargs):
-             if kwargs.get("stream", False):
-                 span = tracer.get_tracer().start_span(
-                     span_name, attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
-                 )
-                 tracer.add_agent_attributes_to_span(span)
-                 set_span_attribute(
-                     span, AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
-                 )
-                 model_name = kwargs.get("model", "")
-
-                 # Add provider prefix for Groq clients
-                 if HAS_GROQ:
-                     from judgeval.tracer.llm.providers import groq_Groq, groq_AsyncGroq
-
-                     if (
-                         isinstance(client, (groq_Groq, groq_AsyncGroq))
-                         and model_name
-                         and not model_name.startswith("groq/")
-                     ):
-                         model_name = "groq/" + model_name
-
-                 response = function(*args, **kwargs)
-                 return TracedGenerator(tracer, response, client, span, model_name)
-             else:
-                 with sync_span_context(
-                     tracer, span_name, {AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
-                 ) as span:
-                     tracer.add_agent_attributes_to_span(span)
-                     set_span_attribute(
-                         span, AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
-                     )
-                     try:
-                         response = function(*args, **kwargs)
-                         output, usage = _format_output_data(client, response)
-                         set_span_attribute(
-                             span, AttributeKeys.GEN_AI_COMPLETION, output
-                         )
-                         if usage:
-                             _set_usage_attributes(span, usage, tracer)
-                         return response
-                     except Exception as e:
-                         span.record_exception(e)
-                         raise
-
-         return wrapper
-
-     def wrapped_async(function, span_name):
-         @functools.wraps(function)
-         async def wrapper(*args, **kwargs):
-             if kwargs.get("stream", False):
-                 span = tracer.get_tracer().start_span(
-                     span_name, attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
-                 )
-                 tracer.add_agent_attributes_to_span(span)
-                 set_span_attribute(
-                     span, AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
-                 )
-                 model_name = kwargs.get("model", "")
-
-                 # Add provider prefix for Groq clients
-                 if HAS_GROQ:
-                     from judgeval.tracer.llm.providers import groq_Groq, groq_AsyncGroq
-
-                     if (
-                         isinstance(client, (groq_Groq, groq_AsyncGroq))
-                         and model_name
-                         and not model_name.startswith("groq/")
-                     ):
-                         model_name = "groq/" + model_name
-
-                 response = await function(*args, **kwargs)
-                 return TracedAsyncGenerator(tracer, response, client, span, model_name)
-             else:
-                 async with async_span_context(
-                     tracer, span_name, {AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
-                 ) as span:
-                     tracer.add_agent_attributes_to_span(span)
-                     set_span_attribute(
-                         span, AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
-                     )
-                     try:
-                         response = await function(*args, **kwargs)
-                         output, usage = _format_output_data(client, response)
-                         set_span_attribute(
-                             span, AttributeKeys.GEN_AI_COMPLETION, output
-                         )
-                         if usage:
-                             _set_usage_attributes(span, usage, tracer)
-                         return response
-                     except Exception as e:
-                         span.record_exception(e)
-                         raise
-
-         return wrapper
-
-     def wrapped_sync_context_manager(function, span_name):
-         """Special wrapper for sync context manager methods."""
-
-         @functools.wraps(function)
-         def wrapper(*args, **kwargs):
-             span = tracer.get_tracer().start_span(
-                 span_name, attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
-             )
-             tracer.add_agent_attributes_to_span(span)
-             set_span_attribute(
-                 span, AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
-             )
-             model_name = kwargs.get("model", "")
-
-             # Add provider prefix for Groq clients
-             if HAS_GROQ:
-                 from judgeval.tracer.llm.providers import groq_Groq, groq_AsyncGroq
-
-                 if (
-                     isinstance(client, (groq_Groq, groq_AsyncGroq))
-                     and model_name
-                     and not model_name.startswith("groq/")
-                 ):
-                     model_name = "groq/" + model_name
-
-             original_context_manager = function(*args, **kwargs)
-             return TracedSyncContextManager(
-                 tracer, original_context_manager, client, span, model_name
-             )
-
-         return wrapper
-
-     def wrapped_async_context_manager(function, span_name):
-         """Special wrapper for async context manager methods."""
-
-         @functools.wraps(function)
-         def wrapper(*args, **kwargs):
-             span = tracer.get_tracer().start_span(
-                 span_name, attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
-             )
-             tracer.add_agent_attributes_to_span(span)
-             set_span_attribute(
-                 span, AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
-             )
-             model_name = kwargs.get("model", "")
-
-             # Add provider prefix for Groq clients
-             if HAS_GROQ:
-                 from judgeval.tracer.llm.providers import groq_Groq, groq_AsyncGroq
-
-                 if (
-                     isinstance(client, (groq_Groq, groq_AsyncGroq))
-                     and model_name
-                     and not model_name.startswith("groq/")
-                 ):
-                     model_name = "groq/" + model_name
-
-             original_context_manager = function(*args, **kwargs)
-             return TracedAsyncContextManager(
-                 tracer, original_context_manager, client, span, model_name
-             )
-
-         return wrapper
-
-     if HAS_OPENAI:
-         from judgeval.tracer.llm.providers import openai_OpenAI, openai_AsyncOpenAI
-
-         assert openai_OpenAI is not None, "OpenAI client not found"
-         assert openai_AsyncOpenAI is not None, "OpenAI async client not found"
-         span_name = "OPENAI_API_CALL"
-         if isinstance(client, openai_OpenAI):
-             setattr(
-                 client.chat.completions,
-                 "create",
-                 wrapped(client.chat.completions.create, span_name),
-             )
-             setattr(
-                 client.responses, "create", wrapped(client.responses.create, span_name)
-             )
-             setattr(
-                 client.beta.chat.completions,
-                 "parse",
-                 wrapped(client.beta.chat.completions.parse, span_name),
-             )
-         elif isinstance(client, openai_AsyncOpenAI):
-             setattr(
-                 client.chat.completions,
-                 "create",
-                 wrapped_async(client.chat.completions.create, span_name),
-             )
-             setattr(
-                 client.responses,
-                 "create",
-                 wrapped_async(client.responses.create, span_name),
-             )
-             setattr(
-                 client.beta.chat.completions,
-                 "parse",
-                 wrapped_async(client.beta.chat.completions.parse, span_name),
-             )
-
-     if HAS_TOGETHER:
-         from judgeval.tracer.llm.providers import (
-             together_Together,
-             together_AsyncTogether,
-         )
-
-         assert together_Together is not None, "Together client not found"
-         assert together_AsyncTogether is not None, "Together async client not found"
-         span_name = "TOGETHER_API_CALL"
-         if isinstance(client, together_Together):
-             setattr(
-                 client.chat.completions,
-                 "create",
-                 wrapped(client.chat.completions.create, span_name),
-             )
-         elif isinstance(client, together_AsyncTogether):
-             setattr(
-                 client.chat.completions,
-                 "create",
-                 wrapped_async(client.chat.completions.create, span_name),
-             )
-
-     if HAS_ANTHROPIC:
-         from judgeval.tracer.llm.providers import (
-             anthropic_Anthropic,
-             anthropic_AsyncAnthropic,
-         )
-
-         assert anthropic_Anthropic is not None, "Anthropic client not found"
-         assert anthropic_AsyncAnthropic is not None, "Anthropic async client not found"
-         span_name = "ANTHROPIC_API_CALL"
-         if isinstance(client, anthropic_Anthropic):
-             setattr(
-                 client.messages, "create", wrapped(client.messages.create, span_name)
-             )
-             setattr(
-                 client.messages,
-                 "stream",
-                 wrapped_sync_context_manager(client.messages.stream, span_name),
-             )
-         elif isinstance(client, anthropic_AsyncAnthropic):
-             setattr(
-                 client.messages,
-                 "create",
-                 wrapped_async(client.messages.create, span_name),
-             )
-             setattr(
-                 client.messages,
-                 "stream",
-                 wrapped_async_context_manager(client.messages.stream, span_name),
-             )
-
-     if HAS_GOOGLE_GENAI:
-         from judgeval.tracer.llm.providers import (
-             google_genai_Client,
-             google_genai_AsyncClient,
-         )
-
-         assert google_genai_Client is not None, "Google GenAI client not found"
-         assert google_genai_AsyncClient is not None, (
-             "Google GenAI async client not found"
-         )
-         span_name = "GOOGLE_API_CALL"
-         if isinstance(client, google_genai_Client):
-             setattr(
-                 client.models,
-                 "generate_content",
-                 wrapped(client.models.generate_content, span_name),
-             )
-         elif isinstance(client, google_genai_AsyncClient):
-             setattr(
-                 client.models,
-                 "generate_content",
-                 wrapped_async(client.models.generate_content, span_name),
-             )
-
-     if HAS_GROQ:
-         from judgeval.tracer.llm.providers import groq_Groq, groq_AsyncGroq
-
-         assert groq_Groq is not None, "Groq client not found"
-         assert groq_AsyncGroq is not None, "Groq async client not found"
-         span_name = "GROQ_API_CALL"
-         if isinstance(client, groq_Groq):
-             setattr(
-                 client.chat.completions,
-                 "create",
-                 wrapped(client.chat.completions.create, span_name),
-             )
-         elif isinstance(client, groq_AsyncGroq):
-             setattr(
-                 client.chat.completions,
-                 "create",
-                 wrapped_async(client.chat.completions.create, span_name),
-             )
-
-     return client
-
-
- def _format_output_data(
-     client: ApiClient, response: Any
- ) -> tuple[Optional[str], Optional[TraceUsage]]:
-     """Format output data from LLM response based on provider."""
-     provider_type = _detect_provider(client)
-
-     if provider_type == ProviderType.OPENAI:
-         return _format_openai_output(response)
-     elif provider_type == ProviderType.ANTHROPIC:
-         return _format_anthropic_output(response)
-     elif provider_type == ProviderType.TOGETHER:
-         return _format_together_output(response)
-     elif provider_type == ProviderType.GOOGLE:
-         return _format_google_output(response)
-     elif provider_type == ProviderType.GROQ:
-         return _format_groq_output(response)
-     else:
-         # Default case - assume OpenAI-compatible for unknown providers
-         judgeval_logger.info(
-             f"Unknown client type {type(client)}, assuming OpenAI-compatible"
-         )
-         return _format_openai_output(response)
-
-
- def _create_usage(
-     model_name: str,
-     prompt_tokens: int,
-     completion_tokens: int,
-     cache_read_input_tokens: int = 0,
-     cache_creation_input_tokens: int = 0,
- ) -> TraceUsage:
-     prompt_cost, completion_cost = cost_per_token(
-         model=model_name,
-         prompt_tokens=prompt_tokens,
-         completion_tokens=completion_tokens,
-         cache_read_input_tokens=cache_read_input_tokens,
-         cache_creation_input_tokens=cache_creation_input_tokens,
-     )
-     total_cost_usd = (
-         (prompt_cost + completion_cost) if prompt_cost and completion_cost else None
-     )
-     return TraceUsage(
-         prompt_tokens=prompt_tokens,
-         completion_tokens=completion_tokens,
-         total_tokens=prompt_tokens + completion_tokens,
-         cache_read_input_tokens=cache_read_input_tokens,
-         cache_creation_input_tokens=cache_creation_input_tokens,
-         prompt_tokens_cost_usd=prompt_cost,
-         completion_tokens_cost_usd=completion_cost,
-         total_cost_usd=total_cost_usd,
-         model_name=model_name,
-     )
+ __all__ = ["_detect_provider", "wrap_provider"]
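
In 0.16.1, judgeval/tracer/llm/__init__.py is reduced to re-exporting _detect_provider and wrap_provider from the new config module, with the provider-specific logic moved into the llm_anthropic, llm_google, llm_groq, llm_openai, and llm_together subpackages listed above. A minimal usage sketch follows, assuming the wrap_provider call shape from 0.15.0 (pass a tracer and a constructed client, get the same client back with traced methods) is preserved in 0.16.1; the Tracer() construction is illustrative and its actual arguments may differ.

    from openai import OpenAI

    from judgeval.tracer import Tracer
    from judgeval.tracer.llm import wrap_provider

    tracer = Tracer()  # illustrative; real initialization may take project/API settings
    # wrap_provider returns the same client instance with its create/parse methods wrapped
    client = wrap_provider(tracer, OpenAI())

    # Calls through the wrapped client should emit LLM spans, as they did in 0.15.0
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": "Hello"}],
    )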