judgeval 0.16.0__py3-none-any.whl → 0.16.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. judgeval/api/api_types.py +2 -1
  2. judgeval/data/judgment_types.py +2 -1
  3. judgeval/logger.py +1 -1
  4. judgeval/tracer/__init__.py +10 -7
  5. judgeval/tracer/keys.py +7 -3
  6. judgeval/tracer/llm/__init__.py +2 -1259
  7. judgeval/tracer/llm/config.py +110 -0
  8. judgeval/tracer/llm/constants.py +10 -0
  9. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  10. judgeval/tracer/llm/llm_anthropic/wrapper.py +611 -0
  11. judgeval/tracer/llm/llm_google/__init__.py +0 -0
  12. judgeval/tracer/llm/llm_google/config.py +24 -0
  13. judgeval/tracer/llm/llm_google/wrapper.py +426 -0
  14. judgeval/tracer/llm/llm_groq/__init__.py +0 -0
  15. judgeval/tracer/llm/llm_groq/config.py +23 -0
  16. judgeval/tracer/llm/llm_groq/wrapper.py +477 -0
  17. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  18. judgeval/tracer/llm/llm_openai/wrapper.py +637 -0
  19. judgeval/tracer/llm/llm_together/__init__.py +0 -0
  20. judgeval/tracer/llm/llm_together/config.py +23 -0
  21. judgeval/tracer/llm/llm_together/wrapper.py +478 -0
  22. judgeval/tracer/llm/providers.py +5 -5
  23. judgeval/tracer/processors/__init__.py +1 -1
  24. judgeval/trainer/console.py +1 -1
  25. judgeval/utils/decorators/__init__.py +0 -0
  26. judgeval/utils/decorators/dont_throw.py +21 -0
  27. judgeval/utils/{decorators.py → decorators/use_once.py} +0 -11
  28. judgeval/utils/meta.py +1 -1
  29. judgeval/utils/version_check.py +1 -1
  30. judgeval/version.py +1 -1
  31. {judgeval-0.16.0.dist-info → judgeval-0.16.2.dist-info}/METADATA +1 -1
  32. {judgeval-0.16.0.dist-info → judgeval-0.16.2.dist-info}/RECORD +37 -23
  33. judgeval/tracer/llm/google/__init__.py +0 -21
  34. judgeval/tracer/llm/groq/__init__.py +0 -20
  35. judgeval/tracer/llm/together/__init__.py +0 -20
  36. /judgeval/tracer/llm/{anthropic/__init__.py → llm_anthropic/config.py} +0 -0
  37. /judgeval/tracer/llm/{openai/__init__.py → llm_openai/config.py} +0 -0
  38. {judgeval-0.16.0.dist-info → judgeval-0.16.2.dist-info}/WHEEL +0 -0
  39. {judgeval-0.16.0.dist-info → judgeval-0.16.2.dist-info}/entry_points.txt +0 -0
  40. {judgeval-0.16.0.dist-info → judgeval-0.16.2.dist-info}/licenses/LICENSE.md +0 -0
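Taken together, the file list shows the shape of the 0.16.2 refactor: the monolithic judgeval/tracer/llm/__init__.py is split into a shared config.py plus per-provider llm_* subpackages, each with its own config.py and wrapper.py. A minimal import sketch of the new layout, assuming only the module paths listed above and the re-exports visible in the __init__.py hunk below (the symbols inside the new wrapper modules are not shown in this diff):

```python
# Hypothetical sketch of the 0.16.2 module layout; the paths come from the
# file list above, and the re-exported names come from the __init__.py hunk.
from judgeval.tracer.llm import _detect_provider, wrap_provider  # re-exported from .config
from judgeval.tracer.llm.llm_openai import wrapper as openai_wrapper  # new provider subpackage
```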
judgeval/tracer/llm/__init__.py
@@ -1,1264 +1,7 @@
1
1
  from __future__ import annotations
2
- import functools
3
- from typing import (
4
- Tuple,
5
- Optional,
6
- Any,
7
- TYPE_CHECKING,
8
- Union,
9
- AsyncGenerator,
10
- Generator,
11
- Iterator,
12
- AsyncIterator,
13
- )
14
- from functools import wraps
15
- from enum import Enum
16
- from judgeval.data.trace import TraceUsage
17
- from judgeval.logger import judgeval_logger
18
- from litellm.cost_calculator import cost_per_token as _original_cost_per_token
19
- from opentelemetry.trace import Span
20
2
 
21
- from judgeval.tracer.llm.providers import (
22
- HAS_OPENAI,
23
- HAS_TOGETHER,
24
- HAS_ANTHROPIC,
25
- HAS_GOOGLE_GENAI,
26
- HAS_GROQ,
27
- ApiClient,
28
- )
29
- from judgeval.tracer.managers import sync_span_context, async_span_context
30
- from judgeval.tracer.keys import AttributeKeys
31
- from judgeval.utils.serialize import safe_serialize
32
- from judgeval.tracer.utils import set_span_attribute
33
3
 
34
- if TYPE_CHECKING:
35
- from judgeval.tracer import Tracer
4
+ from .config import _detect_provider, wrap_provider
36
5
 
37
6
 
38
- class ProviderType(Enum):
39
- """Enum for different LLM provider types."""
40
-
41
- OPENAI = "openai"
42
- ANTHROPIC = "anthropic"
43
- TOGETHER = "together"
44
- GOOGLE = "google"
45
- GROQ = "groq"
46
- DEFAULT = "default"
47
-
48
-
49
- @wraps(_original_cost_per_token)
50
- def cost_per_token(
51
- *args: Any, **kwargs: Any
52
- ) -> Tuple[Optional[float], Optional[float]]:
53
- try:
54
- prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = (
55
- _original_cost_per_token(*args, **kwargs)
56
- )
57
- if (
58
- prompt_tokens_cost_usd_dollar == 0
59
- and completion_tokens_cost_usd_dollar == 0
60
- ):
61
- judgeval_logger.warning("LiteLLM returned a total of 0 for cost per token")
62
- return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
63
- except Exception as e:
64
- judgeval_logger.warning(f"Error calculating cost per token: {e}")
65
- return None, None
66
-
67
-
68
- def _detect_provider(client: ApiClient) -> ProviderType:
69
- """Detect the provider type of the client once to avoid repeated isinstance checks."""
70
- if HAS_OPENAI:
71
- from judgeval.tracer.llm.providers import openai_OpenAI, openai_AsyncOpenAI
72
-
73
- assert openai_OpenAI is not None, "OpenAI client not found"
74
- assert openai_AsyncOpenAI is not None, "OpenAI async client not found"
75
- if isinstance(client, (openai_OpenAI, openai_AsyncOpenAI)):
76
- return ProviderType.OPENAI
77
-
78
- if HAS_ANTHROPIC:
79
- from judgeval.tracer.llm.providers import (
80
- anthropic_Anthropic,
81
- anthropic_AsyncAnthropic,
82
- )
83
-
84
- assert anthropic_Anthropic is not None, "Anthropic client not found"
85
- assert anthropic_AsyncAnthropic is not None, "Anthropic async client not found"
86
- if isinstance(client, (anthropic_Anthropic, anthropic_AsyncAnthropic)):
87
- return ProviderType.ANTHROPIC
88
-
89
- if HAS_TOGETHER:
90
- from judgeval.tracer.llm.providers import (
91
- together_Together,
92
- together_AsyncTogether,
93
- )
94
-
95
- assert together_Together is not None, "Together client not found"
96
- assert together_AsyncTogether is not None, "Together async client not found"
97
- if isinstance(client, (together_Together, together_AsyncTogether)):
98
- return ProviderType.TOGETHER
99
-
100
- if HAS_GOOGLE_GENAI:
101
- from judgeval.tracer.llm.providers import (
102
- google_genai_Client,
103
- google_genai_AsyncClient,
104
- )
105
-
106
- assert google_genai_Client is not None, "Google GenAI client not found"
107
- assert google_genai_AsyncClient is not None, (
108
- "Google GenAI async client not found"
109
- )
110
- if isinstance(client, (google_genai_Client, google_genai_AsyncClient)):
111
- return ProviderType.GOOGLE
112
-
113
- if HAS_GROQ:
114
- from judgeval.tracer.llm.providers import groq_Groq, groq_AsyncGroq
115
-
116
- assert groq_Groq is not None, "Groq client not found"
117
- assert groq_AsyncGroq is not None, "Groq async client not found"
118
- if isinstance(client, (groq_Groq, groq_AsyncGroq)):
119
- return ProviderType.GROQ
120
-
121
- return ProviderType.DEFAULT
122
-
123
-
124
- # Provider-specific content extraction handlers
125
- def _extract_openai_content(chunk) -> str:
126
- """Extract content from OpenAI streaming chunk."""
127
- if (
128
- hasattr(chunk, "choices")
129
- and chunk.choices
130
- and hasattr(chunk.choices[0], "delta")
131
- ):
132
- delta_content = getattr(chunk.choices[0].delta, "content", None)
133
- if delta_content:
134
- return delta_content
135
- return ""
136
-
137
-
138
- def _extract_anthropic_content(chunk) -> str:
139
- """Extract content from Anthropic streaming chunk."""
140
- if hasattr(chunk, "type"):
141
- if chunk.type == "content_block_delta":
142
- if hasattr(chunk, "delta"):
143
- if hasattr(chunk.delta, "text"):
144
- return chunk.delta.text or ""
145
- elif hasattr(chunk.delta, "partial_json"):
146
- # Tool use input streaming - return raw JSON to accumulate properly
147
- return chunk.delta.partial_json or ""
148
- elif chunk.type == "content_block_start":
149
- if hasattr(chunk, "content_block") and hasattr(chunk.content_block, "type"):
150
- if chunk.content_block.type == "tool_use":
151
- tool_info = {
152
- "type": "tool_use",
153
- "id": getattr(chunk.content_block, "id", None),
154
- "name": getattr(chunk.content_block, "name", None),
155
- }
156
- return f"[TOOL_USE_START: {tool_info}]"
157
- elif hasattr(chunk, "delta") and hasattr(chunk.delta, "text"):
158
- return chunk.delta.text or ""
159
- elif hasattr(chunk, "text"):
160
- return chunk.text or ""
161
- return ""
162
-
163
-
164
- def _extract_together_content(chunk) -> str:
165
- """Extract content from Together streaming chunk."""
166
- if hasattr(chunk, "choices") and chunk.choices:
167
- choice = chunk.choices[0]
168
- if hasattr(choice, "delta") and hasattr(choice.delta, "content"):
169
- return choice.delta.content or ""
170
- return ""
171
-
172
-
173
- def _extract_groq_content(chunk) -> str:
174
- """Extract content from Groq streaming chunk."""
175
- if hasattr(chunk, "choices") and chunk.choices:
176
- choice = chunk.choices[0]
177
- if hasattr(choice, "delta") and hasattr(choice.delta, "content"):
178
- return choice.delta.content or ""
179
- return ""
180
-
181
-
182
- # Provider-specific chunk usage extraction handlers
183
- def _extract_openai_chunk_usage(chunk) -> Any:
184
- """Extract usage data from OpenAI streaming chunk."""
185
- if hasattr(chunk, "usage") and chunk.usage:
186
- return chunk.usage
187
- return None
188
-
189
-
190
- def _extract_anthropic_chunk_usage(chunk) -> Any:
191
- """Extract usage data from Anthropic streaming chunk."""
192
- if hasattr(chunk, "type"):
193
- if chunk.type == "message_start":
194
- if hasattr(chunk, "message") and hasattr(chunk.message, "usage"):
195
- return chunk.message.usage
196
- elif chunk.type == "message_delta":
197
- if hasattr(chunk, "usage"):
198
- return chunk.usage
199
- elif chunk.type == "message_stop":
200
- if hasattr(chunk, "usage"):
201
- return chunk.usage
202
- return None
203
-
204
-
205
- def _extract_together_chunk_usage(chunk) -> Any:
206
- """Extract usage data from Together streaming chunk."""
207
- if hasattr(chunk, "usage") and chunk.usage:
208
- return chunk.usage
209
- return None
210
-
211
-
212
- def _extract_groq_chunk_usage(chunk) -> Any:
213
- """Extract usage data from Groq streaming chunk."""
214
- # Groq provides usage data in the last chunk when stream_options={"include_usage": True} is used
215
- if hasattr(chunk, "usage") and chunk.usage:
216
- return chunk.usage
217
- return None
218
-
219
-
220
- # Provider-specific token extraction handlers
221
- def _extract_openai_tokens(usage_data) -> tuple[int, int, int, int]:
222
- """Extract token counts from OpenAI usage data."""
223
- prompt_tokens = (
224
- usage_data.prompt_tokens
225
- if hasattr(usage_data, "prompt_tokens") and usage_data.prompt_tokens is not None
226
- else 0
227
- )
228
- completion_tokens = (
229
- usage_data.completion_tokens
230
- if hasattr(usage_data, "completion_tokens")
231
- and usage_data.completion_tokens is not None
232
- else 0
233
- )
234
- return prompt_tokens, completion_tokens, 0, 0
235
-
236
-
237
- def _extract_anthropic_tokens(usage_data) -> tuple[int, int, int, int]:
238
- """Extract token counts from Anthropic usage data."""
239
- prompt_tokens = (
240
- usage_data.input_tokens
241
- if hasattr(usage_data, "input_tokens") and usage_data.input_tokens is not None
242
- else 0
243
- )
244
- completion_tokens = (
245
- usage_data.output_tokens
246
- if hasattr(usage_data, "output_tokens") and usage_data.output_tokens is not None
247
- else 0
248
- )
249
- cache_read_input_tokens = (
250
- usage_data.cache_read_input_tokens
251
- if hasattr(usage_data, "cache_read_input_tokens")
252
- and usage_data.cache_read_input_tokens is not None
253
- else 0
254
- )
255
- cache_creation_input_tokens = (
256
- usage_data.cache_creation_input_tokens
257
- if hasattr(usage_data, "cache_creation_input_tokens")
258
- and usage_data.cache_creation_input_tokens is not None
259
- else 0
260
- )
261
- return (
262
- prompt_tokens,
263
- completion_tokens,
264
- cache_read_input_tokens,
265
- cache_creation_input_tokens,
266
- )
267
-
268
-
269
- def _extract_together_tokens(usage_data) -> tuple[int, int, int, int]:
270
- """Extract token counts from Together usage data."""
271
- prompt_tokens = (
272
- usage_data.prompt_tokens
273
- if hasattr(usage_data, "prompt_tokens") and usage_data.prompt_tokens is not None
274
- else 0
275
- )
276
- completion_tokens = (
277
- usage_data.completion_tokens
278
- if hasattr(usage_data, "completion_tokens")
279
- and usage_data.completion_tokens is not None
280
- else 0
281
- )
282
- return prompt_tokens, completion_tokens, 0, 0
283
-
284
-
285
- def _extract_groq_tokens(usage_data) -> tuple[int, int, int, int]:
286
- """Extract token counts from Groq usage data."""
287
- prompt_tokens = (
288
- usage_data.prompt_tokens
289
- if hasattr(usage_data, "prompt_tokens") and usage_data.prompt_tokens is not None
290
- else 0
291
- )
292
- completion_tokens = (
293
- usage_data.completion_tokens
294
- if hasattr(usage_data, "completion_tokens")
295
- and usage_data.completion_tokens is not None
296
- else 0
297
- )
298
- # Extract cached tokens from prompt_tokens_details.cached_tokens
299
- cache_read_input_tokens = 0
300
- if (
301
- hasattr(usage_data, "prompt_tokens_details")
302
- and usage_data.prompt_tokens_details
303
- ):
304
- if (
305
- hasattr(usage_data.prompt_tokens_details, "cached_tokens")
306
- and usage_data.prompt_tokens_details.cached_tokens is not None
307
- ):
308
- cache_read_input_tokens = usage_data.prompt_tokens_details.cached_tokens
309
-
310
- return prompt_tokens, completion_tokens, cache_read_input_tokens, 0
311
-
312
-
313
- # Provider-specific output formatting handlers
314
- def _format_openai_output(response: Any) -> tuple[Optional[str], Optional[TraceUsage]]:
315
- """Format output data from OpenAI response."""
316
- from judgeval.tracer.llm.providers import (
317
- openai_ChatCompletion,
318
- openai_Response,
319
- openai_ParsedChatCompletion,
320
- )
321
-
322
- model_name = None
323
- message_content = None
324
- prompt_tokens = 0
325
- completion_tokens = 0
326
- cache_read_input_tokens = 0
327
- cache_creation_input_tokens = 0
328
-
329
- if openai_ChatCompletion and isinstance(response, openai_ChatCompletion):
330
- model_name = response.model or ""
331
- prompt_tokens = (
332
- response.usage.prompt_tokens
333
- if response.usage and response.usage.prompt_tokens is not None
334
- else 0
335
- )
336
- completion_tokens = (
337
- response.usage.completion_tokens
338
- if response.usage and response.usage.completion_tokens is not None
339
- else 0
340
- )
341
- cache_read_input_tokens = (
342
- response.usage.prompt_tokens_details.cached_tokens
343
- if response.usage
344
- and response.usage.prompt_tokens_details
345
- and response.usage.prompt_tokens_details.cached_tokens is not None
346
- else 0
347
- )
348
-
349
- if openai_ParsedChatCompletion and isinstance(
350
- response, openai_ParsedChatCompletion
351
- ):
352
- message_content = response.choices[0].message.parsed
353
- else:
354
- message_content = response.choices[0].message.content
355
- elif openai_Response and isinstance(response, openai_Response):
356
- model_name = response.model or ""
357
- prompt_tokens = (
358
- response.usage.input_tokens
359
- if response.usage and response.usage.input_tokens is not None
360
- else 0
361
- )
362
- completion_tokens = (
363
- response.usage.output_tokens
364
- if response.usage and response.usage.output_tokens is not None
365
- else 0
366
- )
367
- cache_read_input_tokens = (
368
- response.usage.input_tokens_details.cached_tokens
369
- if response.usage
370
- and response.usage.input_tokens_details
371
- and response.usage.input_tokens_details.cached_tokens is not None
372
- else 0
373
- )
374
- output0 = response.output[0]
375
- if (
376
- hasattr(output0, "content")
377
- and output0.content
378
- and hasattr(output0.content, "__iter__")
379
- ):
380
- message_content = "".join(
381
- seg.text for seg in output0.content if hasattr(seg, "text") and seg.text
382
- )
383
-
384
- if model_name:
385
- return message_content, _create_usage(
386
- model_name,
387
- prompt_tokens,
388
- completion_tokens,
389
- cache_read_input_tokens,
390
- cache_creation_input_tokens,
391
- )
392
-
393
- return None, None
394
-
395
-
396
- def _format_anthropic_output(
397
- response: Any,
398
- ) -> tuple[Optional[str], Optional[TraceUsage]]:
399
- """Format output data from Anthropic response."""
400
- model_name = getattr(response, "model", "") or ""
401
- usage = getattr(response, "usage", None)
402
- prompt_tokens = (
403
- usage.input_tokens
404
- if usage and hasattr(usage, "input_tokens") and usage.input_tokens is not None
405
- else 0
406
- )
407
- completion_tokens = (
408
- usage.output_tokens
409
- if usage and hasattr(usage, "output_tokens") and usage.output_tokens is not None
410
- else 0
411
- )
412
- cache_read_input_tokens = (
413
- usage.cache_read_input_tokens
414
- if usage
415
- and hasattr(usage, "cache_read_input_tokens")
416
- and usage.cache_read_input_tokens is not None
417
- else 0
418
- )
419
- cache_creation_input_tokens = (
420
- usage.cache_creation_input_tokens
421
- if usage
422
- and hasattr(usage, "cache_creation_input_tokens")
423
- and usage.cache_creation_input_tokens is not None
424
- else 0
425
- )
426
- # Extract content from Anthropic response, handling both text and tool use blocks
427
- message_content = None
428
- if hasattr(response, "content") and response.content:
429
- content_parts = []
430
- for content_block in response.content:
431
- block_type = getattr(content_block, "type", None)
432
- if block_type == "text":
433
- # Text content block
434
- content_parts.append(getattr(content_block, "text", ""))
435
- elif block_type == "tool_use":
436
- # Tool use block - serialize the tool call information
437
- tool_info = {
438
- "type": "tool_use",
439
- "id": getattr(content_block, "id", None),
440
- "name": getattr(content_block, "name", None),
441
- "input": getattr(content_block, "input", None),
442
- }
443
- content_parts.append(f"[TOOL_USE: {tool_info}]")
444
- message_content = "\n".join(content_parts) if content_parts else None
445
-
446
- if model_name:
447
- return message_content, _create_usage(
448
- model_name,
449
- prompt_tokens,
450
- completion_tokens,
451
- cache_read_input_tokens,
452
- cache_creation_input_tokens,
453
- )
454
-
455
- return None, None
456
-
457
-
458
- def _format_together_output(
459
- response: Any,
460
- ) -> tuple[Optional[str], Optional[TraceUsage]]:
461
- """Format output data from Together response."""
462
- model_name = (response.model or "") if hasattr(response, "model") else ""
463
- prompt_tokens = (
464
- response.usage.prompt_tokens
465
- if hasattr(response.usage, "prompt_tokens")
466
- and response.usage.prompt_tokens is not None
467
- else 0
468
- )
469
- completion_tokens = (
470
- response.usage.completion_tokens
471
- if hasattr(response.usage, "completion_tokens")
472
- and response.usage.completion_tokens is not None
473
- else 0
474
- )
475
- message_content = (
476
- response.choices[0].message.content if hasattr(response, "choices") else None
477
- )
478
-
479
- if model_name:
480
- model_name = "together_ai/" + model_name
481
- return message_content, _create_usage(
482
- model_name,
483
- prompt_tokens,
484
- completion_tokens,
485
- 0,
486
- 0,
487
- )
488
-
489
- return None, None
490
-
491
-
492
- def _format_google_output(response: Any) -> tuple[Optional[str], Optional[TraceUsage]]:
493
- """Format output data from Google GenAI response."""
494
- model_name = getattr(response, "model_version", "") or ""
495
- usage_metadata = getattr(response, "usage_metadata", None)
496
- prompt_tokens = (
497
- usage_metadata.prompt_token_count
498
- if usage_metadata
499
- and hasattr(usage_metadata, "prompt_token_count")
500
- and usage_metadata.prompt_token_count is not None
501
- else 0
502
- )
503
- completion_tokens = (
504
- usage_metadata.candidates_token_count
505
- if usage_metadata
506
- and hasattr(usage_metadata, "candidates_token_count")
507
- and usage_metadata.candidates_token_count is not None
508
- else 0
509
- )
510
- message_content = (
511
- response.candidates[0].content.parts[0].text
512
- if hasattr(response, "candidates")
513
- else None
514
- )
515
-
516
- cache_read_input_tokens = 0
517
- if usage_metadata and hasattr(usage_metadata, "cached_content_token_count"):
518
- cache_read_input_tokens = usage_metadata.cached_content_token_count or 0
519
-
520
- if model_name:
521
- return message_content, _create_usage(
522
- model_name,
523
- prompt_tokens,
524
- completion_tokens,
525
- cache_read_input_tokens,
526
- 0,
527
- )
528
-
529
- return None, None
530
-
531
-
532
- def _format_groq_output(response: Any) -> tuple[Optional[str], Optional[TraceUsage]]:
533
- """Format output data from Groq response."""
534
- model_name = (response.model or "") if hasattr(response, "model") else ""
535
- prompt_tokens = (
536
- response.usage.prompt_tokens
537
- if hasattr(response.usage, "prompt_tokens")
538
- and response.usage.prompt_tokens is not None
539
- else 0
540
- )
541
- completion_tokens = (
542
- response.usage.completion_tokens
543
- if hasattr(response.usage, "completion_tokens")
544
- and response.usage.completion_tokens is not None
545
- else 0
546
- )
547
- # Extract cached tokens from prompt_tokens_details.cached_tokens
548
- cache_read_input_tokens = 0
549
- if (
550
- hasattr(response, "usage")
551
- and response.usage
552
- and hasattr(response.usage, "prompt_tokens_details")
553
- and response.usage.prompt_tokens_details
554
- ):
555
- if (
556
- hasattr(response.usage.prompt_tokens_details, "cached_tokens")
557
- and response.usage.prompt_tokens_details.cached_tokens is not None
558
- ):
559
- cache_read_input_tokens = response.usage.prompt_tokens_details.cached_tokens
560
-
561
- message_content = (
562
- response.choices[0].message.content if hasattr(response, "choices") else None
563
- )
564
-
565
- if model_name:
566
- model_name = "groq/" + model_name
567
- return message_content, _create_usage(
568
- model_name,
569
- prompt_tokens,
570
- completion_tokens,
571
- cache_read_input_tokens,
572
- 0,
573
- )
574
-
575
- return None, None
576
-
577
-
578
- class _TracedGeneratorBase:
579
- """Base class with common logic for parsing stream chunks."""
580
-
581
- __slots__ = (
582
- "tracer",
583
- "client",
584
- "span",
585
- "accumulated_content",
586
- "model_name",
587
- "provider_type",
588
- )
589
-
590
- tracer: Tracer
591
- client: ApiClient
592
- span: Span
593
- accumulated_content: str
594
- model_name: str
595
- provider_type: ProviderType
596
-
597
- def __init__(self, tracer: Tracer, client: ApiClient, span: Span, model_name: str):
598
- """Initialize the base traced generator.
599
-
600
- Args:
601
- tracer: The tracer instance
602
- client: The API client
603
- span: The OpenTelemetry span
604
- model_name: The model name (empty string default allows fallback to usage_data.model)
605
- """
606
- self.tracer = tracer
607
- self.client = client
608
- self.span = span
609
- self.accumulated_content = ""
610
- self.model_name = model_name
611
- self.provider_type = _detect_provider(client)
612
-
613
- def _extract_content(self, chunk) -> str:
614
- """Extract content from streaming chunk based on provider."""
615
- if self.provider_type == ProviderType.OPENAI:
616
- return _extract_openai_content(chunk)
617
- elif self.provider_type == ProviderType.ANTHROPIC:
618
- return _extract_anthropic_content(chunk)
619
- elif self.provider_type == ProviderType.TOGETHER:
620
- return _extract_together_content(chunk)
621
- elif self.provider_type == ProviderType.GROQ:
622
- return _extract_groq_content(chunk)
623
- else:
624
- # Default case - assume OpenAI-compatible for unknown providers
625
- return _extract_openai_content(chunk)
626
-
627
- def _process_chunk_usage(self, chunk):
628
- """Process usage data from streaming chunks based on provider."""
629
- usage_data = _extract_chunk_usage(self.client, chunk)
630
- if usage_data:
631
- _process_usage_data(
632
- self.span, usage_data, self.tracer, self.client, self.model_name
633
- )
634
-
635
- def __del__(self):
636
- """
637
- Fallback cleanup for unclosed spans. This is a safety mechanism only - spans
638
- should normally be finalized in StopIteration/StopAsyncIteration handlers.
639
-
640
- Note: __del__ is not guaranteed to be called in all situations (e.g., reference
641
- cycles, program exit), so this should not be relied upon as the primary cleanup
642
- mechanism. The primary finalization happens in the iterator protocol methods.
643
- """
644
- if self.span:
645
- try:
646
- self._finalize_span()
647
- except Exception as e:
648
- judgeval_logger.warning(
649
- f"Error during span finalization in __del__: {e}"
650
- )
651
-
652
- def _finalize_span(self):
653
- """Finalize the span by setting completion content and ending it."""
654
- if self.span:
655
- set_span_attribute(
656
- self.span, AttributeKeys.GEN_AI_COMPLETION, self.accumulated_content
657
- )
658
- self.span.end()
659
- self.span = None
660
-
661
-
662
- class TracedGenerator(_TracedGeneratorBase):
663
- """Generator wrapper that adds OpenTelemetry tracing without consuming the stream."""
664
-
665
- __slots__ = ("generator",)
666
-
667
- generator: Union[Generator[Any, None, None], Iterator[Any]]
668
-
669
- def __init__(
670
- self,
671
- tracer: Tracer,
672
- generator: Union[Generator[Any, None, None], Iterator[Any]],
673
- client: ApiClient,
674
- span: Span,
675
- model_name: str,
676
- ):
677
- super().__init__(tracer, client, span, model_name)
678
- self.generator = generator
679
-
680
- def __iter__(self):
681
- return self
682
-
683
- def __next__(self):
684
- try:
685
- chunk = next(self.generator)
686
-
687
- content = self._extract_content(chunk)
688
- if content:
689
- self.accumulated_content += content
690
- self._process_chunk_usage(chunk)
691
-
692
- return chunk
693
-
694
- except StopIteration:
695
- self._finalize_span()
696
- raise
697
- except Exception as e:
698
- if self.span:
699
- self.span.record_exception(e)
700
- self.span.end()
701
- raise
702
-
703
-
704
- class TracedAsyncGenerator(_TracedGeneratorBase):
705
- """Async generator wrapper that adds OpenTelemetry tracing without consuming the stream."""
706
-
707
- __slots__ = ("async_generator",)
708
-
709
- async_generator: Union[AsyncGenerator[Any, None], AsyncIterator[Any]]
710
-
711
- def __init__(
712
- self,
713
- tracer: Tracer,
714
- async_generator: Union[AsyncGenerator[Any, None], AsyncIterator[Any]],
715
- client: ApiClient,
716
- span: Span,
717
- model_name: str,
718
- ):
719
- super().__init__(tracer, client, span, model_name)
720
- self.async_generator = async_generator
721
-
722
- def __aiter__(self):
723
- return self
724
-
725
- async def __anext__(self):
726
- try:
727
- chunk = await self.async_generator.__anext__()
728
-
729
- content = self._extract_content(chunk)
730
- if content:
731
- self.accumulated_content += content
732
-
733
- self._process_chunk_usage(chunk)
734
-
735
- return chunk
736
-
737
- except StopAsyncIteration:
738
- self._finalize_span()
739
- raise
740
- except Exception as e:
741
- if self.span:
742
- self.span.record_exception(e)
743
- self.span.end()
744
- raise
745
-
746
-
747
- class TracedSyncContextManager:
748
- """Sync context manager wrapper for streaming methods."""
749
-
750
- def __init__(
751
- self,
752
- tracer: Tracer,
753
- context_manager: Any,
754
- client: ApiClient,
755
- span: Span,
756
- model_name: str,
757
- ):
758
- self.tracer = tracer
759
- self.context_manager = context_manager
760
- self.client = client
761
- self.span = span
762
- self.stream: Optional[Any] = None
763
- self.model_name = model_name
764
-
765
- def __enter__(self):
766
- self.stream = self.context_manager.__enter__()
767
- return TracedGenerator(
768
- self.tracer, self.stream, self.client, self.span, self.model_name
769
- )
770
-
771
- def __exit__(self, exc_type, exc_val, exc_tb):
772
- return self.context_manager.__exit__(exc_type, exc_val, exc_tb)
773
-
774
- def __del__(self):
775
- """Cleanup span if not properly closed."""
776
- if self.span:
777
- try:
778
- self.span.end()
779
- except Exception:
780
- pass
781
-
782
-
783
- class TracedAsyncContextManager:
784
- """Async context manager wrapper for streaming methods."""
785
-
786
- def __init__(
787
- self,
788
- tracer: Tracer,
789
- context_manager: Any,
790
- client: ApiClient,
791
- span: Span,
792
- model_name: str,
793
- ):
794
- self.tracer = tracer
795
- self.context_manager = context_manager
796
- self.client = client
797
- self.span = span
798
- self.stream: Optional[Any] = None
799
- self.model_name = model_name
800
-
801
- async def __aenter__(self):
802
- self.stream = await self.context_manager.__aenter__()
803
- return TracedAsyncGenerator(
804
- self.tracer, self.stream, self.client, self.span, self.model_name
805
- )
806
-
807
- async def __aexit__(self, exc_type, exc_val, exc_tb):
808
- return await self.context_manager.__aexit__(exc_type, exc_val, exc_tb)
809
-
810
- def __del__(self):
811
- """Cleanup span if not properly closed."""
812
- if self.span:
813
- try:
814
- self.span.end()
815
- except Exception:
816
- pass
817
-
818
-
819
- def _extract_chunk_usage(client: ApiClient, chunk) -> Any:
820
- """Extract usage data from streaming chunks based on provider."""
821
- provider_type = _detect_provider(client)
822
-
823
- if provider_type == ProviderType.OPENAI:
824
- return _extract_openai_chunk_usage(chunk)
825
- elif provider_type == ProviderType.ANTHROPIC:
826
- return _extract_anthropic_chunk_usage(chunk)
827
- elif provider_type == ProviderType.TOGETHER:
828
- return _extract_together_chunk_usage(chunk)
829
- elif provider_type == ProviderType.GROQ:
830
- return _extract_groq_chunk_usage(chunk)
831
- else:
832
- # Default case - assume OpenAI-compatible for unknown providers
833
- return _extract_openai_chunk_usage(chunk)
834
-
835
-
836
- def _extract_usage_tokens(client: ApiClient, usage_data) -> tuple[int, int, int, int]:
837
- """Extract token counts from usage data based on provider."""
838
- provider_type = _detect_provider(client)
839
-
840
- if provider_type == ProviderType.OPENAI:
841
- return _extract_openai_tokens(usage_data)
842
- elif provider_type == ProviderType.ANTHROPIC:
843
- return _extract_anthropic_tokens(usage_data)
844
- elif provider_type == ProviderType.TOGETHER:
845
- return _extract_together_tokens(usage_data)
846
- elif provider_type == ProviderType.GROQ:
847
- return _extract_groq_tokens(usage_data)
848
- else:
849
- # Default case - assume OpenAI-compatible for unknown providers
850
- return _extract_openai_tokens(usage_data)
851
-
852
-
853
- def _process_usage_data(
854
- span, usage_data, tracer: Tracer, client: ApiClient, model_name: str
855
- ):
856
- """Process usage data and set span attributes."""
857
- (
858
- prompt_tokens,
859
- completion_tokens,
860
- cache_read_input_tokens,
861
- cache_creation_input_tokens,
862
- ) = _extract_usage_tokens(client, usage_data)
863
-
864
- if prompt_tokens or completion_tokens:
865
- final_model_name = getattr(usage_data, "model", None) or model_name
866
-
867
- # Add provider prefixes for cost calculation
868
- provider_type = _detect_provider(client)
869
- if (
870
- provider_type == ProviderType.TOGETHER
871
- and final_model_name
872
- and not final_model_name.startswith("together_ai/")
873
- ):
874
- final_model_name = "together_ai/" + final_model_name
875
- elif (
876
- provider_type == ProviderType.GROQ
877
- and final_model_name
878
- and not final_model_name.startswith("groq/")
879
- ):
880
- final_model_name = "groq/" + final_model_name
881
-
882
- usage = _create_usage(
883
- final_model_name,
884
- prompt_tokens,
885
- completion_tokens,
886
- cache_read_input_tokens,
887
- cache_creation_input_tokens,
888
- )
889
- _set_usage_attributes(span, usage, tracer)
890
-
891
-
892
- def _set_usage_attributes(span, usage: TraceUsage, tracer: Tracer):
893
- """Set usage attributes on the span for non-streaming responses."""
894
-
895
- set_span_attribute(span, AttributeKeys.GEN_AI_RESPONSE_MODEL, usage.model_name)
896
- set_span_attribute(
897
- span, AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS, usage.prompt_tokens
898
- )
899
- set_span_attribute(
900
- span, AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS, usage.completion_tokens
901
- )
902
- set_span_attribute(
903
- span, AttributeKeys.GEN_AI_USAGE_COMPLETION_TOKENS, usage.completion_tokens
904
- )
905
- set_span_attribute(
906
- span, AttributeKeys.GEN_AI_USAGE_TOTAL_COST, usage.total_cost_usd
907
- )
908
-
909
-
910
- def wrap_provider(tracer: Tracer, client: ApiClient) -> ApiClient:
911
- """
912
- Wraps an API client to add tracing capabilities.
913
- Supports OpenAI, Together, Anthropic, Google GenAI, and Groq clients.
914
- """
915
-
916
- def wrapped(function, span_name):
917
- @functools.wraps(function)
918
- def wrapper(*args, **kwargs):
919
- if kwargs.get("stream", False):
920
- span = tracer.get_tracer().start_span(
921
- span_name, attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
922
- )
923
- tracer.add_agent_attributes_to_span(span)
924
- set_span_attribute(
925
- span, AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
926
- )
927
- model_name = kwargs.get("model", "")
928
-
929
- # Add provider prefix for Groq clients
930
- if HAS_GROQ:
931
- from judgeval.tracer.llm.providers import groq_Groq, groq_AsyncGroq
932
-
933
- if (
934
- isinstance(client, (groq_Groq, groq_AsyncGroq))
935
- and model_name
936
- and not model_name.startswith("groq/")
937
- ):
938
- model_name = "groq/" + model_name
939
-
940
- response = function(*args, **kwargs)
941
- return TracedGenerator(tracer, response, client, span, model_name)
942
- else:
943
- with sync_span_context(
944
- tracer, span_name, {AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
945
- ) as span:
946
- tracer.add_agent_attributes_to_span(span)
947
- set_span_attribute(
948
- span, AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
949
- )
950
- try:
951
- response = function(*args, **kwargs)
952
- output, usage = _format_output_data(client, response)
953
- set_span_attribute(
954
- span, AttributeKeys.GEN_AI_COMPLETION, output
955
- )
956
- if usage:
957
- _set_usage_attributes(span, usage, tracer)
958
- return response
959
- except Exception as e:
960
- span.record_exception(e)
961
- raise
962
-
963
- return wrapper
964
-
965
- def wrapped_async(function, span_name):
966
- @functools.wraps(function)
967
- async def wrapper(*args, **kwargs):
968
- if kwargs.get("stream", False):
969
- span = tracer.get_tracer().start_span(
970
- span_name, attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
971
- )
972
- tracer.add_agent_attributes_to_span(span)
973
- set_span_attribute(
974
- span, AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
975
- )
976
- model_name = kwargs.get("model", "")
977
-
978
- # Add provider prefix for Groq clients
979
- if HAS_GROQ:
980
- from judgeval.tracer.llm.providers import groq_Groq, groq_AsyncGroq
981
-
982
- if (
983
- isinstance(client, (groq_Groq, groq_AsyncGroq))
984
- and model_name
985
- and not model_name.startswith("groq/")
986
- ):
987
- model_name = "groq/" + model_name
988
-
989
- response = await function(*args, **kwargs)
990
- return TracedAsyncGenerator(tracer, response, client, span, model_name)
991
- else:
992
- async with async_span_context(
993
- tracer, span_name, {AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
994
- ) as span:
995
- tracer.add_agent_attributes_to_span(span)
996
- set_span_attribute(
997
- span, AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
998
- )
999
- try:
1000
- response = await function(*args, **kwargs)
1001
- output, usage = _format_output_data(client, response)
1002
- set_span_attribute(
1003
- span, AttributeKeys.GEN_AI_COMPLETION, output
1004
- )
1005
- if usage:
1006
- _set_usage_attributes(span, usage, tracer)
1007
- return response
1008
- except Exception as e:
1009
- span.record_exception(e)
1010
- raise
1011
-
1012
- return wrapper
1013
-
1014
- def wrapped_sync_context_manager(function, span_name):
1015
- """Special wrapper for sync context manager methods."""
1016
-
1017
- @functools.wraps(function)
1018
- def wrapper(*args, **kwargs):
1019
- span = tracer.get_tracer().start_span(
1020
- span_name, attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
1021
- )
1022
- tracer.add_agent_attributes_to_span(span)
1023
- set_span_attribute(
1024
- span, AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
1025
- )
1026
- model_name = kwargs.get("model", "")
1027
-
1028
- # Add provider prefix for Groq clients
1029
- if HAS_GROQ:
1030
- from judgeval.tracer.llm.providers import groq_Groq, groq_AsyncGroq
1031
-
1032
- if (
1033
- isinstance(client, (groq_Groq, groq_AsyncGroq))
1034
- and model_name
1035
- and not model_name.startswith("groq/")
1036
- ):
1037
- model_name = "groq/" + model_name
1038
-
1039
- original_context_manager = function(*args, **kwargs)
1040
- return TracedSyncContextManager(
1041
- tracer, original_context_manager, client, span, model_name
1042
- )
1043
-
1044
- return wrapper
1045
-
1046
- def wrapped_async_context_manager(function, span_name):
1047
- """Special wrapper for async context manager methods."""
1048
-
1049
- @functools.wraps(function)
1050
- def wrapper(*args, **kwargs):
1051
- span = tracer.get_tracer().start_span(
1052
- span_name, attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
1053
- )
1054
- tracer.add_agent_attributes_to_span(span)
1055
- set_span_attribute(
1056
- span, AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
1057
- )
1058
- model_name = kwargs.get("model", "")
1059
-
1060
- # Add provider prefix for Groq clients
1061
- if HAS_GROQ:
1062
- from judgeval.tracer.llm.providers import groq_Groq, groq_AsyncGroq
1063
-
1064
- if (
1065
- isinstance(client, (groq_Groq, groq_AsyncGroq))
1066
- and model_name
1067
- and not model_name.startswith("groq/")
1068
- ):
1069
- model_name = "groq/" + model_name
1070
-
1071
- original_context_manager = function(*args, **kwargs)
1072
- return TracedAsyncContextManager(
1073
- tracer, original_context_manager, client, span, model_name
1074
- )
1075
-
1076
- return wrapper
1077
-
1078
- if HAS_OPENAI:
1079
- from judgeval.tracer.llm.providers import openai_OpenAI, openai_AsyncOpenAI
1080
-
1081
- assert openai_OpenAI is not None, "OpenAI client not found"
1082
- assert openai_AsyncOpenAI is not None, "OpenAI async client not found"
1083
- span_name = "OPENAI_API_CALL"
1084
- if isinstance(client, openai_OpenAI):
1085
- setattr(
1086
- client.chat.completions,
1087
- "create",
1088
- wrapped(client.chat.completions.create, span_name),
1089
- )
1090
- setattr(
1091
- client.responses, "create", wrapped(client.responses.create, span_name)
1092
- )
1093
- setattr(
1094
- client.beta.chat.completions,
1095
- "parse",
1096
- wrapped(client.beta.chat.completions.parse, span_name),
1097
- )
1098
- elif isinstance(client, openai_AsyncOpenAI):
1099
- setattr(
1100
- client.chat.completions,
1101
- "create",
1102
- wrapped_async(client.chat.completions.create, span_name),
1103
- )
1104
- setattr(
1105
- client.responses,
1106
- "create",
1107
- wrapped_async(client.responses.create, span_name),
1108
- )
1109
- setattr(
1110
- client.beta.chat.completions,
1111
- "parse",
1112
- wrapped_async(client.beta.chat.completions.parse, span_name),
1113
- )
1114
-
1115
- if HAS_TOGETHER:
1116
- from judgeval.tracer.llm.providers import (
1117
- together_Together,
1118
- together_AsyncTogether,
1119
- )
1120
-
1121
- assert together_Together is not None, "Together client not found"
1122
- assert together_AsyncTogether is not None, "Together async client not found"
1123
- span_name = "TOGETHER_API_CALL"
1124
- if isinstance(client, together_Together):
1125
- setattr(
1126
- client.chat.completions,
1127
- "create",
1128
- wrapped(client.chat.completions.create, span_name),
1129
- )
1130
- elif isinstance(client, together_AsyncTogether):
1131
- setattr(
1132
- client.chat.completions,
1133
- "create",
1134
- wrapped_async(client.chat.completions.create, span_name),
1135
- )
1136
-
1137
- if HAS_ANTHROPIC:
1138
- from judgeval.tracer.llm.providers import (
1139
- anthropic_Anthropic,
1140
- anthropic_AsyncAnthropic,
1141
- )
1142
-
1143
- assert anthropic_Anthropic is not None, "Anthropic client not found"
1144
- assert anthropic_AsyncAnthropic is not None, "Anthropic async client not found"
1145
- span_name = "ANTHROPIC_API_CALL"
1146
- if isinstance(client, anthropic_Anthropic):
1147
- setattr(
1148
- client.messages, "create", wrapped(client.messages.create, span_name)
1149
- )
1150
- setattr(
1151
- client.messages,
1152
- "stream",
1153
- wrapped_sync_context_manager(client.messages.stream, span_name),
1154
- )
1155
- elif isinstance(client, anthropic_AsyncAnthropic):
1156
- setattr(
1157
- client.messages,
1158
- "create",
1159
- wrapped_async(client.messages.create, span_name),
1160
- )
1161
- setattr(
1162
- client.messages,
1163
- "stream",
1164
- wrapped_async_context_manager(client.messages.stream, span_name),
1165
- )
1166
-
1167
- if HAS_GOOGLE_GENAI:
1168
- from judgeval.tracer.llm.providers import (
1169
- google_genai_Client,
1170
- google_genai_AsyncClient,
1171
- )
1172
-
1173
- assert google_genai_Client is not None, "Google GenAI client not found"
1174
- assert google_genai_AsyncClient is not None, (
1175
- "Google GenAI async client not found"
1176
- )
1177
- span_name = "GOOGLE_API_CALL"
1178
- if isinstance(client, google_genai_Client):
1179
- setattr(
1180
- client.models,
1181
- "generate_content",
1182
- wrapped(client.models.generate_content, span_name),
1183
- )
1184
- elif isinstance(client, google_genai_AsyncClient):
1185
- setattr(
1186
- client.models,
1187
- "generate_content",
1188
- wrapped_async(client.models.generate_content, span_name),
1189
- )
1190
-
1191
- if HAS_GROQ:
1192
- from judgeval.tracer.llm.providers import groq_Groq, groq_AsyncGroq
1193
-
1194
- assert groq_Groq is not None, "Groq client not found"
1195
- assert groq_AsyncGroq is not None, "Groq async client not found"
1196
- span_name = "GROQ_API_CALL"
1197
- if isinstance(client, groq_Groq):
1198
- setattr(
1199
- client.chat.completions,
1200
- "create",
1201
- wrapped(client.chat.completions.create, span_name),
1202
- )
1203
- elif isinstance(client, groq_AsyncGroq):
1204
- setattr(
1205
- client.chat.completions,
1206
- "create",
1207
- wrapped_async(client.chat.completions.create, span_name),
1208
- )
1209
-
1210
- return client
1211
-
1212
-
1213
- def _format_output_data(
1214
- client: ApiClient, response: Any
1215
- ) -> tuple[Optional[str], Optional[TraceUsage]]:
1216
- """Format output data from LLM response based on provider."""
1217
- provider_type = _detect_provider(client)
1218
-
1219
- if provider_type == ProviderType.OPENAI:
1220
- return _format_openai_output(response)
1221
- elif provider_type == ProviderType.ANTHROPIC:
1222
- return _format_anthropic_output(response)
1223
- elif provider_type == ProviderType.TOGETHER:
1224
- return _format_together_output(response)
1225
- elif provider_type == ProviderType.GOOGLE:
1226
- return _format_google_output(response)
1227
- elif provider_type == ProviderType.GROQ:
1228
- return _format_groq_output(response)
1229
- else:
1230
- # Default case - assume OpenAI-compatible for unknown providers
1231
- judgeval_logger.info(
1232
- f"Unknown client type {type(client)}, assuming OpenAI-compatible"
1233
- )
1234
- return _format_openai_output(response)
1235
-
1236
-
1237
- def _create_usage(
1238
- model_name: str,
1239
- prompt_tokens: int,
1240
- completion_tokens: int,
1241
- cache_read_input_tokens: int = 0,
1242
- cache_creation_input_tokens: int = 0,
1243
- ) -> TraceUsage:
1244
- prompt_cost, completion_cost = cost_per_token(
1245
- model=model_name,
1246
- prompt_tokens=prompt_tokens,
1247
- completion_tokens=completion_tokens,
1248
- cache_read_input_tokens=cache_read_input_tokens,
1249
- cache_creation_input_tokens=cache_creation_input_tokens,
1250
- )
1251
- total_cost_usd = (
1252
- (prompt_cost + completion_cost) if prompt_cost and completion_cost else None
1253
- )
1254
- return TraceUsage(
1255
- prompt_tokens=prompt_tokens,
1256
- completion_tokens=completion_tokens,
1257
- total_tokens=prompt_tokens + completion_tokens,
1258
- cache_read_input_tokens=cache_read_input_tokens,
1259
- cache_creation_input_tokens=cache_creation_input_tokens,
1260
- prompt_tokens_cost_usd=prompt_cost,
1261
- completion_tokens_cost_usd=completion_cost,
1262
- total_cost_usd=total_cost_usd,
1263
- model_name=model_name,
1264
- )
7
+ __all__ = ["_detect_provider", "wrap_provider"]
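Net effect of the hunk: judgeval/tracer/llm/__init__.py shrinks to a thin re-export of _detect_provider and wrap_provider from .config, with the provider-specific tracing logic moved into the new wrapper modules. A hedged usage sketch, assuming the public call pattern from the removed 0.16.0 code still holds (wrap_provider(tracer, client) patches the client's create/parse/stream methods and returns the same client); the Tracer construction below is illustrative only:

```python
# Hedged usage sketch (not taken verbatim from this release): wrap an OpenAI
# client so its completion calls emit "llm" spans, as the removed 0.16.0
# implementation of wrap_provider did before the logic moved to llm_openai.
from openai import OpenAI

from judgeval.tracer import Tracer
from judgeval.tracer.llm import wrap_provider

tracer = Tracer()  # assumption: default construction; configure as needed
client = wrap_provider(tracer, OpenAI())  # returns the same, now-patched client

response = client.chat.completions.create(  # traced as an OPENAI_API_CALL span
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "hello"}],
)
```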