judgeval 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +2 -0
- judgeval/clients.py +2 -1
- judgeval/common/api/api.py +2 -16
- judgeval/common/api/json_encoder.py +242 -0
- judgeval/common/tracer/core.py +498 -215
- judgeval/common/tracer/providers.py +119 -0
- judgeval/common/tracer/span_transformer.py +14 -25
- judgeval/constants.py +1 -0
- judgeval/data/judgment_types.py +2 -1
- judgeval/data/trace.py +5 -122
- judgeval/data/trace_run.py +2 -1
- judgeval/dataset.py +2 -0
- judgeval/evaluation_run.py +6 -2
- judgeval/judges/litellm_judge.py +2 -1
- judgeval/judges/mixture_of_judges.py +2 -1
- judgeval/judges/utils.py +2 -1
- judgeval/judgment_client.py +11 -6
- judgeval/local_eval_queue.py +192 -0
- judgeval/run_evaluation.py +11 -6
- judgeval/scorers/score.py +33 -11
- judgeval/utils/async_utils.py +36 -0
- {judgeval-0.4.0.dist-info → judgeval-0.5.0.dist-info}/METADATA +9 -12
- {judgeval-0.4.0.dist-info → judgeval-0.5.0.dist-info}/RECORD +25 -21
- {judgeval-0.4.0.dist-info → judgeval-0.5.0.dist-info}/WHEEL +0 -0
- {judgeval-0.4.0.dist-info → judgeval-0.5.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/tracer/core.py
CHANGED
@@ -26,11 +26,15 @@ from typing import (
     Generator,
     List,
     Optional,
+    ParamSpec,
     Tuple,
+    TypeVar,
     Union,
     TypeAlias,
+    overload,
 )
 import types
+import random


 from judgeval.common.tracer.constants import _TRACE_FILEPATH_BLOCKLIST
@@ -38,40 +42,33 @@ from judgeval.common.tracer.constants import _TRACE_FILEPATH_BLOCKLIST
 from judgeval.common.tracer.otel_span_processor import JudgmentSpanProcessor
 from judgeval.common.tracer.span_processor import SpanProcessorBase
 from judgeval.common.tracer.trace_manager import TraceManagerClient
-from litellm import cost_per_token as _original_cost_per_token
-from openai import OpenAI, AsyncOpenAI
-from openai.types.chat.chat_completion import ChatCompletion
-from openai.types.responses.response import Response
-from openai.types.chat import ParsedChatCompletion
-from together import Together, AsyncTogether
-from anthropic import Anthropic, AsyncAnthropic
-from google import genai
-from groq import Groq, AsyncGroq

 from judgeval.data import Example, Trace, TraceSpan, TraceUsage
 from judgeval.scorers import APIScorerConfig, BaseScorer
 from judgeval.evaluation_run import EvaluationRun
-from judgeval.
+from judgeval.local_eval_queue import LocalEvaluationQueue
+from judgeval.common.api import JudgmentApiClient
+from judgeval.common.utils import OptExcInfo, validate_api_key
 from judgeval.common.logger import judgeval_logger

+from litellm import cost_per_token as _original_cost_per_token  # type: ignore
+from judgeval.common.tracer.providers import (
+    HAS_OPENAI,
+    HAS_TOGETHER,
+    HAS_ANTHROPIC,
+    HAS_GOOGLE_GENAI,
+    HAS_GROQ,
+    ApiClient,
+)
+from judgeval.constants import DEFAULT_GPT_MODEL
+

 current_trace_var = contextvars.ContextVar[Optional["TraceClient"]](
     "current_trace", default=None
 )
 current_span_var = contextvars.ContextVar[Optional[str]]("current_span", default=None)

-
-    OpenAI,
-    Together,
-    Anthropic,
-    AsyncOpenAI,
-    AsyncAnthropic,
-    AsyncTogether,
-    genai.Client,
-    genai.client.AsyncClient,
-    Groq,
-    AsyncGroq,
-]
+
 SpanType: TypeAlias = str


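The hard provider imports removed above now live in the new judgeval/common/tracer/providers.py (+119 lines, not included in this diff). Judging from the HAS_* flags imported here and the "assert ... is not None" guards used later in this file, providers.py presumably wraps each SDK in a try/except import guard. A minimal sketch of that pattern, using only names visible in this diff:

try:
    from openai import OpenAI as openai_OpenAI, AsyncOpenAI as openai_AsyncOpenAI

    HAS_OPENAI = True
except ImportError:
    openai_OpenAI = None
    openai_AsyncOpenAI = None
    HAS_OPENAI = False

The practical effect is that each LLM SDK becomes an optional dependency rather than a hard requirement of the tracer.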
@@ -113,10 +110,6 @@ class TraceClient:

         self.otel_span_processor = tracer.otel_span_processor

-        judgeval_logger.info(
-            f"🎯 TraceClient using span processor for trace {self.trace_id}"
-        )
-
     def get_current_span(self):
         """Get the current span from the context var"""
         return self.tracer.get_current_span()
@@ -181,85 +174,53 @@ class TraceClient:

     def async_evaluate(
         self,
-
-        example:
-
-        actual_output: Optional[Union[str, List[str]]] = None,
-        expected_output: Optional[Union[str, List[str]]] = None,
-        context: Optional[List[str]] = None,
-        retrieval_context: Optional[List[str]] = None,
-        tools_called: Optional[List[str]] = None,
-        expected_tools: Optional[List[str]] = None,
-        additional_metadata: Optional[Dict[str, Any]] = None,
-        model: Optional[str] = None,
-        span_id: Optional[str] = None,
+        scorer: Union[APIScorerConfig, BaseScorer],
+        example: Example,
+        model: str = DEFAULT_GPT_MODEL,
     ):
-        if not self.enable_evaluations:
-            return
-
         start_time = time.time()
+        span_id = self.get_current_span()
+        eval_run_name = (
+            f"{self.name.capitalize()}-{span_id}-{scorer.score_type.capitalize()}"
+        )

-
-
-
-
-
-
-
-
+        if isinstance(scorer, APIScorerConfig):
+            eval_run = EvaluationRun(
+                organization_id=self.tracer.organization_id,
+                project_name=self.project_name,
+                eval_name=eval_run_name,
+                examples=[example],
+                scorers=[scorer],
+                model=model,
+                trace_span_id=span_id,
+            )

-
-        if any(
-            param is not None
-            for param in [
-                input,
-                actual_output,
-                expected_output,
-                context,
-                retrieval_context,
-                tools_called,
-                expected_tools,
-                additional_metadata,
-            ]
-        ):
-            example = Example(
-                input=input,
-                actual_output=actual_output,
-                expected_output=expected_output,
-                context=context,
-                retrieval_context=retrieval_context,
-                tools_called=tools_called,
-                expected_tools=expected_tools,
-                additional_metadata=additional_metadata,
-            )
-        else:
-            raise ValueError(
-                "Either 'example' or at least one of the individual parameters (input, actual_output, etc.) must be provided"
-            )
+            self.add_eval_run(eval_run, start_time)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            if span_id:
+                current_span = self.span_id_to_span.get(span_id)
+                if current_span:
+                    self.otel_span_processor.queue_evaluation_run(
+                        eval_run, span_id=span_id, span_data=current_span
+                    )
+        elif isinstance(scorer, BaseScorer):
+            # Handle custom scorers using local evaluation queue
+            eval_run = EvaluationRun(
+                organization_id=self.tracer.organization_id,
+                project_name=self.project_name,
+                eval_name=eval_run_name,
+                examples=[example],
+                scorers=[scorer],
+                model=model,
+                judgment_api_key=self.tracer.api_key,
+                trace_span_id=span_id,
+                trace_id=self.trace_id,
+            )

-
+            self.add_eval_run(eval_run, start_time)

-
-
-        if current_span:
-            self.otel_span_processor.queue_evaluation_run(
-                eval_run, span_id=span_id_to_use, span_data=current_span
-            )
+            # Enqueue the evaluation run to the local evaluation queue
+            self.tracer.local_eval_queue.enqueue(eval_run)

     def add_eval_run(self, eval_run: EvaluationRun, start_time: float):
         current_span_id = eval_run.trace_span_id
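TraceClient.async_evaluate now takes exactly one scorer and a pre-built Example instead of a dozen loose keyword arguments. A hedged sketch of the new call shape (the trace_client and my_scorer objects are assumed; any APIScorerConfig or BaseScorer instance works):

from judgeval.data import Example

example = Example(
    input="What is the capital of France?",
    actual_output="Paris",
)
trace_client.async_evaluate(scorer=my_scorer, example=example)

APIScorerConfig runs are queued on the OTel span processor for server-side scoring, while BaseScorer runs go to the tracer's new LocalEvaluationQueue.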
@@ -517,7 +478,7 @@ class TraceClient:


 def _capture_exception_for_trace(
-    current_trace: Optional[TraceClient], exc_info:
+    current_trace: Optional[TraceClient], exc_info: OptExcInfo
 ):
     if not current_trace:
         return
@@ -841,6 +802,10 @@ class _DeepTracer:
         self._original_threading_trace = None


+T = TypeVar("T", bound=Callable[..., Any])
+P = ParamSpec("P")
+
+
 class Tracer:
     # Tracer.current_trace class variable is currently used in wrap()
     # TODO: Keep track of cross-context state for current trace and current span ID solely through class variables instead of instance variables?
@@ -954,6 +919,15 @@ class Tracer:
            else:
                self.otel_span_processor = SpanProcessorBase()

+            # Initialize local evaluation queue for custom scorers
+            self.local_eval_queue = LocalEvaluationQueue()
+
+            # Start workers with callback to log results only if monitoring is enabled
+            if enable_evaluations and enable_monitoring:
+                self.local_eval_queue.start_workers(
+                    callback=self._log_eval_results_callback
+                )
+
            atexit.register(self._cleanup_on_exit)
        except Exception as e:
            judgeval_logger.error(
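judgeval/local_eval_queue.py (+192 lines) is not shown in this diff, but the call sites in core.py imply at least the following surface. This is an inferred sketch, not the actual implementation:

class LocalEvaluationQueue:
    def enqueue(self, eval_run) -> None: ...             # TraceClient.async_evaluate (BaseScorer path)
    def start_workers(self, callback=None) -> None: ...  # Tracer.__init__
    def wait_for_completion(self) -> bool: ...           # Tracer.wait_for_completion / _cleanup_on_exit
    def stop_workers(self) -> None: ...                  # Tracer._cleanup_on_exit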
@@ -1189,11 +1163,24 @@ class Tracer:
             else:
                 trace_client_instance.record_state_after(state)

+    @overload
+    def observe(
+        self, func: T, *, name: Optional[str] = None, span_type: SpanType = "span"
+    ) -> T: ...
+
+    @overload
+    def observe(
+        self,
+        *,
+        name: Optional[str] = None,
+        span_type: SpanType = "span",
+    ) -> Callable[[T], T]: ...
+
     def observe(
         self,
-        func=None,
+        func: Optional[T] = None,
         *,
-        name=None,
+        name: Optional[str] = None,
         span_type: SpanType = "span",
     ):
         """
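The two @overload stubs give observe precise types for both decorator spellings; a usage sketch (the Tracer constructor arguments here are assumed):

tracer = Tracer(project_name="demo")

@tracer.observe  # bare form: matches the func: T overload and returns T
def fetch_user(user_id: str) -> dict:
    return {"id": user_id}

@tracer.observe(name="checkout", span_type="tool")  # parameterized form: returns Callable[[T], T]
def checkout(cart: dict) -> str:
    return "ok"

Either way, type checkers now see the decorated function with its original signature.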
@@ -1210,8 +1197,8 @@ class Tracer:
             return func if func else lambda f: f

         if func is None:
-            return lambda
-
+            return lambda func: self.observe(
+                func,
                 name=name,
                 span_type=span_type,
             )
@@ -1220,8 +1207,8 @@ class Tracer:
             original_span_name = name or func.__name__

             # Store custom attributes on the function object
-            func._judgment_span_name = original_span_name
-            func._judgment_span_type = span_type
+            func._judgment_span_name = original_span_name  # type: ignore
+            func._judgment_span_type = span_type  # type: ignore

         except Exception:
             return func
@@ -1532,15 +1519,51 @@ class Tracer:

         return decorate_class if cls is None else decorate_class(cls)

-    def async_evaluate(
+    def async_evaluate(
+        self,
+        scorer: Union[APIScorerConfig, BaseScorer],
+        example: Example,
+        model: str = DEFAULT_GPT_MODEL,
+        sampling_rate: float = 1,
+    ):
         try:
             if not self.enable_monitoring or not self.enable_evaluations:
                 return

-
+            if not isinstance(scorer, (APIScorerConfig, BaseScorer)):
+                judgeval_logger.warning(
+                    f"Scorer must be an instance of APIScorerConfig or BaseScorer, got {type(scorer)}, skipping evaluation"
+                )
+                return
+
+            if not isinstance(example, Example):
+                judgeval_logger.warning(
+                    f"Example must be an instance of Example, got {type(example)} skipping evaluation"
+                )
+                return
+
+            if sampling_rate < 0:
+                judgeval_logger.warning(
+                    "Cannot set sampling_rate below 0, skipping evaluation"
+                )
+                return
+
+            if sampling_rate > 1:
+                judgeval_logger.warning(
+                    "Cannot set sampling_rate above 1, skipping evaluation"
+                )
+                return

+            percentage = random.uniform(0, 1)
+            if percentage > sampling_rate:
+                judgeval_logger.info("Skipping async_evaluate due to sampling rate")
+                return
+
+            current_trace = self.get_current_trace()
             if current_trace:
-                current_trace.async_evaluate(
+                current_trace.async_evaluate(
+                    scorer=scorer, example=example, model=model
+                )
             else:
                 judgeval_logger.warning(
                     "No trace found (context var or fallback), skipping evaluation"
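sampling_rate makes evaluation probabilistic: random.uniform(0, 1) is drawn per call and compared against it, so the default of 1 evaluates every call and 0 evaluates none. A sketch, assuming tracer, my_scorer, and example already exist:

# Score roughly one in four traced calls to cap evaluation cost
tracer.async_evaluate(
    scorer=my_scorer,
    example=example,
    sampling_rate=0.25,
)

Out-of-range values are not clamped; anything below 0 or above 1 logs a warning and skips the evaluation entirely.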
@@ -1613,9 +1636,68 @@ class Tracer:
         self.otel_span_processor.shutdown()
         self.otel_span_processor = SpanProcessorBase()

+    def wait_for_completion(self, timeout: Optional[float] = 30.0) -> bool:
+        """Wait for all evaluations and span processing to complete.
+
+        This method blocks until all queued evaluations are processed and
+        all pending spans are flushed to the server.
+
+        Args:
+            timeout: Maximum time to wait in seconds. Defaults to 30 seconds.
+                None means wait indefinitely.
+
+        Returns:
+            True if all processing completed within the timeout, False otherwise.
+
+        """
+        try:
+            judgeval_logger.debug(
+                "Waiting for all evaluations and spans to complete..."
+            )
+
+            # Wait for all queued evaluation work to complete
+            eval_completed = self.local_eval_queue.wait_for_completion()
+            if not eval_completed:
+                judgeval_logger.warning(
+                    f"Local evaluation queue did not complete within {timeout} seconds"
+                )
+                return False
+
+            self.flush_background_spans()
+
+            judgeval_logger.debug("All evaluations and spans completed successfully")
+            return True
+
+        except Exception as e:
+            judgeval_logger.warning(f"Error while waiting for completion: {e}")
+            return False
+
+    def _log_eval_results_callback(self, evaluation_run, scoring_results):
+        """Callback to log evaluation results after local processing."""
+        try:
+            if scoring_results and self.enable_evaluations and self.enable_monitoring:
+                # Convert scoring results to the format expected by API client
+                results_dict = [
+                    result.model_dump(warnings=False) for result in scoring_results
+                ]
+                api_client = JudgmentApiClient(self.api_key, self.organization_id)
+                api_client.log_evaluation_results(
+                    results_dict, evaluation_run.model_dump(warnings=False)
+                )
+        except Exception as e:
+            judgeval_logger.warning(f"Failed to log local evaluation results: {e}")
+
     def _cleanup_on_exit(self):
         """Cleanup handler called on application exit to ensure spans are flushed."""
         try:
+            # Wait for all queued evaluation work to complete before stopping
+            completed = self.local_eval_queue.wait_for_completion()
+            if not completed:
+                judgeval_logger.warning(
+                    "Local evaluation queue did not complete within 30 seconds"
+                )
+
+            self.local_eval_queue.stop_workers()
             self.flush_background_spans()
         except Exception as e:
             judgeval_logger.warning(f"Error during tracer cleanup: {e}")
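wait_for_completion gives short-lived scripts a way to drain background work before the process exits; a minimal usage sketch:

# e.g. at the end of a batch job
if not tracer.wait_for_completion(timeout=60.0):
    print("pending evaluations or spans did not finish in time")

_cleanup_on_exit performs a similar drain automatically through the atexit hook registered in __init__.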
@@ -1697,33 +1779,76 @@ def wrap(

        return wrapper

-    if
-
-
-
-
-
-
-
-            client.beta.chat.completions, "parse",
+    if HAS_OPENAI:
+        from judgeval.common.tracer.providers import openai_OpenAI, openai_AsyncOpenAI
+
+        assert openai_OpenAI is not None, "OpenAI client not found"
+        assert openai_AsyncOpenAI is not None, "OpenAI async client not found"
+        if isinstance(client, (openai_OpenAI)):
+            setattr(client.chat.completions, "create", wrapped(original_create))
+            setattr(client.responses, "create", wrapped(original_responses_create))
+            setattr(client.beta.chat.completions, "parse", wrapped(original_beta_parse))
+        elif isinstance(client, (openai_AsyncOpenAI)):
+            setattr(client.chat.completions, "create", wrapped_async(original_create))
+            setattr(
+                client.responses, "create", wrapped_async(original_responses_create)
+            )
+            setattr(
+                client.beta.chat.completions,
+                "parse",
+                wrapped_async(original_beta_parse),
+            )
+
+    if HAS_TOGETHER:
+        from judgeval.common.tracer.providers import (
+            together_Together,
+            together_AsyncTogether,
        )
-    elif isinstance(client, (Together)):
-        setattr(client.chat.completions, "create", wrapped(original_create))
-    elif isinstance(client, (AsyncTogether)):
-        setattr(client.chat.completions, "create", wrapped_async(original_create))
-    elif isinstance(client, (Anthropic)):
-        setattr(client.messages, "create", wrapped(original_create))
-    elif isinstance(client, (AsyncAnthropic)):
-        setattr(client.messages, "create", wrapped_async(original_create))
-    elif isinstance(client, (genai.Client)):
-        setattr(client.models, "generate_content", wrapped(original_create))
-    elif isinstance(client, (genai.client.AsyncClient)):
-        setattr(client.models, "generate_content", wrapped_async(original_create))
-    elif isinstance(client, (Groq)):
-        setattr(client.chat.completions, "create", wrapped(original_create))
-    elif isinstance(client, (AsyncGroq)):
-        setattr(client.chat.completions, "create", wrapped_async(original_create))

+        assert together_Together is not None, "Together client not found"
+        assert together_AsyncTogether is not None, "Together async client not found"
+        if isinstance(client, (together_Together)):
+            setattr(client.chat.completions, "create", wrapped(original_create))
+        elif isinstance(client, (together_AsyncTogether)):
+            setattr(client.chat.completions, "create", wrapped_async(original_create))
+
+    if HAS_ANTHROPIC:
+        from judgeval.common.tracer.providers import (
+            anthropic_Anthropic,
+            anthropic_AsyncAnthropic,
+        )
+
+        assert anthropic_Anthropic is not None, "Anthropic client not found"
+        assert anthropic_AsyncAnthropic is not None, "Anthropic async client not found"
+        if isinstance(client, (anthropic_Anthropic)):
+            setattr(client.messages, "create", wrapped(original_create))
+        elif isinstance(client, (anthropic_AsyncAnthropic)):
+            setattr(client.messages, "create", wrapped_async(original_create))
+
+    if HAS_GOOGLE_GENAI:
+        from judgeval.common.tracer.providers import (
+            google_genai_Client,
+            google_genai_AsyncClient,
+        )
+
+        assert google_genai_Client is not None, "Google GenAI client not found"
+        assert google_genai_AsyncClient is not None, (
+            "Google GenAI async client not found"
+        )
+        if isinstance(client, (google_genai_Client)):
+            setattr(client.models, "generate_content", wrapped(original_create))
+        elif isinstance(client, (google_genai_AsyncClient)):
+            setattr(client.models, "generate_content", wrapped_async(original_create))
+
+    if HAS_GROQ:
+        from judgeval.common.tracer.providers import groq_Groq, groq_AsyncGroq
+
+        assert groq_Groq is not None, "Groq client not found"
+        assert groq_AsyncGroq is not None, "Groq async client not found"
+        if isinstance(client, (groq_Groq)):
+            setattr(client.chat.completions, "create", wrapped(original_create))
+        elif isinstance(client, (groq_AsyncGroq)):
+            setattr(client.chat.completions, "create", wrapped_async(original_create))
    return client


|
|
1749
1874
|
Raises:
|
1750
1875
|
ValueError: If client type is not supported
|
1751
1876
|
"""
|
1752
|
-
|
1753
|
-
|
1754
|
-
|
1755
|
-
|
1756
|
-
|
1757
|
-
|
1758
|
-
|
1877
|
+
|
1878
|
+
if HAS_OPENAI:
|
1879
|
+
from judgeval.common.tracer.providers import openai_OpenAI, openai_AsyncOpenAI
|
1880
|
+
|
1881
|
+
assert openai_OpenAI is not None, "OpenAI client not found"
|
1882
|
+
assert openai_AsyncOpenAI is not None, "OpenAI async client not found"
|
1883
|
+
if isinstance(client, (openai_OpenAI)):
|
1884
|
+
return (
|
1885
|
+
"OPENAI_API_CALL",
|
1886
|
+
client.chat.completions.create,
|
1887
|
+
client.responses.create,
|
1888
|
+
None,
|
1889
|
+
client.beta.chat.completions.parse,
|
1890
|
+
)
|
1891
|
+
elif isinstance(client, (openai_AsyncOpenAI)):
|
1892
|
+
return (
|
1893
|
+
"OPENAI_API_CALL",
|
1894
|
+
client.chat.completions.create,
|
1895
|
+
client.responses.create,
|
1896
|
+
None,
|
1897
|
+
client.beta.chat.completions.parse,
|
1898
|
+
)
|
1899
|
+
if HAS_TOGETHER:
|
1900
|
+
from judgeval.common.tracer.providers import (
|
1901
|
+
together_Together,
|
1902
|
+
together_AsyncTogether,
|
1759
1903
|
)
|
1760
|
-
|
1761
|
-
|
1762
|
-
|
1763
|
-
|
1764
|
-
|
1765
|
-
|
1766
|
-
"
|
1767
|
-
|
1768
|
-
|
1769
|
-
|
1770
|
-
|
1904
|
+
|
1905
|
+
assert together_Together is not None, "Together client not found"
|
1906
|
+
assert together_AsyncTogether is not None, "Together async client not found"
|
1907
|
+
if isinstance(client, (together_Together)):
|
1908
|
+
return "TOGETHER_API_CALL", client.chat.completions.create, None, None, None
|
1909
|
+
elif isinstance(client, (together_AsyncTogether)):
|
1910
|
+
return "TOGETHER_API_CALL", client.chat.completions.create, None, None, None
|
1911
|
+
if HAS_ANTHROPIC:
|
1912
|
+
from judgeval.common.tracer.providers import (
|
1913
|
+
anthropic_Anthropic,
|
1914
|
+
anthropic_AsyncAnthropic,
|
1915
|
+
)
|
1916
|
+
|
1917
|
+
assert anthropic_Anthropic is not None, "Anthropic client not found"
|
1918
|
+
assert anthropic_AsyncAnthropic is not None, "Anthropic async client not found"
|
1919
|
+
if isinstance(client, (anthropic_Anthropic)):
|
1920
|
+
return (
|
1921
|
+
"ANTHROPIC_API_CALL",
|
1922
|
+
client.messages.create,
|
1923
|
+
None,
|
1924
|
+
client.messages.stream,
|
1925
|
+
None,
|
1926
|
+
)
|
1927
|
+
elif isinstance(client, (anthropic_AsyncAnthropic)):
|
1928
|
+
return (
|
1929
|
+
"ANTHROPIC_API_CALL",
|
1930
|
+
client.messages.create,
|
1931
|
+
None,
|
1932
|
+
client.messages.stream,
|
1933
|
+
None,
|
1934
|
+
)
|
1935
|
+
if HAS_GOOGLE_GENAI:
|
1936
|
+
from judgeval.common.tracer.providers import (
|
1937
|
+
google_genai_Client,
|
1938
|
+
google_genai_AsyncClient,
|
1771
1939
|
)
|
1772
|
-
|
1773
|
-
|
1940
|
+
|
1941
|
+
assert google_genai_Client is not None, "Google GenAI client not found"
|
1942
|
+
assert google_genai_AsyncClient is not None, (
|
1943
|
+
"Google GenAI async client not found"
|
1944
|
+
)
|
1945
|
+
if isinstance(client, (google_genai_Client)):
|
1946
|
+
return "GOOGLE_API_CALL", client.models.generate_content, None, None, None
|
1947
|
+
elif isinstance(client, (google_genai_AsyncClient)):
|
1948
|
+
return "GOOGLE_API_CALL", client.models.generate_content, None, None, None
|
1949
|
+
if HAS_GROQ:
|
1950
|
+
from judgeval.common.tracer.providers import groq_Groq, groq_AsyncGroq
|
1951
|
+
|
1952
|
+
assert groq_Groq is not None, "Groq client not found"
|
1953
|
+
assert groq_AsyncGroq is not None, "Groq async client not found"
|
1954
|
+
if isinstance(client, (groq_Groq)):
|
1955
|
+
return "GROQ_API_CALL", client.chat.completions.create, None, None, None
|
1956
|
+
elif isinstance(client, (groq_AsyncGroq)):
|
1957
|
+
return "GROQ_API_CALL", client.chat.completions.create, None, None, None
|
1774
1958
|
raise ValueError(f"Unsupported client type: {type(client)}")
|
1775
1959
|
|
1776
1960
|
|
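Reading the return statements, _get_client_config now always yields a five-slot tuple, with None for the slots a provider lacks. The slot names below are descriptive guesses, not taken from the source:

# (span_name, create_fn, responses_create_fn, stream_fn, beta_parse_fn)
span_name, create, responses_create, stream, beta_parse = _get_client_config(client)

Only the OpenAI clients fill the responses and parse slots, and only the Anthropic clients fill the stream slot; every other provider traces a single create method.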
@@ -1794,73 +1978,173 @@ def _format_output_data(
    model_name = None
    message_content = None

-    if
-
-
-
-
-
+    if HAS_OPENAI:
+        from judgeval.common.tracer.providers import (
+            openai_OpenAI,
+            openai_AsyncOpenAI,
+            openai_ChatCompletion,
+            openai_Response,
+            openai_ParsedChatCompletion,
+        )
+
+        assert openai_OpenAI is not None, "OpenAI client not found"
+        assert openai_AsyncOpenAI is not None, "OpenAI async client not found"
+        assert openai_ChatCompletion is not None, "OpenAI chat completion not found"
+        assert openai_Response is not None, "OpenAI response not found"
+        assert openai_ParsedChatCompletion is not None, (
+            "OpenAI parsed chat completion not found"
+        )
+
+        if isinstance(client, (openai_OpenAI, openai_AsyncOpenAI)):
+            if isinstance(response, openai_ChatCompletion):
+                model_name = response.model
+                prompt_tokens = response.usage.prompt_tokens if response.usage else 0
+                completion_tokens = (
+                    response.usage.completion_tokens if response.usage else 0
+                )
+                cache_read_input_tokens = (
+                    response.usage.prompt_tokens_details.cached_tokens
+                    if response.usage
+                    and response.usage.prompt_tokens_details
+                    and response.usage.prompt_tokens_details.cached_tokens
+                    else 0
+                )
+
+                if isinstance(response, openai_ParsedChatCompletion):
+                    message_content = response.choices[0].message.parsed
+                else:
+                    message_content = response.choices[0].message.content
+            elif isinstance(response, openai_Response):
+                model_name = response.model
+                prompt_tokens = response.usage.input_tokens if response.usage else 0
+                completion_tokens = (
+                    response.usage.output_tokens if response.usage else 0
+                )
+                cache_read_input_tokens = (
+                    response.usage.input_tokens_details.cached_tokens
+                    if response.usage and response.usage.input_tokens_details
+                    else 0
+                )
+                if hasattr(response.output[0], "content"):
+                    message_content = "".join(
+                        seg.text
+                        for seg in response.output[0].content
+                        if hasattr(seg, "text")
+                    )
+            # Note: LiteLLM seems to use cache_read_input_tokens to calculate the cost for OpenAI
+            return message_content, _create_usage(
+                model_name,
+                prompt_tokens,
+                completion_tokens,
+                cache_read_input_tokens,
+                cache_creation_input_tokens,
+            )
+
+    if HAS_TOGETHER:
+        from judgeval.common.tracer.providers import (
+            together_Together,
+            together_AsyncTogether,
+        )
+
+        assert together_Together is not None, "Together client not found"
+        assert together_AsyncTogether is not None, "Together async client not found"
+
+        if isinstance(client, (together_Together, together_AsyncTogether)):
+            model_name = "together_ai/" + response.model
+            prompt_tokens = response.usage.prompt_tokens
+            completion_tokens = response.usage.completion_tokens
+            message_content = response.choices[0].message.content
+
+            # As of 2025-07-14, Together does not do any input cache token tracking
+            return message_content, _create_usage(
+                model_name,
+                prompt_tokens,
+                completion_tokens,
+                cache_read_input_tokens,
+                cache_creation_input_tokens,
            )
-
-
-
-
-
-
+
+    if HAS_GOOGLE_GENAI:
+        from judgeval.common.tracer.providers import (
+            google_genai_Client,
+            google_genai_AsyncClient,
+        )
+
+        assert google_genai_Client is not None, "Google GenAI client not found"
+        assert google_genai_AsyncClient is not None, (
+            "Google GenAI async client not found"
+        )
+        if isinstance(client, (google_genai_Client, google_genai_AsyncClient)):
+            model_name = response.model_version
+            prompt_tokens = response.usage_metadata.prompt_token_count
+            completion_tokens = response.usage_metadata.candidates_token_count
+            message_content = response.candidates[0].content.parts[0].text
+
+            if hasattr(response.usage_metadata, "cached_content_token_count"):
+                cache_read_input_tokens = (
+                    response.usage_metadata.cached_content_token_count
+                )
+            return message_content, _create_usage(
+                model_name,
+                prompt_tokens,
+                completion_tokens,
+                cache_read_input_tokens,
+                cache_creation_input_tokens,
            )

-
-
-
-
-
+    if HAS_ANTHROPIC:
+        from judgeval.common.tracer.providers import (
+            anthropic_Anthropic,
+            anthropic_AsyncAnthropic,
+        )
+
+        assert anthropic_Anthropic is not None, "Anthropic client not found"
+        assert anthropic_AsyncAnthropic is not None, "Anthropic async client not found"
+        if isinstance(client, (anthropic_Anthropic, anthropic_AsyncAnthropic)):
            model_name = response.model
-            prompt_tokens = response.usage.input_tokens
-            completion_tokens = response.usage.output_tokens
-            cache_read_input_tokens =
-
-
-
+            prompt_tokens = response.usage.input_tokens
+            completion_tokens = response.usage.output_tokens
+            cache_read_input_tokens = response.usage.cache_read_input_tokens
+            cache_creation_input_tokens = response.usage.cache_creation_input_tokens
+            message_content = response.content[0].text
+            return message_content, _create_usage(
+                model_name,
+                prompt_tokens,
+                completion_tokens,
+                cache_read_input_tokens,
+                cache_creation_input_tokens,
            )
-            if hasattr(response.output[0], "content"):
-                message_content = "".join(
-                    seg.text
-                    for seg in response.output[0].content
-                    if hasattr(seg, "text")
-                )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        cache_creation_input_tokens = response.usage.cache_creation_input_tokens
-        message_content = response.content[0].text
-    elif isinstance(client, (Groq, AsyncGroq)):
-        model_name = "groq/" + response.model
-        prompt_tokens = response.usage.prompt_tokens
-        completion_tokens = response.usage.completion_tokens
-        message_content = response.choices[0].message.content
-    else:
-        judgeval_logger.warning(f"Unsupported client type: {type(client)}")
-        return None, None
+    if HAS_GROQ:
+        from judgeval.common.tracer.providers import groq_Groq, groq_AsyncGroq
+
+        assert groq_Groq is not None, "Groq client not found"
+        assert groq_AsyncGroq is not None, "Groq async client not found"
+        if isinstance(client, (groq_Groq, groq_AsyncGroq)):
+            model_name = "groq/" + response.model
+            prompt_tokens = response.usage.prompt_tokens
+            completion_tokens = response.usage.completion_tokens
+            message_content = response.choices[0].message.content
+            return message_content, _create_usage(
+                model_name,
+                prompt_tokens,
+                completion_tokens,
+                cache_read_input_tokens,
+                cache_creation_input_tokens,
+            )
+
+    judgeval_logger.warning(f"Unsupported client type: {type(client)}")
+    return None, None
+

+def _create_usage(
+    model_name: str,
+    prompt_tokens: int,
+    completion_tokens: int,
+    cache_read_input_tokens: int = 0,
+    cache_creation_input_tokens: int = 0,
+) -> TraceUsage:
+    """Helper function to create TraceUsage object with cost calculation."""
    prompt_cost, completion_cost = cost_per_token(
        model=model_name,
        prompt_tokens=prompt_tokens,
@@ -1871,7 +2155,7 @@ def _format_output_data(
    total_cost_usd = (
        (prompt_cost + completion_cost) if prompt_cost and completion_cost else None
    )
-
+    return TraceUsage(
        prompt_tokens=prompt_tokens,
        completion_tokens=completion_tokens,
        total_tokens=prompt_tokens + completion_tokens,
@@ -1882,7 +2166,6 @@ def _format_output_data(
        total_cost_usd=total_cost_usd,
        model_name=model_name,
    )
-    return message_content, usage


def combine_args_kwargs(func, args, kwargs):