judgeval 0.4.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- judgeval/__init__.py +2 -0
- judgeval/cli.py +65 -0
- judgeval/clients.py +2 -1
- judgeval/common/api/api.py +46 -54
- judgeval/common/api/constants.py +18 -5
- judgeval/common/api/json_encoder.py +241 -0
- judgeval/common/tracer/core.py +772 -467
- judgeval/common/tracer/otel_span_processor.py +1 -1
- judgeval/common/tracer/providers.py +119 -0
- judgeval/common/tracer/span_processor.py +1 -1
- judgeval/common/tracer/span_transformer.py +16 -26
- judgeval/constants.py +1 -0
- judgeval/data/evaluation_run.py +104 -0
- judgeval/data/judgment_types.py +38 -8
- judgeval/data/trace.py +6 -122
- judgeval/data/trace_run.py +2 -3
- judgeval/dataset.py +2 -0
- judgeval/integrations/langgraph.py +2 -1
- judgeval/judges/litellm_judge.py +2 -1
- judgeval/judges/mixture_of_judges.py +2 -1
- judgeval/judges/utils.py +2 -1
- judgeval/judgment_client.py +113 -53
- judgeval/local_eval_queue.py +190 -0
- judgeval/run_evaluation.py +43 -197
- judgeval/scorers/base_scorer.py +9 -10
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +17 -3
- judgeval/scorers/score.py +33 -11
- judgeval/utils/async_utils.py +36 -0
- {judgeval-0.4.0.dist-info → judgeval-0.6.0.dist-info}/METADATA +11 -12
- {judgeval-0.4.0.dist-info → judgeval-0.6.0.dist-info}/RECORD +33 -27
- judgeval-0.6.0.dist-info/entry_points.txt +2 -0
- judgeval/evaluation_run.py +0 -76
- {judgeval-0.4.0.dist-info → judgeval-0.6.0.dist-info}/WHEEL +0 -0
- {judgeval-0.4.0.dist-info → judgeval-0.6.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/tracer/core.py
CHANGED
@@ -26,11 +26,15 @@ from typing import (
     Generator,
     List,
     Optional,
+    ParamSpec,
     Tuple,
+    TypeVar,
     Union,
     TypeAlias,
+    overload,
 )
 import types
+import random


 from judgeval.common.tracer.constants import _TRACE_FILEPATH_BLOCKLIST
@@ -38,40 +42,33 @@ from judgeval.common.tracer.constants import _TRACE_FILEPATH_BLOCKLIST
 from judgeval.common.tracer.otel_span_processor import JudgmentSpanProcessor
 from judgeval.common.tracer.span_processor import SpanProcessorBase
 from judgeval.common.tracer.trace_manager import TraceManagerClient
-from litellm import cost_per_token as _original_cost_per_token
-from openai import OpenAI, AsyncOpenAI
-from openai.types.chat.chat_completion import ChatCompletion
-from openai.types.responses.response import Response
-from openai.types.chat import ParsedChatCompletion
-from together import Together, AsyncTogether
-from anthropic import Anthropic, AsyncAnthropic
-from google import genai
-from groq import Groq, AsyncGroq

 from judgeval.data import Example, Trace, TraceSpan, TraceUsage
 from judgeval.scorers import APIScorerConfig, BaseScorer
-from judgeval.evaluation_run import EvaluationRun
-from judgeval.
+from judgeval.data.evaluation_run import EvaluationRun
+from judgeval.local_eval_queue import LocalEvaluationQueue
+from judgeval.common.api import JudgmentApiClient
+from judgeval.common.utils import OptExcInfo, validate_api_key
 from judgeval.common.logger import judgeval_logger

+from litellm import cost_per_token as _original_cost_per_token  # type: ignore
+from judgeval.common.tracer.providers import (
+    HAS_OPENAI,
+    HAS_TOGETHER,
+    HAS_ANTHROPIC,
+    HAS_GOOGLE_GENAI,
+    HAS_GROQ,
+    ApiClient,
+)
+from judgeval.constants import DEFAULT_GPT_MODEL
+

 current_trace_var = contextvars.ContextVar[Optional["TraceClient"]](
     "current_trace", default=None
 )
 current_span_var = contextvars.ContextVar[Optional[str]]("current_span", default=None)

-
-    OpenAI,
-    Together,
-    Anthropic,
-    AsyncOpenAI,
-    AsyncAnthropic,
-    AsyncTogether,
-    genai.Client,
-    genai.client.AsyncClient,
-    Groq,
-    AsyncGroq,
-]
+
 SpanType: TypeAlias = str

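The new `judgeval/common/tracer/providers.py` module (+119 lines, not shown in this diff) replaces the hard provider imports above with optional-dependency detection: each SDK is imported once, and a `HAS_*` flag plus provider-prefixed aliases record whether it is available. A minimal sketch of that pattern, assuming a try/except design; only the flag and alias names used by `core.py` are confirmed by this diff, the internals are a guess:

```python
# Sketch of the optional-provider pattern implied by the HAS_* flags and
# the provider-prefixed aliases (openai_OpenAI, groq_Groq, ...) imported
# by core.py. The real providers.py may be organized differently.
try:
    from openai import OpenAI as openai_OpenAI, AsyncOpenAI as openai_AsyncOpenAI

    HAS_OPENAI = True
except ImportError:
    # SDK not installed: keep the names defined so call sites can assert on them.
    openai_OpenAI = None  # type: ignore[assignment]
    openai_AsyncOpenAI = None  # type: ignore[assignment]
    HAS_OPENAI = False

try:
    from groq import Groq as groq_Groq, AsyncGroq as groq_AsyncGroq

    HAS_GROQ = True
except ImportError:
    groq_Groq = None  # type: ignore[assignment]
    groq_AsyncGroq = None  # type: ignore[assignment]
    HAS_GROQ = False
```

Deferring the imports this way means judgeval no longer fails at import time when only a subset of the provider SDKs is installed, which is why every use site in `core.py` below guards the import with `if HAS_...:` followed by an assert.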
@@ -113,10 +110,6 @@ class TraceClient:

         self.otel_span_processor = tracer.otel_span_processor

-        judgeval_logger.info(
-            f"🎯 TraceClient using span processor for trace {self.trace_id}"
-        )
-
     def get_current_span(self):
         """Get the current span from the context var"""
         return self.tracer.get_current_span()
@@ -181,85 +174,53 @@ class TraceClient:

     def async_evaluate(
         self,
-        example:
-        actual_output: Optional[Union[str, List[str]]] = None,
-        expected_output: Optional[Union[str, List[str]]] = None,
-        context: Optional[List[str]] = None,
-        retrieval_context: Optional[List[str]] = None,
-        tools_called: Optional[List[str]] = None,
-        expected_tools: Optional[List[str]] = None,
-        additional_metadata: Optional[Dict[str, Any]] = None,
-        model: Optional[str] = None,
-        span_id: Optional[str] = None,
+        scorer: Union[APIScorerConfig, BaseScorer],
+        example: Example,
+        model: str = DEFAULT_GPT_MODEL,
     ):
-        if not self.enable_evaluations:
-            return
-
         start_time = time.time()
+        span_id = self.get_current_span()
+        eval_run_name = (
+            f"{self.name.capitalize()}-{span_id}-{scorer.score_type.capitalize()}"
+        )
+        hosted_scoring = isinstance(scorer, APIScorerConfig) or (
+            isinstance(scorer, BaseScorer) and scorer.server_hosted
+        )
+        if hosted_scoring:
+            eval_run = EvaluationRun(
+                organization_id=self.tracer.organization_id,
+                project_name=self.project_name,
+                eval_name=eval_run_name,
+                examples=[example],
+                scorers=[scorer],
+                model=model,
+                trace_span_id=span_id,
+            )

-        if not scorers:
-            judgeval_logger.warning("No valid scorers available for evaluation")
-            return
-
-        except Exception as e:
-            judgeval_logger.warning(f"Failed to load scorers: {str(e)}")
-            return
-
-        if example is None:
-            if any(
-                param is not None
-                for param in [
-                    input,
-                    actual_output,
-                    expected_output,
-                    context,
-                    retrieval_context,
-                    tools_called,
-                    expected_tools,
-                    additional_metadata,
-                ]
-            ):
-                example = Example(
-                    input=input,
-                    actual_output=actual_output,
-                    expected_output=expected_output,
-                    context=context,
-                    retrieval_context=retrieval_context,
-                    tools_called=tools_called,
-                    expected_tools=expected_tools,
-                    additional_metadata=additional_metadata,
-                )
-            else:
-                raise ValueError(
-                    "Either 'example' or at least one of the individual parameters (input, actual_output, etc.) must be provided"
-                )
+            self.add_eval_run(eval_run, start_time)

+            if span_id:
+                current_span = self.span_id_to_span.get(span_id)
+                if current_span:
+                    self.otel_span_processor.queue_evaluation_run(
+                        eval_run, span_id=span_id, span_data=current_span
+                    )
+        else:
+            # Handle custom scorers using local evaluation queue
+            eval_run = EvaluationRun(
+                organization_id=self.tracer.organization_id,
+                project_name=self.project_name,
+                eval_name=eval_run_name,
+                examples=[example],
+                scorers=[scorer],
+                model=model,
+                trace_span_id=span_id,
+            )

+            self.add_eval_run(eval_run, start_time)

-        if current_span:
-            self.otel_span_processor.queue_evaluation_run(
-                eval_run, span_id=span_id_to_use, span_data=current_span
-            )
+            # Enqueue the evaluation run to the local evaluation queue
+            self.tracer.local_eval_queue.enqueue(eval_run)

     def add_eval_run(self, eval_run: EvaluationRun, start_time: float):
         current_span_id = eval_run.trace_span_id
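The method now takes a single scorer and a prebuilt `Example` instead of a dozen loose keyword arguments, and routes hosted scoring (`APIScorerConfig`, or a `BaseScorer` with `server_hosted=True`) through the OTel span processor while any other `BaseScorer` goes to the tracer's `LocalEvaluationQueue`. A hedged usage sketch; the scorer class and threshold parameter are illustrative, check the installed `judgeval.scorers` for exact exports:

```python
from judgeval.data import Example
from judgeval.scorers import AnswerRelevancyScorer  # illustrative scorer choice

example = Example(
    input="What is the capital of France?",
    actual_output="Paris is the capital of France.",
)

# Called from inside a traced span: scores this example against the
# current span; the span id is now read from context rather than passed in.
trace_client.async_evaluate(
    scorer=AnswerRelevancyScorer(threshold=0.7),
    example=example,
    model="gpt-4.1",  # omitted -> DEFAULT_GPT_MODEL
)
```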
@@ -290,6 +251,14 @@ class TraceClient:

         self.otel_span_processor.queue_span_update(span, span_state="agent_name")

+    def record_class_name(self, class_name: str):
+        current_span_id = self.get_current_span()
+        if current_span_id:
+            span = self.span_id_to_span[current_span_id]
+            span.class_name = class_name
+
+            self.otel_span_processor.queue_span_update(span, span_state="class_name")
+
     def record_state_before(self, state: dict):
         """Records the agent's state before a tool execution on the current span.

@@ -316,35 +285,13 @@ class TraceClient:

         self.otel_span_processor.queue_span_update(span, span_state="state_after")

-    async def _update_coroutine(self, span: TraceSpan, coroutine: Any, field: str):
-        """Helper method to update the output of a trace entry once the coroutine completes"""
-        try:
-            result = await coroutine
-            setattr(span, field, result)
-
-            if field == "output":
-                self.otel_span_processor.queue_span_update(span, span_state="output")
-
-            return result
-        except Exception as e:
-            setattr(span, field, f"Error: {str(e)}")
-
-            if field == "output":
-                self.otel_span_processor.queue_span_update(span, span_state="output")
-
-            raise
-
     def record_output(self, output: Any):
         current_span_id = self.get_current_span()
         if current_span_id:
             span = self.span_id_to_span[current_span_id]
-            span.output =
-
-            if inspect.iscoroutine(output):
-                asyncio.create_task(self._update_coroutine(span, output, "output"))
+            span.output = output

-
-            self.otel_span_processor.queue_span_update(span, span_state="output")
+            self.otel_span_processor.queue_span_update(span, span_state="output")

             return span
         return None
@@ -517,7 +464,7 @@ class TraceClient:


 def _capture_exception_for_trace(
-    current_trace: Optional[TraceClient], exc_info:
+    current_trace: Optional[TraceClient], exc_info: OptExcInfo
 ):
     if not current_trace:
         return
@@ -681,6 +628,7 @@ class _DeepTracer:

         qual_name = self._get_qual_name(frame)
         instance_name = None
+        class_name = None
         if "self" in frame.f_locals:
             instance = frame.f_locals["self"]
             class_name = instance.__class__.__name__
@@ -754,6 +702,7 @@ class _DeepTracer:
                 parent_span_id=parent_span_id,
                 function=qual_name,
                 agent_name=instance_name,
+                class_name=class_name,
             )
             current_trace.add_span(span)

@@ -841,6 +790,10 @@ class _DeepTracer:
         self._original_threading_trace = None


+T = TypeVar("T", bound=Callable[..., Any])
+P = ParamSpec("P")
+
+
 class Tracer:
     # Tracer.current_trace class variable is currently used in wrap()
     # TODO: Keep track of cross-context state for current trace and current span ID solely through class variables instead of instance variables?
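`T` (bound to `Callable[..., Any]`) and `P` let the decorators below keep the wrapped function's static type: a decorator annotated `T -> T` is signature-preserving for mypy/pyright. A self-contained illustration of the idiom (not judgeval code):

```python
from typing import Callable, ParamSpec, TypeVar

P = ParamSpec("P")
R = TypeVar("R")


def traced(func: Callable[P, R]) -> Callable[P, R]:
    """Identity-typed decorator: callers keep full argument/return types."""

    def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
        # ... span bookkeeping would go here ...
        return func(*args, **kwargs)

    return wrapper


@traced
def add(a: int, b: int) -> int:
    return a + b


result = add(1, 2)  # type checkers still see (int, int) -> int
```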
@@ -954,6 +907,15 @@ class Tracer:
             else:
                 self.otel_span_processor = SpanProcessorBase()

+            # Initialize local evaluation queue for custom scorers
+            self.local_eval_queue = LocalEvaluationQueue()
+
+            # Start workers with callback to log results only if monitoring is enabled
+            if enable_evaluations and enable_monitoring:
+                self.local_eval_queue.start_workers(
+                    callback=self._log_eval_results_callback
+                )
+
             atexit.register(self._cleanup_on_exit)
         except Exception as e:
             judgeval_logger.error(
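`LocalEvaluationQueue` lives in the new `judgeval/local_eval_queue.py` (+190 lines, not shown here); this file only exercises four methods: `enqueue()`, `start_workers(callback=...)`, `wait_for_completion()`, and `stop_workers()`. A minimal sketch of a queue with that surface, assuming a daemon-thread worker design; the real implementation (including the callback arguments, which in `core.py` are `(evaluation_run, scoring_results)`) may differ:

```python
import queue
import threading
from typing import Any, Callable, Optional


class SketchEvaluationQueue:
    """Background queue exposing the surface core.py relies on (assumed design)."""

    def __init__(self, num_workers: int = 1) -> None:
        self._tasks: "queue.Queue[Any]" = queue.Queue()
        self._num_workers = num_workers
        self._callback: Optional[Callable[..., None]] = None
        self._stop = threading.Event()

    def enqueue(self, eval_run: Any) -> None:
        self._tasks.put(eval_run)

    def start_workers(self, callback: Optional[Callable[..., None]] = None) -> None:
        self._callback = callback
        for _ in range(self._num_workers):
            threading.Thread(target=self._worker, daemon=True).start()

    def _worker(self) -> None:
        while not self._stop.is_set():
            try:
                run = self._tasks.get(timeout=0.1)
            except queue.Empty:
                continue
            try:
                if self._callback:
                    # The real queue presumably scores the run locally first,
                    # then invokes the callback with the scoring results.
                    self._callback(run)
            finally:
                self._tasks.task_done()

    def wait_for_completion(self) -> bool:
        self._tasks.join()  # blocks until every enqueued run is processed
        return True

    def stop_workers(self) -> None:
        self._stop.set()
```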
@@ -1089,10 +1051,10 @@ class Tracer:
         # Reset the context variable
         self.reset_current_trace(token)

-    def
+    def agent(
         self,
-        identifier: str,
-        track_state: bool = False,
+        identifier: Optional[str] = None,
+        track_state: Optional[bool] = False,
         track_attributes: Optional[List[str]] = None,
         field_mappings: Optional[Dict[str, str]] = None,
     ):
@@ -1130,11 +1092,18 @@ class Tracer:
                 "track_state": track_state,
                 "track_attributes": track_attributes,
                 "field_mappings": field_mappings or {},
+                "class_name": class_name,
             }
             return cls

         return decorator

+    def identify(self, *args, **kwargs):
+        judgeval_logger.warning(
+            "identify() is deprecated and may not be supported in future versions of judgeval. Use the agent() decorator instead."
+        )
+        return self.agent(*args, **kwargs)
+
     def _capture_instance_state(
         self, instance: Any, class_config: Dict[str, Any]
     ) -> Dict[str, Any]:
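`identify()` survives as a thin deprecation shim over the renamed `agent()` decorator. Typical usage, based on the config keys stored above (`identifier`, `track_state`, `field_mappings`); the import path and attribute name are assumptions drawn from judgeval's public API, not from this diff:

```python
from judgeval.common.tracer import Tracer  # path assumed from judgeval's docs

judgment = Tracer(project_name="my_project")


@judgment.agent(identifier="name", track_state=True)
class TravelAgent:
    def __init__(self, name: str) -> None:
        # The attribute named by `identifier` labels this instance's spans.
        self.name = name

    @judgment.observe(span_type="tool")
    def book(self, city: str) -> str:
        return f"booked {city}"


# Old code using @judgment.identify(...) still works, but logs a
# deprecation warning and forwards to agent().
```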
@@ -1189,11 +1158,24 @@ class Tracer:
         else:
             trace_client_instance.record_state_after(state)

+    @overload
+    def observe(
+        self, func: T, *, name: Optional[str] = None, span_type: SpanType = "span"
+    ) -> T: ...
+
+    @overload
+    def observe(
+        self,
+        *,
+        name: Optional[str] = None,
+        span_type: SpanType = "span",
+    ) -> Callable[[T], T]: ...
+
     def observe(
         self,
-        func=None,
+        func: Optional[T] = None,
         *,
-        name=None,
+        name: Optional[str] = None,
         span_type: SpanType = "span",
     ):
         """
@@ -1210,8 +1192,8 @@ class Tracer:
             return func if func else lambda f: f

         if func is None:
-            return lambda
-
+            return lambda func: self.observe(
+                func,
                 name=name,
                 span_type=span_type,
             )
@@ -1220,131 +1202,262 @@ class Tracer:
             original_span_name = name or func.__name__

             # Store custom attributes on the function object
-            func._judgment_span_name = original_span_name
-            func._judgment_span_type = span_type
+            func._judgment_span_name = original_span_name  # type: ignore
+            func._judgment_span_type = span_type  # type: ignore

         except Exception:
             return func

+        def _record_span_data(span, args, kwargs):
+            """Helper function to record inputs, agent info, and state on a span."""
+            # Get class and agent info
+            class_name = None
+            agent_name = None
+            if args and hasattr(args[0], "__class__"):
+                class_name = args[0].__class__.__name__
+                agent_name = get_instance_prefixed_name(
+                    args[0], class_name, self.class_identifiers
+                )
+
+            # Record inputs, agent name, class name
+            inputs = combine_args_kwargs(func, args, kwargs)
+            span.record_input(inputs)
+            if agent_name:
+                span.record_agent_name(agent_name)
+            if class_name and class_name in self.class_identifiers:
+                span.record_class_name(class_name)
+
+            # Capture state before execution
+            self._conditionally_capture_and_record_state(span, args, is_before=True)
+
+            return class_name, agent_name
+
+        def _finalize_span_data(span, result, args):
+            """Helper function to record outputs and final state on a span."""
+            # Record output
+            span.record_output(result)
+
+            # Capture state after execution
+            self._conditionally_capture_and_record_state(span, args, is_before=False)
+
+        def _cleanup_trace(current_trace, trace_token, wrapper_type="function"):
+            """Helper function to handle trace cleanup in finally blocks."""
+            try:
+                trace_id, server_response = current_trace.save(final_save=True)
+
+                complete_trace_data = {
+                    "trace_id": current_trace.trace_id,
+                    "name": current_trace.name,
+                    "project_name": current_trace.project_name,
+                    "created_at": datetime.fromtimestamp(
+                        current_trace.start_time or time.time(),
+                        timezone.utc,
+                    ).isoformat(),
+                    "duration": current_trace.get_duration(),
+                    "trace_spans": [
+                        span.model_dump() for span in current_trace.trace_spans
+                    ],
+                    "evaluation_runs": [
+                        run.model_dump() for run in current_trace.evaluation_runs
+                    ],
+                    "offline_mode": self.offline_mode,
+                    "parent_trace_id": current_trace.parent_trace_id,
+                    "parent_name": current_trace.parent_name,
+                    "customer_id": current_trace.customer_id,
+                    "tags": current_trace.tags,
+                    "metadata": current_trace.metadata,
+                    "update_id": current_trace.update_id,
+                }
+                self.traces.append(complete_trace_data)
+                self.reset_current_trace(trace_token)
+            except Exception as e:
+                judgeval_logger.warning(f"Issue with {wrapper_type} cleanup: {e}")
+
+        def _execute_in_span(
+            current_trace, span_name, span_type, execution_func, args, kwargs
+        ):
+            """Helper function to execute code within a span context."""
+            with current_trace.span(span_name, span_type=span_type) as span:
+                _record_span_data(span, args, kwargs)
+
+                try:
+                    result = execution_func()
+                    _finalize_span_data(span, result, args)
+                    return result
+                except Exception as e:
+                    _capture_exception_for_trace(current_trace, sys.exc_info())
+                    raise e
+
+        async def _execute_in_span_async(
+            current_trace, span_name, span_type, async_execution_func, args, kwargs
+        ):
+            """Helper function to execute async code within a span context."""
+            with current_trace.span(span_name, span_type=span_type) as span:
+                _record_span_data(span, args, kwargs)
+
+                try:
+                    result = await async_execution_func()
+                    _finalize_span_data(span, result, args)
+                    return result
+                except Exception as e:
+                    _capture_exception_for_trace(current_trace, sys.exc_info())
+                    raise e
+
+        def _create_new_trace(self, span_name):
+            """Helper function to create a new trace and set it as current."""
+            trace_id = str(uuid.uuid4())
+            project = self.project_name
+
+            current_trace = TraceClient(
+                self,
+                trace_id,
+                span_name,
+                project_name=project,
+                enable_monitoring=self.enable_monitoring,
+                enable_evaluations=self.enable_evaluations,
+            )
+
+            trace_token = self.set_current_trace(current_trace)
+            return current_trace, trace_token
+
+        def _execute_with_auto_trace_creation(
+            span_name, span_type, execution_func, args, kwargs
+        ):
+            """Helper function that handles automatic trace creation and span execution."""
+            current_trace = self.get_current_trace()
+
+            if not current_trace:
+                current_trace, trace_token = _create_new_trace(self, span_name)
+
+                try:
+                    result = _execute_in_span(
+                        current_trace,
+                        span_name,
+                        span_type,
+                        execution_func,
+                        args,
+                        kwargs,
                     )
+                    return result
+                finally:
+                    # Cleanup the trace we created
+                    _cleanup_trace(current_trace, trace_token, "auto_trace")
+            else:
+                # Use existing trace
+                return _execute_in_span(
+                    current_trace, span_name, span_type, execution_func, args, kwargs
+                )

-
+        async def _execute_with_auto_trace_creation_async(
+            span_name, span_type, async_execution_func, args, kwargs
+        ):
+            """Helper function that handles automatic trace creation and async span execution."""
+            current_trace = self.get_current_trace()

-            project = self.project_name
+            if not current_trace:
+                current_trace, trace_token = _create_new_trace(self, span_name)

+                try:
+                    result = await _execute_in_span_async(
+                        current_trace,
                         span_name,
+                        span_type,
+                        async_execution_func,
+                        args,
+                        kwargs,
                     )
+                    return result
+                finally:
+                    # Cleanup the trace we created
+                    _cleanup_trace(current_trace, trace_token, "async_auto_trace")
+            else:
+                # Use existing trace
+                return await _execute_in_span_async(
+                    current_trace,
+                    span_name,
+                    span_type,
+                    async_execution_func,
+                    args,
+                    kwargs,
+                )

-
+        # Check for generator functions first
+        if inspect.isgeneratorfunction(func):

-            if agent_name:
-                span.record_agent_name(agent_name)
-
-            self._conditionally_capture_and_record_state(
-                span, args, is_before=True
-            )
+            @functools.wraps(func)
+            def generator_wrapper(*args, **kwargs):
+                # Get the generator from the original function
+                generator = func(*args, **kwargs)

-            self._conditionally_capture_and_record_state(
-                span, args, is_before=False
+                # Create wrapper generator that creates spans for each yield
+                def traced_generator():
+                    while True:
+                        try:
+                            # Handle automatic trace creation and span execution
+                            item = _execute_with_auto_trace_creation(
+                                original_span_name,
+                                span_type,
+                                lambda: next(generator),
+                                args,
+                                kwargs,
                             )
+                            yield item
+                        except StopIteration:
+                            break
+
+                return traced_generator()
+
+            return generator_wrapper
+
+        # Check for async generator functions
+        elif inspect.isasyncgenfunction(func):

+            @functools.wraps(func)
+            def async_generator_wrapper(*args, **kwargs):
+                # Get the async generator from the original function
+                async_generator = func(*args, **kwargs)
+
+                # Create wrapper async generator that creates spans for each yield
+                async def traced_async_generator():
+                    while True:
                         try:
-                            "duration": current_trace.get_duration(),
-                            "trace_spans": [
-                                span.model_dump()
-                                for span in current_trace.trace_spans
-                            ],
-                            "offline_mode": self.offline_mode,
-                            "parent_trace_id": current_trace.parent_trace_id,
-                            "parent_name": current_trace.parent_name,
-                        }
-
-                        trace_id, server_response = current_trace.save(
-                            final_save=True
+                            # Handle automatic trace creation and span execution
+                            item = await _execute_with_auto_trace_creation_async(
+                                original_span_name,
+                                span_type,
+                                lambda: async_generator.__anext__(),
+                                args,
+                                kwargs,
                             )
+                            if inspect.iscoroutine(item):
+                                item = await item
+                            yield item
+                        except StopAsyncIteration:
+                            break

-
+                return traced_async_generator()

-            except Exception as e:
-                judgeval_logger.warning(f"Issue with async_wrapper: {e}")
-                pass
-        else:
-            with current_trace.span(span_name, span_type=span_type) as span:
-                inputs = combine_args_kwargs(func, args, kwargs)
-                span.record_input(inputs)
-                if agent_name:
-                    span.record_agent_name(agent_name)
-
-                # Capture state before execution
-                self._conditionally_capture_and_record_state(
-                    span, args, is_before=True
-                )
+            return async_generator_wrapper

-            if self.deep_tracing:
-                with _DeepTracer(self):
-                    result = await func(*args, **kwargs)
-            else:
-                result = await func(*args, **kwargs)
-        except Exception as e:
-            _capture_exception_for_trace(current_trace, sys.exc_info())
-            raise e
-
-        # Capture state after execution
-        self._conditionally_capture_and_record_state(
-            span, args, is_before=False
-        )
+        elif asyncio.iscoroutinefunction(func):

+            @functools.wraps(func)
+            async def async_wrapper(*args, **kwargs):
+                nonlocal original_span_name
+                span_name = original_span_name
+
+                async def async_execution():
+                    if self.deep_tracing:
+                        with _DeepTracer(self):
+                            return await func(*args, **kwargs)
+                    else:
+                        return await func(*args, **kwargs)
+
+                result = await _execute_with_auto_trace_creation_async(
+                    span_name, span_type, async_execution, args, kwargs
+                )
+
+                return result

             return async_wrapper
         else:
@@ -1352,122 +1465,18 @@ class Tracer:
         @functools.wraps(func)
         def wrapper(*args, **kwargs):
             nonlocal original_span_name
-            class_name = None
             span_name = original_span_name
-            agent_name = None
-            if args and hasattr(args[0], "__class__"):
-                class_name = args[0].__class__.__name__
-                agent_name = get_instance_prefixed_name(
-                    args[0], class_name, self.class_identifiers
-                )
-            # Get current trace from context
-            current_trace = self.get_current_trace()
-
-            # If there's no current trace, create a root trace
-            if not current_trace:
-                trace_id = str(uuid.uuid4())
-                project = self.project_name
-
-                # Create a new trace client to serve as the root
-                current_trace = TraceClient(
-                    self,
-                    trace_id,
-                    span_name,
-                    project_name=project,
-                    enable_monitoring=self.enable_monitoring,
-                    enable_evaluations=self.enable_evaluations,
-                )
-
-                trace_token = self.set_current_trace(current_trace)
-
-                try:
-                    with current_trace.span(span_name, span_type=span_type) as span:
-                        # Record inputs
-                        inputs = combine_args_kwargs(func, args, kwargs)
-                        span.record_input(inputs)
-                        if agent_name:
-                            span.record_agent_name(agent_name)
-                        # Capture state before execution
-                        self._conditionally_capture_and_record_state(
-                            span, args, is_before=True
-                        )
-
-                        try:
-                            if self.deep_tracing:
-                                with _DeepTracer(self):
-                                    result = func(*args, **kwargs)
-                            else:
-                                result = func(*args, **kwargs)
-                        except Exception as e:
-                            _capture_exception_for_trace(
-                                current_trace, sys.exc_info()
-                            )
-                            raise e
-
-                        # Capture state after execution
-                        self._conditionally_capture_and_record_state(
-                            span, args, is_before=False
-                        )
-
-                        # Record output
-                        span.record_output(result)
-                        return result
-                finally:
-                    try:
-                        trace_id, server_response = current_trace.save(
-                            final_save=True
-                        )

-                        ).isoformat(),
-                        "duration": current_trace.get_duration(),
-                        "trace_spans": [
-                            span.model_dump()
-                            for span in current_trace.trace_spans
-                        ],
-                        "offline_mode": self.offline_mode,
-                        "parent_trace_id": current_trace.parent_trace_id,
-                        "parent_name": current_trace.parent_name,
-                        }
-                        self.traces.append(complete_trace_data)
-                        self.reset_current_trace(trace_token)
-                    except Exception as e:
-                        judgeval_logger.warning(f"Issue with save: {e}")
-                        pass
-            else:
-                with current_trace.span(span_name, span_type=span_type) as span:
-                    inputs = combine_args_kwargs(func, args, kwargs)
-                    span.record_input(inputs)
-                    if agent_name:
-                        span.record_agent_name(agent_name)
-
-                    # Capture state before execution
-                    self._conditionally_capture_and_record_state(
-                        span, args, is_before=True
-                    )
+            def sync_execution():
+                if self.deep_tracing:
+                    with _DeepTracer(self):
+                        return func(*args, **kwargs)
+                else:
+                    return func(*args, **kwargs)

-                            result = func(*args, **kwargs)
-                        else:
-                            result = func(*args, **kwargs)
-                    except Exception as e:
-                        _capture_exception_for_trace(current_trace, sys.exc_info())
-                        raise e
-
-                    # Capture state after execution
-                    self._conditionally_capture_and_record_state(
-                        span, args, is_before=False
-                    )
-
-                    span.record_output(result)
-                    return result
+            return _execute_with_auto_trace_creation(
+                span_name, span_type, sync_execution, args, kwargs
+            )

         return wrapper

@@ -1532,15 +1541,51 @@ class Tracer:

         return decorate_class if cls is None else decorate_class(cls)

-    def async_evaluate(
+    def async_evaluate(
+        self,
+        scorer: Union[APIScorerConfig, BaseScorer],
+        example: Example,
+        model: str = DEFAULT_GPT_MODEL,
+        sampling_rate: float = 1,
+    ):
         try:
             if not self.enable_monitoring or not self.enable_evaluations:
                 return

+            if not isinstance(scorer, (APIScorerConfig, BaseScorer)):
+                judgeval_logger.warning(
+                    f"Scorer must be an instance of APIScorerConfig or BaseScorer, got {type(scorer)}, skipping evaluation"
+                )
+                return
+
+            if not isinstance(example, Example):
+                judgeval_logger.warning(
+                    f"Example must be an instance of Example, got {type(example)} skipping evaluation"
+                )
+                return
+
+            if sampling_rate < 0:
+                judgeval_logger.warning(
+                    "Cannot set sampling_rate below 0, skipping evaluation"
+                )
+                return
+
+            if sampling_rate > 1:
+                judgeval_logger.warning(
+                    "Cannot set sampling_rate above 1, skipping evaluation"
+                )
+                return
+
+            percentage = random.uniform(0, 1)
+            if percentage > sampling_rate:
+                judgeval_logger.info("Skipping async_evaluate due to sampling rate")
+                return

+            current_trace = self.get_current_trace()
             if current_trace:
-                current_trace.async_evaluate(
+                current_trace.async_evaluate(
+                    scorer=scorer, example=example, model=model
+                )
             else:
                 judgeval_logger.warning(
                     "No trace found (context var or fallback), skipping evaluation"
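`sampling_rate` is bounds-checked to [0, 1] and the evaluation is dropped whenever `random.uniform(0, 1)` exceeds it, so the value is the approximate fraction of calls that actually get scored. For example:

```python
# Score roughly 10% of production calls to bound evaluation cost.
judgment.async_evaluate(
    scorer=my_scorer,  # any APIScorerConfig or BaseScorer instance
    example=example,
    sampling_rate=0.10,
)
# sampling_rate=1 (the default) evaluates every call; 0 effectively evaluates none.
```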
@@ -1613,9 +1658,68 @@ class Tracer:
         self.otel_span_processor.shutdown()
         self.otel_span_processor = SpanProcessorBase()

+    def wait_for_completion(self, timeout: Optional[float] = 30.0) -> bool:
+        """Wait for all evaluations and span processing to complete.
+
+        This method blocks until all queued evaluations are processed and
+        all pending spans are flushed to the server.
+
+        Args:
+            timeout: Maximum time to wait in seconds. Defaults to 30 seconds.
+                None means wait indefinitely.
+
+        Returns:
+            True if all processing completed within the timeout, False otherwise.
+        """
+        try:
+            judgeval_logger.debug(
+                "Waiting for all evaluations and spans to complete..."
+            )
+
+            # Wait for all queued evaluation work to complete
+            eval_completed = self.local_eval_queue.wait_for_completion()
+            if not eval_completed:
+                judgeval_logger.warning(
+                    f"Local evaluation queue did not complete within {timeout} seconds"
+                )
+                return False
+
+            self.flush_background_spans()
+
+            judgeval_logger.debug("All evaluations and spans completed successfully")
+            return True
+
+        except Exception as e:
+            judgeval_logger.warning(f"Error while waiting for completion: {e}")
+            return False
+
+    def _log_eval_results_callback(self, evaluation_run, scoring_results):
+        """Callback to log evaluation results after local processing."""
+        try:
+            if scoring_results and self.enable_evaluations and self.enable_monitoring:
+                # Convert scoring results to the format expected by API client
+                results_dict = [
+                    result.model_dump(warnings=False) for result in scoring_results
+                ]
+                api_client = JudgmentApiClient(self.api_key, self.organization_id)
+                api_client.log_evaluation_results(
+                    results_dict, evaluation_run.model_dump(warnings=False)
+                )
+        except Exception as e:
+            judgeval_logger.warning(f"Failed to log local evaluation results: {e}")
+
     def _cleanup_on_exit(self):
         """Cleanup handler called on application exit to ensure spans are flushed."""
         try:
+            # Wait for all queued evaluation work to complete before stopping
+            completed = self.local_eval_queue.wait_for_completion()
+            if not completed:
+                judgeval_logger.warning(
+                    "Local evaluation queue did not complete within 30 seconds"
+                )
+
+            self.local_eval_queue.stop_workers()
             self.flush_background_spans()
         except Exception as e:
             judgeval_logger.warning(f"Error during tracer cleanup: {e}")
@@ -1697,33 +1801,76 @@ def wrap(

     return wrapper

-    if
-        client.beta.chat.completions, "parse",
-        )
-    elif isinstance(client, (Together)):
-        setattr(client.chat.completions, "create", wrapped(original_create))
-    elif isinstance(client, (AsyncTogether)):
-        setattr(client.chat.completions, "create", wrapped_async(original_create))
-    elif isinstance(client, (Anthropic)):
-        setattr(client.messages, "create", wrapped(original_create))
-    elif isinstance(client, (AsyncAnthropic)):
-        setattr(client.messages, "create", wrapped_async(original_create))
-    elif isinstance(client, (genai.Client)):
-        setattr(client.models, "generate_content", wrapped(original_create))
-    elif isinstance(client, (genai.client.AsyncClient)):
-        setattr(client.models, "generate_content", wrapped_async(original_create))
-    elif isinstance(client, (Groq)):
-        setattr(client.chat.completions, "create", wrapped(original_create))
-    elif isinstance(client, (AsyncGroq)):
-        setattr(client.chat.completions, "create", wrapped_async(original_create))
+    if HAS_OPENAI:
+        from judgeval.common.tracer.providers import openai_OpenAI, openai_AsyncOpenAI
+
+        assert openai_OpenAI is not None, "OpenAI client not found"
+        assert openai_AsyncOpenAI is not None, "OpenAI async client not found"
+        if isinstance(client, (openai_OpenAI)):
+            setattr(client.chat.completions, "create", wrapped(original_create))
+            setattr(client.responses, "create", wrapped(original_responses_create))
+            setattr(client.beta.chat.completions, "parse", wrapped(original_beta_parse))
+        elif isinstance(client, (openai_AsyncOpenAI)):
+            setattr(client.chat.completions, "create", wrapped_async(original_create))
+            setattr(
+                client.responses, "create", wrapped_async(original_responses_create)
+            )
+            setattr(
+                client.beta.chat.completions,
+                "parse",
+                wrapped_async(original_beta_parse),
+            )
+
+    if HAS_TOGETHER:
+        from judgeval.common.tracer.providers import (
+            together_Together,
+            together_AsyncTogether,
+        )
+
+        assert together_Together is not None, "Together client not found"
+        assert together_AsyncTogether is not None, "Together async client not found"
+        if isinstance(client, (together_Together)):
+            setattr(client.chat.completions, "create", wrapped(original_create))
+        elif isinstance(client, (together_AsyncTogether)):
+            setattr(client.chat.completions, "create", wrapped_async(original_create))
+
+    if HAS_ANTHROPIC:
+        from judgeval.common.tracer.providers import (
+            anthropic_Anthropic,
+            anthropic_AsyncAnthropic,
+        )
+
+        assert anthropic_Anthropic is not None, "Anthropic client not found"
+        assert anthropic_AsyncAnthropic is not None, "Anthropic async client not found"
+        if isinstance(client, (anthropic_Anthropic)):
+            setattr(client.messages, "create", wrapped(original_create))
+        elif isinstance(client, (anthropic_AsyncAnthropic)):
+            setattr(client.messages, "create", wrapped_async(original_create))
+
+    if HAS_GOOGLE_GENAI:
+        from judgeval.common.tracer.providers import (
+            google_genai_Client,
+            google_genai_AsyncClient,
+        )
+
+        assert google_genai_Client is not None, "Google GenAI client not found"
+        assert google_genai_AsyncClient is not None, (
+            "Google GenAI async client not found"
+        )
+        if isinstance(client, (google_genai_Client)):
+            setattr(client.models, "generate_content", wrapped(original_create))
+        elif isinstance(client, (google_genai_AsyncClient)):
+            setattr(client.models, "generate_content", wrapped_async(original_create))
+
+    if HAS_GROQ:
+        from judgeval.common.tracer.providers import groq_Groq, groq_AsyncGroq
+
+        assert groq_Groq is not None, "Groq client not found"
+        assert groq_AsyncGroq is not None, "Groq async client not found"
+        if isinstance(client, (groq_Groq)):
+            setattr(client.chat.completions, "create", wrapped(original_create))
+        elif isinstance(client, (groq_AsyncGroq)):
+            setattr(client.chat.completions, "create", wrapped_async(original_create))

     return client

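`wrap()` monkey-patches the client's completion entry points in place and returns the same object; dispatch now runs through the `HAS_*` guards so each branch only executes when that SDK is importable. Typical usage, with import paths assumed from judgeval's public API:

```python
from openai import OpenAI
from judgeval.common.tracer import Tracer, wrap  # paths assumed

judgment = Tracer(project_name="my_project")
# Patches chat.completions.create, responses.create, and
# beta.chat.completions.parse on this specific client instance.
client = wrap(OpenAI())


@judgment.observe(span_type="function")
def ask(question: str) -> str:
    resp = client.chat.completions.create(
        model="gpt-4.1",
        messages=[{"role": "user", "content": question}],
    )
    return resp.choices[0].message.content  # call is recorded as an LLM span
```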
@@ -1749,28 +1896,87 @@ def _get_client_config(
     Raises:
         ValueError: If client type is not supported
     """

+    if HAS_OPENAI:
+        from judgeval.common.tracer.providers import openai_OpenAI, openai_AsyncOpenAI
+
+        assert openai_OpenAI is not None, "OpenAI client not found"
+        assert openai_AsyncOpenAI is not None, "OpenAI async client not found"
+        if isinstance(client, (openai_OpenAI)):
+            return (
+                "OPENAI_API_CALL",
+                client.chat.completions.create,
+                client.responses.create,
+                None,
+                client.beta.chat.completions.parse,
+            )
+        elif isinstance(client, (openai_AsyncOpenAI)):
+            return (
+                "OPENAI_API_CALL",
+                client.chat.completions.create,
+                client.responses.create,
+                None,
+                client.beta.chat.completions.parse,
+            )
+    if HAS_TOGETHER:
+        from judgeval.common.tracer.providers import (
+            together_Together,
+            together_AsyncTogether,
+        )
+
+        assert together_Together is not None, "Together client not found"
+        assert together_AsyncTogether is not None, "Together async client not found"
+        if isinstance(client, (together_Together)):
+            return "TOGETHER_API_CALL", client.chat.completions.create, None, None, None
+        elif isinstance(client, (together_AsyncTogether)):
+            return "TOGETHER_API_CALL", client.chat.completions.create, None, None, None
+    if HAS_ANTHROPIC:
+        from judgeval.common.tracer.providers import (
+            anthropic_Anthropic,
+            anthropic_AsyncAnthropic,
+        )
+
+        assert anthropic_Anthropic is not None, "Anthropic client not found"
+        assert anthropic_AsyncAnthropic is not None, "Anthropic async client not found"
+        if isinstance(client, (anthropic_Anthropic)):
+            return (
+                "ANTHROPIC_API_CALL",
+                client.messages.create,
+                None,
+                client.messages.stream,
+                None,
+            )
+        elif isinstance(client, (anthropic_AsyncAnthropic)):
+            return (
+                "ANTHROPIC_API_CALL",
+                client.messages.create,
+                None,
+                client.messages.stream,
+                None,
+            )
+    if HAS_GOOGLE_GENAI:
+        from judgeval.common.tracer.providers import (
+            google_genai_Client,
+            google_genai_AsyncClient,
+        )
+
+        assert google_genai_Client is not None, "Google GenAI client not found"
+        assert google_genai_AsyncClient is not None, (
+            "Google GenAI async client not found"
+        )
+        if isinstance(client, (google_genai_Client)):
+            return "GOOGLE_API_CALL", client.models.generate_content, None, None, None
+        elif isinstance(client, (google_genai_AsyncClient)):
+            return "GOOGLE_API_CALL", client.models.generate_content, None, None, None
+    if HAS_GROQ:
+        from judgeval.common.tracer.providers import groq_Groq, groq_AsyncGroq
+
+        assert groq_Groq is not None, "Groq client not found"
+        assert groq_AsyncGroq is not None, "Groq async client not found"
+        if isinstance(client, (groq_Groq)):
+            return "GROQ_API_CALL", client.chat.completions.create, None, None, None
+        elif isinstance(client, (groq_AsyncGroq)):
+            return "GROQ_API_CALL", client.chat.completions.create, None, None, None
     raise ValueError(f"Unsupported client type: {type(client)}")

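Every branch returns the same 5-tuple. Reading the populated positions across providers, the layout appears to be `(span_name, create_fn, responses_create_fn, stream_fn, beta_parse_fn)`, with `None` for entry points a provider lacks; this layout is inferred from the returns above and from the `original_*` names unpacked in `wrap()`:

```python
# Inferred tuple layout (positions deduced from the returns above):
(
    span_name,
    original_create,
    original_responses_create,
    original_stream,
    original_beta_parse,
) = _get_client_config(client)
# OpenAI:    ("OPENAI_API_CALL",    create, responses.create, None,   beta.parse)
# Anthropic: ("ANTHROPIC_API_CALL", create, None,             stream, None)
# Groq:      ("GROQ_API_CALL",      create, None,             None,   None)
```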
@@ -1794,73 +2000,173 @@ def _format_output_data(
     model_name = None
     message_content = None

+    if HAS_OPENAI:
+        from judgeval.common.tracer.providers import (
+            openai_OpenAI,
+            openai_AsyncOpenAI,
+            openai_ChatCompletion,
+            openai_Response,
+            openai_ParsedChatCompletion,
+        )
+
+        assert openai_OpenAI is not None, "OpenAI client not found"
+        assert openai_AsyncOpenAI is not None, "OpenAI async client not found"
+        assert openai_ChatCompletion is not None, "OpenAI chat completion not found"
+        assert openai_Response is not None, "OpenAI response not found"
+        assert openai_ParsedChatCompletion is not None, (
+            "OpenAI parsed chat completion not found"
+        )
+
+        if isinstance(client, (openai_OpenAI, openai_AsyncOpenAI)):
+            if isinstance(response, openai_ChatCompletion):
+                model_name = response.model
+                prompt_tokens = response.usage.prompt_tokens if response.usage else 0
+                completion_tokens = (
+                    response.usage.completion_tokens if response.usage else 0
+                )
+                cache_read_input_tokens = (
+                    response.usage.prompt_tokens_details.cached_tokens
+                    if response.usage
+                    and response.usage.prompt_tokens_details
+                    and response.usage.prompt_tokens_details.cached_tokens
+                    else 0
+                )
+
+                if isinstance(response, openai_ParsedChatCompletion):
+                    message_content = response.choices[0].message.parsed
+                else:
+                    message_content = response.choices[0].message.content
+            elif isinstance(response, openai_Response):
+                model_name = response.model
+                prompt_tokens = response.usage.input_tokens if response.usage else 0
+                completion_tokens = (
+                    response.usage.output_tokens if response.usage else 0
+                )
+                cache_read_input_tokens = (
+                    response.usage.input_tokens_details.cached_tokens
+                    if response.usage and response.usage.input_tokens_details
+                    else 0
+                )
+                if hasattr(response.output[0], "content"):
+                    message_content = "".join(
+                        seg.text
+                        for seg in response.output[0].content
+                        if hasattr(seg, "text")
+                    )
+            # Note: LiteLLM seems to use cache_read_input_tokens to calculate the cost for OpenAI
+            return message_content, _create_usage(
+                model_name,
+                prompt_tokens,
+                completion_tokens,
+                cache_read_input_tokens,
+                cache_creation_input_tokens,
             )

+    if HAS_TOGETHER:
+        from judgeval.common.tracer.providers import (
+            together_Together,
+            together_AsyncTogether,
+        )
+
+        assert together_Together is not None, "Together client not found"
+        assert together_AsyncTogether is not None, "Together async client not found"
+
+        if isinstance(client, (together_Together, together_AsyncTogether)):
+            model_name = "together_ai/" + response.model
+            prompt_tokens = response.usage.prompt_tokens
+            completion_tokens = response.usage.completion_tokens
+            message_content = response.choices[0].message.content
+
+            # As of 2025-07-14, Together does not do any input cache token tracking
+            return message_content, _create_usage(
+                model_name,
+                prompt_tokens,
+                completion_tokens,
+                cache_read_input_tokens,
+                cache_creation_input_tokens,
             )

+    if HAS_GOOGLE_GENAI:
+        from judgeval.common.tracer.providers import (
+            google_genai_Client,
+            google_genai_AsyncClient,
+        )
+
+        assert google_genai_Client is not None, "Google GenAI client not found"
+        assert google_genai_AsyncClient is not None, (
+            "Google GenAI async client not found"
+        )
+        if isinstance(client, (google_genai_Client, google_genai_AsyncClient)):
+            model_name = response.model_version
+            prompt_tokens = response.usage_metadata.prompt_token_count
+            completion_tokens = response.usage_metadata.candidates_token_count
+            message_content = response.candidates[0].content.parts[0].text
+
+            if hasattr(response.usage_metadata, "cached_content_token_count"):
+                cache_read_input_tokens = (
+                    response.usage_metadata.cached_content_token_count
+                )
+            return message_content, _create_usage(
+                model_name,
+                prompt_tokens,
+                completion_tokens,
+                cache_read_input_tokens,
+                cache_creation_input_tokens,
+            )
+
+    if HAS_ANTHROPIC:
+        from judgeval.common.tracer.providers import (
+            anthropic_Anthropic,
+            anthropic_AsyncAnthropic,
+        )
+
+        assert anthropic_Anthropic is not None, "Anthropic client not found"
+        assert anthropic_AsyncAnthropic is not None, "Anthropic async client not found"
+        if isinstance(client, (anthropic_Anthropic, anthropic_AsyncAnthropic)):
             model_name = response.model
-            prompt_tokens = response.usage.input_tokens
-            completion_tokens = response.usage.output_tokens
-            cache_read_input_tokens =
+            prompt_tokens = response.usage.input_tokens
+            completion_tokens = response.usage.output_tokens
+            cache_read_input_tokens = response.usage.cache_read_input_tokens
+            cache_creation_input_tokens = response.usage.cache_creation_input_tokens
+            message_content = response.content[0].text
+            return message_content, _create_usage(
+                model_name,
+                prompt_tokens,
+                completion_tokens,
+                cache_read_input_tokens,
+                cache_creation_input_tokens,
             )
-        if hasattr(response.output[0], "content"):
-            message_content = "".join(
-                seg.text
-                for seg in response.output[0].content
-                if hasattr(seg, "text")
-            )

-        cache_read_input_tokens = response.usage.cache_read_input_tokens
-        cache_creation_input_tokens = response.usage.cache_creation_input_tokens
-        message_content = response.content[0].text
-    elif isinstance(client, (Groq, AsyncGroq)):
-        model_name = "groq/" + response.model
-        prompt_tokens = response.usage.prompt_tokens
-        completion_tokens = response.usage.completion_tokens
-        message_content = response.choices[0].message.content
-    else:
-        judgeval_logger.warning(f"Unsupported client type: {type(client)}")
-        return None, None
+    if HAS_GROQ:
+        from judgeval.common.tracer.providers import groq_Groq, groq_AsyncGroq
+
+        assert groq_Groq is not None, "Groq client not found"
+        assert groq_AsyncGroq is not None, "Groq async client not found"
+        if isinstance(client, (groq_Groq, groq_AsyncGroq)):
+            model_name = "groq/" + response.model
+            prompt_tokens = response.usage.prompt_tokens
+            completion_tokens = response.usage.completion_tokens
+            message_content = response.choices[0].message.content
+            return message_content, _create_usage(
+                model_name,
+                prompt_tokens,
+                completion_tokens,
+                cache_read_input_tokens,
+                cache_creation_input_tokens,
+            )
+
+    judgeval_logger.warning(f"Unsupported client type: {type(client)}")
+    return None, None

+
+def _create_usage(
+    model_name: str,
+    prompt_tokens: int,
+    completion_tokens: int,
+    cache_read_input_tokens: int = 0,
+    cache_creation_input_tokens: int = 0,
+) -> TraceUsage:
+    """Helper function to create TraceUsage object with cost calculation."""
     prompt_cost, completion_cost = cost_per_token(
         model=model_name,
         prompt_tokens=prompt_tokens,
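The new `_create_usage` helper centralizes the per-provider copies of this token accounting: it feeds the counts to LiteLLM's `cost_per_token` and packs everything into a `TraceUsage`. `cost_per_token` returns a `(prompt_cost, completion_cost)` pair in USD, so the total is their sum:

```python
from litellm import cost_per_token

# Same call _create_usage makes (cache-token kwargs omitted here).
prompt_cost, completion_cost = cost_per_token(
    model="gpt-4.1",
    prompt_tokens=1_200,
    completion_tokens=300,
)
print(f"total: ${prompt_cost + completion_cost:.6f}")
```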
@@ -1871,7 +2177,7 @@ def _format_output_data(
     total_cost_usd = (
         (prompt_cost + completion_cost) if prompt_cost and completion_cost else None
     )
-
+    return TraceUsage(
         prompt_tokens=prompt_tokens,
         completion_tokens=completion_tokens,
         total_tokens=prompt_tokens + completion_tokens,
@@ -1882,7 +2188,6 @@ def _format_output_data(
         total_cost_usd=total_cost_usd,
         model_name=model_name,
     )
-    return message_content, usage


 def combine_args_kwargs(func, args, kwargs):
@@ -1940,13 +2245,13 @@ def get_instance_prefixed_name(instance, class_name, class_identifiers):
     """
     if class_name in class_identifiers:
         class_config = class_identifiers[class_name]
-        attr = class_config
+        attr = class_config.get("identifier")
+        if attr:
+            if hasattr(instance, attr) and not callable(getattr(instance, attr)):
+                instance_name = getattr(instance, attr)
+                return instance_name
+            else:
+                raise Exception(
+                    f"Attribute {attr} does not exist for {class_name}. Check your agent() decorator."
+                )
+    return None