judgeval 0.4.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -26,11 +26,15 @@ from typing import (
     Generator,
     List,
     Optional,
+    ParamSpec,
     Tuple,
+    TypeVar,
     Union,
     TypeAlias,
+    overload,
 )
 import types
+import random


 from judgeval.common.tracer.constants import _TRACE_FILEPATH_BLOCKLIST
@@ -38,40 +42,33 @@ from judgeval.common.tracer.constants import _TRACE_FILEPATH_BLOCKLIST
 from judgeval.common.tracer.otel_span_processor import JudgmentSpanProcessor
 from judgeval.common.tracer.span_processor import SpanProcessorBase
 from judgeval.common.tracer.trace_manager import TraceManagerClient
-from litellm import cost_per_token as _original_cost_per_token
-from openai import OpenAI, AsyncOpenAI
-from openai.types.chat.chat_completion import ChatCompletion
-from openai.types.responses.response import Response
-from openai.types.chat import ParsedChatCompletion
-from together import Together, AsyncTogether
-from anthropic import Anthropic, AsyncAnthropic
-from google import genai
-from groq import Groq, AsyncGroq

 from judgeval.data import Example, Trace, TraceSpan, TraceUsage
 from judgeval.scorers import APIScorerConfig, BaseScorer
-from judgeval.evaluation_run import EvaluationRun
-from judgeval.common.utils import ExcInfo, validate_api_key
+from judgeval.data.evaluation_run import EvaluationRun
+from judgeval.local_eval_queue import LocalEvaluationQueue
+from judgeval.common.api import JudgmentApiClient
+from judgeval.common.utils import OptExcInfo, validate_api_key
 from judgeval.common.logger import judgeval_logger

+from litellm import cost_per_token as _original_cost_per_token  # type: ignore
+from judgeval.common.tracer.providers import (
+    HAS_OPENAI,
+    HAS_TOGETHER,
+    HAS_ANTHROPIC,
+    HAS_GOOGLE_GENAI,
+    HAS_GROQ,
+    ApiClient,
+)
+from judgeval.constants import DEFAULT_GPT_MODEL
+

 current_trace_var = contextvars.ContextVar[Optional["TraceClient"]](
     "current_trace", default=None
 )
 current_span_var = contextvars.ContextVar[Optional[str]]("current_span", default=None)

-ApiClient: TypeAlias = Union[
-    OpenAI,
-    Together,
-    Anthropic,
-    AsyncOpenAI,
-    AsyncAnthropic,
-    AsyncTogether,
-    genai.Client,
-    genai.client.AsyncClient,
-    Groq,
-    AsyncGroq,
-]
+
 SpanType: TypeAlias = str


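
Note: the hard top-level provider imports are replaced by a judgeval.common.tracer.providers module that exposes HAS_* availability flags and lazily bound client symbols (and now owns the ApiClient alias), so none of the provider SDKs is a required dependency. The module itself is not shown in this diff; a minimal sketch of the guard pattern it implies, assuming prefixed re-exports:

    # Hypothetical shape of judgeval/common/tracer/providers.py
    try:
        from openai import OpenAI as openai_OpenAI, AsyncOpenAI as openai_AsyncOpenAI
        HAS_OPENAI = True
    except ImportError:
        # Symbols stay defined (as None) so callers can import them unconditionally
        openai_OpenAI = None
        openai_AsyncOpenAI = None
        HAS_OPENAI = False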
@@ -113,10 +110,6 @@ class TraceClient:

         self.otel_span_processor = tracer.otel_span_processor

-        judgeval_logger.info(
-            f"🎯 TraceClient using span processor for trace {self.trace_id}"
-        )
-
     def get_current_span(self):
         """Get the current span from the context var"""
         return self.tracer.get_current_span()
@@ -181,85 +174,53 @@ class TraceClient:

     def async_evaluate(
         self,
-        scorers: List[Union[APIScorerConfig, BaseScorer]],
-        example: Optional[Example] = None,
-        input: Optional[str] = None,
-        actual_output: Optional[Union[str, List[str]]] = None,
-        expected_output: Optional[Union[str, List[str]]] = None,
-        context: Optional[List[str]] = None,
-        retrieval_context: Optional[List[str]] = None,
-        tools_called: Optional[List[str]] = None,
-        expected_tools: Optional[List[str]] = None,
-        additional_metadata: Optional[Dict[str, Any]] = None,
-        model: Optional[str] = None,
-        span_id: Optional[str] = None,
+        scorer: Union[APIScorerConfig, BaseScorer],
+        example: Example,
+        model: str = DEFAULT_GPT_MODEL,
     ):
-        if not self.enable_evaluations:
-            return
-
         start_time = time.time()
+        span_id = self.get_current_span()
+        eval_run_name = (
+            f"{self.name.capitalize()}-{span_id}-{scorer.score_type.capitalize()}"
+        )
+        hosted_scoring = isinstance(scorer, APIScorerConfig) or (
+            isinstance(scorer, BaseScorer) and scorer.server_hosted
+        )
+        if hosted_scoring:
+            eval_run = EvaluationRun(
+                organization_id=self.tracer.organization_id,
+                project_name=self.project_name,
+                eval_name=eval_run_name,
+                examples=[example],
+                scorers=[scorer],
+                model=model,
+                trace_span_id=span_id,
+            )

-        try:
-            if not scorers:
-                judgeval_logger.warning("No valid scorers available for evaluation")
-                return
-
-        except Exception as e:
-            judgeval_logger.warning(f"Failed to load scorers: {str(e)}")
-            return
-
-        if example is None:
-            if any(
-                param is not None
-                for param in [
-                    input,
-                    actual_output,
-                    expected_output,
-                    context,
-                    retrieval_context,
-                    tools_called,
-                    expected_tools,
-                    additional_metadata,
-                ]
-            ):
-                example = Example(
-                    input=input,
-                    actual_output=actual_output,
-                    expected_output=expected_output,
-                    context=context,
-                    retrieval_context=retrieval_context,
-                    tools_called=tools_called,
-                    expected_tools=expected_tools,
-                    additional_metadata=additional_metadata,
-                )
-            else:
-                raise ValueError(
-                    "Either 'example' or at least one of the individual parameters (input, actual_output, etc.) must be provided"
-                )
+            self.add_eval_run(eval_run, start_time)

-        span_id_to_use = span_id if span_id is not None else self.get_current_span()
-
-        eval_run = EvaluationRun(
-            organization_id=self.tracer.organization_id,
-            project_name=self.project_name,
-            eval_name=f"{self.name.capitalize()}-"
-            f"{span_id_to_use}-"
-            f"[{','.join(scorer.score_type.capitalize() for scorer in scorers)}]",
-            examples=[example],
-            scorers=scorers,
-            model=model,
-            judgment_api_key=self.tracer.api_key,
-            trace_span_id=span_id_to_use,
-        )
+            if span_id:
+                current_span = self.span_id_to_span.get(span_id)
+                if current_span:
+                    self.otel_span_processor.queue_evaluation_run(
+                        eval_run, span_id=span_id, span_data=current_span
+                    )
+        else:
+            # Handle custom scorers using local evaluation queue
+            eval_run = EvaluationRun(
+                organization_id=self.tracer.organization_id,
+                project_name=self.project_name,
+                eval_name=eval_run_name,
+                examples=[example],
+                scorers=[scorer],
+                model=model,
+                trace_span_id=span_id,
+            )

-        self.add_eval_run(eval_run, start_time)
+            self.add_eval_run(eval_run, start_time)

-        if span_id_to_use:
-            current_span = self.span_id_to_span.get(span_id_to_use)
-            if current_span:
-                self.otel_span_processor.queue_evaluation_run(
-                    eval_run, span_id=span_id_to_use, span_data=current_span
-                )
+            # Enqueue the evaluation run to the local evaluation queue
+            self.tracer.local_eval_queue.enqueue(eval_run)

     def add_eval_run(self, eval_run: EvaluationRun, start_time: float):
         current_span_id = eval_run.trace_span_id
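
Note: TraceClient.async_evaluate now takes exactly one scorer and a required Example. Server-hosted scorers (APIScorerConfig, or a BaseScorer with server_hosted set) are queued through the OTel span processor; other custom scorers are enqueued on the tracer's in-process LocalEvaluationQueue. A sketch of the new call shape, where the scorer and trace_client objects are placeholders:

    from judgeval.data import Example

    example = Example(input="What is 2 + 2?", actual_output="4")

    # 0.4.0 accepted scorers=[...] plus loose kwargs (input=, actual_output=, ...);
    # 0.6.0 takes one scorer and a prebuilt Example per call.
    trace_client.async_evaluate(scorer=my_scorer, example=example)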
@@ -290,6 +251,14 @@ class TraceClient:

         self.otel_span_processor.queue_span_update(span, span_state="agent_name")

+    def record_class_name(self, class_name: str):
+        current_span_id = self.get_current_span()
+        if current_span_id:
+            span = self.span_id_to_span[current_span_id]
+            span.class_name = class_name
+
+            self.otel_span_processor.queue_span_update(span, span_state="class_name")
+
     def record_state_before(self, state: dict):
         """Records the agent's state before a tool execution on the current span.

@@ -316,35 +285,13 @@ class TraceClient:

         self.otel_span_processor.queue_span_update(span, span_state="state_after")

-    async def _update_coroutine(self, span: TraceSpan, coroutine: Any, field: str):
-        """Helper method to update the output of a trace entry once the coroutine completes"""
-        try:
-            result = await coroutine
-            setattr(span, field, result)
-
-            if field == "output":
-                self.otel_span_processor.queue_span_update(span, span_state="output")
-
-            return result
-        except Exception as e:
-            setattr(span, field, f"Error: {str(e)}")
-
-            if field == "output":
-                self.otel_span_processor.queue_span_update(span, span_state="output")
-
-            raise
-
     def record_output(self, output: Any):
         current_span_id = self.get_current_span()
         if current_span_id:
             span = self.span_id_to_span[current_span_id]
-            span.output = "<pending>" if inspect.iscoroutine(output) else output
-
-            if inspect.iscoroutine(output):
-                asyncio.create_task(self._update_coroutine(span, output, "output"))
+            span.output = output

-            if not inspect.iscoroutine(output):
-                self.otel_span_processor.queue_span_update(span, span_state="output")
+            self.otel_span_processor.queue_span_update(span, span_state="output")

             return span
         return None
@@ -517,7 +464,7 @@ class TraceClient:


 def _capture_exception_for_trace(
-    current_trace: Optional[TraceClient], exc_info: ExcInfo
+    current_trace: Optional[TraceClient], exc_info: OptExcInfo
 ):
     if not current_trace:
         return
@@ -681,6 +628,7 @@ class _DeepTracer:

         qual_name = self._get_qual_name(frame)
         instance_name = None
+        class_name = None
         if "self" in frame.f_locals:
             instance = frame.f_locals["self"]
             class_name = instance.__class__.__name__
@@ -754,6 +702,7 @@ class _DeepTracer:
                 parent_span_id=parent_span_id,
                 function=qual_name,
                 agent_name=instance_name,
+                class_name=class_name,
             )
             current_trace.add_span(span)

@@ -841,6 +790,10 @@ class _DeepTracer:
         self._original_threading_trace = None


+T = TypeVar("T", bound=Callable[..., Any])
+P = ParamSpec("P")
+
+
 class Tracer:
     # Tracer.current_trace class variable is currently used in wrap()
     # TODO: Keep track of cross-context state for current trace and current span ID solely through class variables instead of instance variables?
@@ -954,6 +907,15 @@ class Tracer:
             else:
                 self.otel_span_processor = SpanProcessorBase()

+            # Initialize local evaluation queue for custom scorers
+            self.local_eval_queue = LocalEvaluationQueue()
+
+            # Start workers with callback to log results only if monitoring is enabled
+            if enable_evaluations and enable_monitoring:
+                self.local_eval_queue.start_workers(
+                    callback=self._log_eval_results_callback
+                )
+
             atexit.register(self._cleanup_on_exit)
         except Exception as e:
             judgeval_logger.error(
@@ -1089,10 +1051,10 @@ class Tracer:
         # Reset the context variable
         self.reset_current_trace(token)

-    def identify(
+    def agent(
         self,
-        identifier: str,
-        track_state: bool = False,
+        identifier: Optional[str] = None,
+        track_state: Optional[bool] = False,
         track_attributes: Optional[List[str]] = None,
         field_mappings: Optional[Dict[str, str]] = None,
     ):
@@ -1130,11 +1092,18 @@ class Tracer:
                 "track_state": track_state,
                 "track_attributes": track_attributes,
                 "field_mappings": field_mappings or {},
+                "class_name": class_name,
             }
             return cls

         return decorator

+    def identify(self, *args, **kwargs):
+        judgeval_logger.warning(
+            "identify() is deprecated and may not be supported in future versions of judgeval. Use the agent() decorator instead."
+        )
+        return self.agent(*args, **kwargs)
+
     def _capture_instance_state(
         self, instance: Any, class_config: Dict[str, Any]
     ) -> Dict[str, Any]:
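
Note: identify() survives only as a deprecated shim that logs a warning and forwards to agent(), so existing decorators keep working. An illustrative migration, assuming judgment is a Tracer instance:

    # Before (0.4.0)
    @judgment.identify(identifier="name", track_state=True)
    class MyAgent: ...

    # After (0.6.0): same keyword arguments, new decorator name
    @judgment.agent(identifier="name", track_state=True)
    class MyAgent: ...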
@@ -1189,11 +1158,24 @@ class Tracer:
             else:
                 trace_client_instance.record_state_after(state)

+    @overload
+    def observe(
+        self, func: T, *, name: Optional[str] = None, span_type: SpanType = "span"
+    ) -> T: ...
+
+    @overload
+    def observe(
+        self,
+        *,
+        name: Optional[str] = None,
+        span_type: SpanType = "span",
+    ) -> Callable[[T], T]: ...
+
     def observe(
         self,
-        func=None,
+        func: Optional[T] = None,
         *,
-        name=None,
+        name: Optional[str] = None,
         span_type: SpanType = "span",
     ):
         """
@@ -1210,8 +1192,8 @@ class Tracer:
             return func if func else lambda f: f

         if func is None:
-            return lambda f: self.observe(
-                f,
+            return lambda func: self.observe(
+                func,
                 name=name,
                 span_type=span_type,
             )
@@ -1220,131 +1202,262 @@ class Tracer:
             original_span_name = name or func.__name__

             # Store custom attributes on the function object
-            func._judgment_span_name = original_span_name
-            func._judgment_span_type = span_type
+            func._judgment_span_name = original_span_name  # type: ignore
+            func._judgment_span_type = span_type  # type: ignore

         except Exception:
             return func

-        if asyncio.iscoroutinefunction(func):
+        def _record_span_data(span, args, kwargs):
+            """Helper function to record inputs, agent info, and state on a span."""
+            # Get class and agent info
+            class_name = None
+            agent_name = None
+            if args and hasattr(args[0], "__class__"):
+                class_name = args[0].__class__.__name__
+                agent_name = get_instance_prefixed_name(
+                    args[0], class_name, self.class_identifiers
+                )

-            @functools.wraps(func)
-            async def async_wrapper(*args, **kwargs):
-                nonlocal original_span_name
-                class_name = None
-                span_name = original_span_name
-                agent_name = None
+            # Record inputs, agent name, class name
+            inputs = combine_args_kwargs(func, args, kwargs)
+            span.record_input(inputs)
+            if agent_name:
+                span.record_agent_name(agent_name)
+            if class_name and class_name in self.class_identifiers:
+                span.record_class_name(class_name)
+
+            # Capture state before execution
+            self._conditionally_capture_and_record_state(span, args, is_before=True)
+
+            return class_name, agent_name

-                if args and hasattr(args[0], "__class__"):
-                    class_name = args[0].__class__.__name__
-                    agent_name = get_instance_prefixed_name(
-                        args[0], class_name, self.class_identifiers
+        def _finalize_span_data(span, result, args):
+            """Helper function to record outputs and final state on a span."""
+            # Record output
+            span.record_output(result)
+
+            # Capture state after execution
+            self._conditionally_capture_and_record_state(span, args, is_before=False)
+
+        def _cleanup_trace(current_trace, trace_token, wrapper_type="function"):
+            """Helper function to handle trace cleanup in finally blocks."""
+            try:
+                trace_id, server_response = current_trace.save(final_save=True)
+
+                complete_trace_data = {
+                    "trace_id": current_trace.trace_id,
+                    "name": current_trace.name,
+                    "project_name": current_trace.project_name,
+                    "created_at": datetime.fromtimestamp(
+                        current_trace.start_time or time.time(),
+                        timezone.utc,
+                    ).isoformat(),
+                    "duration": current_trace.get_duration(),
+                    "trace_spans": [
+                        span.model_dump() for span in current_trace.trace_spans
+                    ],
+                    "evaluation_runs": [
+                        run.model_dump() for run in current_trace.evaluation_runs
+                    ],
+                    "offline_mode": self.offline_mode,
+                    "parent_trace_id": current_trace.parent_trace_id,
+                    "parent_name": current_trace.parent_name,
+                    "customer_id": current_trace.customer_id,
+                    "tags": current_trace.tags,
+                    "metadata": current_trace.metadata,
+                    "update_id": current_trace.update_id,
+                }
+                self.traces.append(complete_trace_data)
+                self.reset_current_trace(trace_token)
+            except Exception as e:
+                judgeval_logger.warning(f"Issue with {wrapper_type} cleanup: {e}")
+
+        def _execute_in_span(
+            current_trace, span_name, span_type, execution_func, args, kwargs
+        ):
+            """Helper function to execute code within a span context."""
+            with current_trace.span(span_name, span_type=span_type) as span:
+                _record_span_data(span, args, kwargs)
+
+                try:
+                    result = execution_func()
+                    _finalize_span_data(span, result, args)
+                    return result
+                except Exception as e:
+                    _capture_exception_for_trace(current_trace, sys.exc_info())
+                    raise e
+
+        async def _execute_in_span_async(
+            current_trace, span_name, span_type, async_execution_func, args, kwargs
+        ):
+            """Helper function to execute async code within a span context."""
+            with current_trace.span(span_name, span_type=span_type) as span:
+                _record_span_data(span, args, kwargs)
+
+                try:
+                    result = await async_execution_func()
+                    _finalize_span_data(span, result, args)
+                    return result
+                except Exception as e:
+                    _capture_exception_for_trace(current_trace, sys.exc_info())
+                    raise e
+
+        def _create_new_trace(self, span_name):
+            """Helper function to create a new trace and set it as current."""
+            trace_id = str(uuid.uuid4())
+            project = self.project_name
+
+            current_trace = TraceClient(
+                self,
+                trace_id,
+                span_name,
+                project_name=project,
+                enable_monitoring=self.enable_monitoring,
+                enable_evaluations=self.enable_evaluations,
+            )
+
+            trace_token = self.set_current_trace(current_trace)
+            return current_trace, trace_token
+
+        def _execute_with_auto_trace_creation(
+            span_name, span_type, execution_func, args, kwargs
+        ):
+            """Helper function that handles automatic trace creation and span execution."""
+            current_trace = self.get_current_trace()
+
+            if not current_trace:
+                current_trace, trace_token = _create_new_trace(self, span_name)
+
+                try:
+                    result = _execute_in_span(
+                        current_trace,
+                        span_name,
+                        span_type,
+                        execution_func,
+                        args,
+                        kwargs,
                     )
+                    return result
+                finally:
+                    # Cleanup the trace we created
+                    _cleanup_trace(current_trace, trace_token, "auto_trace")
+            else:
+                # Use existing trace
+                return _execute_in_span(
+                    current_trace, span_name, span_type, execution_func, args, kwargs
+                )

-                current_trace = self.get_current_trace()
+        async def _execute_with_auto_trace_creation_async(
+            span_name, span_type, async_execution_func, args, kwargs
+        ):
+            """Helper function that handles automatic trace creation and async span execution."""
+            current_trace = self.get_current_trace()

-                if not current_trace:
-                    trace_id = str(uuid.uuid4())
-                    project = self.project_name
+            if not current_trace:
+                current_trace, trace_token = _create_new_trace(self, span_name)

-                    current_trace = TraceClient(
-                        self,
-                        trace_id,
+                try:
+                    result = await _execute_in_span_async(
+                        current_trace,
                         span_name,
-                        project_name=project,
-                        enable_monitoring=self.enable_monitoring,
-                        enable_evaluations=self.enable_evaluations,
+                        span_type,
+                        async_execution_func,
+                        args,
+                        kwargs,
                     )
+                    return result
+                finally:
+                    # Cleanup the trace we created
+                    _cleanup_trace(current_trace, trace_token, "async_auto_trace")
+            else:
+                # Use existing trace
+                return await _execute_in_span_async(
+                    current_trace,
+                    span_name,
+                    span_type,
+                    async_execution_func,
+                    args,
+                    kwargs,
+                )

-                    trace_token = self.set_current_trace(current_trace)
+        # Check for generator functions first
+        if inspect.isgeneratorfunction(func):

-                    try:
-                        with current_trace.span(span_name, span_type=span_type) as span:
-                            inputs = combine_args_kwargs(func, args, kwargs)
-                            span.record_input(inputs)
-                            if agent_name:
-                                span.record_agent_name(agent_name)
-
-                            self._conditionally_capture_and_record_state(
-                                span, args, is_before=True
-                            )
+            @functools.wraps(func)
+            def generator_wrapper(*args, **kwargs):
+                # Get the generator from the original function
+                generator = func(*args, **kwargs)

-                            try:
-                                if self.deep_tracing:
-                                    with _DeepTracer(self):
-                                        result = await func(*args, **kwargs)
-                                else:
-                                    result = await func(*args, **kwargs)
-                            except Exception as e:
-                                _capture_exception_for_trace(
-                                    current_trace, sys.exc_info()
-                                )
-                                raise e
-
-                            self._conditionally_capture_and_record_state(
-                                span, args, is_before=False
+                # Create wrapper generator that creates spans for each yield
+                def traced_generator():
+                    while True:
+                        try:
+                            # Handle automatic trace creation and span execution
+                            item = _execute_with_auto_trace_creation(
+                                original_span_name,
+                                span_type,
+                                lambda: next(generator),
+                                args,
+                                kwargs,
                             )
+                            yield item
+                        except StopIteration:
+                            break
+
+                return traced_generator()
+
+            return generator_wrapper
+
+        # Check for async generator functions
+        elif inspect.isasyncgenfunction(func):

-                            span.record_output(result)
-                            return result
-                    finally:
+            @functools.wraps(func)
+            def async_generator_wrapper(*args, **kwargs):
+                # Get the async generator from the original function
+                async_generator = func(*args, **kwargs)
+
+                # Create wrapper async generator that creates spans for each yield
+                async def traced_async_generator():
+                    while True:
                         try:
-                            complete_trace_data = {
-                                "trace_id": current_trace.trace_id,
-                                "name": current_trace.name,
-                                "created_at": datetime.fromtimestamp(
-                                    current_trace.start_time or time.time(),
-                                    timezone.utc,
-                                ).isoformat(),
-                                "duration": current_trace.get_duration(),
-                                "trace_spans": [
-                                    span.model_dump()
-                                    for span in current_trace.trace_spans
-                                ],
-                                "offline_mode": self.offline_mode,
-                                "parent_trace_id": current_trace.parent_trace_id,
-                                "parent_name": current_trace.parent_name,
-                            }
-
-                            trace_id, server_response = current_trace.save(
-                                final_save=True
+                            # Handle automatic trace creation and span execution
+                            item = await _execute_with_auto_trace_creation_async(
+                                original_span_name,
+                                span_type,
+                                lambda: async_generator.__anext__(),
+                                args,
+                                kwargs,
                             )
+                            if inspect.iscoroutine(item):
+                                item = await item
+                            yield item
+                        except StopAsyncIteration:
+                            break

-                            self.traces.append(complete_trace_data)
+                return traced_async_generator()

-                            self.reset_current_trace(trace_token)
-                        except Exception as e:
-                            judgeval_logger.warning(f"Issue with async_wrapper: {e}")
-                            pass
-                else:
-                    with current_trace.span(span_name, span_type=span_type) as span:
-                        inputs = combine_args_kwargs(func, args, kwargs)
-                        span.record_input(inputs)
-                        if agent_name:
-                            span.record_agent_name(agent_name)
-
-                        # Capture state before execution
-                        self._conditionally_capture_and_record_state(
-                            span, args, is_before=True
-                        )
+            return async_generator_wrapper

-                        try:
-                            if self.deep_tracing:
-                                with _DeepTracer(self):
-                                    result = await func(*args, **kwargs)
-                            else:
-                                result = await func(*args, **kwargs)
-                        except Exception as e:
-                            _capture_exception_for_trace(current_trace, sys.exc_info())
-                            raise e
-
-                        # Capture state after execution
-                        self._conditionally_capture_and_record_state(
-                            span, args, is_before=False
-                        )
+        elif asyncio.iscoroutinefunction(func):

-                        span.record_output(result)
-                        return result
+            @functools.wraps(func)
+            async def async_wrapper(*args, **kwargs):
+                nonlocal original_span_name
+                span_name = original_span_name
+
+                async def async_execution():
+                    if self.deep_tracing:
+                        with _DeepTracer(self):
+                            return await func(*args, **kwargs)
+                    else:
+                        return await func(*args, **kwargs)
+
+                result = await _execute_with_auto_trace_creation_async(
+                    span_name, span_type, async_execution, args, kwargs
+                )
+
+                return result

             return async_wrapper
         else:
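
Note: besides extracting the shared helpers, observe now special-cases generator and async generator functions: each next()/__anext__() call runs through _execute_with_auto_trace_creation, so every yielded item gets its own span (with a fresh root trace created and cleaned up if none is active). A hedged usage sketch:

    @tracer.observe(name="token-stream")
    def stream_tokens():
        yield "hello"
        yield "world"

    # Each iteration executes inside its own span; StopIteration ends tracing.
    for token in stream_tokens():
        print(token)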
@@ -1352,122 +1465,18 @@ class Tracer:
             @functools.wraps(func)
             def wrapper(*args, **kwargs):
                 nonlocal original_span_name
-                class_name = None
                 span_name = original_span_name
-                agent_name = None
-                if args and hasattr(args[0], "__class__"):
-                    class_name = args[0].__class__.__name__
-                    agent_name = get_instance_prefixed_name(
-                        args[0], class_name, self.class_identifiers
-                    )
-                # Get current trace from context
-                current_trace = self.get_current_trace()
-
-                # If there's no current trace, create a root trace
-                if not current_trace:
-                    trace_id = str(uuid.uuid4())
-                    project = self.project_name
-
-                    # Create a new trace client to serve as the root
-                    current_trace = TraceClient(
-                        self,
-                        trace_id,
-                        span_name,
-                        project_name=project,
-                        enable_monitoring=self.enable_monitoring,
-                        enable_evaluations=self.enable_evaluations,
-                    )
-
-                    trace_token = self.set_current_trace(current_trace)
-
-                    try:
-                        with current_trace.span(span_name, span_type=span_type) as span:
-                            # Record inputs
-                            inputs = combine_args_kwargs(func, args, kwargs)
-                            span.record_input(inputs)
-                            if agent_name:
-                                span.record_agent_name(agent_name)
-                            # Capture state before execution
-                            self._conditionally_capture_and_record_state(
-                                span, args, is_before=True
-                            )
-
-                            try:
-                                if self.deep_tracing:
-                                    with _DeepTracer(self):
-                                        result = func(*args, **kwargs)
-                                else:
-                                    result = func(*args, **kwargs)
-                            except Exception as e:
-                                _capture_exception_for_trace(
-                                    current_trace, sys.exc_info()
-                                )
-                                raise e
-
-                            # Capture state after execution
-                            self._conditionally_capture_and_record_state(
-                                span, args, is_before=False
-                            )
-
-                            # Record output
-                            span.record_output(result)
-                            return result
-                    finally:
-                        try:
-                            trace_id, server_response = current_trace.save(
-                                final_save=True
-                            )

-                            complete_trace_data = {
-                                "trace_id": current_trace.trace_id,
-                                "name": current_trace.name,
-                                "created_at": datetime.fromtimestamp(
-                                    current_trace.start_time or time.time(),
-                                    timezone.utc,
-                                ).isoformat(),
-                                "duration": current_trace.get_duration(),
-                                "trace_spans": [
-                                    span.model_dump()
-                                    for span in current_trace.trace_spans
-                                ],
-                                "offline_mode": self.offline_mode,
-                                "parent_trace_id": current_trace.parent_trace_id,
-                                "parent_name": current_trace.parent_name,
-                            }
-                            self.traces.append(complete_trace_data)
-                            self.reset_current_trace(trace_token)
-                        except Exception as e:
-                            judgeval_logger.warning(f"Issue with save: {e}")
-                            pass
-                else:
-                    with current_trace.span(span_name, span_type=span_type) as span:
-                        inputs = combine_args_kwargs(func, args, kwargs)
-                        span.record_input(inputs)
-                        if agent_name:
-                            span.record_agent_name(agent_name)
-
-                        # Capture state before execution
-                        self._conditionally_capture_and_record_state(
-                            span, args, is_before=True
-                        )
+                def sync_execution():
+                    if self.deep_tracing:
+                        with _DeepTracer(self):
+                            return func(*args, **kwargs)
+                    else:
+                        return func(*args, **kwargs)

-                        try:
-                            if self.deep_tracing:
-                                with _DeepTracer(self):
-                                    result = func(*args, **kwargs)
-                            else:
-                                result = func(*args, **kwargs)
-                        except Exception as e:
-                            _capture_exception_for_trace(current_trace, sys.exc_info())
-                            raise e
-
-                        # Capture state after execution
-                        self._conditionally_capture_and_record_state(
-                            span, args, is_before=False
-                        )
-
-                        span.record_output(result)
-                        return result
+                return _execute_with_auto_trace_creation(
+                    span_name, span_type, sync_execution, args, kwargs
+                )

             return wrapper

@@ -1532,15 +1541,51 @@ class Tracer:

         return decorate_class if cls is None else decorate_class(cls)

-    def async_evaluate(self, *args, **kwargs):
+    def async_evaluate(
+        self,
+        scorer: Union[APIScorerConfig, BaseScorer],
+        example: Example,
+        model: str = DEFAULT_GPT_MODEL,
+        sampling_rate: float = 1,
+    ):
         try:
             if not self.enable_monitoring or not self.enable_evaluations:
                 return

-            current_trace = self.get_current_trace()
+            if not isinstance(scorer, (APIScorerConfig, BaseScorer)):
+                judgeval_logger.warning(
+                    f"Scorer must be an instance of APIScorerConfig or BaseScorer, got {type(scorer)}, skipping evaluation"
+                )
+                return
+
+            if not isinstance(example, Example):
+                judgeval_logger.warning(
+                    f"Example must be an instance of Example, got {type(example)} skipping evaluation"
+                )
+                return
+
+            if sampling_rate < 0:
+                judgeval_logger.warning(
+                    "Cannot set sampling_rate below 0, skipping evaluation"
+                )
+                return
+
+            if sampling_rate > 1:
+                judgeval_logger.warning(
+                    "Cannot set sampling_rate above 1, skipping evaluation"
+                )
+                return
+
+            percentage = random.uniform(0, 1)
+            if percentage > sampling_rate:
+                judgeval_logger.info("Skipping async_evaluate due to sampling rate")
+                return

+            current_trace = self.get_current_trace()
             if current_trace:
-                current_trace.async_evaluate(*args, **kwargs)
+                current_trace.async_evaluate(
+                    scorer=scorer, example=example, model=model
+                )
             else:
                 judgeval_logger.warning(
                     "No trace found (context var or fallback), skipping evaluation"
1613
1658
  self.otel_span_processor.shutdown()
1614
1659
  self.otel_span_processor = SpanProcessorBase()
1615
1660
 
1661
+ def wait_for_completion(self, timeout: Optional[float] = 30.0) -> bool:
1662
+ """Wait for all evaluations and span processing to complete.
1663
+
1664
+ This method blocks until all queued evaluations are processed and
1665
+ all pending spans are flushed to the server.
1666
+
1667
+ Args:
1668
+ timeout: Maximum time to wait in seconds. Defaults to 30 seconds.
1669
+ None means wait indefinitely.
1670
+
1671
+ Returns:
1672
+ True if all processing completed within the timeout, False otherwise.
1673
+
1674
+ """
1675
+ try:
1676
+ judgeval_logger.debug(
1677
+ "Waiting for all evaluations and spans to complete..."
1678
+ )
1679
+
1680
+ # Wait for all queued evaluation work to complete
1681
+ eval_completed = self.local_eval_queue.wait_for_completion()
1682
+ if not eval_completed:
1683
+ judgeval_logger.warning(
1684
+ f"Local evaluation queue did not complete within {timeout} seconds"
1685
+ )
1686
+ return False
1687
+
1688
+ self.flush_background_spans()
1689
+
1690
+ judgeval_logger.debug("All evaluations and spans completed successfully")
1691
+ return True
1692
+
1693
+ except Exception as e:
1694
+ judgeval_logger.warning(f"Error while waiting for completion: {e}")
1695
+ return False
1696
+
1697
+ def _log_eval_results_callback(self, evaluation_run, scoring_results):
1698
+ """Callback to log evaluation results after local processing."""
1699
+ try:
1700
+ if scoring_results and self.enable_evaluations and self.enable_monitoring:
1701
+ # Convert scoring results to the format expected by API client
1702
+ results_dict = [
1703
+ result.model_dump(warnings=False) for result in scoring_results
1704
+ ]
1705
+ api_client = JudgmentApiClient(self.api_key, self.organization_id)
1706
+ api_client.log_evaluation_results(
1707
+ results_dict, evaluation_run.model_dump(warnings=False)
1708
+ )
1709
+ except Exception as e:
1710
+ judgeval_logger.warning(f"Failed to log local evaluation results: {e}")
1711
+
1616
1712
  def _cleanup_on_exit(self):
1617
1713
  """Cleanup handler called on application exit to ensure spans are flushed."""
1618
1714
  try:
1715
+ # Wait for all queued evaluation work to complete before stopping
1716
+ completed = self.local_eval_queue.wait_for_completion()
1717
+ if not completed:
1718
+ judgeval_logger.warning(
1719
+ "Local evaluation queue did not complete within 30 seconds"
1720
+ )
1721
+
1722
+ self.local_eval_queue.stop_workers()
1619
1723
  self.flush_background_spans()
1620
1724
  except Exception as e:
1621
1725
  judgeval_logger.warning(f"Error during tracer cleanup: {e}")
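
Note: because custom scorers now run on background worker threads, short-lived scripts could exit before results are logged; wait_for_completion() blocks on the local evaluation queue and then flushes pending spans, and the atexit hook does the same. A hedged usage sketch:

    # At the end of a batch job (judgment is a Tracer instance)
    finished = judgment.wait_for_completion(timeout=30.0)
    if not finished:
        print("Some evaluations or spans did not complete before the timeout")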
@@ -1697,33 +1801,76 @@ def wrap(

         return wrapper

-    if isinstance(client, (OpenAI)):
-        setattr(client.chat.completions, "create", wrapped(original_create))
-        setattr(client.responses, "create", wrapped(original_responses_create))
-        setattr(client.beta.chat.completions, "parse", wrapped(original_beta_parse))
-    elif isinstance(client, (AsyncOpenAI)):
-        setattr(client.chat.completions, "create", wrapped_async(original_create))
-        setattr(client.responses, "create", wrapped_async(original_responses_create))
-        setattr(
-            client.beta.chat.completions, "parse", wrapped_async(original_beta_parse)
+    if HAS_OPENAI:
+        from judgeval.common.tracer.providers import openai_OpenAI, openai_AsyncOpenAI
+
+        assert openai_OpenAI is not None, "OpenAI client not found"
+        assert openai_AsyncOpenAI is not None, "OpenAI async client not found"
+        if isinstance(client, (openai_OpenAI)):
+            setattr(client.chat.completions, "create", wrapped(original_create))
+            setattr(client.responses, "create", wrapped(original_responses_create))
+            setattr(client.beta.chat.completions, "parse", wrapped(original_beta_parse))
+        elif isinstance(client, (openai_AsyncOpenAI)):
+            setattr(client.chat.completions, "create", wrapped_async(original_create))
+            setattr(
+                client.responses, "create", wrapped_async(original_responses_create)
+            )
+            setattr(
+                client.beta.chat.completions,
+                "parse",
+                wrapped_async(original_beta_parse),
+            )
+
+    if HAS_TOGETHER:
+        from judgeval.common.tracer.providers import (
+            together_Together,
+            together_AsyncTogether,
+        )
+
+        assert together_Together is not None, "Together client not found"
+        assert together_AsyncTogether is not None, "Together async client not found"
+        if isinstance(client, (together_Together)):
+            setattr(client.chat.completions, "create", wrapped(original_create))
+        elif isinstance(client, (together_AsyncTogether)):
+            setattr(client.chat.completions, "create", wrapped_async(original_create))
+
+    if HAS_ANTHROPIC:
+        from judgeval.common.tracer.providers import (
+            anthropic_Anthropic,
+            anthropic_AsyncAnthropic,
+        )
+
+        assert anthropic_Anthropic is not None, "Anthropic client not found"
+        assert anthropic_AsyncAnthropic is not None, "Anthropic async client not found"
+        if isinstance(client, (anthropic_Anthropic)):
+            setattr(client.messages, "create", wrapped(original_create))
+        elif isinstance(client, (anthropic_AsyncAnthropic)):
+            setattr(client.messages, "create", wrapped_async(original_create))
+
+    if HAS_GOOGLE_GENAI:
+        from judgeval.common.tracer.providers import (
+            google_genai_Client,
+            google_genai_AsyncClient,
         )
-    elif isinstance(client, (Together)):
-        setattr(client.chat.completions, "create", wrapped(original_create))
-    elif isinstance(client, (AsyncTogether)):
-        setattr(client.chat.completions, "create", wrapped_async(original_create))
-    elif isinstance(client, (Anthropic)):
-        setattr(client.messages, "create", wrapped(original_create))
-    elif isinstance(client, (AsyncAnthropic)):
-        setattr(client.messages, "create", wrapped_async(original_create))
-    elif isinstance(client, (genai.Client)):
-        setattr(client.models, "generate_content", wrapped(original_create))
-    elif isinstance(client, (genai.client.AsyncClient)):
-        setattr(client.models, "generate_content", wrapped_async(original_create))
-    elif isinstance(client, (Groq)):
-        setattr(client.chat.completions, "create", wrapped(original_create))
-    elif isinstance(client, (AsyncGroq)):
-        setattr(client.chat.completions, "create", wrapped_async(original_create))

+        assert google_genai_Client is not None, "Google GenAI client not found"
+        assert google_genai_AsyncClient is not None, (
+            "Google GenAI async client not found"
+        )
+        if isinstance(client, (google_genai_Client)):
+            setattr(client.models, "generate_content", wrapped(original_create))
+        elif isinstance(client, (google_genai_AsyncClient)):
+            setattr(client.models, "generate_content", wrapped_async(original_create))
+
+    if HAS_GROQ:
+        from judgeval.common.tracer.providers import groq_Groq, groq_AsyncGroq
+
+        assert groq_Groq is not None, "Groq client not found"
+        assert groq_AsyncGroq is not None, "Groq async client not found"
+        if isinstance(client, (groq_Groq)):
+            setattr(client.chat.completions, "create", wrapped(original_create))
+        elif isinstance(client, (groq_AsyncGroq)):
+            setattr(client.chat.completions, "create", wrapped_async(original_create))
     return client


@@ -1749,28 +1896,87 @@ def _get_client_config(
     Raises:
         ValueError: If client type is not supported
     """
-    if isinstance(client, (OpenAI, AsyncOpenAI)):
-        return (
-            "OPENAI_API_CALL",
-            client.chat.completions.create,
-            client.responses.create,
-            None,
-            client.beta.chat.completions.parse,
+
+    if HAS_OPENAI:
+        from judgeval.common.tracer.providers import openai_OpenAI, openai_AsyncOpenAI
+
+        assert openai_OpenAI is not None, "OpenAI client not found"
+        assert openai_AsyncOpenAI is not None, "OpenAI async client not found"
+        if isinstance(client, (openai_OpenAI)):
+            return (
+                "OPENAI_API_CALL",
+                client.chat.completions.create,
+                client.responses.create,
+                None,
+                client.beta.chat.completions.parse,
+            )
+        elif isinstance(client, (openai_AsyncOpenAI)):
+            return (
+                "OPENAI_API_CALL",
+                client.chat.completions.create,
+                client.responses.create,
+                None,
+                client.beta.chat.completions.parse,
+            )
+    if HAS_TOGETHER:
+        from judgeval.common.tracer.providers import (
+            together_Together,
+            together_AsyncTogether,
         )
-    elif isinstance(client, (Groq, AsyncGroq)):
-        return "GROQ_API_CALL", client.chat.completions.create, None, None, None
-    elif isinstance(client, (Together, AsyncTogether)):
-        return "TOGETHER_API_CALL", client.chat.completions.create, None, None, None
-    elif isinstance(client, (Anthropic, AsyncAnthropic)):
-        return (
-            "ANTHROPIC_API_CALL",
-            client.messages.create,
-            None,
-            client.messages.stream,
-            None,
+
+        assert together_Together is not None, "Together client not found"
+        assert together_AsyncTogether is not None, "Together async client not found"
+        if isinstance(client, (together_Together)):
+            return "TOGETHER_API_CALL", client.chat.completions.create, None, None, None
+        elif isinstance(client, (together_AsyncTogether)):
+            return "TOGETHER_API_CALL", client.chat.completions.create, None, None, None
+    if HAS_ANTHROPIC:
+        from judgeval.common.tracer.providers import (
+            anthropic_Anthropic,
+            anthropic_AsyncAnthropic,
+        )
+
+        assert anthropic_Anthropic is not None, "Anthropic client not found"
+        assert anthropic_AsyncAnthropic is not None, "Anthropic async client not found"
+        if isinstance(client, (anthropic_Anthropic)):
+            return (
+                "ANTHROPIC_API_CALL",
+                client.messages.create,
+                None,
+                client.messages.stream,
+                None,
+            )
+        elif isinstance(client, (anthropic_AsyncAnthropic)):
+            return (
+                "ANTHROPIC_API_CALL",
+                client.messages.create,
+                None,
+                client.messages.stream,
+                None,
+            )
+    if HAS_GOOGLE_GENAI:
+        from judgeval.common.tracer.providers import (
+            google_genai_Client,
+            google_genai_AsyncClient,
         )
-    elif isinstance(client, (genai.Client, genai.client.AsyncClient)):
-        return "GOOGLE_API_CALL", client.models.generate_content, None, None, None
+
+        assert google_genai_Client is not None, "Google GenAI client not found"
+        assert google_genai_AsyncClient is not None, (
+            "Google GenAI async client not found"
+        )
+        if isinstance(client, (google_genai_Client)):
+            return "GOOGLE_API_CALL", client.models.generate_content, None, None, None
+        elif isinstance(client, (google_genai_AsyncClient)):
+            return "GOOGLE_API_CALL", client.models.generate_content, None, None, None
+    if HAS_GROQ:
+        from judgeval.common.tracer.providers import groq_Groq, groq_AsyncGroq
+
+        assert groq_Groq is not None, "Groq client not found"
+        assert groq_AsyncGroq is not None, "Groq async client not found"
+        if isinstance(client, (groq_Groq)):
+            return "GROQ_API_CALL", client.chat.completions.create, None, None, None
+        elif isinstance(client, (groq_AsyncGroq)):
+            return "GROQ_API_CALL", client.chat.completions.create, None, None, None
     raise ValueError(f"Unsupported client type: {type(client)}")


@@ -1794,73 +2000,173 @@ def _format_output_data(
     model_name = None
     message_content = None

-    if isinstance(client, (OpenAI, AsyncOpenAI)):
-        if isinstance(response, ChatCompletion):
-            model_name = response.model
-            prompt_tokens = response.usage.prompt_tokens if response.usage else 0
-            completion_tokens = (
-                response.usage.completion_tokens if response.usage else 0
+    if HAS_OPENAI:
+        from judgeval.common.tracer.providers import (
+            openai_OpenAI,
+            openai_AsyncOpenAI,
+            openai_ChatCompletion,
+            openai_Response,
+            openai_ParsedChatCompletion,
+        )
+
+        assert openai_OpenAI is not None, "OpenAI client not found"
+        assert openai_AsyncOpenAI is not None, "OpenAI async client not found"
+        assert openai_ChatCompletion is not None, "OpenAI chat completion not found"
+        assert openai_Response is not None, "OpenAI response not found"
+        assert openai_ParsedChatCompletion is not None, (
+            "OpenAI parsed chat completion not found"
+        )
+
+        if isinstance(client, (openai_OpenAI, openai_AsyncOpenAI)):
+            if isinstance(response, openai_ChatCompletion):
+                model_name = response.model
+                prompt_tokens = response.usage.prompt_tokens if response.usage else 0
+                completion_tokens = (
+                    response.usage.completion_tokens if response.usage else 0
+                )
+                cache_read_input_tokens = (
+                    response.usage.prompt_tokens_details.cached_tokens
+                    if response.usage
+                    and response.usage.prompt_tokens_details
+                    and response.usage.prompt_tokens_details.cached_tokens
+                    else 0
+                )
+
+                if isinstance(response, openai_ParsedChatCompletion):
+                    message_content = response.choices[0].message.parsed
+                else:
+                    message_content = response.choices[0].message.content
+            elif isinstance(response, openai_Response):
+                model_name = response.model
+                prompt_tokens = response.usage.input_tokens if response.usage else 0
+                completion_tokens = (
+                    response.usage.output_tokens if response.usage else 0
+                )
+                cache_read_input_tokens = (
+                    response.usage.input_tokens_details.cached_tokens
+                    if response.usage and response.usage.input_tokens_details
+                    else 0
+                )
+                if hasattr(response.output[0], "content"):
+                    message_content = "".join(
+                        seg.text
+                        for seg in response.output[0].content
+                        if hasattr(seg, "text")
+                    )
+            # Note: LiteLLM seems to use cache_read_input_tokens to calculate the cost for OpenAI
+            return message_content, _create_usage(
+                model_name,
+                prompt_tokens,
+                completion_tokens,
+                cache_read_input_tokens,
+                cache_creation_input_tokens,
             )
-            cache_read_input_tokens = (
-                response.usage.prompt_tokens_details.cached_tokens
-                if response.usage
-                and response.usage.prompt_tokens_details
-                and response.usage.prompt_tokens_details.cached_tokens
-                else 0
+
+    if HAS_TOGETHER:
+        from judgeval.common.tracer.providers import (
+            together_Together,
+            together_AsyncTogether,
+        )
+
+        assert together_Together is not None, "Together client not found"
+        assert together_AsyncTogether is not None, "Together async client not found"
+
+        if isinstance(client, (together_Together, together_AsyncTogether)):
+            model_name = "together_ai/" + response.model
+            prompt_tokens = response.usage.prompt_tokens
+            completion_tokens = response.usage.completion_tokens
+            message_content = response.choices[0].message.content
+
+            # As of 2025-07-14, Together does not do any input cache token tracking
+            return message_content, _create_usage(
+                model_name,
+                prompt_tokens,
+                completion_tokens,
+                cache_read_input_tokens,
+                cache_creation_input_tokens,
             )

-        if isinstance(response, ParsedChatCompletion):
-            message_content = response.choices[0].message.parsed
-        else:
-            message_content = response.choices[0].message.content
-    elif isinstance(response, Response):
+    if HAS_GOOGLE_GENAI:
+        from judgeval.common.tracer.providers import (
+            google_genai_Client,
+            google_genai_AsyncClient,
+        )
+
+        assert google_genai_Client is not None, "Google GenAI client not found"
+        assert google_genai_AsyncClient is not None, (
+            "Google GenAI async client not found"
+        )
+        if isinstance(client, (google_genai_Client, google_genai_AsyncClient)):
+            model_name = response.model_version
+            prompt_tokens = response.usage_metadata.prompt_token_count
+            completion_tokens = response.usage_metadata.candidates_token_count
+            message_content = response.candidates[0].content.parts[0].text
+
+            if hasattr(response.usage_metadata, "cached_content_token_count"):
+                cache_read_input_tokens = (
+                    response.usage_metadata.cached_content_token_count
+                )
+            return message_content, _create_usage(
+                model_name,
+                prompt_tokens,
+                completion_tokens,
+                cache_read_input_tokens,
+                cache_creation_input_tokens,
+            )
+
+    if HAS_ANTHROPIC:
+        from judgeval.common.tracer.providers import (
+            anthropic_Anthropic,
+            anthropic_AsyncAnthropic,
+        )
+
+        assert anthropic_Anthropic is not None, "Anthropic client not found"
+        assert anthropic_AsyncAnthropic is not None, "Anthropic async client not found"
+        if isinstance(client, (anthropic_Anthropic, anthropic_AsyncAnthropic)):
             model_name = response.model
-        prompt_tokens = response.usage.input_tokens if response.usage else 0
-        completion_tokens = response.usage.output_tokens if response.usage else 0
-        cache_read_input_tokens = (
-            response.usage.input_tokens_details.cached_tokens
-            if response.usage and response.usage.input_tokens_details
-            else 0
+            prompt_tokens = response.usage.input_tokens
+            completion_tokens = response.usage.output_tokens
+            cache_read_input_tokens = response.usage.cache_read_input_tokens
+            cache_creation_input_tokens = response.usage.cache_creation_input_tokens
+            message_content = response.content[0].text
+            return message_content, _create_usage(
+                model_name,
+                prompt_tokens,
+                completion_tokens,
+                cache_read_input_tokens,
+                cache_creation_input_tokens,
             )
-        if hasattr(response.output[0], "content"):
-            message_content = "".join(
-                seg.text
-                for seg in response.output[0].content
-                if hasattr(seg, "text")
-            )

-    # Note: LiteLLM seems to use cache_read_input_tokens to calculate the cost for OpenAI
-    elif isinstance(client, (Together, AsyncTogether)):
-        model_name = "together_ai/" + response.model
-        prompt_tokens = response.usage.prompt_tokens
-        completion_tokens = response.usage.completion_tokens
-        message_content = response.choices[0].message.content
-
-    # As of 2025-07-14, Together does not do any input cache token tracking
-    elif isinstance(client, (genai.Client, genai.client.AsyncClient)):
-        model_name = response.model_version
-        prompt_tokens = response.usage_metadata.prompt_token_count
-        completion_tokens = response.usage_metadata.candidates_token_count
-        message_content = response.candidates[0].content.parts[0].text
-
-        if hasattr(response.usage_metadata, "cached_content_token_count"):
-            cache_read_input_tokens = response.usage_metadata.cached_content_token_count
-    elif isinstance(client, (Anthropic, AsyncAnthropic)):
-        model_name = response.model
-        prompt_tokens = response.usage.input_tokens
-        completion_tokens = response.usage.output_tokens
-        cache_read_input_tokens = response.usage.cache_read_input_tokens
-        cache_creation_input_tokens = response.usage.cache_creation_input_tokens
-        message_content = response.content[0].text
-    elif isinstance(client, (Groq, AsyncGroq)):
-        model_name = "groq/" + response.model
-        prompt_tokens = response.usage.prompt_tokens
-        completion_tokens = response.usage.completion_tokens
-        message_content = response.choices[0].message.content
-    else:
-        judgeval_logger.warning(f"Unsupported client type: {type(client)}")
-        return None, None
+    if HAS_GROQ:
+        from judgeval.common.tracer.providers import groq_Groq, groq_AsyncGroq
+
+        assert groq_Groq is not None, "Groq client not found"
+        assert groq_AsyncGroq is not None, "Groq async client not found"
+        if isinstance(client, (groq_Groq, groq_AsyncGroq)):
+            model_name = "groq/" + response.model
+            prompt_tokens = response.usage.prompt_tokens
+            completion_tokens = response.usage.completion_tokens
+            message_content = response.choices[0].message.content
+            return message_content, _create_usage(
+                model_name,
+                prompt_tokens,
+                completion_tokens,
+                cache_read_input_tokens,
+                cache_creation_input_tokens,
+            )
+
+    judgeval_logger.warning(f"Unsupported client type: {type(client)}")
+    return None, None

+
+def _create_usage(
+    model_name: str,
+    prompt_tokens: int,
+    completion_tokens: int,
+    cache_read_input_tokens: int = 0,
+    cache_creation_input_tokens: int = 0,
+) -> TraceUsage:
+    """Helper function to create TraceUsage object with cost calculation."""
     prompt_cost, completion_cost = cost_per_token(
         model=model_name,
         prompt_tokens=prompt_tokens,
@@ -1871,7 +2177,7 @@ def _format_output_data(
     total_cost_usd = (
         (prompt_cost + completion_cost) if prompt_cost and completion_cost else None
     )
-    usage = TraceUsage(
+    return TraceUsage(
         prompt_tokens=prompt_tokens,
        completion_tokens=completion_tokens,
         total_tokens=prompt_tokens + completion_tokens,
@@ -1882,7 +2188,6 @@ def _format_output_data(
         total_cost_usd=total_cost_usd,
         model_name=model_name,
     )
-    return message_content, usage


 def combine_args_kwargs(func, args, kwargs):
@@ -1940,13 +2245,13 @@ def get_instance_prefixed_name(instance, class_name, class_identifiers):
     """
     if class_name in class_identifiers:
         class_config = class_identifiers[class_name]
-        attr = class_config["identifier"]
-
-        if hasattr(instance, attr):
-            instance_name = getattr(instance, attr)
-            return instance_name
-        else:
-            raise Exception(
-                f"Attribute {attr} does not exist for {class_name}. Check your identify() decorator."
-            )
-    return None
+        attr = class_config.get("identifier")
+        if attr:
+            if hasattr(instance, attr) and not callable(getattr(instance, attr)):
+                instance_name = getattr(instance, attr)
+                return instance_name
+            else:
+                raise Exception(
+                    f"Attribute {attr} does not exist for {class_name}. Check your agent() decorator."
+                )
+    return None