judgeval 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -26,11 +26,15 @@ from typing import (
     Generator,
     List,
     Optional,
+    ParamSpec,
     Tuple,
+    TypeVar,
     Union,
     TypeAlias,
+    overload,
 )
 import types
+import random
 
 
 from judgeval.common.tracer.constants import _TRACE_FILEPATH_BLOCKLIST
@@ -38,40 +42,33 @@ from judgeval.common.tracer.constants import _TRACE_FILEPATH_BLOCKLIST
 from judgeval.common.tracer.otel_span_processor import JudgmentSpanProcessor
 from judgeval.common.tracer.span_processor import SpanProcessorBase
 from judgeval.common.tracer.trace_manager import TraceManagerClient
-from litellm import cost_per_token as _original_cost_per_token
-from openai import OpenAI, AsyncOpenAI
-from openai.types.chat.chat_completion import ChatCompletion
-from openai.types.responses.response import Response
-from openai.types.chat import ParsedChatCompletion
-from together import Together, AsyncTogether
-from anthropic import Anthropic, AsyncAnthropic
-from google import genai
-from groq import Groq, AsyncGroq
 
 from judgeval.data import Example, Trace, TraceSpan, TraceUsage
 from judgeval.scorers import APIScorerConfig, BaseScorer
 from judgeval.evaluation_run import EvaluationRun
-from judgeval.common.utils import ExcInfo, validate_api_key
+from judgeval.local_eval_queue import LocalEvaluationQueue
+from judgeval.common.api import JudgmentApiClient
+from judgeval.common.utils import OptExcInfo, validate_api_key
 from judgeval.common.logger import judgeval_logger
 
+from litellm import cost_per_token as _original_cost_per_token  # type: ignore
+from judgeval.common.tracer.providers import (
+    HAS_OPENAI,
+    HAS_TOGETHER,
+    HAS_ANTHROPIC,
+    HAS_GOOGLE_GENAI,
+    HAS_GROQ,
+    ApiClient,
+)
+from judgeval.constants import DEFAULT_GPT_MODEL
+
 
 current_trace_var = contextvars.ContextVar[Optional["TraceClient"]](
     "current_trace", default=None
 )
 current_span_var = contextvars.ContextVar[Optional[str]]("current_span", default=None)
 
-ApiClient: TypeAlias = Union[
-    OpenAI,
-    Together,
-    Anthropic,
-    AsyncOpenAI,
-    AsyncAnthropic,
-    AsyncTogether,
-    genai.Client,
-    genai.client.AsyncClient,
-    Groq,
-    AsyncGroq,
-]
+
 SpanType: TypeAlias = str
 
 
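The hard provider imports above are replaced by the new judgeval.common.tracer.providers module, which gates each SDK behind a HAS_* flag so provider packages become optional dependencies. A hypothetical sketch of the guarded-import pattern such a module typically uses (the module's actual contents are not part of this diff):

    # Hypothetical sketch -- not the actual providers module source.
    try:
        from openai import OpenAI as openai_OpenAI, AsyncOpenAI as openai_AsyncOpenAI
        HAS_OPENAI = True
    except ImportError:
        openai_OpenAI = None
        openai_AsyncOpenAI = None
        HAS_OPENAI = False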
@@ -113,10 +110,6 @@ class TraceClient:
 
         self.otel_span_processor = tracer.otel_span_processor
 
-        judgeval_logger.info(
-            f"🎯 TraceClient using span processor for trace {self.trace_id}"
-        )
-
     def get_current_span(self):
         """Get the current span from the context var"""
         return self.tracer.get_current_span()
@@ -181,85 +174,53 @@ class TraceClient:
 
     def async_evaluate(
         self,
-        scorers: List[Union[APIScorerConfig, BaseScorer]],
-        example: Optional[Example] = None,
-        input: Optional[str] = None,
-        actual_output: Optional[Union[str, List[str]]] = None,
-        expected_output: Optional[Union[str, List[str]]] = None,
-        context: Optional[List[str]] = None,
-        retrieval_context: Optional[List[str]] = None,
-        tools_called: Optional[List[str]] = None,
-        expected_tools: Optional[List[str]] = None,
-        additional_metadata: Optional[Dict[str, Any]] = None,
-        model: Optional[str] = None,
-        span_id: Optional[str] = None,
+        scorer: Union[APIScorerConfig, BaseScorer],
+        example: Example,
+        model: str = DEFAULT_GPT_MODEL,
     ):
-        if not self.enable_evaluations:
-            return
-
         start_time = time.time()
+        span_id = self.get_current_span()
+        eval_run_name = (
+            f"{self.name.capitalize()}-{span_id}-{scorer.score_type.capitalize()}"
+        )
 
-        try:
-            if not scorers:
-                judgeval_logger.warning("No valid scorers available for evaluation")
-                return
-
-        except Exception as e:
-            judgeval_logger.warning(f"Failed to load scorers: {str(e)}")
-            return
+        if isinstance(scorer, APIScorerConfig):
+            eval_run = EvaluationRun(
+                organization_id=self.tracer.organization_id,
+                project_name=self.project_name,
+                eval_name=eval_run_name,
+                examples=[example],
+                scorers=[scorer],
+                model=model,
+                trace_span_id=span_id,
+            )
 
-        if example is None:
-            if any(
-                param is not None
-                for param in [
-                    input,
-                    actual_output,
-                    expected_output,
-                    context,
-                    retrieval_context,
-                    tools_called,
-                    expected_tools,
-                    additional_metadata,
-                ]
-            ):
-                example = Example(
-                    input=input,
-                    actual_output=actual_output,
-                    expected_output=expected_output,
-                    context=context,
-                    retrieval_context=retrieval_context,
-                    tools_called=tools_called,
-                    expected_tools=expected_tools,
-                    additional_metadata=additional_metadata,
-                )
-            else:
-                raise ValueError(
-                    "Either 'example' or at least one of the individual parameters (input, actual_output, etc.) must be provided"
-                )
+            self.add_eval_run(eval_run, start_time)
 
-        span_id_to_use = span_id if span_id is not None else self.get_current_span()
-
-        eval_run = EvaluationRun(
-            organization_id=self.tracer.organization_id,
-            project_name=self.project_name,
-            eval_name=f"{self.name.capitalize()}-"
-            f"{span_id_to_use}-"
-            f"[{','.join(scorer.score_type.capitalize() for scorer in scorers)}]",
-            examples=[example],
-            scorers=scorers,
-            model=model,
-            judgment_api_key=self.tracer.api_key,
-            trace_span_id=span_id_to_use,
-        )
+            if span_id:
+                current_span = self.span_id_to_span.get(span_id)
+                if current_span:
+                    self.otel_span_processor.queue_evaluation_run(
+                        eval_run, span_id=span_id, span_data=current_span
+                    )
+        elif isinstance(scorer, BaseScorer):
+            # Handle custom scorers using local evaluation queue
+            eval_run = EvaluationRun(
+                organization_id=self.tracer.organization_id,
+                project_name=self.project_name,
+                eval_name=eval_run_name,
+                examples=[example],
+                scorers=[scorer],
+                model=model,
+                judgment_api_key=self.tracer.api_key,
+                trace_span_id=span_id,
+                trace_id=self.trace_id,
+            )
 
-        self.add_eval_run(eval_run, start_time)
+            self.add_eval_run(eval_run, start_time)
 
-        if span_id_to_use:
-            current_span = self.span_id_to_span.get(span_id_to_use)
-            if current_span:
-                self.otel_span_processor.queue_evaluation_run(
-                    eval_run, span_id=span_id_to_use, span_data=current_span
-                )
+            # Enqueue the evaluation run to the local evaluation queue
+            self.tracer.local_eval_queue.enqueue(eval_run)
 
     def add_eval_run(self, eval_run: EvaluationRun, start_time: float):
         current_span_id = eval_run.trace_span_id
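TraceClient.async_evaluate now takes a single scorer and a required Example: APIScorerConfig runs are queued through the OTel span processor, while custom BaseScorer runs go to the tracer's local evaluation queue. A minimal caller-side sketch (MyCustomScorer is a hypothetical BaseScorer subclass):

    # Minimal sketch of the new single-scorer call.
    example = Example(
        input="What is the capital of France?",
        actual_output="Paris.",
    )
    trace_client.async_evaluate(scorer=MyCustomScorer(), example=example)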
@@ -517,7 +478,7 @@ class TraceClient:
 
 
 def _capture_exception_for_trace(
-    current_trace: Optional[TraceClient], exc_info: ExcInfo
+    current_trace: Optional[TraceClient], exc_info: OptExcInfo
 ):
     if not current_trace:
         return
@@ -841,6 +802,10 @@ class _DeepTracer:
         self._original_threading_trace = None
 
 
+T = TypeVar("T", bound=Callable[..., Any])
+P = ParamSpec("P")
+
+
 class Tracer:
     # Tracer.current_trace class variable is currently used in wrap()
     # TODO: Keep track of cross-context state for current trace and current span ID solely through class variables instead of instance variables?
@@ -954,6 +919,15 @@ class Tracer:
             else:
                 self.otel_span_processor = SpanProcessorBase()
 
+            # Initialize local evaluation queue for custom scorers
+            self.local_eval_queue = LocalEvaluationQueue()
+
+            # Start workers with callback to log results only if monitoring is enabled
+            if enable_evaluations and enable_monitoring:
+                self.local_eval_queue.start_workers(
+                    callback=self._log_eval_results_callback
+                )
+
             atexit.register(self._cleanup_on_exit)
         except Exception as e:
             judgeval_logger.error(
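A minimal initialization sketch; the constructor keywords are assumptions inferred from the flags used here. Evaluation workers start only when both monitoring and evaluations are enabled:

    # Sketch: both flags must be on for eval workers to start (assumed kwargs).
    tracer = Tracer(
        project_name="my-project",
        enable_monitoring=True,
        enable_evaluations=True,
    )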
@@ -1189,11 +1163,24 @@ class Tracer:
         else:
             trace_client_instance.record_state_after(state)
 
+    @overload
+    def observe(
+        self, func: T, *, name: Optional[str] = None, span_type: SpanType = "span"
+    ) -> T: ...
+
+    @overload
+    def observe(
+        self,
+        *,
+        name: Optional[str] = None,
+        span_type: SpanType = "span",
+    ) -> Callable[[T], T]: ...
+
     def observe(
         self,
-        func=None,
+        func: Optional[T] = None,
         *,
-        name=None,
+        name: Optional[str] = None,
         span_type: SpanType = "span",
     ):
         """
@@ -1210,8 +1197,8 @@
             return func if func else lambda f: f
 
         if func is None:
-            return lambda f: self.observe(
-                f,
+            return lambda func: self.observe(
+                func,
                 name=name,
                 span_type=span_type,
             )
@@ -1220,8 +1207,8 @@
 
             original_span_name = name or func.__name__
 
             # Store custom attributes on the function object
-            func._judgment_span_name = original_span_name
-            func._judgment_span_type = span_type
+            func._judgment_span_name = original_span_name  # type: ignore
+            func._judgment_span_type = span_type  # type: ignore
 
         except Exception:
             return func
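The overloads give observe precise types for both decorator forms: bare use preserves the decorated function's type (T -> T), and the parenthesized form returns a decorator (Callable[[T], T]). A usage sketch:

    @tracer.observe
    def fetch_documents(query: str) -> list: ...

    @tracer.observe(name="rerank", span_type="tool")
    def rerank(docs: list) -> list: ...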
@@ -1532,15 +1519,51 @@
 
         return decorate_class if cls is None else decorate_class(cls)
 
-    def async_evaluate(self, *args, **kwargs):
+    def async_evaluate(
+        self,
+        scorer: Union[APIScorerConfig, BaseScorer],
+        example: Example,
+        model: str = DEFAULT_GPT_MODEL,
+        sampling_rate: float = 1,
+    ):
         try:
             if not self.enable_monitoring or not self.enable_evaluations:
                 return
 
-            current_trace = self.get_current_trace()
+            if not isinstance(scorer, (APIScorerConfig, BaseScorer)):
+                judgeval_logger.warning(
+                    f"Scorer must be an instance of APIScorerConfig or BaseScorer, got {type(scorer)}, skipping evaluation"
+                )
+                return
+
+            if not isinstance(example, Example):
+                judgeval_logger.warning(
+                    f"Example must be an instance of Example, got {type(example)} skipping evaluation"
+                )
+                return
+
+            if sampling_rate < 0:
+                judgeval_logger.warning(
+                    "Cannot set sampling_rate below 0, skipping evaluation"
+                )
+                return
+
+            if sampling_rate > 1:
+                judgeval_logger.warning(
+                    "Cannot set sampling_rate above 1, skipping evaluation"
+                )
+                return
 
+            percentage = random.uniform(0, 1)
+            if percentage > sampling_rate:
+                judgeval_logger.info("Skipping async_evaluate due to sampling rate")
+                return
+
+            current_trace = self.get_current_trace()
             if current_trace:
-                current_trace.async_evaluate(*args, **kwargs)
+                current_trace.async_evaluate(
+                    scorer=scorer, example=example, model=model
+                )
             else:
                 judgeval_logger.warning(
                     "No trace found (context var or fallback), skipping evaluation"
@@ -1613,9 +1636,68 @@
         self.otel_span_processor.shutdown()
         self.otel_span_processor = SpanProcessorBase()
 
+    def wait_for_completion(self, timeout: Optional[float] = 30.0) -> bool:
+        """Wait for all evaluations and span processing to complete.
+
+        This method blocks until all queued evaluations are processed and
+        all pending spans are flushed to the server.
+
+        Args:
+            timeout: Maximum time to wait in seconds. Defaults to 30 seconds.
+                None means wait indefinitely.
+
+        Returns:
+            True if all processing completed within the timeout, False otherwise.
+
+        """
+        try:
+            judgeval_logger.debug(
+                "Waiting for all evaluations and spans to complete..."
+            )
+
+            # Wait for all queued evaluation work to complete
+            eval_completed = self.local_eval_queue.wait_for_completion()
+            if not eval_completed:
+                judgeval_logger.warning(
+                    f"Local evaluation queue did not complete within {timeout} seconds"
+                )
+                return False
+
+            self.flush_background_spans()
+
+            judgeval_logger.debug("All evaluations and spans completed successfully")
+            return True
+
+        except Exception as e:
+            judgeval_logger.warning(f"Error while waiting for completion: {e}")
+            return False
+
+    def _log_eval_results_callback(self, evaluation_run, scoring_results):
+        """Callback to log evaluation results after local processing."""
+        try:
+            if scoring_results and self.enable_evaluations and self.enable_monitoring:
+                # Convert scoring results to the format expected by API client
+                results_dict = [
+                    result.model_dump(warnings=False) for result in scoring_results
+                ]
+                api_client = JudgmentApiClient(self.api_key, self.organization_id)
+                api_client.log_evaluation_results(
+                    results_dict, evaluation_run.model_dump(warnings=False)
+                )
+        except Exception as e:
+            judgeval_logger.warning(f"Failed to log local evaluation results: {e}")
+
     def _cleanup_on_exit(self):
         """Cleanup handler called on application exit to ensure spans are flushed."""
         try:
+            # Wait for all queued evaluation work to complete before stopping
+            completed = self.local_eval_queue.wait_for_completion()
+            if not completed:
+                judgeval_logger.warning(
+                    "Local evaluation queue did not complete within 30 seconds"
+                )
+
+            self.local_eval_queue.stop_workers()
             self.flush_background_spans()
         except Exception as e:
             judgeval_logger.warning(f"Error during tracer cleanup: {e}")
@@ -1697,33 +1779,76 @@ def wrap(
 
         return wrapper
 
-    if isinstance(client, (OpenAI)):
-        setattr(client.chat.completions, "create", wrapped(original_create))
-        setattr(client.responses, "create", wrapped(original_responses_create))
-        setattr(client.beta.chat.completions, "parse", wrapped(original_beta_parse))
-    elif isinstance(client, (AsyncOpenAI)):
-        setattr(client.chat.completions, "create", wrapped_async(original_create))
-        setattr(client.responses, "create", wrapped_async(original_responses_create))
-        setattr(
-            client.beta.chat.completions, "parse", wrapped_async(original_beta_parse)
-        )
-    elif isinstance(client, (Together)):
-        setattr(client.chat.completions, "create", wrapped(original_create))
-    elif isinstance(client, (AsyncTogether)):
-        setattr(client.chat.completions, "create", wrapped_async(original_create))
-    elif isinstance(client, (Anthropic)):
-        setattr(client.messages, "create", wrapped(original_create))
-    elif isinstance(client, (AsyncAnthropic)):
-        setattr(client.messages, "create", wrapped_async(original_create))
-    elif isinstance(client, (genai.Client)):
-        setattr(client.models, "generate_content", wrapped(original_create))
-    elif isinstance(client, (genai.client.AsyncClient)):
-        setattr(client.models, "generate_content", wrapped_async(original_create))
-    elif isinstance(client, (Groq)):
-        setattr(client.chat.completions, "create", wrapped(original_create))
-    elif isinstance(client, (AsyncGroq)):
-        setattr(client.chat.completions, "create", wrapped_async(original_create))
+    if HAS_OPENAI:
+        from judgeval.common.tracer.providers import openai_OpenAI, openai_AsyncOpenAI
+
+        assert openai_OpenAI is not None, "OpenAI client not found"
+        assert openai_AsyncOpenAI is not None, "OpenAI async client not found"
+        if isinstance(client, (openai_OpenAI)):
+            setattr(client.chat.completions, "create", wrapped(original_create))
+            setattr(client.responses, "create", wrapped(original_responses_create))
+            setattr(client.beta.chat.completions, "parse", wrapped(original_beta_parse))
+        elif isinstance(client, (openai_AsyncOpenAI)):
+            setattr(client.chat.completions, "create", wrapped_async(original_create))
+            setattr(
+                client.responses, "create", wrapped_async(original_responses_create)
+            )
+            setattr(
+                client.beta.chat.completions,
+                "parse",
+                wrapped_async(original_beta_parse),
+            )
+
+    if HAS_TOGETHER:
+        from judgeval.common.tracer.providers import (
+            together_Together,
+            together_AsyncTogether,
+        )
+
+        assert together_Together is not None, "Together client not found"
+        assert together_AsyncTogether is not None, "Together async client not found"
+        if isinstance(client, (together_Together)):
+            setattr(client.chat.completions, "create", wrapped(original_create))
+        elif isinstance(client, (together_AsyncTogether)):
+            setattr(client.chat.completions, "create", wrapped_async(original_create))
+
+    if HAS_ANTHROPIC:
+        from judgeval.common.tracer.providers import (
+            anthropic_Anthropic,
+            anthropic_AsyncAnthropic,
+        )
+
+        assert anthropic_Anthropic is not None, "Anthropic client not found"
+        assert anthropic_AsyncAnthropic is not None, "Anthropic async client not found"
+        if isinstance(client, (anthropic_Anthropic)):
+            setattr(client.messages, "create", wrapped(original_create))
+        elif isinstance(client, (anthropic_AsyncAnthropic)):
+            setattr(client.messages, "create", wrapped_async(original_create))
+
+    if HAS_GOOGLE_GENAI:
+        from judgeval.common.tracer.providers import (
+            google_genai_Client,
+            google_genai_AsyncClient,
+        )
+
+        assert google_genai_Client is not None, "Google GenAI client not found"
+        assert google_genai_AsyncClient is not None, (
+            "Google GenAI async client not found"
+        )
+        if isinstance(client, (google_genai_Client)):
+            setattr(client.models, "generate_content", wrapped(original_create))
+        elif isinstance(client, (google_genai_AsyncClient)):
+            setattr(client.models, "generate_content", wrapped_async(original_create))
+
+    if HAS_GROQ:
+        from judgeval.common.tracer.providers import groq_Groq, groq_AsyncGroq
+
+        assert groq_Groq is not None, "Groq client not found"
+        assert groq_AsyncGroq is not None, "Groq async client not found"
+        if isinstance(client, (groq_Groq)):
+            setattr(client.chat.completions, "create", wrapped(original_create))
+        elif isinstance(client, (groq_AsyncGroq)):
+            setattr(client.chat.completions, "create", wrapped_async(original_create))
 
     return client
 
 
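wrap() now dispatches only on providers that are actually installed instead of importing every SDK unconditionally. A usage sketch, assuming the openai package is installed and that wrap is importable from the tracer module (import path assumed):

    from openai import OpenAI
    from judgeval.common.tracer import wrap  # assumed import path

    client = wrap(OpenAI())
    # client.chat.completions.create(...) calls are now traced.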
@@ -1749,28 +1874,87 @@ def _get_client_config(
     Raises:
         ValueError: If client type is not supported
     """
-    if isinstance(client, (OpenAI, AsyncOpenAI)):
-        return (
-            "OPENAI_API_CALL",
-            client.chat.completions.create,
-            client.responses.create,
-            None,
-            client.beta.chat.completions.parse,
-        )
-    elif isinstance(client, (Groq, AsyncGroq)):
-        return "GROQ_API_CALL", client.chat.completions.create, None, None, None
-    elif isinstance(client, (Together, AsyncTogether)):
-        return "TOGETHER_API_CALL", client.chat.completions.create, None, None, None
-    elif isinstance(client, (Anthropic, AsyncAnthropic)):
-        return (
-            "ANTHROPIC_API_CALL",
-            client.messages.create,
-            None,
-            client.messages.stream,
-            None,
-        )
-    elif isinstance(client, (genai.Client, genai.client.AsyncClient)):
-        return "GOOGLE_API_CALL", client.models.generate_content, None, None, None
+
+    if HAS_OPENAI:
+        from judgeval.common.tracer.providers import openai_OpenAI, openai_AsyncOpenAI
+
+        assert openai_OpenAI is not None, "OpenAI client not found"
+        assert openai_AsyncOpenAI is not None, "OpenAI async client not found"
+        if isinstance(client, (openai_OpenAI)):
+            return (
+                "OPENAI_API_CALL",
+                client.chat.completions.create,
+                client.responses.create,
+                None,
+                client.beta.chat.completions.parse,
+            )
+        elif isinstance(client, (openai_AsyncOpenAI)):
+            return (
+                "OPENAI_API_CALL",
+                client.chat.completions.create,
+                client.responses.create,
+                None,
+                client.beta.chat.completions.parse,
+            )
+    if HAS_TOGETHER:
+        from judgeval.common.tracer.providers import (
+            together_Together,
+            together_AsyncTogether,
+        )
+
+        assert together_Together is not None, "Together client not found"
+        assert together_AsyncTogether is not None, "Together async client not found"
+        if isinstance(client, (together_Together)):
+            return "TOGETHER_API_CALL", client.chat.completions.create, None, None, None
+        elif isinstance(client, (together_AsyncTogether)):
+            return "TOGETHER_API_CALL", client.chat.completions.create, None, None, None
+    if HAS_ANTHROPIC:
+        from judgeval.common.tracer.providers import (
+            anthropic_Anthropic,
+            anthropic_AsyncAnthropic,
+        )
+
+        assert anthropic_Anthropic is not None, "Anthropic client not found"
+        assert anthropic_AsyncAnthropic is not None, "Anthropic async client not found"
+        if isinstance(client, (anthropic_Anthropic)):
+            return (
+                "ANTHROPIC_API_CALL",
+                client.messages.create,
+                None,
+                client.messages.stream,
+                None,
+            )
+        elif isinstance(client, (anthropic_AsyncAnthropic)):
+            return (
+                "ANTHROPIC_API_CALL",
+                client.messages.create,
+                None,
+                client.messages.stream,
+                None,
+            )
+    if HAS_GOOGLE_GENAI:
+        from judgeval.common.tracer.providers import (
+            google_genai_Client,
+            google_genai_AsyncClient,
+        )
+
+        assert google_genai_Client is not None, "Google GenAI client not found"
+        assert google_genai_AsyncClient is not None, (
+            "Google GenAI async client not found"
+        )
+        if isinstance(client, (google_genai_Client)):
+            return "GOOGLE_API_CALL", client.models.generate_content, None, None, None
+        elif isinstance(client, (google_genai_AsyncClient)):
+            return "GOOGLE_API_CALL", client.models.generate_content, None, None, None
+    if HAS_GROQ:
+        from judgeval.common.tracer.providers import groq_Groq, groq_AsyncGroq
+
+        assert groq_Groq is not None, "Groq client not found"
+        assert groq_AsyncGroq is not None, "Groq async client not found"
+        if isinstance(client, (groq_Groq)):
+            return "GROQ_API_CALL", client.chat.completions.create, None, None, None
+        elif isinstance(client, (groq_AsyncGroq)):
+            return "GROQ_API_CALL", client.chat.completions.create, None, None, None
     raise ValueError(f"Unsupported client type: {type(client)}")
 
 
@@ -1794,73 +1978,173 @@ def _format_output_data(
     model_name = None
     message_content = None
 
-    if isinstance(client, (OpenAI, AsyncOpenAI)):
-        if isinstance(response, ChatCompletion):
-            model_name = response.model
-            prompt_tokens = response.usage.prompt_tokens if response.usage else 0
-            completion_tokens = (
-                response.usage.completion_tokens if response.usage else 0
-            )
-            cache_read_input_tokens = (
-                response.usage.prompt_tokens_details.cached_tokens
-                if response.usage
-                and response.usage.prompt_tokens_details
-                and response.usage.prompt_tokens_details.cached_tokens
-                else 0
-            )
-
-            if isinstance(response, ParsedChatCompletion):
-                message_content = response.choices[0].message.parsed
-            else:
-                message_content = response.choices[0].message.content
-        elif isinstance(response, Response):
-            model_name = response.model
-            prompt_tokens = response.usage.input_tokens if response.usage else 0
-            completion_tokens = response.usage.output_tokens if response.usage else 0
-            cache_read_input_tokens = (
-                response.usage.input_tokens_details.cached_tokens
-                if response.usage and response.usage.input_tokens_details
-                else 0
-            )
-            if hasattr(response.output[0], "content"):
-                message_content = "".join(
-                    seg.text
-                    for seg in response.output[0].content
-                    if hasattr(seg, "text")
-                )
-
-        # Note: LiteLLM seems to use cache_read_input_tokens to calculate the cost for OpenAI
-    elif isinstance(client, (Together, AsyncTogether)):
-        model_name = "together_ai/" + response.model
-        prompt_tokens = response.usage.prompt_tokens
-        completion_tokens = response.usage.completion_tokens
-        message_content = response.choices[0].message.content
-
-        # As of 2025-07-14, Together does not do any input cache token tracking
-    elif isinstance(client, (genai.Client, genai.client.AsyncClient)):
-        model_name = response.model_version
-        prompt_tokens = response.usage_metadata.prompt_token_count
-        completion_tokens = response.usage_metadata.candidates_token_count
-        message_content = response.candidates[0].content.parts[0].text
-
-        if hasattr(response.usage_metadata, "cached_content_token_count"):
-            cache_read_input_tokens = response.usage_metadata.cached_content_token_count
-    elif isinstance(client, (Anthropic, AsyncAnthropic)):
-        model_name = response.model
-        prompt_tokens = response.usage.input_tokens
-        completion_tokens = response.usage.output_tokens
-        cache_read_input_tokens = response.usage.cache_read_input_tokens
-        cache_creation_input_tokens = response.usage.cache_creation_input_tokens
-        message_content = response.content[0].text
-    elif isinstance(client, (Groq, AsyncGroq)):
-        model_name = "groq/" + response.model
-        prompt_tokens = response.usage.prompt_tokens
-        completion_tokens = response.usage.completion_tokens
-        message_content = response.choices[0].message.content
-    else:
-        judgeval_logger.warning(f"Unsupported client type: {type(client)}")
-        return None, None
+    if HAS_OPENAI:
+        from judgeval.common.tracer.providers import (
+            openai_OpenAI,
+            openai_AsyncOpenAI,
+            openai_ChatCompletion,
+            openai_Response,
+            openai_ParsedChatCompletion,
+        )
+
+        assert openai_OpenAI is not None, "OpenAI client not found"
+        assert openai_AsyncOpenAI is not None, "OpenAI async client not found"
+        assert openai_ChatCompletion is not None, "OpenAI chat completion not found"
+        assert openai_Response is not None, "OpenAI response not found"
+        assert openai_ParsedChatCompletion is not None, (
+            "OpenAI parsed chat completion not found"
+        )
+
+        if isinstance(client, (openai_OpenAI, openai_AsyncOpenAI)):
+            if isinstance(response, openai_ChatCompletion):
+                model_name = response.model
+                prompt_tokens = response.usage.prompt_tokens if response.usage else 0
+                completion_tokens = (
+                    response.usage.completion_tokens if response.usage else 0
+                )
+                cache_read_input_tokens = (
+                    response.usage.prompt_tokens_details.cached_tokens
+                    if response.usage
+                    and response.usage.prompt_tokens_details
+                    and response.usage.prompt_tokens_details.cached_tokens
+                    else 0
+                )
+
+                if isinstance(response, openai_ParsedChatCompletion):
+                    message_content = response.choices[0].message.parsed
+                else:
+                    message_content = response.choices[0].message.content
+            elif isinstance(response, openai_Response):
+                model_name = response.model
+                prompt_tokens = response.usage.input_tokens if response.usage else 0
+                completion_tokens = (
+                    response.usage.output_tokens if response.usage else 0
+                )
+                cache_read_input_tokens = (
+                    response.usage.input_tokens_details.cached_tokens
+                    if response.usage and response.usage.input_tokens_details
+                    else 0
+                )
+                if hasattr(response.output[0], "content"):
+                    message_content = "".join(
+                        seg.text
+                        for seg in response.output[0].content
+                        if hasattr(seg, "text")
+                    )
+            # Note: LiteLLM seems to use cache_read_input_tokens to calculate the cost for OpenAI
+            return message_content, _create_usage(
+                model_name,
+                prompt_tokens,
+                completion_tokens,
+                cache_read_input_tokens,
+                cache_creation_input_tokens,
+            )
+
+    if HAS_TOGETHER:
+        from judgeval.common.tracer.providers import (
+            together_Together,
+            together_AsyncTogether,
+        )
+
+        assert together_Together is not None, "Together client not found"
+        assert together_AsyncTogether is not None, "Together async client not found"
+
+        if isinstance(client, (together_Together, together_AsyncTogether)):
+            model_name = "together_ai/" + response.model
+            prompt_tokens = response.usage.prompt_tokens
+            completion_tokens = response.usage.completion_tokens
+            message_content = response.choices[0].message.content
+
+            # As of 2025-07-14, Together does not do any input cache token tracking
+            return message_content, _create_usage(
+                model_name,
+                prompt_tokens,
+                completion_tokens,
+                cache_read_input_tokens,
+                cache_creation_input_tokens,
+            )
+
+    if HAS_GOOGLE_GENAI:
+        from judgeval.common.tracer.providers import (
+            google_genai_Client,
+            google_genai_AsyncClient,
+        )
+
+        assert google_genai_Client is not None, "Google GenAI client not found"
+        assert google_genai_AsyncClient is not None, (
+            "Google GenAI async client not found"
+        )
+        if isinstance(client, (google_genai_Client, google_genai_AsyncClient)):
+            model_name = response.model_version
+            prompt_tokens = response.usage_metadata.prompt_token_count
+            completion_tokens = response.usage_metadata.candidates_token_count
+            message_content = response.candidates[0].content.parts[0].text
+
+            if hasattr(response.usage_metadata, "cached_content_token_count"):
+                cache_read_input_tokens = (
+                    response.usage_metadata.cached_content_token_count
+                )
+            return message_content, _create_usage(
+                model_name,
+                prompt_tokens,
+                completion_tokens,
+                cache_read_input_tokens,
+                cache_creation_input_tokens,
+            )
+
+    if HAS_ANTHROPIC:
+        from judgeval.common.tracer.providers import (
+            anthropic_Anthropic,
+            anthropic_AsyncAnthropic,
+        )
+
+        assert anthropic_Anthropic is not None, "Anthropic client not found"
+        assert anthropic_AsyncAnthropic is not None, "Anthropic async client not found"
+        if isinstance(client, (anthropic_Anthropic, anthropic_AsyncAnthropic)):
+            model_name = response.model
+            prompt_tokens = response.usage.input_tokens
+            completion_tokens = response.usage.output_tokens
+            cache_read_input_tokens = response.usage.cache_read_input_tokens
+            cache_creation_input_tokens = response.usage.cache_creation_input_tokens
+            message_content = response.content[0].text
+            return message_content, _create_usage(
+                model_name,
+                prompt_tokens,
+                completion_tokens,
+                cache_read_input_tokens,
+                cache_creation_input_tokens,
+            )
+
+    if HAS_GROQ:
+        from judgeval.common.tracer.providers import groq_Groq, groq_AsyncGroq
+
+        assert groq_Groq is not None, "Groq client not found"
+        assert groq_AsyncGroq is not None, "Groq async client not found"
+        if isinstance(client, (groq_Groq, groq_AsyncGroq)):
+            model_name = "groq/" + response.model
+            prompt_tokens = response.usage.prompt_tokens
+            completion_tokens = response.usage.completion_tokens
+            message_content = response.choices[0].message.content
+            return message_content, _create_usage(
+                model_name,
+                prompt_tokens,
+                completion_tokens,
+                cache_read_input_tokens,
+                cache_creation_input_tokens,
+            )
+
+    judgeval_logger.warning(f"Unsupported client type: {type(client)}")
+    return None, None
+
 
+def _create_usage(
+    model_name: str,
+    prompt_tokens: int,
+    completion_tokens: int,
+    cache_read_input_tokens: int = 0,
+    cache_creation_input_tokens: int = 0,
+) -> TraceUsage:
+    """Helper function to create TraceUsage object with cost calculation."""
     prompt_cost, completion_cost = cost_per_token(
         model=model_name,
         prompt_tokens=prompt_tokens,
@@ -1871,7 +2155,7 @@ def _format_output_data(
     total_cost_usd = (
         (prompt_cost + completion_cost) if prompt_cost and completion_cost else None
     )
-    usage = TraceUsage(
+    return TraceUsage(
        prompt_tokens=prompt_tokens,
        completion_tokens=completion_tokens,
        total_tokens=prompt_tokens + completion_tokens,
@@ -1882,7 +2166,6 @@ def _format_output_data(
        total_cost_usd=total_cost_usd,
        model_name=model_name,
     )
-    return message_content, usage
 
 
 def combine_args_kwargs(func, args, kwargs):
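The per-provider token and cost bookkeeping now funnels through the new _create_usage helper, which prices tokens via litellm's cost_per_token. A minimal sketch (model name and counts are illustrative):

    # Sketch: build a TraceUsage from raw token counts.
    usage = _create_usage(
        model_name="gpt-4o",
        prompt_tokens=120,
        completion_tokens=48,
    )
    # usage.total_tokens == 168; cost fields derive from litellm pricing.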