judgeval 0.16.5__tar.gz → 0.16.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of judgeval might be problematic. Click here for more details.

Files changed (149)
  1. {judgeval-0.16.5 → judgeval-0.16.7}/PKG-INFO +1 -1
  2. {judgeval-0.16.5 → judgeval-0.16.7}/pyproject.toml +1 -1
  3. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/__init__.py +7 -2
  4. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +15 -4
  5. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/tracer/__init__.py +9 -1
  6. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/tracer/llm/llm_anthropic/wrapper.py +160 -130
  7. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/tracer/llm/llm_google/wrapper.py +137 -98
  8. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/tracer/llm/llm_groq/wrapper.py +137 -116
  9. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/tracer/llm/llm_openai/wrapper.py +130 -106
  10. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/tracer/llm/llm_together/wrapper.py +145 -120
  11. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/tracer/utils.py +1 -1
  12. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/utils/decorators/dont_throw.py +1 -1
  13. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/version.py +1 -1
  14. {judgeval-0.16.5 → judgeval-0.16.7}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  15. {judgeval-0.16.5 → judgeval-0.16.7}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  16. {judgeval-0.16.5 → judgeval-0.16.7}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  17. {judgeval-0.16.5 → judgeval-0.16.7}/.github/pull_request_template.md +0 -0
  18. {judgeval-0.16.5 → judgeval-0.16.7}/.github/workflows/blocked-pr.yaml +0 -0
  19. {judgeval-0.16.5 → judgeval-0.16.7}/.github/workflows/ci.yaml +0 -0
  20. {judgeval-0.16.5 → judgeval-0.16.7}/.github/workflows/claude-code-review.yml +0 -0
  21. {judgeval-0.16.5 → judgeval-0.16.7}/.github/workflows/claude.yml +0 -0
  22. {judgeval-0.16.5 → judgeval-0.16.7}/.github/workflows/lint.yaml +0 -0
  23. {judgeval-0.16.5 → judgeval-0.16.7}/.github/workflows/merge-branch-check.yaml +0 -0
  24. {judgeval-0.16.5 → judgeval-0.16.7}/.github/workflows/mypy.yaml +0 -0
  25. {judgeval-0.16.5 → judgeval-0.16.7}/.github/workflows/pre-commit-autoupdate.yaml +0 -0
  26. {judgeval-0.16.5 → judgeval-0.16.7}/.github/workflows/release.yaml +0 -0
  27. {judgeval-0.16.5 → judgeval-0.16.7}/.github/workflows/validate-branch.yaml +0 -0
  28. {judgeval-0.16.5 → judgeval-0.16.7}/.gitignore +0 -0
  29. {judgeval-0.16.5 → judgeval-0.16.7}/.pre-commit-config.yaml +0 -0
  30. {judgeval-0.16.5 → judgeval-0.16.7}/CONTRIBUTING.md +0 -0
  31. {judgeval-0.16.5 → judgeval-0.16.7}/LICENSE.md +0 -0
  32. {judgeval-0.16.5 → judgeval-0.16.7}/README.md +0 -0
  33. {judgeval-0.16.5 → judgeval-0.16.7}/assets/Screenshot 2025-05-17 at 8.14.27 PM.png +0 -0
  34. {judgeval-0.16.5 → judgeval-0.16.7}/assets/agent.gif +0 -0
  35. {judgeval-0.16.5 → judgeval-0.16.7}/assets/agent_trace_example.png +0 -0
  36. {judgeval-0.16.5 → judgeval-0.16.7}/assets/brand/company.jpg +0 -0
  37. {judgeval-0.16.5 → judgeval-0.16.7}/assets/brand/company_banner.jpg +0 -0
  38. {judgeval-0.16.5 → judgeval-0.16.7}/assets/brand/darkmode.svg +0 -0
  39. {judgeval-0.16.5 → judgeval-0.16.7}/assets/brand/full_logo.png +0 -0
  40. {judgeval-0.16.5 → judgeval-0.16.7}/assets/brand/icon.png +0 -0
  41. {judgeval-0.16.5 → judgeval-0.16.7}/assets/brand/lightmode.svg +0 -0
  42. {judgeval-0.16.5 → judgeval-0.16.7}/assets/brand/white_background.png +0 -0
  43. {judgeval-0.16.5 → judgeval-0.16.7}/assets/custom_scorer_online_abm.png +0 -0
  44. {judgeval-0.16.5 → judgeval-0.16.7}/assets/data.gif +0 -0
  45. {judgeval-0.16.5 → judgeval-0.16.7}/assets/dataset_clustering_screenshot.png +0 -0
  46. {judgeval-0.16.5 → judgeval-0.16.7}/assets/dataset_clustering_screenshot_dm.png +0 -0
  47. {judgeval-0.16.5 → judgeval-0.16.7}/assets/datasets_preview_screenshot.png +0 -0
  48. {judgeval-0.16.5 → judgeval-0.16.7}/assets/document.gif +0 -0
  49. {judgeval-0.16.5 → judgeval-0.16.7}/assets/error_analysis_dashboard.png +0 -0
  50. {judgeval-0.16.5 → judgeval-0.16.7}/assets/errors.png +0 -0
  51. {judgeval-0.16.5 → judgeval-0.16.7}/assets/experiments_dashboard_screenshot.png +0 -0
  52. {judgeval-0.16.5 → judgeval-0.16.7}/assets/experiments_page.png +0 -0
  53. {judgeval-0.16.5 → judgeval-0.16.7}/assets/experiments_pagev2.png +0 -0
  54. {judgeval-0.16.5 → judgeval-0.16.7}/assets/logo_darkmode.svg +0 -0
  55. {judgeval-0.16.5 → judgeval-0.16.7}/assets/logo_lightmode.svg +0 -0
  56. {judgeval-0.16.5 → judgeval-0.16.7}/assets/monitoring_screenshot.png +0 -0
  57. {judgeval-0.16.5 → judgeval-0.16.7}/assets/online_eval.png +0 -0
  58. {judgeval-0.16.5 → judgeval-0.16.7}/assets/product_shot.png +0 -0
  59. {judgeval-0.16.5 → judgeval-0.16.7}/assets/quickstart_trajectory_ss.png +0 -0
  60. {judgeval-0.16.5 → judgeval-0.16.7}/assets/test.png +0 -0
  61. {judgeval-0.16.5 → judgeval-0.16.7}/assets/tests.png +0 -0
  62. {judgeval-0.16.5 → judgeval-0.16.7}/assets/trace.gif +0 -0
  63. {judgeval-0.16.5 → judgeval-0.16.7}/assets/trace_demo.png +0 -0
  64. {judgeval-0.16.5 → judgeval-0.16.7}/assets/trace_screenshot.png +0 -0
  65. {judgeval-0.16.5 → judgeval-0.16.7}/assets/trace_screenshot_old.png +0 -0
  66. {judgeval-0.16.5 → judgeval-0.16.7}/pytest.ini +0 -0
  67. {judgeval-0.16.5 → judgeval-0.16.7}/scripts/api_generator.py +0 -0
  68. {judgeval-0.16.5 → judgeval-0.16.7}/scripts/openapi_transform.py +0 -0
  69. {judgeval-0.16.5 → judgeval-0.16.7}/scripts/update_types.sh +0 -0
  70. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/api/__init__.py +0 -0
  71. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/api/api_types.py +0 -0
  72. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/cli.py +0 -0
  73. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/constants.py +0 -0
  74. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/data/__init__.py +0 -0
  75. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/data/evaluation_run.py +0 -0
  76. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/data/example.py +0 -0
  77. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/data/judgment_types.py +0 -0
  78. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/data/result.py +0 -0
  79. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/data/scorer_data.py +0 -0
  80. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/data/scripts/fix_default_factory.py +0 -0
  81. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/data/scripts/openapi_transform.py +0 -0
  82. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/data/trace.py +0 -0
  83. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/dataset/__init__.py +0 -0
  84. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/env.py +0 -0
  85. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/evaluation/__init__.py +0 -0
  86. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/exceptions.py +0 -0
  87. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/integrations/langgraph/__init__.py +0 -0
  88. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/integrations/openlit/__init__.py +0 -0
  89. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/judges/__init__.py +0 -0
  90. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/judges/base_judge.py +0 -0
  91. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/judges/litellm_judge.py +0 -0
  92. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/judges/together_judge.py +0 -0
  93. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/judges/utils.py +0 -0
  94. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/logger.py +0 -0
  95. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/scorers/__init__.py +0 -0
  96. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/scorers/agent_scorer.py +0 -0
  97. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/scorers/api_scorer.py +0 -0
  98. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/scorers/base_scorer.py +0 -0
  99. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/scorers/example_scorer.py +0 -0
  100. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/scorers/exceptions.py +0 -0
  101. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
  102. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
  103. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
  104. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
  105. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
  106. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -0
  107. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/scorers/score.py +0 -0
  108. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/scorers/utils.py +0 -0
  109. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/tracer/constants.py +0 -0
  110. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/tracer/exporters/__init__.py +0 -0
  111. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/tracer/exporters/s3.py +0 -0
  112. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/tracer/exporters/store.py +0 -0
  113. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/tracer/exporters/utils.py +0 -0
  114. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/tracer/keys.py +0 -0
  115. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/tracer/llm/__init__.py +0 -0
  116. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/tracer/llm/config.py +0 -0
  117. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/tracer/llm/constants.py +0 -0
  118. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/tracer/llm/llm_anthropic/__init__.py +0 -0
  119. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/tracer/llm/llm_anthropic/config.py +0 -0
  120. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/tracer/llm/llm_google/__init__.py +0 -0
  121. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/tracer/llm/llm_google/config.py +0 -0
  122. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/tracer/llm/llm_groq/__init__.py +0 -0
  123. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/tracer/llm/llm_groq/config.py +0 -0
  124. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/tracer/llm/llm_openai/__init__.py +0 -0
  125. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/tracer/llm/llm_openai/config.py +0 -0
  126. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/tracer/llm/llm_together/__init__.py +0 -0
  127. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/tracer/llm/llm_together/config.py +0 -0
  128. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/tracer/llm/providers.py +0 -0
  129. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/tracer/local_eval_queue.py +0 -0
  130. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/tracer/managers.py +0 -0
  131. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/tracer/processors/__init__.py +0 -0
  132. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/trainer/__init__.py +0 -0
  133. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/trainer/config.py +0 -0
  134. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/trainer/console.py +0 -0
  135. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/trainer/trainable_model.py +0 -0
  136. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/trainer/trainer.py +0 -0
  137. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/utils/async_utils.py +0 -0
  138. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/utils/decorators/__init__.py +0 -0
  139. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/utils/decorators/use_once.py +0 -0
  140. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/utils/file_utils.py +0 -0
  141. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/utils/guards.py +0 -0
  142. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/utils/meta.py +0 -0
  143. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/utils/serialize.py +0 -0
  144. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/utils/testing.py +0 -0
  145. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/utils/url.py +0 -0
  146. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/utils/version_check.py +0 -0
  147. {judgeval-0.16.5 → judgeval-0.16.7}/src/judgeval/warnings.py +0 -0
  148. {judgeval-0.16.5 → judgeval-0.16.7}/update_version.py +0 -0
  149. {judgeval-0.16.5 → judgeval-0.16.7}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: judgeval
3
- Version: 0.16.5
3
+ Version: 0.16.7
4
4
  Summary: Judgeval Package
5
5
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
6
6
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "judgeval"
3
- version = "0.16.5"
3
+ version = "0.16.7"
4
4
  authors = [
5
5
  { name = "Andrew Li", email = "andrew@judgmentlabs.ai" },
6
6
  { name = "Alex Shan", email = "alex@judgmentlabs.ai" },
@@ -39,18 +39,23 @@ class JudgmentClient(metaclass=SingletonMeta):
39
39
  def run_evaluation(
40
40
  self,
41
41
  examples: List[Example],
42
- scorers: Sequence[Union[ExampleAPIScorerConfig, ExampleScorer]],
42
+ scorers: Sequence[Union[ExampleAPIScorerConfig, ExampleScorer, None]],
43
43
  project_name: str = "default_project",
44
44
  eval_run_name: str = "default_eval_run",
45
45
  model: Optional[str] = None,
46
46
  assert_test: bool = False,
47
47
  ) -> List[ScoringResult]:
48
48
  try:
49
+ for scorer in scorers:
50
+ if scorer is None:
51
+ raise ValueError(
52
+ "Failed to run evaluation: At least one Prompt Scorer was not successfuly retrieved."
53
+ )
49
54
  eval = ExampleEvaluationRun(
50
55
  project_name=project_name,
51
56
  eval_name=eval_run_name,
52
57
  examples=examples,
53
- scorers=scorers,
58
+ scorers=scorers, # type: ignore
54
59
  model=model,
55
60
  )
56
61
 
@@ -12,6 +12,7 @@ from judgeval.logger import judgeval_logger
12
12
  from abc import ABC
13
13
  from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
14
14
  from copy import copy
15
+ from judgeval.utils.decorators.dont_throw import dont_throw
15
16
 
16
17
 
17
18
  def push_prompt_scorer(
@@ -60,10 +61,19 @@ def fetch_prompt_scorer(
60
61
  ):
61
62
  client = JudgmentSyncClient(judgment_api_key, organization_id)
62
63
  try:
63
- scorer_config = client.fetch_scorers({"names": [name]})["scorers"][0]
64
- scorer_config.pop("created_at")
65
- scorer_config.pop("updated_at")
66
- return scorer_config
64
+ fetched_scorers = client.fetch_scorers({"names": [name]})
65
+ if len(fetched_scorers["scorers"]) == 0:
66
+ judgeval_logger.error(f"Prompt scorer '{name}' not found")
67
+ raise JudgmentAPIError(
68
+ status_code=404,
69
+ detail=f"Prompt scorer '{name}' not found",
70
+ response=None, # type: ignore
71
+ )
72
+ else:
73
+ scorer_config = fetched_scorers["scorers"][0]
74
+ scorer_config.pop("created_at")
75
+ scorer_config.pop("updated_at")
76
+ return scorer_config
67
77
  except JudgmentAPIError as e:
68
78
  if e.status_code == 500:
69
79
  raise JudgmentAPIError(
@@ -109,6 +119,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
109
119
  organization_id: str = os.getenv("JUDGMENT_ORG_ID") or ""
110
120
 
111
121
  @classmethod
122
+ @dont_throw
112
123
  def get(
113
124
  cls,
114
125
  name: str,
@@ -267,6 +267,7 @@ class Tracer(metaclass=SingletonMeta):
267
267
  if span and span.is_recording():
268
268
  set_span_attribute(span, AttributeKeys.JUDGMENT_CUSTOMER_ID, customer_id)
269
269
 
270
+ @dont_throw
270
271
  def add_agent_attributes_to_span(self, span):
271
272
  """Add agent ID, class name, and instance name to span if they exist in context"""
272
273
  current_agent_context = self.agent_context.get()
@@ -342,6 +343,9 @@ class Tracer(metaclass=SingletonMeta):
342
343
  run_condition = scorer_config.run_condition
343
344
  sampling_rate = scorer_config.sampling_rate
344
345
 
346
+ if scorer is None:
347
+ judgeval_logger.error("Prompt Scorer was not found, skipping evaluation.")
348
+ return
345
349
  if not isinstance(scorer, (TraceAPIScorerConfig)):
346
350
  judgeval_logger.error(
347
351
  "Scorer must be an instance of TraceAPIScorerConfig, got %s, skipping evaluation."
@@ -877,7 +881,7 @@ class Tracer(metaclass=SingletonMeta):
877
881
  self,
878
882
  /,
879
883
  *,
880
- scorer: Union[ExampleAPIScorerConfig, ExampleScorer],
884
+ scorer: Union[ExampleAPIScorerConfig, ExampleScorer, None],
881
885
  example: Example,
882
886
  model: Optional[str] = None,
883
887
  sampling_rate: float = 1.0,
@@ -886,6 +890,10 @@ class Tracer(metaclass=SingletonMeta):
886
890
  judgeval_logger.info("Evaluation is not enabled, skipping evaluation")
887
891
  return
888
892
 
893
+ if scorer is None:
894
+ judgeval_logger.error("Prompt Scorer was not found, skipping evaluation.")
895
+ return
896
+
889
897
  if not isinstance(scorer, (ExampleAPIScorerConfig, ExampleScorer)):
890
898
  judgeval_logger.error(
891
899
  "Scorer must be an instance of ExampleAPIScorerConfig or ExampleScorer, got %s, skipping evaluation."
@@ -1,11 +1,11 @@
1
1
  from __future__ import annotations
2
2
  import functools
3
- import orjson
4
3
  from typing import (
5
4
  TYPE_CHECKING,
6
5
  Callable,
7
6
  Optional,
8
7
  Protocol,
8
+ TypeVar,
9
9
  Tuple,
10
10
  Union,
11
11
  Iterator,
@@ -19,6 +19,7 @@ from judgeval.tracer.llm.llm_anthropic.config import (
19
19
  anthropic_AsyncAnthropic,
20
20
  )
21
21
  from judgeval.tracer.managers import sync_span_context, async_span_context
22
+ from judgeval.logger import judgeval_logger
22
23
  from judgeval.tracer.keys import AttributeKeys
23
24
  from judgeval.tracer.utils import set_span_attribute
24
25
  from judgeval.utils.serialize import safe_serialize
@@ -28,10 +29,6 @@ if TYPE_CHECKING:
28
29
  from opentelemetry.trace import Span
29
30
 
30
31
 
31
- # Keep the original client type for runtime compatibility
32
- AnthropicClientType = Union[anthropic_Anthropic, anthropic_AsyncAnthropic]
33
-
34
-
35
32
  # Content block protocols
36
33
  @runtime_checkable
37
34
  class AnthropicContentBlock(Protocol):
@@ -81,6 +78,10 @@ class AnthropicAsyncClient(Protocol):
81
78
  pass
82
79
 
83
80
 
81
+ # Generic client type bound to both sync and async client protocols
82
+ TClient = TypeVar("TClient", bound=Union[AnthropicClient, AnthropicAsyncClient])
83
+
84
+
84
85
  # Union types
85
86
  AnthropicResponseType = AnthropicMessage
86
87
  AnthropicStreamType = Union[
@@ -193,7 +194,7 @@ class TracedAnthropicGenerator:
193
194
  self,
194
195
  tracer: Tracer,
195
196
  generator: Iterator[AnthropicStreamEvent],
196
- client: AnthropicClientType,
197
+ client: AnthropicClient,
197
198
  span: Span,
198
199
  model_name: str,
199
200
  ):
@@ -261,7 +262,7 @@ class TracedAnthropicAsyncGenerator:
261
262
  self,
262
263
  tracer: Tracer,
263
264
  async_generator: AsyncIterator[AnthropicStreamEvent],
264
- client: AnthropicClientType,
265
+ client: AnthropicAsyncClient,
265
266
  span: Span,
266
267
  model_name: str,
267
268
  ):
@@ -278,6 +279,19 @@ class TracedAnthropicAsyncGenerator:
278
279
  async def __anext__(self) -> AnthropicStreamEvent:
279
280
  try:
280
281
  chunk = await self.async_generator.__anext__()
282
+ except StopAsyncIteration:
283
+ set_span_attribute(
284
+ self.span, AttributeKeys.GEN_AI_COMPLETION, self.accumulated_content
285
+ )
286
+ self.span.end()
287
+ raise
288
+ except Exception as e:
289
+ if self.span:
290
+ self.span.record_exception(e)
291
+ self.span.end()
292
+ raise
293
+
294
+ try:
281
295
  content = _extract_anthropic_content(chunk)
282
296
  if content:
283
297
  self.accumulated_content += content
@@ -310,18 +324,14 @@ class TracedAnthropicAsyncGenerator:
310
324
  AttributeKeys.JUDGMENT_USAGE_METADATA,
311
325
  safe_serialize(usage_data),
312
326
  )
313
- return chunk
314
- except StopAsyncIteration:
315
- set_span_attribute(
316
- self.span, AttributeKeys.GEN_AI_COMPLETION, self.accumulated_content
317
- )
318
- self.span.end()
319
- raise
320
327
  except Exception as e:
321
328
  if self.span:
322
- self.span.record_exception(e)
323
329
  self.span.end()
324
- raise
330
+ judgeval_logger.error(
331
+ f"[anthropic wrapped_async] Error adding span metadata: {e}"
332
+ )
333
+ finally:
334
+ return chunk
325
335
 
326
336
 
327
337
  class TracedAnthropicSyncContextManager:
@@ -329,7 +339,7 @@ class TracedAnthropicSyncContextManager:
329
339
  self,
330
340
  tracer: Tracer,
331
341
  context_manager,
332
- client: AnthropicClientType,
342
+ client: AnthropicClient,
333
343
  span: Span,
334
344
  model_name: str,
335
345
  ):
@@ -354,7 +364,7 @@ class TracedAnthropicAsyncContextManager:
354
364
  self,
355
365
  tracer: Tracer,
356
366
  context_manager,
357
- client: AnthropicClientType,
367
+ client: AnthropicAsyncClient,
358
368
  span: Span,
359
369
  model_name: str,
360
370
  ):
@@ -374,9 +384,7 @@ class TracedAnthropicAsyncContextManager:
374
384
  return await self.context_manager.__aexit__(exc_type, exc_val, exc_tb)
375
385
 
376
386
 
377
- def wrap_anthropic_client(
378
- tracer: Tracer, client: AnthropicClientType
379
- ) -> AnthropicClientType:
387
+ def wrap_anthropic_client(tracer: Tracer, client: TClient) -> TClient:
380
388
  def wrapped(function: Callable, span_name: str):
381
389
  @functools.wraps(function)
382
390
  def wrapper(*args, **kwargs):
@@ -398,68 +406,77 @@ def wrap_anthropic_client(
398
406
  with sync_span_context(
399
407
  tracer, span_name, {AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
400
408
  ) as span:
401
- tracer.add_agent_attributes_to_span(span)
402
- set_span_attribute(
403
- span, AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
404
- )
405
- model_name = kwargs.get("model", "")
406
- set_span_attribute(
407
- span, AttributeKeys.GEN_AI_REQUEST_MODEL, model_name
408
- )
409
-
410
- response = function(*args, **kwargs)
411
-
412
- if isinstance(response, AnthropicMessage):
413
- output, usage_data = _format_anthropic_output(response)
414
- # Serialize structured data to JSON for span attribute
415
- if isinstance(output, list):
416
- output_str = orjson.dumps(
417
- output, option=orjson.OPT_INDENT_2
418
- ).decode()
419
- else:
420
- output_str = str(output) if output is not None else None
409
+ try:
410
+ tracer.add_agent_attributes_to_span(span)
411
+ set_span_attribute(
412
+ span, AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
413
+ )
414
+ model_name = kwargs.get("model", "")
421
415
  set_span_attribute(
422
- span, AttributeKeys.GEN_AI_COMPLETION, output_str
416
+ span, AttributeKeys.GEN_AI_REQUEST_MODEL, model_name
417
+ )
418
+ except Exception as e:
419
+ judgeval_logger.error(
420
+ f"[anthropic wrapped] Error adding span metadata: {e}"
423
421
  )
424
422
 
425
- if usage_data:
426
- (
427
- prompt_tokens,
428
- completion_tokens,
429
- cache_read,
430
- cache_creation,
431
- ) = _extract_anthropic_tokens(usage_data)
432
- set_span_attribute(
433
- span,
434
- AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS,
435
- prompt_tokens,
436
- )
437
- set_span_attribute(
438
- span,
439
- AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS,
440
- completion_tokens,
441
- )
442
- set_span_attribute(
443
- span,
444
- AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS,
445
- cache_read,
446
- )
423
+ response = function(*args, **kwargs)
424
+
425
+ try:
426
+ if isinstance(response, AnthropicMessage):
427
+ output, usage_data = _format_anthropic_output(response)
428
+ # Serialize structured data to JSON for span attribute
429
+ if isinstance(output, list):
430
+ output_str = safe_serialize(output)
431
+ else:
432
+ output_str = str(output) if output is not None else None
447
433
  set_span_attribute(
448
- span,
449
- AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS,
450
- cache_creation,
434
+ span, AttributeKeys.GEN_AI_COMPLETION, output_str
451
435
  )
436
+
437
+ if usage_data:
438
+ (
439
+ prompt_tokens,
440
+ completion_tokens,
441
+ cache_read,
442
+ cache_creation,
443
+ ) = _extract_anthropic_tokens(usage_data)
444
+ set_span_attribute(
445
+ span,
446
+ AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS,
447
+ prompt_tokens,
448
+ )
449
+ set_span_attribute(
450
+ span,
451
+ AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS,
452
+ completion_tokens,
453
+ )
454
+ set_span_attribute(
455
+ span,
456
+ AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS,
457
+ cache_read,
458
+ )
459
+ set_span_attribute(
460
+ span,
461
+ AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS,
462
+ cache_creation,
463
+ )
464
+ set_span_attribute(
465
+ span,
466
+ AttributeKeys.JUDGMENT_USAGE_METADATA,
467
+ safe_serialize(usage_data),
468
+ )
452
469
  set_span_attribute(
453
470
  span,
454
- AttributeKeys.JUDGMENT_USAGE_METADATA,
455
- safe_serialize(usage_data),
471
+ AttributeKeys.GEN_AI_RESPONSE_MODEL,
472
+ getattr(response, "model", model_name),
456
473
  )
457
- set_span_attribute(
458
- span,
459
- AttributeKeys.GEN_AI_RESPONSE_MODEL,
460
- getattr(response, "model", model_name),
474
+ except Exception as e:
475
+ judgeval_logger.error(
476
+ f"[anthropic wrapped] Error adding span metadata: {e}"
461
477
  )
462
- return response
478
+ finally:
479
+ return response
463
480
 
464
481
  return wrapper
465
482
 
@@ -484,68 +501,77 @@ def wrap_anthropic_client(
484
501
  async with async_span_context(
485
502
  tracer, span_name, {AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
486
503
  ) as span:
487
- tracer.add_agent_attributes_to_span(span)
488
- set_span_attribute(
489
- span, AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
490
- )
491
- model_name = kwargs.get("model", "")
492
- set_span_attribute(
493
- span, AttributeKeys.GEN_AI_REQUEST_MODEL, model_name
494
- )
495
-
496
- response = await function(*args, **kwargs)
497
-
498
- if isinstance(response, AnthropicMessage):
499
- output, usage_data = _format_anthropic_output(response)
500
- # Serialize structured data to JSON for span attribute
501
- if isinstance(output, list):
502
- output_str = orjson.dumps(
503
- output, option=orjson.OPT_INDENT_2
504
- ).decode()
505
- else:
506
- output_str = str(output) if output is not None else None
504
+ try:
505
+ tracer.add_agent_attributes_to_span(span)
506
+ set_span_attribute(
507
+ span, AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
508
+ )
509
+ model_name = kwargs.get("model", "")
507
510
  set_span_attribute(
508
- span, AttributeKeys.GEN_AI_COMPLETION, output_str
511
+ span, AttributeKeys.GEN_AI_REQUEST_MODEL, model_name
512
+ )
513
+ except Exception as e:
514
+ judgeval_logger.error(
515
+ f"[anthropic wrapped_async] Error adding span metadata: {e}"
509
516
  )
510
517
 
511
- if usage_data:
512
- (
513
- prompt_tokens,
514
- completion_tokens,
515
- cache_read,
516
- cache_creation,
517
- ) = _extract_anthropic_tokens(usage_data)
518
- set_span_attribute(
519
- span,
520
- AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS,
521
- prompt_tokens,
522
- )
523
- set_span_attribute(
524
- span,
525
- AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS,
526
- completion_tokens,
527
- )
528
- set_span_attribute(
529
- span,
530
- AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS,
531
- cache_read,
532
- )
518
+ response = await function(*args, **kwargs)
519
+
520
+ try:
521
+ if isinstance(response, AnthropicMessage):
522
+ output, usage_data = _format_anthropic_output(response)
523
+ # Serialize structured data to JSON for span attribute
524
+ if isinstance(output, list):
525
+ output_str = safe_serialize(output)
526
+ else:
527
+ output_str = str(output) if output is not None else None
533
528
  set_span_attribute(
534
- span,
535
- AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS,
536
- cache_creation,
529
+ span, AttributeKeys.GEN_AI_COMPLETION, output_str
537
530
  )
531
+
532
+ if usage_data:
533
+ (
534
+ prompt_tokens,
535
+ completion_tokens,
536
+ cache_read,
537
+ cache_creation,
538
+ ) = _extract_anthropic_tokens(usage_data)
539
+ set_span_attribute(
540
+ span,
541
+ AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS,
542
+ prompt_tokens,
543
+ )
544
+ set_span_attribute(
545
+ span,
546
+ AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS,
547
+ completion_tokens,
548
+ )
549
+ set_span_attribute(
550
+ span,
551
+ AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS,
552
+ cache_read,
553
+ )
554
+ set_span_attribute(
555
+ span,
556
+ AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS,
557
+ cache_creation,
558
+ )
559
+ set_span_attribute(
560
+ span,
561
+ AttributeKeys.JUDGMENT_USAGE_METADATA,
562
+ safe_serialize(usage_data),
563
+ )
538
564
  set_span_attribute(
539
565
  span,
540
- AttributeKeys.JUDGMENT_USAGE_METADATA,
541
- safe_serialize(usage_data),
566
+ AttributeKeys.GEN_AI_RESPONSE_MODEL,
567
+ getattr(response, "model", model_name),
542
568
  )
543
- set_span_attribute(
544
- span,
545
- AttributeKeys.GEN_AI_RESPONSE_MODEL,
546
- getattr(response, "model", model_name),
569
+ except Exception as e:
570
+ judgeval_logger.error(
571
+ f"[anthropic wrapped_async] Error adding span metadata: {e}"
547
572
  )
548
- return response
573
+ finally:
574
+ return response
549
575
 
550
576
  return wrapper
551
577
 
@@ -590,16 +616,20 @@ def wrap_anthropic_client(
590
616
  return wrapper
591
617
 
592
618
  span_name = "ANTHROPIC_API_CALL"
593
- if anthropic_Anthropic and isinstance(client, anthropic_Anthropic):
619
+ if anthropic_Anthropic is not None and isinstance(client, anthropic_Anthropic):
594
620
  setattr(client.messages, "create", wrapped(client.messages.create, span_name))
595
621
  setattr(
596
622
  client.messages,
597
623
  "stream",
598
624
  wrapped_sync_context_manager(client.messages.stream, span_name),
599
625
  )
600
- elif anthropic_AsyncAnthropic and isinstance(client, anthropic_AsyncAnthropic):
626
+ elif anthropic_AsyncAnthropic is not None and isinstance(
627
+ client, anthropic_AsyncAnthropic
628
+ ):
601
629
  setattr(
602
- client.messages, "create", wrapped_async(client.messages.create, span_name)
630
+ client.messages,
631
+ "create",
632
+ wrapped_async(client.messages.create, span_name),
603
633
  )
604
634
  setattr(
605
635
  client.messages,