judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (234) hide show
  1. judgeval/__init__.py +173 -10
  2. judgeval/api/__init__.py +523 -0
  3. judgeval/api/api_types.py +413 -0
  4. judgeval/cli.py +112 -0
  5. judgeval/constants.py +7 -30
  6. judgeval/data/__init__.py +1 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +14 -40
  9. judgeval/data/judgment_types.py +396 -146
  10. judgeval/data/result.py +11 -18
  11. judgeval/data/scorer_data.py +3 -26
  12. judgeval/data/scripts/openapi_transform.py +5 -5
  13. judgeval/data/trace.py +115 -194
  14. judgeval/dataset/__init__.py +335 -0
  15. judgeval/env.py +55 -0
  16. judgeval/evaluation/__init__.py +346 -0
  17. judgeval/exceptions.py +28 -0
  18. judgeval/integrations/langgraph/__init__.py +13 -0
  19. judgeval/integrations/openlit/__init__.py +51 -0
  20. judgeval/judges/__init__.py +2 -2
  21. judgeval/judges/litellm_judge.py +77 -16
  22. judgeval/judges/together_judge.py +88 -17
  23. judgeval/judges/utils.py +7 -20
  24. judgeval/judgment_attribute_keys.py +55 -0
  25. judgeval/{common/logger.py → logger.py} +24 -8
  26. judgeval/prompt/__init__.py +330 -0
  27. judgeval/scorers/__init__.py +11 -11
  28. judgeval/scorers/agent_scorer.py +15 -19
  29. judgeval/scorers/api_scorer.py +21 -23
  30. judgeval/scorers/base_scorer.py +54 -36
  31. judgeval/scorers/example_scorer.py +1 -3
  32. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
  33. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
  34. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  35. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
  36. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
  37. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
  38. judgeval/scorers/score.py +64 -47
  39. judgeval/scorers/utils.py +2 -107
  40. judgeval/tracer/__init__.py +1111 -2
  41. judgeval/tracer/constants.py +1 -0
  42. judgeval/tracer/exporters/__init__.py +40 -0
  43. judgeval/tracer/exporters/s3.py +119 -0
  44. judgeval/tracer/exporters/store.py +59 -0
  45. judgeval/tracer/exporters/utils.py +32 -0
  46. judgeval/tracer/keys.py +63 -0
  47. judgeval/tracer/llm/__init__.py +7 -0
  48. judgeval/tracer/llm/config.py +78 -0
  49. judgeval/tracer/llm/constants.py +9 -0
  50. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  51. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  52. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  53. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  54. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  55. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  56. judgeval/tracer/llm/llm_google/config.py +6 -0
  57. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  58. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  59. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  60. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  61. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  62. judgeval/tracer/llm/llm_openai/config.py +6 -0
  63. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  64. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  65. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  66. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  67. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  68. judgeval/tracer/llm/llm_together/config.py +6 -0
  69. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  70. judgeval/tracer/llm/providers.py +19 -0
  71. judgeval/tracer/managers.py +167 -0
  72. judgeval/tracer/processors/__init__.py +220 -0
  73. judgeval/tracer/utils.py +19 -0
  74. judgeval/trainer/__init__.py +14 -0
  75. judgeval/trainer/base_trainer.py +122 -0
  76. judgeval/trainer/config.py +123 -0
  77. judgeval/trainer/console.py +144 -0
  78. judgeval/trainer/fireworks_trainer.py +392 -0
  79. judgeval/trainer/trainable_model.py +252 -0
  80. judgeval/trainer/trainer.py +70 -0
  81. judgeval/utils/async_utils.py +39 -0
  82. judgeval/utils/decorators/__init__.py +0 -0
  83. judgeval/utils/decorators/dont_throw.py +37 -0
  84. judgeval/utils/decorators/use_once.py +13 -0
  85. judgeval/utils/file_utils.py +74 -28
  86. judgeval/utils/guards.py +36 -0
  87. judgeval/utils/meta.py +27 -0
  88. judgeval/utils/project.py +15 -0
  89. judgeval/utils/serialize.py +253 -0
  90. judgeval/utils/testing.py +70 -0
  91. judgeval/utils/url.py +10 -0
  92. judgeval/{version_check.py → utils/version_check.py} +5 -3
  93. judgeval/utils/wrappers/README.md +3 -0
  94. judgeval/utils/wrappers/__init__.py +15 -0
  95. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  96. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  97. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  98. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  99. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  100. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  101. judgeval/utils/wrappers/py.typed +0 -0
  102. judgeval/utils/wrappers/utils.py +35 -0
  103. judgeval/v1/__init__.py +88 -0
  104. judgeval/v1/data/__init__.py +7 -0
  105. judgeval/v1/data/example.py +44 -0
  106. judgeval/v1/data/scorer_data.py +42 -0
  107. judgeval/v1/data/scoring_result.py +44 -0
  108. judgeval/v1/datasets/__init__.py +6 -0
  109. judgeval/v1/datasets/dataset.py +214 -0
  110. judgeval/v1/datasets/dataset_factory.py +94 -0
  111. judgeval/v1/evaluation/__init__.py +6 -0
  112. judgeval/v1/evaluation/evaluation.py +182 -0
  113. judgeval/v1/evaluation/evaluation_factory.py +17 -0
  114. judgeval/v1/instrumentation/__init__.py +6 -0
  115. judgeval/v1/instrumentation/llm/__init__.py +7 -0
  116. judgeval/v1/instrumentation/llm/config.py +78 -0
  117. judgeval/v1/instrumentation/llm/constants.py +11 -0
  118. judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
  119. judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
  120. judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
  121. judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
  122. judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
  123. judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
  124. judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
  125. judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
  126. judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
  127. judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
  128. judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
  129. judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
  130. judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
  131. judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
  132. judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
  133. judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
  134. judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
  135. judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
  136. judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
  137. judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
  138. judgeval/v1/instrumentation/llm/providers.py +19 -0
  139. judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
  140. judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
  141. judgeval/v1/integrations/langgraph/__init__.py +13 -0
  142. judgeval/v1/integrations/openlit/__init__.py +47 -0
  143. judgeval/v1/internal/api/__init__.py +525 -0
  144. judgeval/v1/internal/api/api_types.py +413 -0
  145. judgeval/v1/prompts/__init__.py +6 -0
  146. judgeval/v1/prompts/prompt.py +29 -0
  147. judgeval/v1/prompts/prompt_factory.py +189 -0
  148. judgeval/v1/py.typed +0 -0
  149. judgeval/v1/scorers/__init__.py +6 -0
  150. judgeval/v1/scorers/api_scorer.py +82 -0
  151. judgeval/v1/scorers/base_scorer.py +17 -0
  152. judgeval/v1/scorers/built_in/__init__.py +17 -0
  153. judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
  154. judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
  155. judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
  156. judgeval/v1/scorers/built_in/faithfulness.py +28 -0
  157. judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
  158. judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
  159. judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
  160. judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
  161. judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
  162. judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
  163. judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
  164. judgeval/v1/scorers/scorers_factory.py +49 -0
  165. judgeval/v1/tracer/__init__.py +7 -0
  166. judgeval/v1/tracer/base_tracer.py +520 -0
  167. judgeval/v1/tracer/exporters/__init__.py +14 -0
  168. judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
  169. judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
  170. judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
  171. judgeval/v1/tracer/exporters/span_store.py +50 -0
  172. judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
  173. judgeval/v1/tracer/processors/__init__.py +6 -0
  174. judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
  175. judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
  176. judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
  177. judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
  178. judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
  179. judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
  180. judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
  181. judgeval/v1/tracer/tracer.py +67 -0
  182. judgeval/v1/tracer/tracer_factory.py +38 -0
  183. judgeval/v1/trainers/__init__.py +5 -0
  184. judgeval/v1/trainers/base_trainer.py +62 -0
  185. judgeval/v1/trainers/config.py +123 -0
  186. judgeval/v1/trainers/console.py +144 -0
  187. judgeval/v1/trainers/fireworks_trainer.py +392 -0
  188. judgeval/v1/trainers/trainable_model.py +252 -0
  189. judgeval/v1/trainers/trainers_factory.py +37 -0
  190. judgeval/v1/utils.py +18 -0
  191. judgeval/version.py +5 -0
  192. judgeval/warnings.py +4 -0
  193. judgeval-0.23.0.dist-info/METADATA +266 -0
  194. judgeval-0.23.0.dist-info/RECORD +201 -0
  195. judgeval-0.23.0.dist-info/entry_points.txt +2 -0
  196. judgeval/clients.py +0 -34
  197. judgeval/common/__init__.py +0 -13
  198. judgeval/common/api/__init__.py +0 -3
  199. judgeval/common/api/api.py +0 -352
  200. judgeval/common/api/constants.py +0 -165
  201. judgeval/common/exceptions.py +0 -27
  202. judgeval/common/storage/__init__.py +0 -6
  203. judgeval/common/storage/s3_storage.py +0 -98
  204. judgeval/common/tracer/__init__.py +0 -31
  205. judgeval/common/tracer/constants.py +0 -22
  206. judgeval/common/tracer/core.py +0 -1916
  207. judgeval/common/tracer/otel_exporter.py +0 -108
  208. judgeval/common/tracer/otel_span_processor.py +0 -234
  209. judgeval/common/tracer/span_processor.py +0 -37
  210. judgeval/common/tracer/span_transformer.py +0 -211
  211. judgeval/common/tracer/trace_manager.py +0 -92
  212. judgeval/common/utils.py +0 -940
  213. judgeval/data/datasets/__init__.py +0 -4
  214. judgeval/data/datasets/dataset.py +0 -341
  215. judgeval/data/datasets/eval_dataset_client.py +0 -214
  216. judgeval/data/tool.py +0 -5
  217. judgeval/data/trace_run.py +0 -37
  218. judgeval/evaluation_run.py +0 -75
  219. judgeval/integrations/langgraph.py +0 -843
  220. judgeval/judges/mixture_of_judges.py +0 -286
  221. judgeval/judgment_client.py +0 -369
  222. judgeval/rules.py +0 -521
  223. judgeval/run_evaluation.py +0 -684
  224. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
  225. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
  226. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
  227. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
  228. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
  229. judgeval/utils/alerts.py +0 -93
  230. judgeval/utils/requests.py +0 -50
  231. judgeval-0.1.0.dist-info/METADATA +0 -202
  232. judgeval-0.1.0.dist-info/RECORD +0 -73
  233. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
  234. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -0,0 +1,307 @@
1
+ from __future__ import annotations
2
+ from typing import (
3
+ TYPE_CHECKING,
4
+ Any,
5
+ Dict,
6
+ Generator,
7
+ AsyncGenerator,
8
+ )
9
+
10
+ from opentelemetry.trace import Status, StatusCode
11
+ from judgeval.judgment_attribute_keys import AttributeKeys
12
+ from judgeval.utils.serialize import safe_serialize
13
+ from judgeval.utils.wrappers import (
14
+ mutable_wrap_sync,
15
+ immutable_wrap_sync_iterator,
16
+ immutable_wrap_async_iterator,
17
+ )
18
+ from judgeval.v1.instrumentation.llm.llm_anthropic.messages import (
19
+ _extract_anthropic_tokens,
20
+ )
21
+
22
+ if TYPE_CHECKING:
23
+ from judgeval.v1.tracer import BaseTracer
24
+ from anthropic import Anthropic, AsyncAnthropic
25
+ from anthropic.lib.streaming import (
26
+ MessageStreamManager,
27
+ AsyncMessageStreamManager,
28
+ MessageStream,
29
+ AsyncMessageStream,
30
+ )
31
+
32
+
33
def wrap_messages_stream_sync(tracer: BaseTracer, client: Anthropic) -> None:
    """Replace ``client.messages.stream`` with a traced version, in place.

    Every call starts an ``ANTHROPIC_API_CALL`` span carrying the serialized
    request kwargs and the requested model name. The stream manager returned by
    the original call is wrapped so that:

    * text chunks read through ``stream.text_stream`` are accumulated, and
    * when the manager exits, the accumulated text, token usage and the final
      model name are written to the span, which is then ended.

    The span is also ended when the call itself fails or when entering or
    exiting the underlying manager raises, so it is not left open.

    Args:
        tracer: Tracer whose ``get_tracer()`` is used to start spans.
        client: Synchronous Anthropic client whose ``messages.stream`` is patched.
    """
    original_func = client.messages.stream

    def record_failure(ctx: Dict[str, Any], error: Exception) -> None:
        """Attach ``error`` to the span stored in ``ctx`` and mark it as failed."""
        span = ctx.get("span")
        if span:
            span.record_exception(error)
            span.set_status(Status(StatusCode.ERROR))

    def end_span_once(ctx: Dict[str, Any]) -> None:
        """End the span stored in ``ctx`` unless one of the other paths already did."""
        span = ctx.get("span")
        if span and not ctx.get("span_ended"):
            ctx["span_ended"] = True
            span.end()

    def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
        ctx["span"] = tracer.get_tracer().start_span(
            "ANTHROPIC_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
        )
        ctx["span"].set_attribute(AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs))

        ctx["model_name"] = kwargs.get("model", "")
        ctx["span"].set_attribute(
            AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
        )
        ctx["accumulated_content"] = ""

    def mutate_hook(
        ctx: Dict[str, Any], result: MessageStreamManager
    ) -> MessageStreamManager:
        original_manager = result

        class WrappedMessageStreamManager:
            """Context-manager proxy that adds span bookkeeping around the real manager."""

            def __init__(self, manager: MessageStreamManager):
                self._manager = manager

            def __enter__(self) -> MessageStream:
                try:
                    stream = self._manager.__enter__()
                except Exception as error:
                    # A failed __enter__ means the with-statement never calls
                    # __exit__, so this is the only chance to close the span.
                    record_failure(ctx, error)
                    end_span_once(ctx)
                    raise
                post_hook_enter_impl(stream)
                return stream

            def __exit__(self, exc_type, exc_val, exc_tb):
                try:
                    return self._manager.__exit__(exc_type, exc_val, exc_tb)
                finally:
                    # Finalize even if the underlying __exit__ raised.
                    post_hook_exit_impl()

            def __getattr__(self, name):
                # Anything not overridden here is served by the real manager.
                return getattr(self._manager, name)

        def post_hook_enter_impl(stream: MessageStream) -> None:
            """Swap ``stream.text_stream`` for a wrapper that accumulates chunks."""
            ctx["stream"] = stream
            original_text_stream = stream.text_stream

            def traced_text_stream() -> Generator[str, None, None]:
                for text_chunk in original_text_stream:
                    yield text_chunk

            def yield_hook(inner_ctx: Dict[str, Any], text_chunk: str) -> None:
                span = ctx.get("span")
                if span and text_chunk:
                    ctx["accumulated_content"] = (
                        ctx.get("accumulated_content", "") + text_chunk
                    )

            def post_hook_inner(inner_ctx: Dict[str, Any]) -> None:
                pass

            def error_hook_inner(inner_ctx: Dict[str, Any], error: Exception) -> None:
                record_failure(ctx, error)

            def finally_hook_inner(inner_ctx: Dict[str, Any]) -> None:
                # The span is finalized when the manager exits, not when the
                # text iterator finishes.
                pass

            wrapped_text_stream = immutable_wrap_sync_iterator(
                traced_text_stream,
                yield_hook=yield_hook,
                post_hook=post_hook_inner,
                error_hook=error_hook_inner,
                finally_hook=finally_hook_inner,
            )

            stream.text_stream = wrapped_text_stream()

        def post_hook_exit_impl() -> None:
            """Write completion text and usage to the span, then end it."""
            span = ctx.get("span")
            if not span or ctx.get("span_ended"):
                return

            accumulated = ctx.get("accumulated_content", "")
            span.set_attribute(AttributeKeys.GEN_AI_COMPLETION, accumulated)

            stream: MessageStream | None = ctx.get("stream")
            if stream:
                try:
                    final_message = stream.get_final_message()
                    if final_message.usage:
                        (
                            prompt_tokens,
                            completion_tokens,
                            cache_read,
                            cache_creation,
                        ) = _extract_anthropic_tokens(final_message.usage)
                        span.set_attribute(
                            AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
                            prompt_tokens,
                        )
                        span.set_attribute(
                            AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS,
                            completion_tokens,
                        )
                        span.set_attribute(
                            AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
                            cache_read,
                        )
                        span.set_attribute(
                            AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
                            cache_creation,
                        )
                        span.set_attribute(
                            AttributeKeys.JUDGMENT_USAGE_METADATA,
                            safe_serialize(final_message.usage),
                        )

                    span.set_attribute(
                        AttributeKeys.JUDGMENT_LLM_MODEL_NAME, final_message.model
                    )
                except Exception:
                    # Deliberately best-effort: usage/model enrichment must never
                    # turn a finished user request into a failure.
                    pass

            end_span_once(ctx)

        return WrappedMessageStreamManager(original_manager)  # type: ignore[return-value]

    def error_hook(ctx: Dict[str, Any], error: Exception) -> None:
        record_failure(ctx, error)
        # NOTE(review): presumably mutable_wrap_sync calls this when the wrapped
        # call raises. In that case no stream manager reaches the caller, so no
        # __exit__ will ever end the span -- end it here.
        end_span_once(ctx)

    wrapped = mutable_wrap_sync(
        original_func,
        pre_hook=pre_hook,
        mutate_hook=mutate_hook,
        error_hook=error_hook,
    )

    setattr(client.messages, "stream", wrapped)
169
+
170
+
171
def wrap_messages_stream_async(tracer: BaseTracer, client: AsyncAnthropic) -> None:
    """Replace ``client.messages.stream`` on an async client with a traced version.

    Every call starts an ``ANTHROPIC_API_CALL`` span carrying the serialized
    request kwargs and the requested model name. The async stream manager
    returned by the original call is wrapped so that:

    * text chunks read through ``stream.text_stream`` are accumulated, and
    * when the manager exits, the accumulated text, token usage and the final
      model name are written to the span, which is then ended.

    The span is also ended when the call itself fails or when entering or
    exiting the underlying manager raises, so it is not left open.

    Args:
        tracer: Tracer whose ``get_tracer()`` is used to start spans.
        client: Async Anthropic client whose ``messages.stream`` is patched.
    """
    original_func = client.messages.stream

    def record_failure(ctx: Dict[str, Any], error: Exception) -> None:
        """Attach ``error`` to the span stored in ``ctx`` and mark it as failed."""
        span = ctx.get("span")
        if span:
            span.record_exception(error)
            span.set_status(Status(StatusCode.ERROR))

    def end_span_once(ctx: Dict[str, Any]) -> None:
        """End the span stored in ``ctx`` unless one of the other paths already did."""
        span = ctx.get("span")
        if span and not ctx.get("span_ended"):
            ctx["span_ended"] = True
            span.end()

    def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
        ctx["span"] = tracer.get_tracer().start_span(
            "ANTHROPIC_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
        )

        ctx["span"].set_attribute(AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs))

        ctx["model_name"] = kwargs.get("model", "")
        ctx["span"].set_attribute(
            AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
        )
        ctx["accumulated_content"] = ""

    def mutate_hook(
        ctx: Dict[str, Any], result: AsyncMessageStreamManager
    ) -> AsyncMessageStreamManager:
        original_manager = result

        class WrappedAsyncMessageStreamManager:
            """Async context-manager proxy adding span bookkeeping around the real manager."""

            def __init__(self, manager: AsyncMessageStreamManager):
                self._manager = manager

            async def __aenter__(self) -> AsyncMessageStream:
                try:
                    stream = await self._manager.__aenter__()
                except Exception as error:
                    # A failed __aenter__ means ``async with`` never calls
                    # __aexit__, so this is the only chance to close the span.
                    record_failure(ctx, error)
                    end_span_once(ctx)
                    raise
                post_hook_aenter_impl(stream)
                return stream

            async def __aexit__(self, exc_type, exc_val, exc_tb):
                try:
                    return await self._manager.__aexit__(exc_type, exc_val, exc_tb)
                finally:
                    # Finalize even if the underlying __aexit__ raised.
                    await post_hook_aexit_impl()

            def __getattr__(self, name):
                # Anything not overridden here is served by the real manager.
                return getattr(self._manager, name)

        def post_hook_aenter_impl(stream: AsyncMessageStream) -> None:
            """Swap ``stream.text_stream`` for a wrapper that accumulates chunks."""
            ctx["stream"] = stream
            original_text_stream = stream.text_stream

            async def traced_text_stream() -> AsyncGenerator[str, None]:
                async for text_chunk in original_text_stream:
                    yield text_chunk

            def yield_hook(inner_ctx: Dict[str, Any], text_chunk: str) -> None:
                span = ctx.get("span")
                if span and text_chunk:
                    ctx["accumulated_content"] = (
                        ctx.get("accumulated_content", "") + text_chunk
                    )

            def post_hook_inner(inner_ctx: Dict[str, Any]) -> None:
                pass

            def error_hook_inner(inner_ctx: Dict[str, Any], error: Exception) -> None:
                record_failure(ctx, error)

            def finally_hook_inner_sync(inner_ctx: Dict[str, Any]) -> None:
                # The span is finalized when the manager exits, not when the
                # text iterator finishes.
                pass

            wrapped_text_stream = immutable_wrap_async_iterator(
                traced_text_stream,
                yield_hook=yield_hook,
                post_hook=post_hook_inner,
                error_hook=error_hook_inner,
                finally_hook=finally_hook_inner_sync,
            )

            stream.text_stream = wrapped_text_stream()

        async def post_hook_aexit_impl() -> None:
            """Write completion text and usage to the span, then end it."""
            span = ctx.get("span")
            if not span or ctx.get("span_ended"):
                return

            accumulated = ctx.get("accumulated_content", "")
            span.set_attribute(AttributeKeys.GEN_AI_COMPLETION, accumulated)

            stream: AsyncMessageStream | None = ctx.get("stream")
            if stream:
                try:
                    final_message = await stream.get_final_message()
                    if final_message.usage:
                        (
                            prompt_tokens,
                            completion_tokens,
                            cache_read,
                            cache_creation,
                        ) = _extract_anthropic_tokens(final_message.usage)
                        span.set_attribute(
                            AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
                            prompt_tokens,
                        )
                        span.set_attribute(
                            AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS,
                            completion_tokens,
                        )
                        span.set_attribute(
                            AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
                            cache_read,
                        )
                        span.set_attribute(
                            AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
                            cache_creation,
                        )
                        span.set_attribute(
                            AttributeKeys.JUDGMENT_USAGE_METADATA,
                            safe_serialize(final_message.usage),
                        )

                    span.set_attribute(
                        AttributeKeys.JUDGMENT_LLM_MODEL_NAME, final_message.model
                    )
                except Exception:
                    # Deliberately best-effort: usage/model enrichment must never
                    # turn a finished user request into a failure.
                    pass

            end_span_once(ctx)

        return WrappedAsyncMessageStreamManager(original_manager)  # type: ignore[return-value]

    def error_hook(ctx: Dict[str, Any], error: Exception) -> None:
        record_failure(ctx, error)
        # NOTE(review): presumably mutable_wrap_sync calls this when the wrapped
        # call raises. In that case no stream manager reaches the caller, so no
        # __aexit__ will ever end the span -- end it here.
        end_span_once(ctx)

    # The sync wrapper is used as in the original: the patched ``stream``
    # attribute is called without ``await`` and its result is used as an
    # async context manager.
    wrapped = mutable_wrap_sync(
        original_func,
        pre_hook=pre_hook,
        mutate_hook=mutate_hook,
        error_hook=error_hook,
    )

    setattr(client.messages, "stream", wrapped)
@@ -0,0 +1,61 @@
1
+ from __future__ import annotations
2
+ from typing import TYPE_CHECKING, Union
3
+ import typing
4
+
5
+ from judgeval.v1.instrumentation.llm.llm_anthropic.messages import (
6
+ wrap_messages_create_sync,
7
+ wrap_messages_create_async,
8
+ )
9
+ from judgeval.v1.instrumentation.llm.llm_anthropic.messages_stream import (
10
+ wrap_messages_stream_sync,
11
+ wrap_messages_stream_async,
12
+ )
13
+
14
+ if TYPE_CHECKING:
15
+ from judgeval.v1.tracer import BaseTracer
16
+ from anthropic import Anthropic, AsyncAnthropic
17
+
18
+ TClient = Union[Anthropic, AsyncAnthropic]
19
+
20
+
21
def wrap_anthropic_client_sync(tracer: BaseTracer, client: Anthropic) -> Anthropic:
    """Instrument a synchronous Anthropic client in place and return it.

    Applies the ``messages.create`` wrapper first, then the ``messages.stream``
    wrapper, passing both the same tracer.
    """
    for install_wrapper in (wrap_messages_create_sync, wrap_messages_stream_sync):
        install_wrapper(tracer, client)
    return client
25
+
26
+
27
def wrap_anthropic_client_async(
    tracer: BaseTracer, client: AsyncAnthropic
) -> AsyncAnthropic:
    """Instrument an async Anthropic client in place and return it.

    Applies the ``messages.create`` wrapper first, then the ``messages.stream``
    wrapper, passing both the same tracer.
    """
    for install_wrapper in (wrap_messages_create_async, wrap_messages_stream_async):
        install_wrapper(tracer, client)
    return client
33
+
34
+
35
@typing.overload
def wrap_anthropic_client(tracer: BaseTracer, client: Anthropic) -> Anthropic: ...
@typing.overload
def wrap_anthropic_client(
    tracer: BaseTracer, client: AsyncAnthropic
) -> AsyncAnthropic: ...


def wrap_anthropic_client(tracer: BaseTracer, client: TClient) -> TClient:
    """Instrument an Anthropic client (sync or async) and return it.

    If the ``anthropic`` library is not available, an error is logged and the
    client is returned untouched. Otherwise the client is dispatched to the
    async or sync wrapper according to its type.

    Raises:
        TypeError: If ``client`` is neither ``AsyncAnthropic`` nor ``Anthropic``.
    """
    # Imported lazily, as in the rest of this module, so importing the wrapper
    # does not itself require the optional dependency.
    from judgeval.v1.instrumentation.llm.llm_anthropic.config import HAS_ANTHROPIC
    from judgeval.logger import judgeval_logger

    if not HAS_ANTHROPIC:
        judgeval_logger.error(
            "Cannot wrap Anthropic client: 'anthropic' library not installed. "
            "Install it with: pip install anthropic"
        )
        return client

    from anthropic import Anthropic, AsyncAnthropic

    # Guard-clause dispatch; the async check comes first, as before.
    if isinstance(client, AsyncAnthropic):
        return wrap_anthropic_client_async(tracer, client)
    if isinstance(client, Anthropic):
        return wrap_anthropic_client_sync(tracer, client)
    raise TypeError(f"Invalid client type: {type(client)}")
@@ -0,0 +1,5 @@
1
+ from __future__ import annotations
2
+
3
+ from .wrapper import wrap_google_client
4
+
5
+ __all__ = ["wrap_google_client"]
@@ -0,0 +1,6 @@
1
+ from __future__ import annotations
2
+ import importlib.util
3
+
4
# True when the optional ``google.genai`` package can be located.
# For a dotted name, find_spec imports the parent package first and raises
# ModuleNotFoundError (an ImportError) when no ``google`` package exists at
# all; ValueError is raised for a module whose ``__spec__`` is unset. Both
# cases simply mean "not available" here.
try:
    HAS_GOOGLE_GENAI = importlib.util.find_spec("google.genai") is not None
except (ImportError, ValueError):
    HAS_GOOGLE_GENAI = False
5
+
6
+ __all__ = ["HAS_GOOGLE_GENAI"]
@@ -0,0 +1,121 @@
1
+ from __future__ import annotations
2
+ from typing import (
3
+ TYPE_CHECKING,
4
+ Any,
5
+ Dict,
6
+ Optional,
7
+ Tuple,
8
+ )
9
+
10
+ from opentelemetry.trace import Status, StatusCode
11
+ from judgeval.judgment_attribute_keys import AttributeKeys
12
+ from judgeval.utils.serialize import safe_serialize
13
+ from judgeval.utils.wrappers import immutable_wrap_sync
14
+
15
+ if TYPE_CHECKING:
16
+ from judgeval.v1.tracer import BaseTracer
17
+ from google.genai import Client
18
+ from google.genai.types import (
19
+ GenerateContentResponse,
20
+ GenerateContentResponseUsageMetadata,
21
+ )
22
+
23
+
24
+ def _extract_google_tokens(
25
+ usage: GenerateContentResponseUsageMetadata,
26
+ ) -> Tuple[int, int, int, int]:
27
+ prompt_tokens = (
28
+ usage.prompt_token_count if usage.prompt_token_count is not None else 0
29
+ )
30
+ completion_tokens = (
31
+ usage.candidates_token_count if usage.candidates_token_count is not None else 0
32
+ )
33
+ cache_read_input_tokens = (
34
+ usage.cached_content_token_count
35
+ if usage.cached_content_token_count is not None
36
+ else 0
37
+ )
38
+ cache_creation_input_tokens = 0
39
+ return (
40
+ prompt_tokens,
41
+ completion_tokens,
42
+ cache_read_input_tokens,
43
+ cache_creation_input_tokens,
44
+ )
45
+
46
+
47
+ def _format_google_output(
48
+ response: GenerateContentResponse,
49
+ ) -> Tuple[Optional[str], Optional[GenerateContentResponseUsageMetadata]]:
50
+ return response.text, response.usage_metadata
51
+
52
+
53
def wrap_generate_content_sync(tracer: BaseTracer, client: Client) -> None:
    """Replace ``client.models.generate_content`` with a traced version, in place.

    Each call is recorded as a ``GOOGLE_API_CALL`` span holding the serialized
    request kwargs, the response text (when there is any), token usage and the
    model name. The hooks below are handed to ``immutable_wrap_sync``.

    Args:
        tracer: Tracer whose ``get_tracer()`` is used to start spans.
        client: Google GenAI client whose ``models.generate_content`` is patched.
    """
    original_func = client.models.generate_content

    def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
        ctx["span"] = tracer.get_tracer().start_span(
            "GOOGLE_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
        )
        ctx["span"].set_attribute(AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs))
        ctx["model_name"] = kwargs.get("model", "")
        ctx["span"].set_attribute(
            AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
        )

    def post_hook(ctx: Dict[str, Any], result: GenerateContentResponse) -> None:
        span = ctx.get("span")
        if not span:
            return

        output, usage_data = _format_google_output(result)
        # The text is Optional, and None is not a valid OpenTelemetry
        # attribute value, so only record a completion that exists.
        if output is not None:
            span.set_attribute(AttributeKeys.GEN_AI_COMPLETION, output)

        if usage_data:
            prompt_tokens, completion_tokens, cache_read, cache_creation = (
                _extract_google_tokens(usage_data)
            )
            span.set_attribute(
                AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
                prompt_tokens,
            )
            span.set_attribute(
                AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
            )
            span.set_attribute(
                AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
            )
            span.set_attribute(
                AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
                cache_creation,
            )
            span.set_attribute(
                AttributeKeys.JUDGMENT_USAGE_METADATA,
                safe_serialize(usage_data),
            )

        # Prefer the model version reported by the response; fall back to the
        # model name captured from the request.
        span.set_attribute(
            AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
            result.model_version if result.model_version else ctx["model_name"],
        )

    def error_hook(ctx: Dict[str, Any], error: Exception) -> None:
        span = ctx.get("span")
        if span:
            span.record_exception(error)
            span.set_status(Status(StatusCode.ERROR))

    def finally_hook(ctx: Dict[str, Any]) -> None:
        span = ctx.get("span")
        if span:
            span.end()

    wrapped = immutable_wrap_sync(
        original_func,
        pre_hook=pre_hook,
        post_hook=post_hook,
        error_hook=error_hook,
        finally_hook=finally_hook,
    )

    setattr(client.models, "generate_content", wrapped)
@@ -0,0 +1,30 @@
1
+ from __future__ import annotations
2
+ from typing import TYPE_CHECKING
3
+
4
+ from judgeval.v1.instrumentation.llm.llm_google.generate_content import (
5
+ wrap_generate_content_sync,
6
+ )
7
+
8
+ if TYPE_CHECKING:
9
+ from judgeval.v1.tracer import BaseTracer
10
+ from google.genai import Client
11
+
12
+
13
def wrap_google_client(tracer: BaseTracer, client: Client) -> Client:
    """Instrument a Google GenAI client and return it.

    If the ``google-genai`` library is not available, an error is logged and
    the client is returned untouched. Otherwise ``generate_content`` is wrapped
    for tracing.

    Raises:
        TypeError: If ``client`` is not a ``google.genai.Client``.
    """
    # Imported lazily so importing this module does not itself require the
    # optional dependency.
    from judgeval.v1.instrumentation.llm.llm_google.config import HAS_GOOGLE_GENAI
    from judgeval.logger import judgeval_logger

    if not HAS_GOOGLE_GENAI:
        judgeval_logger.error(
            "Cannot wrap Google GenAI client: 'google-genai' library not installed. "
            "Install it with: pip install google-genai"
        )
        return client

    from google.genai import Client

    # Reject unsupported clients up front, then instrument.
    if not isinstance(client, Client):
        raise TypeError(f"Invalid client type: {type(client)}")

    wrap_generate_content_sync(tracer, client)
    return client
@@ -0,0 +1,5 @@
1
+ from __future__ import annotations
2
+
3
+ from .wrapper import wrap_openai_client
4
+
5
+ __all__ = ["wrap_openai_client"]