judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (234)
  1. judgeval/__init__.py +173 -10
  2. judgeval/api/__init__.py +523 -0
  3. judgeval/api/api_types.py +413 -0
  4. judgeval/cli.py +112 -0
  5. judgeval/constants.py +7 -30
  6. judgeval/data/__init__.py +1 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +14 -40
  9. judgeval/data/judgment_types.py +396 -146
  10. judgeval/data/result.py +11 -18
  11. judgeval/data/scorer_data.py +3 -26
  12. judgeval/data/scripts/openapi_transform.py +5 -5
  13. judgeval/data/trace.py +115 -194
  14. judgeval/dataset/__init__.py +335 -0
  15. judgeval/env.py +55 -0
  16. judgeval/evaluation/__init__.py +346 -0
  17. judgeval/exceptions.py +28 -0
  18. judgeval/integrations/langgraph/__init__.py +13 -0
  19. judgeval/integrations/openlit/__init__.py +51 -0
  20. judgeval/judges/__init__.py +2 -2
  21. judgeval/judges/litellm_judge.py +77 -16
  22. judgeval/judges/together_judge.py +88 -17
  23. judgeval/judges/utils.py +7 -20
  24. judgeval/judgment_attribute_keys.py +55 -0
  25. judgeval/{common/logger.py → logger.py} +24 -8
  26. judgeval/prompt/__init__.py +330 -0
  27. judgeval/scorers/__init__.py +11 -11
  28. judgeval/scorers/agent_scorer.py +15 -19
  29. judgeval/scorers/api_scorer.py +21 -23
  30. judgeval/scorers/base_scorer.py +54 -36
  31. judgeval/scorers/example_scorer.py +1 -3
  32. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
  33. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
  34. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  35. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
  36. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
  37. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
  38. judgeval/scorers/score.py +64 -47
  39. judgeval/scorers/utils.py +2 -107
  40. judgeval/tracer/__init__.py +1111 -2
  41. judgeval/tracer/constants.py +1 -0
  42. judgeval/tracer/exporters/__init__.py +40 -0
  43. judgeval/tracer/exporters/s3.py +119 -0
  44. judgeval/tracer/exporters/store.py +59 -0
  45. judgeval/tracer/exporters/utils.py +32 -0
  46. judgeval/tracer/keys.py +63 -0
  47. judgeval/tracer/llm/__init__.py +7 -0
  48. judgeval/tracer/llm/config.py +78 -0
  49. judgeval/tracer/llm/constants.py +9 -0
  50. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  51. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  52. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  53. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  54. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  55. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  56. judgeval/tracer/llm/llm_google/config.py +6 -0
  57. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  58. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  59. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  60. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  61. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  62. judgeval/tracer/llm/llm_openai/config.py +6 -0
  63. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  64. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  65. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  66. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  67. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  68. judgeval/tracer/llm/llm_together/config.py +6 -0
  69. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  70. judgeval/tracer/llm/providers.py +19 -0
  71. judgeval/tracer/managers.py +167 -0
  72. judgeval/tracer/processors/__init__.py +220 -0
  73. judgeval/tracer/utils.py +19 -0
  74. judgeval/trainer/__init__.py +14 -0
  75. judgeval/trainer/base_trainer.py +122 -0
  76. judgeval/trainer/config.py +123 -0
  77. judgeval/trainer/console.py +144 -0
  78. judgeval/trainer/fireworks_trainer.py +392 -0
  79. judgeval/trainer/trainable_model.py +252 -0
  80. judgeval/trainer/trainer.py +70 -0
  81. judgeval/utils/async_utils.py +39 -0
  82. judgeval/utils/decorators/__init__.py +0 -0
  83. judgeval/utils/decorators/dont_throw.py +37 -0
  84. judgeval/utils/decorators/use_once.py +13 -0
  85. judgeval/utils/file_utils.py +74 -28
  86. judgeval/utils/guards.py +36 -0
  87. judgeval/utils/meta.py +27 -0
  88. judgeval/utils/project.py +15 -0
  89. judgeval/utils/serialize.py +253 -0
  90. judgeval/utils/testing.py +70 -0
  91. judgeval/utils/url.py +10 -0
  92. judgeval/{version_check.py → utils/version_check.py} +5 -3
  93. judgeval/utils/wrappers/README.md +3 -0
  94. judgeval/utils/wrappers/__init__.py +15 -0
  95. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  96. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  97. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  98. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  99. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  100. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  101. judgeval/utils/wrappers/py.typed +0 -0
  102. judgeval/utils/wrappers/utils.py +35 -0
  103. judgeval/v1/__init__.py +88 -0
  104. judgeval/v1/data/__init__.py +7 -0
  105. judgeval/v1/data/example.py +44 -0
  106. judgeval/v1/data/scorer_data.py +42 -0
  107. judgeval/v1/data/scoring_result.py +44 -0
  108. judgeval/v1/datasets/__init__.py +6 -0
  109. judgeval/v1/datasets/dataset.py +214 -0
  110. judgeval/v1/datasets/dataset_factory.py +94 -0
  111. judgeval/v1/evaluation/__init__.py +6 -0
  112. judgeval/v1/evaluation/evaluation.py +182 -0
  113. judgeval/v1/evaluation/evaluation_factory.py +17 -0
  114. judgeval/v1/instrumentation/__init__.py +6 -0
  115. judgeval/v1/instrumentation/llm/__init__.py +7 -0
  116. judgeval/v1/instrumentation/llm/config.py +78 -0
  117. judgeval/v1/instrumentation/llm/constants.py +11 -0
  118. judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
  119. judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
  120. judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
  121. judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
  122. judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
  123. judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
  124. judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
  125. judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
  126. judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
  127. judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
  128. judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
  129. judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
  130. judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
  131. judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
  132. judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
  133. judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
  134. judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
  135. judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
  136. judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
  137. judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
  138. judgeval/v1/instrumentation/llm/providers.py +19 -0
  139. judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
  140. judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
  141. judgeval/v1/integrations/langgraph/__init__.py +13 -0
  142. judgeval/v1/integrations/openlit/__init__.py +47 -0
  143. judgeval/v1/internal/api/__init__.py +525 -0
  144. judgeval/v1/internal/api/api_types.py +413 -0
  145. judgeval/v1/prompts/__init__.py +6 -0
  146. judgeval/v1/prompts/prompt.py +29 -0
  147. judgeval/v1/prompts/prompt_factory.py +189 -0
  148. judgeval/v1/py.typed +0 -0
  149. judgeval/v1/scorers/__init__.py +6 -0
  150. judgeval/v1/scorers/api_scorer.py +82 -0
  151. judgeval/v1/scorers/base_scorer.py +17 -0
  152. judgeval/v1/scorers/built_in/__init__.py +17 -0
  153. judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
  154. judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
  155. judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
  156. judgeval/v1/scorers/built_in/faithfulness.py +28 -0
  157. judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
  158. judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
  159. judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
  160. judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
  161. judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
  162. judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
  163. judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
  164. judgeval/v1/scorers/scorers_factory.py +49 -0
  165. judgeval/v1/tracer/__init__.py +7 -0
  166. judgeval/v1/tracer/base_tracer.py +520 -0
  167. judgeval/v1/tracer/exporters/__init__.py +14 -0
  168. judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
  169. judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
  170. judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
  171. judgeval/v1/tracer/exporters/span_store.py +50 -0
  172. judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
  173. judgeval/v1/tracer/processors/__init__.py +6 -0
  174. judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
  175. judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
  176. judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
  177. judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
  178. judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
  179. judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
  180. judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
  181. judgeval/v1/tracer/tracer.py +67 -0
  182. judgeval/v1/tracer/tracer_factory.py +38 -0
  183. judgeval/v1/trainers/__init__.py +5 -0
  184. judgeval/v1/trainers/base_trainer.py +62 -0
  185. judgeval/v1/trainers/config.py +123 -0
  186. judgeval/v1/trainers/console.py +144 -0
  187. judgeval/v1/trainers/fireworks_trainer.py +392 -0
  188. judgeval/v1/trainers/trainable_model.py +252 -0
  189. judgeval/v1/trainers/trainers_factory.py +37 -0
  190. judgeval/v1/utils.py +18 -0
  191. judgeval/version.py +5 -0
  192. judgeval/warnings.py +4 -0
  193. judgeval-0.23.0.dist-info/METADATA +266 -0
  194. judgeval-0.23.0.dist-info/RECORD +201 -0
  195. judgeval-0.23.0.dist-info/entry_points.txt +2 -0
  196. judgeval/clients.py +0 -34
  197. judgeval/common/__init__.py +0 -13
  198. judgeval/common/api/__init__.py +0 -3
  199. judgeval/common/api/api.py +0 -352
  200. judgeval/common/api/constants.py +0 -165
  201. judgeval/common/exceptions.py +0 -27
  202. judgeval/common/storage/__init__.py +0 -6
  203. judgeval/common/storage/s3_storage.py +0 -98
  204. judgeval/common/tracer/__init__.py +0 -31
  205. judgeval/common/tracer/constants.py +0 -22
  206. judgeval/common/tracer/core.py +0 -1916
  207. judgeval/common/tracer/otel_exporter.py +0 -108
  208. judgeval/common/tracer/otel_span_processor.py +0 -234
  209. judgeval/common/tracer/span_processor.py +0 -37
  210. judgeval/common/tracer/span_transformer.py +0 -211
  211. judgeval/common/tracer/trace_manager.py +0 -92
  212. judgeval/common/utils.py +0 -940
  213. judgeval/data/datasets/__init__.py +0 -4
  214. judgeval/data/datasets/dataset.py +0 -341
  215. judgeval/data/datasets/eval_dataset_client.py +0 -214
  216. judgeval/data/tool.py +0 -5
  217. judgeval/data/trace_run.py +0 -37
  218. judgeval/evaluation_run.py +0 -75
  219. judgeval/integrations/langgraph.py +0 -843
  220. judgeval/judges/mixture_of_judges.py +0 -286
  221. judgeval/judgment_client.py +0 -369
  222. judgeval/rules.py +0 -521
  223. judgeval/run_evaluation.py +0 -684
  224. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
  225. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
  226. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
  227. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
  228. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
  229. judgeval/utils/alerts.py +0 -93
  230. judgeval/utils/requests.py +0 -50
  231. judgeval-0.1.0.dist-info/METADATA +0 -202
  232. judgeval-0.1.0.dist-info/RECORD +0 -73
  233. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
  234. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/tracer/llm/llm_openai/responses.py
@@ -0,0 +1,506 @@
+ from __future__ import annotations
+ from typing import (
+     TYPE_CHECKING,
+     Any,
+     Awaitable,
+     Callable,
+     Dict,
+     Iterator,
+     AsyncIterator,
+     Generator,
+     AsyncGenerator,
+     ParamSpec,
+     TypeVar,
+ )
+
+ from judgeval.tracer.keys import AttributeKeys
+ from judgeval.tracer.utils import set_span_attribute
+ from judgeval.utils.serialize import safe_serialize
+ from judgeval.utils.wrappers import (
+     immutable_wrap_sync,
+     immutable_wrap_async,
+     mutable_wrap_sync,
+     mutable_wrap_async,
+     immutable_wrap_sync_iterator,
+     immutable_wrap_async_iterator,
+ )
+ from judgeval.tracer.llm.llm_openai.utils import (
+     openai_tokens_converter,
+     set_cost_attribute,
+ )
+
+ if TYPE_CHECKING:
+     from judgeval.tracer import Tracer
+     from openai import OpenAI, AsyncOpenAI
+     from openai.types.responses import Response
+
+ P = ParamSpec("P")
+ T = TypeVar("T")
+
+
+ def wrap_responses_create_sync(tracer: Tracer, client: OpenAI) -> None:
+     original_func = client.responses.create
+
+     def dispatcher(*args: Any, **kwargs: Any) -> Any:
+         if kwargs.get("stream", False):
+             return _wrap_responses_streaming_sync(tracer, original_func)(
+                 *args, **kwargs
+             )
+         return _wrap_responses_non_streaming_sync(tracer, original_func)(
+             *args, **kwargs
+         )
+
+     setattr(client.responses, "create", dispatcher)
+
+
+ def _wrap_responses_non_streaming_sync(
+     tracer: Tracer, original_func: Callable[..., Response]
+ ) -> Callable[..., Response]:
+     def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
+         ctx["span"] = tracer.get_tracer().start_span(
+             "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
+         )
+         tracer._inject_judgment_context(ctx["span"])
+         set_span_attribute(
+             ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
+         )
+         ctx["model_name"] = kwargs.get("model", "")
+         set_span_attribute(
+             ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
+         )
+
+     def post_hook(ctx: Dict[str, Any], result: Response) -> None:
+         span = ctx.get("span")
+         if not span:
+             return
+
+         set_span_attribute(
+             span, AttributeKeys.GEN_AI_COMPLETION, safe_serialize(result)
+         )
+
+         usage_data = result.usage if hasattr(result, "usage") else None
+         if usage_data:
+             prompt_tokens = usage_data.input_tokens or 0
+             completion_tokens = usage_data.output_tokens or 0
+             cache_read = usage_data.input_tokens_details.cached_tokens or 0
+
+             set_cost_attribute(span, usage_data)
+             prompt_tokens, completion_tokens, cache_read, cache_creation = (
+                 openai_tokens_converter(
+                     prompt_tokens,
+                     completion_tokens,
+                     cache_read,
+                     0,
+                     usage_data.total_tokens,
+                 )
+             )
+
+             set_span_attribute(
+                 span,
+                 AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+                 prompt_tokens,
+             )
+             set_span_attribute(
+                 span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
+             )
+             set_span_attribute(
+                 span, AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
+             )
+             set_span_attribute(
+                 span, AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
+             )
+             set_span_attribute(
+                 span,
+                 AttributeKeys.JUDGMENT_USAGE_METADATA,
+                 safe_serialize(usage_data),
+             )
+
+         if hasattr(result, "model"):
+             set_span_attribute(
+                 span,
+                 AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
+                 result.model or ctx["model_name"],
+             )
+
+     def error_hook(ctx: Dict[str, Any], error: Exception) -> None:
+         span = ctx.get("span")
+         if span:
+             span.record_exception(error)
+
+     def finally_hook(ctx: Dict[str, Any]) -> None:
+         span = ctx.get("span")
+         if span:
+             span.end()
+
+     return immutable_wrap_sync(
+         original_func,
+         pre_hook=pre_hook,
+         post_hook=post_hook,
+         error_hook=error_hook,
+         finally_hook=finally_hook,
+     )
+
+
+ def _wrap_responses_streaming_sync(
+     tracer: Tracer, original_func: Callable[..., Iterator[Any]]
+ ) -> Callable[..., Iterator[Any]]:
+     def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
+         ctx["span"] = tracer.get_tracer().start_span(
+             "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
+         )
+         tracer._inject_judgment_context(ctx["span"])
+         set_span_attribute(
+             ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
+         )
+         ctx["model_name"] = kwargs.get("model", "")
+         set_span_attribute(
+             ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
+         )
+         ctx["accumulated_content"] = ""
+
+     def mutate_hook(ctx: Dict[str, Any], result: Iterator[Any]) -> Iterator[Any]:
+         def traced_generator() -> Generator[Any, None, None]:
+             for chunk in result:
+                 yield chunk
+
+         def yield_hook(inner_ctx: Dict[str, Any], chunk: Any) -> None:
+             span = ctx.get("span")
+             if not span:
+                 return
+
+             if hasattr(chunk, "type") and chunk.type == "response.output_text.delta":
+                 delta = getattr(chunk, "delta", None)
+                 if delta:
+                     ctx["accumulated_content"] = (
+                         ctx.get("accumulated_content", "") + delta
+                     )
+
+             if hasattr(chunk, "type") and chunk.type == "response.completed":
+                 if (
+                     hasattr(chunk, "response")
+                     and chunk.response
+                     and hasattr(chunk.response, "usage")
+                     and chunk.response.usage
+                 ):
+                     prompt_tokens = chunk.response.usage.input_tokens or 0
+                     completion_tokens = chunk.response.usage.output_tokens or 0
+                     total_tokens = chunk.response.usage.total_tokens or 0
+                     # Safely access nested cached_tokens
+                     input_tokens_details = getattr(
+                         chunk.response.usage, "input_tokens_details", None
+                     )
+                     cache_read = (
+                         getattr(input_tokens_details, "cached_tokens", 0)
+                         if input_tokens_details
+                         else 0
+                     )
+
+                     set_cost_attribute(span, chunk.response.usage)
+                     prompt_tokens, completion_tokens, cache_read, cache_creation = (
+                         openai_tokens_converter(
+                             prompt_tokens,
+                             completion_tokens,
+                             cache_read,
+                             0,
+                             total_tokens,
+                         )
+                     )
+
+                     set_span_attribute(
+                         span,
+                         AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+                         prompt_tokens,
+                     )
+                     set_span_attribute(
+                         span,
+                         AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS,
+                         completion_tokens,
+                     )
+                     set_span_attribute(
+                         span,
+                         AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
+                         cache_read,
+                     )
+                     set_span_attribute(
+                         span,
+                         AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
+                         0,
+                     )
+                     set_span_attribute(
+                         span,
+                         AttributeKeys.JUDGMENT_USAGE_METADATA,
+                         safe_serialize(chunk.response.usage),
+                     )
+
+         def post_hook_inner(inner_ctx: Dict[str, Any]) -> None:
+             span = ctx.get("span")
+             if span:
+                 accumulated = ctx.get("accumulated_content", "")
+                 set_span_attribute(span, AttributeKeys.GEN_AI_COMPLETION, accumulated)
+
+         def error_hook_inner(inner_ctx: Dict[str, Any], error: Exception) -> None:
+             span = ctx.get("span")
+             if span:
+                 span.record_exception(error)
+
+         def finally_hook_inner(inner_ctx: Dict[str, Any]) -> None:
+             span = ctx.get("span")
+             if span:
+                 span.end()
+
+         wrapped_generator = immutable_wrap_sync_iterator(
+             traced_generator,
+             yield_hook=yield_hook,
+             post_hook=post_hook_inner,
+             error_hook=error_hook_inner,
+             finally_hook=finally_hook_inner,
+         )
+
+         return wrapped_generator()
+
+     def error_hook(ctx: Dict[str, Any], error: Exception) -> None:
+         span = ctx.get("span")
+         if span:
+             span.record_exception(error)
+
+     return mutable_wrap_sync(
+         original_func,
+         pre_hook=pre_hook,
+         mutate_hook=mutate_hook,
+         error_hook=error_hook,
+     )
+
+
+ def wrap_responses_create_async(tracer: Tracer, client: AsyncOpenAI) -> None:
+     original_func = client.responses.create
+
+     async def dispatcher(*args: Any, **kwargs: Any) -> Any:
+         if kwargs.get("stream", False):
+             return await _wrap_responses_streaming_async(tracer, original_func)(
+                 *args, **kwargs
+             )
+         return await _wrap_responses_non_streaming_async(tracer, original_func)(
+             *args, **kwargs
+         )
+
+     setattr(client.responses, "create", dispatcher)
+
+
+ def _wrap_responses_non_streaming_async(
+     tracer: Tracer, original_func: Callable[..., Awaitable[Response]]
+ ) -> Callable[..., Awaitable[Response]]:
+     def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
+         ctx["span"] = tracer.get_tracer().start_span(
+             "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
+         )
+         tracer._inject_judgment_context(ctx["span"])
+         set_span_attribute(
+             ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
+         )
+         ctx["model_name"] = kwargs.get("model", "")
+         set_span_attribute(
+             ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
+         )
+
+     def post_hook(ctx: Dict[str, Any], result: Response) -> None:
+         span = ctx.get("span")
+         if not span:
+             return
+
+         set_span_attribute(
+             span, AttributeKeys.GEN_AI_COMPLETION, safe_serialize(result)
+         )
+
+         usage_data = result.usage if hasattr(result, "usage") else None
+         if usage_data:
+             prompt_tokens = usage_data.input_tokens or 0
+             completion_tokens = usage_data.output_tokens or 0
+             cache_read = usage_data.input_tokens_details.cached_tokens or 0
+
+             set_cost_attribute(span, usage_data)
+             prompt_tokens, completion_tokens, cache_read, cache_creation = (
+                 openai_tokens_converter(
+                     prompt_tokens,
+                     completion_tokens,
+                     cache_read,
+                     0,
+                     usage_data.total_tokens,
+                 )
+             )
+
+             set_span_attribute(
+                 span,
+                 AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+                 prompt_tokens,
+             )
+             set_span_attribute(
+                 span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
+             )
+             set_span_attribute(
+                 span, AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
+             )
+             set_span_attribute(
+                 span, AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
+             )
+             set_span_attribute(
+                 span,
+                 AttributeKeys.JUDGMENT_USAGE_METADATA,
+                 safe_serialize(usage_data),
+             )
+
+         if hasattr(result, "model"):
+             set_span_attribute(
+                 span,
+                 AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
+                 result.model or ctx["model_name"],
+             )
+
+     def error_hook(ctx: Dict[str, Any], error: Exception) -> None:
+         span = ctx.get("span")
+         if span:
+             span.record_exception(error)
+
+     def finally_hook(ctx: Dict[str, Any]) -> None:
+         span = ctx.get("span")
+         if span:
+             span.end()
+
+     return immutable_wrap_async(
+         original_func,
+         pre_hook=pre_hook,
+         post_hook=post_hook,
+         error_hook=error_hook,
+         finally_hook=finally_hook,
+     )
+
+
+ def _wrap_responses_streaming_async(
+     tracer: Tracer, original_func: Callable[..., Awaitable[AsyncIterator[Any]]]
+ ) -> Callable[..., Awaitable[AsyncIterator[Any]]]:
+     def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
+         ctx["span"] = tracer.get_tracer().start_span(
+             "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
+         )
+         tracer._inject_judgment_context(ctx["span"])
+         set_span_attribute(
+             ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
+         )
+         ctx["model_name"] = kwargs.get("model", "")
+         set_span_attribute(
+             ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
+         )
+         ctx["accumulated_content"] = ""
+
+     def mutate_hook(
+         ctx: Dict[str, Any], result: AsyncIterator[Any]
+     ) -> AsyncIterator[Any]:
+         async def traced_generator() -> AsyncGenerator[Any, None]:
+             async for chunk in result:
+                 yield chunk
+
+         def yield_hook(inner_ctx: Dict[str, Any], chunk: Any) -> None:
+             span = ctx.get("span")
+             if not span:
+                 return
+
+             if hasattr(chunk, "type") and chunk.type == "response.output_text.delta":
+                 delta = getattr(chunk, "delta", None)
+                 if delta:
+                     ctx["accumulated_content"] = (
+                         ctx.get("accumulated_content", "") + delta
+                     )
+
+             if hasattr(chunk, "type") and chunk.type == "response.completed":
+                 if (
+                     hasattr(chunk, "response")
+                     and chunk.response
+                     and hasattr(chunk.response, "usage")
+                     and chunk.response.usage
+                 ):
+                     prompt_tokens = chunk.response.usage.input_tokens or 0
+                     completion_tokens = chunk.response.usage.output_tokens or 0
+                     total_tokens = chunk.response.usage.total_tokens or 0
+                     # Safely access nested cached_tokens
+                     input_tokens_details = getattr(
+                         chunk.response.usage, "input_tokens_details", None
+                     )
+                     cache_read = (
+                         getattr(input_tokens_details, "cached_tokens", 0)
+                         if input_tokens_details
+                         else 0
+                     )
+
+                     set_cost_attribute(span, chunk.response.usage)
+                     prompt_tokens, completion_tokens, cache_read, cache_creation = (
+                         openai_tokens_converter(
+                             prompt_tokens,
+                             completion_tokens,
+                             cache_read,
+                             0,
+                             total_tokens,
+                         )
+                     )
+
+                     set_span_attribute(
+                         span,
+                         AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+                         prompt_tokens,
+                     )
+                     set_span_attribute(
+                         span,
+                         AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS,
+                         completion_tokens,
+                     )
+                     set_span_attribute(
+                         span,
+                         AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
+                         cache_read,
+                     )
+                     set_span_attribute(
+                         span,
+                         AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
+                         0,
+                     )
+                     set_span_attribute(
+                         span,
+                         AttributeKeys.JUDGMENT_USAGE_METADATA,
+                         safe_serialize(chunk.response.usage),
+                     )
+
+         def post_hook_inner(inner_ctx: Dict[str, Any]) -> None:
+             span = ctx.get("span")
+             if span:
+                 accumulated = ctx.get("accumulated_content", "")
+                 set_span_attribute(span, AttributeKeys.GEN_AI_COMPLETION, accumulated)
+
+         def error_hook_inner(inner_ctx: Dict[str, Any], error: Exception) -> None:
+             span = ctx.get("span")
+             if span:
+                 span.record_exception(error)
+
+         def finally_hook_inner(inner_ctx: Dict[str, Any]) -> None:
+             span = ctx.get("span")
+             if span:
+                 span.end()
+
+         wrapped_generator = immutable_wrap_async_iterator(
+             traced_generator,
+             yield_hook=yield_hook,
+             post_hook=post_hook_inner,
+             error_hook=error_hook_inner,
+             finally_hook=finally_hook_inner,
+         )
+
+         return wrapped_generator()
+
+     def error_hook(ctx: Dict[str, Any], error: Exception) -> None:
+         span = ctx.get("span")
+         if span:
+             span.record_exception(error)
+
+     return mutable_wrap_async(
+         original_func,
+         pre_hook=pre_hook,
+         mutate_hook=mutate_hook,
+         error_hook=error_hook,
+     )
judgeval/tracer/llm/llm_openai/utils.py
@@ -0,0 +1,42 @@
+ from typing import Any
+ from opentelemetry.trace import Span
+ from judgeval.tracer.keys import AttributeKeys
+ from judgeval.tracer.utils import set_span_attribute
+ from judgeval.utils.serialize import safe_serialize
+
+
+ def openai_tokens_converter(
+     prompt_tokens: int,
+     completion_tokens: int,
+     cache_read: int,
+     cache_creation: int,
+     total_tokens: int,
+ ) -> tuple[int, int, int, int]:
+     """
+     Returns:
+         tuple[int, int, int, int]:
+             - judgment.usage.non_cached_input
+             - judgment.usage.output_tokens
+             - judgment.usage.cached_input_tokens
+             - judgment.usage.cache_creation_tokens
+     """
+     manual_tokens = prompt_tokens + completion_tokens + cache_read + cache_creation
+
+     if manual_tokens > total_tokens:
+         # OpenAI reports input tokens inclusive of cached tokens, so subtract the cached tokens from the input tokens
+         return prompt_tokens - cache_read, completion_tokens, cache_read, cache_creation
+     else:
+         return prompt_tokens, completion_tokens, cache_read, cache_creation
+
+
+ def set_cost_attribute(span: Span, usage_data: Any) -> None:
+     """
+     Handles the OpenRouter case, where cost is included in the usage data when the caller specifies:
+         extra_body={"usage": {"include": True}}
+     """
+     if hasattr(usage_data, "cost") and usage_data.cost:
+         set_span_attribute(
+             span,
+             AttributeKeys.JUDGMENT_USAGE_TOTAL_COST_USD,
+             safe_serialize(usage_data.cost),
+         )
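
Editorial illustration (not part of the diff) of the arithmetic in openai_tokens_converter above: when the raw counters sum to more than total_tokens, the input count already includes the cached tokens, so the cached portion is subtracted from the input side.

from judgeval.tracer.llm.llm_openai.utils import openai_tokens_converter

# 100 input tokens (40 of them served from cache), 20 output tokens, total_tokens = 120.
# 100 + 20 + 40 + 0 = 160 > 120, so the 40 cached tokens are subtracted from the input side.
non_cached, output, cached, cache_creation = openai_tokens_converter(100, 20, 40, 0, 120)
assert (non_cached, output, cached, cache_creation) == (60, 20, 40, 0)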
judgeval/tracer/llm/llm_openai/wrapper.py
@@ -0,0 +1,63 @@
+ from __future__ import annotations
+ from typing import TYPE_CHECKING, Union
+ import typing
+
+ from judgeval.tracer.llm.llm_openai.chat_completions import (
+     wrap_chat_completions_create_sync,
+     wrap_chat_completions_create_async,
+ )
+ from judgeval.tracer.llm.llm_openai.responses import (
+     wrap_responses_create_sync,
+     wrap_responses_create_async,
+ )
+ from judgeval.tracer.llm.llm_openai.beta_chat_completions import (
+     wrap_beta_chat_completions_parse_sync,
+     wrap_beta_chat_completions_parse_async,
+ )
+
+ if TYPE_CHECKING:
+     from judgeval.tracer import Tracer
+     from openai import OpenAI, AsyncOpenAI
+
+     TClient = Union[OpenAI, AsyncOpenAI]
+
+
+ def wrap_openai_client_sync(tracer: Tracer, client: OpenAI) -> OpenAI:
+     wrap_chat_completions_create_sync(tracer, client)
+     wrap_responses_create_sync(tracer, client)
+     wrap_beta_chat_completions_parse_sync(tracer, client)
+     return client
+
+
+ def wrap_openai_client_async(tracer: Tracer, client: AsyncOpenAI) -> AsyncOpenAI:
+     wrap_chat_completions_create_async(tracer, client)
+     wrap_responses_create_async(tracer, client)
+     wrap_beta_chat_completions_parse_async(tracer, client)
+     return client
+
+
+ @typing.overload
+ def wrap_openai_client(tracer: Tracer, client: OpenAI) -> OpenAI: ...
+ @typing.overload
+ def wrap_openai_client(tracer: Tracer, client: AsyncOpenAI) -> AsyncOpenAI: ...
+
+
+ def wrap_openai_client(tracer: Tracer, client: TClient) -> TClient:
+     from judgeval.tracer.llm.llm_openai.config import HAS_OPENAI
+     from judgeval.logger import judgeval_logger
+
+     if not HAS_OPENAI:
+         judgeval_logger.error(
+             "Cannot wrap OpenAI client: 'openai' library not installed. "
+             "Install it with: pip install openai"
+         )
+         return client
+
+     from openai import OpenAI, AsyncOpenAI
+
+     if isinstance(client, AsyncOpenAI):
+         return wrap_openai_client_async(tracer, client)
+     elif isinstance(client, OpenAI):
+         return wrap_openai_client_sync(tracer, client)
+     else:
+         raise TypeError(f"Invalid client type: {type(client)}")
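
Hedged usage sketch (editorial, not part of the diff) showing how the wrap_openai_client entry point above is expected to be called. Tracer construction is not shown in this hunk, so the tracer is taken as an argument; make_traced_clients is a hypothetical helper used only for illustration.

from __future__ import annotations
from typing import TYPE_CHECKING

from openai import OpenAI, AsyncOpenAI
from judgeval.tracer.llm.llm_openai.wrapper import wrap_openai_client

if TYPE_CHECKING:
    from judgeval.tracer import Tracer


def make_traced_clients(tracer: "Tracer") -> tuple[OpenAI, AsyncOpenAI]:
    # The overloads dispatch on client type: a sync OpenAI client gets the sync
    # chat/responses/beta wrappers, an AsyncOpenAI client gets the async ones,
    # and any other type raises TypeError.
    sync_client = wrap_openai_client(tracer, OpenAI())
    async_client = wrap_openai_client(tracer, AsyncOpenAI())
    return sync_client, async_client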
judgeval/tracer/llm/llm_together/__init__.py
@@ -0,0 +1,3 @@
+ from .wrapper import wrap_together_client
+
+ __all__ = ["wrap_together_client"]