judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (234) hide show
  1. judgeval/__init__.py +173 -10
  2. judgeval/api/__init__.py +523 -0
  3. judgeval/api/api_types.py +413 -0
  4. judgeval/cli.py +112 -0
  5. judgeval/constants.py +7 -30
  6. judgeval/data/__init__.py +1 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +14 -40
  9. judgeval/data/judgment_types.py +396 -146
  10. judgeval/data/result.py +11 -18
  11. judgeval/data/scorer_data.py +3 -26
  12. judgeval/data/scripts/openapi_transform.py +5 -5
  13. judgeval/data/trace.py +115 -194
  14. judgeval/dataset/__init__.py +335 -0
  15. judgeval/env.py +55 -0
  16. judgeval/evaluation/__init__.py +346 -0
  17. judgeval/exceptions.py +28 -0
  18. judgeval/integrations/langgraph/__init__.py +13 -0
  19. judgeval/integrations/openlit/__init__.py +51 -0
  20. judgeval/judges/__init__.py +2 -2
  21. judgeval/judges/litellm_judge.py +77 -16
  22. judgeval/judges/together_judge.py +88 -17
  23. judgeval/judges/utils.py +7 -20
  24. judgeval/judgment_attribute_keys.py +55 -0
  25. judgeval/{common/logger.py → logger.py} +24 -8
  26. judgeval/prompt/__init__.py +330 -0
  27. judgeval/scorers/__init__.py +11 -11
  28. judgeval/scorers/agent_scorer.py +15 -19
  29. judgeval/scorers/api_scorer.py +21 -23
  30. judgeval/scorers/base_scorer.py +54 -36
  31. judgeval/scorers/example_scorer.py +1 -3
  32. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
  33. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
  34. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  35. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
  36. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
  37. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
  38. judgeval/scorers/score.py +64 -47
  39. judgeval/scorers/utils.py +2 -107
  40. judgeval/tracer/__init__.py +1111 -2
  41. judgeval/tracer/constants.py +1 -0
  42. judgeval/tracer/exporters/__init__.py +40 -0
  43. judgeval/tracer/exporters/s3.py +119 -0
  44. judgeval/tracer/exporters/store.py +59 -0
  45. judgeval/tracer/exporters/utils.py +32 -0
  46. judgeval/tracer/keys.py +63 -0
  47. judgeval/tracer/llm/__init__.py +7 -0
  48. judgeval/tracer/llm/config.py +78 -0
  49. judgeval/tracer/llm/constants.py +9 -0
  50. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  51. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  52. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  53. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  54. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  55. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  56. judgeval/tracer/llm/llm_google/config.py +6 -0
  57. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  58. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  59. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  60. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  61. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  62. judgeval/tracer/llm/llm_openai/config.py +6 -0
  63. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  64. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  65. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  66. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  67. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  68. judgeval/tracer/llm/llm_together/config.py +6 -0
  69. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  70. judgeval/tracer/llm/providers.py +19 -0
  71. judgeval/tracer/managers.py +167 -0
  72. judgeval/tracer/processors/__init__.py +220 -0
  73. judgeval/tracer/utils.py +19 -0
  74. judgeval/trainer/__init__.py +14 -0
  75. judgeval/trainer/base_trainer.py +122 -0
  76. judgeval/trainer/config.py +123 -0
  77. judgeval/trainer/console.py +144 -0
  78. judgeval/trainer/fireworks_trainer.py +392 -0
  79. judgeval/trainer/trainable_model.py +252 -0
  80. judgeval/trainer/trainer.py +70 -0
  81. judgeval/utils/async_utils.py +39 -0
  82. judgeval/utils/decorators/__init__.py +0 -0
  83. judgeval/utils/decorators/dont_throw.py +37 -0
  84. judgeval/utils/decorators/use_once.py +13 -0
  85. judgeval/utils/file_utils.py +74 -28
  86. judgeval/utils/guards.py +36 -0
  87. judgeval/utils/meta.py +27 -0
  88. judgeval/utils/project.py +15 -0
  89. judgeval/utils/serialize.py +253 -0
  90. judgeval/utils/testing.py +70 -0
  91. judgeval/utils/url.py +10 -0
  92. judgeval/{version_check.py → utils/version_check.py} +5 -3
  93. judgeval/utils/wrappers/README.md +3 -0
  94. judgeval/utils/wrappers/__init__.py +15 -0
  95. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  96. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  97. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  98. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  99. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  100. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  101. judgeval/utils/wrappers/py.typed +0 -0
  102. judgeval/utils/wrappers/utils.py +35 -0
  103. judgeval/v1/__init__.py +88 -0
  104. judgeval/v1/data/__init__.py +7 -0
  105. judgeval/v1/data/example.py +44 -0
  106. judgeval/v1/data/scorer_data.py +42 -0
  107. judgeval/v1/data/scoring_result.py +44 -0
  108. judgeval/v1/datasets/__init__.py +6 -0
  109. judgeval/v1/datasets/dataset.py +214 -0
  110. judgeval/v1/datasets/dataset_factory.py +94 -0
  111. judgeval/v1/evaluation/__init__.py +6 -0
  112. judgeval/v1/evaluation/evaluation.py +182 -0
  113. judgeval/v1/evaluation/evaluation_factory.py +17 -0
  114. judgeval/v1/instrumentation/__init__.py +6 -0
  115. judgeval/v1/instrumentation/llm/__init__.py +7 -0
  116. judgeval/v1/instrumentation/llm/config.py +78 -0
  117. judgeval/v1/instrumentation/llm/constants.py +11 -0
  118. judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
  119. judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
  120. judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
  121. judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
  122. judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
  123. judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
  124. judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
  125. judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
  126. judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
  127. judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
  128. judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
  129. judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
  130. judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
  131. judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
  132. judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
  133. judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
  134. judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
  135. judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
  136. judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
  137. judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
  138. judgeval/v1/instrumentation/llm/providers.py +19 -0
  139. judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
  140. judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
  141. judgeval/v1/integrations/langgraph/__init__.py +13 -0
  142. judgeval/v1/integrations/openlit/__init__.py +47 -0
  143. judgeval/v1/internal/api/__init__.py +525 -0
  144. judgeval/v1/internal/api/api_types.py +413 -0
  145. judgeval/v1/prompts/__init__.py +6 -0
  146. judgeval/v1/prompts/prompt.py +29 -0
  147. judgeval/v1/prompts/prompt_factory.py +189 -0
  148. judgeval/v1/py.typed +0 -0
  149. judgeval/v1/scorers/__init__.py +6 -0
  150. judgeval/v1/scorers/api_scorer.py +82 -0
  151. judgeval/v1/scorers/base_scorer.py +17 -0
  152. judgeval/v1/scorers/built_in/__init__.py +17 -0
  153. judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
  154. judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
  155. judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
  156. judgeval/v1/scorers/built_in/faithfulness.py +28 -0
  157. judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
  158. judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
  159. judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
  160. judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
  161. judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
  162. judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
  163. judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
  164. judgeval/v1/scorers/scorers_factory.py +49 -0
  165. judgeval/v1/tracer/__init__.py +7 -0
  166. judgeval/v1/tracer/base_tracer.py +520 -0
  167. judgeval/v1/tracer/exporters/__init__.py +14 -0
  168. judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
  169. judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
  170. judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
  171. judgeval/v1/tracer/exporters/span_store.py +50 -0
  172. judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
  173. judgeval/v1/tracer/processors/__init__.py +6 -0
  174. judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
  175. judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
  176. judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
  177. judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
  178. judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
  179. judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
  180. judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
  181. judgeval/v1/tracer/tracer.py +67 -0
  182. judgeval/v1/tracer/tracer_factory.py +38 -0
  183. judgeval/v1/trainers/__init__.py +5 -0
  184. judgeval/v1/trainers/base_trainer.py +62 -0
  185. judgeval/v1/trainers/config.py +123 -0
  186. judgeval/v1/trainers/console.py +144 -0
  187. judgeval/v1/trainers/fireworks_trainer.py +392 -0
  188. judgeval/v1/trainers/trainable_model.py +252 -0
  189. judgeval/v1/trainers/trainers_factory.py +37 -0
  190. judgeval/v1/utils.py +18 -0
  191. judgeval/version.py +5 -0
  192. judgeval/warnings.py +4 -0
  193. judgeval-0.23.0.dist-info/METADATA +266 -0
  194. judgeval-0.23.0.dist-info/RECORD +201 -0
  195. judgeval-0.23.0.dist-info/entry_points.txt +2 -0
  196. judgeval/clients.py +0 -34
  197. judgeval/common/__init__.py +0 -13
  198. judgeval/common/api/__init__.py +0 -3
  199. judgeval/common/api/api.py +0 -352
  200. judgeval/common/api/constants.py +0 -165
  201. judgeval/common/exceptions.py +0 -27
  202. judgeval/common/storage/__init__.py +0 -6
  203. judgeval/common/storage/s3_storage.py +0 -98
  204. judgeval/common/tracer/__init__.py +0 -31
  205. judgeval/common/tracer/constants.py +0 -22
  206. judgeval/common/tracer/core.py +0 -1916
  207. judgeval/common/tracer/otel_exporter.py +0 -108
  208. judgeval/common/tracer/otel_span_processor.py +0 -234
  209. judgeval/common/tracer/span_processor.py +0 -37
  210. judgeval/common/tracer/span_transformer.py +0 -211
  211. judgeval/common/tracer/trace_manager.py +0 -92
  212. judgeval/common/utils.py +0 -940
  213. judgeval/data/datasets/__init__.py +0 -4
  214. judgeval/data/datasets/dataset.py +0 -341
  215. judgeval/data/datasets/eval_dataset_client.py +0 -214
  216. judgeval/data/tool.py +0 -5
  217. judgeval/data/trace_run.py +0 -37
  218. judgeval/evaluation_run.py +0 -75
  219. judgeval/integrations/langgraph.py +0 -843
  220. judgeval/judges/mixture_of_judges.py +0 -286
  221. judgeval/judgment_client.py +0 -369
  222. judgeval/rules.py +0 -521
  223. judgeval/run_evaluation.py +0 -684
  224. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
  225. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
  226. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
  227. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
  228. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
  229. judgeval/utils/alerts.py +0 -93
  230. judgeval/utils/requests.py +0 -50
  231. judgeval-0.1.0.dist-info/METADATA +0 -202
  232. judgeval-0.1.0.dist-info/RECORD +0 -73
  233. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
  234. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -0,0 +1,212 @@
1
+ from __future__ import annotations
2
+ from typing import (
3
+ TYPE_CHECKING,
4
+ Any,
5
+ Awaitable,
6
+ Callable,
7
+ Dict,
8
+ ParamSpec,
9
+ TypeVar,
10
+ )
11
+
12
+ from opentelemetry.trace import Status, StatusCode
13
+ from judgeval.judgment_attribute_keys import AttributeKeys
14
+ from judgeval.utils.serialize import safe_serialize
15
+ from judgeval.utils.wrappers import (
16
+ immutable_wrap_sync,
17
+ immutable_wrap_async,
18
+ )
19
+ from judgeval.v1.instrumentation.llm.llm_openai.utils import (
20
+ openai_tokens_converter,
21
+ set_cost_attribute,
22
+ )
23
+
24
+ if TYPE_CHECKING:
25
+ from judgeval.v1.tracer import BaseTracer
26
+ from openai import OpenAI, AsyncOpenAI
27
+ from openai.types.chat.parsed_chat_completion import ParsedChatCompletion
28
+
29
+ P = ParamSpec("P")
30
+ T = TypeVar("T")
31
+
32
+
33
def wrap_beta_chat_completions_parse_sync(tracer: BaseTracer, client: OpenAI) -> None:
    """Monkey-patch ``client.beta.chat.completions.parse`` with a traced wrapper."""
    completions = client.beta.chat.completions
    traced = _wrap_beta_non_streaming_sync(tracer, completions.parse)
    setattr(completions, "parse", traced)
39
def _wrap_beta_non_streaming_sync(
    tracer: BaseTracer, original_func: Callable[P, ParsedChatCompletion[T]]
) -> Callable[P, ParsedChatCompletion[T]]:
    """Return ``original_func`` wrapped so each call is recorded as an LLM span.

    A span named ``OPENAI_API_CALL`` is opened before the call, annotated with
    the serialized request, response, model name, and token usage, and is
    always closed when the call finishes — successfully or not.
    """

    def _on_start(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
        # Stash the span in ctx first so the error/finally hooks can reach it
        # even if a later attribute call raises.
        ctx["span"] = tracer.get_tracer().start_span(
            "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
        )
        span = ctx["span"]
        span.set_attribute(AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs))
        ctx["model_name"] = kwargs.get("model", "")
        span.set_attribute(AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"])

    def _on_result(ctx: Dict[str, Any], result: ParsedChatCompletion[T]) -> None:
        span = ctx.get("span")
        if not span:
            return

        span.set_attribute(AttributeKeys.GEN_AI_COMPLETION, safe_serialize(result))

        usage = result.usage
        if usage:
            input_tokens = usage.prompt_tokens or 0
            output_tokens = usage.completion_tokens or 0
            details = usage.prompt_tokens_details
            cached_input = (details.cached_tokens or 0) if details else 0

            set_cost_attribute(span, usage)

            # Normalize the raw OpenAI counts into Judgment's token model.
            # NOTE(review): the converter's cache-creation result is discarded
            # and a literal 0 is recorded below, mirroring prior behavior.
            input_tokens, output_tokens, cached_input, _cache_creation = (
                openai_tokens_converter(
                    input_tokens, output_tokens, cached_input, 0, usage.total_tokens
                )
            )

            span.set_attribute(
                AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS, input_tokens
            )
            span.set_attribute(
                AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, output_tokens
            )
            span.set_attribute(
                AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cached_input
            )
            span.set_attribute(
                AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
            )
            span.set_attribute(
                AttributeKeys.JUDGMENT_USAGE_METADATA, safe_serialize(usage)
            )

        # Prefer the model name reported by the API over the requested one.
        span.set_attribute(
            AttributeKeys.JUDGMENT_LLM_MODEL_NAME, result.model or ctx["model_name"]
        )

    def _on_error(ctx: Dict[str, Any], error: Exception) -> None:
        span = ctx.get("span")
        if span:
            span.record_exception(error)
            span.set_status(Status(StatusCode.ERROR))

    def _on_close(ctx: Dict[str, Any]) -> None:
        # Always end the span, on success and on failure alike.
        span = ctx.get("span")
        if span:
            span.end()

    return immutable_wrap_sync(
        original_func,
        pre_hook=_on_start,
        post_hook=_on_result,
        error_hook=_on_error,
        finally_hook=_on_close,
    )
123
def wrap_beta_chat_completions_parse_async(
    tracer: BaseTracer, client: AsyncOpenAI
) -> None:
    """Monkey-patch the async ``beta.chat.completions.parse`` with a traced wrapper."""
    completions = client.beta.chat.completions
    traced = _wrap_beta_non_streaming_async(tracer, completions.parse)
    setattr(completions, "parse", traced)
131
def _wrap_beta_non_streaming_async(
    tracer: BaseTracer, original_func: Callable[P, Awaitable[ParsedChatCompletion[T]]]
) -> Callable[P, Awaitable[ParsedChatCompletion[T]]]:
    """Return the async ``original_func`` wrapped so each await is traced.

    Mirrors :func:`_wrap_beta_non_streaming_sync`: opens an ``OPENAI_API_CALL``
    span before awaiting, annotates request/response/usage, and always closes
    the span when the awaited call settles.
    """

    def _on_start(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
        # Stash the span in ctx first so the error/finally hooks can reach it
        # even if a later attribute call raises.
        ctx["span"] = tracer.get_tracer().start_span(
            "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
        )
        span = ctx["span"]
        span.set_attribute(AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs))
        ctx["model_name"] = kwargs.get("model", "")
        span.set_attribute(AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"])

    def _on_result(ctx: Dict[str, Any], result: ParsedChatCompletion[T]) -> None:
        span = ctx.get("span")
        if not span:
            return

        span.set_attribute(AttributeKeys.GEN_AI_COMPLETION, safe_serialize(result))

        usage = result.usage
        if usage:
            input_tokens = usage.prompt_tokens or 0
            output_tokens = usage.completion_tokens or 0
            details = usage.prompt_tokens_details
            cached_input = (details.cached_tokens or 0) if details else 0

            set_cost_attribute(span, usage)

            # Normalize the raw OpenAI counts into Judgment's token model.
            # NOTE(review): the converter's cache-creation result is discarded
            # and a literal 0 is recorded below, mirroring prior behavior.
            input_tokens, output_tokens, cached_input, _cache_creation = (
                openai_tokens_converter(
                    input_tokens, output_tokens, cached_input, 0, usage.total_tokens
                )
            )

            span.set_attribute(
                AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS, input_tokens
            )
            span.set_attribute(
                AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, output_tokens
            )
            span.set_attribute(
                AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cached_input
            )
            span.set_attribute(
                AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
            )
            span.set_attribute(
                AttributeKeys.JUDGMENT_USAGE_METADATA, safe_serialize(usage)
            )

        # Prefer the model name reported by the API over the requested one.
        span.set_attribute(
            AttributeKeys.JUDGMENT_LLM_MODEL_NAME, result.model or ctx["model_name"]
        )

    def _on_error(ctx: Dict[str, Any], error: Exception) -> None:
        span = ctx.get("span")
        if span:
            span.record_exception(error)
            span.set_status(Status(StatusCode.ERROR))

    def _on_close(ctx: Dict[str, Any]) -> None:
        # Always end the span, on success and on failure alike.
        span = ctx.get("span")
        if span:
            span.end()

    return immutable_wrap_async(
        original_func,
        pre_hook=_on_start,
        post_hook=_on_result,
        error_hook=_on_error,
        finally_hook=_on_close,
    )
@@ -0,0 +1,477 @@
1
+ from __future__ import annotations
2
+ from typing import (
3
+ TYPE_CHECKING,
4
+ Any,
5
+ Awaitable,
6
+ Callable,
7
+ Dict,
8
+ Iterator,
9
+ AsyncIterator,
10
+ Generator,
11
+ AsyncGenerator,
12
+ ParamSpec,
13
+ TypeVar,
14
+ )
15
+ from packaging import version
16
+
17
+ from opentelemetry.trace import Status, StatusCode
18
+ from judgeval.judgment_attribute_keys import AttributeKeys
19
+ from judgeval.utils.serialize import safe_serialize
20
+ from judgeval.utils.wrappers import (
21
+ immutable_wrap_async,
22
+ immutable_wrap_sync,
23
+ mutable_wrap_sync,
24
+ mutable_wrap_async,
25
+ immutable_wrap_sync_iterator,
26
+ immutable_wrap_async_iterator,
27
+ )
28
+ from judgeval.v1.instrumentation.llm.llm_openai.utils import (
29
+ openai_tokens_converter,
30
+ set_cost_attribute,
31
+ )
32
+
33
+ if TYPE_CHECKING:
34
+ from judgeval.v1.tracer import BaseTracer
35
+ from openai import OpenAI, AsyncOpenAI
36
+ from openai.types.chat import ChatCompletion, ChatCompletionChunk
37
+
38
+ P = ParamSpec("P")
39
+ T = TypeVar("T")
40
+
41
+
42
+ def _supports_stream_options() -> bool:
43
+ try:
44
+ import openai
45
+
46
+ return version.parse(openai.__version__) >= version.parse("1.26.0")
47
+ except Exception:
48
+ return False
49
+
50
+
51
def wrap_chat_completions_create_sync(tracer: BaseTracer, client: OpenAI) -> None:
    """Monkey-patch ``client.chat.completions.create`` so every call is traced.

    The original dispatcher rebuilt the full wrapper/closure chain on every
    API call (``_wrap_streaming_sync(...)(...)``); here both wrappers are
    constructed exactly once and the per-call dispatcher only selects between
    them, which is behavior-identical but avoids the per-call allocation.
    """
    original_func = client.chat.completions.create
    streaming_func = _wrap_streaming_sync(tracer, original_func)
    non_streaming_func = _wrap_non_streaming_sync(tracer, original_func)

    def dispatcher(*args: Any, **kwargs: Any) -> Any:
        # Streaming requests need chunk-level tracing; everything else gets
        # the single-response wrapper.
        if kwargs.get("stream", False):
            return streaming_func(*args, **kwargs)
        return non_streaming_func(*args, **kwargs)

    setattr(client.chat.completions, "create", dispatcher)
62
def _wrap_non_streaming_sync(
    tracer: BaseTracer, original_func: Callable[..., ChatCompletion]
) -> Callable[..., ChatCompletion]:
    """Return a non-streaming ``create`` wrapped so each call is traced.

    Opens an ``OPENAI_API_CALL`` span before the request, annotates it with
    the serialized request, response, model name, and token usage, and always
    closes it when the call finishes — successfully or not.
    """

    def _on_start(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
        # Stash the span in ctx first so the error/finally hooks can reach it
        # even if a later attribute call raises.
        ctx["span"] = tracer.get_tracer().start_span(
            "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
        )
        span = ctx["span"]
        span.set_attribute(AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs))
        ctx["model_name"] = kwargs.get("model", "")
        span.set_attribute(AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"])

    def _on_result(ctx: Dict[str, Any], result: ChatCompletion) -> None:
        span = ctx.get("span")
        if not span:
            return

        span.set_attribute(AttributeKeys.GEN_AI_COMPLETION, safe_serialize(result))

        usage = result.usage
        if usage:
            input_tokens = usage.prompt_tokens or 0
            output_tokens = usage.completion_tokens or 0
            details = usage.prompt_tokens_details
            cached_input = (details.cached_tokens or 0) if details else 0

            set_cost_attribute(span, usage)

            # Normalize the raw OpenAI counts into Judgment's token model.
            # NOTE(review): the converter's cache-creation result is discarded
            # and a literal 0 is recorded below, mirroring prior behavior.
            input_tokens, output_tokens, cached_input, _cache_creation = (
                openai_tokens_converter(
                    input_tokens, output_tokens, cached_input, 0, usage.total_tokens
                )
            )

            span.set_attribute(
                AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS, input_tokens
            )
            span.set_attribute(
                AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, output_tokens
            )
            span.set_attribute(
                AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cached_input
            )
            span.set_attribute(
                AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
            )
            span.set_attribute(
                AttributeKeys.JUDGMENT_USAGE_METADATA, safe_serialize(usage)
            )

        # Prefer the model name reported by the API over the requested one.
        span.set_attribute(
            AttributeKeys.JUDGMENT_LLM_MODEL_NAME, result.model or ctx["model_name"]
        )

    def _on_error(ctx: Dict[str, Any], error: Exception) -> None:
        span = ctx.get("span")
        if span:
            span.record_exception(error)
            span.set_status(Status(StatusCode.ERROR))

    def _on_close(ctx: Dict[str, Any]) -> None:
        # Always end the span, on success and on failure alike.
        span = ctx.get("span")
        if span:
            span.end()

    return immutable_wrap_sync(
        original_func,
        pre_hook=_on_start,
        post_hook=_on_result,
        error_hook=_on_error,
        finally_hook=_on_close,
    )
146
def _wrap_streaming_sync(
    tracer: BaseTracer, original_func: Callable[..., Iterator[ChatCompletionChunk]]
) -> Callable[..., Iterator[ChatCompletionChunk]]:
    """Wrap a synchronous streaming ``chat.completions.create`` call.

    Opens an ``OPENAI_API_CALL`` span before the request, then replaces the
    returned chunk iterator with a traced one that accumulates message
    content chunk by chunk and records token usage when a chunk carries it.
    The span is closed only when the iterator is exhausted or fails — not
    when ``create`` returns — because the response is still in flight until
    the consumer drains the stream.
    """

    def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
        # Open the span and record the request payload and requested model.
        ctx["span"] = tracer.get_tracer().start_span(
            "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
        )
        ctx["span"].set_attribute(AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs))
        ctx["model_name"] = kwargs.get("model", "")
        ctx["span"].set_attribute(
            AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
        )
        # Running concatenation of delta content across all chunks.
        ctx["accumulated_content"] = ""

    def mutate_kwargs_hook(ctx: Dict[str, Any], kwargs: Any) -> Any:
        # Ask the API to append a usage chunk to the stream, but only when the
        # installed SDK supports the parameter and the caller did not already
        # choose their own stream_options.  The caller's kwargs dict is never
        # mutated in place — a copy is returned instead.
        if "stream_options" not in kwargs and _supports_stream_options():
            modified_kwargs = dict(kwargs)
            modified_kwargs["stream_options"] = {"include_usage": True}
            return modified_kwargs
        return kwargs

    def mutate_hook(
        ctx: Dict[str, Any], result: Iterator[ChatCompletionChunk]
    ) -> Iterator[ChatCompletionChunk]:
        # Re-yield the provider's iterator so the wrapper below can observe
        # every chunk without changing what the consumer sees.
        def traced_generator() -> Generator[ChatCompletionChunk, None, None]:
            for chunk in result:
                yield chunk

        # NOTE: the inner hooks deliberately close over the OUTER ctx (which
        # holds the span), not the iterator wrapper's own inner_ctx.
        def yield_hook(inner_ctx: Dict[str, Any], chunk: ChatCompletionChunk) -> None:
            span = ctx.get("span")
            if not span:
                return

            # Accumulate assistant text from the first choice's delta.
            if chunk.choices and len(chunk.choices) > 0:
                delta = chunk.choices[0].delta
                if delta and delta.content:
                    ctx["accumulated_content"] = (
                        ctx.get("accumulated_content", "") + delta.content
                    )

            # Usage arrives on (typically the final) chunk when include_usage
            # was honored; record it as soon as it is seen.
            if hasattr(chunk, "usage") and chunk.usage:
                prompt_tokens = chunk.usage.prompt_tokens or 0
                completion_tokens = chunk.usage.completion_tokens or 0
                cache_read = 0
                if chunk.usage.prompt_tokens_details:
                    cache_read = chunk.usage.prompt_tokens_details.cached_tokens or 0

                set_cost_attribute(span, chunk.usage)

                # Normalize raw OpenAI counts into Judgment's token model.
                # NOTE(review): the returned cache_creation is unused and a
                # literal 0 is recorded below — confirm this is intended.
                prompt_tokens, completion_tokens, cache_read, cache_creation = (
                    openai_tokens_converter(
                        prompt_tokens,
                        completion_tokens,
                        cache_read,
                        0,
                        chunk.usage.total_tokens,
                    )
                )

                span.set_attribute(
                    AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
                    prompt_tokens,
                )
                span.set_attribute(
                    AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
                )
                span.set_attribute(
                    AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
                )
                span.set_attribute(
                    AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
                )
                span.set_attribute(
                    AttributeKeys.JUDGMENT_USAGE_METADATA,
                    safe_serialize(chunk.usage),
                )

        def post_hook_inner(inner_ctx: Dict[str, Any]) -> None:
            # Stream finished cleanly: record the full accumulated completion.
            span = ctx.get("span")
            if span:
                accumulated = ctx.get("accumulated_content", "")
                span.set_attribute(AttributeKeys.GEN_AI_COMPLETION, accumulated)

        def error_hook_inner(inner_ctx: Dict[str, Any], error: Exception) -> None:
            span = ctx.get("span")
            if span:
                span.record_exception(error)
                span.set_status(Status(StatusCode.ERROR))

        def finally_hook_inner(inner_ctx: Dict[str, Any]) -> None:
            # The span lives until the stream is done (or abandoned mid-error).
            span = ctx.get("span")
            if span:
                span.end()

        wrapped_generator = immutable_wrap_sync_iterator(
            traced_generator,
            yield_hook=yield_hook,
            post_hook=post_hook_inner,
            error_hook=error_hook_inner,
            finally_hook=finally_hook_inner,
        )

        return wrapped_generator()

    def error_hook(ctx: Dict[str, Any], error: Exception) -> None:
        # The create() call itself failed before a stream existed.
        span = ctx.get("span")
        if span:
            span.record_exception(error)
            span.set_status(Status(StatusCode.ERROR))

    # No finally_hook here on purpose: span closure is owned by the iterator
    # wrapper above once the stream has been handed to the consumer.
    return mutable_wrap_sync(
        original_func,
        pre_hook=pre_hook,
        mutate_kwargs_hook=mutate_kwargs_hook,
        mutate_hook=mutate_hook,
        error_hook=error_hook,
    )
265
def wrap_chat_completions_create_async(tracer: BaseTracer, client: AsyncOpenAI) -> None:
    """Monkey-patch the async ``chat.completions.create`` so every call is traced.

    The original dispatcher rebuilt the full wrapper/closure chain on every
    API call (``_wrap_streaming_async(...)(...)``); here both wrappers are
    constructed exactly once and the per-call dispatcher only selects between
    them, which is behavior-identical but avoids the per-call allocation.
    """
    original_func = client.chat.completions.create
    streaming_func = _wrap_streaming_async(tracer, original_func)
    non_streaming_func = _wrap_non_streaming_async(tracer, original_func)

    async def dispatcher(*args: Any, **kwargs: Any) -> Any:
        # Streaming requests need chunk-level tracing; everything else gets
        # the single-response wrapper.
        if kwargs.get("stream", False):
            return await streaming_func(*args, **kwargs)
        return await non_streaming_func(*args, **kwargs)

    setattr(client.chat.completions, "create", dispatcher)
276
def _wrap_non_streaming_async(
    tracer: BaseTracer, original_func: Callable[..., Awaitable[ChatCompletion]]
) -> Callable[..., Awaitable[ChatCompletion]]:
    """Return an async non-streaming ``create`` wrapped so each await is traced.

    Mirrors :func:`_wrap_non_streaming_sync`: opens an ``OPENAI_API_CALL``
    span before awaiting, annotates request/response/usage, and always closes
    the span when the awaited call settles.
    """

    def _on_start(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
        # Stash the span in ctx first so the error/finally hooks can reach it
        # even if a later attribute call raises.
        ctx["span"] = tracer.get_tracer().start_span(
            "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
        )
        span = ctx["span"]
        span.set_attribute(AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs))
        ctx["model_name"] = kwargs.get("model", "")
        span.set_attribute(AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"])

    def _on_result(ctx: Dict[str, Any], result: ChatCompletion) -> None:
        span = ctx.get("span")
        if not span:
            return

        span.set_attribute(AttributeKeys.GEN_AI_COMPLETION, safe_serialize(result))

        usage = result.usage
        if usage:
            input_tokens = usage.prompt_tokens or 0
            output_tokens = usage.completion_tokens or 0
            details = usage.prompt_tokens_details
            cached_input = (details.cached_tokens or 0) if details else 0

            set_cost_attribute(span, usage)

            # Normalize the raw OpenAI counts into Judgment's token model.
            # NOTE(review): the converter's cache-creation result is discarded
            # and a literal 0 is recorded below, mirroring prior behavior.
            input_tokens, output_tokens, cached_input, _cache_creation = (
                openai_tokens_converter(
                    input_tokens, output_tokens, cached_input, 0, usage.total_tokens
                )
            )

            span.set_attribute(
                AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS, input_tokens
            )
            span.set_attribute(
                AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, output_tokens
            )
            span.set_attribute(
                AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cached_input
            )
            span.set_attribute(
                AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
            )
            span.set_attribute(
                AttributeKeys.JUDGMENT_USAGE_METADATA, safe_serialize(usage)
            )

        # Prefer the model name reported by the API over the requested one.
        span.set_attribute(
            AttributeKeys.JUDGMENT_LLM_MODEL_NAME, result.model or ctx["model_name"]
        )

    def _on_error(ctx: Dict[str, Any], error: Exception) -> None:
        span = ctx.get("span")
        if span:
            span.record_exception(error)
            span.set_status(Status(StatusCode.ERROR))

    def _on_close(ctx: Dict[str, Any]) -> None:
        # Always end the span, on success and on failure alike.
        span = ctx.get("span")
        if span:
            span.end()

    return immutable_wrap_async(
        original_func,
        pre_hook=_on_start,
        post_hook=_on_result,
        error_hook=_on_error,
        finally_hook=_on_close,
    )
360
def _wrap_streaming_async(
    tracer: BaseTracer,
    original_func: Callable[..., Awaitable[AsyncIterator[ChatCompletionChunk]]],
) -> Callable[..., Awaitable[AsyncIterator[ChatCompletionChunk]]]:
    """Wrap an asynchronous streaming ``chat.completions.create`` call.

    Async twin of :func:`_wrap_streaming_sync`: opens an ``OPENAI_API_CALL``
    span before the request, then replaces the returned async chunk iterator
    with a traced one that accumulates message content and records token
    usage when a chunk carries it.  The span is closed only when the async
    iterator is exhausted or fails, not when ``create`` resolves.
    """

    def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
        # Open the span and record the request payload and requested model.
        ctx["span"] = tracer.get_tracer().start_span(
            "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
        )
        ctx["span"].set_attribute(AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs))
        ctx["model_name"] = kwargs.get("model", "")
        ctx["span"].set_attribute(
            AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
        )
        # Running concatenation of delta content across all chunks.
        ctx["accumulated_content"] = ""

    def mutate_kwargs_hook(ctx: Dict[str, Any], kwargs: Any) -> Any:
        # Ask the API to append a usage chunk to the stream, but only when the
        # installed SDK supports the parameter and the caller did not already
        # choose their own stream_options.  The caller's kwargs dict is never
        # mutated in place — a copy is returned instead.
        if "stream_options" not in kwargs and _supports_stream_options():
            modified_kwargs = dict(kwargs)
            modified_kwargs["stream_options"] = {"include_usage": True}
            return modified_kwargs
        return kwargs

    def mutate_hook(
        ctx: Dict[str, Any], result: AsyncIterator[ChatCompletionChunk]
    ) -> AsyncIterator[ChatCompletionChunk]:
        # Re-yield the provider's async iterator so the wrapper below can
        # observe every chunk without changing what the consumer sees.
        async def traced_generator() -> AsyncGenerator[ChatCompletionChunk, None]:
            async for chunk in result:
                yield chunk

        # NOTE: the inner hooks deliberately close over the OUTER ctx (which
        # holds the span), not the iterator wrapper's own inner_ctx.
        def yield_hook(inner_ctx: Dict[str, Any], chunk: ChatCompletionChunk) -> None:
            span = ctx.get("span")
            if not span:
                return

            # Accumulate assistant text from the first choice's delta.
            if chunk.choices and len(chunk.choices) > 0:
                delta = chunk.choices[0].delta
                if delta and delta.content:
                    ctx["accumulated_content"] = (
                        ctx.get("accumulated_content", "") + delta.content
                    )

            # Usage arrives on (typically the final) chunk when include_usage
            # was honored; record it as soon as it is seen.
            if hasattr(chunk, "usage") and chunk.usage:
                prompt_tokens = chunk.usage.prompt_tokens or 0
                completion_tokens = chunk.usage.completion_tokens or 0
                cache_read = 0
                if chunk.usage.prompt_tokens_details:
                    cache_read = chunk.usage.prompt_tokens_details.cached_tokens or 0

                set_cost_attribute(span, chunk.usage)

                # Normalize raw OpenAI counts into Judgment's token model.
                # NOTE(review): the returned cache_creation is unused and a
                # literal 0 is recorded below — confirm this is intended.
                prompt_tokens, completion_tokens, cache_read, cache_creation = (
                    openai_tokens_converter(
                        prompt_tokens,
                        completion_tokens,
                        cache_read,
                        0,
                        chunk.usage.total_tokens,
                    )
                )

                span.set_attribute(
                    AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
                    prompt_tokens,
                )
                span.set_attribute(
                    AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
                )
                span.set_attribute(
                    AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
                )
                span.set_attribute(
                    AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
                )
                span.set_attribute(
                    AttributeKeys.JUDGMENT_USAGE_METADATA,
                    safe_serialize(chunk.usage),
                )

        def post_hook_inner(inner_ctx: Dict[str, Any]) -> None:
            # Stream finished cleanly: record the full accumulated completion.
            span = ctx.get("span")
            if span:
                accumulated = ctx.get("accumulated_content", "")
                span.set_attribute(AttributeKeys.GEN_AI_COMPLETION, accumulated)

        def error_hook_inner(inner_ctx: Dict[str, Any], error: Exception) -> None:
            span = ctx.get("span")
            if span:
                span.record_exception(error)
                span.set_status(Status(StatusCode.ERROR))

        def finally_hook_inner(inner_ctx: Dict[str, Any]) -> None:
            # The span lives until the stream is done (or abandoned mid-error).
            span = ctx.get("span")
            if span:
                span.end()

        wrapped_generator = immutable_wrap_async_iterator(
            traced_generator,
            yield_hook=yield_hook,
            post_hook=post_hook_inner,
            error_hook=error_hook_inner,
            finally_hook=finally_hook_inner,
        )

        return wrapped_generator()

    def error_hook(ctx: Dict[str, Any], error: Exception) -> None:
        # The create() call itself failed before a stream existed.
        span = ctx.get("span")
        if span:
            span.record_exception(error)
            span.set_status(Status(StatusCode.ERROR))

    # No finally_hook here on purpose: span closure is owned by the iterator
    # wrapper above once the stream has been handed to the consumer.
    return mutable_wrap_async(
        original_func,
        pre_hook=pre_hook,
        mutate_kwargs_hook=mutate_kwargs_hook,
        mutate_hook=mutate_hook,
        error_hook=error_hook,
    )
@@ -0,0 +1,6 @@
1
from __future__ import annotations
import importlib.util

# Feature flag: True when the optional ``openai`` package is importable in
# this environment.  ``find_spec`` probes the import machinery without
# actually importing (and thus initializing) the package.
HAS_OPENAI = importlib.util.find_spec("openai") is not None

__all__ = ["HAS_OPENAI"]