judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of judgeval might be problematic. Click here for more details.
- judgeval/__init__.py +177 -12
- judgeval/api/__init__.py +519 -0
- judgeval/api/api_types.py +407 -0
- judgeval/cli.py +79 -0
- judgeval/constants.py +76 -47
- judgeval/data/__init__.py +3 -3
- judgeval/data/evaluation_run.py +125 -0
- judgeval/data/example.py +15 -56
- judgeval/data/judgment_types.py +450 -0
- judgeval/data/result.py +29 -73
- judgeval/data/scorer_data.py +29 -62
- judgeval/data/scripts/fix_default_factory.py +23 -0
- judgeval/data/scripts/openapi_transform.py +123 -0
- judgeval/data/trace.py +121 -0
- judgeval/dataset/__init__.py +264 -0
- judgeval/env.py +52 -0
- judgeval/evaluation/__init__.py +344 -0
- judgeval/exceptions.py +27 -0
- judgeval/integrations/langgraph/__init__.py +13 -0
- judgeval/integrations/openlit/__init__.py +50 -0
- judgeval/judges/__init__.py +2 -3
- judgeval/judges/base_judge.py +2 -3
- judgeval/judges/litellm_judge.py +100 -20
- judgeval/judges/together_judge.py +101 -20
- judgeval/judges/utils.py +20 -24
- judgeval/logger.py +62 -0
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/__init__.py +18 -25
- judgeval/scorers/agent_scorer.py +17 -0
- judgeval/scorers/api_scorer.py +45 -41
- judgeval/scorers/base_scorer.py +83 -38
- judgeval/scorers/example_scorer.py +17 -0
- judgeval/scorers/exceptions.py +1 -0
- judgeval/scorers/judgeval_scorers/__init__.py +0 -148
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
- judgeval/scorers/score.py +77 -306
- judgeval/scorers/utils.py +4 -199
- judgeval/tracer/__init__.py +1122 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +40 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +59 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +63 -0
- judgeval/tracer/llm/__init__.py +7 -0
- judgeval/tracer/llm/config.py +78 -0
- judgeval/tracer/llm/constants.py +9 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/config.py +6 -0
- judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
- judgeval/tracer/llm/llm_google/__init__.py +3 -0
- judgeval/tracer/llm/llm_google/config.py +6 -0
- judgeval/tracer/llm/llm_google/generate_content.py +127 -0
- judgeval/tracer/llm/llm_google/wrapper.py +30 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
- judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
- judgeval/tracer/llm/llm_openai/config.py +6 -0
- judgeval/tracer/llm/llm_openai/responses.py +506 -0
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
- judgeval/tracer/llm/llm_together/__init__.py +3 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
- judgeval/tracer/llm/llm_together/config.py +6 -0
- judgeval/tracer/llm/llm_together/wrapper.py +52 -0
- judgeval/tracer/llm/providers.py +19 -0
- judgeval/tracer/managers.py +167 -0
- judgeval/tracer/processors/__init__.py +220 -0
- judgeval/tracer/utils.py +19 -0
- judgeval/trainer/__init__.py +14 -0
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +128 -0
- judgeval/trainer/console.py +144 -0
- judgeval/trainer/fireworks_trainer.py +396 -0
- judgeval/trainer/trainable_model.py +243 -0
- judgeval/trainer/trainer.py +70 -0
- judgeval/utils/async_utils.py +39 -0
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +37 -0
- judgeval/utils/decorators/use_once.py +13 -0
- judgeval/utils/file_utils.py +97 -0
- judgeval/utils/guards.py +36 -0
- judgeval/utils/meta.py +27 -0
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +253 -0
- judgeval/utils/testing.py +70 -0
- judgeval/utils/url.py +10 -0
- judgeval/utils/version_check.py +28 -0
- judgeval/utils/wrappers/README.md +3 -0
- judgeval/utils/wrappers/__init__.py +15 -0
- judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
- judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
- judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
- judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
- judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
- judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
- judgeval/utils/wrappers/py.typed +0 -0
- judgeval/utils/wrappers/utils.py +35 -0
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- judgeval-0.22.2.dist-info/METADATA +265 -0
- judgeval-0.22.2.dist-info/RECORD +112 -0
- judgeval-0.22.2.dist-info/entry_points.txt +2 -0
- judgeval/clients.py +0 -39
- judgeval/common/__init__.py +0 -8
- judgeval/common/exceptions.py +0 -28
- judgeval/common/logger.py +0 -189
- judgeval/common/tracer.py +0 -798
- judgeval/common/utils.py +0 -763
- judgeval/data/api_example.py +0 -111
- judgeval/data/datasets/__init__.py +0 -5
- judgeval/data/datasets/dataset.py +0 -286
- judgeval/data/datasets/eval_dataset_client.py +0 -193
- judgeval/data/datasets/ground_truth.py +0 -54
- judgeval/data/datasets/utils.py +0 -74
- judgeval/evaluation_run.py +0 -132
- judgeval/judges/mixture_of_judges.py +0 -248
- judgeval/judgment_client.py +0 -354
- judgeval/run_evaluation.py +0 -439
- judgeval/scorers/judgeval_scorer.py +0 -140
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
- judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
- judgeval/scorers/prompt_scorer.py +0 -439
- judgeval-0.0.11.dist-info/METADATA +0 -36
- judgeval-0.0.11.dist-info/RECORD +0 -84
- {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
- {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -0,0 +1,506 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import (
|
|
3
|
+
TYPE_CHECKING,
|
|
4
|
+
Any,
|
|
5
|
+
Awaitable,
|
|
6
|
+
Callable,
|
|
7
|
+
Dict,
|
|
8
|
+
Iterator,
|
|
9
|
+
AsyncIterator,
|
|
10
|
+
Generator,
|
|
11
|
+
AsyncGenerator,
|
|
12
|
+
ParamSpec,
|
|
13
|
+
TypeVar,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
from judgeval.tracer.keys import AttributeKeys
|
|
17
|
+
from judgeval.tracer.utils import set_span_attribute
|
|
18
|
+
from judgeval.utils.serialize import safe_serialize
|
|
19
|
+
from judgeval.utils.wrappers import (
|
|
20
|
+
immutable_wrap_sync,
|
|
21
|
+
immutable_wrap_async,
|
|
22
|
+
mutable_wrap_sync,
|
|
23
|
+
mutable_wrap_async,
|
|
24
|
+
immutable_wrap_sync_iterator,
|
|
25
|
+
immutable_wrap_async_iterator,
|
|
26
|
+
)
|
|
27
|
+
from judgeval.tracer.llm.llm_openai.utils import (
|
|
28
|
+
openai_tokens_converter,
|
|
29
|
+
set_cost_attribute,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
if TYPE_CHECKING:
|
|
33
|
+
from judgeval.tracer import Tracer
|
|
34
|
+
from openai import OpenAI, AsyncOpenAI
|
|
35
|
+
from openai.types.responses import Response
|
|
36
|
+
|
|
37
|
+
P = ParamSpec("P")
|
|
38
|
+
T = TypeVar("T")
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def wrap_responses_create_sync(tracer: Tracer, client: OpenAI) -> None:
    """Replace ``client.responses.create`` with a tracing dispatcher.

    The dispatcher looks at the ``stream`` keyword argument of every call and
    routes it through the streaming or the non-streaming tracing wrapper,
    both built around the method that was installed before patching.

    Args:
        tracer: Tracer handed to the wrapper factories.
        client: Synchronous OpenAI client whose ``responses.create`` is patched
            in place.
    """
    unpatched_create = client.responses.create

    def dispatcher(*args: Any, **kwargs: Any) -> Any:
        # Only a keyword ``stream`` argument is inspected; a truthy value
        # selects the streaming wrapper.
        build_wrapper = (
            _wrap_responses_streaming_sync
            if kwargs.get("stream", False)
            else _wrap_responses_non_streaming_sync
        )
        traced_create = build_wrapper(tracer, unpatched_create)
        return traced_create(*args, **kwargs)

    setattr(client.responses, "create", dispatcher)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _wrap_responses_non_streaming_sync(
    tracer: Tracer, original_func: Callable[..., Response]
) -> Callable[..., Response]:
    """Return ``original_func`` wrapped with tracing hooks for a blocking call.

    The hooks open an ``OPENAI_API_CALL`` span tagged with span kind ``"llm"``,
    record the serialized request kwargs and the requested model name, and,
    once a result is available, record the serialized response, token usage,
    cost (when the usage object carries one) and the model name reported by
    the response.

    Args:
        tracer: Tracer used to create the span and inject judgment context.
        original_func: The unpatched ``responses.create`` callable.

    Returns:
        ``original_func`` wrapped by ``immutable_wrap_sync`` with the hooks
        defined below.
    """

    def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
        # Start the span and stash it in ctx so the later hooks can find it.
        ctx["span"] = tracer.get_tracer().start_span(
            "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
        )
        tracer._inject_judgment_context(ctx["span"])
        set_span_attribute(
            ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
        )
        ctx["model_name"] = kwargs.get("model", "")
        set_span_attribute(
            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
        )

    def post_hook(ctx: Dict[str, Any], result: Response) -> None:
        span = ctx.get("span")
        if not span:
            return

        set_span_attribute(
            span, AttributeKeys.GEN_AI_COMPLETION, safe_serialize(result)
        )

        usage_data = getattr(result, "usage", None)
        if usage_data:
            prompt_tokens = usage_data.input_tokens or 0
            completion_tokens = usage_data.output_tokens or 0
            # Mirror the streaming path: the details object and the total may
            # be missing or None, so never dereference them blindly.
            total_tokens = usage_data.total_tokens or 0
            input_tokens_details = getattr(usage_data, "input_tokens_details", None)
            cache_read = getattr(input_tokens_details, "cached_tokens", 0) or 0

            set_cost_attribute(span, usage_data)
            prompt_tokens, completion_tokens, cache_read, _ = (
                openai_tokens_converter(
                    prompt_tokens,
                    completion_tokens,
                    cache_read,
                    0,
                    total_tokens,
                )
            )

            set_span_attribute(
                span,
                AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
                prompt_tokens,
            )
            set_span_attribute(
                span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
            )
            set_span_attribute(
                span, AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
            )
            # Cache-creation tokens are always recorded as zero on this path.
            set_span_attribute(
                span, AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
            )
            set_span_attribute(
                span,
                AttributeKeys.JUDGMENT_USAGE_METADATA,
                safe_serialize(usage_data),
            )

        if hasattr(result, "model"):
            # Prefer the model name the response reports over the requested one.
            set_span_attribute(
                span,
                AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
                result.model or ctx["model_name"],
            )

    def error_hook(ctx: Dict[str, Any], error: Exception) -> None:
        span = ctx.get("span")
        if span:
            span.record_exception(error)

    def finally_hook(ctx: Dict[str, Any]) -> None:
        span = ctx.get("span")
        if span:
            span.end()

    return immutable_wrap_sync(
        original_func,
        pre_hook=pre_hook,
        post_hook=post_hook,
        error_hook=error_hook,
        finally_hook=finally_hook,
    )
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _wrap_responses_streaming_sync(
    tracer: Tracer, original_func: Callable[..., Iterator[Any]]
) -> Callable[..., Iterator[Any]]:
    """Return ``original_func`` wrapped with tracing hooks for a streaming call.

    A span named ``OPENAI_API_CALL`` is opened before the call. The iterator
    returned by the call is replaced by a traced generator that yields the
    same chunks while accumulating text deltas and recording usage from the
    ``response.completed`` event. The span is ended when the traced generator
    finishes, or immediately if the call itself raises.

    Args:
        tracer: Tracer used to create the span and inject judgment context.
        original_func: The unpatched ``responses.create`` callable.

    Returns:
        ``original_func`` wrapped by ``mutable_wrap_sync`` with the hooks
        defined below.
    """

    def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
        ctx["span"] = tracer.get_tracer().start_span(
            "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
        )
        tracer._inject_judgment_context(ctx["span"])
        set_span_attribute(
            ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
        )
        ctx["model_name"] = kwargs.get("model", "")
        set_span_attribute(
            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
        )
        ctx["accumulated_content"] = ""

    def mutate_hook(ctx: Dict[str, Any], result: Iterator[Any]) -> Iterator[Any]:
        # The inner hooks deliberately read the outer ``ctx`` (which holds the
        # span) rather than the ``inner_ctx`` they are handed.
        def traced_generator() -> Generator[Any, None, None]:
            for chunk in result:
                yield chunk

        def yield_hook(inner_ctx: Dict[str, Any], chunk: Any) -> None:
            span = ctx.get("span")
            if not span:
                return

            chunk_type = getattr(chunk, "type", None)

            if chunk_type == "response.output_text.delta":
                delta = getattr(chunk, "delta", None)
                if delta:
                    ctx["accumulated_content"] = (
                        ctx.get("accumulated_content", "") + delta
                    )
                return

            if chunk_type != "response.completed":
                return

            response = getattr(chunk, "response", None)
            usage = getattr(response, "usage", None) if response else None
            if not usage:
                return

            prompt_tokens = usage.input_tokens or 0
            completion_tokens = usage.output_tokens or 0
            total_tokens = usage.total_tokens or 0
            # ``cached_tokens`` may be absent or explicitly None; normalise to
            # an int so the arithmetic in the converter cannot fail.
            input_tokens_details = getattr(usage, "input_tokens_details", None)
            cache_read = getattr(input_tokens_details, "cached_tokens", 0) or 0

            set_cost_attribute(span, usage)
            prompt_tokens, completion_tokens, cache_read, _ = (
                openai_tokens_converter(
                    prompt_tokens,
                    completion_tokens,
                    cache_read,
                    0,
                    total_tokens,
                )
            )

            set_span_attribute(
                span,
                AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
                prompt_tokens,
            )
            set_span_attribute(
                span,
                AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS,
                completion_tokens,
            )
            set_span_attribute(
                span,
                AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
                cache_read,
            )
            # Cache-creation tokens are always recorded as zero on this path.
            set_span_attribute(
                span,
                AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
                0,
            )
            set_span_attribute(
                span,
                AttributeKeys.JUDGMENT_USAGE_METADATA,
                safe_serialize(usage),
            )

        def post_hook_inner(inner_ctx: Dict[str, Any]) -> None:
            span = ctx.get("span")
            if span:
                accumulated = ctx.get("accumulated_content", "")
                set_span_attribute(span, AttributeKeys.GEN_AI_COMPLETION, accumulated)

        def error_hook_inner(inner_ctx: Dict[str, Any], error: Exception) -> None:
            span = ctx.get("span")
            if span:
                span.record_exception(error)

        def finally_hook_inner(inner_ctx: Dict[str, Any]) -> None:
            span = ctx.get("span")
            if span:
                span.end()

        wrapped_generator = immutable_wrap_sync_iterator(
            traced_generator,
            yield_hook=yield_hook,
            post_hook=post_hook_inner,
            error_hook=error_hook_inner,
            finally_hook=finally_hook_inner,
        )

        return wrapped_generator()

    def error_hook(ctx: Dict[str, Any], error: Exception) -> None:
        span = ctx.get("span")
        if span:
            span.record_exception(error)
            # NOTE(review): assumes this hook fires only when the create call
            # fails and no traced iterator reaches the caller. In that case
            # finally_hook_inner never runs, so the span must be closed here
            # or it is never ended. Confirm against mutable_wrap_sync.
            span.end()

    return mutable_wrap_sync(
        original_func,
        pre_hook=pre_hook,
        mutate_hook=mutate_hook,
        error_hook=error_hook,
    )
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def wrap_responses_create_async(tracer: Tracer, client: AsyncOpenAI) -> None:
    """Replace ``client.responses.create`` with an async tracing dispatcher.

    The dispatcher looks at the ``stream`` keyword argument of every call and
    awaits the call through the streaming or the non-streaming tracing
    wrapper, both built around the method that was installed before patching.

    Args:
        tracer: Tracer handed to the wrapper factories.
        client: Asynchronous OpenAI client whose ``responses.create`` is
            patched in place.
    """
    unpatched_create = client.responses.create

    async def dispatcher(*args: Any, **kwargs: Any) -> Any:
        # Only a keyword ``stream`` argument is inspected; a truthy value
        # selects the streaming wrapper.
        build_wrapper = (
            _wrap_responses_streaming_async
            if kwargs.get("stream", False)
            else _wrap_responses_non_streaming_async
        )
        traced_create = build_wrapper(tracer, unpatched_create)
        return await traced_create(*args, **kwargs)

    setattr(client.responses, "create", dispatcher)
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def _wrap_responses_non_streaming_async(
    tracer: Tracer, original_func: Callable[..., Awaitable[Response]]
) -> Callable[..., Awaitable[Response]]:
    """Return ``original_func`` wrapped with tracing hooks for an awaited call.

    The hooks open an ``OPENAI_API_CALL`` span tagged with span kind ``"llm"``,
    record the serialized request kwargs and the requested model name, and,
    once a result is available, record the serialized response, token usage,
    cost (when the usage object carries one) and the model name reported by
    the response.

    Args:
        tracer: Tracer used to create the span and inject judgment context.
        original_func: The unpatched async ``responses.create`` callable.

    Returns:
        ``original_func`` wrapped by ``immutable_wrap_async`` with the hooks
        defined below.
    """

    def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
        # Start the span and stash it in ctx so the later hooks can find it.
        ctx["span"] = tracer.get_tracer().start_span(
            "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
        )
        tracer._inject_judgment_context(ctx["span"])
        set_span_attribute(
            ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
        )
        ctx["model_name"] = kwargs.get("model", "")
        set_span_attribute(
            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
        )

    def post_hook(ctx: Dict[str, Any], result: Response) -> None:
        span = ctx.get("span")
        if not span:
            return

        set_span_attribute(
            span, AttributeKeys.GEN_AI_COMPLETION, safe_serialize(result)
        )

        usage_data = getattr(result, "usage", None)
        if usage_data:
            prompt_tokens = usage_data.input_tokens or 0
            completion_tokens = usage_data.output_tokens or 0
            # Mirror the streaming path: the details object and the total may
            # be missing or None, so never dereference them blindly.
            total_tokens = usage_data.total_tokens or 0
            input_tokens_details = getattr(usage_data, "input_tokens_details", None)
            cache_read = getattr(input_tokens_details, "cached_tokens", 0) or 0

            set_cost_attribute(span, usage_data)
            prompt_tokens, completion_tokens, cache_read, _ = (
                openai_tokens_converter(
                    prompt_tokens,
                    completion_tokens,
                    cache_read,
                    0,
                    total_tokens,
                )
            )

            set_span_attribute(
                span,
                AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
                prompt_tokens,
            )
            set_span_attribute(
                span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
            )
            set_span_attribute(
                span, AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
            )
            # Cache-creation tokens are always recorded as zero on this path.
            set_span_attribute(
                span, AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
            )
            set_span_attribute(
                span,
                AttributeKeys.JUDGMENT_USAGE_METADATA,
                safe_serialize(usage_data),
            )

        if hasattr(result, "model"):
            # Prefer the model name the response reports over the requested one.
            set_span_attribute(
                span,
                AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
                result.model or ctx["model_name"],
            )

    def error_hook(ctx: Dict[str, Any], error: Exception) -> None:
        span = ctx.get("span")
        if span:
            span.record_exception(error)

    def finally_hook(ctx: Dict[str, Any]) -> None:
        span = ctx.get("span")
        if span:
            span.end()

    return immutable_wrap_async(
        original_func,
        pre_hook=pre_hook,
        post_hook=post_hook,
        error_hook=error_hook,
        finally_hook=finally_hook,
    )
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
def _wrap_responses_streaming_async(
    tracer: Tracer, original_func: Callable[..., Awaitable[AsyncIterator[Any]]]
) -> Callable[..., Awaitable[AsyncIterator[Any]]]:
    """Return ``original_func`` wrapped with tracing hooks for an async stream.

    A span named ``OPENAI_API_CALL`` is opened before the call. The async
    iterator returned by the call is replaced by a traced async generator that
    yields the same chunks while accumulating text deltas and recording usage
    from the ``response.completed`` event. The span is ended when the traced
    generator finishes, or immediately if the call itself raises.

    Args:
        tracer: Tracer used to create the span and inject judgment context.
        original_func: The unpatched async ``responses.create`` callable.

    Returns:
        ``original_func`` wrapped by ``mutable_wrap_async`` with the hooks
        defined below.
    """

    def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
        ctx["span"] = tracer.get_tracer().start_span(
            "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
        )
        tracer._inject_judgment_context(ctx["span"])
        set_span_attribute(
            ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
        )
        ctx["model_name"] = kwargs.get("model", "")
        set_span_attribute(
            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
        )
        ctx["accumulated_content"] = ""

    def mutate_hook(
        ctx: Dict[str, Any], result: AsyncIterator[Any]
    ) -> AsyncIterator[Any]:
        # The inner hooks deliberately read the outer ``ctx`` (which holds the
        # span) rather than the ``inner_ctx`` they are handed.
        async def traced_generator() -> AsyncGenerator[Any, None]:
            async for chunk in result:
                yield chunk

        def yield_hook(inner_ctx: Dict[str, Any], chunk: Any) -> None:
            span = ctx.get("span")
            if not span:
                return

            chunk_type = getattr(chunk, "type", None)

            if chunk_type == "response.output_text.delta":
                delta = getattr(chunk, "delta", None)
                if delta:
                    ctx["accumulated_content"] = (
                        ctx.get("accumulated_content", "") + delta
                    )
                return

            if chunk_type != "response.completed":
                return

            response = getattr(chunk, "response", None)
            usage = getattr(response, "usage", None) if response else None
            if not usage:
                return

            prompt_tokens = usage.input_tokens or 0
            completion_tokens = usage.output_tokens or 0
            total_tokens = usage.total_tokens or 0
            # ``cached_tokens`` may be absent or explicitly None; normalise to
            # an int so the arithmetic in the converter cannot fail.
            input_tokens_details = getattr(usage, "input_tokens_details", None)
            cache_read = getattr(input_tokens_details, "cached_tokens", 0) or 0

            set_cost_attribute(span, usage)
            prompt_tokens, completion_tokens, cache_read, _ = (
                openai_tokens_converter(
                    prompt_tokens,
                    completion_tokens,
                    cache_read,
                    0,
                    total_tokens,
                )
            )

            set_span_attribute(
                span,
                AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
                prompt_tokens,
            )
            set_span_attribute(
                span,
                AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS,
                completion_tokens,
            )
            set_span_attribute(
                span,
                AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
                cache_read,
            )
            # Cache-creation tokens are always recorded as zero on this path.
            set_span_attribute(
                span,
                AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
                0,
            )
            set_span_attribute(
                span,
                AttributeKeys.JUDGMENT_USAGE_METADATA,
                safe_serialize(usage),
            )

        def post_hook_inner(inner_ctx: Dict[str, Any]) -> None:
            span = ctx.get("span")
            if span:
                accumulated = ctx.get("accumulated_content", "")
                set_span_attribute(span, AttributeKeys.GEN_AI_COMPLETION, accumulated)

        def error_hook_inner(inner_ctx: Dict[str, Any], error: Exception) -> None:
            span = ctx.get("span")
            if span:
                span.record_exception(error)

        def finally_hook_inner(inner_ctx: Dict[str, Any]) -> None:
            span = ctx.get("span")
            if span:
                span.end()

        wrapped_generator = immutable_wrap_async_iterator(
            traced_generator,
            yield_hook=yield_hook,
            post_hook=post_hook_inner,
            error_hook=error_hook_inner,
            finally_hook=finally_hook_inner,
        )

        return wrapped_generator()

    def error_hook(ctx: Dict[str, Any], error: Exception) -> None:
        span = ctx.get("span")
        if span:
            span.record_exception(error)
            # NOTE(review): assumes this hook fires only when the create call
            # fails and no traced iterator reaches the caller. In that case
            # finally_hook_inner never runs, so the span must be closed here
            # or it is never ended. Confirm against mutable_wrap_async.
            span.end()

    return mutable_wrap_async(
        original_func,
        pre_hook=pre_hook,
        mutate_hook=mutate_hook,
        error_hook=error_hook,
    )
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
from opentelemetry.trace import Span
|
|
3
|
+
from judgeval.tracer.keys import AttributeKeys
|
|
4
|
+
from judgeval.tracer.utils import set_span_attribute
|
|
5
|
+
from judgeval.utils.serialize import safe_serialize
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def openai_tokens_converter(
    prompt_tokens: int,
    completion_tokens: int,
    cache_read: int,
    cache_creation: int,
    total_tokens: int,
) -> tuple[int, int, int, int]:
    """Normalise provider token counts so cached input is not counted twice.

    When the four individual counts add up to more than ``total_tokens``, the
    prompt count is taken to already include the cached tokens (the OpenAI
    convention) and ``cache_read`` is subtracted from it. Otherwise the counts
    are returned unchanged.

    Args:
        prompt_tokens: Input tokens as reported by the provider.
        completion_tokens: Output tokens as reported by the provider.
        cache_read: Input tokens served from cache.
        cache_creation: Input tokens written to cache.
        total_tokens: Total token count as reported by the provider.

    Returns:
        tuple[int, int, int, int]:
            - judgment.usage.non_cached_input
            - judgment.usage.output_tokens
            - judgment.usage.cached_input_tokens
            - judgment.usage.cache_creation_tokens
    """
    manual_tokens = prompt_tokens + completion_tokens + cache_read + cache_creation

    if manual_tokens > total_tokens:
        # This is the OpenAI case where the cached tokens are part of the
        # input tokens. Clamp at zero so inconsistent provider data can never
        # yield a negative non-cached count.
        non_cached_input = max(prompt_tokens - cache_read, 0)
        return non_cached_input, completion_tokens, cache_read, cache_creation

    return prompt_tokens, completion_tokens, cache_read, cache_creation
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def set_cost_attribute(span: Span, usage_data: Any) -> None:
    """
    Record the total cost on the span when the usage data carries one.

    This is for OpenRouter case where the cost is provided in the usage data when they specify:
    extra_body={"usage": {"include": True}},

    Nothing is written when ``usage_data`` has no ``cost`` attribute or the
    cost is falsy.
    """
    reported_cost = getattr(usage_data, "cost", None)
    if not reported_cost:
        return

    set_span_attribute(
        span,
        AttributeKeys.JUDGMENT_USAGE_TOTAL_COST_USD,
        safe_serialize(reported_cost),
    )
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import TYPE_CHECKING, Union
|
|
3
|
+
import typing
|
|
4
|
+
|
|
5
|
+
from judgeval.tracer.llm.llm_openai.chat_completions import (
|
|
6
|
+
wrap_chat_completions_create_sync,
|
|
7
|
+
wrap_chat_completions_create_async,
|
|
8
|
+
)
|
|
9
|
+
from judgeval.tracer.llm.llm_openai.responses import (
|
|
10
|
+
wrap_responses_create_sync,
|
|
11
|
+
wrap_responses_create_async,
|
|
12
|
+
)
|
|
13
|
+
from judgeval.tracer.llm.llm_openai.beta_chat_completions import (
|
|
14
|
+
wrap_beta_chat_completions_parse_sync,
|
|
15
|
+
wrap_beta_chat_completions_parse_async,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
from judgeval.tracer import Tracer
|
|
20
|
+
from openai import OpenAI, AsyncOpenAI
|
|
21
|
+
|
|
22
|
+
TClient = Union[OpenAI, AsyncOpenAI]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def wrap_openai_client_sync(tracer: Tracer, client: OpenAI) -> OpenAI:
    """Patch the traced entry points of a synchronous client and return it.

    The chat-completions, responses and beta chat-completions parse wrappers
    are applied to ``client`` in that order; the same client object is
    returned.
    """
    patchers = (
        wrap_chat_completions_create_sync,
        wrap_responses_create_sync,
        wrap_beta_chat_completions_parse_sync,
    )
    for apply_patch in patchers:
        apply_patch(tracer, client)
    return client
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def wrap_openai_client_async(tracer: Tracer, client: AsyncOpenAI) -> AsyncOpenAI:
    """Patch the traced entry points of an asynchronous client and return it.

    The chat-completions, responses and beta chat-completions parse wrappers
    are applied to ``client`` in that order; the same client object is
    returned.
    """
    patchers = (
        wrap_chat_completions_create_async,
        wrap_responses_create_async,
        wrap_beta_chat_completions_parse_async,
    )
    for apply_patch in patchers:
        apply_patch(tracer, client)
    return client
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@typing.overload
def wrap_openai_client(tracer: Tracer, client: OpenAI) -> OpenAI: ...
@typing.overload
def wrap_openai_client(tracer: Tracer, client: AsyncOpenAI) -> AsyncOpenAI: ...


def wrap_openai_client(tracer: Tracer, client: TClient) -> TClient:
    """Instrument an OpenAI client for tracing and return it.

    If the ``openai`` library is not available an error is logged and the
    client is returned untouched. Otherwise the client is dispatched to the
    async or sync wrapper according to its type.

    Raises:
        TypeError: If ``client`` is neither an ``AsyncOpenAI`` nor an
            ``OpenAI`` instance.
    """
    # Imported lazily, inside the function, exactly as the availability flag
    # and logger were before.
    from judgeval.tracer.llm.llm_openai.config import HAS_OPENAI
    from judgeval.logger import judgeval_logger

    if not HAS_OPENAI:
        judgeval_logger.error(
            "Cannot wrap OpenAI client: 'openai' library not installed. "
            "Install it with: pip install openai"
        )
        return client

    from openai import OpenAI, AsyncOpenAI

    # The async client is tested first, then the sync client.
    if isinstance(client, AsyncOpenAI):
        return wrap_openai_client_async(tracer, client)
    if isinstance(client, OpenAI):
        return wrap_openai_client_sync(tracer, client)
    raise TypeError(f"Invalid client type: {type(client)}")
|