judgeval-0.16.9-py3-none-any.whl → judgeval-0.22.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +32 -2
- judgeval/api/__init__.py +108 -0
- judgeval/api/api_types.py +76 -15
- judgeval/cli.py +16 -1
- judgeval/data/judgment_types.py +76 -20
- judgeval/dataset/__init__.py +11 -2
- judgeval/env.py +2 -11
- judgeval/evaluation/__init__.py +4 -0
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +1 -13
- judgeval/tracer/__init__.py +371 -257
- judgeval/tracer/constants.py +1 -1
- judgeval/tracer/exporters/store.py +32 -16
- judgeval/tracer/keys.py +11 -9
- judgeval/tracer/llm/llm_anthropic/messages.py +38 -26
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +14 -14
- judgeval/tracer/llm/llm_google/generate_content.py +9 -7
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +38 -14
- judgeval/tracer/llm/llm_openai/chat_completions.py +90 -26
- judgeval/tracer/llm/llm_openai/responses.py +88 -26
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +26 -18
- judgeval/tracer/managers.py +4 -0
- judgeval/trainer/__init__.py +10 -1
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +1 -1
- judgeval/trainer/fireworks_trainer.py +396 -0
- judgeval/trainer/trainer.py +52 -387
- judgeval/utils/guards.py +9 -5
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +2 -2
- judgeval/version.py +1 -1
- {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/METADATA +2 -3
- {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/RECORD +37 -32
- {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
- {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/entry_points.txt +0 -0
- {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
judgeval/tracer/llm/llm_openai/chat_completions.py:

```diff
@@ -25,6 +25,10 @@ from judgeval.utils.wrappers import (
     immutable_wrap_sync_iterator,
     immutable_wrap_async_iterator,
 )
+from judgeval.tracer.llm.llm_openai.utils import (
+    openai_tokens_converter,
+    set_cost_attribute,
+)

 if TYPE_CHECKING:
     from judgeval.tracer import Tracer
@@ -62,13 +66,13 @@ def _wrap_non_streaming_sync(
         ctx["span"] = tracer.get_tracer().start_span(
             "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
         )
-        tracer.
+        tracer._inject_judgment_context(ctx["span"])
         set_span_attribute(
             ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
         )
         ctx["model_name"] = kwargs.get("model", "")
         set_span_attribute(
-            ctx["span"], AttributeKeys.
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
         )

     def post_hook(ctx: Dict[str, Any], result: ChatCompletion) -> None:
@@ -89,17 +93,31 @@ def _wrap_non_streaming_sync(
         if prompt_tokens_details:
             cache_read = prompt_tokens_details.cached_tokens or 0

+        set_cost_attribute(span, usage_data)
+
+        prompt_tokens, completion_tokens, cache_read, cache_creation = (
+            openai_tokens_converter(
+                prompt_tokens,
+                completion_tokens,
+                cache_read,
+                0,
+                usage_data.total_tokens,
+            )
+        )
+
         set_span_attribute(
-            span,
+            span,
+            AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+            prompt_tokens,
         )
         set_span_attribute(
-            span, AttributeKeys.
+            span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
         )
         set_span_attribute(
-            span, AttributeKeys.
+            span, AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
         )
         set_span_attribute(
-            span, AttributeKeys.
+            span, AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
         )
         set_span_attribute(
             span,
@@ -109,7 +127,7 @@ def _wrap_non_streaming_sync(

         set_span_attribute(
             span,
-            AttributeKeys.
+            AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
             result.model or ctx["model_name"],
         )

@@ -139,13 +157,13 @@ def _wrap_streaming_sync(
         ctx["span"] = tracer.get_tracer().start_span(
             "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
         )
-        tracer.
+        tracer._inject_judgment_context(ctx["span"])
         set_span_attribute(
             ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
         )
         ctx["model_name"] = kwargs.get("model", "")
         set_span_attribute(
-            ctx["span"], AttributeKeys.
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
         )
         ctx["accumulated_content"] = ""

@@ -182,17 +200,33 @@ def _wrap_streaming_sync(
             if chunk.usage.prompt_tokens_details:
                 cache_read = chunk.usage.prompt_tokens_details.cached_tokens or 0

+            set_cost_attribute(span, chunk.usage)
+
+            prompt_tokens, completion_tokens, cache_read, cache_creation = (
+                openai_tokens_converter(
+                    prompt_tokens,
+                    completion_tokens,
+                    cache_read,
+                    0,
+                    chunk.usage.total_tokens,
+                )
+            )
+
             set_span_attribute(
-                span,
+                span,
+                AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+                prompt_tokens,
             )
             set_span_attribute(
-                span, AttributeKeys.
+                span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
             )
             set_span_attribute(
-                span,
+                span,
+                AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
+                cache_read,
             )
             set_span_attribute(
-                span, AttributeKeys.
+                span, AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
             )
             set_span_attribute(
                 span,
@@ -258,13 +292,13 @@ def _wrap_non_streaming_async(
         ctx["span"] = tracer.get_tracer().start_span(
             "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
         )
-        tracer.
+        tracer._inject_judgment_context(ctx["span"])
         set_span_attribute(
             ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
         )
         ctx["model_name"] = kwargs.get("model", "")
         set_span_attribute(
-            ctx["span"], AttributeKeys.
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
         )

     def post_hook(ctx: Dict[str, Any], result: ChatCompletion) -> None:
@@ -285,17 +319,31 @@ def _wrap_non_streaming_async(
         if prompt_tokens_details:
             cache_read = prompt_tokens_details.cached_tokens or 0

+        set_cost_attribute(span, usage_data)
+
+        prompt_tokens, completion_tokens, cache_read, cache_creation = (
+            openai_tokens_converter(
+                prompt_tokens,
+                completion_tokens,
+                cache_read,
+                0,
+                usage_data.total_tokens,
+            )
+        )
+
         set_span_attribute(
-            span,
+            span,
+            AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+            prompt_tokens,
         )
         set_span_attribute(
-            span, AttributeKeys.
+            span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
         )
         set_span_attribute(
-            span, AttributeKeys.
+            span, AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
         )
         set_span_attribute(
-            span, AttributeKeys.
+            span, AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
         )
         set_span_attribute(
             span,
@@ -305,7 +353,7 @@ def _wrap_non_streaming_async(

         set_span_attribute(
             span,
-            AttributeKeys.
+            AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
             result.model or ctx["model_name"],
         )

@@ -336,13 +384,13 @@ def _wrap_streaming_async(
         ctx["span"] = tracer.get_tracer().start_span(
             "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
         )
-        tracer.
+        tracer._inject_judgment_context(ctx["span"])
         set_span_attribute(
             ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
         )
         ctx["model_name"] = kwargs.get("model", "")
         set_span_attribute(
-            ctx["span"], AttributeKeys.
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
         )
         ctx["accumulated_content"] = ""

@@ -379,17 +427,33 @@ def _wrap_streaming_async(
             if chunk.usage.prompt_tokens_details:
                 cache_read = chunk.usage.prompt_tokens_details.cached_tokens or 0

+            set_cost_attribute(span, chunk.usage)
+
+            prompt_tokens, completion_tokens, cache_read, cache_creation = (
+                openai_tokens_converter(
+                    prompt_tokens,
+                    completion_tokens,
+                    cache_read,
+                    0,
+                    chunk.usage.total_tokens,
+                )
+            )
+
             set_span_attribute(
-                span,
+                span,
+                AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+                prompt_tokens,
             )
             set_span_attribute(
-                span, AttributeKeys.
+                span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
             )
             set_span_attribute(
-                span,
+                span,
+                AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
+                cache_read,
             )
             set_span_attribute(
-                span, AttributeKeys.
+                span, AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
             )
             set_span_attribute(
                 span,
```
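Every wrapper variant above (sync/async, streaming and non-streaming) repeats the same usage-recording sequence in its post hook. A condensed, self-contained sketch of that sequence follows; the standalone `record_usage` helper is illustrative only (judgeval inlines this logic rather than exposing such a function), but the imports and attribute keys mirror the diff:

```python
from opentelemetry.trace import Span

from judgeval.tracer.keys import AttributeKeys
from judgeval.tracer.llm.llm_openai.utils import (
    openai_tokens_converter,
    set_cost_attribute,
)
from judgeval.tracer.utils import set_span_attribute


def record_usage(span: Span, usage_data) -> None:
    # Raw counts as reported by the OpenAI chat completions usage payload.
    prompt_tokens = usage_data.prompt_tokens or 0
    completion_tokens = usage_data.completion_tokens or 0
    details = getattr(usage_data, "prompt_tokens_details", None)
    cache_read = (details.cached_tokens or 0) if details else 0

    # OpenRouter-style responses may carry a pre-computed cost.
    set_cost_attribute(span, usage_data)

    # Normalize counts so cached input tokens are not double counted.
    prompt_tokens, completion_tokens, cache_read, _cache_creation = (
        openai_tokens_converter(
            prompt_tokens, completion_tokens, cache_read, 0, usage_data.total_tokens
        )
    )

    set_span_attribute(
        span, AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS, prompt_tokens
    )
    set_span_attribute(
        span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
    )
    set_span_attribute(
        span, AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
    )
    set_span_attribute(
        span, AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
    )
```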
judgeval/tracer/llm/llm_openai/responses.py:

```diff
@@ -24,6 +24,10 @@ from judgeval.utils.wrappers import (
     immutable_wrap_sync_iterator,
     immutable_wrap_async_iterator,
 )
+from judgeval.tracer.llm.llm_openai.utils import (
+    openai_tokens_converter,
+    set_cost_attribute,
+)

 if TYPE_CHECKING:
     from judgeval.tracer import Tracer
@@ -56,13 +60,13 @@ def _wrap_responses_non_streaming_sync(
         ctx["span"] = tracer.get_tracer().start_span(
             "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
         )
-        tracer.
+        tracer._inject_judgment_context(ctx["span"])
         set_span_attribute(
             ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
         )
         ctx["model_name"] = kwargs.get("model", "")
         set_span_attribute(
-            ctx["span"], AttributeKeys.
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
         )

     def post_hook(ctx: Dict[str, Any], result: Response) -> None:
@@ -80,17 +84,30 @@ def _wrap_responses_non_streaming_sync(
         completion_tokens = usage_data.output_tokens or 0
         cache_read = usage_data.input_tokens_details.cached_tokens or 0

+        set_cost_attribute(span, usage_data)
+        prompt_tokens, completion_tokens, cache_read, cache_creation = (
+            openai_tokens_converter(
+                prompt_tokens,
+                completion_tokens,
+                cache_read,
+                0,
+                usage_data.total_tokens,
+            )
+        )
+
         set_span_attribute(
-            span,
+            span,
+            AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+            prompt_tokens,
         )
         set_span_attribute(
-            span, AttributeKeys.
+            span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
         )
         set_span_attribute(
-            span, AttributeKeys.
+            span, AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
         )
         set_span_attribute(
-            span, AttributeKeys.
+            span, AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
         )
         set_span_attribute(
             span,
@@ -101,7 +118,7 @@ def _wrap_responses_non_streaming_sync(
         if hasattr(result, "model"):
             set_span_attribute(
                 span,
-                AttributeKeys.
+                AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
                 result.model or ctx["model_name"],
             )

@@ -131,13 +148,13 @@ def _wrap_responses_streaming_sync(
         ctx["span"] = tracer.get_tracer().start_span(
             "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
         )
-        tracer.
+        tracer._inject_judgment_context(ctx["span"])
         set_span_attribute(
             ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
         )
         ctx["model_name"] = kwargs.get("model", "")
         set_span_attribute(
-            ctx["span"], AttributeKeys.
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
         )
         ctx["accumulated_content"] = ""

@@ -167,6 +184,7 @@ def _wrap_responses_streaming_sync(
         ):
             prompt_tokens = chunk.response.usage.input_tokens or 0
             completion_tokens = chunk.response.usage.output_tokens or 0
+            total_tokens = chunk.response.usage.total_tokens or 0
             # Safely access nested cached_tokens
             input_tokens_details = getattr(
                 chunk.response.usage, "input_tokens_details", None
@@ -177,21 +195,36 @@ def _wrap_responses_streaming_sync(
                 else 0
             )

+            set_cost_attribute(span, chunk.response.usage)
+            prompt_tokens, completion_tokens, cache_read, cache_creation = (
+                openai_tokens_converter(
+                    prompt_tokens,
+                    completion_tokens,
+                    cache_read,
+                    0,
+                    total_tokens,
+                )
+            )
+
             set_span_attribute(
-                span,
+                span,
+                AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+                prompt_tokens,
             )
             set_span_attribute(
                 span,
-                AttributeKeys.
+                AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS,
                 completion_tokens,
             )
             set_span_attribute(
                 span,
-                AttributeKeys.
+                AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
                 cache_read,
             )
             set_span_attribute(
-                span,
+                span,
+                AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
+                0,
             )
             set_span_attribute(
                 span,
@@ -260,13 +293,13 @@ def _wrap_responses_non_streaming_async(
         ctx["span"] = tracer.get_tracer().start_span(
             "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
         )
-        tracer.
+        tracer._inject_judgment_context(ctx["span"])
         set_span_attribute(
             ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
         )
         ctx["model_name"] = kwargs.get("model", "")
         set_span_attribute(
-            ctx["span"], AttributeKeys.
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
         )

     def post_hook(ctx: Dict[str, Any], result: Response) -> None:
@@ -284,17 +317,30 @@ def _wrap_responses_non_streaming_async(
         completion_tokens = usage_data.output_tokens or 0
         cache_read = usage_data.input_tokens_details.cached_tokens or 0

+        set_cost_attribute(span, usage_data)
+        prompt_tokens, completion_tokens, cache_read, cache_creation = (
+            openai_tokens_converter(
+                prompt_tokens,
+                completion_tokens,
+                cache_read,
+                0,
+                usage_data.total_tokens,
+            )
+        )
+
         set_span_attribute(
-            span,
+            span,
+            AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+            prompt_tokens,
         )
         set_span_attribute(
-            span, AttributeKeys.
+            span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
         )
         set_span_attribute(
-            span, AttributeKeys.
+            span, AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
         )
         set_span_attribute(
-            span, AttributeKeys.
+            span, AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
         )
         set_span_attribute(
             span,
@@ -305,7 +351,7 @@ def _wrap_responses_non_streaming_async(
         if hasattr(result, "model"):
             set_span_attribute(
                 span,
-                AttributeKeys.
+                AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
                 result.model or ctx["model_name"],
             )

@@ -335,13 +381,13 @@ def _wrap_responses_streaming_async(
         ctx["span"] = tracer.get_tracer().start_span(
             "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
         )
-        tracer.
+        tracer._inject_judgment_context(ctx["span"])
         set_span_attribute(
             ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
         )
         ctx["model_name"] = kwargs.get("model", "")
         set_span_attribute(
-            ctx["span"], AttributeKeys.
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
         )
         ctx["accumulated_content"] = ""

@@ -373,6 +419,7 @@ def _wrap_responses_streaming_async(
         ):
             prompt_tokens = chunk.response.usage.input_tokens or 0
             completion_tokens = chunk.response.usage.output_tokens or 0
+            total_tokens = chunk.response.usage.total_tokens or 0
             # Safely access nested cached_tokens
             input_tokens_details = getattr(
                 chunk.response.usage, "input_tokens_details", None
@@ -383,21 +430,36 @@ def _wrap_responses_streaming_async(
                 else 0
             )

+            set_cost_attribute(span, chunk.response.usage)
+            prompt_tokens, completion_tokens, cache_read, cache_creation = (
+                openai_tokens_converter(
+                    prompt_tokens,
+                    completion_tokens,
+                    cache_read,
+                    0,
+                    total_tokens,
+                )
+            )
+
             set_span_attribute(
-                span,
+                span,
+                AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+                prompt_tokens,
             )
             set_span_attribute(
                 span,
-                AttributeKeys.
+                AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS,
                 completion_tokens,
             )
             set_span_attribute(
                 span,
-                AttributeKeys.
+                AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
                 cache_read,
             )
             set_span_attribute(
-                span,
+                span,
+                AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
+                0,
             )
             set_span_attribute(
                 span,
```
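The Responses API hunks above mirror the chat-completions changes. Both wrappers import their new helpers from the 42-line module added in this release, whose full contents follow.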
judgeval/tracer/llm/llm_openai/utils.py (new file):

```diff
@@ -0,0 +1,42 @@
+from typing import Any
+from opentelemetry.trace import Span
+from judgeval.tracer.keys import AttributeKeys
+from judgeval.tracer.utils import set_span_attribute
+from judgeval.utils.serialize import safe_serialize
+
+
+def openai_tokens_converter(
+    prompt_tokens: int,
+    completion_tokens: int,
+    cache_read: int,
+    cache_creation: int,
+    total_tokens: int,
+) -> tuple[int, int, int, int]:
+    """
+    Returns:
+        tuple[int, int, int, int]:
+            - judgment.usage.non_cached_input
+            - judgment.usage.output_tokens
+            - judgment.usage.cached_input_tokens
+            - judgment.usage.cache_creation_tokens
+    """
+    manual_tokens = prompt_tokens + completion_tokens + cache_read + cache_creation
+
+    if manual_tokens > total_tokens:
+        # This is the openAI case where we need to subtract the cached tokens from the input tokens
+        return prompt_tokens - cache_read, completion_tokens, cache_read, cache_creation
+    else:
+        return prompt_tokens, completion_tokens, cache_read, cache_creation
+
+
+def set_cost_attribute(span: Span, usage_data: Any) -> None:
+    """
+    This is for OpenRouter case where the cost is provided in the usage data when they specify:
+        extra_body={"usage": {"include": True}},
+    """
+    if hasattr(usage_data, "cost") and usage_data.cost:
+        set_span_attribute(
+            span,
+            AttributeKeys.JUDGMENT_USAGE_TOTAL_COST_USD,
+            safe_serialize(usage_data.cost),
+        )
```