judgeval 0.16.9__py3-none-any.whl → 0.22.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (37)
  1. judgeval/__init__.py +32 -2
  2. judgeval/api/__init__.py +108 -0
  3. judgeval/api/api_types.py +76 -15
  4. judgeval/cli.py +16 -1
  5. judgeval/data/judgment_types.py +76 -20
  6. judgeval/dataset/__init__.py +11 -2
  7. judgeval/env.py +2 -11
  8. judgeval/evaluation/__init__.py +4 -0
  9. judgeval/prompt/__init__.py +330 -0
  10. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +1 -13
  11. judgeval/tracer/__init__.py +371 -257
  12. judgeval/tracer/constants.py +1 -1
  13. judgeval/tracer/exporters/store.py +32 -16
  14. judgeval/tracer/keys.py +11 -9
  15. judgeval/tracer/llm/llm_anthropic/messages.py +38 -26
  16. judgeval/tracer/llm/llm_anthropic/messages_stream.py +14 -14
  17. judgeval/tracer/llm/llm_google/generate_content.py +9 -7
  18. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +38 -14
  19. judgeval/tracer/llm/llm_openai/chat_completions.py +90 -26
  20. judgeval/tracer/llm/llm_openai/responses.py +88 -26
  21. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  22. judgeval/tracer/llm/llm_together/chat_completions.py +26 -18
  23. judgeval/tracer/managers.py +4 -0
  24. judgeval/trainer/__init__.py +10 -1
  25. judgeval/trainer/base_trainer.py +122 -0
  26. judgeval/trainer/config.py +1 -1
  27. judgeval/trainer/fireworks_trainer.py +396 -0
  28. judgeval/trainer/trainer.py +52 -387
  29. judgeval/utils/guards.py +9 -5
  30. judgeval/utils/project.py +15 -0
  31. judgeval/utils/serialize.py +2 -2
  32. judgeval/version.py +1 -1
  33. {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/METADATA +2 -3
  34. {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/RECORD +37 -32
  35. {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
  36. {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/entry_points.txt +0 -0
  37. {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
judgeval/tracer/llm/llm_openai/chat_completions.py

@@ -25,6 +25,10 @@ from judgeval.utils.wrappers import (
     immutable_wrap_sync_iterator,
     immutable_wrap_async_iterator,
 )
+from judgeval.tracer.llm.llm_openai.utils import (
+    openai_tokens_converter,
+    set_cost_attribute,
+)
 
 if TYPE_CHECKING:
     from judgeval.tracer import Tracer
@@ -62,13 +66,13 @@ def _wrap_non_streaming_sync(
         ctx["span"] = tracer.get_tracer().start_span(
             "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
         )
-        tracer.add_agent_attributes_to_span(ctx["span"])
+        tracer._inject_judgment_context(ctx["span"])
         set_span_attribute(
             ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
         )
         ctx["model_name"] = kwargs.get("model", "")
         set_span_attribute(
-            ctx["span"], AttributeKeys.GEN_AI_REQUEST_MODEL, ctx["model_name"]
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
         )
 
     def post_hook(ctx: Dict[str, Any], result: ChatCompletion) -> None:
@@ -89,17 +93,31 @@ def _wrap_non_streaming_sync(
         if prompt_tokens_details:
             cache_read = prompt_tokens_details.cached_tokens or 0
 
+        set_cost_attribute(span, usage_data)
+
+        prompt_tokens, completion_tokens, cache_read, cache_creation = (
+            openai_tokens_converter(
+                prompt_tokens,
+                completion_tokens,
+                cache_read,
+                0,
+                usage_data.total_tokens,
+            )
+        )
+
         set_span_attribute(
-            span, AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS, prompt_tokens
+            span,
+            AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+            prompt_tokens,
         )
         set_span_attribute(
-            span, AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS, completion_tokens
+            span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
         )
         set_span_attribute(
-            span, AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
+            span, AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
         )
         set_span_attribute(
-            span, AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
+            span, AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
         )
         set_span_attribute(
             span,
@@ -109,7 +127,7 @@ def _wrap_non_streaming_sync(
 
         set_span_attribute(
             span,
-            AttributeKeys.GEN_AI_RESPONSE_MODEL,
+            AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
             result.model or ctx["model_name"],
         )
 
@@ -139,13 +157,13 @@ def _wrap_streaming_sync(
         ctx["span"] = tracer.get_tracer().start_span(
             "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
         )
-        tracer.add_agent_attributes_to_span(ctx["span"])
+        tracer._inject_judgment_context(ctx["span"])
         set_span_attribute(
             ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
         )
         ctx["model_name"] = kwargs.get("model", "")
         set_span_attribute(
-            ctx["span"], AttributeKeys.GEN_AI_REQUEST_MODEL, ctx["model_name"]
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
         )
         ctx["accumulated_content"] = ""
 
@@ -182,17 +200,33 @@ def _wrap_streaming_sync(
            if chunk.usage.prompt_tokens_details:
                cache_read = chunk.usage.prompt_tokens_details.cached_tokens or 0
 
+           set_cost_attribute(span, chunk.usage)
+
+           prompt_tokens, completion_tokens, cache_read, cache_creation = (
+               openai_tokens_converter(
+                   prompt_tokens,
+                   completion_tokens,
+                   cache_read,
+                   0,
+                   chunk.usage.total_tokens,
+               )
+           )
+
            set_span_attribute(
-               span, AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS, prompt_tokens
+               span,
+               AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+               prompt_tokens,
            )
            set_span_attribute(
-               span, AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS, completion_tokens
+               span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
            )
            set_span_attribute(
-               span, AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
+               span,
+               AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
+               cache_read,
            )
            set_span_attribute(
-               span, AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
+               span, AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
            )
            set_span_attribute(
                span,
@@ -258,13 +292,13 @@ def _wrap_non_streaming_async(
         ctx["span"] = tracer.get_tracer().start_span(
             "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
         )
-        tracer.add_agent_attributes_to_span(ctx["span"])
+        tracer._inject_judgment_context(ctx["span"])
         set_span_attribute(
             ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
         )
         ctx["model_name"] = kwargs.get("model", "")
         set_span_attribute(
-            ctx["span"], AttributeKeys.GEN_AI_REQUEST_MODEL, ctx["model_name"]
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
         )
 
     def post_hook(ctx: Dict[str, Any], result: ChatCompletion) -> None:
@@ -285,17 +319,31 @@ def _wrap_non_streaming_async(
         if prompt_tokens_details:
             cache_read = prompt_tokens_details.cached_tokens or 0
 
+        set_cost_attribute(span, usage_data)
+
+        prompt_tokens, completion_tokens, cache_read, cache_creation = (
+            openai_tokens_converter(
+                prompt_tokens,
+                completion_tokens,
+                cache_read,
+                0,
+                usage_data.total_tokens,
+            )
+        )
+
         set_span_attribute(
-            span, AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS, prompt_tokens
+            span,
+            AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+            prompt_tokens,
         )
         set_span_attribute(
-            span, AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS, completion_tokens
+            span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
         )
         set_span_attribute(
-            span, AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
+            span, AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
         )
         set_span_attribute(
-            span, AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
+            span, AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
         )
         set_span_attribute(
             span,
@@ -305,7 +353,7 @@ def _wrap_non_streaming_async(
 
         set_span_attribute(
             span,
-            AttributeKeys.GEN_AI_RESPONSE_MODEL,
+            AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
             result.model or ctx["model_name"],
         )
 
@@ -336,13 +384,13 @@ def _wrap_streaming_async(
         ctx["span"] = tracer.get_tracer().start_span(
             "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
         )
-        tracer.add_agent_attributes_to_span(ctx["span"])
+        tracer._inject_judgment_context(ctx["span"])
         set_span_attribute(
             ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
         )
         ctx["model_name"] = kwargs.get("model", "")
         set_span_attribute(
-            ctx["span"], AttributeKeys.GEN_AI_REQUEST_MODEL, ctx["model_name"]
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
         )
         ctx["accumulated_content"] = ""
 
@@ -379,17 +427,33 @@ def _wrap_streaming_async(
            if chunk.usage.prompt_tokens_details:
                cache_read = chunk.usage.prompt_tokens_details.cached_tokens or 0
 
+           set_cost_attribute(span, chunk.usage)
+
+           prompt_tokens, completion_tokens, cache_read, cache_creation = (
+               openai_tokens_converter(
+                   prompt_tokens,
+                   completion_tokens,
+                   cache_read,
+                   0,
+                   chunk.usage.total_tokens,
+               )
+           )
+
            set_span_attribute(
-               span, AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS, prompt_tokens
+               span,
+               AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+               prompt_tokens,
            )
            set_span_attribute(
-               span, AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS, completion_tokens
+               span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
            )
            set_span_attribute(
-               span, AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
+               span,
+               AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
+               cache_read,
            )
            set_span_attribute(
-               span, AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
+               span, AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
            )
            set_span_attribute(
                span,
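
Across these hunks the pattern is the same: raw OpenAI usage counts are now passed through openai_tokens_converter (defined in the new judgeval/tracer/llm/llm_openai/utils.py, shown at the end of this diff) before being written to the span, and the usage attributes move from the generic GEN_AI_* keys to JUDGMENT_* keys. A minimal sketch of the converter's arithmetic, with illustrative numbers that do not come from the diff:

    # Chat Completions reports prompt_tokens INCLUDING cached tokens, so the
    # component sum exceeds total_tokens and the converter subtracts the cache.
    prompt_tokens, completion_tokens = 100, 60
    cache_read, cache_creation = 40, 0
    total_tokens = 160  # as reported by the API

    manual = prompt_tokens + completion_tokens + cache_read + cache_creation  # 200
    if manual > total_tokens:        # 200 > 160: take the subtraction branch
        prompt_tokens -= cache_read  # 100 - 40 = 60 non-cached input tokens
    # The span then records: 60 non-cached input, 60 output, 40 cache-read,
    # and 0 cache-creation tokens.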
judgeval/tracer/llm/llm_openai/responses.py

@@ -24,6 +24,10 @@ from judgeval.utils.wrappers import (
     immutable_wrap_sync_iterator,
     immutable_wrap_async_iterator,
 )
+from judgeval.tracer.llm.llm_openai.utils import (
+    openai_tokens_converter,
+    set_cost_attribute,
+)
 
 if TYPE_CHECKING:
     from judgeval.tracer import Tracer
@@ -56,13 +60,13 @@ def _wrap_responses_non_streaming_sync(
         ctx["span"] = tracer.get_tracer().start_span(
             "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
         )
-        tracer.add_agent_attributes_to_span(ctx["span"])
+        tracer._inject_judgment_context(ctx["span"])
         set_span_attribute(
             ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
         )
         ctx["model_name"] = kwargs.get("model", "")
         set_span_attribute(
-            ctx["span"], AttributeKeys.GEN_AI_REQUEST_MODEL, ctx["model_name"]
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
         )
 
     def post_hook(ctx: Dict[str, Any], result: Response) -> None:
@@ -80,17 +84,30 @@ def _wrap_responses_non_streaming_sync(
         completion_tokens = usage_data.output_tokens or 0
         cache_read = usage_data.input_tokens_details.cached_tokens or 0
 
+        set_cost_attribute(span, usage_data)
+        prompt_tokens, completion_tokens, cache_read, cache_creation = (
+            openai_tokens_converter(
+                prompt_tokens,
+                completion_tokens,
+                cache_read,
+                0,
+                usage_data.total_tokens,
+            )
+        )
+
         set_span_attribute(
-            span, AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS, prompt_tokens
+            span,
+            AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+            prompt_tokens,
         )
         set_span_attribute(
-            span, AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS, completion_tokens
+            span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
         )
         set_span_attribute(
-            span, AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
+            span, AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
         )
         set_span_attribute(
-            span, AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
+            span, AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
         )
         set_span_attribute(
             span,
@@ -101,7 +118,7 @@ def _wrap_responses_non_streaming_sync(
         if hasattr(result, "model"):
             set_span_attribute(
                 span,
-                AttributeKeys.GEN_AI_RESPONSE_MODEL,
+                AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
                 result.model or ctx["model_name"],
             )
 
@@ -131,13 +148,13 @@ def _wrap_responses_streaming_sync(
         ctx["span"] = tracer.get_tracer().start_span(
             "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
         )
-        tracer.add_agent_attributes_to_span(ctx["span"])
+        tracer._inject_judgment_context(ctx["span"])
         set_span_attribute(
             ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
         )
         ctx["model_name"] = kwargs.get("model", "")
         set_span_attribute(
-            ctx["span"], AttributeKeys.GEN_AI_REQUEST_MODEL, ctx["model_name"]
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
         )
         ctx["accumulated_content"] = ""
 
@@ -167,6 +184,7 @@ def _wrap_responses_streaming_sync(
            ):
                prompt_tokens = chunk.response.usage.input_tokens or 0
                completion_tokens = chunk.response.usage.output_tokens or 0
+               total_tokens = chunk.response.usage.total_tokens or 0
                # Safely access nested cached_tokens
                input_tokens_details = getattr(
                    chunk.response.usage, "input_tokens_details", None
@@ -177,21 +195,36 @@ def _wrap_responses_streaming_sync(
                    else 0
                )
 
+               set_cost_attribute(span, chunk.response.usage)
+               prompt_tokens, completion_tokens, cache_read, cache_creation = (
+                   openai_tokens_converter(
+                       prompt_tokens,
+                       completion_tokens,
+                       cache_read,
+                       0,
+                       total_tokens,
+                   )
+               )
+
                set_span_attribute(
-                   span, AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS, prompt_tokens
+                   span,
+                   AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+                   prompt_tokens,
                )
                set_span_attribute(
                    span,
-                   AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS,
+                   AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS,
                    completion_tokens,
                )
                set_span_attribute(
                    span,
-                   AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS,
+                   AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
                    cache_read,
                )
                set_span_attribute(
-                   span, AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
+                   span,
+                   AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
+                   0,
                )
                set_span_attribute(
                    span,
@@ -260,13 +293,13 @@ def _wrap_responses_non_streaming_async(
         ctx["span"] = tracer.get_tracer().start_span(
             "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
         )
-        tracer.add_agent_attributes_to_span(ctx["span"])
+        tracer._inject_judgment_context(ctx["span"])
         set_span_attribute(
             ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
         )
         ctx["model_name"] = kwargs.get("model", "")
         set_span_attribute(
-            ctx["span"], AttributeKeys.GEN_AI_REQUEST_MODEL, ctx["model_name"]
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
         )
 
     def post_hook(ctx: Dict[str, Any], result: Response) -> None:
@@ -284,17 +317,30 @@ def _wrap_responses_non_streaming_async(
         completion_tokens = usage_data.output_tokens or 0
         cache_read = usage_data.input_tokens_details.cached_tokens or 0
 
+        set_cost_attribute(span, usage_data)
+        prompt_tokens, completion_tokens, cache_read, cache_creation = (
+            openai_tokens_converter(
+                prompt_tokens,
+                completion_tokens,
+                cache_read,
+                0,
+                usage_data.total_tokens,
+            )
+        )
+
         set_span_attribute(
-            span, AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS, prompt_tokens
+            span,
+            AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+            prompt_tokens,
         )
         set_span_attribute(
-            span, AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS, completion_tokens
+            span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
         )
         set_span_attribute(
-            span, AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
+            span, AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
         )
         set_span_attribute(
-            span, AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
+            span, AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
         )
         set_span_attribute(
             span,
@@ -305,7 +351,7 @@ def _wrap_responses_non_streaming_async(
         if hasattr(result, "model"):
             set_span_attribute(
                 span,
-                AttributeKeys.GEN_AI_RESPONSE_MODEL,
+                AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
                 result.model or ctx["model_name"],
             )
 
@@ -335,13 +381,13 @@ def _wrap_responses_streaming_async(
         ctx["span"] = tracer.get_tracer().start_span(
             "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
         )
-        tracer.add_agent_attributes_to_span(ctx["span"])
+        tracer._inject_judgment_context(ctx["span"])
         set_span_attribute(
             ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
         )
         ctx["model_name"] = kwargs.get("model", "")
         set_span_attribute(
-            ctx["span"], AttributeKeys.GEN_AI_REQUEST_MODEL, ctx["model_name"]
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
         )
         ctx["accumulated_content"] = ""
 
@@ -373,6 +419,7 @@ def _wrap_responses_streaming_async(
            ):
                prompt_tokens = chunk.response.usage.input_tokens or 0
                completion_tokens = chunk.response.usage.output_tokens or 0
+               total_tokens = chunk.response.usage.total_tokens or 0
                # Safely access nested cached_tokens
                input_tokens_details = getattr(
                    chunk.response.usage, "input_tokens_details", None
@@ -383,21 +430,36 @@ def _wrap_responses_streaming_async(
                    else 0
                )
 
+               set_cost_attribute(span, chunk.response.usage)
+               prompt_tokens, completion_tokens, cache_read, cache_creation = (
+                   openai_tokens_converter(
+                       prompt_tokens,
+                       completion_tokens,
+                       cache_read,
+                       0,
+                       total_tokens,
+                   )
+               )
+
                set_span_attribute(
-                   span, AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS, prompt_tokens
+                   span,
+                   AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+                   prompt_tokens,
                )
                set_span_attribute(
                    span,
-                   AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS,
+                   AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS,
                    completion_tokens,
                )
                set_span_attribute(
                    span,
-                   AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS,
+                   AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
                    cache_read,
                )
                set_span_attribute(
-                   span, AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
+                   span,
+                   AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
+                   0,
                )
                set_span_attribute(
                    span,
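
The responses.py wrappers mirror chat_completions.py; the substantive difference is the usage shape (input_tokens/output_tokens with input_tokens_details here, versus prompt_tokens/completion_tokens with prompt_tokens_details for Chat Completions). A hypothetical helper, not part of this diff, sketching the extraction both wrapper families perform before calling openai_tokens_converter:

    from typing import Any, Tuple

    def extract_usage(usage: Any) -> Tuple[int, int, int, int]:
        """Return (input, output, cached, total) for either usage shape."""
        if hasattr(usage, "prompt_tokens"):  # Chat Completions shape
            prompt = usage.prompt_tokens or 0
            output = usage.completion_tokens or 0
            details = getattr(usage, "prompt_tokens_details", None)
        else:  # Responses shape
            prompt = usage.input_tokens or 0
            output = usage.output_tokens or 0
            details = getattr(usage, "input_tokens_details", None)
        cached = (details.cached_tokens or 0) if details else 0
        return prompt, output, cached, usage.total_tokens or 0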
judgeval/tracer/llm/llm_openai/utils.py (new file)

@@ -0,0 +1,42 @@
+from typing import Any
+from opentelemetry.trace import Span
+from judgeval.tracer.keys import AttributeKeys
+from judgeval.tracer.utils import set_span_attribute
+from judgeval.utils.serialize import safe_serialize
+
+
+def openai_tokens_converter(
+    prompt_tokens: int,
+    completion_tokens: int,
+    cache_read: int,
+    cache_creation: int,
+    total_tokens: int,
+) -> tuple[int, int, int, int]:
+    """
+    Returns:
+        tuple[int, int, int, int]:
+            - judgment.usage.non_cached_input
+            - judgment.usage.output_tokens
+            - judgment.usage.cached_input_tokens
+            - judgment.usage.cache_creation_tokens
+    """
+    manual_tokens = prompt_tokens + completion_tokens + cache_read + cache_creation
+
+    if manual_tokens > total_tokens:
+        # This is the openAI case where we need to subtract the cached tokens from the input tokens
+        return prompt_tokens - cache_read, completion_tokens, cache_read, cache_creation
+    else:
+        return prompt_tokens, completion_tokens, cache_read, cache_creation
+
+
+def set_cost_attribute(span: Span, usage_data: Any) -> None:
+    """
+    This is for OpenRouter case where the cost is provided in the usage data when they specify:
+        extra_body={"usage": {"include": True}},
+    """
+    if hasattr(usage_data, "cost") and usage_data.cost:
+        set_span_attribute(
+            span,
+            AttributeKeys.JUDGMENT_USAGE_TOTAL_COST_USD,
+            safe_serialize(usage_data.cost),
+        )
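
A condensed, self-contained usage sketch of the two helpers above, with a SimpleNamespace standing in for a Chat Completions usage payload and illustrative numbers:

    from types import SimpleNamespace
    from judgeval.tracer.llm.llm_openai.utils import openai_tokens_converter

    usage_data = SimpleNamespace(
        prompt_tokens=100,  # includes the cached tokens
        completion_tokens=60,
        prompt_tokens_details=SimpleNamespace(cached_tokens=40),
        total_tokens=160,
        cost=None,  # OpenRouter would populate this when usage include is requested
    )

    non_cached, output, cached, created = openai_tokens_converter(
        usage_data.prompt_tokens,
        usage_data.completion_tokens,
        usage_data.prompt_tokens_details.cached_tokens,
        0,  # OpenAI reports no cache-creation tokens
        usage_data.total_tokens,
    )
    assert (non_cached, output, cached, created) == (60, 60, 40, 0)
    # set_cost_attribute(span, usage_data) would be a no-op here since cost is None.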