judgeval 0.16.9__py3-none-any.whl → 0.22.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of judgeval might be problematic; consult the registry's advisory page for more details.

Files changed (37):
  1. judgeval/__init__.py +32 -2
  2. judgeval/api/__init__.py +108 -0
  3. judgeval/api/api_types.py +76 -15
  4. judgeval/cli.py +16 -1
  5. judgeval/data/judgment_types.py +76 -20
  6. judgeval/dataset/__init__.py +11 -2
  7. judgeval/env.py +2 -11
  8. judgeval/evaluation/__init__.py +4 -0
  9. judgeval/prompt/__init__.py +330 -0
  10. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +1 -13
  11. judgeval/tracer/__init__.py +371 -257
  12. judgeval/tracer/constants.py +1 -1
  13. judgeval/tracer/exporters/store.py +32 -16
  14. judgeval/tracer/keys.py +11 -9
  15. judgeval/tracer/llm/llm_anthropic/messages.py +38 -26
  16. judgeval/tracer/llm/llm_anthropic/messages_stream.py +14 -14
  17. judgeval/tracer/llm/llm_google/generate_content.py +9 -7
  18. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +38 -14
  19. judgeval/tracer/llm/llm_openai/chat_completions.py +90 -26
  20. judgeval/tracer/llm/llm_openai/responses.py +88 -26
  21. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  22. judgeval/tracer/llm/llm_together/chat_completions.py +26 -18
  23. judgeval/tracer/managers.py +4 -0
  24. judgeval/trainer/__init__.py +10 -1
  25. judgeval/trainer/base_trainer.py +122 -0
  26. judgeval/trainer/config.py +1 -1
  27. judgeval/trainer/fireworks_trainer.py +396 -0
  28. judgeval/trainer/trainer.py +52 -387
  29. judgeval/utils/guards.py +9 -5
  30. judgeval/utils/project.py +15 -0
  31. judgeval/utils/serialize.py +2 -2
  32. judgeval/version.py +1 -1
  33. {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/METADATA +2 -3
  34. {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/RECORD +37 -32
  35. {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
  36. {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/entry_points.txt +0 -0
  37. {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
@@ -1 +1 @@
1
- JUDGEVAL_TRACER_INSTRUMENTING_MODULE_NAME = "opentelemetry.instrumentation.judgeval"
1
+ JUDGEVAL_TRACER_INSTRUMENTING_MODULE_NAME = "judgeval"
@@ -1,5 +1,5 @@
1
1
  from abc import ABC, abstractmethod
2
- from typing import List
2
+ from typing import List, Dict
3
3
 
4
4
  from opentelemetry.sdk.trace import ReadableSpan
5
5
 
@@ -9,35 +9,51 @@ class ABCSpanStore(ABC):
9
9
  def add(self, *spans: ReadableSpan): ...
10
10
 
11
11
  @abstractmethod
12
- def get(self, id: str) -> ReadableSpan: ...
12
+ def get_all(self) -> List[ReadableSpan]: ...
13
13
 
14
14
  @abstractmethod
15
- def get_all(self) -> List[ReadableSpan]: ...
15
+ def get_by_trace_id(self, trace_id: str) -> List[ReadableSpan]: ...
16
+
17
+ @abstractmethod
18
+ def clear_trace(self, trace_id: str): ...
16
19
 
17
20
 
18
21
  class SpanStore(ABCSpanStore):
19
- __slots__ = ("spans",)
22
+ __slots__ = ("_spans_by_trace",)
20
23
 
21
- spans: List[ReadableSpan]
24
+ _spans_by_trace: Dict[str, List[ReadableSpan]]
22
25
 
23
26
  def __init__(self):
24
- self.spans = []
27
+ self._spans_by_trace = {}
25
28
 
26
29
  def add(self, *spans: ReadableSpan):
27
- self.spans.extend(spans)
28
-
29
- def get(self, id: str) -> ReadableSpan:
30
- for span in self.spans:
30
+ for span in spans:
31
31
  context = span.get_span_context()
32
32
  if context is None:
33
33
  continue
34
- if context.span_id == id:
35
- return span
36
-
37
- raise ValueError(f"Span with id {id} not found")
34
+ # Convert trace_id to hex string per OTEL spec
35
+ trace_id = format(context.trace_id, "032x")
36
+ if trace_id not in self._spans_by_trace:
37
+ self._spans_by_trace[trace_id] = []
38
+ self._spans_by_trace[trace_id].append(span)
38
39
 
39
40
  def get_all(self) -> List[ReadableSpan]:
40
- return self.spans
41
+ all_spans = []
42
+ for spans in self._spans_by_trace.values():
43
+ all_spans.extend(spans)
44
+ return all_spans
45
+
46
+ def get_by_trace_id(self, trace_id: str) -> List[ReadableSpan]:
47
+ """Get all spans for a specific trace ID (32-char hex string)."""
48
+ return self._spans_by_trace.get(trace_id, [])
49
+
50
+ def clear_trace(self, trace_id: str):
51
+ """Clear all spans for a specific trace ID (32-char hex string)."""
52
+ if trace_id in self._spans_by_trace:
53
+ del self._spans_by_trace[trace_id]
41
54
 
42
55
  def __repr__(self) -> str:
43
- return f"SpanStore(spans={self.spans})"
56
+ total_spans = sum(len(spans) for spans in self._spans_by_trace.values())
57
+ return (
58
+ f"SpanStore(traces={len(self._spans_by_trace)}, total_spans={total_spans})"
59
+ )
judgeval/tracer/keys.py CHANGED
@@ -26,18 +26,19 @@ class AttributeKeys(str, Enum):
26
26
 
27
27
  PENDING_TRACE_EVAL = "judgment.pending_trace_eval"
28
28
 
29
+ JUDGMENT_LLM_PROVIDER = "judgment.llm.provider"
30
+ JUDGMENT_LLM_MODEL_NAME = "judgment.llm.model"
31
+ JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS = "judgment.usage.non_cached_input_tokens"
32
+ JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS = (
33
+ "judgment.usage.cache_creation_input_tokens"
34
+ )
35
+ JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS = "judgment.usage.cache_read_input_tokens"
36
+ JUDGMENT_USAGE_OUTPUT_TOKENS = "judgment.usage.output_tokens"
37
+ JUDGMENT_USAGE_TOTAL_COST_USD = "judgment.usage.total_cost_usd"
38
+
29
39
  GEN_AI_PROMPT = "gen_ai.prompt"
30
40
  GEN_AI_COMPLETION = "gen_ai.completion"
31
- GEN_AI_REQUEST_MODEL = "gen_ai.request.model"
32
- GEN_AI_RESPONSE_MODEL = "gen_ai.response.model"
33
41
  GEN_AI_SYSTEM = "gen_ai.system"
34
- GEN_AI_USAGE_INPUT_TOKENS = "gen_ai.usage.input_tokens"
35
- GEN_AI_USAGE_OUTPUT_TOKENS = "gen_ai.usage.output_tokens"
36
- GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS = (
37
- "gen_ai.usage.cache_creation_input_tokens"
38
- )
39
- GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS = "gen_ai.usage.cache_read_input_tokens"
40
-
41
42
  GEN_AI_REQUEST_TEMPERATURE = "gen_ai.request.temperature"
42
43
  GEN_AI_REQUEST_MAX_TOKENS = "gen_ai.request.max_tokens"
43
44
  GEN_AI_RESPONSE_FINISH_REASONS = "gen_ai.response.finish_reasons"
@@ -51,6 +52,7 @@ class InternalAttributeKeys(str, Enum):
51
52
 
52
53
  DISABLE_PARTIAL_EMIT = "disable_partial_emit"
53
54
  CANCELLED = "cancelled"
55
+ IS_CUSTOMER_CONTEXT_OWNER = "is_customer_context_owner"
54
56
 
55
57
 
56
58
  class ResourceKeys(str, Enum):
@@ -89,13 +89,13 @@ def _wrap_non_streaming_sync(
89
89
  ctx["span"] = tracer.get_tracer().start_span(
90
90
  "ANTHROPIC_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
91
91
  )
92
- tracer.add_agent_attributes_to_span(ctx["span"])
92
+ tracer._inject_judgment_context(ctx["span"])
93
93
  set_span_attribute(
94
94
  ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
95
95
  )
96
96
  ctx["model_name"] = kwargs.get("model", "")
97
97
  set_span_attribute(
98
- ctx["span"], AttributeKeys.GEN_AI_REQUEST_MODEL, ctx["model_name"]
98
+ ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
99
99
  )
100
100
 
101
101
  def post_hook(ctx: Dict[str, Any], result: Message) -> None:
@@ -112,17 +112,19 @@ def _wrap_non_streaming_sync(
112
112
  _extract_anthropic_tokens(result.usage)
113
113
  )
114
114
  set_span_attribute(
115
- span, AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS, prompt_tokens
115
+ span,
116
+ AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
117
+ prompt_tokens,
116
118
  )
117
119
  set_span_attribute(
118
- span, AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS, completion_tokens
120
+ span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
119
121
  )
120
122
  set_span_attribute(
121
- span, AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
123
+ span, AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
122
124
  )
123
125
  set_span_attribute(
124
126
  span,
125
- AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS,
127
+ AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
126
128
  cache_creation,
127
129
  )
128
130
  set_span_attribute(
@@ -133,7 +135,7 @@ def _wrap_non_streaming_sync(
133
135
 
134
136
  set_span_attribute(
135
137
  span,
136
- AttributeKeys.GEN_AI_RESPONSE_MODEL,
138
+ AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
137
139
  result.model,
138
140
  )
139
141
 
@@ -163,13 +165,13 @@ def _wrap_streaming_sync(
163
165
  ctx["span"] = tracer.get_tracer().start_span(
164
166
  "ANTHROPIC_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
165
167
  )
166
- tracer.add_agent_attributes_to_span(ctx["span"])
168
+ tracer._inject_judgment_context(ctx["span"])
167
169
  set_span_attribute(
168
170
  ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
169
171
  )
170
172
  ctx["model_name"] = kwargs.get("model", "")
171
173
  set_span_attribute(
172
- ctx["span"], AttributeKeys.GEN_AI_REQUEST_MODEL, ctx["model_name"]
174
+ ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
173
175
  )
174
176
  ctx["accumulated_content"] = ""
175
177
 
@@ -197,17 +199,21 @@ def _wrap_streaming_sync(
197
199
  _extract_anthropic_tokens(usage_data)
198
200
  )
199
201
  set_span_attribute(
200
- span, AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS, prompt_tokens
202
+ span,
203
+ AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
204
+ prompt_tokens,
201
205
  )
202
206
  set_span_attribute(
203
- span, AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS, completion_tokens
207
+ span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
204
208
  )
205
209
  set_span_attribute(
206
- span, AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
210
+ span,
211
+ AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
212
+ cache_read,
207
213
  )
208
214
  set_span_attribute(
209
215
  span,
210
- AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS,
216
+ AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
211
217
  cache_creation,
212
218
  )
213
219
  set_span_attribute(
@@ -273,13 +279,13 @@ def _wrap_non_streaming_async(
273
279
  ctx["span"] = tracer.get_tracer().start_span(
274
280
  "ANTHROPIC_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
275
281
  )
276
- tracer.add_agent_attributes_to_span(ctx["span"])
282
+ tracer._inject_judgment_context(ctx["span"])
277
283
  set_span_attribute(
278
284
  ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
279
285
  )
280
286
  ctx["model_name"] = kwargs.get("model", "")
281
287
  set_span_attribute(
282
- ctx["span"], AttributeKeys.GEN_AI_REQUEST_MODEL, ctx["model_name"]
288
+ ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
283
289
  )
284
290
 
285
291
  def post_hook(ctx: Dict[str, Any], result: Message) -> None:
@@ -296,17 +302,19 @@ def _wrap_non_streaming_async(
296
302
  _extract_anthropic_tokens(result.usage)
297
303
  )
298
304
  set_span_attribute(
299
- span, AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS, prompt_tokens
305
+ span,
306
+ AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
307
+ prompt_tokens,
300
308
  )
301
309
  set_span_attribute(
302
- span, AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS, completion_tokens
310
+ span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
303
311
  )
304
312
  set_span_attribute(
305
- span, AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
313
+ span, AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
306
314
  )
307
315
  set_span_attribute(
308
316
  span,
309
- AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS,
317
+ AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
310
318
  cache_creation,
311
319
  )
312
320
  set_span_attribute(
@@ -317,7 +325,7 @@ def _wrap_non_streaming_async(
317
325
 
318
326
  set_span_attribute(
319
327
  span,
320
- AttributeKeys.GEN_AI_RESPONSE_MODEL,
328
+ AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
321
329
  result.model,
322
330
  )
323
331
 
@@ -348,13 +356,13 @@ def _wrap_streaming_async(
348
356
  ctx["span"] = tracer.get_tracer().start_span(
349
357
  "ANTHROPIC_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
350
358
  )
351
- tracer.add_agent_attributes_to_span(ctx["span"])
359
+ tracer._inject_judgment_context(ctx["span"])
352
360
  set_span_attribute(
353
361
  ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
354
362
  )
355
363
  ctx["model_name"] = kwargs.get("model", "")
356
364
  set_span_attribute(
357
- ctx["span"], AttributeKeys.GEN_AI_REQUEST_MODEL, ctx["model_name"]
365
+ ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
358
366
  )
359
367
  ctx["accumulated_content"] = ""
360
368
 
@@ -382,17 +390,21 @@ def _wrap_streaming_async(
382
390
  _extract_anthropic_tokens(usage_data)
383
391
  )
384
392
  set_span_attribute(
385
- span, AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS, prompt_tokens
393
+ span,
394
+ AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
395
+ prompt_tokens,
386
396
  )
387
397
  set_span_attribute(
388
- span, AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS, completion_tokens
398
+ span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
389
399
  )
390
400
  set_span_attribute(
391
- span, AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
401
+ span,
402
+ AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
403
+ cache_read,
392
404
  )
393
405
  set_span_attribute(
394
406
  span,
395
- AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS,
407
+ AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
396
408
  cache_creation,
397
409
  )
398
410
  set_span_attribute(
@@ -37,14 +37,14 @@ def wrap_messages_stream_sync(tracer: Tracer, client: Anthropic) -> None:
37
37
  ctx["span"] = tracer.get_tracer().start_span(
38
38
  "ANTHROPIC_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
39
39
  )
40
- tracer.add_agent_attributes_to_span(ctx["span"])
40
+ tracer._inject_judgment_context(ctx["span"])
41
41
  set_span_attribute(
42
42
  ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
43
43
  )
44
44
 
45
45
  ctx["model_name"] = kwargs.get("model", "")
46
46
  set_span_attribute(
47
- ctx["span"], AttributeKeys.GEN_AI_REQUEST_MODEL, ctx["model_name"]
47
+ ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
48
48
  )
49
49
  ctx["accumulated_content"] = ""
50
50
 
@@ -125,22 +125,22 @@ def wrap_messages_stream_sync(tracer: Tracer, client: Anthropic) -> None:
125
125
  ) = _extract_anthropic_tokens(final_message.usage)
126
126
  set_span_attribute(
127
127
  span,
128
- AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS,
128
+ AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
129
129
  prompt_tokens,
130
130
  )
131
131
  set_span_attribute(
132
132
  span,
133
- AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS,
133
+ AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS,
134
134
  completion_tokens,
135
135
  )
136
136
  set_span_attribute(
137
137
  span,
138
- AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS,
138
+ AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
139
139
  cache_read,
140
140
  )
141
141
  set_span_attribute(
142
142
  span,
143
- AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS,
143
+ AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
144
144
  cache_creation,
145
145
  )
146
146
  set_span_attribute(
@@ -151,7 +151,7 @@ def wrap_messages_stream_sync(tracer: Tracer, client: Anthropic) -> None:
151
151
 
152
152
  set_span_attribute(
153
153
  span,
154
- AttributeKeys.GEN_AI_RESPONSE_MODEL,
154
+ AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
155
155
  final_message.model,
156
156
  )
157
157
  except Exception:
@@ -183,14 +183,14 @@ def wrap_messages_stream_async(tracer: Tracer, client: AsyncAnthropic) -> None:
183
183
  ctx["span"] = tracer.get_tracer().start_span(
184
184
  "ANTHROPIC_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
185
185
  )
186
- tracer.add_agent_attributes_to_span(ctx["span"])
186
+ tracer._inject_judgment_context(ctx["span"])
187
187
  set_span_attribute(
188
188
  ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
189
189
  )
190
190
 
191
191
  ctx["model_name"] = kwargs.get("model", "")
192
192
  set_span_attribute(
193
- ctx["span"], AttributeKeys.GEN_AI_REQUEST_MODEL, ctx["model_name"]
193
+ ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
194
194
  )
195
195
  ctx["accumulated_content"] = ""
196
196
 
@@ -271,22 +271,22 @@ def wrap_messages_stream_async(tracer: Tracer, client: AsyncAnthropic) -> None:
271
271
  ) = _extract_anthropic_tokens(final_message.usage)
272
272
  set_span_attribute(
273
273
  span,
274
- AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS,
274
+ AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
275
275
  prompt_tokens,
276
276
  )
277
277
  set_span_attribute(
278
278
  span,
279
- AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS,
279
+ AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS,
280
280
  completion_tokens,
281
281
  )
282
282
  set_span_attribute(
283
283
  span,
284
- AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS,
284
+ AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
285
285
  cache_read,
286
286
  )
287
287
  set_span_attribute(
288
288
  span,
289
- AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS,
289
+ AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
290
290
  cache_creation,
291
291
  )
292
292
  set_span_attribute(
@@ -297,7 +297,7 @@ def wrap_messages_stream_async(tracer: Tracer, client: AsyncAnthropic) -> None:
297
297
 
298
298
  set_span_attribute(
299
299
  span,
300
- AttributeKeys.GEN_AI_RESPONSE_MODEL,
300
+ AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
301
301
  final_message.model,
302
302
  )
303
303
  except Exception:
@@ -57,13 +57,13 @@ def wrap_generate_content_sync(tracer: Tracer, client: Client) -> None:
57
57
  ctx["span"] = tracer.get_tracer().start_span(
58
58
  "GOOGLE_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
59
59
  )
60
- tracer.add_agent_attributes_to_span(ctx["span"])
60
+ tracer._inject_judgment_context(ctx["span"])
61
61
  set_span_attribute(
62
62
  ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
63
63
  )
64
64
  ctx["model_name"] = kwargs.get("model", "")
65
65
  set_span_attribute(
66
- ctx["span"], AttributeKeys.GEN_AI_REQUEST_MODEL, ctx["model_name"]
66
+ ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
67
67
  )
68
68
 
69
69
  def post_hook(ctx: Dict[str, Any], result: GenerateContentResponse) -> None:
@@ -79,17 +79,19 @@ def wrap_generate_content_sync(tracer: Tracer, client: Client) -> None:
79
79
  _extract_google_tokens(usage_data)
80
80
  )
81
81
  set_span_attribute(
82
- span, AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS, prompt_tokens
82
+ span,
83
+ AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
84
+ prompt_tokens,
83
85
  )
84
86
  set_span_attribute(
85
- span, AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS, completion_tokens
87
+ span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
86
88
  )
87
89
  set_span_attribute(
88
- span, AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
90
+ span, AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
89
91
  )
90
92
  set_span_attribute(
91
93
  span,
92
- AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS,
94
+ AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
93
95
  cache_creation,
94
96
  )
95
97
  set_span_attribute(
@@ -100,7 +102,7 @@ def wrap_generate_content_sync(tracer: Tracer, client: Client) -> None:
100
102
 
101
103
  set_span_attribute(
102
104
  span,
103
- AttributeKeys.GEN_AI_RESPONSE_MODEL,
105
+ AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
104
106
  result.model_version if result.model_version else ctx["model_name"],
105
107
  )
106
108
 
@@ -16,6 +16,7 @@ from judgeval.utils.wrappers import (
16
16
  immutable_wrap_sync,
17
17
  immutable_wrap_async,
18
18
  )
19
+ from judgeval.tracer.llm.llm_openai.utils import openai_tokens_converter
19
20
 
20
21
  if TYPE_CHECKING:
21
22
  from judgeval.tracer import Tracer
@@ -39,13 +40,13 @@ def _wrap_beta_non_streaming_sync(
39
40
  ctx["span"] = tracer.get_tracer().start_span(
40
41
  "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
41
42
  )
42
- tracer.add_agent_attributes_to_span(ctx["span"])
43
+ tracer._inject_judgment_context(ctx["span"])
43
44
  set_span_attribute(
44
45
  ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
45
46
  )
46
47
  ctx["model_name"] = kwargs.get("model", "")
47
48
  set_span_attribute(
48
- ctx["span"], AttributeKeys.GEN_AI_REQUEST_MODEL, ctx["model_name"]
49
+ ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
49
50
  )
50
51
 
51
52
  def post_hook(ctx: Dict[str, Any], result: ParsedChatCompletion[T]) -> None:
@@ -66,17 +67,29 @@ def _wrap_beta_non_streaming_sync(
66
67
  if prompt_tokens_details:
67
68
  cache_read = prompt_tokens_details.cached_tokens or 0
68
69
 
70
+ prompt_tokens, completion_tokens, cache_read, cache_creation = (
71
+ openai_tokens_converter(
72
+ prompt_tokens,
73
+ completion_tokens,
74
+ cache_read,
75
+ 0,
76
+ usage_data.total_tokens,
77
+ )
78
+ )
79
+
69
80
  set_span_attribute(
70
- span, AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS, prompt_tokens
81
+ span,
82
+ AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
83
+ prompt_tokens,
71
84
  )
72
85
  set_span_attribute(
73
- span, AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS, completion_tokens
86
+ span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
74
87
  )
75
88
  set_span_attribute(
76
- span, AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
89
+ span, AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
77
90
  )
78
91
  set_span_attribute(
79
- span, AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
92
+ span, AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
80
93
  )
81
94
  set_span_attribute(
82
95
  span,
@@ -86,7 +99,7 @@ def _wrap_beta_non_streaming_sync(
86
99
 
87
100
  set_span_attribute(
88
101
  span,
89
- AttributeKeys.GEN_AI_RESPONSE_MODEL,
102
+ AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
90
103
  result.model or ctx["model_name"],
91
104
  )
92
105
 
@@ -122,13 +135,13 @@ def _wrap_beta_non_streaming_async(
122
135
  ctx["span"] = tracer.get_tracer().start_span(
123
136
  "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
124
137
  )
125
- tracer.add_agent_attributes_to_span(ctx["span"])
138
+ tracer._inject_judgment_context(ctx["span"])
126
139
  set_span_attribute(
127
140
  ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
128
141
  )
129
142
  ctx["model_name"] = kwargs.get("model", "")
130
143
  set_span_attribute(
131
- ctx["span"], AttributeKeys.GEN_AI_REQUEST_MODEL, ctx["model_name"]
144
+ ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
132
145
  )
133
146
 
134
147
  def post_hook(ctx: Dict[str, Any], result: ParsedChatCompletion[T]) -> None:
@@ -149,17 +162,28 @@ def _wrap_beta_non_streaming_async(
149
162
  if prompt_tokens_details:
150
163
  cache_read = prompt_tokens_details.cached_tokens or 0
151
164
 
165
+ prompt_tokens, completion_tokens, cache_read, cache_creation = (
166
+ openai_tokens_converter(
167
+ prompt_tokens,
168
+ completion_tokens,
169
+ cache_read,
170
+ 0,
171
+ usage_data.total_tokens,
172
+ )
173
+ )
152
174
  set_span_attribute(
153
- span, AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS, prompt_tokens
175
+ span,
176
+ AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
177
+ prompt_tokens,
154
178
  )
155
179
  set_span_attribute(
156
- span, AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS, completion_tokens
180
+ span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
157
181
  )
158
182
  set_span_attribute(
159
- span, AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
183
+ span, AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
160
184
  )
161
185
  set_span_attribute(
162
- span, AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
186
+ span, AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
163
187
  )
164
188
  set_span_attribute(
165
189
  span,
@@ -169,7 +193,7 @@ def _wrap_beta_non_streaming_async(
169
193
 
170
194
  set_span_attribute(
171
195
  span,
172
- AttributeKeys.GEN_AI_RESPONSE_MODEL,
196
+ AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
173
197
  result.model or ctx["model_name"],
174
198
  )
175
199