judgeval 0.16.9__py3-none-any.whl → 0.22.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of judgeval was flagged as possibly problematic by the registry scanner.
- judgeval/__init__.py +32 -2
- judgeval/api/__init__.py +108 -0
- judgeval/api/api_types.py +76 -15
- judgeval/cli.py +16 -1
- judgeval/data/judgment_types.py +76 -20
- judgeval/dataset/__init__.py +11 -2
- judgeval/env.py +2 -11
- judgeval/evaluation/__init__.py +4 -0
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +1 -13
- judgeval/tracer/__init__.py +371 -257
- judgeval/tracer/constants.py +1 -1
- judgeval/tracer/exporters/store.py +32 -16
- judgeval/tracer/keys.py +11 -9
- judgeval/tracer/llm/llm_anthropic/messages.py +38 -26
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +14 -14
- judgeval/tracer/llm/llm_google/generate_content.py +9 -7
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +38 -14
- judgeval/tracer/llm/llm_openai/chat_completions.py +90 -26
- judgeval/tracer/llm/llm_openai/responses.py +88 -26
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +26 -18
- judgeval/tracer/managers.py +4 -0
- judgeval/trainer/__init__.py +10 -1
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +1 -1
- judgeval/trainer/fireworks_trainer.py +396 -0
- judgeval/trainer/trainer.py +52 -387
- judgeval/utils/guards.py +9 -5
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +2 -2
- judgeval/version.py +1 -1
- {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/METADATA +2 -3
- {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/RECORD +37 -32
- {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
- {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/entry_points.txt +0 -0
- {judgeval-0.16.9.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
judgeval/tracer/constants.py
CHANGED
@@ -1 +1 @@
-JUDGEVAL_TRACER_INSTRUMENTING_MODULE_NAME = "
+JUDGEVAL_TRACER_INSTRUMENTING_MODULE_NAME = "judgeval"
judgeval/tracer/exporters/store.py
CHANGED
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import List
+from typing import List, Dict

 from opentelemetry.sdk.trace import ReadableSpan

@@ -9,35 +9,51 @@ class ABCSpanStore(ABC):
     def add(self, *spans: ReadableSpan): ...

     @abstractmethod
-    def 
+    def get_all(self) -> List[ReadableSpan]: ...

     @abstractmethod
-    def 
+    def get_by_trace_id(self, trace_id: str) -> List[ReadableSpan]: ...
+
+    @abstractmethod
+    def clear_trace(self, trace_id: str): ...


 class SpanStore(ABCSpanStore):
-    __slots__ = ("
+    __slots__ = ("_spans_by_trace",)

-
+    _spans_by_trace: Dict[str, List[ReadableSpan]]

     def __init__(self):
-        self.
+        self._spans_by_trace = {}

     def add(self, *spans: ReadableSpan):
-
-
-    def get(self, id: str) -> ReadableSpan:
-        for span in self.spans:
+        for span in spans:
             context = span.get_span_context()
             if context is None:
                 continue
-
-
-
-
+            # Convert trace_id to hex string per OTEL spec
+            trace_id = format(context.trace_id, "032x")
+            if trace_id not in self._spans_by_trace:
+                self._spans_by_trace[trace_id] = []
+            self._spans_by_trace[trace_id].append(span)

     def get_all(self) -> List[ReadableSpan]:
-
+        all_spans = []
+        for spans in self._spans_by_trace.values():
+            all_spans.extend(spans)
+        return all_spans
+
+    def get_by_trace_id(self, trace_id: str) -> List[ReadableSpan]:
+        """Get all spans for a specific trace ID (32-char hex string)."""
+        return self._spans_by_trace.get(trace_id, [])
+
+    def clear_trace(self, trace_id: str):
+        """Clear all spans for a specific trace ID (32-char hex string)."""
+        if trace_id in self._spans_by_trace:
+            del self._spans_by_trace[trace_id]

     def __repr__(self) -> str:
-
+        total_spans = sum(len(spans) for spans in self._spans_by_trace.values())
+        return (
+            f"SpanStore(traces={len(self._spans_by_trace)}, total_spans={total_spans})"
+        )
judgeval/tracer/keys.py
CHANGED
@@ -26,18 +26,19 @@ class AttributeKeys(str, Enum):

     PENDING_TRACE_EVAL = "judgment.pending_trace_eval"

+    JUDGMENT_LLM_PROVIDER = "judgment.llm.provider"
+    JUDGMENT_LLM_MODEL_NAME = "judgment.llm.model"
+    JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS = "judgment.usage.non_cached_input_tokens"
+    JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS = (
+        "judgment.usage.cache_creation_input_tokens"
+    )
+    JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS = "judgment.usage.cache_read_input_tokens"
+    JUDGMENT_USAGE_OUTPUT_TOKENS = "judgment.usage.output_tokens"
+    JUDGMENT_USAGE_TOTAL_COST_USD = "judgment.usage.total_cost_usd"
+
     GEN_AI_PROMPT = "gen_ai.prompt"
     GEN_AI_COMPLETION = "gen_ai.completion"
-    GEN_AI_REQUEST_MODEL = "gen_ai.request.model"
-    GEN_AI_RESPONSE_MODEL = "gen_ai.response.model"
     GEN_AI_SYSTEM = "gen_ai.system"
-    GEN_AI_USAGE_INPUT_TOKENS = "gen_ai.usage.input_tokens"
-    GEN_AI_USAGE_OUTPUT_TOKENS = "gen_ai.usage.output_tokens"
-    GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS = (
-        "gen_ai.usage.cache_creation_input_tokens"
-    )
-    GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS = "gen_ai.usage.cache_read_input_tokens"
-
     GEN_AI_REQUEST_TEMPERATURE = "gen_ai.request.temperature"
     GEN_AI_REQUEST_MAX_TOKENS = "gen_ai.request.max_tokens"
     GEN_AI_RESPONSE_FINISH_REASONS = "gen_ai.response.finish_reasons"

@@ -51,6 +52,7 @@ class InternalAttributeKeys(str, Enum):

     DISABLE_PARTIAL_EMIT = "disable_partial_emit"
     CANCELLED = "cancelled"
+    IS_CUSTOMER_CONTEXT_OWNER = "is_customer_context_owner"


 class ResourceKeys(str, Enum):
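Editor's note: token accounting moves from the gen_ai.usage.* names to a judgment.* namespace here. Because AttributeKeys subclasses both str and Enum, each member's .value is the wire-format attribute name. A small illustration of attaching the new keys to an OpenTelemetry span (the tracer setup below is generic example code, not from this diff):

from opentelemetry import trace
from judgeval.tracer.keys import AttributeKeys

tracer = trace.get_tracer("example")
with tracer.start_as_current_span("llm-call") as span:
    # The str-Enum values are the canonical attribute names
    span.set_attribute(AttributeKeys.JUDGMENT_LLM_MODEL_NAME.value, "my-model")
    span.set_attribute(AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS.value, 128)
    span.set_attribute(AttributeKeys.JUDGMENT_USAGE_TOTAL_COST_USD.value, 0.0042)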
judgeval/tracer/llm/llm_anthropic/messages.py
CHANGED
@@ -89,13 +89,13 @@ def _wrap_non_streaming_sync(
         ctx["span"] = tracer.get_tracer().start_span(
             "ANTHROPIC_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
         )
-        tracer.
+        tracer._inject_judgment_context(ctx["span"])
         set_span_attribute(
             ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
         )
         ctx["model_name"] = kwargs.get("model", "")
         set_span_attribute(
-            ctx["span"], AttributeKeys.
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
         )

     def post_hook(ctx: Dict[str, Any], result: Message) -> None:
@@ -112,17 +112,19 @@ def _wrap_non_streaming_sync(
             _extract_anthropic_tokens(result.usage)
         )
         set_span_attribute(
-            span,
+            span,
+            AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+            prompt_tokens,
         )
         set_span_attribute(
-            span, AttributeKeys.
+            span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
         )
         set_span_attribute(
-            span, AttributeKeys.
+            span, AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
         )
         set_span_attribute(
             span,
-            AttributeKeys.
+            AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
             cache_creation,
         )
         set_span_attribute(
@@ -133,7 +135,7 @@ def _wrap_non_streaming_sync(

         set_span_attribute(
             span,
-            AttributeKeys.
+            AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
             result.model,
         )

@@ -163,13 +165,13 @@ def _wrap_streaming_sync(
         ctx["span"] = tracer.get_tracer().start_span(
             "ANTHROPIC_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
         )
-        tracer.
+        tracer._inject_judgment_context(ctx["span"])
         set_span_attribute(
             ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
         )
         ctx["model_name"] = kwargs.get("model", "")
         set_span_attribute(
-            ctx["span"], AttributeKeys.
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
         )
         ctx["accumulated_content"] = ""

@@ -197,17 +199,21 @@ def _wrap_streaming_sync(
             _extract_anthropic_tokens(usage_data)
         )
         set_span_attribute(
-            span,
+            span,
+            AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+            prompt_tokens,
         )
         set_span_attribute(
-            span, AttributeKeys.
+            span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
         )
         set_span_attribute(
-            span,
+            span,
+            AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
+            cache_read,
         )
         set_span_attribute(
             span,
-            AttributeKeys.
+            AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
             cache_creation,
         )
         set_span_attribute(
@@ -273,13 +279,13 @@ def _wrap_non_streaming_async(
         ctx["span"] = tracer.get_tracer().start_span(
             "ANTHROPIC_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
         )
-        tracer.
+        tracer._inject_judgment_context(ctx["span"])
         set_span_attribute(
             ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
         )
         ctx["model_name"] = kwargs.get("model", "")
         set_span_attribute(
-            ctx["span"], AttributeKeys.
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
         )

     def post_hook(ctx: Dict[str, Any], result: Message) -> None:
@@ -296,17 +302,19 @@ def _wrap_non_streaming_async(
             _extract_anthropic_tokens(result.usage)
         )
         set_span_attribute(
-            span,
+            span,
+            AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+            prompt_tokens,
         )
         set_span_attribute(
-            span, AttributeKeys.
+            span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
         )
         set_span_attribute(
-            span, AttributeKeys.
+            span, AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
         )
         set_span_attribute(
             span,
-            AttributeKeys.
+            AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
             cache_creation,
         )
         set_span_attribute(
@@ -317,7 +325,7 @@ def _wrap_non_streaming_async(

         set_span_attribute(
             span,
-            AttributeKeys.
+            AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
             result.model,
         )

@@ -348,13 +356,13 @@ def _wrap_streaming_async(
         ctx["span"] = tracer.get_tracer().start_span(
             "ANTHROPIC_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
         )
-        tracer.
+        tracer._inject_judgment_context(ctx["span"])
         set_span_attribute(
             ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
         )
         ctx["model_name"] = kwargs.get("model", "")
         set_span_attribute(
-            ctx["span"], AttributeKeys.
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
         )
         ctx["accumulated_content"] = ""

@@ -382,17 +390,21 @@ def _wrap_streaming_async(
             _extract_anthropic_tokens(usage_data)
         )
         set_span_attribute(
-            span,
+            span,
+            AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+            prompt_tokens,
        )
         set_span_attribute(
-            span, AttributeKeys.
+            span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
         )
         set_span_attribute(
-            span,
+            span,
+            AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
+            cache_read,
         )
         set_span_attribute(
             span,
-            AttributeKeys.
+            AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
             cache_creation,
         )
         set_span_attribute(
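Editor's note: every provider wrapper in this release follows the same shape — a pre_hook opens the span, injects judgment context, and records the request model; a post_hook reads token usage off the response. A simplified sketch of that shape (judgeval's real immutable_wrap_sync signature is not shown in this diff, so this is an illustration, not the library's implementation):

from typing import Any, Callable, Dict, TypeVar

R = TypeVar("R")

def wrap_with_hooks(
    func: Callable[..., R],
    pre_hook: Callable[..., None],
    post_hook: Callable[[Dict[str, Any], R], None],
) -> Callable[..., R]:
    def wrapper(*args: Any, **kwargs: Any) -> R:
        ctx: Dict[str, Any] = {}        # shared between the two hooks
        pre_hook(ctx, *args, **kwargs)  # start span, set GEN_AI_PROMPT and model name
        result = func(*args, **kwargs)
        post_hook(ctx, result)          # set judgment.usage.* tokens from the result
        return result
    return wrapper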
judgeval/tracer/llm/llm_anthropic/messages_stream.py
CHANGED
@@ -37,14 +37,14 @@ def wrap_messages_stream_sync(tracer: Tracer, client: Anthropic) -> None:
         ctx["span"] = tracer.get_tracer().start_span(
             "ANTHROPIC_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
         )
-        tracer.
+        tracer._inject_judgment_context(ctx["span"])
         set_span_attribute(
             ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
         )

         ctx["model_name"] = kwargs.get("model", "")
         set_span_attribute(
-            ctx["span"], AttributeKeys.
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
         )
         ctx["accumulated_content"] = ""

@@ -125,22 +125,22 @@ def wrap_messages_stream_sync(tracer: Tracer, client: Anthropic) -> None:
         ) = _extract_anthropic_tokens(final_message.usage)
         set_span_attribute(
             span,
-            AttributeKeys.
+            AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
             prompt_tokens,
         )
         set_span_attribute(
             span,
-            AttributeKeys.
+            AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS,
             completion_tokens,
         )
         set_span_attribute(
             span,
-            AttributeKeys.
+            AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
             cache_read,
         )
         set_span_attribute(
             span,
-            AttributeKeys.
+            AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
             cache_creation,
         )
         set_span_attribute(
@@ -151,7 +151,7 @@ def wrap_messages_stream_sync(tracer: Tracer, client: Anthropic) -> None:

         set_span_attribute(
             span,
-            AttributeKeys.
+            AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
             final_message.model,
         )
     except Exception:
@@ -183,14 +183,14 @@ def wrap_messages_stream_async(tracer: Tracer, client: AsyncAnthropic) -> None:
         ctx["span"] = tracer.get_tracer().start_span(
             "ANTHROPIC_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
         )
-        tracer.
+        tracer._inject_judgment_context(ctx["span"])
         set_span_attribute(
             ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
         )

         ctx["model_name"] = kwargs.get("model", "")
         set_span_attribute(
-            ctx["span"], AttributeKeys.
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
         )
         ctx["accumulated_content"] = ""

@@ -271,22 +271,22 @@ def wrap_messages_stream_async(tracer: Tracer, client: AsyncAnthropic) -> None:
         ) = _extract_anthropic_tokens(final_message.usage)
         set_span_attribute(
             span,
-            AttributeKeys.
+            AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
             prompt_tokens,
         )
         set_span_attribute(
             span,
-            AttributeKeys.
+            AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS,
             completion_tokens,
         )
         set_span_attribute(
             span,
-            AttributeKeys.
+            AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
             cache_read,
         )
         set_span_attribute(
             span,
-            AttributeKeys.
+            AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
             cache_creation,
         )
         set_span_attribute(
@@ -297,7 +297,7 @@ def wrap_messages_stream_async(tracer: Tracer, client: AsyncAnthropic) -> None:

         set_span_attribute(
             span,
-            AttributeKeys.
+            AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
             final_message.model,
         )
     except Exception:
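Editor's note: for streams, the wrapper accumulates text as chunks arrive (ctx["accumulated_content"]) and reads token usage only once the final message is available, since usage is not complete mid-stream. Roughly how that maps onto Anthropic's public streaming API (the judgeval plumbing around it is assumed; the model name is an example value):

from anthropic import Anthropic

client = Anthropic()
accumulated_content = ""
with client.messages.stream(
    model="claude-sonnet-4-5",
    max_tokens=256,
    messages=[{"role": "user", "content": "Hello"}],
) as stream:
    for text in stream.text_stream:
        accumulated_content += text      # mirrors ctx["accumulated_content"]
    final_message = stream.get_final_message()

# Usage lives on the final message, which is why the span attributes above are
# set from _extract_anthropic_tokens(final_message.usage).
usage = final_message.usage  # input_tokens, output_tokens, cache_* counts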
judgeval/tracer/llm/llm_google/generate_content.py
CHANGED
@@ -57,13 +57,13 @@ def wrap_generate_content_sync(tracer: Tracer, client: Client) -> None:
         ctx["span"] = tracer.get_tracer().start_span(
             "GOOGLE_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
         )
-        tracer.
+        tracer._inject_judgment_context(ctx["span"])
         set_span_attribute(
             ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
         )
         ctx["model_name"] = kwargs.get("model", "")
         set_span_attribute(
-            ctx["span"], AttributeKeys.
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
         )

     def post_hook(ctx: Dict[str, Any], result: GenerateContentResponse) -> None:
@@ -79,17 +79,19 @@ def wrap_generate_content_sync(tracer: Tracer, client: Client) -> None:
             _extract_google_tokens(usage_data)
         )
         set_span_attribute(
-            span,
+            span,
+            AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+            prompt_tokens,
         )
         set_span_attribute(
-            span, AttributeKeys.
+            span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
         )
         set_span_attribute(
-            span, AttributeKeys.
+            span, AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
         )
         set_span_attribute(
             span,
-            AttributeKeys.
+            AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
             cache_creation,
         )
         set_span_attribute(
@@ -100,7 +102,7 @@ def wrap_generate_content_sync(tracer: Tracer, client: Client) -> None:

         set_span_attribute(
             span,
-            AttributeKeys.
+            AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
             result.model_version if result.model_version else ctx["model_name"],
         )

judgeval/tracer/llm/llm_openai/beta_chat_completions.py
CHANGED
@@ -16,6 +16,7 @@ from judgeval.utils.wrappers import (
     immutable_wrap_sync,
     immutable_wrap_async,
 )
+from judgeval.tracer.llm.llm_openai.utils import openai_tokens_converter

 if TYPE_CHECKING:
     from judgeval.tracer import Tracer
@@ -39,13 +40,13 @@ def _wrap_beta_non_streaming_sync(
         ctx["span"] = tracer.get_tracer().start_span(
             "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
         )
-        tracer.
+        tracer._inject_judgment_context(ctx["span"])
         set_span_attribute(
             ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
         )
         ctx["model_name"] = kwargs.get("model", "")
         set_span_attribute(
-            ctx["span"], AttributeKeys.
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
         )

     def post_hook(ctx: Dict[str, Any], result: ParsedChatCompletion[T]) -> None:
@@ -66,17 +67,29 @@ def _wrap_beta_non_streaming_sync(
         if prompt_tokens_details:
             cache_read = prompt_tokens_details.cached_tokens or 0

+        prompt_tokens, completion_tokens, cache_read, cache_creation = (
+            openai_tokens_converter(
+                prompt_tokens,
+                completion_tokens,
+                cache_read,
+                0,
+                usage_data.total_tokens,
+            )
+        )
+
         set_span_attribute(
-            span,
+            span,
+            AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+            prompt_tokens,
         )
         set_span_attribute(
-            span, AttributeKeys.
+            span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
         )
         set_span_attribute(
-            span, AttributeKeys.
+            span, AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
         )
         set_span_attribute(
-            span, AttributeKeys.
+            span, AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
         )
         set_span_attribute(
             span,
@@ -86,7 +99,7 @@ def _wrap_beta_non_streaming_sync(

         set_span_attribute(
             span,
-            AttributeKeys.
+            AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
             result.model or ctx["model_name"],
         )

@@ -122,13 +135,13 @@ def _wrap_beta_non_streaming_async(
         ctx["span"] = tracer.get_tracer().start_span(
             "OPENAI_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
         )
-        tracer.
+        tracer._inject_judgment_context(ctx["span"])
         set_span_attribute(
             ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
         )
         ctx["model_name"] = kwargs.get("model", "")
         set_span_attribute(
-            ctx["span"], AttributeKeys.
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
         )

     def post_hook(ctx: Dict[str, Any], result: ParsedChatCompletion[T]) -> None:
@@ -149,17 +162,28 @@ def _wrap_beta_non_streaming_async(
         if prompt_tokens_details:
             cache_read = prompt_tokens_details.cached_tokens or 0

+        prompt_tokens, completion_tokens, cache_read, cache_creation = (
+            openai_tokens_converter(
+                prompt_tokens,
+                completion_tokens,
+                cache_read,
+                0,
+                usage_data.total_tokens,
+            )
+        )
         set_span_attribute(
-            span,
+            span,
+            AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+            prompt_tokens,
         )
         set_span_attribute(
-            span, AttributeKeys.
+            span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
         )
         set_span_attribute(
-            span, AttributeKeys.
+            span, AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
         )
         set_span_attribute(
-            span, AttributeKeys.
+            span, AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS, 0
         )
         set_span_attribute(
             span,
@@ -169,7 +193,7 @@ def _wrap_beta_non_streaming_async(

         set_span_attribute(
             span,
-            AttributeKeys.
+            AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
             result.model or ctx["model_name"],
         )

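Editor's note: the new judgeval/tracer/llm/llm_openai/utils.py (+42 lines) is not shown in this excerpt, but the call sites pin down openai_tokens_converter's shape: it takes (prompt_tokens, completion_tokens, cache_read, cache_creation, total_tokens) and returns a normalized 4-tuple. A hypothetical sketch consistent with those call sites and with the judgment.usage.non_cached_input_tokens key; OpenAI reports prompt_tokens inclusive of cached tokens, so the non-cached count is plausibly the difference:

from typing import Tuple

def openai_tokens_converter(
    prompt_tokens: int,
    completion_tokens: int,
    cache_read: int,
    cache_creation: int,
    total_tokens: int,
) -> Tuple[int, int, int, int]:
    # Assumed semantics, not the published implementation: strip cached reads
    # out of the input count so the judgment.usage.* attributes partition
    # tokens without double counting.
    non_cached_input = max(prompt_tokens - cache_read, 0)
    return non_cached_input, completion_tokens, cache_read, cache_creation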