judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +173 -10
- judgeval/api/__init__.py +523 -0
- judgeval/api/api_types.py +413 -0
- judgeval/cli.py +112 -0
- judgeval/constants.py +7 -30
- judgeval/data/__init__.py +1 -3
- judgeval/data/evaluation_run.py +125 -0
- judgeval/data/example.py +14 -40
- judgeval/data/judgment_types.py +396 -146
- judgeval/data/result.py +11 -18
- judgeval/data/scorer_data.py +3 -26
- judgeval/data/scripts/openapi_transform.py +5 -5
- judgeval/data/trace.py +115 -194
- judgeval/dataset/__init__.py +335 -0
- judgeval/env.py +55 -0
- judgeval/evaluation/__init__.py +346 -0
- judgeval/exceptions.py +28 -0
- judgeval/integrations/langgraph/__init__.py +13 -0
- judgeval/integrations/openlit/__init__.py +51 -0
- judgeval/judges/__init__.py +2 -2
- judgeval/judges/litellm_judge.py +77 -16
- judgeval/judges/together_judge.py +88 -17
- judgeval/judges/utils.py +7 -20
- judgeval/judgment_attribute_keys.py +55 -0
- judgeval/{common/logger.py → logger.py} +24 -8
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/__init__.py +11 -11
- judgeval/scorers/agent_scorer.py +15 -19
- judgeval/scorers/api_scorer.py +21 -23
- judgeval/scorers/base_scorer.py +54 -36
- judgeval/scorers/example_scorer.py +1 -3
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
- judgeval/scorers/score.py +64 -47
- judgeval/scorers/utils.py +2 -107
- judgeval/tracer/__init__.py +1111 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +40 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +59 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +63 -0
- judgeval/tracer/llm/__init__.py +7 -0
- judgeval/tracer/llm/config.py +78 -0
- judgeval/tracer/llm/constants.py +9 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/config.py +6 -0
- judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
- judgeval/tracer/llm/llm_google/__init__.py +3 -0
- judgeval/tracer/llm/llm_google/config.py +6 -0
- judgeval/tracer/llm/llm_google/generate_content.py +127 -0
- judgeval/tracer/llm/llm_google/wrapper.py +30 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
- judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
- judgeval/tracer/llm/llm_openai/config.py +6 -0
- judgeval/tracer/llm/llm_openai/responses.py +506 -0
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
- judgeval/tracer/llm/llm_together/__init__.py +3 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
- judgeval/tracer/llm/llm_together/config.py +6 -0
- judgeval/tracer/llm/llm_together/wrapper.py +52 -0
- judgeval/tracer/llm/providers.py +19 -0
- judgeval/tracer/managers.py +167 -0
- judgeval/tracer/processors/__init__.py +220 -0
- judgeval/tracer/utils.py +19 -0
- judgeval/trainer/__init__.py +14 -0
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +123 -0
- judgeval/trainer/console.py +144 -0
- judgeval/trainer/fireworks_trainer.py +392 -0
- judgeval/trainer/trainable_model.py +252 -0
- judgeval/trainer/trainer.py +70 -0
- judgeval/utils/async_utils.py +39 -0
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +37 -0
- judgeval/utils/decorators/use_once.py +13 -0
- judgeval/utils/file_utils.py +74 -28
- judgeval/utils/guards.py +36 -0
- judgeval/utils/meta.py +27 -0
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +253 -0
- judgeval/utils/testing.py +70 -0
- judgeval/utils/url.py +10 -0
- judgeval/{version_check.py → utils/version_check.py} +5 -3
- judgeval/utils/wrappers/README.md +3 -0
- judgeval/utils/wrappers/__init__.py +15 -0
- judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
- judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
- judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
- judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
- judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
- judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
- judgeval/utils/wrappers/py.typed +0 -0
- judgeval/utils/wrappers/utils.py +35 -0
- judgeval/v1/__init__.py +88 -0
- judgeval/v1/data/__init__.py +7 -0
- judgeval/v1/data/example.py +44 -0
- judgeval/v1/data/scorer_data.py +42 -0
- judgeval/v1/data/scoring_result.py +44 -0
- judgeval/v1/datasets/__init__.py +6 -0
- judgeval/v1/datasets/dataset.py +214 -0
- judgeval/v1/datasets/dataset_factory.py +94 -0
- judgeval/v1/evaluation/__init__.py +6 -0
- judgeval/v1/evaluation/evaluation.py +182 -0
- judgeval/v1/evaluation/evaluation_factory.py +17 -0
- judgeval/v1/instrumentation/__init__.py +6 -0
- judgeval/v1/instrumentation/llm/__init__.py +7 -0
- judgeval/v1/instrumentation/llm/config.py +78 -0
- judgeval/v1/instrumentation/llm/constants.py +11 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
- judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
- judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
- judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
- judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
- judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
- judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
- judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
- judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
- judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
- judgeval/v1/instrumentation/llm/providers.py +19 -0
- judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
- judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
- judgeval/v1/integrations/langgraph/__init__.py +13 -0
- judgeval/v1/integrations/openlit/__init__.py +47 -0
- judgeval/v1/internal/api/__init__.py +525 -0
- judgeval/v1/internal/api/api_types.py +413 -0
- judgeval/v1/prompts/__init__.py +6 -0
- judgeval/v1/prompts/prompt.py +29 -0
- judgeval/v1/prompts/prompt_factory.py +189 -0
- judgeval/v1/py.typed +0 -0
- judgeval/v1/scorers/__init__.py +6 -0
- judgeval/v1/scorers/api_scorer.py +82 -0
- judgeval/v1/scorers/base_scorer.py +17 -0
- judgeval/v1/scorers/built_in/__init__.py +17 -0
- judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
- judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
- judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
- judgeval/v1/scorers/built_in/faithfulness.py +28 -0
- judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
- judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
- judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
- judgeval/v1/scorers/scorers_factory.py +49 -0
- judgeval/v1/tracer/__init__.py +7 -0
- judgeval/v1/tracer/base_tracer.py +520 -0
- judgeval/v1/tracer/exporters/__init__.py +14 -0
- judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
- judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
- judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
- judgeval/v1/tracer/exporters/span_store.py +50 -0
- judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
- judgeval/v1/tracer/processors/__init__.py +6 -0
- judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
- judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
- judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
- judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
- judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
- judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
- judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
- judgeval/v1/tracer/tracer.py +67 -0
- judgeval/v1/tracer/tracer_factory.py +38 -0
- judgeval/v1/trainers/__init__.py +5 -0
- judgeval/v1/trainers/base_trainer.py +62 -0
- judgeval/v1/trainers/config.py +123 -0
- judgeval/v1/trainers/console.py +144 -0
- judgeval/v1/trainers/fireworks_trainer.py +392 -0
- judgeval/v1/trainers/trainable_model.py +252 -0
- judgeval/v1/trainers/trainers_factory.py +37 -0
- judgeval/v1/utils.py +18 -0
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- judgeval-0.23.0.dist-info/METADATA +266 -0
- judgeval-0.23.0.dist-info/RECORD +201 -0
- judgeval-0.23.0.dist-info/entry_points.txt +2 -0
- judgeval/clients.py +0 -34
- judgeval/common/__init__.py +0 -13
- judgeval/common/api/__init__.py +0 -3
- judgeval/common/api/api.py +0 -352
- judgeval/common/api/constants.py +0 -165
- judgeval/common/exceptions.py +0 -27
- judgeval/common/storage/__init__.py +0 -6
- judgeval/common/storage/s3_storage.py +0 -98
- judgeval/common/tracer/__init__.py +0 -31
- judgeval/common/tracer/constants.py +0 -22
- judgeval/common/tracer/core.py +0 -1916
- judgeval/common/tracer/otel_exporter.py +0 -108
- judgeval/common/tracer/otel_span_processor.py +0 -234
- judgeval/common/tracer/span_processor.py +0 -37
- judgeval/common/tracer/span_transformer.py +0 -211
- judgeval/common/tracer/trace_manager.py +0 -92
- judgeval/common/utils.py +0 -940
- judgeval/data/datasets/__init__.py +0 -4
- judgeval/data/datasets/dataset.py +0 -341
- judgeval/data/datasets/eval_dataset_client.py +0 -214
- judgeval/data/tool.py +0 -5
- judgeval/data/trace_run.py +0 -37
- judgeval/evaluation_run.py +0 -75
- judgeval/integrations/langgraph.py +0 -843
- judgeval/judges/mixture_of_judges.py +0 -286
- judgeval/judgment_client.py +0 -369
- judgeval/rules.py +0 -521
- judgeval/run_evaluation.py +0 -684
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
- judgeval/utils/alerts.py +0 -93
- judgeval/utils/requests.py +0 -50
- judgeval-0.1.0.dist-info/METADATA +0 -202
- judgeval-0.1.0.dist-info/RECORD +0 -73
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import (
|
|
3
|
+
TYPE_CHECKING,
|
|
4
|
+
Any,
|
|
5
|
+
Dict,
|
|
6
|
+
Generator,
|
|
7
|
+
AsyncGenerator,
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
from judgeval.tracer.keys import AttributeKeys
|
|
11
|
+
from judgeval.tracer.utils import set_span_attribute
|
|
12
|
+
from judgeval.utils.serialize import safe_serialize
|
|
13
|
+
from judgeval.utils.wrappers import (
|
|
14
|
+
mutable_wrap_sync,
|
|
15
|
+
immutable_wrap_sync_iterator,
|
|
16
|
+
immutable_wrap_async_iterator,
|
|
17
|
+
)
|
|
18
|
+
from judgeval.tracer.llm.llm_anthropic.messages import (
|
|
19
|
+
_extract_anthropic_tokens,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
if TYPE_CHECKING:
|
|
23
|
+
from judgeval.tracer import Tracer
|
|
24
|
+
from anthropic import Anthropic, AsyncAnthropic
|
|
25
|
+
from anthropic.lib.streaming import (
|
|
26
|
+
MessageStreamManager,
|
|
27
|
+
AsyncMessageStreamManager,
|
|
28
|
+
MessageStream,
|
|
29
|
+
AsyncMessageStream,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def wrap_messages_stream_sync(tracer: Tracer, client: Anthropic) -> None:
    """Patch ``client.messages.stream`` on a sync Anthropic client with tracing.

    Replaces the bound ``stream`` method with a wrapper that:
    starts an LLM span before the call, wraps the returned
    ``MessageStreamManager`` so ``__enter__``/``__exit__`` can be observed,
    accumulates streamed text chunks, and records token usage from the final
    message when the stream context exits. The span is ended in the exit hook.
    """
    original_func = client.messages.stream

    def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
        # ctx is the shared mutable state threaded through all hooks of one call.
        ctx["span"] = tracer.get_tracer().start_span(
            "ANTHROPIC_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
        )
        tracer._inject_judgment_context(ctx["span"])
        set_span_attribute(
            ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
        )

        # Model name from kwargs may later be overwritten by the final
        # message's reported model in post_hook_exit_impl.
        ctx["model_name"] = kwargs.get("model", "")
        set_span_attribute(
            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
        )
        ctx["accumulated_content"] = ""

    def mutate_hook(
        ctx: Dict[str, Any], result: MessageStreamManager
    ) -> MessageStreamManager:
        # The API returns a context manager, not a stream; wrap it so the
        # enter/exit transitions can drive span lifecycle.
        original_manager = result

        class WrappedMessageStreamManager:
            # Transparent proxy: everything except __enter__/__exit__ is
            # delegated to the real manager via __getattr__.
            def __init__(self, manager: MessageStreamManager):
                self._manager = manager

            def __enter__(self) -> MessageStream:
                stream = self._manager.__enter__()
                post_hook_enter_impl(stream)
                return stream

            def __exit__(self, exc_type, exc_val, exc_tb):
                result = self._manager.__exit__(exc_type, exc_val, exc_tb)
                post_hook_exit_impl()
                return result

            def __getattr__(self, name):
                return getattr(self._manager, name)

        def post_hook_enter_impl(stream: MessageStream) -> None:
            # Swap the stream's text iterator for a traced one that records
            # each chunk into ctx["accumulated_content"].
            ctx["stream"] = stream
            original_text_stream = stream.text_stream

            def traced_text_stream() -> Generator[str, None, None]:
                for text_chunk in original_text_stream:
                    yield text_chunk

            def yield_hook(inner_ctx: Dict[str, Any], text_chunk: str) -> None:
                # Accumulate into the OUTER ctx (closure), not inner_ctx, so
                # the exit hook can read the full completion text.
                span = ctx.get("span")
                if span and text_chunk:
                    ctx["accumulated_content"] = (
                        ctx.get("accumulated_content", "") + text_chunk
                    )

            def post_hook_inner(inner_ctx: Dict[str, Any]) -> None:
                pass

            def error_hook_inner(inner_ctx: Dict[str, Any], error: Exception) -> None:
                span = ctx.get("span")
                if span:
                    span.record_exception(error)

            def finally_hook_inner(inner_ctx: Dict[str, Any]) -> None:
                pass

            wrapped_text_stream = immutable_wrap_sync_iterator(
                traced_text_stream,
                yield_hook=yield_hook,
                post_hook=post_hook_inner,
                error_hook=error_hook_inner,
                finally_hook=finally_hook_inner,
            )

            # NOTE(review): assumes text_stream is assignable on the SDK's
            # MessageStream object — confirm against the anthropic version pinned.
            stream.text_stream = wrapped_text_stream()

        def post_hook_exit_impl() -> None:
            # Runs after the user's `with` block exits: flush accumulated
            # completion text and final usage onto the span, then end it.
            span = ctx.get("span")
            if span:
                accumulated = ctx.get("accumulated_content", "")
                set_span_attribute(span, AttributeKeys.GEN_AI_COMPLETION, accumulated)

                stream: MessageStream | None = ctx.get("stream")
                if stream:
                    try:
                        # Best effort: get_final_message may fail after the
                        # stream is closed; swallow and still end the span.
                        final_message = stream.get_final_message()
                        if final_message.usage:
                            (
                                prompt_tokens,
                                completion_tokens,
                                cache_read,
                                cache_creation,
                            ) = _extract_anthropic_tokens(final_message.usage)
                            set_span_attribute(
                                span,
                                AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
                                prompt_tokens,
                            )
                            set_span_attribute(
                                span,
                                AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS,
                                completion_tokens,
                            )
                            set_span_attribute(
                                span,
                                AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
                                cache_read,
                            )
                            set_span_attribute(
                                span,
                                AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
                                cache_creation,
                            )
                            set_span_attribute(
                                span,
                                AttributeKeys.JUDGMENT_USAGE_METADATA,
                                safe_serialize(final_message.usage),
                            )

                        # Prefer the model actually reported by the API.
                        set_span_attribute(
                            span,
                            AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
                            final_message.model,
                        )
                    except Exception:
                        pass

                span.end()

        return WrappedMessageStreamManager(original_manager)  # type: ignore[return-value]

    def error_hook(ctx: Dict[str, Any], error: Exception) -> None:
        # Covers errors raised by the stream() call itself (pre-manager).
        span = ctx.get("span")
        if span:
            span.record_exception(error)

    wrapped = mutable_wrap_sync(
        original_func,
        pre_hook=pre_hook,
        mutate_hook=mutate_hook,
        error_hook=error_hook,
    )

    setattr(client.messages, "stream", wrapped)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def wrap_messages_stream_async(tracer: Tracer, client: AsyncAnthropic) -> None:
    """Patch ``client.messages.stream`` on an async Anthropic client with tracing.

    Async mirror of ``wrap_messages_stream_sync``: the returned
    ``AsyncMessageStreamManager`` is proxied so ``__aenter__``/``__aexit__``
    can start/finish span bookkeeping; text chunks are accumulated and final
    usage is recorded on exit. Note the outer ``stream()`` call itself is a
    plain sync call returning a manager, hence ``mutable_wrap_sync`` below.
    """
    original_func = client.messages.stream

    def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
        # ctx is the shared mutable state threaded through all hooks of one call.
        ctx["span"] = tracer.get_tracer().start_span(
            "ANTHROPIC_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
        )
        tracer._inject_judgment_context(ctx["span"])
        set_span_attribute(
            ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
        )

        ctx["model_name"] = kwargs.get("model", "")
        set_span_attribute(
            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
        )
        ctx["accumulated_content"] = ""

    def mutate_hook(
        ctx: Dict[str, Any], result: AsyncMessageStreamManager
    ) -> AsyncMessageStreamManager:
        original_manager = result

        class WrappedAsyncMessageStreamManager:
            # Transparent async proxy; non-dunder attribute access is
            # delegated to the real manager.
            def __init__(self, manager: AsyncMessageStreamManager):
                self._manager = manager

            async def __aenter__(self) -> AsyncMessageStream:
                stream = await self._manager.__aenter__()
                post_hook_aenter_impl(stream)
                return stream

            async def __aexit__(self, exc_type, exc_val, exc_tb):
                result = await self._manager.__aexit__(exc_type, exc_val, exc_tb)
                await post_hook_aexit_impl()
                return result

            def __getattr__(self, name):
                return getattr(self._manager, name)

        def post_hook_aenter_impl(stream: AsyncMessageStream) -> None:
            # Swap the async text iterator for a traced one.
            ctx["stream"] = stream
            original_text_stream = stream.text_stream

            async def traced_text_stream() -> AsyncGenerator[str, None]:
                async for text_chunk in original_text_stream:
                    yield text_chunk

            def yield_hook(inner_ctx: Dict[str, Any], text_chunk: str) -> None:
                # Accumulate into the OUTER ctx (closure), not inner_ctx.
                span = ctx.get("span")
                if span and text_chunk:
                    ctx["accumulated_content"] = (
                        ctx.get("accumulated_content", "") + text_chunk
                    )

            def post_hook_inner(inner_ctx: Dict[str, Any]) -> None:
                pass

            def error_hook_inner(inner_ctx: Dict[str, Any], error: Exception) -> None:
                span = ctx.get("span")
                if span:
                    span.record_exception(error)

            # Hooks passed to the async iterator wrapper are themselves sync
            # callbacks, hence the "_sync" suffix here.
            def finally_hook_inner_sync(inner_ctx: Dict[str, Any]) -> None:
                pass

            wrapped_text_stream = immutable_wrap_async_iterator(
                traced_text_stream,
                yield_hook=yield_hook,
                post_hook=post_hook_inner,
                error_hook=error_hook_inner,
                finally_hook=finally_hook_inner_sync,
            )

            # NOTE(review): assumes text_stream is assignable on the SDK's
            # AsyncMessageStream object — confirm against the pinned version.
            stream.text_stream = wrapped_text_stream()

        async def post_hook_aexit_impl() -> None:
            # Runs after the user's `async with` block exits: flush the
            # accumulated completion and final usage, then end the span.
            span = ctx.get("span")
            if span:
                accumulated = ctx.get("accumulated_content", "")
                set_span_attribute(span, AttributeKeys.GEN_AI_COMPLETION, accumulated)

                stream: AsyncMessageStream | None = ctx.get("stream")
                if stream:
                    try:
                        # Best effort; failures must not mask the caller's flow.
                        final_message = await stream.get_final_message()
                        if final_message.usage:
                            (
                                prompt_tokens,
                                completion_tokens,
                                cache_read,
                                cache_creation,
                            ) = _extract_anthropic_tokens(final_message.usage)
                            set_span_attribute(
                                span,
                                AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
                                prompt_tokens,
                            )
                            set_span_attribute(
                                span,
                                AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS,
                                completion_tokens,
                            )
                            set_span_attribute(
                                span,
                                AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
                                cache_read,
                            )
                            set_span_attribute(
                                span,
                                AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
                                cache_creation,
                            )
                            set_span_attribute(
                                span,
                                AttributeKeys.JUDGMENT_USAGE_METADATA,
                                safe_serialize(final_message.usage),
                            )

                        # Prefer the model actually reported by the API.
                        set_span_attribute(
                            span,
                            AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
                            final_message.model,
                        )
                    except Exception:
                        pass

                span.end()

        return WrappedAsyncMessageStreamManager(original_manager)  # type: ignore[return-value]

    def error_hook(ctx: Dict[str, Any], error: Exception) -> None:
        # Covers errors raised by the stream() call itself (pre-manager).
        span = ctx.get("span")
        if span:
            span.record_exception(error)

    wrapped = mutable_wrap_sync(
        original_func,
        pre_hook=pre_hook,
        mutate_hook=mutate_hook,
        error_hook=error_hook,
    )

    setattr(client.messages, "stream", wrapped)
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import TYPE_CHECKING, Union
|
|
3
|
+
import typing
|
|
4
|
+
|
|
5
|
+
from judgeval.tracer.llm.llm_anthropic.messages import (
|
|
6
|
+
wrap_messages_create_sync,
|
|
7
|
+
wrap_messages_create_async,
|
|
8
|
+
)
|
|
9
|
+
from judgeval.tracer.llm.llm_anthropic.messages_stream import (
|
|
10
|
+
wrap_messages_stream_sync,
|
|
11
|
+
wrap_messages_stream_async,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from judgeval.tracer import Tracer
|
|
16
|
+
from anthropic import Anthropic, AsyncAnthropic
|
|
17
|
+
|
|
18
|
+
TClient = Union[Anthropic, AsyncAnthropic]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def wrap_anthropic_client_sync(tracer: Tracer, client: Anthropic) -> Anthropic:
    """Instrument a synchronous Anthropic client in place and return it.

    Patches both the ``messages.create`` and ``messages.stream`` entry
    points with tracing wrappers.
    """
    for patch in (wrap_messages_create_sync, wrap_messages_stream_sync):
        patch(tracer, client)
    return client
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def wrap_anthropic_client_async(
    tracer: Tracer, client: AsyncAnthropic
) -> AsyncAnthropic:
    """Instrument an asynchronous Anthropic client in place and return it.

    Patches both the ``messages.create`` and ``messages.stream`` entry
    points with tracing wrappers.
    """
    for patch in (wrap_messages_create_async, wrap_messages_stream_async):
        patch(tracer, client)
    return client
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@typing.overload
def wrap_anthropic_client(tracer: Tracer, client: Anthropic) -> Anthropic: ...
@typing.overload
def wrap_anthropic_client(tracer: Tracer, client: AsyncAnthropic) -> AsyncAnthropic: ...


def wrap_anthropic_client(tracer: Tracer, client: TClient) -> TClient:
    """Instrument an Anthropic client (sync or async) with tracing.

    Dispatches to the sync/async wrapper based on the client's runtime type.
    If the ``anthropic`` package is missing, logs an error and returns the
    client untouched; an unrecognized client type raises ``TypeError``.
    """
    from judgeval.tracer.llm.llm_anthropic.config import HAS_ANTHROPIC
    from judgeval.logger import judgeval_logger

    # Bail out gracefully rather than crash when the SDK isn't installed.
    if not HAS_ANTHROPIC:
        judgeval_logger.error(
            "Cannot wrap Anthropic client: 'anthropic' library not installed. "
            "Install it with: pip install anthropic"
        )
        return client

    from anthropic import Anthropic, AsyncAnthropic

    # Async must be checked first so each client hits its matching wrapper.
    if isinstance(client, AsyncAnthropic):
        return wrap_anthropic_client_async(tracer, client)
    if isinstance(client, Anthropic):
        return wrap_anthropic_client_sync(tracer, client)
    raise TypeError(f"Invalid client type: {type(client)}")
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import (
|
|
3
|
+
TYPE_CHECKING,
|
|
4
|
+
Any,
|
|
5
|
+
Dict,
|
|
6
|
+
Optional,
|
|
7
|
+
Tuple,
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
from judgeval.tracer.keys import AttributeKeys
|
|
11
|
+
from judgeval.tracer.utils import set_span_attribute
|
|
12
|
+
from judgeval.utils.serialize import safe_serialize
|
|
13
|
+
from judgeval.utils.wrappers import immutable_wrap_sync
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from judgeval.tracer import Tracer
|
|
17
|
+
from google.genai import Client
|
|
18
|
+
from google.genai.types import (
|
|
19
|
+
GenerateContentResponse,
|
|
20
|
+
GenerateContentResponseUsageMetadata,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _extract_google_tokens(
|
|
25
|
+
usage: GenerateContentResponseUsageMetadata,
|
|
26
|
+
) -> Tuple[int, int, int, int]:
|
|
27
|
+
prompt_tokens = (
|
|
28
|
+
usage.prompt_token_count if usage.prompt_token_count is not None else 0
|
|
29
|
+
)
|
|
30
|
+
completion_tokens = (
|
|
31
|
+
usage.candidates_token_count if usage.candidates_token_count is not None else 0
|
|
32
|
+
)
|
|
33
|
+
cache_read_input_tokens = (
|
|
34
|
+
usage.cached_content_token_count
|
|
35
|
+
if usage.cached_content_token_count is not None
|
|
36
|
+
else 0
|
|
37
|
+
)
|
|
38
|
+
cache_creation_input_tokens = 0
|
|
39
|
+
return (
|
|
40
|
+
prompt_tokens,
|
|
41
|
+
completion_tokens,
|
|
42
|
+
cache_read_input_tokens,
|
|
43
|
+
cache_creation_input_tokens,
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _format_google_output(
|
|
48
|
+
response: GenerateContentResponse,
|
|
49
|
+
) -> Tuple[Optional[str], Optional[GenerateContentResponseUsageMetadata]]:
|
|
50
|
+
return response.text, response.usage_metadata
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def wrap_generate_content_sync(tracer: Tracer, client: Client) -> None:
    """Patch ``client.models.generate_content`` on a Google GenAI client.

    Replaces the method with a traced wrapper that opens an LLM span per
    call, records the serialized request as the prompt, and on success
    records the completion text, token usage, and the model version the
    API actually reports. The span always ends via the finally hook.
    """
    original_func = client.models.generate_content

    def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
        # ctx carries the span and model name between hooks of one call.
        ctx["span"] = tracer.get_tracer().start_span(
            "GOOGLE_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
        )
        tracer._inject_judgment_context(ctx["span"])
        set_span_attribute(
            ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
        )
        ctx["model_name"] = kwargs.get("model", "")
        set_span_attribute(
            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
        )

    def post_hook(ctx: Dict[str, Any], result: GenerateContentResponse) -> None:
        # Success path only: attach completion and usage to the span.
        span = ctx.get("span")
        if not span:
            return

        output, usage_data = _format_google_output(result)
        set_span_attribute(span, AttributeKeys.GEN_AI_COMPLETION, output)

        if usage_data:
            prompt_tokens, completion_tokens, cache_read, cache_creation = (
                _extract_google_tokens(usage_data)
            )
            set_span_attribute(
                span,
                AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
                prompt_tokens,
            )
            set_span_attribute(
                span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
            )
            set_span_attribute(
                span, AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
            )
            set_span_attribute(
                span,
                AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
                cache_creation,
            )
            set_span_attribute(
                span,
                AttributeKeys.JUDGMENT_USAGE_METADATA,
                safe_serialize(usage_data),
            )

        # Prefer the model version reported in the response; fall back to
        # the request's model name captured in pre_hook.
        set_span_attribute(
            span,
            AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
            result.model_version if result.model_version else ctx["model_name"],
        )

    def error_hook(ctx: Dict[str, Any], error: Exception) -> None:
        span = ctx.get("span")
        if span:
            span.record_exception(error)

    def finally_hook(ctx: Dict[str, Any]) -> None:
        # Always end the span, on both success and error paths.
        span = ctx.get("span")
        if span:
            span.end()

    wrapped = immutable_wrap_sync(
        original_func,
        pre_hook=pre_hook,
        post_hook=post_hook,
        error_hook=error_hook,
        finally_hook=finally_hook,
    )

    setattr(client.models, "generate_content", wrapped)
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import TYPE_CHECKING
|
|
3
|
+
|
|
4
|
+
from judgeval.tracer.llm.llm_google.generate_content import (
|
|
5
|
+
wrap_generate_content_sync,
|
|
6
|
+
)
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from judgeval.tracer import Tracer
|
|
10
|
+
from google.genai import Client
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def wrap_google_client(tracer: Tracer, client: Client) -> Client:
    """Instrument a Google GenAI client with tracing and return it.

    If the ``google-genai`` package is missing, logs an error and returns
    the client unchanged; an unrecognized client type raises ``TypeError``.
    """
    from judgeval.tracer.llm.llm_google.config import HAS_GOOGLE_GENAI
    from judgeval.logger import judgeval_logger

    # Bail out gracefully rather than crash when the SDK isn't installed.
    if not HAS_GOOGLE_GENAI:
        judgeval_logger.error(
            "Cannot wrap Google GenAI client: 'google-genai' library not installed. "
            "Install it with: pip install google-genai"
        )
        return client

    from google.genai import Client

    if not isinstance(client, Client):
        raise TypeError(f"Invalid client type: {type(client)}")

    wrap_generate_content_sync(tracer, client)
    return client
|