judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +173 -10
- judgeval/api/__init__.py +523 -0
- judgeval/api/api_types.py +413 -0
- judgeval/cli.py +112 -0
- judgeval/constants.py +7 -30
- judgeval/data/__init__.py +1 -3
- judgeval/data/evaluation_run.py +125 -0
- judgeval/data/example.py +14 -40
- judgeval/data/judgment_types.py +396 -146
- judgeval/data/result.py +11 -18
- judgeval/data/scorer_data.py +3 -26
- judgeval/data/scripts/openapi_transform.py +5 -5
- judgeval/data/trace.py +115 -194
- judgeval/dataset/__init__.py +335 -0
- judgeval/env.py +55 -0
- judgeval/evaluation/__init__.py +346 -0
- judgeval/exceptions.py +28 -0
- judgeval/integrations/langgraph/__init__.py +13 -0
- judgeval/integrations/openlit/__init__.py +51 -0
- judgeval/judges/__init__.py +2 -2
- judgeval/judges/litellm_judge.py +77 -16
- judgeval/judges/together_judge.py +88 -17
- judgeval/judges/utils.py +7 -20
- judgeval/judgment_attribute_keys.py +55 -0
- judgeval/{common/logger.py → logger.py} +24 -8
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/__init__.py +11 -11
- judgeval/scorers/agent_scorer.py +15 -19
- judgeval/scorers/api_scorer.py +21 -23
- judgeval/scorers/base_scorer.py +54 -36
- judgeval/scorers/example_scorer.py +1 -3
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
- judgeval/scorers/score.py +64 -47
- judgeval/scorers/utils.py +2 -107
- judgeval/tracer/__init__.py +1111 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +40 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +59 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +63 -0
- judgeval/tracer/llm/__init__.py +7 -0
- judgeval/tracer/llm/config.py +78 -0
- judgeval/tracer/llm/constants.py +9 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/config.py +6 -0
- judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
- judgeval/tracer/llm/llm_google/__init__.py +3 -0
- judgeval/tracer/llm/llm_google/config.py +6 -0
- judgeval/tracer/llm/llm_google/generate_content.py +127 -0
- judgeval/tracer/llm/llm_google/wrapper.py +30 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
- judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
- judgeval/tracer/llm/llm_openai/config.py +6 -0
- judgeval/tracer/llm/llm_openai/responses.py +506 -0
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
- judgeval/tracer/llm/llm_together/__init__.py +3 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
- judgeval/tracer/llm/llm_together/config.py +6 -0
- judgeval/tracer/llm/llm_together/wrapper.py +52 -0
- judgeval/tracer/llm/providers.py +19 -0
- judgeval/tracer/managers.py +167 -0
- judgeval/tracer/processors/__init__.py +220 -0
- judgeval/tracer/utils.py +19 -0
- judgeval/trainer/__init__.py +14 -0
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +123 -0
- judgeval/trainer/console.py +144 -0
- judgeval/trainer/fireworks_trainer.py +392 -0
- judgeval/trainer/trainable_model.py +252 -0
- judgeval/trainer/trainer.py +70 -0
- judgeval/utils/async_utils.py +39 -0
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +37 -0
- judgeval/utils/decorators/use_once.py +13 -0
- judgeval/utils/file_utils.py +74 -28
- judgeval/utils/guards.py +36 -0
- judgeval/utils/meta.py +27 -0
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +253 -0
- judgeval/utils/testing.py +70 -0
- judgeval/utils/url.py +10 -0
- judgeval/{version_check.py → utils/version_check.py} +5 -3
- judgeval/utils/wrappers/README.md +3 -0
- judgeval/utils/wrappers/__init__.py +15 -0
- judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
- judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
- judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
- judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
- judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
- judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
- judgeval/utils/wrappers/py.typed +0 -0
- judgeval/utils/wrappers/utils.py +35 -0
- judgeval/v1/__init__.py +88 -0
- judgeval/v1/data/__init__.py +7 -0
- judgeval/v1/data/example.py +44 -0
- judgeval/v1/data/scorer_data.py +42 -0
- judgeval/v1/data/scoring_result.py +44 -0
- judgeval/v1/datasets/__init__.py +6 -0
- judgeval/v1/datasets/dataset.py +214 -0
- judgeval/v1/datasets/dataset_factory.py +94 -0
- judgeval/v1/evaluation/__init__.py +6 -0
- judgeval/v1/evaluation/evaluation.py +182 -0
- judgeval/v1/evaluation/evaluation_factory.py +17 -0
- judgeval/v1/instrumentation/__init__.py +6 -0
- judgeval/v1/instrumentation/llm/__init__.py +7 -0
- judgeval/v1/instrumentation/llm/config.py +78 -0
- judgeval/v1/instrumentation/llm/constants.py +11 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
- judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
- judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
- judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
- judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
- judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
- judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
- judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
- judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
- judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
- judgeval/v1/instrumentation/llm/providers.py +19 -0
- judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
- judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
- judgeval/v1/integrations/langgraph/__init__.py +13 -0
- judgeval/v1/integrations/openlit/__init__.py +47 -0
- judgeval/v1/internal/api/__init__.py +525 -0
- judgeval/v1/internal/api/api_types.py +413 -0
- judgeval/v1/prompts/__init__.py +6 -0
- judgeval/v1/prompts/prompt.py +29 -0
- judgeval/v1/prompts/prompt_factory.py +189 -0
- judgeval/v1/py.typed +0 -0
- judgeval/v1/scorers/__init__.py +6 -0
- judgeval/v1/scorers/api_scorer.py +82 -0
- judgeval/v1/scorers/base_scorer.py +17 -0
- judgeval/v1/scorers/built_in/__init__.py +17 -0
- judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
- judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
- judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
- judgeval/v1/scorers/built_in/faithfulness.py +28 -0
- judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
- judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
- judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
- judgeval/v1/scorers/scorers_factory.py +49 -0
- judgeval/v1/tracer/__init__.py +7 -0
- judgeval/v1/tracer/base_tracer.py +520 -0
- judgeval/v1/tracer/exporters/__init__.py +14 -0
- judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
- judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
- judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
- judgeval/v1/tracer/exporters/span_store.py +50 -0
- judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
- judgeval/v1/tracer/processors/__init__.py +6 -0
- judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
- judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
- judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
- judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
- judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
- judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
- judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
- judgeval/v1/tracer/tracer.py +67 -0
- judgeval/v1/tracer/tracer_factory.py +38 -0
- judgeval/v1/trainers/__init__.py +5 -0
- judgeval/v1/trainers/base_trainer.py +62 -0
- judgeval/v1/trainers/config.py +123 -0
- judgeval/v1/trainers/console.py +144 -0
- judgeval/v1/trainers/fireworks_trainer.py +392 -0
- judgeval/v1/trainers/trainable_model.py +252 -0
- judgeval/v1/trainers/trainers_factory.py +37 -0
- judgeval/v1/utils.py +18 -0
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- judgeval-0.23.0.dist-info/METADATA +266 -0
- judgeval-0.23.0.dist-info/RECORD +201 -0
- judgeval-0.23.0.dist-info/entry_points.txt +2 -0
- judgeval/clients.py +0 -34
- judgeval/common/__init__.py +0 -13
- judgeval/common/api/__init__.py +0 -3
- judgeval/common/api/api.py +0 -352
- judgeval/common/api/constants.py +0 -165
- judgeval/common/exceptions.py +0 -27
- judgeval/common/storage/__init__.py +0 -6
- judgeval/common/storage/s3_storage.py +0 -98
- judgeval/common/tracer/__init__.py +0 -31
- judgeval/common/tracer/constants.py +0 -22
- judgeval/common/tracer/core.py +0 -1916
- judgeval/common/tracer/otel_exporter.py +0 -108
- judgeval/common/tracer/otel_span_processor.py +0 -234
- judgeval/common/tracer/span_processor.py +0 -37
- judgeval/common/tracer/span_transformer.py +0 -211
- judgeval/common/tracer/trace_manager.py +0 -92
- judgeval/common/utils.py +0 -940
- judgeval/data/datasets/__init__.py +0 -4
- judgeval/data/datasets/dataset.py +0 -341
- judgeval/data/datasets/eval_dataset_client.py +0 -214
- judgeval/data/tool.py +0 -5
- judgeval/data/trace_run.py +0 -37
- judgeval/evaluation_run.py +0 -75
- judgeval/integrations/langgraph.py +0 -843
- judgeval/judges/mixture_of_judges.py +0 -286
- judgeval/judgment_client.py +0 -369
- judgeval/rules.py +0 -521
- judgeval/run_evaluation.py +0 -684
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
- judgeval/utils/alerts.py +0 -93
- judgeval/utils/requests.py +0 -50
- judgeval-0.1.0.dist-info/METADATA +0 -202
- judgeval-0.1.0.dist-info/RECORD +0 -73
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -0,0 +1,382 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import (
|
|
3
|
+
TYPE_CHECKING,
|
|
4
|
+
Any,
|
|
5
|
+
Awaitable,
|
|
6
|
+
Callable,
|
|
7
|
+
Dict,
|
|
8
|
+
Iterator,
|
|
9
|
+
AsyncIterator,
|
|
10
|
+
Generator,
|
|
11
|
+
AsyncGenerator,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
from opentelemetry.trace import Status, StatusCode
|
|
15
|
+
from judgeval.judgment_attribute_keys import AttributeKeys
|
|
16
|
+
from judgeval.utils.serialize import safe_serialize
|
|
17
|
+
from judgeval.utils.wrappers import (
|
|
18
|
+
immutable_wrap_async,
|
|
19
|
+
immutable_wrap_sync,
|
|
20
|
+
mutable_wrap_sync,
|
|
21
|
+
mutable_wrap_async,
|
|
22
|
+
immutable_wrap_sync_iterator,
|
|
23
|
+
immutable_wrap_async_iterator,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
if TYPE_CHECKING:
|
|
27
|
+
from judgeval.v1.tracer import BaseTracer
|
|
28
|
+
from together import Together, AsyncTogether # type: ignore[import-untyped]
|
|
29
|
+
from together.types import ChatCompletionResponse, ChatCompletionChunk # type: ignore[import-untyped]
|
|
30
|
+
from together.types.common import UsageData # type: ignore[import-untyped]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _extract_together_tokens(usage: UsageData) -> tuple[int, int, int, int]:
|
|
34
|
+
prompt_tokens = usage.prompt_tokens if usage.prompt_tokens is not None else 0
|
|
35
|
+
completion_tokens = (
|
|
36
|
+
usage.completion_tokens if usage.completion_tokens is not None else 0
|
|
37
|
+
)
|
|
38
|
+
cache_read_input_tokens = 0
|
|
39
|
+
cache_creation_input_tokens = 0
|
|
40
|
+
return (
|
|
41
|
+
prompt_tokens,
|
|
42
|
+
completion_tokens,
|
|
43
|
+
cache_read_input_tokens,
|
|
44
|
+
cache_creation_input_tokens,
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def wrap_chat_completions_create_sync(tracer: BaseTracer, client: Together) -> None:
    """Patch ``client.chat.completions.create`` with a tracing dispatcher.

    The installed dispatcher inspects ``stream`` at call time and routes to
    the streaming or non-streaming wrapper, each of which delegates to the
    original ``create`` method captured here.
    """
    original = client.chat.completions.create

    def _dispatch(*args: Any, **kwargs: Any) -> Any:
        if kwargs.get("stream", False):
            wrapped = _wrap_streaming_sync(tracer, original)  # type: ignore[arg-type]
        else:
            wrapped = _wrap_non_streaming_sync(tracer, original)  # type: ignore[arg-type]
        return wrapped(*args, **kwargs)

    setattr(client.chat.completions, "create", _dispatch)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _wrap_non_streaming_sync(
    tracer: BaseTracer, original_func: Callable[..., ChatCompletionResponse]
) -> Callable[..., ChatCompletionResponse]:
    """Wrap a synchronous, non-streaming Together ``create`` call in a span.

    The returned callable delegates to ``original_func`` via
    ``immutable_wrap_sync``; the hooks below share a per-call ``ctx`` dict
    (keys: "span", "model_name") supplied by the wrapper machinery.
    """

    def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
        # Open an LLM-kind span and record the serialized request kwargs
        # as the prompt attribute.
        ctx["span"] = tracer.get_tracer().start_span(
            "TOGETHER_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
        )
        ctx["span"].set_attribute(AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs))
        ctx["model_name"] = kwargs.get("model", "")
        # Prefix with "together_ai/" so the model is namespaced per provider;
        # an absent model stays "".
        prefixed_model_name = (
            f"together_ai/{ctx['model_name']}" if ctx["model_name"] else ""
        )
        ctx["model_name"] = prefixed_model_name
        ctx["span"].set_attribute(
            AttributeKeys.JUDGMENT_LLM_MODEL_NAME, prefixed_model_name
        )

    def post_hook(ctx: Dict[str, Any], result: ChatCompletionResponse) -> None:
        # Record the completion and token-usage attributes on success.
        span = ctx.get("span")
        if not span:
            return

        span.set_attribute(AttributeKeys.GEN_AI_COMPLETION, safe_serialize(result))

        if result.usage:
            # Cache token counts are discarded: Together reports none.
            prompt_tokens, completion_tokens, _, _ = _extract_together_tokens(
                result.usage
            )
            span.set_attribute(
                AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
                prompt_tokens,
            )
            span.set_attribute(
                AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
            )
            span.set_attribute(
                AttributeKeys.JUDGMENT_USAGE_METADATA,
                safe_serialize(result.usage),
            )

        # Re-assert the model name computed in pre_hook.
        span.set_attribute(AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"])

    def error_hook(ctx: Dict[str, Any], error: Exception) -> None:
        # Attach the exception to the span and mark it as errored.
        span = ctx.get("span")
        if span:
            span.record_exception(error)
            span.set_status(Status(StatusCode.ERROR))

    def finally_hook(ctx: Dict[str, Any]) -> None:
        # Always close the span, success or failure.
        span = ctx.get("span")
        if span:
            span.end()

    return immutable_wrap_sync(
        original_func,
        pre_hook=pre_hook,
        post_hook=post_hook,
        error_hook=error_hook,
        finally_hook=finally_hook,
    )
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _wrap_streaming_sync(
    tracer: BaseTracer, original_func: Callable[..., Iterator[ChatCompletionChunk]]
) -> Callable[..., Iterator[ChatCompletionChunk]]:
    """Wrap a synchronous, streaming Together ``create`` call in a span.

    Uses ``mutable_wrap_sync`` so that ``mutate_hook`` can replace the
    returned chunk iterator with a traced generator. Chunk content is
    accumulated into the outer ``ctx`` (closed over by the inner hooks)
    and recorded as the completion when the stream is exhausted; the span
    is ended by the iterator's finally hook, not by the outer call.
    """

    def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
        # Open an LLM-kind span and record the serialized request kwargs.
        ctx["span"] = tracer.get_tracer().start_span(
            "TOGETHER_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
        )
        ctx["span"].set_attribute(AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs))
        ctx["model_name"] = kwargs.get("model", "")
        # Namespace the model name with the provider prefix.
        prefixed_model_name = (
            f"together_ai/{ctx['model_name']}" if ctx["model_name"] else ""
        )
        ctx["model_name"] = prefixed_model_name
        ctx["span"].set_attribute(
            AttributeKeys.JUDGMENT_LLM_MODEL_NAME, prefixed_model_name
        )
        # Running concatenation of streamed delta content.
        ctx["accumulated_content"] = ""

    def mutate_hook(
        ctx: Dict[str, Any], result: Iterator[ChatCompletionChunk]
    ) -> Iterator[ChatCompletionChunk]:
        # Replace the raw chunk iterator with one whose consumption is
        # observed via per-yield hooks.
        def traced_generator() -> Generator[ChatCompletionChunk, None, None]:
            for chunk in result:
                yield chunk

        def yield_hook(inner_ctx: Dict[str, Any], chunk: ChatCompletionChunk) -> None:
            # NOTE: hooks close over the outer ctx; inner_ctx is unused.
            span = ctx.get("span")
            if not span:
                return

            # Accumulate the first choice's delta content, if any.
            if chunk.choices and len(chunk.choices) > 0:
                delta = chunk.choices[0].delta
                if delta and hasattr(delta, "content") and delta.content:
                    ctx["accumulated_content"] = (
                        ctx.get("accumulated_content", "") + delta.content
                    )

            # Usage typically arrives on a chunk near the end of the stream;
            # record it whenever present.
            if chunk.usage:
                prompt_tokens, completion_tokens, _, _ = _extract_together_tokens(
                    chunk.usage
                )
                span.set_attribute(
                    AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
                    prompt_tokens,
                )
                span.set_attribute(
                    AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
                )
                span.set_attribute(
                    AttributeKeys.JUDGMENT_USAGE_METADATA,
                    safe_serialize(chunk.usage),
                )

        def post_hook_inner(inner_ctx: Dict[str, Any]) -> None:
            # Stream fully consumed: record the assembled completion text.
            span = ctx.get("span")
            if span:
                accumulated = ctx.get("accumulated_content", "")
                span.set_attribute(AttributeKeys.GEN_AI_COMPLETION, accumulated)

        def error_hook_inner(inner_ctx: Dict[str, Any], error: Exception) -> None:
            span = ctx.get("span")
            if span:
                span.record_exception(error)
                span.set_status(Status(StatusCode.ERROR))

        def finally_hook_inner(inner_ctx: Dict[str, Any]) -> None:
            # The span lives until the stream finishes (or errors/closes).
            span = ctx.get("span")
            if span:
                span.end()

        wrapped_generator = immutable_wrap_sync_iterator(
            traced_generator,
            yield_hook=yield_hook,
            post_hook=post_hook_inner,
            error_hook=error_hook_inner,
            finally_hook=finally_hook_inner,
        )

        return wrapped_generator()

    def error_hook(ctx: Dict[str, Any], error: Exception) -> None:
        # Covers failures of the initial create() call itself, before any
        # iterator was produced.
        span = ctx.get("span")
        if span:
            span.record_exception(error)
            span.set_status(Status(StatusCode.ERROR))

    return mutable_wrap_sync(
        original_func,
        pre_hook=pre_hook,
        mutate_hook=mutate_hook,
        error_hook=error_hook,
    )
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def wrap_chat_completions_create_async(
    tracer: BaseTracer, client: AsyncTogether
) -> None:
    """Patch the async ``client.chat.completions.create`` with a tracing dispatcher.

    The installed coroutine inspects ``stream`` at call time and awaits the
    streaming or non-streaming async wrapper, each of which delegates to the
    original ``create`` method captured here.
    """
    original = client.chat.completions.create

    async def _dispatch(*args: Any, **kwargs: Any) -> Any:
        if kwargs.get("stream", False):
            wrapped = _wrap_streaming_async(tracer, original)  # type: ignore[arg-type]
        else:
            wrapped = _wrap_non_streaming_async(tracer, original)  # type: ignore[arg-type]
        return await wrapped(*args, **kwargs)

    setattr(client.chat.completions, "create", _dispatch)
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def _wrap_non_streaming_async(
    tracer: BaseTracer, original_func: Callable[..., Awaitable[ChatCompletionResponse]]
) -> Callable[..., Awaitable[ChatCompletionResponse]]:
    """Wrap an async, non-streaming Together ``create`` call in a span.

    Async counterpart of ``_wrap_non_streaming_sync``: delegates to
    ``original_func`` via ``immutable_wrap_async`` with the same hook set
    sharing a per-call ``ctx`` dict (keys: "span", "model_name").
    """

    def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
        # Open an LLM-kind span and record the serialized request kwargs.
        ctx["span"] = tracer.get_tracer().start_span(
            "TOGETHER_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
        )
        ctx["span"].set_attribute(AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs))
        ctx["model_name"] = kwargs.get("model", "")
        # Namespace the model name with the provider prefix.
        prefixed_model_name = (
            f"together_ai/{ctx['model_name']}" if ctx["model_name"] else ""
        )
        ctx["model_name"] = prefixed_model_name
        ctx["span"].set_attribute(
            AttributeKeys.JUDGMENT_LLM_MODEL_NAME, prefixed_model_name
        )

    def post_hook(ctx: Dict[str, Any], result: ChatCompletionResponse) -> None:
        # Record the completion and token-usage attributes on success.
        span = ctx.get("span")
        if not span:
            return

        span.set_attribute(AttributeKeys.GEN_AI_COMPLETION, safe_serialize(result))

        if result.usage:
            # Cache token counts are discarded: Together reports none.
            prompt_tokens, completion_tokens, _, _ = _extract_together_tokens(
                result.usage
            )
            span.set_attribute(
                AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
                prompt_tokens,
            )
            span.set_attribute(
                AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
            )
            span.set_attribute(
                AttributeKeys.JUDGMENT_USAGE_METADATA,
                safe_serialize(result.usage),
            )

        # Re-assert the model name computed in pre_hook.
        span.set_attribute(AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"])

    def error_hook(ctx: Dict[str, Any], error: Exception) -> None:
        # Attach the exception to the span and mark it as errored.
        span = ctx.get("span")
        if span:
            span.record_exception(error)
            span.set_status(Status(StatusCode.ERROR))

    def finally_hook(ctx: Dict[str, Any]) -> None:
        # Always close the span, success or failure.
        span = ctx.get("span")
        if span:
            span.end()

    return immutable_wrap_async(
        original_func,
        pre_hook=pre_hook,
        post_hook=post_hook,
        error_hook=error_hook,
        finally_hook=finally_hook,
    )
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def _wrap_streaming_async(
    tracer: BaseTracer,
    original_func: Callable[..., Awaitable[AsyncIterator[ChatCompletionChunk]]],
) -> Callable[..., Awaitable[AsyncIterator[ChatCompletionChunk]]]:
    """Wrap an async, streaming Together ``create`` call in a span.

    Async counterpart of ``_wrap_streaming_sync``: ``mutate_hook`` replaces
    the returned async chunk iterator with a traced async generator. Chunk
    content is accumulated into the outer ``ctx`` (closed over by the inner
    hooks) and recorded as the completion once the stream finishes; the
    span is ended by the iterator's finally hook.
    """

    def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
        # Open an LLM-kind span and record the serialized request kwargs.
        ctx["span"] = tracer.get_tracer().start_span(
            "TOGETHER_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
        )
        ctx["span"].set_attribute(AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs))
        ctx["model_name"] = kwargs.get("model", "")
        # Namespace the model name with the provider prefix.
        prefixed_model_name = (
            f"together_ai/{ctx['model_name']}" if ctx["model_name"] else ""
        )
        ctx["model_name"] = prefixed_model_name
        ctx["span"].set_attribute(
            AttributeKeys.JUDGMENT_LLM_MODEL_NAME, prefixed_model_name
        )
        # Running concatenation of streamed delta content.
        ctx["accumulated_content"] = ""

    def mutate_hook(
        ctx: Dict[str, Any], result: AsyncIterator[ChatCompletionChunk]
    ) -> AsyncIterator[ChatCompletionChunk]:
        # Replace the raw async iterator with one whose consumption is
        # observed via per-yield hooks.
        async def traced_generator() -> AsyncGenerator[ChatCompletionChunk, None]:
            async for chunk in result:
                yield chunk

        def yield_hook(inner_ctx: Dict[str, Any], chunk: ChatCompletionChunk) -> None:
            # NOTE: hooks close over the outer ctx; inner_ctx is unused.
            span = ctx.get("span")
            if not span:
                return

            # Accumulate the first choice's delta content, if any.
            if chunk.choices and len(chunk.choices) > 0:
                delta = chunk.choices[0].delta
                if delta and hasattr(delta, "content") and delta.content:
                    ctx["accumulated_content"] = (
                        ctx.get("accumulated_content", "") + delta.content
                    )

            # Usage typically arrives on a chunk near the end of the stream;
            # record it whenever present.
            if chunk.usage:
                prompt_tokens, completion_tokens, _, _ = _extract_together_tokens(
                    chunk.usage
                )
                span.set_attribute(
                    AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
                    prompt_tokens,
                )
                span.set_attribute(
                    AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
                )
                span.set_attribute(
                    AttributeKeys.JUDGMENT_USAGE_METADATA,
                    safe_serialize(chunk.usage),
                )

        def post_hook_inner(inner_ctx: Dict[str, Any]) -> None:
            # Stream fully consumed: record the assembled completion text.
            span = ctx.get("span")
            if span:
                accumulated = ctx.get("accumulated_content", "")
                span.set_attribute(AttributeKeys.GEN_AI_COMPLETION, accumulated)

        def error_hook_inner(inner_ctx: Dict[str, Any], error: Exception) -> None:
            span = ctx.get("span")
            if span:
                span.record_exception(error)
                span.set_status(Status(StatusCode.ERROR))

        def finally_hook_inner(inner_ctx: Dict[str, Any]) -> None:
            # The span lives until the stream finishes (or errors/closes).
            span = ctx.get("span")
            if span:
                span.end()

        wrapped_generator = immutable_wrap_async_iterator(
            traced_generator,
            yield_hook=yield_hook,
            post_hook=post_hook_inner,
            error_hook=error_hook_inner,
            finally_hook=finally_hook_inner,
        )

        return wrapped_generator()

    def error_hook(ctx: Dict[str, Any], error: Exception) -> None:
        # Covers failures of the initial create() call itself, before any
        # iterator was produced.
        span = ctx.get("span")
        if span:
            span.record_exception(error)
            span.set_status(Status(StatusCode.ERROR))

    return mutable_wrap_async(
        original_func,
        pre_hook=pre_hook,
        mutate_hook=mutate_hook,
        error_hook=error_hook,
    )
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import TYPE_CHECKING, Union
|
|
3
|
+
import typing
|
|
4
|
+
|
|
5
|
+
from judgeval.v1.instrumentation.llm.llm_together.chat_completions import (
|
|
6
|
+
wrap_chat_completions_create_sync,
|
|
7
|
+
wrap_chat_completions_create_async,
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from judgeval.v1.tracer import BaseTracer
|
|
13
|
+
from together import Together, AsyncTogether # type: ignore[import-untyped]
|
|
14
|
+
|
|
15
|
+
TClient = Union[Together, AsyncTogether]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def wrap_together_client_sync(tracer: BaseTracer, client: Together) -> Together:
    """Instrument a synchronous Together client in place.

    Patches the client's chat-completions ``create`` method for tracing
    and returns the same (now wrapped) client instance.
    """
    wrap_chat_completions_create_sync(tracer, client)
    return client
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def wrap_together_client_async(
    tracer: BaseTracer, client: AsyncTogether
) -> AsyncTogether:
    """Instrument an asynchronous Together client in place.

    Patches the client's chat-completions ``create`` coroutine for tracing
    and returns the same (now wrapped) client instance.
    """
    wrap_chat_completions_create_async(tracer, client)
    return client
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@typing.overload
def wrap_together_client(tracer: BaseTracer, client: Together) -> Together: ...
@typing.overload
def wrap_together_client(  # type: ignore[overload-cannot-match]
    tracer: BaseTracer,
    client: AsyncTogether,
) -> AsyncTogether: ...


def wrap_together_client(tracer: BaseTracer, client: TClient) -> TClient:
    """Instrument a Together client (sync or async) for tracing.

    If the ``together`` package is not installed, logs an error and returns
    the client untouched. Otherwise dispatches on the client's concrete
    type; anything that is neither ``Together`` nor ``AsyncTogether`` is a
    ``TypeError``.
    """
    from judgeval.v1.instrumentation.llm.llm_together.config import HAS_TOGETHER
    from judgeval.logger import judgeval_logger

    if not HAS_TOGETHER:
        judgeval_logger.error(
            "Cannot wrap Together client: 'together' library not installed. "
            "Install it with: pip install together"
        )
        return client

    from together import Together, AsyncTogether  # type: ignore[import-untyped]

    # Check the async variant first, then the sync one; guard-style returns
    # instead of an if/elif/else ladder.
    if isinstance(client, AsyncTogether):
        return wrap_together_client_async(tracer, client)
    if isinstance(client, Together):
        return wrap_together_client_sync(tracer, client)
    raise TypeError(f"Invalid client type: {type(client)}")
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from __future__ import annotations
from typing import Any, TypeAlias

# Availability flags re-exported from each provider's config module; each
# flag reflects whether that provider's client library is importable.
from judgeval.v1.instrumentation.llm.llm_openai.config import HAS_OPENAI
from judgeval.v1.instrumentation.llm.llm_together.config import HAS_TOGETHER
from judgeval.v1.instrumentation.llm.llm_anthropic.config import HAS_ANTHROPIC
from judgeval.v1.instrumentation.llm.llm_google.config import HAS_GOOGLE_GENAI

# TODO: if we support dependency groups we can have this better type, but during runtime, we do
# not know which clients an end user might have installed.
# Alias for "any supported provider client instance".
ApiClient: TypeAlias = Any

__all__ = [
    "ApiClient",
    "HAS_OPENAI",
    "HAS_TOGETHER",
    "HAS_ANTHROPIC",
    "HAS_GOOGLE_GENAI",
]
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import TYPE_CHECKING
|
|
3
|
+
import sys
|
|
4
|
+
from judgeval.logger import judgeval_logger
|
|
5
|
+
|
|
6
|
+
if TYPE_CHECKING:
|
|
7
|
+
from judgeval.v1.tracer.base_tracer import BaseTracer
|
|
8
|
+
|
|
9
|
+
__all__ = ["setup_claude_agent_sdk"]
|
|
10
|
+
|
|
11
|
+
# Hard requirement: this integration module cannot function without the SDK,
# so fail at import time with an actionable message rather than later.
try:
    import claude_agent_sdk  # type: ignore
except ImportError:
    raise ImportError(
        "Claude Agent SDK is not installed and required for the claude agent sdk integration. Please install it with `pip install claude-agent-sdk`."
    )
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _propagate_patch(original: object, wrapped: object, attr_name: str) -> None:
    """Rebind ``attr_name`` to ``wrapped`` in every loaded module that still
    references ``original``.

    ``from claude_agent_sdk import X`` binds ``X`` in the importer's own
    namespace at import time, so patching the ``claude_agent_sdk`` module
    alone would miss modules that imported the symbol earlier.
    """
    for module in list(sys.modules.values()):
        if module and getattr(module, attr_name, None) is original:
            setattr(module, attr_name, wrapped)


def setup_claude_agent_sdk(
    tracer: "BaseTracer",
) -> bool:
    """
    Setup Judgeval integration with Claude Agent SDK. Will automatically patch the SDK for automatic tracing.

    Args:
        tracer: Judgeval Tracer instance

    Returns:
        bool: True if setup was successful, False otherwise.

    Example:
        ```python
        import claude_agent_sdk
        from judgeval.v1.integrations.claude_agent_sdk import setup_claude_agent_sdk

        tracer = Tracer(project_name="my-project")
        setup_claude_agent_sdk(tracer=tracer)

        # Now use claude_agent_sdk normally - all calls automatically traced
        ```
    """
    from judgeval.v1.integrations.claude_agent_sdk.wrapper import (
        _create_client_wrapper_class,
        _create_tool_wrapper_class,
        _wrap_tool_factory,
        _wrap_query_function,
    )

    try:
        # Each SDK symbol may be absent depending on the installed SDK
        # version, so look them up defensively before patching.
        original_client = getattr(claude_agent_sdk, "ClaudeSDKClient", None)
        original_tool_class = getattr(claude_agent_sdk, "SdkMcpTool", None)
        original_tool_fn = getattr(claude_agent_sdk, "tool", None)
        original_query_fn = getattr(claude_agent_sdk, "query", None)

        # Patch ClaudeSDKClient
        if original_client:
            wrapped_client = _create_client_wrapper_class(original_client, tracer)
            claude_agent_sdk.ClaudeSDKClient = wrapped_client  # type: ignore
            _propagate_patch(original_client, wrapped_client, "ClaudeSDKClient")

        # Patch SdkMcpTool
        if original_tool_class:
            wrapped_tool_class = _create_tool_wrapper_class(original_tool_class, tracer)
            claude_agent_sdk.SdkMcpTool = wrapped_tool_class  # type: ignore
            _propagate_patch(original_tool_class, wrapped_tool_class, "SdkMcpTool")

        # Patch tool() decorator
        if original_tool_fn:
            wrapped_tool_fn = _wrap_tool_factory(original_tool_fn, tracer)
            claude_agent_sdk.tool = wrapped_tool_fn  # type: ignore
            _propagate_patch(original_tool_fn, wrapped_tool_fn, "tool")

        # Patch standalone query() function if it exists.
        # Note: The standalone query() uses InternalClient, not ClaudeSDKClient,
        # so we need to wrap it separately to add tracing.
        if original_query_fn:
            wrapped_query_fn = _wrap_query_function(original_query_fn, tracer)
            claude_agent_sdk.query = wrapped_query_fn  # type: ignore
            _propagate_patch(original_query_fn, wrapped_query_fn, "query")

        judgeval_logger.info("Claude Agent SDK integration setup successful")
        return True

    except Exception as e:
        judgeval_logger.error(f"Failed to setup Claude Agent SDK integration: {e}")
        return False
|