judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- judgeval/__init__.py +177 -12
- judgeval/api/__init__.py +519 -0
- judgeval/api/api_types.py +407 -0
- judgeval/cli.py +79 -0
- judgeval/constants.py +76 -47
- judgeval/data/__init__.py +3 -3
- judgeval/data/evaluation_run.py +125 -0
- judgeval/data/example.py +15 -56
- judgeval/data/judgment_types.py +450 -0
- judgeval/data/result.py +29 -73
- judgeval/data/scorer_data.py +29 -62
- judgeval/data/scripts/fix_default_factory.py +23 -0
- judgeval/data/scripts/openapi_transform.py +123 -0
- judgeval/data/trace.py +121 -0
- judgeval/dataset/__init__.py +264 -0
- judgeval/env.py +52 -0
- judgeval/evaluation/__init__.py +344 -0
- judgeval/exceptions.py +27 -0
- judgeval/integrations/langgraph/__init__.py +13 -0
- judgeval/integrations/openlit/__init__.py +50 -0
- judgeval/judges/__init__.py +2 -3
- judgeval/judges/base_judge.py +2 -3
- judgeval/judges/litellm_judge.py +100 -20
- judgeval/judges/together_judge.py +101 -20
- judgeval/judges/utils.py +20 -24
- judgeval/logger.py +62 -0
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/__init__.py +18 -25
- judgeval/scorers/agent_scorer.py +17 -0
- judgeval/scorers/api_scorer.py +45 -41
- judgeval/scorers/base_scorer.py +83 -38
- judgeval/scorers/example_scorer.py +17 -0
- judgeval/scorers/exceptions.py +1 -0
- judgeval/scorers/judgeval_scorers/__init__.py +0 -148
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
- judgeval/scorers/score.py +77 -306
- judgeval/scorers/utils.py +4 -199
- judgeval/tracer/__init__.py +1122 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +40 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +59 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +63 -0
- judgeval/tracer/llm/__init__.py +7 -0
- judgeval/tracer/llm/config.py +78 -0
- judgeval/tracer/llm/constants.py +9 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/config.py +6 -0
- judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
- judgeval/tracer/llm/llm_google/__init__.py +3 -0
- judgeval/tracer/llm/llm_google/config.py +6 -0
- judgeval/tracer/llm/llm_google/generate_content.py +127 -0
- judgeval/tracer/llm/llm_google/wrapper.py +30 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
- judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
- judgeval/tracer/llm/llm_openai/config.py +6 -0
- judgeval/tracer/llm/llm_openai/responses.py +506 -0
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
- judgeval/tracer/llm/llm_together/__init__.py +3 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
- judgeval/tracer/llm/llm_together/config.py +6 -0
- judgeval/tracer/llm/llm_together/wrapper.py +52 -0
- judgeval/tracer/llm/providers.py +19 -0
- judgeval/tracer/managers.py +167 -0
- judgeval/tracer/processors/__init__.py +220 -0
- judgeval/tracer/utils.py +19 -0
- judgeval/trainer/__init__.py +14 -0
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +128 -0
- judgeval/trainer/console.py +144 -0
- judgeval/trainer/fireworks_trainer.py +396 -0
- judgeval/trainer/trainable_model.py +243 -0
- judgeval/trainer/trainer.py +70 -0
- judgeval/utils/async_utils.py +39 -0
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +37 -0
- judgeval/utils/decorators/use_once.py +13 -0
- judgeval/utils/file_utils.py +97 -0
- judgeval/utils/guards.py +36 -0
- judgeval/utils/meta.py +27 -0
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +253 -0
- judgeval/utils/testing.py +70 -0
- judgeval/utils/url.py +10 -0
- judgeval/utils/version_check.py +28 -0
- judgeval/utils/wrappers/README.md +3 -0
- judgeval/utils/wrappers/__init__.py +15 -0
- judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
- judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
- judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
- judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
- judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
- judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
- judgeval/utils/wrappers/py.typed +0 -0
- judgeval/utils/wrappers/utils.py +35 -0
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- judgeval-0.22.2.dist-info/METADATA +265 -0
- judgeval-0.22.2.dist-info/RECORD +112 -0
- judgeval-0.22.2.dist-info/entry_points.txt +2 -0
- judgeval/clients.py +0 -39
- judgeval/common/__init__.py +0 -8
- judgeval/common/exceptions.py +0 -28
- judgeval/common/logger.py +0 -189
- judgeval/common/tracer.py +0 -798
- judgeval/common/utils.py +0 -763
- judgeval/data/api_example.py +0 -111
- judgeval/data/datasets/__init__.py +0 -5
- judgeval/data/datasets/dataset.py +0 -286
- judgeval/data/datasets/eval_dataset_client.py +0 -193
- judgeval/data/datasets/ground_truth.py +0 -54
- judgeval/data/datasets/utils.py +0 -74
- judgeval/evaluation_run.py +0 -132
- judgeval/judges/mixture_of_judges.py +0 -248
- judgeval/judgment_client.py +0 -354
- judgeval/run_evaluation.py +0 -439
- judgeval/scorers/judgeval_scorer.py +0 -140
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
- judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
- judgeval/scorers/prompt_scorer.py +0 -439
- judgeval-0.0.11.dist-info/METADATA +0 -36
- judgeval-0.0.11.dist-info/RECORD +0 -84
- {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
- {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
--- /dev/null
+++ judgeval/tracer/llm/llm_together/chat_completions.py
@@ -0,0 +1,406 @@
+from __future__ import annotations
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Awaitable,
+    Callable,
+    Dict,
+    Iterator,
+    AsyncIterator,
+    Generator,
+    AsyncGenerator,
+)
+
+from judgeval.tracer.keys import AttributeKeys
+from judgeval.tracer.utils import set_span_attribute
+from judgeval.utils.serialize import safe_serialize
+from judgeval.utils.wrappers import (
+    immutable_wrap_async,
+    immutable_wrap_sync,
+    mutable_wrap_sync,
+    mutable_wrap_async,
+    immutable_wrap_sync_iterator,
+    immutable_wrap_async_iterator,
+)
+
+if TYPE_CHECKING:
+    from judgeval.tracer import Tracer
+    from together import Together, AsyncTogether  # type: ignore[import-untyped]
+    from together.types import ChatCompletionResponse, ChatCompletionChunk  # type: ignore[import-untyped]
+    from together.types.common import UsageData  # type: ignore[import-untyped]
+
+
+def _extract_together_tokens(usage: UsageData) -> tuple[int, int, int, int]:
+    prompt_tokens = usage.prompt_tokens if usage.prompt_tokens is not None else 0
+    completion_tokens = (
+        usage.completion_tokens if usage.completion_tokens is not None else 0
+    )
+    cache_read_input_tokens = 0
+    cache_creation_input_tokens = 0
+    return (
+        prompt_tokens,
+        completion_tokens,
+        cache_read_input_tokens,
+        cache_creation_input_tokens,
+    )
+
+
+def wrap_chat_completions_create_sync(tracer: Tracer, client: Together) -> None:
+    original_func = client.chat.completions.create
+
+    def dispatcher(*args: Any, **kwargs: Any) -> Any:
+        if kwargs.get("stream", False):
+            return _wrap_streaming_sync(tracer, original_func)(*args, **kwargs)  # type: ignore[arg-type]
+        return _wrap_non_streaming_sync(tracer, original_func)(*args, **kwargs)  # type: ignore[arg-type]
+
+    setattr(client.chat.completions, "create", dispatcher)
+
+
+def _wrap_non_streaming_sync(
+    tracer: Tracer, original_func: Callable[..., ChatCompletionResponse]
+) -> Callable[..., ChatCompletionResponse]:
+    def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
+        ctx["span"] = tracer.get_tracer().start_span(
+            "TOGETHER_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
+        )
+        tracer._inject_judgment_context(ctx["span"])
+        set_span_attribute(
+            ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
+        )
+        ctx["model_name"] = kwargs.get("model", "")
+        prefixed_model_name = (
+            f"together_ai/{ctx['model_name']}" if ctx["model_name"] else ""
+        )
+        ctx["model_name"] = prefixed_model_name
+        set_span_attribute(
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, prefixed_model_name
+        )
+
+    def post_hook(ctx: Dict[str, Any], result: ChatCompletionResponse) -> None:
+        span = ctx.get("span")
+        if not span:
+            return
+
+        set_span_attribute(
+            span, AttributeKeys.GEN_AI_COMPLETION, safe_serialize(result)
+        )
+
+        if result.usage:
+            prompt_tokens, completion_tokens, _, _ = _extract_together_tokens(
+                result.usage
+            )
+            set_span_attribute(
+                span,
+                AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+                prompt_tokens,
+            )
+            set_span_attribute(
+                span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
+            )
+            set_span_attribute(
+                span,
+                AttributeKeys.JUDGMENT_USAGE_METADATA,
+                safe_serialize(result.usage),
+            )
+
+        set_span_attribute(
+            span,
+            AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
+            ctx["model_name"],
+        )
+
+    def error_hook(ctx: Dict[str, Any], error: Exception) -> None:
+        span = ctx.get("span")
+        if span:
+            span.record_exception(error)
+
+    def finally_hook(ctx: Dict[str, Any]) -> None:
+        span = ctx.get("span")
+        if span:
+            span.end()
+
+    return immutable_wrap_sync(
+        original_func,
+        pre_hook=pre_hook,
+        post_hook=post_hook,
+        error_hook=error_hook,
+        finally_hook=finally_hook,
+    )
+
+
+def _wrap_streaming_sync(
+    tracer: Tracer, original_func: Callable[..., Iterator[ChatCompletionChunk]]
+) -> Callable[..., Iterator[ChatCompletionChunk]]:
+    def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
+        ctx["span"] = tracer.get_tracer().start_span(
+            "TOGETHER_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
+        )
+        tracer._inject_judgment_context(ctx["span"])
+        set_span_attribute(
+            ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
+        )
+        ctx["model_name"] = kwargs.get("model", "")
+        prefixed_model_name = (
+            f"together_ai/{ctx['model_name']}" if ctx["model_name"] else ""
+        )
+        ctx["model_name"] = prefixed_model_name
+        set_span_attribute(
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, prefixed_model_name
+        )
+        ctx["accumulated_content"] = ""
+
+    def mutate_hook(
+        ctx: Dict[str, Any], result: Iterator[ChatCompletionChunk]
+    ) -> Iterator[ChatCompletionChunk]:
+        def traced_generator() -> Generator[ChatCompletionChunk, None, None]:
+            for chunk in result:
+                yield chunk
+
+        def yield_hook(inner_ctx: Dict[str, Any], chunk: ChatCompletionChunk) -> None:
+            span = ctx.get("span")
+            if not span:
+                return
+
+            if chunk.choices and len(chunk.choices) > 0:
+                delta = chunk.choices[0].delta
+                if delta and hasattr(delta, "content") and delta.content:
+                    ctx["accumulated_content"] = (
+                        ctx.get("accumulated_content", "") + delta.content
+                    )
+
+            if chunk.usage:
+                prompt_tokens, completion_tokens, _, _ = _extract_together_tokens(
+                    chunk.usage
+                )
+                set_span_attribute(
+                    span,
+                    AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+                    prompt_tokens,
+                )
+                set_span_attribute(
+                    span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
+                )
+                set_span_attribute(
+                    span,
+                    AttributeKeys.JUDGMENT_USAGE_METADATA,
+                    safe_serialize(chunk.usage),
+                )
+
+        def post_hook_inner(inner_ctx: Dict[str, Any]) -> None:
+            span = ctx.get("span")
+            if span:
+                accumulated = ctx.get("accumulated_content", "")
+                set_span_attribute(span, AttributeKeys.GEN_AI_COMPLETION, accumulated)
+
+        def error_hook_inner(inner_ctx: Dict[str, Any], error: Exception) -> None:
+            span = ctx.get("span")
+            if span:
+                span.record_exception(error)
+
+        def finally_hook_inner(inner_ctx: Dict[str, Any]) -> None:
+            span = ctx.get("span")
+            if span:
+                span.end()
+
+        wrapped_generator = immutable_wrap_sync_iterator(
+            traced_generator,
+            yield_hook=yield_hook,
+            post_hook=post_hook_inner,
+            error_hook=error_hook_inner,
+            finally_hook=finally_hook_inner,
+        )
+
+        return wrapped_generator()
+
+    def error_hook(ctx: Dict[str, Any], error: Exception) -> None:
+        span = ctx.get("span")
+        if span:
+            span.record_exception(error)
+
+    return mutable_wrap_sync(
+        original_func,
+        pre_hook=pre_hook,
+        mutate_hook=mutate_hook,
+        error_hook=error_hook,
+    )
+
+
+def wrap_chat_completions_create_async(tracer: Tracer, client: AsyncTogether) -> None:
+    original_func = client.chat.completions.create
+
+    async def dispatcher(*args: Any, **kwargs: Any) -> Any:
+        if kwargs.get("stream", False):
+            return await _wrap_streaming_async(tracer, original_func)(*args, **kwargs)  # type: ignore[arg-type]
+        return await _wrap_non_streaming_async(tracer, original_func)(*args, **kwargs)  # type: ignore[arg-type]
+
+    setattr(client.chat.completions, "create", dispatcher)
+
+
+def _wrap_non_streaming_async(
+    tracer: Tracer, original_func: Callable[..., Awaitable[ChatCompletionResponse]]
+) -> Callable[..., Awaitable[ChatCompletionResponse]]:
+    def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
+        ctx["span"] = tracer.get_tracer().start_span(
+            "TOGETHER_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
+        )
+        tracer._inject_judgment_context(ctx["span"])
+        set_span_attribute(
+            ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
+        )
+        ctx["model_name"] = kwargs.get("model", "")
+        prefixed_model_name = (
+            f"together_ai/{ctx['model_name']}" if ctx["model_name"] else ""
+        )
+        ctx["model_name"] = prefixed_model_name
+        set_span_attribute(
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, prefixed_model_name
+        )
+
+    def post_hook(ctx: Dict[str, Any], result: ChatCompletionResponse) -> None:
+        span = ctx.get("span")
+        if not span:
+            return
+
+        set_span_attribute(
+            span, AttributeKeys.GEN_AI_COMPLETION, safe_serialize(result)
+        )
+
+        if result.usage:
+            prompt_tokens, completion_tokens, _, _ = _extract_together_tokens(
+                result.usage
+            )
+            set_span_attribute(
+                span,
+                AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+                prompt_tokens,
+            )
+            set_span_attribute(
+                span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
+            )
+            set_span_attribute(
+                span,
+                AttributeKeys.JUDGMENT_USAGE_METADATA,
+                safe_serialize(result.usage),
+            )
+
+        set_span_attribute(
+            span,
+            AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
+            ctx["model_name"],
+        )
+
+    def error_hook(ctx: Dict[str, Any], error: Exception) -> None:
+        span = ctx.get("span")
+        if span:
+            span.record_exception(error)
+
+    def finally_hook(ctx: Dict[str, Any]) -> None:
+        span = ctx.get("span")
+        if span:
+            span.end()
+
+    return immutable_wrap_async(
+        original_func,
+        pre_hook=pre_hook,
+        post_hook=post_hook,
+        error_hook=error_hook,
+        finally_hook=finally_hook,
+    )
+
+
+def _wrap_streaming_async(
+    tracer: Tracer,
+    original_func: Callable[..., Awaitable[AsyncIterator[ChatCompletionChunk]]],
+) -> Callable[..., Awaitable[AsyncIterator[ChatCompletionChunk]]]:
+    def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
+        ctx["span"] = tracer.get_tracer().start_span(
+            "TOGETHER_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
+        )
+        tracer._inject_judgment_context(ctx["span"])
+        set_span_attribute(
+            ctx["span"], AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs)
+        )
+        ctx["model_name"] = kwargs.get("model", "")
+        prefixed_model_name = (
+            f"together_ai/{ctx['model_name']}" if ctx["model_name"] else ""
+        )
+        ctx["model_name"] = prefixed_model_name
+        set_span_attribute(
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, prefixed_model_name
+        )
+        ctx["accumulated_content"] = ""
+
+    def mutate_hook(
+        ctx: Dict[str, Any], result: AsyncIterator[ChatCompletionChunk]
+    ) -> AsyncIterator[ChatCompletionChunk]:
+        async def traced_generator() -> AsyncGenerator[ChatCompletionChunk, None]:
+            async for chunk in result:
+                yield chunk
+
+        def yield_hook(inner_ctx: Dict[str, Any], chunk: ChatCompletionChunk) -> None:
+            span = ctx.get("span")
+            if not span:
+                return
+
+            if chunk.choices and len(chunk.choices) > 0:
+                delta = chunk.choices[0].delta
+                if delta and hasattr(delta, "content") and delta.content:
+                    ctx["accumulated_content"] = (
+                        ctx.get("accumulated_content", "") + delta.content
+                    )
+
+            if chunk.usage:
+                prompt_tokens, completion_tokens, _, _ = _extract_together_tokens(
+                    chunk.usage
+                )
+                set_span_attribute(
+                    span,
+                    AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+                    prompt_tokens,
+                )
+                set_span_attribute(
+                    span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
+                )
+                set_span_attribute(
+                    span,
+                    AttributeKeys.JUDGMENT_USAGE_METADATA,
+                    safe_serialize(chunk.usage),
+                )
+
+        def post_hook_inner(inner_ctx: Dict[str, Any]) -> None:
+            span = ctx.get("span")
+            if span:
+                accumulated = ctx.get("accumulated_content", "")
+                set_span_attribute(span, AttributeKeys.GEN_AI_COMPLETION, accumulated)
+
+        def error_hook_inner(inner_ctx: Dict[str, Any], error: Exception) -> None:
+            span = ctx.get("span")
+            if span:
+                span.record_exception(error)
+
+        def finally_hook_inner(inner_ctx: Dict[str, Any]) -> None:
+            span = ctx.get("span")
+            if span:
+                span.end()
+
+        wrapped_generator = immutable_wrap_async_iterator(
+            traced_generator,
+            yield_hook=yield_hook,
+            post_hook=post_hook_inner,
+            error_hook=error_hook_inner,
+            finally_hook=finally_hook_inner,
+        )
+
+        return wrapped_generator()
+
+    def error_hook(ctx: Dict[str, Any], error: Exception) -> None:
+        span = ctx.get("span")
+        if span:
+            span.record_exception(error)
+
+    return mutable_wrap_async(
+        original_func,
+        pre_hook=pre_hook,
+        mutate_hook=mutate_hook,
+        error_hook=error_hook,
+    )
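The instrumentation above is installed by monkey-patching client.chat.completions.create with a dispatcher that routes to the streaming or non-streaming wrapper based on the stream kwarg. A minimal sketch of how this behaves once wired up; Tracer construction is not shown in this diff, so the bare Tracer() call and the model name below are assumptions:

    from together import Together
    from judgeval.tracer import Tracer
    from judgeval.tracer.llm.llm_together.chat_completions import (
        wrap_chat_completions_create_sync,
    )

    tracer = Tracer()   # assumption: a Tracer configured elsewhere
    client = Together()  # the together SDK reads TOGETHER_API_KEY from the environment

    # Replaces client.chat.completions.create with the tracing dispatcher.
    wrap_chat_completions_create_sync(tracer, client)

    # Routed through _wrap_non_streaming_sync: one span per call, with the
    # serialized prompt, completion, and token usage set as span attributes.
    resp = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",  # illustrative model name
        messages=[{"role": "user", "content": "Hello"}],
    )

    # Routed through _wrap_streaming_sync: delta content is accumulated chunk
    # by chunk and written to GEN_AI_COMPLETION when the stream is exhausted.
    for chunk in client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        messages=[{"role": "user", "content": "Hello"}],
        stream=True,
    ):
        pass

Note the span-lifecycle difference: the non-streaming wrapper ends its span in finally_hook as soon as the call returns, while the streaming wrapper keeps the span open until the consumer finishes (or abandons) the iterator, since Together reports usage only on the final chunk.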
--- /dev/null
+++ judgeval/tracer/llm/llm_together/wrapper.py
@@ -0,0 +1,52 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING, Union
+import typing
+
+from judgeval.tracer.llm.llm_together.chat_completions import (
+    wrap_chat_completions_create_sync,
+    wrap_chat_completions_create_async,
+)
+
+
+if TYPE_CHECKING:
+    from judgeval.tracer import Tracer
+    from together import Together, AsyncTogether  # type: ignore[import-untyped]
+
+    TClient = Union[Together, AsyncTogether]
+
+
+def wrap_together_client_sync(tracer: Tracer, client: Together) -> Together:
+    wrap_chat_completions_create_sync(tracer, client)
+    return client
+
+
+def wrap_together_client_async(tracer: Tracer, client: AsyncTogether) -> AsyncTogether:
+    wrap_chat_completions_create_async(tracer, client)
+    return client
+
+
+@typing.overload
+def wrap_together_client(tracer: Tracer, client: Together) -> Together: ...
+@typing.overload
+def wrap_together_client(tracer: Tracer, client: AsyncTogether) -> AsyncTogether: ...  # type: ignore[overload-cannot-match]
+
+
+def wrap_together_client(tracer: Tracer, client: TClient) -> TClient:
+    from judgeval.tracer.llm.llm_together.config import HAS_TOGETHER
+    from judgeval.logger import judgeval_logger
+
+    if not HAS_TOGETHER:
+        judgeval_logger.error(
+            "Cannot wrap Together client: 'together' library not installed. "
+            "Install it with: pip install together"
+        )
+        return client
+
+    from together import Together, AsyncTogether  # type: ignore[import-untyped]
+
+    if isinstance(client, AsyncTogether):
+        return wrap_together_client_async(tracer, client)
+    elif isinstance(client, Together):
+        return wrap_together_client_sync(tracer, client)
+    else:
+        raise TypeError(f"Invalid client type: {type(client)}")
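wrap_together_client is the single public entry point; the typing.overload stubs exist so type checkers preserve the concrete client type across the call. A short usage sketch, again assuming an already-configured Tracer:

    from together import Together, AsyncTogether
    from judgeval.tracer import Tracer
    from judgeval.tracer.llm.llm_together.wrapper import wrap_together_client

    tracer = Tracer()  # assumption: configured elsewhere

    sync_client = wrap_together_client(tracer, Together())        # sync patch path
    async_client = wrap_together_client(tracer, AsyncTogether())  # async patch path

    # Any other type raises: TypeError("Invalid client type: ...")

One design detail: if the 'together' package is not installed, the function logs an error and returns the client unwrapped rather than raising, so tracing degrades gracefully for users without the optional dependency.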
--- /dev/null
+++ judgeval/tracer/llm/providers.py
@@ -0,0 +1,19 @@
+from __future__ import annotations
+from typing import Any, TypeAlias
+
+from judgeval.tracer.llm.llm_openai.config import HAS_OPENAI
+from judgeval.tracer.llm.llm_together.config import HAS_TOGETHER
+from judgeval.tracer.llm.llm_anthropic.config import HAS_ANTHROPIC
+from judgeval.tracer.llm.llm_google.config import HAS_GOOGLE_GENAI
+
+# TODO: if we support dependency groups we can have this better type, but during runtime, we do
+# not know which clients an end user might have installed.
+ApiClient: TypeAlias = Any
+
+__all__ = [
+    "ApiClient",
+    "HAS_OPENAI",
+    "HAS_TOGETHER",
+    "HAS_ANTHROPIC",
+    "HAS_GOOGLE_GENAI",
+]
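providers.py centralizes the optional-dependency flags so downstream code can branch on the HAS_* booleans instead of attempting provider imports itself; ApiClient is Any because the installed client set is only known at runtime. A hypothetical consumer (maybe_wrap_together is illustrative, not part of the package):

    from judgeval.tracer.llm.providers import HAS_TOGETHER, ApiClient

    def maybe_wrap_together(tracer, client: ApiClient) -> ApiClient:
        # Only touch the client when the optional SDK is actually installed;
        # otherwise hand it back untouched.
        if HAS_TOGETHER:
            from judgeval.tracer.llm.llm_together.wrapper import wrap_together_client
            return wrap_together_client(tracer, client)
        return client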
--- /dev/null
+++ judgeval/tracer/managers.py
@@ -0,0 +1,167 @@
+from __future__ import annotations
+
+from contextlib import asynccontextmanager, contextmanager
+from typing import TYPE_CHECKING, Dict, Optional, List, Any
+from judgeval.tracer.keys import InternalAttributeKeys
+import uuid
+from judgeval.exceptions import JudgmentRuntimeError
+
+if TYPE_CHECKING:
+    from judgeval.tracer import Tracer
+
+
+@contextmanager
+def sync_span_context(
+    tracer: Tracer,
+    name: str,
+    span_attributes: Optional[Dict[str, str]] = None,
+    disable_partial_emit: bool = False,
+    end_on_exit: bool = False,
+):
+    if span_attributes is None:
+        span_attributes = {}
+
+    with tracer.get_tracer().start_as_current_span(
+        name=name,
+        attributes=span_attributes,
+        end_on_exit=end_on_exit,
+    ) as span:
+        if disable_partial_emit:
+            tracer.judgment_processor.set_internal_attribute(
+                span_context=span.get_span_context(),
+                key=InternalAttributeKeys.DISABLE_PARTIAL_EMIT,
+                value=True,
+            )
+        yield span
+
+
+@asynccontextmanager
+async def async_span_context(
+    tracer: Tracer,
+    name: str,
+    span_attributes: Optional[Dict[str, str]] = None,
+    disable_partial_emit: bool = False,
+    end_on_exit: bool = False,
+):
+    if span_attributes is None:
+        span_attributes = {}
+
+    with tracer.get_tracer().start_as_current_span(
+        name=name,
+        attributes=span_attributes,
+        end_on_exit=end_on_exit,
+    ) as span:
+        if disable_partial_emit:
+            tracer.judgment_processor.set_internal_attribute(
+                span_context=span.get_span_context(),
+                key=InternalAttributeKeys.DISABLE_PARTIAL_EMIT,
+                value=True,
+            )
+        yield span
+
+
+def create_agent_context(
+    tracer: Tracer,
+    args: tuple,
+    class_name: Optional[str] = None,
+    identifier: Optional[str] = None,
+    track_state: bool = False,
+    track_attributes: Optional[List[str]] = None,
+    field_mappings: Optional[Dict[str, str]] = None,
+):
+    """Create agent context and return token for cleanup"""
+    agent_id = str(uuid.uuid4())
+    agent_context: Dict[str, Any] = {"agent_id": agent_id}
+
+    if class_name:
+        agent_context["class_name"] = class_name
+    else:
+        agent_context["class_name"] = None
+
+    agent_context["track_state"] = track_state
+    agent_context["track_attributes"] = track_attributes or []
+    agent_context["field_mappings"] = field_mappings or {}
+
+    instance = args[0] if args else None
+    agent_context["instance"] = instance
+
+    if identifier:
+        if not class_name or not instance or not isinstance(instance, object):
+            raise JudgmentRuntimeError(
+                "'identifier' is set but no class name or instance is available. 'identifier' can only be specified when using the agent() decorator on a class method."
+            )
+        if (
+            instance
+            and hasattr(instance, identifier)
+            and not callable(getattr(instance, identifier))
+        ):
+            instance_name = str(getattr(instance, identifier))
+            agent_context["instance_name"] = instance_name
+        else:
+            raise JudgmentRuntimeError(
+                f"Attribute {identifier} does not exist for {class_name}. Check your agent() decorator."
+            )
+    else:
+        agent_context["instance_name"] = None
+
+    current_agent_context = tracer.get_current_agent_context().get()
+    if current_agent_context and "agent_id" in current_agent_context:
+        agent_context["parent_agent_id"] = current_agent_context["agent_id"]
+    else:
+        agent_context["parent_agent_id"] = None
+
+    agent_context["is_agent_entry_point"] = True
+    token = tracer.get_current_agent_context().set(agent_context)  # type: ignore
+    return token
+
+
+@contextmanager
+def sync_agent_context(
+    tracer: Tracer,
+    args: tuple,
+    class_name: Optional[str] = None,
+    identifier: Optional[str] = None,
+    track_state: bool = False,
+    track_attributes: Optional[List[str]] = None,
+    field_mappings: Optional[Dict[str, str]] = None,
+):
+    """Context manager for synchronous agent context"""
+    token = create_agent_context(
+        tracer=tracer,
+        args=args,
+        class_name=class_name,
+        identifier=identifier,
+        track_state=track_state,
+        track_attributes=track_attributes,
+        field_mappings=field_mappings,
+    )
+    try:
+        yield
+    finally:
+        tracer.get_current_agent_context().reset(token)
+
+
+@asynccontextmanager
+async def async_agent_context(
+    tracer: Tracer,
+    args: tuple,
+    class_name: Optional[str] = None,
+    identifier: Optional[str] = None,
+    track_state: bool = False,
+    track_attributes: Optional[List[str]] = None,
+    field_mappings: Optional[Dict[str, str]] = None,
+):
+    """Context manager for asynchronous agent context"""
+    token = create_agent_context(
+        tracer=tracer,
+        args=args,
+        class_name=class_name,
+        identifier=identifier,
+        track_state=track_state,
+        track_attributes=track_attributes,
+        field_mappings=field_mappings,
+    )
+    try:
+        yield
+    finally:
+        tracer.get_current_agent_context().reset(token)
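managers.py pairs each agent context manager with a ContextVar token so agent context nests correctly across sync and async code: create_agent_context pushes a new context (generating an agent_id, resolving instance_name from the identifier attribute, and linking parent_agent_id from any enclosing agent), and the sync/async wrappers reset the token on exit. A sketch of how the pieces compose; Tracer construction and the Assistant class are assumptions for illustration:

    from judgeval.tracer import Tracer
    from judgeval.tracer.managers import sync_agent_context, sync_span_context

    tracer = Tracer()  # assumption: configured elsewhere

    class Assistant:
        name = "support-bot"

        def run(self, query: str) -> str:
            # Pushes an agent context with class_name="Assistant" and
            # instance_name="support-bot" (read from the non-callable 'name'
            # attribute); the ContextVar token is reset when the block exits.
            with sync_agent_context(
                tracer, args=(self,), class_name="Assistant", identifier="name"
            ):
                # end_on_exit=True closes the span when the block exits; the
                # default (False) leaves ending the span to the caller.
                with sync_span_context(
                    tracer, "Assistant.run", end_on_exit=True
                ) as span:
                    span.set_attribute("query.length", len(query))
                    return "..."

Passing identifier for a plain function (no class_name/instance) or naming an attribute the instance lacks raises JudgmentRuntimeError, per the checks in create_agent_context.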