judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +173 -10
- judgeval/api/__init__.py +523 -0
- judgeval/api/api_types.py +413 -0
- judgeval/cli.py +112 -0
- judgeval/constants.py +7 -30
- judgeval/data/__init__.py +1 -3
- judgeval/data/evaluation_run.py +125 -0
- judgeval/data/example.py +14 -40
- judgeval/data/judgment_types.py +396 -146
- judgeval/data/result.py +11 -18
- judgeval/data/scorer_data.py +3 -26
- judgeval/data/scripts/openapi_transform.py +5 -5
- judgeval/data/trace.py +115 -194
- judgeval/dataset/__init__.py +335 -0
- judgeval/env.py +55 -0
- judgeval/evaluation/__init__.py +346 -0
- judgeval/exceptions.py +28 -0
- judgeval/integrations/langgraph/__init__.py +13 -0
- judgeval/integrations/openlit/__init__.py +51 -0
- judgeval/judges/__init__.py +2 -2
- judgeval/judges/litellm_judge.py +77 -16
- judgeval/judges/together_judge.py +88 -17
- judgeval/judges/utils.py +7 -20
- judgeval/judgment_attribute_keys.py +55 -0
- judgeval/{common/logger.py → logger.py} +24 -8
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/__init__.py +11 -11
- judgeval/scorers/agent_scorer.py +15 -19
- judgeval/scorers/api_scorer.py +21 -23
- judgeval/scorers/base_scorer.py +54 -36
- judgeval/scorers/example_scorer.py +1 -3
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
- judgeval/scorers/score.py +64 -47
- judgeval/scorers/utils.py +2 -107
- judgeval/tracer/__init__.py +1111 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +40 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +59 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +63 -0
- judgeval/tracer/llm/__init__.py +7 -0
- judgeval/tracer/llm/config.py +78 -0
- judgeval/tracer/llm/constants.py +9 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/config.py +6 -0
- judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
- judgeval/tracer/llm/llm_google/__init__.py +3 -0
- judgeval/tracer/llm/llm_google/config.py +6 -0
- judgeval/tracer/llm/llm_google/generate_content.py +127 -0
- judgeval/tracer/llm/llm_google/wrapper.py +30 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
- judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
- judgeval/tracer/llm/llm_openai/config.py +6 -0
- judgeval/tracer/llm/llm_openai/responses.py +506 -0
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
- judgeval/tracer/llm/llm_together/__init__.py +3 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
- judgeval/tracer/llm/llm_together/config.py +6 -0
- judgeval/tracer/llm/llm_together/wrapper.py +52 -0
- judgeval/tracer/llm/providers.py +19 -0
- judgeval/tracer/managers.py +167 -0
- judgeval/tracer/processors/__init__.py +220 -0
- judgeval/tracer/utils.py +19 -0
- judgeval/trainer/__init__.py +14 -0
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +123 -0
- judgeval/trainer/console.py +144 -0
- judgeval/trainer/fireworks_trainer.py +392 -0
- judgeval/trainer/trainable_model.py +252 -0
- judgeval/trainer/trainer.py +70 -0
- judgeval/utils/async_utils.py +39 -0
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +37 -0
- judgeval/utils/decorators/use_once.py +13 -0
- judgeval/utils/file_utils.py +74 -28
- judgeval/utils/guards.py +36 -0
- judgeval/utils/meta.py +27 -0
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +253 -0
- judgeval/utils/testing.py +70 -0
- judgeval/utils/url.py +10 -0
- judgeval/{version_check.py → utils/version_check.py} +5 -3
- judgeval/utils/wrappers/README.md +3 -0
- judgeval/utils/wrappers/__init__.py +15 -0
- judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
- judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
- judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
- judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
- judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
- judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
- judgeval/utils/wrappers/py.typed +0 -0
- judgeval/utils/wrappers/utils.py +35 -0
- judgeval/v1/__init__.py +88 -0
- judgeval/v1/data/__init__.py +7 -0
- judgeval/v1/data/example.py +44 -0
- judgeval/v1/data/scorer_data.py +42 -0
- judgeval/v1/data/scoring_result.py +44 -0
- judgeval/v1/datasets/__init__.py +6 -0
- judgeval/v1/datasets/dataset.py +214 -0
- judgeval/v1/datasets/dataset_factory.py +94 -0
- judgeval/v1/evaluation/__init__.py +6 -0
- judgeval/v1/evaluation/evaluation.py +182 -0
- judgeval/v1/evaluation/evaluation_factory.py +17 -0
- judgeval/v1/instrumentation/__init__.py +6 -0
- judgeval/v1/instrumentation/llm/__init__.py +7 -0
- judgeval/v1/instrumentation/llm/config.py +78 -0
- judgeval/v1/instrumentation/llm/constants.py +11 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
- judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
- judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
- judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
- judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
- judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
- judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
- judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
- judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
- judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
- judgeval/v1/instrumentation/llm/providers.py +19 -0
- judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
- judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
- judgeval/v1/integrations/langgraph/__init__.py +13 -0
- judgeval/v1/integrations/openlit/__init__.py +47 -0
- judgeval/v1/internal/api/__init__.py +525 -0
- judgeval/v1/internal/api/api_types.py +413 -0
- judgeval/v1/prompts/__init__.py +6 -0
- judgeval/v1/prompts/prompt.py +29 -0
- judgeval/v1/prompts/prompt_factory.py +189 -0
- judgeval/v1/py.typed +0 -0
- judgeval/v1/scorers/__init__.py +6 -0
- judgeval/v1/scorers/api_scorer.py +82 -0
- judgeval/v1/scorers/base_scorer.py +17 -0
- judgeval/v1/scorers/built_in/__init__.py +17 -0
- judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
- judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
- judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
- judgeval/v1/scorers/built_in/faithfulness.py +28 -0
- judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
- judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
- judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
- judgeval/v1/scorers/scorers_factory.py +49 -0
- judgeval/v1/tracer/__init__.py +7 -0
- judgeval/v1/tracer/base_tracer.py +520 -0
- judgeval/v1/tracer/exporters/__init__.py +14 -0
- judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
- judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
- judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
- judgeval/v1/tracer/exporters/span_store.py +50 -0
- judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
- judgeval/v1/tracer/processors/__init__.py +6 -0
- judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
- judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
- judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
- judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
- judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
- judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
- judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
- judgeval/v1/tracer/tracer.py +67 -0
- judgeval/v1/tracer/tracer_factory.py +38 -0
- judgeval/v1/trainers/__init__.py +5 -0
- judgeval/v1/trainers/base_trainer.py +62 -0
- judgeval/v1/trainers/config.py +123 -0
- judgeval/v1/trainers/console.py +144 -0
- judgeval/v1/trainers/fireworks_trainer.py +392 -0
- judgeval/v1/trainers/trainable_model.py +252 -0
- judgeval/v1/trainers/trainers_factory.py +37 -0
- judgeval/v1/utils.py +18 -0
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- judgeval-0.23.0.dist-info/METADATA +266 -0
- judgeval-0.23.0.dist-info/RECORD +201 -0
- judgeval-0.23.0.dist-info/entry_points.txt +2 -0
- judgeval/clients.py +0 -34
- judgeval/common/__init__.py +0 -13
- judgeval/common/api/__init__.py +0 -3
- judgeval/common/api/api.py +0 -352
- judgeval/common/api/constants.py +0 -165
- judgeval/common/exceptions.py +0 -27
- judgeval/common/storage/__init__.py +0 -6
- judgeval/common/storage/s3_storage.py +0 -98
- judgeval/common/tracer/__init__.py +0 -31
- judgeval/common/tracer/constants.py +0 -22
- judgeval/common/tracer/core.py +0 -1916
- judgeval/common/tracer/otel_exporter.py +0 -108
- judgeval/common/tracer/otel_span_processor.py +0 -234
- judgeval/common/tracer/span_processor.py +0 -37
- judgeval/common/tracer/span_transformer.py +0 -211
- judgeval/common/tracer/trace_manager.py +0 -92
- judgeval/common/utils.py +0 -940
- judgeval/data/datasets/__init__.py +0 -4
- judgeval/data/datasets/dataset.py +0 -341
- judgeval/data/datasets/eval_dataset_client.py +0 -214
- judgeval/data/tool.py +0 -5
- judgeval/data/trace_run.py +0 -37
- judgeval/evaluation_run.py +0 -75
- judgeval/integrations/langgraph.py +0 -843
- judgeval/judges/mixture_of_judges.py +0 -286
- judgeval/judgment_client.py +0 -369
- judgeval/rules.py +0 -521
- judgeval/run_evaluation.py +0 -684
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
- judgeval/utils/alerts.py +0 -93
- judgeval/utils/requests.py +0 -50
- judgeval-0.1.0.dist-info/METADATA +0 -202
- judgeval-0.1.0.dist-info/RECORD +0 -73
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -0,0 +1,564 @@
|
|
|
1
|
+
"""Wrapper implementation for Claude Agent SDK."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
import dataclasses
|
|
5
|
+
import threading
|
|
6
|
+
import time
|
|
7
|
+
from typing import (
|
|
8
|
+
TYPE_CHECKING,
|
|
9
|
+
Any,
|
|
10
|
+
AsyncGenerator,
|
|
11
|
+
Callable,
|
|
12
|
+
Dict,
|
|
13
|
+
List,
|
|
14
|
+
Optional,
|
|
15
|
+
Tuple,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
from opentelemetry import trace, context as otel_context
|
|
19
|
+
from opentelemetry.trace import set_span_in_context
|
|
20
|
+
|
|
21
|
+
from judgeval.tracer.keys import AttributeKeys
|
|
22
|
+
from judgeval.tracer.utils import set_span_attribute
|
|
23
|
+
from judgeval.utils.serialize import safe_serialize
|
|
24
|
+
|
|
25
|
+
if TYPE_CHECKING:
|
|
26
|
+
from judgeval.v1.tracer.tracer import BaseTracer
|
|
27
|
+
|
|
28
|
+
# Thread-local storage to propagate parent span context to tool handlers
|
|
29
|
+
# Claude Agent SDK breaks OpenTelemetry's automatic context propagation
|
|
30
|
+
# when executing tools, so we need to explicitly store and pass the context
|
|
31
|
+
_thread_local = threading.local()
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class LLMSpanTracker:
    """Tracks the lifecycle of sequential LLM spans over a Claude Agent SDK
    message stream.

    Per conversation turn:
      1. UserMessage (tool results)  -> remember when the next LLM call begins
      2. AssistantMessage            -> open a span at that remembered time,
                                        closing the previous span first
      3. ResultMessage               -> attach usage metrics to the open span

    Closing the prior span only when the next AssistantMessage arrives keeps
    the LLM spans strictly sequential (never overlapping).
    """

    def __init__(self, tracer: "BaseTracer", query_start_time: Optional[float] = None):
        self.tracer = tracer
        self.current_span: Optional[Any] = None
        self.current_span_context: Optional[Any] = None
        self.next_start_time: Optional[float] = query_start_time

    def start_llm_span(
        self, message: Any, prompt: Any, conversation_history: List[Dict[str, Any]]
    ) -> Optional[Dict[str, Any]]:
        """Open a span for *message*, first closing the previous span if any."""
        # Prefer the pre-marked start moment; fall back to "now".
        begin_at = time.time() if self.next_start_time is None else self.next_start_time

        # Close the prior span via its context manager (__exit__ ends it).
        previous_context = self.current_span_context
        if previous_context:
            previous_context.__exit__(None, None, None)

        final_content, span, span_context = _create_llm_span_for_messages(
            self.tracer,
            [message],
            prompt,
            conversation_history,
            start_time=begin_at,
        )
        self.current_span = span
        self.current_span_context = span_context
        # Start time is consumed; the next span must mark its own.
        self.next_start_time = None
        return final_content

    def mark_next_llm_start(self) -> None:
        """Record the instant the upcoming LLM call begins (after tool results)."""
        self.next_start_time = time.time()

    def log_usage(self, usage_metrics: Dict[str, Any]) -> None:
        """Attach usage metrics to the currently open LLM span, if any."""
        if not (self.current_span and usage_metrics):
            return
        for attr_key, attr_value in usage_metrics.items():
            set_span_attribute(self.current_span, attr_key, attr_value)

    def cleanup(self) -> None:
        """Close any span that is still open and reset the tracker state."""
        if self.current_span_context:
            self.current_span_context.__exit__(None, None, None)
        self.current_span = None
        self.current_span_context = None
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _create_client_wrapper_class(
    original_client_class: Any, tracer: "BaseTracer"
) -> Any:
    """Creates a wrapper class for ClaudeSDKClient that wraps query and receive_response.

    The returned subclass records the prompt and start time in ``query`` and
    emits an "agent" span around the message stream in ``receive_response``,
    delegating per-turn LLM span management to ``LLMSpanTracker``.
    """

    class WrappedClaudeSDKClient(original_client_class):  # type: ignore
        def __init__(self, *args: Any, **kwargs: Any):
            super().__init__(*args, **kwargs)
            # Double-underscore names are mangled to
            # _WrappedClaudeSDKClient__*, keeping this state private to the
            # wrapper and out of the wrapped class's way.
            self.__last_prompt: Optional[str] = None
            self.__query_start_time: Optional[float] = None

        async def query(self, *args: Any, **kwargs: Any) -> Any:
            """Wrap query to capture the prompt and start time for tracing."""
            # Capture the time when query is called (when LLM call starts)
            self.__query_start_time = time.time()

            # Capture the prompt for use in receive_response
            if args:
                self.__last_prompt = str(args[0])
            elif "prompt" in kwargs:
                self.__last_prompt = str(kwargs["prompt"])

            return await super().query(*args, **kwargs)

        async def receive_response(self) -> AsyncGenerator[Any, None]:
            """Wrap receive_response to add tracing with proper span hierarchy."""
            # assumes super().receive_response() returns an async iterator of
            # SDK message objects — TODO confirm against the SDK version in use
            generator = super().receive_response()

            # Create TASK span for the entire agent conversation
            agent_span_context = tracer.get_tracer().start_as_current_span(
                "Claude_Agent",
                attributes={
                    AttributeKeys.JUDGMENT_SPAN_KIND: "agent",
                },
            )
            # Entered manually (not `with`) because the span must stay open
            # across the whole async generator; closed in `finally` below.
            agent_span = agent_span_context.__enter__()

            # Record input
            if self.__last_prompt:
                set_span_attribute(
                    agent_span,
                    AttributeKeys.JUDGMENT_INPUT,
                    safe_serialize(self.__last_prompt),
                )

            # Store the parent span context in thread-local storage
            # Claude Agent SDK breaks OpenTelemetry's context propagation when executing tools,
            # so we need to explicitly store the context for tool handlers to access
            parent_context = set_span_in_context(agent_span, otel_context.get_current())
            _thread_local.parent_context = parent_context

            # Running conversation history; the last entry becomes the span's
            # recorded output.
            final_results: List[Dict[str, Any]] = []
            llm_tracker = LLMSpanTracker(
                tracer, query_start_time=self.__query_start_time
            )

            try:
                async for message in generator:
                    # Messages are dispatched on the type *name* so the SDK's
                    # message classes do not have to be imported here.
                    message_type = type(message).__name__

                    if message_type == "AssistantMessage":
                        final_content = llm_tracker.start_llm_span(
                            message, self.__last_prompt, final_results
                        )
                        if final_content:
                            final_results.append(final_content)

                    elif message_type == "UserMessage":
                        if hasattr(message, "content"):
                            content = _serialize_content_blocks(message.content)
                            final_results.append({"content": content, "role": "user"})

                        # Tool results just arrived: the next LLM call starts now.
                        llm_tracker.mark_next_llm_start()

                    elif message_type == "ResultMessage":
                        if hasattr(message, "usage"):
                            usage_metrics = _extract_usage_from_result_message(message)
                            llm_tracker.log_usage(usage_metrics)

                        # Copy selected result fields onto the agent span,
                        # skipping any that are absent.
                        result_metadata = {
                            k: v
                            for k, v in {
                                "num_turns": getattr(message, "num_turns", None),
                                "session_id": getattr(message, "session_id", None),
                            }.items()
                            if v is not None
                        }
                        if result_metadata:
                            for key, value in result_metadata.items():
                                set_span_attribute(agent_span, f"agent.{key}", value)

                    # Always forward the message to the caller unchanged.
                    yield message

                # Record output
                if final_results:
                    set_span_attribute(
                        agent_span,
                        AttributeKeys.JUDGMENT_OUTPUT,
                        safe_serialize(final_results[-1] if final_results else None),
                    )

            except Exception as e:
                agent_span.record_exception(e)
                raise
            finally:
                # The exception (if any) is already recorded above, so the
                # span context is exited without exc_info.
                llm_tracker.cleanup()
                agent_span_context.__exit__(None, None, None)
                # Clean up thread-local storage
                if hasattr(_thread_local, "parent_context"):
                    delattr(_thread_local, "parent_context")

    return WrappedClaudeSDKClient
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def _create_tool_wrapper_class(original_tool_class: Any, tracer: "BaseTracer") -> Any:
    """Builds an SdkMcpTool subclass whose handler is traced.

    The returned class transparently swaps the user-supplied handler for a
    traced wrapper before delegating construction to the original class.
    """

    class WrappedSdkMcpTool(original_tool_class):  # type: ignore
        def __init__(
            self,
            name: Any,
            description: Any,
            input_schema: Any,
            handler: Any,
            **kwargs: Any,
        ):
            traced_handler = _wrap_tool_handler(tracer, handler, name)
            super().__init__(name, description, input_schema, traced_handler, **kwargs)

        # Keep subscription (e.g. SdkMcpTool[T]) working on the wrapper class.
        # __class_getitem__ is implicitly a classmethod (PEP 560).
        def __class_getitem__(cls, params: Any) -> Any:  # type: ignore
            return cls

    return WrappedSdkMcpTool
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def _wrap_query_function(
    original_query_fn: Any, tracer: "BaseTracer"
) -> Callable[..., Any]:
    """Wraps the standalone query() function to add tracing.

    Returns an async generator that yields the original messages unchanged
    while recording an "agent" span around the whole exchange and per-turn
    LLM spans via ``LLMSpanTracker``.
    """

    async def wrapped_query(*args: Any, **kwargs: Any) -> Any:
        """Wrapped query function with automatic tracing."""
        # Create agent span for the query. Entered manually (not `with`)
        # because it must stay open across the async generator; closed in
        # `finally` below.
        agent_span_context = tracer.get_tracer().start_as_current_span(
            "Claude_Agent_Query",
            attributes={
                AttributeKeys.JUDGMENT_SPAN_KIND: "agent",
            },
        )
        agent_span = agent_span_context.__enter__()

        # Capture prompt if available
        # NOTE(review): `or` treats a falsy prompt kwarg (e.g. "") as absent
        # and falls through to the positional argument — confirm intended.
        prompt = kwargs.get("prompt") or (args[0] if args else None)
        if prompt and isinstance(prompt, str):
            set_span_attribute(
                agent_span, AttributeKeys.JUDGMENT_INPUT, safe_serialize(prompt)
            )

        # Store parent context for tool tracing; the SDK breaks OpenTelemetry
        # context propagation, so tool handlers read it from thread-local.
        parent_context = set_span_in_context(agent_span, otel_context.get_current())
        _thread_local.parent_context = parent_context

        # Running conversation history; the last entry becomes the span output.
        final_results: List[Dict[str, Any]] = []
        llm_tracker = LLMSpanTracker(tracer, query_start_time=time.time())

        try:
            # Call original query function
            async for message in original_query_fn(*args, **kwargs):
                # Dispatch on the type *name* so SDK message classes need not
                # be imported here.
                message_type = type(message).__name__

                if message_type == "AssistantMessage":
                    final_content = llm_tracker.start_llm_span(
                        message,
                        prompt if isinstance(prompt, str) else None,
                        final_results,
                    )
                    if final_content:
                        final_results.append(final_content)

                elif message_type == "UserMessage":
                    if hasattr(message, "content"):
                        content = _serialize_content_blocks(message.content)
                        final_results.append({"content": content, "role": "user"})

                    # Tool results just arrived: the next LLM call starts now.
                    llm_tracker.mark_next_llm_start()

                elif message_type == "ResultMessage":
                    if hasattr(message, "usage"):
                        usage_metrics = _extract_usage_from_result_message(message)
                        llm_tracker.log_usage(usage_metrics)

                    # Copy selected result fields onto the agent span,
                    # skipping any that are absent.
                    result_metadata = {
                        k: v
                        for k, v in {
                            "num_turns": getattr(message, "num_turns", None),
                            "session_id": getattr(message, "session_id", None),
                        }.items()
                        if v is not None
                    }
                    if result_metadata:
                        for key, value in result_metadata.items():
                            set_span_attribute(agent_span, f"agent.{key}", value)

                # Always forward the message to the caller unchanged.
                yield message

            # Record output
            if final_results:
                set_span_attribute(
                    agent_span,
                    AttributeKeys.JUDGMENT_OUTPUT,
                    safe_serialize(final_results[-1] if final_results else None),
                )

        except Exception as e:
            agent_span.record_exception(e)
            raise
        finally:
            # Exception (if any) was recorded above; exit without exc_info.
            llm_tracker.cleanup()
            agent_span_context.__exit__(None, None, None)
            # Clean up thread-local storage
            if hasattr(_thread_local, "parent_context"):
                delattr(_thread_local, "parent_context")

    return wrapped_query
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def _wrap_tool_factory(tool_fn: Any, tracer: "BaseTracer") -> Callable[..., Any]:
|
|
322
|
+
"""Wraps the tool() factory function to return wrapped tools."""
|
|
323
|
+
|
|
324
|
+
def wrapped_tool(*args: Any, **kwargs: Any) -> Any:
|
|
325
|
+
result = tool_fn(*args, **kwargs)
|
|
326
|
+
|
|
327
|
+
# The tool() function returns a decorator, not a tool definition
|
|
328
|
+
# We need to wrap the decorator to intercept the final tool definition
|
|
329
|
+
if not callable(result):
|
|
330
|
+
return result
|
|
331
|
+
|
|
332
|
+
def wrapped_decorator(handler_fn: Any) -> Any:
|
|
333
|
+
tool_def = result(handler_fn)
|
|
334
|
+
|
|
335
|
+
# Now we have the actual tool definition, wrap its handler
|
|
336
|
+
if tool_def and hasattr(tool_def, "handler"):
|
|
337
|
+
tool_name = getattr(tool_def, "name", "unknown")
|
|
338
|
+
original_handler = tool_def.handler
|
|
339
|
+
tool_def.handler = _wrap_tool_handler(
|
|
340
|
+
tracer, original_handler, tool_name
|
|
341
|
+
)
|
|
342
|
+
|
|
343
|
+
return tool_def
|
|
344
|
+
|
|
345
|
+
return wrapped_decorator
|
|
346
|
+
|
|
347
|
+
return wrapped_tool
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
def _wrap_tool_handler(
|
|
351
|
+
tracer: "BaseTracer", handler: Any, tool_name: Any
|
|
352
|
+
) -> Callable[..., Any]:
|
|
353
|
+
"""Wraps a tool handler to add tracing.
|
|
354
|
+
|
|
355
|
+
Claude Agent SDK breaks OpenTelemetry's automatic context propagation,
|
|
356
|
+
so we retrieve the parent context from thread-local storage and use it
|
|
357
|
+
explicitly when creating tool spans to ensure proper nesting.
|
|
358
|
+
"""
|
|
359
|
+
# Check if already wrapped to prevent double-wrapping
|
|
360
|
+
if hasattr(handler, "_judgeval_wrapped"):
|
|
361
|
+
return handler
|
|
362
|
+
|
|
363
|
+
async def wrapped_handler(args: Any) -> Any:
|
|
364
|
+
# Get parent context from thread-local storage
|
|
365
|
+
# Claude Agent SDK breaks context propagation, so we stored it explicitly
|
|
366
|
+
parent_context = getattr(_thread_local, "parent_context", None)
|
|
367
|
+
|
|
368
|
+
# Use the parent context if available, otherwise use current context
|
|
369
|
+
ctx = parent_context if parent_context is not None else None
|
|
370
|
+
|
|
371
|
+
# Create tool span with explicit parent context to ensure proper nesting
|
|
372
|
+
tracer_obj = tracer.get_tracer()
|
|
373
|
+
span = tracer_obj.start_span(
|
|
374
|
+
str(tool_name),
|
|
375
|
+
context=ctx,
|
|
376
|
+
attributes={
|
|
377
|
+
AttributeKeys.JUDGMENT_SPAN_KIND: "tool",
|
|
378
|
+
},
|
|
379
|
+
)
|
|
380
|
+
|
|
381
|
+
try:
|
|
382
|
+
# Set this span as active in the context
|
|
383
|
+
with trace.use_span(span, end_on_exit=True):
|
|
384
|
+
# Record input
|
|
385
|
+
set_span_attribute(
|
|
386
|
+
span, AttributeKeys.JUDGMENT_INPUT, safe_serialize(args)
|
|
387
|
+
)
|
|
388
|
+
|
|
389
|
+
try:
|
|
390
|
+
result = await handler(args)
|
|
391
|
+
|
|
392
|
+
# Record output
|
|
393
|
+
set_span_attribute(
|
|
394
|
+
span, AttributeKeys.JUDGMENT_OUTPUT, safe_serialize(result)
|
|
395
|
+
)
|
|
396
|
+
|
|
397
|
+
return result
|
|
398
|
+
except Exception as e:
|
|
399
|
+
span.record_exception(e)
|
|
400
|
+
raise
|
|
401
|
+
except Exception:
|
|
402
|
+
# If something goes wrong with span setup, end it manually
|
|
403
|
+
span.end()
|
|
404
|
+
raise
|
|
405
|
+
|
|
406
|
+
# Mark as wrapped to prevent double-wrapping
|
|
407
|
+
wrapped_handler._judgeval_wrapped = True # type: ignore
|
|
408
|
+
return wrapped_handler
|
|
409
|
+
|
|
410
|
+
|
|
411
|
+
def _create_llm_span_for_messages(
    tracer: "BaseTracer",
    messages: List[Any],  # List of AssistantMessage objects
    prompt: Any,
    conversation_history: List[Dict[str, Any]],
    start_time: Optional[float] = None,
) -> Tuple[Optional[Dict[str, Any]], Optional[Any], Optional[Any]]:
    """Creates an LLM span for a group of AssistantMessage objects.

    Args:
        tracer: Tracer used to create the span.
        messages: Messages for this turn; the last one must actually be an
            AssistantMessage, otherwise no span is created.
        prompt: Original user prompt (or None).
        conversation_history: Prior turn contents used to build the LLM input.
        start_time: Wall-clock start of this LLM call in seconds, as produced
            by time.time() (see LLMSpanTracker).

    Returns a tuple of (final_content, span, span_context):
    - final_content: The final message content to add to conversation history
    - span: The LLM span object (for logging metrics later)
    - span_context: The span context manager (the caller must __exit__ it)
    """
    if not messages:
        return None, None, None

    last_message = messages[-1]
    if type(last_message).__name__ != "AssistantMessage":
        return None, None, None

    model = getattr(last_message, "model", None)
    # _build_llm_input is defined elsewhere in this module.
    input_messages = _build_llm_input(prompt, conversation_history)

    outputs: List[Dict[str, Any]] = []
    for msg in messages:
        if hasattr(msg, "content"):
            content = _serialize_content_blocks(msg.content)
            outputs.append({"content": content, "role": "assistant"})

    # Create LLM span.
    # Fix: `start_time` was previously accepted but never forwarded, so every
    # span started at "now" and the sequential timing promised by
    # LLMSpanTracker was lost. OpenTelemetry expects nanoseconds, while
    # callers pass time.time() seconds, so convert here.
    otel_start_time = int(start_time * 1e9) if start_time is not None else None
    llm_span_context = tracer.get_tracer().start_as_current_span(
        "anthropic.messages.create",
        start_time=otel_start_time,
        attributes={
            AttributeKeys.JUDGMENT_SPAN_KIND: "llm",
        },
    )
    llm_span = llm_span_context.__enter__()

    # Record attributes
    if model:
        set_span_attribute(llm_span, AttributeKeys.JUDGMENT_LLM_MODEL_NAME, model)
    # Set provider to anthropic for cost calculation
    set_span_attribute(llm_span, AttributeKeys.JUDGMENT_LLM_PROVIDER, "anthropic")

    if input_messages:
        set_span_attribute(
            llm_span, AttributeKeys.JUDGMENT_INPUT, safe_serialize(input_messages)
        )

    if outputs:
        set_span_attribute(
            llm_span, AttributeKeys.JUDGMENT_OUTPUT, safe_serialize(outputs)
        )

    # Return final message content for conversation history and the span
    if hasattr(last_message, "content"):
        content = _serialize_content_blocks(last_message.content)
        return {"content": content, "role": "assistant"}, llm_span, llm_span_context

    return None, llm_span, llm_span_context
|
|
472
|
+
|
|
473
|
+
|
|
474
|
+
def _serialize_content_blocks(content: Any) -> Any:
|
|
475
|
+
"""Converts content blocks to a serializable format with proper type fields."""
|
|
476
|
+
if isinstance(content, list):
|
|
477
|
+
result = []
|
|
478
|
+
for block in content:
|
|
479
|
+
if dataclasses.is_dataclass(block) and not isinstance(block, type):
|
|
480
|
+
serialized = dataclasses.asdict(block) # type: ignore
|
|
481
|
+
|
|
482
|
+
block_type = type(block).__name__
|
|
483
|
+
if block_type == "TextBlock":
|
|
484
|
+
serialized["type"] = "text"
|
|
485
|
+
elif block_type == "ToolUseBlock":
|
|
486
|
+
serialized["type"] = "tool_use"
|
|
487
|
+
elif block_type == "ToolResultBlock":
|
|
488
|
+
serialized["type"] = "tool_result"
|
|
489
|
+
|
|
490
|
+
# Simplify content if it's a single text block
|
|
491
|
+
content_value = serialized.get("content")
|
|
492
|
+
if isinstance(content_value, list) and len(content_value) == 1:
|
|
493
|
+
item = content_value[0]
|
|
494
|
+
if (
|
|
495
|
+
isinstance(item, dict)
|
|
496
|
+
and item.get("type") == "text"
|
|
497
|
+
and "text" in item
|
|
498
|
+
):
|
|
499
|
+
serialized["content"] = item["text"]
|
|
500
|
+
|
|
501
|
+
# Remove None is_error
|
|
502
|
+
if "is_error" in serialized and serialized["is_error"] is None:
|
|
503
|
+
del serialized["is_error"]
|
|
504
|
+
else:
|
|
505
|
+
serialized = block
|
|
506
|
+
|
|
507
|
+
result.append(serialized)
|
|
508
|
+
return result
|
|
509
|
+
return content
|
|
510
|
+
|
|
511
|
+
|
|
512
|
+
def _extract_usage_from_result_message(result_message: Any) -> Dict[str, Any]:
    """Extracts and normalizes usage metrics from a ResultMessage."""
    usage = getattr(result_message, "usage", None)
    if not usage:
        # No usage attribute, or an empty/None payload: nothing to report.
        return {}

    # Usage may arrive as a plain dict or as an object with attributes;
    # normalize lookup behind a single callable.
    if isinstance(usage, dict):
        read = usage.get
    else:
        def read(key: str) -> Any:
            return getattr(usage, key, None)

    # (source key, destination attribute) pairs for the token counters.
    token_fields = [
        ("input_tokens", AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS),
        ("output_tokens", AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS),
        (
            "cache_creation_input_tokens",
            AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
        ),
        (
            "cache_read_input_tokens",
            AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
        ),
    ]

    metrics: Dict[str, Any] = {}
    for key, attr in token_fields:
        value = read(key)
        if value is not None:
            metrics[attr] = value

    # Always attach the raw usage payload for downstream inspection.
    metrics[AttributeKeys.JUDGMENT_USAGE_METADATA] = safe_serialize(usage)

    return metrics
|
|
552
|
+
|
|
553
|
+
|
|
554
|
+
def _build_llm_input(
|
|
555
|
+
prompt: Any, conversation_history: List[Dict[str, Any]]
|
|
556
|
+
) -> Optional[List[Dict[str, Any]]]:
|
|
557
|
+
"""Builds the input array for an LLM span from the initial prompt and conversation history."""
|
|
558
|
+
if isinstance(prompt, str):
|
|
559
|
+
if len(conversation_history) == 0:
|
|
560
|
+
return [{"content": prompt, "role": "user"}]
|
|
561
|
+
else:
|
|
562
|
+
return [{"content": prompt, "role": "user"}] + conversation_history
|
|
563
|
+
|
|
564
|
+
return conversation_history if conversation_history else None
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from abc import ABC
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class Langgraph(ABC):
    """Namespace for enabling LangSmith OpenTelemetry export."""

    @staticmethod
    def initialize(otel_only: bool = True):
        """Turn on LangSmith tracing with OTEL export via environment variables.

        Args:
            otel_only: When True (default), also set LANGSMITH_OTEL_ONLY so
                spans are emitted over OTEL exclusively.
        """
        settings = {
            "LANGSMITH_OTEL_ENABLED": "true",
            "LANGSMITH_TRACING": "true",
        }
        if otel_only:
            settings["LANGSMITH_OTEL_ONLY"] = "true"
        os.environ.update(settings)
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from abc import ABC
|
|
2
|
+
from judgeval.v1.tracer import Tracer
|
|
3
|
+
from judgeval.logger import judgeval_logger
|
|
4
|
+
from judgeval.utils.url import url_for
|
|
5
|
+
from judgeval.v1.utils import resolve_project_id
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# openlit is an optional dependency of this integration; fail fast at import
# time with an actionable message when it is absent. Chain the original
# ImportError (PEP 3134) so the real failure (e.g. a broken transitive
# dependency rather than a missing package) is not masked.
try:
    import openlit  # type: ignore
except ImportError as e:
    raise ImportError(
        "Openlit is not installed and required for the openlit integration. Please install it with `pip install openlit`."
    ) from e
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Openlit(ABC):
    """Namespace for wiring openlit instrumentation into the Judgment backend."""

    @staticmethod
    def initialize(
        tracer: Tracer,
        **kwargs,
    ):
        """Initialize openlit so its spans flow through the given Judgment tracer.

        Resolves the tracer's project name to a project id and aborts with a
        warning when the project does not exist yet. Any extra keyword
        arguments are forwarded to ``openlit.init``.
        """
        client = tracer.api_client
        project_name = tracer.project_name

        project_id = resolve_project_id(client, project_name)
        if not project_id:
            # Without a project id the OTLP headers cannot be built; bail out.
            judgeval_logger.warning(
                f"Project {project_name} not found. Please create it first at https://app.judgmentlabs.ai/org/{client.organization_id}/projects."
            )
            return

        otlp_headers = {
            "Authorization": f"Bearer {client.api_key}",
            "X-Organization-Id": client.organization_id,
            "X-Project-Id": project_id,
        }
        openlit.init(
            service_name=project_name,
            otlp_endpoint=url_for("/otel"),
            otlp_headers=otlp_headers,
            tracer=tracer.get_tracer(),
            disable_metrics=True,
            **kwargs,
        )
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
__all__ = ["Openlit"]
|