judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (234) hide show
  1. judgeval/__init__.py +173 -10
  2. judgeval/api/__init__.py +523 -0
  3. judgeval/api/api_types.py +413 -0
  4. judgeval/cli.py +112 -0
  5. judgeval/constants.py +7 -30
  6. judgeval/data/__init__.py +1 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +14 -40
  9. judgeval/data/judgment_types.py +396 -146
  10. judgeval/data/result.py +11 -18
  11. judgeval/data/scorer_data.py +3 -26
  12. judgeval/data/scripts/openapi_transform.py +5 -5
  13. judgeval/data/trace.py +115 -194
  14. judgeval/dataset/__init__.py +335 -0
  15. judgeval/env.py +55 -0
  16. judgeval/evaluation/__init__.py +346 -0
  17. judgeval/exceptions.py +28 -0
  18. judgeval/integrations/langgraph/__init__.py +13 -0
  19. judgeval/integrations/openlit/__init__.py +51 -0
  20. judgeval/judges/__init__.py +2 -2
  21. judgeval/judges/litellm_judge.py +77 -16
  22. judgeval/judges/together_judge.py +88 -17
  23. judgeval/judges/utils.py +7 -20
  24. judgeval/judgment_attribute_keys.py +55 -0
  25. judgeval/{common/logger.py → logger.py} +24 -8
  26. judgeval/prompt/__init__.py +330 -0
  27. judgeval/scorers/__init__.py +11 -11
  28. judgeval/scorers/agent_scorer.py +15 -19
  29. judgeval/scorers/api_scorer.py +21 -23
  30. judgeval/scorers/base_scorer.py +54 -36
  31. judgeval/scorers/example_scorer.py +1 -3
  32. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
  33. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
  34. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  35. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
  36. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
  37. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
  38. judgeval/scorers/score.py +64 -47
  39. judgeval/scorers/utils.py +2 -107
  40. judgeval/tracer/__init__.py +1111 -2
  41. judgeval/tracer/constants.py +1 -0
  42. judgeval/tracer/exporters/__init__.py +40 -0
  43. judgeval/tracer/exporters/s3.py +119 -0
  44. judgeval/tracer/exporters/store.py +59 -0
  45. judgeval/tracer/exporters/utils.py +32 -0
  46. judgeval/tracer/keys.py +63 -0
  47. judgeval/tracer/llm/__init__.py +7 -0
  48. judgeval/tracer/llm/config.py +78 -0
  49. judgeval/tracer/llm/constants.py +9 -0
  50. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  51. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  52. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  53. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  54. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  55. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  56. judgeval/tracer/llm/llm_google/config.py +6 -0
  57. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  58. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  59. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  60. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  61. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  62. judgeval/tracer/llm/llm_openai/config.py +6 -0
  63. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  64. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  65. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  66. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  67. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  68. judgeval/tracer/llm/llm_together/config.py +6 -0
  69. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  70. judgeval/tracer/llm/providers.py +19 -0
  71. judgeval/tracer/managers.py +167 -0
  72. judgeval/tracer/processors/__init__.py +220 -0
  73. judgeval/tracer/utils.py +19 -0
  74. judgeval/trainer/__init__.py +14 -0
  75. judgeval/trainer/base_trainer.py +122 -0
  76. judgeval/trainer/config.py +123 -0
  77. judgeval/trainer/console.py +144 -0
  78. judgeval/trainer/fireworks_trainer.py +392 -0
  79. judgeval/trainer/trainable_model.py +252 -0
  80. judgeval/trainer/trainer.py +70 -0
  81. judgeval/utils/async_utils.py +39 -0
  82. judgeval/utils/decorators/__init__.py +0 -0
  83. judgeval/utils/decorators/dont_throw.py +37 -0
  84. judgeval/utils/decorators/use_once.py +13 -0
  85. judgeval/utils/file_utils.py +74 -28
  86. judgeval/utils/guards.py +36 -0
  87. judgeval/utils/meta.py +27 -0
  88. judgeval/utils/project.py +15 -0
  89. judgeval/utils/serialize.py +253 -0
  90. judgeval/utils/testing.py +70 -0
  91. judgeval/utils/url.py +10 -0
  92. judgeval/{version_check.py → utils/version_check.py} +5 -3
  93. judgeval/utils/wrappers/README.md +3 -0
  94. judgeval/utils/wrappers/__init__.py +15 -0
  95. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  96. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  97. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  98. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  99. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  100. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  101. judgeval/utils/wrappers/py.typed +0 -0
  102. judgeval/utils/wrappers/utils.py +35 -0
  103. judgeval/v1/__init__.py +88 -0
  104. judgeval/v1/data/__init__.py +7 -0
  105. judgeval/v1/data/example.py +44 -0
  106. judgeval/v1/data/scorer_data.py +42 -0
  107. judgeval/v1/data/scoring_result.py +44 -0
  108. judgeval/v1/datasets/__init__.py +6 -0
  109. judgeval/v1/datasets/dataset.py +214 -0
  110. judgeval/v1/datasets/dataset_factory.py +94 -0
  111. judgeval/v1/evaluation/__init__.py +6 -0
  112. judgeval/v1/evaluation/evaluation.py +182 -0
  113. judgeval/v1/evaluation/evaluation_factory.py +17 -0
  114. judgeval/v1/instrumentation/__init__.py +6 -0
  115. judgeval/v1/instrumentation/llm/__init__.py +7 -0
  116. judgeval/v1/instrumentation/llm/config.py +78 -0
  117. judgeval/v1/instrumentation/llm/constants.py +11 -0
  118. judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
  119. judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
  120. judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
  121. judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
  122. judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
  123. judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
  124. judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
  125. judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
  126. judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
  127. judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
  128. judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
  129. judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
  130. judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
  131. judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
  132. judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
  133. judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
  134. judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
  135. judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
  136. judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
  137. judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
  138. judgeval/v1/instrumentation/llm/providers.py +19 -0
  139. judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
  140. judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
  141. judgeval/v1/integrations/langgraph/__init__.py +13 -0
  142. judgeval/v1/integrations/openlit/__init__.py +47 -0
  143. judgeval/v1/internal/api/__init__.py +525 -0
  144. judgeval/v1/internal/api/api_types.py +413 -0
  145. judgeval/v1/prompts/__init__.py +6 -0
  146. judgeval/v1/prompts/prompt.py +29 -0
  147. judgeval/v1/prompts/prompt_factory.py +189 -0
  148. judgeval/v1/py.typed +0 -0
  149. judgeval/v1/scorers/__init__.py +6 -0
  150. judgeval/v1/scorers/api_scorer.py +82 -0
  151. judgeval/v1/scorers/base_scorer.py +17 -0
  152. judgeval/v1/scorers/built_in/__init__.py +17 -0
  153. judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
  154. judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
  155. judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
  156. judgeval/v1/scorers/built_in/faithfulness.py +28 -0
  157. judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
  158. judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
  159. judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
  160. judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
  161. judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
  162. judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
  163. judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
  164. judgeval/v1/scorers/scorers_factory.py +49 -0
  165. judgeval/v1/tracer/__init__.py +7 -0
  166. judgeval/v1/tracer/base_tracer.py +520 -0
  167. judgeval/v1/tracer/exporters/__init__.py +14 -0
  168. judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
  169. judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
  170. judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
  171. judgeval/v1/tracer/exporters/span_store.py +50 -0
  172. judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
  173. judgeval/v1/tracer/processors/__init__.py +6 -0
  174. judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
  175. judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
  176. judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
  177. judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
  178. judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
  179. judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
  180. judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
  181. judgeval/v1/tracer/tracer.py +67 -0
  182. judgeval/v1/tracer/tracer_factory.py +38 -0
  183. judgeval/v1/trainers/__init__.py +5 -0
  184. judgeval/v1/trainers/base_trainer.py +62 -0
  185. judgeval/v1/trainers/config.py +123 -0
  186. judgeval/v1/trainers/console.py +144 -0
  187. judgeval/v1/trainers/fireworks_trainer.py +392 -0
  188. judgeval/v1/trainers/trainable_model.py +252 -0
  189. judgeval/v1/trainers/trainers_factory.py +37 -0
  190. judgeval/v1/utils.py +18 -0
  191. judgeval/version.py +5 -0
  192. judgeval/warnings.py +4 -0
  193. judgeval-0.23.0.dist-info/METADATA +266 -0
  194. judgeval-0.23.0.dist-info/RECORD +201 -0
  195. judgeval-0.23.0.dist-info/entry_points.txt +2 -0
  196. judgeval/clients.py +0 -34
  197. judgeval/common/__init__.py +0 -13
  198. judgeval/common/api/__init__.py +0 -3
  199. judgeval/common/api/api.py +0 -352
  200. judgeval/common/api/constants.py +0 -165
  201. judgeval/common/exceptions.py +0 -27
  202. judgeval/common/storage/__init__.py +0 -6
  203. judgeval/common/storage/s3_storage.py +0 -98
  204. judgeval/common/tracer/__init__.py +0 -31
  205. judgeval/common/tracer/constants.py +0 -22
  206. judgeval/common/tracer/core.py +0 -1916
  207. judgeval/common/tracer/otel_exporter.py +0 -108
  208. judgeval/common/tracer/otel_span_processor.py +0 -234
  209. judgeval/common/tracer/span_processor.py +0 -37
  210. judgeval/common/tracer/span_transformer.py +0 -211
  211. judgeval/common/tracer/trace_manager.py +0 -92
  212. judgeval/common/utils.py +0 -940
  213. judgeval/data/datasets/__init__.py +0 -4
  214. judgeval/data/datasets/dataset.py +0 -341
  215. judgeval/data/datasets/eval_dataset_client.py +0 -214
  216. judgeval/data/tool.py +0 -5
  217. judgeval/data/trace_run.py +0 -37
  218. judgeval/evaluation_run.py +0 -75
  219. judgeval/integrations/langgraph.py +0 -843
  220. judgeval/judges/mixture_of_judges.py +0 -286
  221. judgeval/judgment_client.py +0 -369
  222. judgeval/rules.py +0 -521
  223. judgeval/run_evaluation.py +0 -684
  224. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
  225. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
  226. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
  227. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
  228. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
  229. judgeval/utils/alerts.py +0 -93
  230. judgeval/utils/requests.py +0 -50
  231. judgeval-0.1.0.dist-info/METADATA +0 -202
  232. judgeval-0.1.0.dist-info/RECORD +0 -73
  233. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
  234. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -0,0 +1,564 @@
1
+ """Wrapper implementation for Claude Agent SDK."""
2
+
3
+ from __future__ import annotations
4
+ import dataclasses
5
+ import threading
6
+ import time
7
+ from typing import (
8
+ TYPE_CHECKING,
9
+ Any,
10
+ AsyncGenerator,
11
+ Callable,
12
+ Dict,
13
+ List,
14
+ Optional,
15
+ Tuple,
16
+ )
17
+
18
+ from opentelemetry import trace, context as otel_context
19
+ from opentelemetry.trace import set_span_in_context
20
+
21
+ from judgeval.tracer.keys import AttributeKeys
22
+ from judgeval.tracer.utils import set_span_attribute
23
+ from judgeval.utils.serialize import safe_serialize
24
+
25
+ if TYPE_CHECKING:
26
+ from judgeval.v1.tracer.tracer import BaseTracer
27
+
28
+ # Thread-local storage to propagate parent span context to tool handlers
29
+ # Claude Agent SDK breaks OpenTelemetry's automatic context propagation
30
+ # when executing tools, so we need to explicitly store and pass the context
31
+ _thread_local = threading.local()
32
+
33
+
34
class LLMSpanTracker:
    """Manages LLM span lifecycle for Claude Agent SDK message streams.

    Message flow per turn:
      1. UserMessage (tool results) -> mark the time at which the next LLM
         call will begin.
      2. AssistantMessage -> open a new span using that marked start time,
         closing the previous span first so LLM spans never overlap.
      3. ResultMessage -> attach usage metrics to the currently open span.
    """

    def __init__(self, tracer: "BaseTracer", query_start_time: Optional[float] = None):
        self.tracer = tracer
        self.current_span: Optional[Any] = None
        self.current_span_context: Optional[Any] = None
        # Seeded with the query's start time so the first span starts there.
        self.next_start_time: Optional[float] = query_start_time

    def start_llm_span(
        self, message: Any, prompt: Any, conversation_history: List[Dict[str, Any]]
    ) -> Optional[Dict[str, Any]]:
        """Start a new LLM span, ending the previous one if it exists."""
        # Use the marked start time; fall back to "now" if nothing was marked.
        begin = self.next_start_time
        if begin is None:
            begin = time.time()

        # Close out the previous span via its context manager only —
        # __exit__ calls end() internally, so no explicit end() is needed.
        previous_context = self.current_span_context
        if previous_context:
            previous_context.__exit__(None, None, None)

        final_content, new_span, new_context = _create_llm_span_for_messages(
            self.tracer,
            [message],
            prompt,
            conversation_history,
            start_time=begin,
        )
        self.current_span = new_span
        self.current_span_context = new_context
        # Consumed; the next span will mark (or compute) its own start time.
        self.next_start_time = None
        return final_content

    def mark_next_llm_start(self) -> None:
        """Mark when the next LLM call will start (after tool results)."""
        self.next_start_time = time.time()

    def log_usage(self, usage_metrics: Dict[str, Any]) -> None:
        """Log usage metrics to the current LLM span."""
        if not (self.current_span and usage_metrics):
            return
        for attr_key, attr_value in usage_metrics.items():
            set_span_attribute(self.current_span, attr_key, attr_value)

    def cleanup(self) -> None:
        """End any unclosed spans."""
        if self.current_span_context:
            self.current_span_context.__exit__(None, None, None)
        self.current_span = None
        self.current_span_context = None
93
+
94
+
95
def _create_client_wrapper_class(
    original_client_class: Any, tracer: "BaseTracer"
) -> Any:
    """Creates a wrapper class for ClaudeSDKClient that wraps query and receive_response.

    The returned subclass records the prompt and query start time in
    ``query()`` and builds the span hierarchy (agent span -> per-turn LLM
    spans) while streaming messages through ``receive_response()``.
    """

    class WrappedClaudeSDKClient(original_client_class):  # type: ignore
        def __init__(self, *args: Any, **kwargs: Any):
            super().__init__(*args, **kwargs)
            # Double-underscore names are mangled to
            # _WrappedClaudeSDKClient__* — private to this wrapper, so they
            # cannot collide with attributes of the SDK client.
            self.__last_prompt: Optional[str] = None
            self.__query_start_time: Optional[float] = None

        async def query(self, *args: Any, **kwargs: Any) -> Any:
            """Wrap query to capture the prompt and start time for tracing."""
            # Capture the time when query is called (when LLM call starts)
            self.__query_start_time = time.time()

            # Capture the prompt for use in receive_response
            if args:
                self.__last_prompt = str(args[0])
            elif "prompt" in kwargs:
                self.__last_prompt = str(kwargs["prompt"])

            return await super().query(*args, **kwargs)

        async def receive_response(self) -> AsyncGenerator[Any, None]:
            """Wrap receive_response to add tracing with proper span hierarchy.

            Messages are dispatched by class name (AssistantMessage /
            UserMessage / ResultMessage) rather than isinstance, so the SDK
            types never need to be imported here.
            """
            generator = super().receive_response()

            # Create TASK span for the entire agent conversation.
            # Entered manually (not via `with`) because it must span the
            # whole async generator; closed in the `finally` below.
            agent_span_context = tracer.get_tracer().start_as_current_span(
                "Claude_Agent",
                attributes={
                    AttributeKeys.JUDGMENT_SPAN_KIND: "agent",
                },
            )
            agent_span = agent_span_context.__enter__()

            # Record input
            if self.__last_prompt:
                set_span_attribute(
                    agent_span,
                    AttributeKeys.JUDGMENT_INPUT,
                    safe_serialize(self.__last_prompt),
                )

            # Store the parent span context in thread-local storage.
            # Claude Agent SDK breaks OpenTelemetry's context propagation when
            # executing tools, so we need to explicitly store the context for
            # tool handlers (see _wrap_tool_handler) to access.
            parent_context = set_span_in_context(agent_span, otel_context.get_current())
            _thread_local.parent_context = parent_context

            final_results: List[Dict[str, Any]] = []
            llm_tracker = LLMSpanTracker(
                tracer, query_start_time=self.__query_start_time
            )

            try:
                async for message in generator:
                    message_type = type(message).__name__

                    if message_type == "AssistantMessage":
                        # New LLM turn: open an LLM span (closing the previous
                        # one) and record the assistant content in history.
                        final_content = llm_tracker.start_llm_span(
                            message, self.__last_prompt, final_results
                        )
                        if final_content:
                            final_results.append(final_content)

                    elif message_type == "UserMessage":
                        # Tool results coming back; record them and mark when
                        # the next LLM call will start.
                        if hasattr(message, "content"):
                            content = _serialize_content_blocks(message.content)
                            final_results.append({"content": content, "role": "user"})

                        llm_tracker.mark_next_llm_start()

                    elif message_type == "ResultMessage":
                        # Terminal message: usage metrics + run metadata.
                        if hasattr(message, "usage"):
                            usage_metrics = _extract_usage_from_result_message(message)
                            llm_tracker.log_usage(usage_metrics)

                        result_metadata = {
                            k: v
                            for k, v in {
                                "num_turns": getattr(message, "num_turns", None),
                                "session_id": getattr(message, "session_id", None),
                            }.items()
                            if v is not None
                        }
                        if result_metadata:
                            for key, value in result_metadata.items():
                                set_span_attribute(agent_span, f"agent.{key}", value)

                    yield message

                # Record output (the last conversation entry) on the agent span
                if final_results:
                    set_span_attribute(
                        agent_span,
                        AttributeKeys.JUDGMENT_OUTPUT,
                        safe_serialize(final_results[-1] if final_results else None),
                    )

            except Exception as e:
                agent_span.record_exception(e)
                raise
            finally:
                # Always close any open LLM span and the agent span, even if
                # the consumer abandons the generator early.
                llm_tracker.cleanup()
                agent_span_context.__exit__(None, None, None)
                # Clean up thread-local storage
                if hasattr(_thread_local, "parent_context"):
                    delattr(_thread_local, "parent_context")

    return WrappedClaudeSDKClient
207
+
208
+
209
+ def _create_tool_wrapper_class(original_tool_class: Any, tracer: "BaseTracer") -> Any:
210
+ """Creates a wrapper class for SdkMcpTool that wraps handlers."""
211
+
212
+ class WrappedSdkMcpTool(original_tool_class): # type: ignore
213
+ def __init__(
214
+ self,
215
+ name: Any,
216
+ description: Any,
217
+ input_schema: Any,
218
+ handler: Any,
219
+ **kwargs: Any,
220
+ ):
221
+ wrapped_handler = _wrap_tool_handler(tracer, handler, name)
222
+ super().__init__(name, description, input_schema, wrapped_handler, **kwargs)
223
+
224
+ # Preserve generic typing support
225
+ __class_getitem__ = classmethod(lambda cls, params: cls) # type: ignore
226
+
227
+ return WrappedSdkMcpTool
228
+
229
+
230
def _wrap_query_function(
    original_query_fn: Any, tracer: "BaseTracer"
) -> Callable[..., Any]:
    """Wraps the standalone query() function to add tracing.

    Mirrors WrappedClaudeSDKClient.receive_response: one agent span for the
    whole call, one LLM span per assistant turn, tool spans parented via
    thread-local context.
    """

    async def wrapped_query(*args: Any, **kwargs: Any) -> Any:
        """Wrapped query function with automatic tracing."""
        # Create agent span for the query. Entered manually (not via `with`)
        # because it must live across the async generator; closed in `finally`.
        agent_span_context = tracer.get_tracer().start_as_current_span(
            "Claude_Agent_Query",
            attributes={
                AttributeKeys.JUDGMENT_SPAN_KIND: "agent",
            },
        )
        agent_span = agent_span_context.__enter__()

        # Capture prompt if available (keyword wins over first positional)
        prompt = kwargs.get("prompt") or (args[0] if args else None)
        if prompt and isinstance(prompt, str):
            set_span_attribute(
                agent_span, AttributeKeys.JUDGMENT_INPUT, safe_serialize(prompt)
            )

        # Store parent context for tool tracing — the SDK breaks OTel's
        # automatic propagation, so tool handlers read this thread-local.
        parent_context = set_span_in_context(agent_span, otel_context.get_current())
        _thread_local.parent_context = parent_context

        final_results: List[Dict[str, Any]] = []
        llm_tracker = LLMSpanTracker(tracer, query_start_time=time.time())

        try:
            # Call original query function; dispatch messages by class name
            # so the SDK message types never need to be imported here.
            async for message in original_query_fn(*args, **kwargs):
                message_type = type(message).__name__

                if message_type == "AssistantMessage":
                    # New LLM turn: open an LLM span (closing the previous one)
                    final_content = llm_tracker.start_llm_span(
                        message,
                        prompt if isinstance(prompt, str) else None,
                        final_results,
                    )
                    if final_content:
                        final_results.append(final_content)

                elif message_type == "UserMessage":
                    # Tool results; record them and mark when the next LLM
                    # call will start.
                    if hasattr(message, "content"):
                        content = _serialize_content_blocks(message.content)
                        final_results.append({"content": content, "role": "user"})

                    llm_tracker.mark_next_llm_start()

                elif message_type == "ResultMessage":
                    # Terminal message: usage metrics + run metadata.
                    if hasattr(message, "usage"):
                        usage_metrics = _extract_usage_from_result_message(message)
                        llm_tracker.log_usage(usage_metrics)

                    result_metadata = {
                        k: v
                        for k, v in {
                            "num_turns": getattr(message, "num_turns", None),
                            "session_id": getattr(message, "session_id", None),
                        }.items()
                        if v is not None
                    }
                    if result_metadata:
                        for key, value in result_metadata.items():
                            set_span_attribute(agent_span, f"agent.{key}", value)

                yield message

            # Record output (the last conversation entry) on the agent span
            if final_results:
                set_span_attribute(
                    agent_span,
                    AttributeKeys.JUDGMENT_OUTPUT,
                    safe_serialize(final_results[-1] if final_results else None),
                )

        except Exception as e:
            agent_span.record_exception(e)
            raise
        finally:
            # Always close any open LLM span and the agent span, even if the
            # consumer abandons the generator early.
            llm_tracker.cleanup()
            agent_span_context.__exit__(None, None, None)
            # Clean up thread-local storage
            if hasattr(_thread_local, "parent_context"):
                delattr(_thread_local, "parent_context")

    return wrapped_query
319
+
320
+
321
+ def _wrap_tool_factory(tool_fn: Any, tracer: "BaseTracer") -> Callable[..., Any]:
322
+ """Wraps the tool() factory function to return wrapped tools."""
323
+
324
+ def wrapped_tool(*args: Any, **kwargs: Any) -> Any:
325
+ result = tool_fn(*args, **kwargs)
326
+
327
+ # The tool() function returns a decorator, not a tool definition
328
+ # We need to wrap the decorator to intercept the final tool definition
329
+ if not callable(result):
330
+ return result
331
+
332
+ def wrapped_decorator(handler_fn: Any) -> Any:
333
+ tool_def = result(handler_fn)
334
+
335
+ # Now we have the actual tool definition, wrap its handler
336
+ if tool_def and hasattr(tool_def, "handler"):
337
+ tool_name = getattr(tool_def, "name", "unknown")
338
+ original_handler = tool_def.handler
339
+ tool_def.handler = _wrap_tool_handler(
340
+ tracer, original_handler, tool_name
341
+ )
342
+
343
+ return tool_def
344
+
345
+ return wrapped_decorator
346
+
347
+ return wrapped_tool
348
+
349
+
350
def _wrap_tool_handler(
    tracer: "BaseTracer", handler: Any, tool_name: Any
) -> Callable[..., Any]:
    """Wraps a tool handler to add tracing.

    Claude Agent SDK breaks OpenTelemetry's automatic context propagation,
    so we retrieve the parent context from thread-local storage and use it
    explicitly when creating tool spans to ensure proper nesting.

    Args:
        tracer: Tracer whose underlying OTel tracer creates the tool span.
        handler: Async tool handler, invoked as ``await handler(args)``.
        tool_name: Span name; coerced with ``str()``.

    Returns:
        The wrapped async handler, or ``handler`` unchanged if it was
        already wrapped.
    """
    # Check if already wrapped to prevent double-wrapping
    if hasattr(handler, "_judgeval_wrapped"):
        return handler

    async def wrapped_handler(args: Any) -> Any:
        # Parent context stored by the client/query wrappers; None makes
        # start_span fall back to the current OTel context.
        parent_context = getattr(_thread_local, "parent_context", None)

        # Create tool span with explicit parent context to ensure proper nesting
        span = tracer.get_tracer().start_span(
            str(tool_name),
            context=parent_context,
            attributes={
                AttributeKeys.JUDGMENT_SPAN_KIND: "tool",
            },
        )

        try:
            # use_span(end_on_exit=True) makes the span current and guarantees
            # it is ended exactly once when this block exits — on success OR
            # on exception. (The previous version also called span.end() in
            # the outer except, double-ending the span on handler errors.)
            with trace.use_span(span, end_on_exit=True):
                # Record input
                set_span_attribute(
                    span, AttributeKeys.JUDGMENT_INPUT, safe_serialize(args)
                )

                try:
                    result = await handler(args)
                except Exception as e:
                    span.record_exception(e)
                    raise

                # Record output
                set_span_attribute(
                    span, AttributeKeys.JUDGMENT_OUTPUT, safe_serialize(result)
                )
                return result
        except Exception:
            # Only end manually if span setup failed before use_span could
            # close the span; a span that was already ended is no longer
            # recording, so this never double-ends.
            if span.is_recording():
                span.end()
            raise

    # Mark as wrapped to prevent double-wrapping
    wrapped_handler._judgeval_wrapped = True  # type: ignore
    return wrapped_handler
409
+
410
+
411
def _create_llm_span_for_messages(
    tracer: "BaseTracer",
    messages: List[Any],  # List of AssistantMessage objects (duck-typed)
    prompt: Any,
    conversation_history: List[Dict[str, Any]],
    start_time: Optional[float] = None,
) -> Tuple[Optional[Dict[str, Any]], Optional[Any], Optional[Any]]:
    """Creates an LLM span for a group of AssistantMessage objects.

    The span's context manager is entered here but deliberately NOT exited:
    the caller (LLMSpanTracker) owns it and closes it when the next span
    starts or on cleanup.

    Args:
        tracer: Tracer providing the underlying OTel tracer.
        messages: AssistantMessage objects; identified by class name only.
        prompt: The original user prompt (used when it is a string).
        conversation_history: Prior turns, used to build the span input.
        start_time: Wall-clock start in seconds since epoch, marked by the
            tracker so sequential LLM spans do not overlap.

    Returns a tuple of (final_content, span, span_context):
    - final_content: The final message content to add to conversation history
    - span: The LLM span object (for logging metrics later)
    - span_context: The span context manager the caller must exit
    """
    if not messages:
        return None, None, None

    last_message = messages[-1]
    if type(last_message).__name__ != "AssistantMessage":
        return None, None, None

    model = getattr(last_message, "model", None)
    input_messages = _build_llm_input(prompt, conversation_history)

    outputs: List[Dict[str, Any]] = []
    for msg in messages:
        if hasattr(msg, "content"):
            content = _serialize_content_blocks(msg.content)
            outputs.append({"content": content, "role": "assistant"})

    # Create LLM span. OTel expects start_time in nanoseconds since epoch.
    # Previously the marked start time was accepted but never passed through,
    # so every span started "now" and the sequential-timing behavior promised
    # by LLMSpanTracker was silently lost.
    llm_span_context = tracer.get_tracer().start_as_current_span(
        "anthropic.messages.create",
        attributes={
            AttributeKeys.JUDGMENT_SPAN_KIND: "llm",
        },
        start_time=int(start_time * 1e9) if start_time is not None else None,
    )
    llm_span = llm_span_context.__enter__()

    # Record attributes
    if model:
        set_span_attribute(llm_span, AttributeKeys.JUDGMENT_LLM_MODEL_NAME, model)
    # Set provider to anthropic for cost calculation
    set_span_attribute(llm_span, AttributeKeys.JUDGMENT_LLM_PROVIDER, "anthropic")

    if input_messages:
        set_span_attribute(
            llm_span, AttributeKeys.JUDGMENT_INPUT, safe_serialize(input_messages)
        )

    if outputs:
        set_span_attribute(
            llm_span, AttributeKeys.JUDGMENT_OUTPUT, safe_serialize(outputs)
        )

    # Return final message content for conversation history and the span
    if hasattr(last_message, "content"):
        content = _serialize_content_blocks(last_message.content)
        return {"content": content, "role": "assistant"}, llm_span, llm_span_context

    return None, llm_span, llm_span_context
472
+
473
+
474
+ def _serialize_content_blocks(content: Any) -> Any:
475
+ """Converts content blocks to a serializable format with proper type fields."""
476
+ if isinstance(content, list):
477
+ result = []
478
+ for block in content:
479
+ if dataclasses.is_dataclass(block) and not isinstance(block, type):
480
+ serialized = dataclasses.asdict(block) # type: ignore
481
+
482
+ block_type = type(block).__name__
483
+ if block_type == "TextBlock":
484
+ serialized["type"] = "text"
485
+ elif block_type == "ToolUseBlock":
486
+ serialized["type"] = "tool_use"
487
+ elif block_type == "ToolResultBlock":
488
+ serialized["type"] = "tool_result"
489
+
490
+ # Simplify content if it's a single text block
491
+ content_value = serialized.get("content")
492
+ if isinstance(content_value, list) and len(content_value) == 1:
493
+ item = content_value[0]
494
+ if (
495
+ isinstance(item, dict)
496
+ and item.get("type") == "text"
497
+ and "text" in item
498
+ ):
499
+ serialized["content"] = item["text"]
500
+
501
+ # Remove None is_error
502
+ if "is_error" in serialized and serialized["is_error"] is None:
503
+ del serialized["is_error"]
504
+ else:
505
+ serialized = block
506
+
507
+ result.append(serialized)
508
+ return result
509
+ return content
510
+
511
+
512
def _extract_usage_from_result_message(result_message: Any) -> Dict[str, Any]:
    """Extracts and normalizes usage metrics from a ResultMessage.

    Returns an empty dict when the message has no (truthy) ``usage``.
    ``usage`` may be a plain dict or an object with attributes.
    """
    usage = getattr(result_message, "usage", None)
    if not usage:
        return {}

    # Uniform accessor over dicts and attribute-style objects; missing
    # fields read as None either way.
    if isinstance(usage, dict):
        read = usage.get
    else:
        def read(key: str) -> Any:
            return getattr(usage, key, None)

    # Token-count fields mapped to their judgment attribute keys, in the
    # order they should appear in the metrics dict.
    token_fields = (
        ("input_tokens", AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS),
        ("output_tokens", AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS),
        (
            "cache_creation_input_tokens",
            AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
        ),
        (
            "cache_read_input_tokens",
            AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
        ),
    )

    metrics: Dict[str, Any] = {}
    for field_name, attr_key in token_fields:
        value = read(field_name)
        if value is not None:
            metrics[attr_key] = value

    # Preserve the raw usage payload for downstream inspection.
    metrics[AttributeKeys.JUDGMENT_USAGE_METADATA] = safe_serialize(usage)

    return metrics
552
+
553
+
554
+ def _build_llm_input(
555
+ prompt: Any, conversation_history: List[Dict[str, Any]]
556
+ ) -> Optional[List[Dict[str, Any]]]:
557
+ """Builds the input array for an LLM span from the initial prompt and conversation history."""
558
+ if isinstance(prompt, str):
559
+ if len(conversation_history) == 0:
560
+ return [{"content": prompt, "role": "user"}]
561
+ else:
562
+ return [{"content": prompt, "role": "user"}] + conversation_history
563
+
564
+ return conversation_history if conversation_history else None
@@ -0,0 +1,13 @@
1
+ from __future__ import annotations
2
+
3
+ from abc import ABC
4
+ import os
5
+
6
+
7
class Langgraph(ABC):
    """Sets the LangSmith environment switches that route LangGraph
    telemetry through OpenTelemetry."""

    @staticmethod
    def initialize(otel_only: bool = True):
        """Enable LangSmith tracing via environment variables.

        Sets ``LANGSMITH_OTEL_ENABLED`` and ``LANGSMITH_TRACING`` to
        ``"true"``; when ``otel_only`` is True (the default), also sets
        ``LANGSMITH_OTEL_ONLY``.
        """
        env_flags = {
            "LANGSMITH_OTEL_ENABLED": "true",
            "LANGSMITH_TRACING": "true",
        }
        if otel_only:
            env_flags["LANGSMITH_OTEL_ONLY"] = "true"
        os.environ.update(env_flags)
@@ -0,0 +1,47 @@
1
+ from abc import ABC
2
+ from judgeval.v1.tracer import Tracer
3
+ from judgeval.logger import judgeval_logger
4
+ from judgeval.utils.url import url_for
5
+ from judgeval.v1.utils import resolve_project_id
6
+
7
+
8
+ try:
9
+ import openlit # type: ignore
10
+ except ImportError:
11
+ raise ImportError(
12
+ "Openlit is not installed and required for the openlit integration. Please install it with `pip install openlit`."
13
+ )
14
+
15
+
16
class Openlit(ABC):
    """Wires openlit's instrumentation into the Judgment OTel endpoint."""

    @staticmethod
    def initialize(
        tracer: Tracer,
        **kwargs,
    ):
        """Initialize openlit against the tracer's project.

        Looks up the project id for the tracer's project; if the project
        does not exist, logs a warning and returns without initializing.
        Extra ``kwargs`` are forwarded to ``openlit.init``.
        """
        client = tracer.api_client
        api_key = client.api_key
        organization_id = client.organization_id
        project_name = tracer.project_name

        project_id = resolve_project_id(client, project_name)
        if not project_id:
            judgeval_logger.warning(
                f"Project {project_name} not found. Please create it first at https://app.judgmentlabs.ai/org/{organization_id}/projects."
            )
            return

        # Auth + routing headers for the Judgment OTLP collector.
        otlp_headers = {
            "Authorization": f"Bearer {api_key}",
            "X-Organization-Id": organization_id,
            "X-Project-Id": project_id,
        }
        openlit.init(
            service_name=project_name,
            otlp_endpoint=url_for("/otel"),
            otlp_headers=otlp_headers,
            tracer=tracer.get_tracer(),
            disable_metrics=True,
            **kwargs,
        )
45
+
46
+
47
+ __all__ = ["Openlit"]