judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (234) hide show
  1. judgeval/__init__.py +173 -10
  2. judgeval/api/__init__.py +523 -0
  3. judgeval/api/api_types.py +413 -0
  4. judgeval/cli.py +112 -0
  5. judgeval/constants.py +7 -30
  6. judgeval/data/__init__.py +1 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +14 -40
  9. judgeval/data/judgment_types.py +396 -146
  10. judgeval/data/result.py +11 -18
  11. judgeval/data/scorer_data.py +3 -26
  12. judgeval/data/scripts/openapi_transform.py +5 -5
  13. judgeval/data/trace.py +115 -194
  14. judgeval/dataset/__init__.py +335 -0
  15. judgeval/env.py +55 -0
  16. judgeval/evaluation/__init__.py +346 -0
  17. judgeval/exceptions.py +28 -0
  18. judgeval/integrations/langgraph/__init__.py +13 -0
  19. judgeval/integrations/openlit/__init__.py +51 -0
  20. judgeval/judges/__init__.py +2 -2
  21. judgeval/judges/litellm_judge.py +77 -16
  22. judgeval/judges/together_judge.py +88 -17
  23. judgeval/judges/utils.py +7 -20
  24. judgeval/judgment_attribute_keys.py +55 -0
  25. judgeval/{common/logger.py → logger.py} +24 -8
  26. judgeval/prompt/__init__.py +330 -0
  27. judgeval/scorers/__init__.py +11 -11
  28. judgeval/scorers/agent_scorer.py +15 -19
  29. judgeval/scorers/api_scorer.py +21 -23
  30. judgeval/scorers/base_scorer.py +54 -36
  31. judgeval/scorers/example_scorer.py +1 -3
  32. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
  33. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
  34. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  35. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
  36. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
  37. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
  38. judgeval/scorers/score.py +64 -47
  39. judgeval/scorers/utils.py +2 -107
  40. judgeval/tracer/__init__.py +1111 -2
  41. judgeval/tracer/constants.py +1 -0
  42. judgeval/tracer/exporters/__init__.py +40 -0
  43. judgeval/tracer/exporters/s3.py +119 -0
  44. judgeval/tracer/exporters/store.py +59 -0
  45. judgeval/tracer/exporters/utils.py +32 -0
  46. judgeval/tracer/keys.py +63 -0
  47. judgeval/tracer/llm/__init__.py +7 -0
  48. judgeval/tracer/llm/config.py +78 -0
  49. judgeval/tracer/llm/constants.py +9 -0
  50. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  51. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  52. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  53. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  54. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  55. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  56. judgeval/tracer/llm/llm_google/config.py +6 -0
  57. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  58. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  59. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  60. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  61. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  62. judgeval/tracer/llm/llm_openai/config.py +6 -0
  63. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  64. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  65. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  66. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  67. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  68. judgeval/tracer/llm/llm_together/config.py +6 -0
  69. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  70. judgeval/tracer/llm/providers.py +19 -0
  71. judgeval/tracer/managers.py +167 -0
  72. judgeval/tracer/processors/__init__.py +220 -0
  73. judgeval/tracer/utils.py +19 -0
  74. judgeval/trainer/__init__.py +14 -0
  75. judgeval/trainer/base_trainer.py +122 -0
  76. judgeval/trainer/config.py +123 -0
  77. judgeval/trainer/console.py +144 -0
  78. judgeval/trainer/fireworks_trainer.py +392 -0
  79. judgeval/trainer/trainable_model.py +252 -0
  80. judgeval/trainer/trainer.py +70 -0
  81. judgeval/utils/async_utils.py +39 -0
  82. judgeval/utils/decorators/__init__.py +0 -0
  83. judgeval/utils/decorators/dont_throw.py +37 -0
  84. judgeval/utils/decorators/use_once.py +13 -0
  85. judgeval/utils/file_utils.py +74 -28
  86. judgeval/utils/guards.py +36 -0
  87. judgeval/utils/meta.py +27 -0
  88. judgeval/utils/project.py +15 -0
  89. judgeval/utils/serialize.py +253 -0
  90. judgeval/utils/testing.py +70 -0
  91. judgeval/utils/url.py +10 -0
  92. judgeval/{version_check.py → utils/version_check.py} +5 -3
  93. judgeval/utils/wrappers/README.md +3 -0
  94. judgeval/utils/wrappers/__init__.py +15 -0
  95. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  96. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  97. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  98. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  99. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  100. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  101. judgeval/utils/wrappers/py.typed +0 -0
  102. judgeval/utils/wrappers/utils.py +35 -0
  103. judgeval/v1/__init__.py +88 -0
  104. judgeval/v1/data/__init__.py +7 -0
  105. judgeval/v1/data/example.py +44 -0
  106. judgeval/v1/data/scorer_data.py +42 -0
  107. judgeval/v1/data/scoring_result.py +44 -0
  108. judgeval/v1/datasets/__init__.py +6 -0
  109. judgeval/v1/datasets/dataset.py +214 -0
  110. judgeval/v1/datasets/dataset_factory.py +94 -0
  111. judgeval/v1/evaluation/__init__.py +6 -0
  112. judgeval/v1/evaluation/evaluation.py +182 -0
  113. judgeval/v1/evaluation/evaluation_factory.py +17 -0
  114. judgeval/v1/instrumentation/__init__.py +6 -0
  115. judgeval/v1/instrumentation/llm/__init__.py +7 -0
  116. judgeval/v1/instrumentation/llm/config.py +78 -0
  117. judgeval/v1/instrumentation/llm/constants.py +11 -0
  118. judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
  119. judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
  120. judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
  121. judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
  122. judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
  123. judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
  124. judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
  125. judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
  126. judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
  127. judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
  128. judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
  129. judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
  130. judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
  131. judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
  132. judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
  133. judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
  134. judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
  135. judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
  136. judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
  137. judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
  138. judgeval/v1/instrumentation/llm/providers.py +19 -0
  139. judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
  140. judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
  141. judgeval/v1/integrations/langgraph/__init__.py +13 -0
  142. judgeval/v1/integrations/openlit/__init__.py +47 -0
  143. judgeval/v1/internal/api/__init__.py +525 -0
  144. judgeval/v1/internal/api/api_types.py +413 -0
  145. judgeval/v1/prompts/__init__.py +6 -0
  146. judgeval/v1/prompts/prompt.py +29 -0
  147. judgeval/v1/prompts/prompt_factory.py +189 -0
  148. judgeval/v1/py.typed +0 -0
  149. judgeval/v1/scorers/__init__.py +6 -0
  150. judgeval/v1/scorers/api_scorer.py +82 -0
  151. judgeval/v1/scorers/base_scorer.py +17 -0
  152. judgeval/v1/scorers/built_in/__init__.py +17 -0
  153. judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
  154. judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
  155. judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
  156. judgeval/v1/scorers/built_in/faithfulness.py +28 -0
  157. judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
  158. judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
  159. judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
  160. judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
  161. judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
  162. judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
  163. judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
  164. judgeval/v1/scorers/scorers_factory.py +49 -0
  165. judgeval/v1/tracer/__init__.py +7 -0
  166. judgeval/v1/tracer/base_tracer.py +520 -0
  167. judgeval/v1/tracer/exporters/__init__.py +14 -0
  168. judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
  169. judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
  170. judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
  171. judgeval/v1/tracer/exporters/span_store.py +50 -0
  172. judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
  173. judgeval/v1/tracer/processors/__init__.py +6 -0
  174. judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
  175. judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
  176. judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
  177. judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
  178. judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
  179. judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
  180. judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
  181. judgeval/v1/tracer/tracer.py +67 -0
  182. judgeval/v1/tracer/tracer_factory.py +38 -0
  183. judgeval/v1/trainers/__init__.py +5 -0
  184. judgeval/v1/trainers/base_trainer.py +62 -0
  185. judgeval/v1/trainers/config.py +123 -0
  186. judgeval/v1/trainers/console.py +144 -0
  187. judgeval/v1/trainers/fireworks_trainer.py +392 -0
  188. judgeval/v1/trainers/trainable_model.py +252 -0
  189. judgeval/v1/trainers/trainers_factory.py +37 -0
  190. judgeval/v1/utils.py +18 -0
  191. judgeval/version.py +5 -0
  192. judgeval/warnings.py +4 -0
  193. judgeval-0.23.0.dist-info/METADATA +266 -0
  194. judgeval-0.23.0.dist-info/RECORD +201 -0
  195. judgeval-0.23.0.dist-info/entry_points.txt +2 -0
  196. judgeval/clients.py +0 -34
  197. judgeval/common/__init__.py +0 -13
  198. judgeval/common/api/__init__.py +0 -3
  199. judgeval/common/api/api.py +0 -352
  200. judgeval/common/api/constants.py +0 -165
  201. judgeval/common/exceptions.py +0 -27
  202. judgeval/common/storage/__init__.py +0 -6
  203. judgeval/common/storage/s3_storage.py +0 -98
  204. judgeval/common/tracer/__init__.py +0 -31
  205. judgeval/common/tracer/constants.py +0 -22
  206. judgeval/common/tracer/core.py +0 -1916
  207. judgeval/common/tracer/otel_exporter.py +0 -108
  208. judgeval/common/tracer/otel_span_processor.py +0 -234
  209. judgeval/common/tracer/span_processor.py +0 -37
  210. judgeval/common/tracer/span_transformer.py +0 -211
  211. judgeval/common/tracer/trace_manager.py +0 -92
  212. judgeval/common/utils.py +0 -940
  213. judgeval/data/datasets/__init__.py +0 -4
  214. judgeval/data/datasets/dataset.py +0 -341
  215. judgeval/data/datasets/eval_dataset_client.py +0 -214
  216. judgeval/data/tool.py +0 -5
  217. judgeval/data/trace_run.py +0 -37
  218. judgeval/evaluation_run.py +0 -75
  219. judgeval/integrations/langgraph.py +0 -843
  220. judgeval/judges/mixture_of_judges.py +0 -286
  221. judgeval/judgment_client.py +0 -369
  222. judgeval/rules.py +0 -521
  223. judgeval/run_evaluation.py +0 -684
  224. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
  225. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
  226. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
  227. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
  228. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
  229. judgeval/utils/alerts.py +0 -93
  230. judgeval/utils/requests.py +0 -50
  231. judgeval-0.1.0.dist-info/METADATA +0 -202
  232. judgeval-0.1.0.dist-info/RECORD +0 -73
  233. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
  234. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -0,0 +1,382 @@
1
+ from __future__ import annotations
2
+ from typing import (
3
+ TYPE_CHECKING,
4
+ Any,
5
+ Awaitable,
6
+ Callable,
7
+ Dict,
8
+ Iterator,
9
+ AsyncIterator,
10
+ Generator,
11
+ AsyncGenerator,
12
+ )
13
+
14
+ from opentelemetry.trace import Status, StatusCode
15
+ from judgeval.judgment_attribute_keys import AttributeKeys
16
+ from judgeval.utils.serialize import safe_serialize
17
+ from judgeval.utils.wrappers import (
18
+ immutable_wrap_async,
19
+ immutable_wrap_sync,
20
+ mutable_wrap_sync,
21
+ mutable_wrap_async,
22
+ immutable_wrap_sync_iterator,
23
+ immutable_wrap_async_iterator,
24
+ )
25
+
26
+ if TYPE_CHECKING:
27
+ from judgeval.v1.tracer import BaseTracer
28
+ from together import Together, AsyncTogether # type: ignore[import-untyped]
29
+ from together.types import ChatCompletionResponse, ChatCompletionChunk # type: ignore[import-untyped]
30
+ from together.types.common import UsageData # type: ignore[import-untyped]
31
+
32
+
33
+ def _extract_together_tokens(usage: UsageData) -> tuple[int, int, int, int]:
34
+ prompt_tokens = usage.prompt_tokens if usage.prompt_tokens is not None else 0
35
+ completion_tokens = (
36
+ usage.completion_tokens if usage.completion_tokens is not None else 0
37
+ )
38
+ cache_read_input_tokens = 0
39
+ cache_creation_input_tokens = 0
40
+ return (
41
+ prompt_tokens,
42
+ completion_tokens,
43
+ cache_read_input_tokens,
44
+ cache_creation_input_tokens,
45
+ )
46
+
47
+
48
def wrap_chat_completions_create_sync(tracer: BaseTracer, client: Together) -> None:
    """Replace ``client.chat.completions.create`` with a traced dispatcher.

    The client is patched in place. On every call the dispatcher builds a
    fresh traced wrapper around the original ``create`` and forwards all
    arguments to it unchanged.

    Args:
        tracer: Tracer handed to the wrapper factories.
        client: Synchronous Together client whose ``create`` is replaced.
    """
    untraced_create = client.chat.completions.create

    def dispatcher(*args: Any, **kwargs: Any) -> Any:
        # Only a keyword ``stream`` argument is inspected here; a positional
        # one would not be seen and the call would be treated as non-streaming.
        if kwargs.get("stream", False):
            build_wrapper = _wrap_streaming_sync
        else:
            build_wrapper = _wrap_non_streaming_sync
        traced_create = build_wrapper(tracer, untraced_create)  # type: ignore[arg-type]
        return traced_create(*args, **kwargs)

    setattr(client.chat.completions, "create", dispatcher)
57
+
58
+
59
def _wrap_non_streaming_sync(
    tracer: BaseTracer, original_func: Callable[..., ChatCompletionResponse]
) -> Callable[..., ChatCompletionResponse]:
    """Build a traced version of a blocking, non-streaming ``create`` call.

    The hooks below open one ``TOGETHER_API_CALL`` span per call, copy the
    request and response onto it, flag it as an error when the call raises,
    and end it.

    Args:
        tracer: Source of the OpenTelemetry tracer used to start the span.
        original_func: The untraced ``create`` callable.

    Returns:
        ``original_func`` wrapped by ``immutable_wrap_sync`` with the hooks
        defined here.
    """

    def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
        # Open the span and record the request keyword arguments as the prompt.
        llm_span = tracer.get_tracer().start_span(
            "TOGETHER_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
        )
        ctx["span"] = llm_span
        llm_span.set_attribute(AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs))

        # An empty or missing model stays empty rather than becoming a bare prefix.
        requested_model = kwargs.get("model", "")
        ctx["model_name"] = f"together_ai/{requested_model}" if requested_model else ""
        llm_span.set_attribute(AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"])

    def post_hook(ctx: Dict[str, Any], result: ChatCompletionResponse) -> None:
        llm_span = ctx.get("span")
        if llm_span:
            llm_span.set_attribute(
                AttributeKeys.GEN_AI_COMPLETION, safe_serialize(result)
            )

            usage = result.usage
            if usage:
                # The two cache counters returned by the helper are not recorded.
                input_tokens, output_tokens, _, _ = _extract_together_tokens(usage)
                llm_span.set_attribute(
                    AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS, input_tokens
                )
                llm_span.set_attribute(
                    AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, output_tokens
                )
                llm_span.set_attribute(
                    AttributeKeys.JUDGMENT_USAGE_METADATA, safe_serialize(usage)
                )

            llm_span.set_attribute(
                AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
            )

    def error_hook(ctx: Dict[str, Any], error: Exception) -> None:
        llm_span = ctx.get("span")
        if not llm_span:
            return
        llm_span.record_exception(error)
        llm_span.set_status(Status(StatusCode.ERROR))

    def finally_hook(ctx: Dict[str, Any]) -> None:
        llm_span = ctx.get("span")
        if not llm_span:
            return
        llm_span.end()

    return immutable_wrap_sync(
        original_func,
        pre_hook=pre_hook,
        post_hook=post_hook,
        error_hook=error_hook,
        finally_hook=finally_hook,
    )
119
+
120
+
121
def _wrap_streaming_sync(
    tracer: BaseTracer, original_func: Callable[..., Iterator[ChatCompletionChunk]]
) -> Callable[..., Iterator[ChatCompletionChunk]]:
    """Build a traced version of a blocking, streaming ``create`` call.

    A ``TOGETHER_API_CALL`` span is opened before the request is issued. On
    success the returned chunk iterator is replaced by a traced one whose
    hooks accumulate the streamed text, record usage figures from any chunk
    that carries them, and end the span. If the request itself raises, no
    traced iterator exists, so the outer error hook ends the span itself.

    Args:
        tracer: Source of the OpenTelemetry tracer used to start the span.
        original_func: The untraced ``create`` callable returning a chunk
            iterator.

    Returns:
        ``original_func`` wrapped by ``mutable_wrap_sync`` with the hooks
        defined here.
    """

    def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
        ctx["span"] = tracer.get_tracer().start_span(
            "TOGETHER_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
        )
        ctx["span"].set_attribute(AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs))
        ctx["model_name"] = kwargs.get("model", "")
        # An empty or missing model stays empty rather than becoming a bare prefix.
        prefixed_model_name = (
            f"together_ai/{ctx['model_name']}" if ctx["model_name"] else ""
        )
        ctx["model_name"] = prefixed_model_name
        ctx["span"].set_attribute(
            AttributeKeys.JUDGMENT_LLM_MODEL_NAME, prefixed_model_name
        )
        ctx["accumulated_content"] = ""

    def mutate_hook(
        ctx: Dict[str, Any], result: Iterator[ChatCompletionChunk]
    ) -> Iterator[ChatCompletionChunk]:
        def traced_generator() -> Generator[ChatCompletionChunk, None, None]:
            for chunk in result:
                yield chunk

        # The inner hooks deliberately read the outer ``ctx`` (which holds the
        # span), not the ``inner_ctx`` supplied by the iterator wrapper.
        def yield_hook(inner_ctx: Dict[str, Any], chunk: ChatCompletionChunk) -> None:
            span = ctx.get("span")
            if not span:
                return

            if chunk.choices:
                delta = chunk.choices[0].delta
                if delta and hasattr(delta, "content") and delta.content:
                    ctx["accumulated_content"] = (
                        ctx.get("accumulated_content", "") + delta.content
                    )

            if chunk.usage:
                prompt_tokens, completion_tokens, _, _ = _extract_together_tokens(
                    chunk.usage
                )
                span.set_attribute(
                    AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
                    prompt_tokens,
                )
                span.set_attribute(
                    AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
                )
                span.set_attribute(
                    AttributeKeys.JUDGMENT_USAGE_METADATA,
                    safe_serialize(chunk.usage),
                )

        def post_hook_inner(inner_ctx: Dict[str, Any]) -> None:
            span = ctx.get("span")
            if span:
                accumulated = ctx.get("accumulated_content", "")
                span.set_attribute(AttributeKeys.GEN_AI_COMPLETION, accumulated)

        def error_hook_inner(inner_ctx: Dict[str, Any], error: Exception) -> None:
            span = ctx.get("span")
            if span:
                span.record_exception(error)
                span.set_status(Status(StatusCode.ERROR))

        def finally_hook_inner(inner_ctx: Dict[str, Any]) -> None:
            span = ctx.get("span")
            if span:
                span.end()

        wrapped_generator = immutable_wrap_sync_iterator(
            traced_generator,
            yield_hook=yield_hook,
            post_hook=post_hook_inner,
            error_hook=error_hook_inner,
            finally_hook=finally_hook_inner,
        )

        return wrapped_generator()

    def error_hook(ctx: Dict[str, Any], error: Exception) -> None:
        span = ctx.get("span")
        if span:
            span.record_exception(error)
            span.set_status(Status(StatusCode.ERROR))
            # The request failed before a traced iterator was produced, so the
            # iterator's finally hook will not run; end the span here or it is
            # left open.
            # NOTE(review): assumes mutable_wrap_sync skips mutate_hook when
            # original_func raises - confirm against the wrapper.
            span.end()

    return mutable_wrap_sync(
        original_func,
        pre_hook=pre_hook,
        mutate_hook=mutate_hook,
        error_hook=error_hook,
    )
213
+
214
+
215
def wrap_chat_completions_create_async(
    tracer: BaseTracer, client: AsyncTogether
) -> None:
    """Replace ``client.chat.completions.create`` with a traced async dispatcher.

    The client is patched in place. On every call the dispatcher builds a
    fresh traced wrapper around the original ``create``, awaits it with the
    caller's arguments, and returns the result.

    Args:
        tracer: Tracer handed to the wrapper factories.
        client: Asynchronous Together client whose ``create`` is replaced.
    """
    untraced_create = client.chat.completions.create

    async def dispatcher(*args: Any, **kwargs: Any) -> Any:
        # Only a keyword ``stream`` argument is inspected here; a positional
        # one would not be seen and the call would be treated as non-streaming.
        if kwargs.get("stream", False):
            build_wrapper = _wrap_streaming_async
        else:
            build_wrapper = _wrap_non_streaming_async
        traced_create = build_wrapper(tracer, untraced_create)  # type: ignore[arg-type]
        return await traced_create(*args, **kwargs)

    setattr(client.chat.completions, "create", dispatcher)
226
+
227
+
228
def _wrap_non_streaming_async(
    tracer: BaseTracer, original_func: Callable[..., Awaitable[ChatCompletionResponse]]
) -> Callable[..., Awaitable[ChatCompletionResponse]]:
    """Build a traced version of an awaitable, non-streaming ``create`` call.

    The hooks below open one ``TOGETHER_API_CALL`` span per call, copy the
    request and response onto it, flag it as an error when the call raises,
    and end it.

    Args:
        tracer: Source of the OpenTelemetry tracer used to start the span.
        original_func: The untraced awaitable ``create`` callable.

    Returns:
        ``original_func`` wrapped by ``immutable_wrap_async`` with the hooks
        defined here.
    """

    def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
        # Open the span and record the request keyword arguments as the prompt.
        llm_span = tracer.get_tracer().start_span(
            "TOGETHER_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
        )
        ctx["span"] = llm_span
        llm_span.set_attribute(AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs))

        # An empty or missing model stays empty rather than becoming a bare prefix.
        requested_model = kwargs.get("model", "")
        ctx["model_name"] = f"together_ai/{requested_model}" if requested_model else ""
        llm_span.set_attribute(AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"])

    def post_hook(ctx: Dict[str, Any], result: ChatCompletionResponse) -> None:
        llm_span = ctx.get("span")
        if llm_span:
            llm_span.set_attribute(
                AttributeKeys.GEN_AI_COMPLETION, safe_serialize(result)
            )

            usage = result.usage
            if usage:
                # The two cache counters returned by the helper are not recorded.
                input_tokens, output_tokens, _, _ = _extract_together_tokens(usage)
                llm_span.set_attribute(
                    AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS, input_tokens
                )
                llm_span.set_attribute(
                    AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, output_tokens
                )
                llm_span.set_attribute(
                    AttributeKeys.JUDGMENT_USAGE_METADATA, safe_serialize(usage)
                )

            llm_span.set_attribute(
                AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
            )

    def error_hook(ctx: Dict[str, Any], error: Exception) -> None:
        llm_span = ctx.get("span")
        if not llm_span:
            return
        llm_span.record_exception(error)
        llm_span.set_status(Status(StatusCode.ERROR))

    def finally_hook(ctx: Dict[str, Any]) -> None:
        llm_span = ctx.get("span")
        if not llm_span:
            return
        llm_span.end()

    return immutable_wrap_async(
        original_func,
        pre_hook=pre_hook,
        post_hook=post_hook,
        error_hook=error_hook,
        finally_hook=finally_hook,
    )
288
+
289
+
290
def _wrap_streaming_async(
    tracer: BaseTracer,
    original_func: Callable[..., Awaitable[AsyncIterator[ChatCompletionChunk]]],
) -> Callable[..., Awaitable[AsyncIterator[ChatCompletionChunk]]]:
    """Build a traced version of an awaitable, streaming ``create`` call.

    A ``TOGETHER_API_CALL`` span is opened before the request is issued. On
    success the returned async chunk iterator is replaced by a traced one
    whose hooks accumulate the streamed text, record usage figures from any
    chunk that carries them, and end the span. If the request itself raises,
    no traced iterator exists, so the outer error hook ends the span itself.

    Args:
        tracer: Source of the OpenTelemetry tracer used to start the span.
        original_func: The untraced awaitable ``create`` callable resolving to
            an async chunk iterator.

    Returns:
        ``original_func`` wrapped by ``mutable_wrap_async`` with the hooks
        defined here.
    """

    def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
        ctx["span"] = tracer.get_tracer().start_span(
            "TOGETHER_API_CALL", attributes={AttributeKeys.JUDGMENT_SPAN_KIND: "llm"}
        )
        ctx["span"].set_attribute(AttributeKeys.GEN_AI_PROMPT, safe_serialize(kwargs))
        ctx["model_name"] = kwargs.get("model", "")
        # An empty or missing model stays empty rather than becoming a bare prefix.
        prefixed_model_name = (
            f"together_ai/{ctx['model_name']}" if ctx["model_name"] else ""
        )
        ctx["model_name"] = prefixed_model_name
        ctx["span"].set_attribute(
            AttributeKeys.JUDGMENT_LLM_MODEL_NAME, prefixed_model_name
        )
        ctx["accumulated_content"] = ""

    def mutate_hook(
        ctx: Dict[str, Any], result: AsyncIterator[ChatCompletionChunk]
    ) -> AsyncIterator[ChatCompletionChunk]:
        async def traced_generator() -> AsyncGenerator[ChatCompletionChunk, None]:
            async for chunk in result:
                yield chunk

        # The inner hooks deliberately read the outer ``ctx`` (which holds the
        # span), not the ``inner_ctx`` supplied by the iterator wrapper.
        def yield_hook(inner_ctx: Dict[str, Any], chunk: ChatCompletionChunk) -> None:
            span = ctx.get("span")
            if not span:
                return

            if chunk.choices:
                delta = chunk.choices[0].delta
                if delta and hasattr(delta, "content") and delta.content:
                    ctx["accumulated_content"] = (
                        ctx.get("accumulated_content", "") + delta.content
                    )

            if chunk.usage:
                prompt_tokens, completion_tokens, _, _ = _extract_together_tokens(
                    chunk.usage
                )
                span.set_attribute(
                    AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
                    prompt_tokens,
                )
                span.set_attribute(
                    AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
                )
                span.set_attribute(
                    AttributeKeys.JUDGMENT_USAGE_METADATA,
                    safe_serialize(chunk.usage),
                )

        def post_hook_inner(inner_ctx: Dict[str, Any]) -> None:
            span = ctx.get("span")
            if span:
                accumulated = ctx.get("accumulated_content", "")
                span.set_attribute(AttributeKeys.GEN_AI_COMPLETION, accumulated)

        def error_hook_inner(inner_ctx: Dict[str, Any], error: Exception) -> None:
            span = ctx.get("span")
            if span:
                span.record_exception(error)
                span.set_status(Status(StatusCode.ERROR))

        def finally_hook_inner(inner_ctx: Dict[str, Any]) -> None:
            span = ctx.get("span")
            if span:
                span.end()

        wrapped_generator = immutable_wrap_async_iterator(
            traced_generator,
            yield_hook=yield_hook,
            post_hook=post_hook_inner,
            error_hook=error_hook_inner,
            finally_hook=finally_hook_inner,
        )

        return wrapped_generator()

    def error_hook(ctx: Dict[str, Any], error: Exception) -> None:
        span = ctx.get("span")
        if span:
            span.record_exception(error)
            span.set_status(Status(StatusCode.ERROR))
            # The request failed before a traced iterator was produced, so the
            # iterator's finally hook will not run; end the span here or it is
            # left open.
            # NOTE(review): assumes mutable_wrap_async skips mutate_hook when
            # original_func raises - confirm against the wrapper.
            span.end()

    return mutable_wrap_async(
        original_func,
        pre_hook=pre_hook,
        mutate_hook=mutate_hook,
        error_hook=error_hook,
    )
@@ -0,0 +1,6 @@
1
+ from __future__ import annotations
2
+ import importlib.util
3
+
4
+ HAS_TOGETHER = importlib.util.find_spec("together") is not None
5
+
6
+ __all__ = ["HAS_TOGETHER"]
@@ -0,0 +1,57 @@
1
+ from __future__ import annotations
2
+ from typing import TYPE_CHECKING, Union
3
+ import typing
4
+
5
+ from judgeval.v1.instrumentation.llm.llm_together.chat_completions import (
6
+ wrap_chat_completions_create_sync,
7
+ wrap_chat_completions_create_async,
8
+ )
9
+
10
+
11
+ if TYPE_CHECKING:
12
+ from judgeval.v1.tracer import BaseTracer
13
+ from together import Together, AsyncTogether # type: ignore[import-untyped]
14
+
15
+ TClient = Union[Together, AsyncTogether]
16
+
17
+
18
def wrap_together_client_sync(tracer: BaseTracer, client: Together) -> Together:
    """Instrument a synchronous Together client and hand it back.

    Args:
        tracer: Tracer passed through to the chat-completions wrapper.
        client: Client to patch in place.

    Returns:
        The same ``client`` object that was passed in.
    """
    wrap_chat_completions_create_sync(tracer, client)
    return client
21
+
22
+
23
def wrap_together_client_async(
    tracer: BaseTracer, client: AsyncTogether
) -> AsyncTogether:
    """Instrument an asynchronous Together client and hand it back.

    Args:
        tracer: Tracer passed through to the chat-completions wrapper.
        client: Client to patch in place.

    Returns:
        The same ``client`` object that was passed in.
    """
    wrap_chat_completions_create_async(tracer, client)
    return client
28
+
29
+
30
@typing.overload
def wrap_together_client(tracer: BaseTracer, client: Together) -> Together: ...
@typing.overload
def wrap_together_client(  # type: ignore[overload-cannot-match]
    tracer: BaseTracer,
    client: AsyncTogether,
) -> AsyncTogether: ...


def wrap_together_client(tracer: BaseTracer, client: TClient) -> TClient:
    """Instrument a Together client of either flavour.

    Args:
        tracer: Tracer passed through to the flavour-specific wrapper.
        client: A ``Together`` or ``AsyncTogether`` instance.

    Returns:
        The client that was passed in. When the ``together`` package is not
        installed, an error is logged and the client is returned unwrapped.

    Raises:
        TypeError: ``together`` is installed but ``client`` is neither an
            ``AsyncTogether`` nor a ``Together`` instance.
    """
    # Imported lazily, inside the function, as in the rest of this module.
    from judgeval.v1.instrumentation.llm.llm_together.config import HAS_TOGETHER
    from judgeval.logger import judgeval_logger

    if not HAS_TOGETHER:
        judgeval_logger.error(
            "Cannot wrap Together client: 'together' library not installed. "
            "Install it with: pip install together"
        )
        return client

    from together import Together, AsyncTogether  # type: ignore[import-untyped]

    # The async class is tested before the sync one.
    if isinstance(client, AsyncTogether):
        return wrap_together_client_async(tracer, client)
    if isinstance(client, Together):
        return wrap_together_client_sync(tracer, client)
    raise TypeError(f"Invalid client type: {type(client)}")
@@ -0,0 +1,19 @@
1
+ from __future__ import annotations
2
+ from typing import Any, TypeAlias
3
+
4
+ from judgeval.v1.instrumentation.llm.llm_openai.config import HAS_OPENAI
5
+ from judgeval.v1.instrumentation.llm.llm_together.config import HAS_TOGETHER
6
+ from judgeval.v1.instrumentation.llm.llm_anthropic.config import HAS_ANTHROPIC
7
+ from judgeval.v1.instrumentation.llm.llm_google.config import HAS_GOOGLE_GENAI
8
+
9
+ # TODO: if we support dependency groups we can have this better type, but during runtime, we do
10
+ # not know which clients an end user might have installed.
11
+ ApiClient: TypeAlias = Any
12
+
13
+ __all__ = [
14
+ "ApiClient",
15
+ "HAS_OPENAI",
16
+ "HAS_TOGETHER",
17
+ "HAS_ANTHROPIC",
18
+ "HAS_GOOGLE_GENAI",
19
+ ]
@@ -0,0 +1,119 @@
1
+ from __future__ import annotations
2
+ from typing import TYPE_CHECKING
3
+ import sys
4
+ from judgeval.logger import judgeval_logger
5
+
6
+ if TYPE_CHECKING:
7
+ from judgeval.v1.tracer.base_tracer import BaseTracer
8
+
9
+ __all__ = ["setup_claude_agent_sdk"]
10
+
11
# Fail fast, with installation guidance, when the optional SDK is missing.
try:
    import claude_agent_sdk  # type: ignore
except ImportError as import_error:
    # Chain explicitly so the traceback shows the underlying import failure
    # as the direct cause of this error.
    raise ImportError(
        "Claude Agent SDK is not installed and required for the claude agent sdk integration. Please install it with `pip install claude-agent-sdk`."
    ) from import_error
17
+
18
+
19
def _replace_in_loaded_modules(
    attr_name: str, original: object, replacement: object
) -> None:
    """Rebind ``attr_name`` in every loaded module that still holds ``original``."""
    # Iterate over a snapshot: attribute access on a module may trigger
    # imports that would change ``sys.modules`` during iteration.
    for module in list(sys.modules.values()):
        if module and getattr(module, attr_name, None) is original:
            setattr(module, attr_name, replacement)


def setup_claude_agent_sdk(
    tracer: "BaseTracer",
) -> bool:
    """
    Setup Judgeval integration with Claude Agent SDK. Will automatically patch the SDK for automatic tracing.

    Each of ``ClaudeSDKClient``, ``SdkMcpTool``, ``tool`` and ``query`` that the
    installed SDK exposes is replaced by a traced wrapper, both on the
    ``claude_agent_sdk`` module and on every already-loaded module that holds
    a reference to the original object under the same name.

    Args:
        tracer: Judgeval Tracer instance

    Returns:
        bool: True if setup was successful, False otherwise. A failure is
        logged rather than raised; patches applied before the failure are
        not rolled back.

    Example:
        ```python
        import claude_agent_sdk
        from judgeval.v1.integrations.claude_agent_sdk import setup_claude_agent_sdk

        tracer = Tracer(project_name="my-project")
        setup_claude_agent_sdk(tracer=tracer)

        # Now use claude_agent_sdk normally - all calls automatically traced
        ```
    """
    from judgeval.v1.integrations.claude_agent_sdk.wrapper import (
        _create_client_wrapper_class,
        _create_tool_wrapper_class,
        _wrap_tool_factory,
        _wrap_query_function,
    )

    # (SDK attribute name, factory building its traced replacement), in the
    # order the patches are applied.
    # Note: The standalone query() uses InternalClient, not ClaudeSDKClient,
    # so it needs to be wrapped separately to add tracing.
    patch_plan = (
        ("ClaudeSDKClient", _create_client_wrapper_class),
        ("SdkMcpTool", _create_tool_wrapper_class),
        ("tool", _wrap_tool_factory),
        ("query", _wrap_query_function),
    )

    try:
        # Capture every original before patching anything.
        originals = {
            attr_name: getattr(claude_agent_sdk, attr_name, None)
            for attr_name, _ in patch_plan
        }

        for attr_name, make_wrapper in patch_plan:
            original = originals[attr_name]
            if not original:
                # This SDK version does not expose the attribute; skip it.
                continue

            wrapped = make_wrapper(original, tracer)
            setattr(claude_agent_sdk, attr_name, wrapped)

            # Update all modules that already imported the original object.
            _replace_in_loaded_modules(attr_name, original, wrapped)

        judgeval_logger.info("Claude Agent SDK integration setup successful")
        return True

    except Exception as e:
        judgeval_logger.error(f"Failed to setup Claude Agent SDK integration: {e}")
        return False