judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (234) hide show
  1. judgeval/__init__.py +173 -10
  2. judgeval/api/__init__.py +523 -0
  3. judgeval/api/api_types.py +413 -0
  4. judgeval/cli.py +112 -0
  5. judgeval/constants.py +7 -30
  6. judgeval/data/__init__.py +1 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +14 -40
  9. judgeval/data/judgment_types.py +396 -146
  10. judgeval/data/result.py +11 -18
  11. judgeval/data/scorer_data.py +3 -26
  12. judgeval/data/scripts/openapi_transform.py +5 -5
  13. judgeval/data/trace.py +115 -194
  14. judgeval/dataset/__init__.py +335 -0
  15. judgeval/env.py +55 -0
  16. judgeval/evaluation/__init__.py +346 -0
  17. judgeval/exceptions.py +28 -0
  18. judgeval/integrations/langgraph/__init__.py +13 -0
  19. judgeval/integrations/openlit/__init__.py +51 -0
  20. judgeval/judges/__init__.py +2 -2
  21. judgeval/judges/litellm_judge.py +77 -16
  22. judgeval/judges/together_judge.py +88 -17
  23. judgeval/judges/utils.py +7 -20
  24. judgeval/judgment_attribute_keys.py +55 -0
  25. judgeval/{common/logger.py → logger.py} +24 -8
  26. judgeval/prompt/__init__.py +330 -0
  27. judgeval/scorers/__init__.py +11 -11
  28. judgeval/scorers/agent_scorer.py +15 -19
  29. judgeval/scorers/api_scorer.py +21 -23
  30. judgeval/scorers/base_scorer.py +54 -36
  31. judgeval/scorers/example_scorer.py +1 -3
  32. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
  33. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
  34. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  35. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
  36. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
  37. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
  38. judgeval/scorers/score.py +64 -47
  39. judgeval/scorers/utils.py +2 -107
  40. judgeval/tracer/__init__.py +1111 -2
  41. judgeval/tracer/constants.py +1 -0
  42. judgeval/tracer/exporters/__init__.py +40 -0
  43. judgeval/tracer/exporters/s3.py +119 -0
  44. judgeval/tracer/exporters/store.py +59 -0
  45. judgeval/tracer/exporters/utils.py +32 -0
  46. judgeval/tracer/keys.py +63 -0
  47. judgeval/tracer/llm/__init__.py +7 -0
  48. judgeval/tracer/llm/config.py +78 -0
  49. judgeval/tracer/llm/constants.py +9 -0
  50. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  51. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  52. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  53. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  54. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  55. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  56. judgeval/tracer/llm/llm_google/config.py +6 -0
  57. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  58. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  59. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  60. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  61. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  62. judgeval/tracer/llm/llm_openai/config.py +6 -0
  63. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  64. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  65. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  66. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  67. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  68. judgeval/tracer/llm/llm_together/config.py +6 -0
  69. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  70. judgeval/tracer/llm/providers.py +19 -0
  71. judgeval/tracer/managers.py +167 -0
  72. judgeval/tracer/processors/__init__.py +220 -0
  73. judgeval/tracer/utils.py +19 -0
  74. judgeval/trainer/__init__.py +14 -0
  75. judgeval/trainer/base_trainer.py +122 -0
  76. judgeval/trainer/config.py +123 -0
  77. judgeval/trainer/console.py +144 -0
  78. judgeval/trainer/fireworks_trainer.py +392 -0
  79. judgeval/trainer/trainable_model.py +252 -0
  80. judgeval/trainer/trainer.py +70 -0
  81. judgeval/utils/async_utils.py +39 -0
  82. judgeval/utils/decorators/__init__.py +0 -0
  83. judgeval/utils/decorators/dont_throw.py +37 -0
  84. judgeval/utils/decorators/use_once.py +13 -0
  85. judgeval/utils/file_utils.py +74 -28
  86. judgeval/utils/guards.py +36 -0
  87. judgeval/utils/meta.py +27 -0
  88. judgeval/utils/project.py +15 -0
  89. judgeval/utils/serialize.py +253 -0
  90. judgeval/utils/testing.py +70 -0
  91. judgeval/utils/url.py +10 -0
  92. judgeval/{version_check.py → utils/version_check.py} +5 -3
  93. judgeval/utils/wrappers/README.md +3 -0
  94. judgeval/utils/wrappers/__init__.py +15 -0
  95. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  96. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  97. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  98. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  99. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  100. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  101. judgeval/utils/wrappers/py.typed +0 -0
  102. judgeval/utils/wrappers/utils.py +35 -0
  103. judgeval/v1/__init__.py +88 -0
  104. judgeval/v1/data/__init__.py +7 -0
  105. judgeval/v1/data/example.py +44 -0
  106. judgeval/v1/data/scorer_data.py +42 -0
  107. judgeval/v1/data/scoring_result.py +44 -0
  108. judgeval/v1/datasets/__init__.py +6 -0
  109. judgeval/v1/datasets/dataset.py +214 -0
  110. judgeval/v1/datasets/dataset_factory.py +94 -0
  111. judgeval/v1/evaluation/__init__.py +6 -0
  112. judgeval/v1/evaluation/evaluation.py +182 -0
  113. judgeval/v1/evaluation/evaluation_factory.py +17 -0
  114. judgeval/v1/instrumentation/__init__.py +6 -0
  115. judgeval/v1/instrumentation/llm/__init__.py +7 -0
  116. judgeval/v1/instrumentation/llm/config.py +78 -0
  117. judgeval/v1/instrumentation/llm/constants.py +11 -0
  118. judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
  119. judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
  120. judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
  121. judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
  122. judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
  123. judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
  124. judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
  125. judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
  126. judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
  127. judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
  128. judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
  129. judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
  130. judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
  131. judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
  132. judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
  133. judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
  134. judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
  135. judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
  136. judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
  137. judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
  138. judgeval/v1/instrumentation/llm/providers.py +19 -0
  139. judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
  140. judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
  141. judgeval/v1/integrations/langgraph/__init__.py +13 -0
  142. judgeval/v1/integrations/openlit/__init__.py +47 -0
  143. judgeval/v1/internal/api/__init__.py +525 -0
  144. judgeval/v1/internal/api/api_types.py +413 -0
  145. judgeval/v1/prompts/__init__.py +6 -0
  146. judgeval/v1/prompts/prompt.py +29 -0
  147. judgeval/v1/prompts/prompt_factory.py +189 -0
  148. judgeval/v1/py.typed +0 -0
  149. judgeval/v1/scorers/__init__.py +6 -0
  150. judgeval/v1/scorers/api_scorer.py +82 -0
  151. judgeval/v1/scorers/base_scorer.py +17 -0
  152. judgeval/v1/scorers/built_in/__init__.py +17 -0
  153. judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
  154. judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
  155. judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
  156. judgeval/v1/scorers/built_in/faithfulness.py +28 -0
  157. judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
  158. judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
  159. judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
  160. judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
  161. judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
  162. judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
  163. judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
  164. judgeval/v1/scorers/scorers_factory.py +49 -0
  165. judgeval/v1/tracer/__init__.py +7 -0
  166. judgeval/v1/tracer/base_tracer.py +520 -0
  167. judgeval/v1/tracer/exporters/__init__.py +14 -0
  168. judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
  169. judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
  170. judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
  171. judgeval/v1/tracer/exporters/span_store.py +50 -0
  172. judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
  173. judgeval/v1/tracer/processors/__init__.py +6 -0
  174. judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
  175. judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
  176. judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
  177. judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
  178. judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
  179. judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
  180. judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
  181. judgeval/v1/tracer/tracer.py +67 -0
  182. judgeval/v1/tracer/tracer_factory.py +38 -0
  183. judgeval/v1/trainers/__init__.py +5 -0
  184. judgeval/v1/trainers/base_trainer.py +62 -0
  185. judgeval/v1/trainers/config.py +123 -0
  186. judgeval/v1/trainers/console.py +144 -0
  187. judgeval/v1/trainers/fireworks_trainer.py +392 -0
  188. judgeval/v1/trainers/trainable_model.py +252 -0
  189. judgeval/v1/trainers/trainers_factory.py +37 -0
  190. judgeval/v1/utils.py +18 -0
  191. judgeval/version.py +5 -0
  192. judgeval/warnings.py +4 -0
  193. judgeval-0.23.0.dist-info/METADATA +266 -0
  194. judgeval-0.23.0.dist-info/RECORD +201 -0
  195. judgeval-0.23.0.dist-info/entry_points.txt +2 -0
  196. judgeval/clients.py +0 -34
  197. judgeval/common/__init__.py +0 -13
  198. judgeval/common/api/__init__.py +0 -3
  199. judgeval/common/api/api.py +0 -352
  200. judgeval/common/api/constants.py +0 -165
  201. judgeval/common/exceptions.py +0 -27
  202. judgeval/common/storage/__init__.py +0 -6
  203. judgeval/common/storage/s3_storage.py +0 -98
  204. judgeval/common/tracer/__init__.py +0 -31
  205. judgeval/common/tracer/constants.py +0 -22
  206. judgeval/common/tracer/core.py +0 -1916
  207. judgeval/common/tracer/otel_exporter.py +0 -108
  208. judgeval/common/tracer/otel_span_processor.py +0 -234
  209. judgeval/common/tracer/span_processor.py +0 -37
  210. judgeval/common/tracer/span_transformer.py +0 -211
  211. judgeval/common/tracer/trace_manager.py +0 -92
  212. judgeval/common/utils.py +0 -940
  213. judgeval/data/datasets/__init__.py +0 -4
  214. judgeval/data/datasets/dataset.py +0 -341
  215. judgeval/data/datasets/eval_dataset_client.py +0 -214
  216. judgeval/data/tool.py +0 -5
  217. judgeval/data/trace_run.py +0 -37
  218. judgeval/evaluation_run.py +0 -75
  219. judgeval/integrations/langgraph.py +0 -843
  220. judgeval/judges/mixture_of_judges.py +0 -286
  221. judgeval/judgment_client.py +0 -369
  222. judgeval/rules.py +0 -521
  223. judgeval/run_evaluation.py +0 -684
  224. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
  225. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
  226. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
  227. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
  228. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
  229. judgeval/utils/alerts.py +0 -93
  230. judgeval/utils/requests.py +0 -50
  231. judgeval-0.1.0.dist-info/METADATA +0 -202
  232. judgeval-0.1.0.dist-info/RECORD +0 -73
  233. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
  234. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -0,0 +1,520 @@
1
+ from __future__ import annotations
2
+
3
+ import datetime
4
+ import functools
5
+ import inspect
6
+ import time
7
+ from abc import ABC, abstractmethod
8
+ from typing import Any, Callable, Dict, Optional, Tuple, TypeVar, overload
9
+
10
+ from opentelemetry import trace
11
+ from opentelemetry.sdk.trace.export import SpanExporter
12
+ from opentelemetry.trace import Span, SpanContext, Status, StatusCode
13
+
14
+ from judgeval.logger import judgeval_logger
15
+ from judgeval.utils.decorators.dont_throw import dont_throw
16
+ from judgeval.v1.data.example import Example
17
+ from judgeval.v1.instrumentation import wrap_provider
18
+ from judgeval.v1.instrumentation.llm.providers import ApiClient
19
+ from judgeval.v1.internal.api import JudgmentSyncClient
20
+ from judgeval.v1.utils import resolve_project_id
21
+ from judgeval.v1.internal.api.api_types import (
22
+ ExampleEvaluationRun,
23
+ TraceEvaluationRun,
24
+ )
25
+ from judgeval.v1.scorers.base_scorer import BaseScorer
26
+ from judgeval.judgment_attribute_keys import AttributeKeys
27
+ from judgeval.v1.scorers.custom_scorer.custom_scorer import CustomScorer
28
+ from judgeval.v1.tracer.exporters.judgment_span_exporter import JudgmentSpanExporter
29
+ from judgeval.v1.tracer.processors.judgment_span_processor import JudgmentSpanProcessor
30
+ from uuid import uuid4
31
+ from opentelemetry.context import attach, detach, get_value, set_value
32
+ from judgeval.v1.tracer.processors._lifecycles import (
33
+ AGENT_ID_KEY,
34
+ PARENT_AGENT_ID_KEY,
35
+ CUSTOMER_ID_KEY,
36
+ AGENT_CLASS_NAME_KEY,
37
+ AGENT_INSTANCE_NAME_KEY,
38
+ )
39
+
40
+ C = TypeVar("C", bound=Callable[..., Any])
41
+
42
+
43
class BaseTracer(ABC):
    """Abstract base class for Judgment tracers built on OpenTelemetry.

    Holds the project/API configuration, exposes helpers for annotating the
    current span, provides the ``observe`` and ``agent`` decorators, and
    queues evaluation runs tied to the current span. Subclasses must
    implement ``initialize``, ``force_flush`` and ``shutdown``.
    """

    __slots__ = (
        "project_name",
        "enable_evaluation",
        "api_client",
        "serializer",
        "project_id",
    )

    TRACER_NAME = "judgeval"

    def __init__(
        self,
        project_name: str,
        enable_evaluation: bool,
        api_client: JudgmentSyncClient,
        serializer: Callable[[Any], str],
    ):
        """Store the configuration and resolve the project id.

        Args:
            project_name: Name of the Judgment project to report to.
            enable_evaluation: When false, ``async_evaluate`` and
                ``async_trace_evaluate`` return without doing anything.
            api_client: Client used to resolve the project and enqueue
                evaluation runs.
            serializer: Callable turning arbitrary values into strings for
                span attributes.
        """
        self.project_name = project_name
        self.enable_evaluation = enable_evaluation
        self.api_client = api_client
        self.serializer = serializer
        self.project_id = resolve_project_id(api_client, project_name)

        # An unresolved project is logged, not raised: the exporter/processor
        # getters below fall back to no-op implementations in that case.
        if self.project_id is None:
            judgeval_logger.error(
                f"Failed to resolve project {project_name}, "
                f"please create it first at https://app.judgmentlabs.ai/org/{self.api_client.organization_id}/projects. "
                "Skipping Judgment export."
            )

    @abstractmethod
    def initialize(self) -> None:
        pass

    @abstractmethod
    def force_flush(self, timeout_millis: int) -> bool:
        pass

    @abstractmethod
    def shutdown(self, timeout_millis: int) -> None:
        pass

    def get_span_exporter(self) -> SpanExporter:
        """Return a Judgment exporter, or a no-op exporter without a project id."""
        if self.project_id is not None:
            return JudgmentSpanExporter(
                endpoint=self._build_endpoint(self.api_client.base_url),
                api_key=self.api_client.api_key,
                organization_id=self.api_client.organization_id,
                project_id=self.project_id,
            )

        judgeval_logger.error(
            "Project not resolved; cannot create exporter, returning NoOpSpanExporter"
        )
        from judgeval.v1.tracer.exporters.noop_span_exporter import NoOpSpanExporter

        return NoOpSpanExporter()

    def get_span_processor(self) -> JudgmentSpanProcessor:
        """Return a Judgment span processor, or a no-op one without a project id."""
        if self.project_id is not None:
            return JudgmentSpanProcessor(
                self,
                self.get_span_exporter(),
            )

        judgeval_logger.error(
            "Project not resolved; cannot create processor, returning NoOpSpanProcessor"
        )
        from judgeval.v1.tracer.processors.noop_span_processor import (
            NoOpJudgmentSpanProcessor,
        )

        return NoOpJudgmentSpanProcessor()

    def get_tracer(self) -> trace.Tracer:
        """Return the OpenTelemetry tracer registered under ``TRACER_NAME``."""
        return trace.get_tracer(self.TRACER_NAME)

    def set_span_kind(self, kind: str) -> None:
        """Set the Judgment span-kind attribute on the current span."""
        if kind is None:
            return
        current_span = trace.get_current_span()
        if current_span is not None:
            current_span.set_attribute(AttributeKeys.JUDGMENT_SPAN_KIND, kind)

    @dont_throw
    def set_attribute(self, key: str, value: Any) -> None:
        """Set one attribute on the current span.

        Empty/``None`` keys and ``None`` values are ignored. Values that are
        not ``str``, ``int``, ``float`` or ``bool`` are passed through
        ``self.serializer`` first.
        """
        if not self._is_valid_key(key):
            return
        if value is None:
            return
        current_span = trace.get_current_span()
        if current_span is not None:
            serialized_value = (
                self.serializer(value)
                if not isinstance(value, (str, int, float, bool))
                else value
            )
            current_span.set_attribute(key, serialized_value)

    def set_attributes(self, attributes: Dict[str, Any]) -> None:
        """Apply ``set_attribute`` to every item of ``attributes``."""
        if attributes is None:
            return
        for key, value in attributes.items():
            self.set_attribute(key, value)

    def set_customer_id(self, customer_id: str) -> None:
        """Attach a context carrying ``customer_id`` under ``CUSTOMER_ID_KEY``."""
        ctx = set_value(CUSTOMER_ID_KEY, customer_id)
        # NOTE(review): the token returned by attach() is discarded, so this
        # context is never detached here - confirm that is intended.
        attach(ctx)

    def set_llm_span(self) -> None:
        """Mark the current span as kind ``"llm"``."""
        self.set_span_kind("llm")

    def set_tool_span(self) -> None:
        """Mark the current span as kind ``"tool"``."""
        self.set_span_kind("tool")

    def set_general_span(self) -> None:
        """Mark the current span as kind ``"span"``."""
        self.set_span_kind("span")

    def set_input(self, input_data: Any) -> None:
        """Record ``input_data`` as the current span's input attribute."""
        self.set_attribute(AttributeKeys.JUDGMENT_INPUT, input_data)

    def set_output(self, output_data: Any) -> None:
        """Record ``output_data`` as the current span's output attribute."""
        self.set_attribute(AttributeKeys.JUDGMENT_OUTPUT, output_data)

    def span(self, span_name: str, callable_func: Callable[[], Any]) -> Any:
        """Run ``callable_func`` inside a new current span and return its result.

        An exception raised by the callable is recorded on the span, the span
        status is set to ERROR, and the exception is re-raised.
        """
        tracer = self.get_tracer()
        with tracer.start_as_current_span(span_name) as span:
            try:
                return callable_func()
            except Exception as e:
                # Same status construction as the ``observe`` wrappers.
                span.set_status(Status(StatusCode.ERROR, str(e)))
                span.record_exception(e)
                raise

    @staticmethod
    def start_span(span_name: str) -> Span:
        """Start (without making current) a span on the ``TRACER_NAME`` tracer."""
        tracer = trace.get_tracer(BaseTracer.TRACER_NAME)
        return tracer.start_span(span_name)

    @dont_throw
    def async_evaluate(
        self,
        scorer: BaseScorer,
        example: Example,
    ) -> None:
        """Enqueue an example evaluation tied to the current sampled span.

        Does nothing when evaluation is disabled or when there is no valid,
        sampled current span.
        """
        if not self.enable_evaluation:
            return

        span_context = self._get_sampled_span_context()
        if span_context is None:
            return

        trace_id_hex = format(span_context.trace_id, "032x")
        span_id_hex = format(span_context.span_id, "016x")

        self._log_evaluation_info(
            "asyncEvaluate", trace_id_hex, span_id_hex, scorer.get_name()
        )

        evaluation_run = self._create_evaluation_run(
            scorer, example, trace_id_hex, span_id_hex
        )
        self._enqueue_evaluation(evaluation_run)

    @dont_throw
    def async_trace_evaluate(
        self,
        scorer: BaseScorer,
    ) -> None:
        """Attach a pending trace evaluation to the current sampled span.

        The serialized evaluation run is stored on the span under
        ``AttributeKeys.JUDGMENT_PENDING_TRACE_EVAL``. Does nothing when
        evaluation is disabled or there is no valid, sampled current span.
        """
        if not self.enable_evaluation:
            return

        current_span = self._get_sampled_span()
        if current_span is None:
            return

        span_context = current_span.get_span_context()
        trace_id_hex = format(span_context.trace_id, "032x")
        span_id_hex = format(span_context.span_id, "016x")

        self._log_evaluation_info(
            "asyncTraceEvaluate", trace_id_hex, span_id_hex, scorer.get_name()
        )

        evaluation_run = self._create_trace_evaluation_run(
            scorer, trace_id_hex, span_id_hex
        )
        try:
            trace_eval_json = self.serializer(evaluation_run)
            current_span.set_attribute(
                AttributeKeys.JUDGMENT_PENDING_TRACE_EVAL, trace_eval_json
            )
        except Exception as e:
            judgeval_logger.error(f"Failed to serialize trace evaluation: {e}")

    def _build_endpoint(self, base_url: str) -> str:
        """Append ``otel/v1/traces`` to ``base_url`` with exactly one joining slash added if missing."""
        if base_url.endswith("/"):
            return base_url + "otel/v1/traces"
        return base_url + "/otel/v1/traces"

    def _generate_run_id(self, prefix: str, span_id: Optional[str]) -> str:
        """Return ``prefix`` + ``span_id``, or ``prefix`` + current epoch millis when no span id."""
        suffix = span_id if span_id is not None else str(int(time.time() * 1000))
        return prefix + suffix

    @staticmethod
    def _split_scorer(scorer: BaseScorer) -> Tuple[list[Any], list[Any]]:
        """Return ``(judgment_scorers, custom_scorers)`` payload lists for ``scorer``.

        A ``CustomScorer`` contributes ``to_dict()`` to the custom list; any
        other scorer contributes ``get_scorer_config()`` to the judgment list.
        """
        if isinstance(scorer, CustomScorer):
            return [], [scorer.to_dict()]
        return [scorer.get_scorer_config()], []

    def _create_evaluation_run(
        self,
        scorer: BaseScorer,
        example: Example,
        trace_id: str,
        span_id: str,
    ) -> ExampleEvaluationRun:
        """Build the ``ExampleEvaluationRun`` payload for one scorer and example."""
        run_id = self._generate_run_id("async_evaluate_", span_id)
        judgment_scorers, custom_scorers = self._split_scorer(scorer)

        return ExampleEvaluationRun(
            project_name=self.project_name,
            eval_name=run_id,
            trace_id=trace_id,
            trace_span_id=span_id,
            examples=[example.to_dict()],
            judgment_scorers=judgment_scorers,
            custom_scorers=custom_scorers,
            created_at=datetime.datetime.now(datetime.timezone.utc).isoformat(),
        )

    def _create_trace_evaluation_run(
        self,
        scorer: BaseScorer,
        trace_id: str,
        span_id: str,
    ) -> TraceEvaluationRun:
        """Build the ``TraceEvaluationRun`` payload for one scorer and span."""
        eval_name = self._generate_run_id("async_trace_evaluate_", span_id)
        judgment_scorers, custom_scorers = self._split_scorer(scorer)

        return TraceEvaluationRun(
            project_name=self.project_name,
            eval_name=eval_name,
            trace_and_span_ids=[[trace_id, span_id]],
            judgment_scorers=judgment_scorers,
            custom_scorers=custom_scorers,
            is_offline=False,
            is_bucket_run=False,
            created_at=datetime.datetime.now(datetime.timezone.utc).isoformat(),
        )

    def _enqueue_evaluation(self, evaluation_run: ExampleEvaluationRun) -> None:
        """Send ``evaluation_run`` to the example eval queue; failures are logged."""
        try:
            self.api_client.add_to_run_eval_queue_examples(evaluation_run)
        except Exception as e:
            judgeval_logger.error(f"Failed to enqueue evaluation run: {e}")

    def _get_sampled_span_context(self) -> Optional[SpanContext]:
        """Return the current span's context if it is valid and sampled, else ``None``."""
        current_span = self._get_sampled_span()
        if current_span is None:
            return None
        return current_span.get_span_context()

    def _get_sampled_span(self) -> Optional[Span]:
        """Return the current span if its context is valid and sampled, else ``None``."""
        current_span = trace.get_current_span()
        if current_span is None:
            return None
        span_context = current_span.get_span_context()
        if not span_context.is_valid or not span_context.trace_flags.sampled:
            return None
        return current_span

    def _log_evaluation_info(
        self, method: str, trace_id: str, span_id: str, scorer_name: str
    ) -> None:
        judgeval_logger.info(
            f"{method}: project={self.project_name}, traceId={trace_id}, spanId={span_id}, scorer={scorer_name}"
        )

    @staticmethod
    def _is_valid_key(key: str) -> bool:
        """Return True for a non-``None``, non-empty key."""
        return key is not None and len(key) > 0

    def _record_observed_input(
        self,
        span: Span,
        span_type: Optional[str],
        func: Callable[..., Any],
        args: Tuple[Any, ...],
        kwargs: Dict[str, Any],
    ) -> None:
        """Annotate an observed span before the wrapped call runs.

        Sets the span kind and serialized inputs, then asks the span
        processor to emit a partial span. Every failure is logged and
        swallowed so instrumentation can never stop the wrapped function
        from executing.
        """
        try:
            if span_type:
                span.set_attribute(AttributeKeys.JUDGMENT_SPAN_KIND, span_type)
            input_data = _format_inputs(func, args, kwargs)
            span.set_attribute(
                AttributeKeys.JUDGMENT_INPUT, self.serializer(input_data)
            )
        except Exception as e:
            judgeval_logger.error(f"Failed to record span input: {e}")

        try:
            self.get_span_processor().emit_partial()
        except Exception as e:
            judgeval_logger.error(f"Failed to emit partial span: {e}")

    def _record_observed_output(self, span: Span, result: Any) -> None:
        """Store the serialized ``result`` on ``span``; failures are logged only.

        A serializer error must not replace the wrapped function's
        successful return value with an exception.
        """
        try:
            span.set_attribute(
                AttributeKeys.JUDGMENT_OUTPUT, self.serializer(result)
            )
        except Exception as e:
            judgeval_logger.error(f"Failed to record span output: {e}")

    @overload
    def observe(
        self,
        func: C,
        span_type: Optional[str] = "span",
        span_name: Optional[str] = None,
    ) -> C: ...

    @overload
    def observe(
        self,
        func: None = None,
        span_type: Optional[str] = "span",
        span_name: Optional[str] = None,
    ) -> Callable[[C], C]: ...

    def observe(
        self,
        func: Optional[C] = None,
        span_type: Optional[str] = "span",
        span_name: Optional[str] = None,
    ) -> C | Callable[[C], C]:
        """Decorate ``func`` so each call runs inside its own span.

        Usable bare (``@tracer.observe``) or with arguments
        (``@tracer.observe(span_type=...)``). The span records the call's
        inputs and output; an exception from ``func`` is recorded on the
        span, the status is set to ERROR, and the exception is re-raised.
        Coroutine functions get an async wrapper.

        Args:
            func: Callable to wrap, or ``None`` when used with arguments.
            span_type: Value for the span-kind attribute; falsy skips it.
            span_name: Span name; defaults to the callable's ``__name__``.
        """
        if func is None:
            return lambda f: self.observe(f, span_type, span_name)  # type: ignore[return-value]

        tracer = self.get_tracer()
        # Callables such as functools.partial objects have no __name__.
        name = span_name or getattr(func, "__name__", type(func).__name__)

        if inspect.iscoroutinefunction(func):

            @functools.wraps(func)
            async def async_wrapper(*args: Any, **kwargs: Any) -> Any:
                with tracer.start_as_current_span(name) as span:
                    self._record_observed_input(span, span_type, func, args, kwargs)
                    try:
                        result = await func(*args, **kwargs)
                    except Exception as e:
                        span.record_exception(e)
                        span.set_status(Status(StatusCode.ERROR, str(e)))
                        raise
                    self._record_observed_output(span, result)
                    return result

            return async_wrapper  # type: ignore[return-value]

        @functools.wraps(func)
        def sync_wrapper(*args: Any, **kwargs: Any) -> Any:
            with tracer.start_as_current_span(name) as span:
                self._record_observed_input(span, span_type, func, args, kwargs)
                try:
                    result = func(*args, **kwargs)
                except Exception as e:
                    span.record_exception(e)
                    span.set_status(Status(StatusCode.ERROR, str(e)))
                    raise
                self._record_observed_output(span, result)
                return result

        return sync_wrapper  # type: ignore[return-value]

    @staticmethod
    def _build_agent_context(
        class_name: Optional[str],
        identifier: Optional[str],
        args: Tuple[Any, ...],
    ) -> Any:
        """Build the context for one agent call.

        Generates a fresh agent id, carries the currently active agent id
        (if any) over as the parent id, and adds the class name and the
        instance name read from ``getattr(args[0], identifier)`` when
        available. Returns the new context without attaching it.
        """
        parent_agent_id = get_value(AGENT_ID_KEY)
        ctx = set_value(AGENT_ID_KEY, str(uuid4()))
        if parent_agent_id:
            ctx = set_value(PARENT_AGENT_ID_KEY, parent_agent_id, context=ctx)
        if class_name:
            ctx = set_value(AGENT_CLASS_NAME_KEY, class_name, context=ctx)
        if identifier and args:
            instance = args[0]
            if hasattr(instance, identifier):
                instance_name = str(getattr(instance, identifier))
                ctx = set_value(AGENT_INSTANCE_NAME_KEY, instance_name, context=ctx)
        return ctx

    @overload
    def agent(self, func: C, /, *, identifier: Optional[str] = None) -> C: ...

    @overload
    def agent(
        self, func: None = None, /, *, identifier: Optional[str] = None
    ) -> Callable[[C], C]: ...

    def agent(
        self, func: Optional[C] = None, /, *, identifier: Optional[str] = None
    ) -> C | Callable[[C], C]:
        """Decorate ``func`` so each call runs under a fresh agent context.

        The context is attached for the duration of the call and always
        detached afterwards. Coroutine functions get an async wrapper.

        Args:
            func: Callable to wrap, or ``None`` when used with arguments.
            identifier: Name of an attribute on the first positional
                argument whose value is recorded as the agent instance name.
        """
        if func is None:
            return lambda f: self.agent(f, identifier=identifier)  # type: ignore[return-value]

        # The enclosing class is the second-to-last __qualname__ component.
        # For a function nested directly in another function that component
        # is the "<locals>" marker, which is not a class name.
        qualname = getattr(func, "__qualname__", "")
        owner = qualname.rpartition(".")[0].rpartition(".")[2]
        class_name = owner if owner and owner != "<locals>" else None

        if inspect.iscoroutinefunction(func):

            @functools.wraps(func)
            async def async_wrapper(*args: Any, **kwargs: Any) -> Any:
                token = attach(
                    BaseTracer._build_agent_context(class_name, identifier, args)
                )
                try:
                    return await func(*args, **kwargs)
                finally:
                    detach(token)

            return async_wrapper  # type: ignore[return-value]

        @functools.wraps(func)
        def sync_wrapper(*args: Any, **kwargs: Any) -> Any:
            token = attach(
                BaseTracer._build_agent_context(class_name, identifier, args)
            )
            try:
                return func(*args, **kwargs)
            finally:
                detach(token)

        return sync_wrapper  # type: ignore[return-value]

    def wrap(self, client: ApiClient) -> ApiClient:
        """Return ``client`` instrumented via ``wrap_provider`` for this tracer."""
        return wrap_provider(self, client)
497
+
498
+
499
+ def _format_inputs(
500
+ f: Callable[..., Any], args: Tuple[Any, ...], kwargs: Dict[str, Any]
501
+ ) -> Dict[str, Any]:
502
+ try:
503
+ params = list(inspect.signature(f).parameters.values())
504
+ inputs: Dict[str, Any] = {}
505
+ arg_i = 0
506
+ for param in params:
507
+ if param.kind == inspect.Parameter.POSITIONAL_OR_KEYWORD:
508
+ if arg_i < len(args):
509
+ inputs[param.name] = args[arg_i]
510
+ arg_i += 1
511
+ elif param.name in kwargs:
512
+ inputs[param.name] = kwargs[param.name]
513
+ elif param.kind == inspect.Parameter.VAR_POSITIONAL:
514
+ inputs[param.name] = args[arg_i:]
515
+ arg_i = len(args)
516
+ elif param.kind == inspect.Parameter.VAR_KEYWORD:
517
+ inputs[param.name] = kwargs
518
+ return inputs
519
+ except Exception:
520
+ return {}
@@ -0,0 +1,14 @@
1
+ from __future__ import annotations
2
+
3
+ from judgeval.v1.tracer.exporters.judgment_span_exporter import JudgmentSpanExporter
4
+ from judgeval.v1.tracer.exporters.noop_span_exporter import NoOpSpanExporter
5
+ from judgeval.v1.tracer.exporters.span_store import ABCSpanStore, SpanStore
6
+ from judgeval.v1.tracer.exporters.in_memory_span_exporter import InMemorySpanExporter
7
+
8
+ __all__ = [
9
+ "JudgmentSpanExporter",
10
+ "NoOpSpanExporter",
11
+ "ABCSpanStore",
12
+ "SpanStore",
13
+ "InMemorySpanExporter",
14
+ ]
@@ -0,0 +1,25 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Sequence
4
+
5
+ from opentelemetry.sdk.trace import ReadableSpan
6
+ from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult
7
+
8
+ from judgeval.v1.tracer.exporters.span_store import ABCSpanStore
9
+
10
+
11
class InMemorySpanExporter(SpanExporter):
    """Span exporter that forwards every exported span to a span store."""

    __slots__ = ("_store",)

    def __init__(self, store: ABCSpanStore) -> None:
        """Keep a reference to the ``store`` that will receive exported spans."""
        self._store = store

    def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
        """Add all ``spans`` to the store in one call and report success."""
        target = self._store
        target.add(*spans)
        return SpanExportResult.SUCCESS

    def shutdown(self) -> None:
        """Do nothing; this exporter performs no cleanup on shutdown."""
        return None

    def force_flush(self, timeout_millis: int = 30000) -> bool:
        """Return ``True`` immediately; ``timeout_millis`` is unused."""
        return True
@@ -0,0 +1,42 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Sequence
4
+
5
+ from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
6
+ from opentelemetry.sdk.trace import ReadableSpan
7
+ from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult
8
+
9
+ from judgeval.logger import judgeval_logger
10
+
11
+
12
class JudgmentSpanExporter(SpanExporter):
    """Span exporter that delegates to an OTLP/HTTP exporter with Judgment headers."""

    __slots__ = ("_delegate",)

    def __init__(
        self,
        endpoint: str,
        api_key: str,
        organization_id: str,
        project_id: str,
    ):
        """Create the underlying ``OTLPSpanExporter``.

        Args:
            endpoint: URL the spans are sent to.
            api_key: Sent as a ``Bearer`` token in the ``Authorization`` header.
            organization_id: Sent in the ``X-Organization-Id`` header.
            project_id: Sent in the ``X-Project-Id`` header.

        Raises:
            ValueError: If ``project_id`` is empty or only whitespace.
        """
        if not project_id or not project_id.strip():
            raise ValueError("project_id is required for JudgmentSpanExporter")

        self._delegate = OTLPSpanExporter(
            endpoint=endpoint,
            headers={
                "Authorization": f"Bearer {api_key}",
                "X-Organization-Id": organization_id,
                "X-Project-Id": project_id,
            },
        )

    def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
        """Export ``spans`` through the delegate and return its result.

        The outcome is logged only after the delegate has run, so the
        "Exported" message is never emitted for a failed export.
        """
        result = self._delegate.export(spans)
        if result == SpanExportResult.SUCCESS:
            judgeval_logger.info(f"Exported {len(spans)} spans")
        else:
            judgeval_logger.error(f"Failed to export {len(spans)} spans")
        return result

    def shutdown(self) -> None:
        """Shut down the delegate exporter."""
        self._delegate.shutdown()

    def force_flush(self, timeout_millis: int = 30000) -> bool:
        """Forward the flush request, including ``timeout_millis``, to the delegate."""
        return self._delegate.force_flush(timeout_millis)
@@ -0,0 +1,19 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Sequence
4
+
5
+ from opentelemetry.sdk.trace import ReadableSpan
6
+ from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult
7
+
8
+
9
class NoOpSpanExporter(SpanExporter):
    """Span exporter that discards every span it is given."""

    __slots__ = ()

    def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
        """Ignore ``spans`` and report success."""
        return SpanExportResult.SUCCESS

    def shutdown(self) -> None:
        """Do nothing; this exporter holds no resources."""
        return None

    def force_flush(self, timeout_millis: int = 30000) -> bool:
        """Return ``True`` immediately; ``timeout_millis`` is unused."""
        return True
@@ -0,0 +1,50 @@
1
+ from __future__ import annotations
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import Dict, List
5
+
6
+ from opentelemetry.sdk.trace import ReadableSpan
7
+
8
+
9
class ABCSpanStore(ABC):
    """Abstract interface for a store of finished spans keyed by trace id."""

    @abstractmethod
    def add(self, *spans: ReadableSpan) -> None:
        """Add the given spans to the store."""

    @abstractmethod
    def get_all(self) -> List[ReadableSpan]:
        """Return every stored span."""

    @abstractmethod
    def get_by_trace_id(self, trace_id: str) -> List[ReadableSpan]:
        """Return the spans stored for ``trace_id``."""

    @abstractmethod
    def clear_trace(self, trace_id: str) -> None:
        """Remove the spans stored for ``trace_id``."""
21
+
22
+
23
class SpanStore(ABCSpanStore):
    """In-memory span store that groups spans by hex trace id."""

    __slots__ = ("_spans_by_trace",)

    def __init__(self) -> None:
        """Start with an empty mapping of trace id to spans."""
        self._spans_by_trace: Dict[str, List[ReadableSpan]] = {}

    def add(self, *spans: ReadableSpan) -> None:
        """Store each span under its trace id formatted as 32 hex digits.

        Spans whose ``get_span_context()`` returns ``None`` are skipped.
        """
        for span in spans:
            context = span.get_span_context()
            if context is None:
                continue
            trace_id = format(context.trace_id, "032x")
            self._spans_by_trace.setdefault(trace_id, []).append(span)

    def get_all(self) -> List[ReadableSpan]:
        """Return a new list containing every stored span, grouped by trace."""
        return [
            span for spans in self._spans_by_trace.values() for span in spans
        ]

    def get_by_trace_id(self, trace_id: str) -> List[ReadableSpan]:
        """Return a new list of the spans stored for ``trace_id`` (empty if none).

        A copy is returned, as in ``get_all``, so callers cannot mutate the
        store's internal list.
        """
        return list(self._spans_by_trace.get(trace_id, ()))

    def clear_trace(self, trace_id: str) -> None:
        """Drop all spans stored for ``trace_id``; unknown ids are ignored."""
        self._spans_by_trace.pop(trace_id, None)