judgeval 0.12.0__py3-none-any.whl → 0.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +2 -2
- judgeval/api/api_types.py +81 -12
- judgeval/cli.py +2 -1
- judgeval/constants.py +0 -6
- judgeval/data/evaluation_run.py +2 -5
- judgeval/data/judgment_types.py +97 -12
- judgeval/data/trace.py +108 -1
- judgeval/dataset/__init__.py +72 -23
- judgeval/env.py +5 -20
- judgeval/integrations/langgraph/__init__.py +9 -785
- judgeval/scorers/api_scorer.py +7 -12
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -8
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -8
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -12
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +22 -33
- judgeval/scorers/score.py +1 -1
- judgeval/scorers/utils.py +1 -4
- judgeval/tracer/__init__.py +175 -156
- judgeval/tracer/exporters/__init__.py +4 -1
- judgeval/tracer/keys.py +15 -25
- judgeval/tracer/llm/__init__.py +0 -1
- judgeval/tracer/llm/anthropic/__init__.py +20 -0
- judgeval/tracer/llm/google/__init__.py +21 -0
- judgeval/tracer/llm/groq/__init__.py +20 -0
- judgeval/tracer/llm/openai/__init__.py +32 -0
- judgeval/tracer/llm/providers.py +28 -79
- judgeval/tracer/llm/together/__init__.py +20 -0
- judgeval/tracer/managers.py +23 -48
- judgeval/tracer/processors/__init__.py +36 -75
- judgeval/tracer/utils.py +1 -2
- judgeval/utils/file_utils.py +0 -2
- judgeval/utils/meta.py +18 -5
- judgeval/utils/testing.py +0 -14
- judgeval/utils/version_check.py +2 -0
- judgeval/version.py +1 -1
- {judgeval-0.12.0.dist-info → judgeval-0.13.0.dist-info}/METADATA +1 -7
- {judgeval-0.12.0.dist-info → judgeval-0.13.0.dist-info}/RECORD +40 -35
- {judgeval-0.12.0.dist-info → judgeval-0.13.0.dist-info}/WHEEL +0 -0
- {judgeval-0.12.0.dist-info → judgeval-0.13.0.dist-info}/entry_points.txt +0 -0
- {judgeval-0.12.0.dist-info → judgeval-0.13.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/tracer/__init__.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
from __future__ import annotations
|
2
|
-
import os
|
3
2
|
from contextvars import ContextVar
|
4
3
|
import atexit
|
5
4
|
import functools
|
@@ -24,15 +23,17 @@ from typing import (
|
|
24
23
|
from functools import partial
|
25
24
|
from warnings import warn
|
26
25
|
|
27
|
-
from opentelemetry.sdk.trace import
|
26
|
+
from opentelemetry.sdk.trace import TracerProvider
|
28
27
|
from opentelemetry.sdk.resources import Resource
|
29
28
|
from opentelemetry.trace import (
|
30
29
|
Status,
|
31
30
|
StatusCode,
|
32
|
-
TracerProvider as ABCTracerProvider,
|
33
|
-
NoOpTracerProvider,
|
34
31
|
Tracer as ABCTracer,
|
32
|
+
Span,
|
35
33
|
get_current_span,
|
34
|
+
get_tracer_provider,
|
35
|
+
set_tracer_provider,
|
36
|
+
INVALID_SPAN_CONTEXT,
|
36
37
|
)
|
37
38
|
|
38
39
|
from judgeval.data.evaluation_run import ExampleEvaluationRun, TraceEvaluationRun
|
@@ -41,6 +42,8 @@ from judgeval.env import (
|
|
41
42
|
JUDGMENT_API_KEY,
|
42
43
|
JUDGMENT_DEFAULT_GPT_MODEL,
|
43
44
|
JUDGMENT_ORG_ID,
|
45
|
+
JUDGMENT_ENABLE_MONITORING,
|
46
|
+
JUDGMENT_ENABLE_EVALUATIONS,
|
44
47
|
)
|
45
48
|
from judgeval.logger import judgeval_logger
|
46
49
|
from judgeval.scorers.api_scorer import TraceAPIScorerConfig, ExampleAPIScorerConfig
|
@@ -52,7 +55,10 @@ from judgeval.tracer.managers import (
|
|
52
55
|
sync_agent_context,
|
53
56
|
async_agent_context,
|
54
57
|
)
|
58
|
+
from judgeval.utils.decorators import dont_throw
|
59
|
+
from judgeval.utils.guards import expect_api_key, expect_organization_id
|
55
60
|
from judgeval.utils.serialize import safe_serialize
|
61
|
+
from judgeval.utils.meta import SingletonMeta
|
56
62
|
from judgeval.version import get_version
|
57
63
|
from judgeval.warnings import JudgmentWarning
|
58
64
|
|
@@ -64,7 +70,6 @@ from judgeval.tracer.local_eval_queue import LocalEvaluationQueue
|
|
64
70
|
from judgeval.tracer.processors import (
|
65
71
|
JudgmentSpanProcessor,
|
66
72
|
NoOpJudgmentSpanProcessor,
|
67
|
-
NoOpSpanProcessor,
|
68
73
|
)
|
69
74
|
from judgeval.tracer.utils import set_span_attribute, TraceScorerConfig
|
70
75
|
|
@@ -85,46 +90,34 @@ class AgentContext(TypedDict):
|
|
85
90
|
parent_agent_id: str | None
|
86
91
|
|
87
92
|
|
88
|
-
class Tracer:
|
89
|
-
_active_tracers: List[Tracer] = []
|
90
|
-
|
93
|
+
class Tracer(metaclass=SingletonMeta):
|
91
94
|
__slots__ = (
|
92
95
|
"api_key",
|
93
96
|
"organization_id",
|
94
97
|
"project_name",
|
95
|
-
"api_url",
|
96
|
-
"deep_tracing",
|
97
98
|
"enable_monitoring",
|
98
99
|
"enable_evaluation",
|
100
|
+
"resource_attributes",
|
99
101
|
"api_client",
|
100
102
|
"local_eval_queue",
|
101
|
-
# Otel
|
102
103
|
"judgment_processor",
|
103
|
-
"processors",
|
104
|
-
"provider",
|
105
104
|
"tracer",
|
106
|
-
# Agent
|
107
105
|
"agent_context",
|
108
|
-
"
|
106
|
+
"_initialized",
|
109
107
|
)
|
110
108
|
|
111
109
|
api_key: str
|
112
110
|
organization_id: str
|
113
111
|
project_name: str
|
114
|
-
api_url: str
|
115
|
-
deep_tracing: bool
|
116
112
|
enable_monitoring: bool
|
117
113
|
enable_evaluation: bool
|
114
|
+
resource_attributes: Optional[Dict[str, Any]]
|
118
115
|
api_client: JudgmentSyncClient
|
119
116
|
local_eval_queue: LocalEvaluationQueue
|
120
|
-
|
121
117
|
judgment_processor: JudgmentSpanProcessor
|
122
|
-
processors: List[SpanProcessor]
|
123
|
-
provider: ABCTracerProvider
|
124
118
|
tracer: ABCTracer
|
125
|
-
|
126
119
|
agent_context: ContextVar[Optional[AgentContext]]
|
127
|
-
|
120
|
+
_initialized: bool
|
128
121
|
|
129
122
|
def __init__(
|
130
123
|
self,
|
@@ -133,82 +126,125 @@ class Tracer:
|
|
133
126
|
project_name: str,
|
134
127
|
api_key: Optional[str] = None,
|
135
128
|
organization_id: Optional[str] = None,
|
136
|
-
|
137
|
-
|
138
|
-
"JUDGMENT_ENABLE_MONITORING", "true"
|
139
|
-
).lower()
|
140
|
-
!= "false",
|
141
|
-
enable_evaluation: bool = os.getenv(
|
142
|
-
"JUDGMENT_ENABLE_EVALUATIONS", "true"
|
143
|
-
).lower()
|
144
|
-
!= "false",
|
145
|
-
processors: List[SpanProcessor] = [],
|
129
|
+
enable_monitoring: bool = JUDGMENT_ENABLE_MONITORING.lower() == "true",
|
130
|
+
enable_evaluation: bool = JUDGMENT_ENABLE_EVALUATIONS.lower() == "true",
|
146
131
|
resource_attributes: Optional[Dict[str, Any]] = None,
|
132
|
+
initialize: bool = True,
|
147
133
|
):
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
134
|
+
if not hasattr(self, "_initialized"):
|
135
|
+
self._initialized = False
|
136
|
+
self.agent_context = ContextVar("current_agent_context", default=None)
|
137
|
+
|
138
|
+
self.project_name = project_name
|
139
|
+
self.api_key = expect_api_key(api_key or JUDGMENT_API_KEY)
|
140
|
+
self.organization_id = expect_organization_id(
|
141
|
+
organization_id or JUDGMENT_ORG_ID
|
154
142
|
)
|
143
|
+
self.enable_monitoring = enable_monitoring
|
144
|
+
self.enable_evaluation = enable_evaluation
|
145
|
+
self.resource_attributes = resource_attributes
|
155
146
|
|
156
|
-
|
157
|
-
|
158
|
-
|
147
|
+
self.api_client = JudgmentSyncClient(
|
148
|
+
api_key=self.api_key,
|
149
|
+
organization_id=self.organization_id,
|
159
150
|
)
|
151
|
+
self.local_eval_queue = LocalEvaluationQueue()
|
160
152
|
|
161
|
-
|
162
|
-
|
163
|
-
self.project_name = project_name
|
164
|
-
self.api_url = url_for("/otel/v1/traces")
|
153
|
+
if initialize:
|
154
|
+
self.initialize()
|
165
155
|
|
166
|
-
|
167
|
-
self.
|
168
|
-
|
156
|
+
def initialize(self) -> Tracer:
|
157
|
+
if self._initialized:
|
158
|
+
return self
|
169
159
|
|
170
160
|
self.judgment_processor = NoOpJudgmentSpanProcessor()
|
171
|
-
self.processors = processors
|
172
|
-
self.provider = NoOpTracerProvider()
|
173
|
-
|
174
|
-
self.agent_context = ContextVar("current_agent_context", default=None)
|
175
|
-
self.cost_context = ContextVar("current_cost_context", default=None)
|
176
|
-
|
177
161
|
if self.enable_monitoring:
|
178
|
-
|
179
|
-
self,
|
180
|
-
self.project_name,
|
181
|
-
self.api_key,
|
182
|
-
self.organization_id,
|
183
|
-
max_queue_size=2**18,
|
184
|
-
export_timeout_millis=30000,
|
185
|
-
resource_attributes=resource_attributes,
|
162
|
+
project_id = Tracer._resolve_project_id(
|
163
|
+
self.project_name, self.api_key, self.organization_id
|
186
164
|
)
|
187
165
|
|
188
|
-
|
189
|
-
|
166
|
+
if project_id:
|
167
|
+
self.judgment_processor = self.get_processor(
|
168
|
+
tracer=self,
|
169
|
+
project_name=self.project_name,
|
170
|
+
project_id=project_id,
|
171
|
+
api_key=self.api_key,
|
172
|
+
organization_id=self.organization_id,
|
173
|
+
resource_attributes=self.resource_attributes,
|
174
|
+
)
|
190
175
|
|
191
|
-
|
192
|
-
|
193
|
-
|
176
|
+
resource = Resource.create(self.judgment_processor.resource_attributes)
|
177
|
+
provider = TracerProvider(resource=resource)
|
178
|
+
provider.add_span_processor(self.judgment_processor)
|
179
|
+
set_tracer_provider(provider)
|
180
|
+
else:
|
181
|
+
judgeval_logger.error(
|
182
|
+
f"Failed to resolve project {self.project_name}, please create it first at https://app.judgmentlabs.ai/org/{self.organization_id}/projects. Skipping Judgment export."
|
183
|
+
)
|
194
184
|
|
195
|
-
self.tracer =
|
185
|
+
self.tracer = get_tracer_provider().get_tracer(
|
196
186
|
JUDGEVAL_TRACER_INSTRUMENTING_MODULE_NAME,
|
197
187
|
get_version(),
|
198
188
|
)
|
199
|
-
self.api_client = JudgmentSyncClient(
|
200
|
-
api_key=self.api_key,
|
201
|
-
organization_id=self.organization_id,
|
202
|
-
)
|
203
|
-
self.local_eval_queue = LocalEvaluationQueue()
|
204
189
|
|
205
190
|
if self.enable_evaluation and self.enable_monitoring:
|
206
191
|
self.local_eval_queue.start_workers()
|
207
192
|
|
208
|
-
|
209
|
-
|
210
|
-
# Register atexit handler to flush on program exit
|
193
|
+
self._initialized = True
|
211
194
|
atexit.register(self._atexit_flush)
|
195
|
+
return self
|
196
|
+
|
197
|
+
@staticmethod
|
198
|
+
def get_exporter(
|
199
|
+
project_id: str,
|
200
|
+
api_key: Optional[str] = None,
|
201
|
+
organization_id: Optional[str] = None,
|
202
|
+
):
|
203
|
+
from judgeval.tracer.exporters import JudgmentSpanExporter
|
204
|
+
|
205
|
+
return JudgmentSpanExporter(
|
206
|
+
endpoint=url_for("/otel/v1/traces"),
|
207
|
+
api_key=api_key or JUDGMENT_API_KEY,
|
208
|
+
organization_id=organization_id or JUDGMENT_ORG_ID,
|
209
|
+
project_id=project_id,
|
210
|
+
)
|
211
|
+
|
212
|
+
@staticmethod
|
213
|
+
def get_processor(
|
214
|
+
tracer: Tracer,
|
215
|
+
project_name: str,
|
216
|
+
project_id: str,
|
217
|
+
api_key: Optional[str] = None,
|
218
|
+
organization_id: Optional[str] = None,
|
219
|
+
max_queue_size: int = 2**18,
|
220
|
+
export_timeout_millis: int = 30000,
|
221
|
+
resource_attributes: Optional[Dict[str, Any]] = None,
|
222
|
+
) -> JudgmentSpanProcessor:
|
223
|
+
"""Create a JudgmentSpanProcessor using the correct constructor."""
|
224
|
+
return JudgmentSpanProcessor(
|
225
|
+
tracer,
|
226
|
+
project_name,
|
227
|
+
project_id,
|
228
|
+
api_key or JUDGMENT_API_KEY,
|
229
|
+
organization_id or JUDGMENT_ORG_ID,
|
230
|
+
max_queue_size=max_queue_size,
|
231
|
+
export_timeout_millis=export_timeout_millis,
|
232
|
+
resource_attributes=resource_attributes,
|
233
|
+
)
|
234
|
+
|
235
|
+
@dont_throw
|
236
|
+
@functools.lru_cache(maxsize=64)
|
237
|
+
@staticmethod
|
238
|
+
def _resolve_project_id(
|
239
|
+
project_name: str, api_key: str, organization_id: str
|
240
|
+
) -> str | None:
|
241
|
+
"""Resolve project_id from project_name using the API."""
|
242
|
+
client = JudgmentSyncClient(
|
243
|
+
api_key=api_key,
|
244
|
+
organization_id=organization_id,
|
245
|
+
)
|
246
|
+
response = client.projects_resolve({"project_name": project_name})
|
247
|
+
return response["project_id"]
|
212
248
|
|
213
249
|
def get_current_span(self):
|
214
250
|
return get_current_span()
|
@@ -219,40 +255,11 @@ class Tracer:
|
|
219
255
|
def get_current_agent_context(self):
|
220
256
|
return self.agent_context
|
221
257
|
|
222
|
-
def get_current_cost_context(self):
|
223
|
-
return self.cost_context
|
224
|
-
|
225
|
-
def get_processor(self):
|
226
|
-
"""Get the judgment span processor instance.
|
227
|
-
|
228
|
-
Returns:
|
229
|
-
The JudgmentSpanProcessor or NoOpJudgmentSpanProcessor instance used by this tracer.
|
230
|
-
"""
|
231
|
-
return self.judgment_processor
|
232
|
-
|
233
258
|
def set_customer_id(self, customer_id: str) -> None:
|
234
259
|
span = self.get_current_span()
|
235
260
|
if span and span.is_recording():
|
236
261
|
set_span_attribute(span, AttributeKeys.JUDGMENT_CUSTOMER_ID, customer_id)
|
237
262
|
|
238
|
-
def add_cost_to_current_context(self, cost: Optional[float]) -> None:
|
239
|
-
"""Add cost to the current cost context and update span attribute."""
|
240
|
-
if cost is None:
|
241
|
-
return
|
242
|
-
current_cost_context = self.cost_context.get()
|
243
|
-
if current_cost_context is not None:
|
244
|
-
current_cumulative_cost = current_cost_context.get("cumulative_cost", 0.0)
|
245
|
-
new_cumulative_cost = float(current_cumulative_cost) + cost
|
246
|
-
current_cost_context["cumulative_cost"] = new_cumulative_cost
|
247
|
-
|
248
|
-
span = self.get_current_span()
|
249
|
-
if span and span.is_recording():
|
250
|
-
set_span_attribute(
|
251
|
-
span,
|
252
|
-
AttributeKeys.JUDGMENT_CUMULATIVE_LLM_COST,
|
253
|
-
new_cumulative_cost,
|
254
|
-
)
|
255
|
-
|
256
263
|
def add_agent_attributes_to_span(self, span):
|
257
264
|
"""Add agent ID, class name, and instance name to span if they exist in context"""
|
258
265
|
current_agent_context = self.agent_context.get()
|
@@ -353,6 +360,8 @@ class Tracer:
|
|
353
360
|
return
|
354
361
|
|
355
362
|
span_context = span.get_span_context()
|
363
|
+
if span_context == INVALID_SPAN_CONTEXT:
|
364
|
+
return
|
356
365
|
trace_id = format(span_context.trace_id, "032x")
|
357
366
|
span_id = format(span_context.span_id, "016x")
|
358
367
|
eval_run_name = f"async_trace_evaluate_{span_id}"
|
@@ -668,6 +677,8 @@ class Tracer:
|
|
668
677
|
/,
|
669
678
|
*,
|
670
679
|
span_type: str | None = None,
|
680
|
+
span_name: str | None = None,
|
681
|
+
attributes: Optional[Dict[str, Any]] = None,
|
671
682
|
scorer_config: TraceScorerConfig | None = None,
|
672
683
|
) -> C: ...
|
673
684
|
|
@@ -678,6 +689,8 @@ class Tracer:
|
|
678
689
|
/,
|
679
690
|
*,
|
680
691
|
span_type: str | None = None,
|
692
|
+
span_name: str | None = None,
|
693
|
+
attributes: Optional[Dict[str, Any]] = None,
|
681
694
|
scorer_config: TraceScorerConfig | None = None,
|
682
695
|
) -> Callable[[C], C]: ...
|
683
696
|
|
@@ -833,37 +846,33 @@ class Tracer:
|
|
833
846
|
timeout_millis: Maximum time to wait for flush completion in milliseconds
|
834
847
|
|
835
848
|
Returns:
|
836
|
-
True if
|
849
|
+
True if processor flushed successfully within timeout, False otherwise
|
837
850
|
"""
|
838
|
-
|
839
|
-
|
840
|
-
|
841
|
-
|
842
|
-
|
843
|
-
|
844
|
-
|
845
|
-
judgeval_logger.warning(f"Error flushing processor {processor}: {e}")
|
846
|
-
success = False
|
847
|
-
return success
|
848
|
-
|
849
|
-
def _atexit_flush(self) -> None:
|
851
|
+
try:
|
852
|
+
return self.judgment_processor.force_flush(timeout_millis)
|
853
|
+
except Exception as e:
|
854
|
+
judgeval_logger.warning(f"Error flushing processor: {e}")
|
855
|
+
return False
|
856
|
+
|
857
|
+
def _atexit_flush(self, timeout_millis: int = 30000) -> None:
|
850
858
|
"""Internal method called on program exit to flush remaining spans.
|
851
859
|
|
852
860
|
This blocks until all spans are flushed or timeout is reached to ensure
|
853
861
|
proper cleanup before program termination.
|
854
862
|
"""
|
855
863
|
try:
|
856
|
-
self.force_flush(timeout_millis=
|
864
|
+
self.force_flush(timeout_millis=timeout_millis)
|
857
865
|
except Exception as e:
|
858
866
|
judgeval_logger.warning(f"Error during atexit flush: {e}")
|
859
867
|
|
868
|
+
@dont_throw
|
860
869
|
def async_evaluate(
|
861
870
|
self,
|
862
871
|
/,
|
863
872
|
*,
|
864
873
|
scorer: Union[ExampleAPIScorerConfig, ExampleScorer],
|
865
874
|
example: Example,
|
866
|
-
model: str =
|
875
|
+
model: Optional[str] = None,
|
867
876
|
sampling_rate: float = 1.0,
|
868
877
|
):
|
869
878
|
if not self.enable_evaluation or not self.enable_monitoring:
|
@@ -884,6 +893,12 @@ class Tracer:
|
|
884
893
|
)
|
885
894
|
return
|
886
895
|
|
896
|
+
if model is None:
|
897
|
+
if scorer.model is None:
|
898
|
+
model = JUDGMENT_DEFAULT_GPT_MODEL
|
899
|
+
else:
|
900
|
+
model = scorer.model
|
901
|
+
|
887
902
|
if sampling_rate < 0 or sampling_rate > 1:
|
888
903
|
judgeval_logger.error(
|
889
904
|
"Sampling rate must be between 0 and 1, got %s, skipping evaluation."
|
@@ -899,37 +914,32 @@ class Tracer:
|
|
899
914
|
return
|
900
915
|
|
901
916
|
span_context = self.get_current_span().get_span_context()
|
917
|
+
if span_context == INVALID_SPAN_CONTEXT:
|
918
|
+
judgeval_logger.warning(
|
919
|
+
"No span context was found for async_evaluate, skipping evaluation. Please make sure to use the @observe decorator on the function you are evaluating."
|
920
|
+
)
|
921
|
+
return
|
922
|
+
|
902
923
|
trace_id = format(span_context.trace_id, "032x")
|
903
924
|
span_id = format(span_context.span_id, "016x")
|
904
925
|
hosted_scoring = isinstance(scorer, ExampleAPIScorerConfig) or (
|
905
926
|
isinstance(scorer, ExampleScorer) and scorer.server_hosted
|
906
927
|
)
|
907
|
-
|
928
|
+
eval_run = ExampleEvaluationRun(
|
929
|
+
project_name=self.project_name,
|
930
|
+
# note this name doesnt matter because we don't save the experiment only the example and scorer_data
|
931
|
+
eval_name=f"async_evaluate_{span_id}",
|
932
|
+
examples=[example],
|
933
|
+
scorers=[scorer],
|
934
|
+
model=model,
|
935
|
+
trace_span_id=span_id,
|
936
|
+
trace_id=trace_id,
|
937
|
+
)
|
908
938
|
if hosted_scoring:
|
909
|
-
eval_run = ExampleEvaluationRun(
|
910
|
-
project_name=self.project_name,
|
911
|
-
eval_name=eval_run_name,
|
912
|
-
examples=[example],
|
913
|
-
scorers=[scorer],
|
914
|
-
model=model,
|
915
|
-
trace_span_id=span_id,
|
916
|
-
trace_id=trace_id,
|
917
|
-
)
|
918
939
|
self.api_client.add_to_run_eval_queue_examples(
|
919
|
-
eval_run.model_dump(warnings=False)
|
920
|
-
) # type: ignore
|
921
|
-
else:
|
922
|
-
# Handle custom scorers using local evaluation queue
|
923
|
-
eval_run = ExampleEvaluationRun(
|
924
|
-
project_name=self.project_name,
|
925
|
-
eval_name=eval_run_name,
|
926
|
-
examples=[example],
|
927
|
-
scorers=[scorer],
|
928
|
-
model=model,
|
929
|
-
trace_span_id=span_id,
|
930
|
-
trace_id=trace_id,
|
940
|
+
eval_run.model_dump(warnings=False) # type: ignore
|
931
941
|
)
|
932
|
-
|
942
|
+
else:
|
933
943
|
# Enqueue the evaluation run to the local evaluation queue
|
934
944
|
self.local_eval_queue.enqueue(eval_run)
|
935
945
|
|
@@ -971,19 +981,32 @@ class Tracer:
|
|
971
981
|
|
972
982
|
|
973
983
|
def wrap(client: ApiClient) -> ApiClient:
|
974
|
-
|
984
|
+
try:
|
985
|
+
tracer = Tracer.get_instance()
|
986
|
+
if tracer is None or not isinstance(tracer, Tracer):
|
987
|
+
warn(
|
988
|
+
"No Tracer instance found, client will not be wrapped. "
|
989
|
+
"Create a Tracer instance first.",
|
990
|
+
JudgmentWarning,
|
991
|
+
stacklevel=2,
|
992
|
+
)
|
993
|
+
return client
|
994
|
+
if not tracer._initialized:
|
995
|
+
warn(
|
996
|
+
"Tracer not initialized, client will not be wrapped. "
|
997
|
+
"Call Tracer.initialize() first to setup the tracer.",
|
998
|
+
JudgmentWarning,
|
999
|
+
stacklevel=2,
|
1000
|
+
)
|
1001
|
+
return client
|
1002
|
+
return tracer.wrap(client)
|
1003
|
+
except Exception:
|
975
1004
|
warn(
|
976
|
-
"
|
977
|
-
"You can use the global `wrap` function after creating a tracer instance. "
|
978
|
-
"Or you can use the `wrap` method on the tracer instance to directly wrap the client. ",
|
1005
|
+
"Error accessing tracer singleton, client will not be wrapped.",
|
979
1006
|
JudgmentWarning,
|
980
1007
|
stacklevel=2,
|
981
1008
|
)
|
982
|
-
|
983
|
-
wrapped_client = client
|
984
|
-
for tracer in Tracer._active_tracers:
|
985
|
-
wrapped_client = tracer.wrap(wrapped_client)
|
986
|
-
return wrapped_client
|
1009
|
+
return client
|
987
1010
|
|
988
1011
|
|
989
1012
|
def format_inputs(
|
@@ -1010,11 +1033,7 @@ def format_inputs(
|
|
1010
1033
|
return {}
|
1011
1034
|
|
1012
1035
|
|
1013
|
-
# Export processor classes for direct access
|
1014
1036
|
__all__ = [
|
1015
1037
|
"Tracer",
|
1016
1038
|
"wrap",
|
1017
|
-
"JudgmentSpanProcessor",
|
1018
|
-
"NoOpJudgmentSpanProcessor",
|
1019
|
-
"NoOpSpanProcessor",
|
1020
1039
|
]
|
@@ -12,12 +12,15 @@ from judgeval.tracer.exporters.utils import deduplicate_spans
|
|
12
12
|
|
13
13
|
|
14
14
|
class JudgmentSpanExporter(OTLPSpanExporter):
|
15
|
-
def __init__(
|
15
|
+
def __init__(
|
16
|
+
self, endpoint: str, api_key: str, organization_id: str, project_id: str
|
17
|
+
):
|
16
18
|
super().__init__(
|
17
19
|
endpoint=endpoint,
|
18
20
|
headers={
|
19
21
|
"Authorization": f"Bearer {api_key}",
|
20
22
|
"X-Organization-Id": organization_id,
|
23
|
+
"X-Project-Id": project_id,
|
21
24
|
},
|
22
25
|
)
|
23
26
|
|
judgeval/tracer/keys.py
CHANGED
@@ -2,49 +2,40 @@
|
|
2
2
|
Identifiers used by Judgeval to store specific types of data in the spans.
|
3
3
|
"""
|
4
4
|
|
5
|
-
from opentelemetry.semconv.resource import ResourceAttributes
|
6
|
-
from opentelemetry.semconv._incubating.attributes import gen_ai_attributes
|
7
5
|
from enum import Enum
|
8
6
|
|
9
7
|
|
10
8
|
class AttributeKeys(str, Enum):
|
11
|
-
# General function tracing attributes (custom namespace)
|
12
9
|
JUDGMENT_SPAN_KIND = "judgment.span_kind"
|
13
10
|
JUDGMENT_INPUT = "judgment.input"
|
14
11
|
JUDGMENT_OUTPUT = "judgment.output"
|
15
12
|
JUDGMENT_OFFLINE_MODE = "judgment.offline_mode"
|
16
13
|
JUDGMENT_UPDATE_ID = "judgment.update_id"
|
17
14
|
|
18
|
-
# Custom tracking attributes
|
19
15
|
JUDGMENT_CUSTOMER_ID = "judgment.customer_id"
|
20
16
|
|
21
|
-
# Agent specific attributes (custom namespace)
|
22
17
|
JUDGMENT_AGENT_ID = "judgment.agent_id"
|
23
18
|
JUDGMENT_PARENT_AGENT_ID = "judgment.parent_agent_id"
|
24
19
|
JUDGMENT_AGENT_CLASS_NAME = "judgment.agent_class_name"
|
25
20
|
JUDGMENT_AGENT_INSTANCE_NAME = "judgment.agent_instance_name"
|
26
21
|
JUDGMENT_IS_AGENT_ENTRY_POINT = "judgment.is_agent_entry_point"
|
27
|
-
JUDGMENT_CUMULATIVE_LLM_COST = "judgment.cumulative_llm_cost"
|
28
22
|
JUDGMENT_STATE_BEFORE = "judgment.state_before"
|
29
23
|
JUDGMENT_STATE_AFTER = "judgment.state_after"
|
30
24
|
|
31
|
-
# Evaluation-specific attributes (custom namespace)
|
32
25
|
PENDING_TRACE_EVAL = "judgment.pending_trace_eval"
|
33
26
|
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
GEN_AI_RESPONSE_FINISH_REASONS = gen_ai_attributes.GEN_AI_RESPONSE_FINISH_REASONS
|
27
|
+
GEN_AI_PROMPT = "gen_ai.prompt"
|
28
|
+
GEN_AI_COMPLETION = "gen_ai.completion"
|
29
|
+
GEN_AI_REQUEST_MODEL = "gen_ai.request.model"
|
30
|
+
GEN_AI_RESPONSE_MODEL = "gen_ai.response.model"
|
31
|
+
GEN_AI_SYSTEM = "gen_ai.system"
|
32
|
+
GEN_AI_USAGE_INPUT_TOKENS = "gen_ai.usage.input_tokens"
|
33
|
+
GEN_AI_USAGE_OUTPUT_TOKENS = "gen_ai.usage.output_tokens"
|
34
|
+
GEN_AI_USAGE_COMPLETION_TOKENS = "gen_ai.usage.completion_tokens"
|
35
|
+
GEN_AI_REQUEST_TEMPERATURE = "gen_ai.request.temperature"
|
36
|
+
GEN_AI_REQUEST_MAX_TOKENS = "gen_ai.request.max_tokens"
|
37
|
+
GEN_AI_RESPONSE_FINISH_REASONS = "gen_ai.response.finish_reasons"
|
46
38
|
|
47
|
-
# GenAI-specific attributes (custom namespace)
|
48
39
|
GEN_AI_USAGE_TOTAL_COST = "gen_ai.usage.total_cost_usd"
|
49
40
|
|
50
41
|
|
@@ -54,14 +45,13 @@ class InternalAttributeKeys(str, Enum):
|
|
54
45
|
These are NOT exported and are used only for internal span lifecycle management.
|
55
46
|
"""
|
56
47
|
|
57
|
-
# Span control attributes
|
58
48
|
DISABLE_PARTIAL_EMIT = "disable_partial_emit"
|
59
49
|
CANCELLED = "cancelled"
|
60
50
|
|
61
51
|
|
62
52
|
class ResourceKeys(str, Enum):
|
63
|
-
SERVICE_NAME =
|
64
|
-
TELEMETRY_SDK_LANGUAGE =
|
65
|
-
TELEMETRY_SDK_NAME =
|
66
|
-
TELEMETRY_SDK_VERSION =
|
53
|
+
SERVICE_NAME = "service.name"
|
54
|
+
TELEMETRY_SDK_LANGUAGE = "telemetry.sdk.language"
|
55
|
+
TELEMETRY_SDK_NAME = "telemetry.sdk.name"
|
56
|
+
TELEMETRY_SDK_VERSION = "telemetry.sdk.version"
|
67
57
|
JUDGMENT_PROJECT_ID = "judgment.project_id"
|
judgeval/tracer/llm/__init__.py
CHANGED
@@ -873,7 +873,6 @@ def _set_usage_attributes(span, usage: TraceUsage, tracer: Tracer):
|
|
873
873
|
set_span_attribute(
|
874
874
|
span, AttributeKeys.GEN_AI_USAGE_TOTAL_COST, usage.total_cost_usd
|
875
875
|
)
|
876
|
-
tracer.add_cost_to_current_context(usage.total_cost_usd)
|
877
876
|
|
878
877
|
|
879
878
|
def wrap_provider(tracer: Tracer, client: ApiClient) -> ApiClient:
|
@@ -0,0 +1,20 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
HAS_ANTHROPIC = False
|
4
|
+
anthropic_Anthropic = None
|
5
|
+
anthropic_AsyncAnthropic = None
|
6
|
+
|
7
|
+
try:
|
8
|
+
from anthropic import Anthropic, AsyncAnthropic # type: ignore[import-untyped]
|
9
|
+
|
10
|
+
anthropic_Anthropic = Anthropic
|
11
|
+
anthropic_AsyncAnthropic = AsyncAnthropic
|
12
|
+
HAS_ANTHROPIC = True
|
13
|
+
except ImportError:
|
14
|
+
pass
|
15
|
+
|
16
|
+
__all__ = [
|
17
|
+
"HAS_ANTHROPIC",
|
18
|
+
"anthropic_Anthropic",
|
19
|
+
"anthropic_AsyncAnthropic",
|
20
|
+
]
|
@@ -0,0 +1,21 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
HAS_GOOGLE_GENAI = False
|
4
|
+
google_genai_Client = None
|
5
|
+
google_genai_AsyncClient = None
|
6
|
+
|
7
|
+
try:
|
8
|
+
from google.genai import Client # type: ignore[import-untyped]
|
9
|
+
from google.genai.client import AsyncClient # type: ignore[import-untyped]
|
10
|
+
|
11
|
+
google_genai_Client = Client
|
12
|
+
google_genai_AsyncClient = AsyncClient
|
13
|
+
HAS_GOOGLE_GENAI = True
|
14
|
+
except ImportError:
|
15
|
+
pass
|
16
|
+
|
17
|
+
__all__ = [
|
18
|
+
"HAS_GOOGLE_GENAI",
|
19
|
+
"google_genai_Client",
|
20
|
+
"google_genai_AsyncClient",
|
21
|
+
]
|