judgeval 0.0.55__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/api/__init__.py +3 -0
- judgeval/common/api/api.py +352 -0
- judgeval/common/api/constants.py +165 -0
- judgeval/common/storage/__init__.py +6 -0
- judgeval/common/tracer/__init__.py +31 -0
- judgeval/common/tracer/constants.py +22 -0
- judgeval/common/tracer/core.py +1916 -0
- judgeval/common/tracer/otel_exporter.py +108 -0
- judgeval/common/tracer/otel_span_processor.py +234 -0
- judgeval/common/tracer/span_processor.py +37 -0
- judgeval/common/tracer/span_transformer.py +211 -0
- judgeval/common/tracer/trace_manager.py +92 -0
- judgeval/common/utils.py +2 -2
- judgeval/constants.py +3 -30
- judgeval/data/datasets/eval_dataset_client.py +29 -156
- judgeval/data/judgment_types.py +4 -12
- judgeval/data/result.py +1 -1
- judgeval/data/scorer_data.py +2 -2
- judgeval/data/scripts/openapi_transform.py +1 -1
- judgeval/data/trace.py +66 -1
- judgeval/data/trace_run.py +0 -3
- judgeval/evaluation_run.py +0 -2
- judgeval/integrations/langgraph.py +43 -164
- judgeval/judgment_client.py +17 -211
- judgeval/run_evaluation.py +216 -611
- judgeval/scorers/__init__.py +2 -6
- judgeval/scorers/base_scorer.py +4 -23
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +3 -3
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +215 -0
- judgeval/scorers/score.py +2 -1
- judgeval/scorers/utils.py +1 -13
- judgeval/utils/requests.py +21 -0
- judgeval-0.2.0.dist-info/METADATA +202 -0
- {judgeval-0.0.55.dist-info → judgeval-0.2.0.dist-info}/RECORD +37 -29
- judgeval/common/tracer.py +0 -3215
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +0 -73
- judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -53
- judgeval-0.0.55.dist-info/METADATA +0 -1384
- /judgeval/common/{s3_storage.py → storage/s3_storage.py} +0 -0
- {judgeval-0.0.55.dist-info → judgeval-0.2.0.dist-info}/WHEEL +0 -0
- {judgeval-0.0.55.dist-info → judgeval-0.2.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -0,0 +1,108 @@
|
|
1
|
+
"""
|
2
|
+
Custom OpenTelemetry exporter for Judgment API.
|
3
|
+
|
4
|
+
This exporter sends spans to the Judgment API using the existing format.
|
5
|
+
The BatchSpanProcessor handles all batching, threading, and retry logic.
|
6
|
+
"""
|
7
|
+
|
8
|
+
from __future__ import annotations
|
9
|
+
|
10
|
+
from typing import Any, Dict, List, Sequence
|
11
|
+
|
12
|
+
from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult
|
13
|
+
from opentelemetry.sdk.trace import ReadableSpan
|
14
|
+
|
15
|
+
from judgeval.common.tracer.span_transformer import SpanTransformer
|
16
|
+
from judgeval.common.logger import judgeval_logger
|
17
|
+
from judgeval.common.api.api import JudgmentApiClient
|
18
|
+
|
19
|
+
|
20
|
+
class JudgmentAPISpanExporter(SpanExporter):
    """
    OpenTelemetry ``SpanExporter`` that forwards spans to the Judgment API.

    Spans are converted with ``SpanTransformer`` and then sent through a
    ``JudgmentApiClient``. Spans whose attributes carry a truthy
    ``judgment.evaluation_run`` entry go to the evaluation-run endpoint;
    every other span goes to the spans endpoint.
    """

    def __init__(self, judgment_api_key: str, organization_id: str):
        """Create the API client used for every export call."""
        self.api_client = JudgmentApiClient(judgment_api_key, organization_id)

    def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
        """
        Convert a batch of spans and send it to the Judgment API.

        All spans are converted first; regular spans are sent before
        evaluation runs. Any exception raised along the way is logged and
        reported as ``SpanExportResult.FAILURE``. An empty batch is a
        successful no-op.
        """
        if not spans:
            return SpanExportResult.SUCCESS

        try:
            regular_payloads: List[Dict[str, Any]] = []
            evaluation_payloads: List[Dict[str, Any]] = []

            for otel_span in spans:
                converted = self._convert_span_to_judgment_format(otel_span)
                is_evaluation_run = bool(
                    otel_span.attributes
                    and otel_span.attributes.get("judgment.evaluation_run")
                )
                bucket = evaluation_payloads if is_evaluation_run else regular_payloads
                bucket.append(converted)

            if regular_payloads:
                self._send_spans_batch(regular_payloads)

            if evaluation_payloads:
                self._send_evaluation_runs_batch(evaluation_payloads)
        except Exception as e:
            judgeval_logger.error(f"Error in JudgmentAPISpanExporter.export: {e}")
            return SpanExportResult.FAILURE

        return SpanExportResult.SUCCESS

    def _convert_span_to_judgment_format(self, span: ReadableSpan) -> Dict[str, Any]:
        """Convert one OpenTelemetry span to the matching Judgment payload."""
        span_attributes = span.attributes
        if span_attributes and span_attributes.get("judgment.evaluation_run"):
            return SpanTransformer.otel_span_to_evaluation_run_format(span)
        return SpanTransformer.otel_span_to_judgment_format(span)

    def _send_spans_batch(self, spans: List[Dict[str, Any]]):
        """Send the ``"data"`` part of each converted span to the spans endpoint."""
        payload = []
        for converted in spans:
            payload.append(converted["data"])
        self.api_client.send_spans_batch(payload)

    def _send_evaluation_runs_batch(self, eval_runs: List[Dict[str, Any]]):
        """
        Send converted evaluation runs to the evaluation-runs endpoint.

        Each entry is split into the evaluation run itself, the span it is
        associated with, and the time it was queued.
        """
        # These keys are reported outside the "evaluation_run" sub-dict.
        lifted_keys = ["associated_span_id", "span_data", "queued_at"]

        evaluation_entries = []
        for eval_run in eval_runs:
            eval_data = eval_run["data"]

            run_fields = {}
            for key, value in eval_data.items():
                if key in lifted_keys:
                    continue
                run_fields[key] = value

            evaluation_entries.append(
                {
                    "evaluation_run": run_fields,
                    "associated_span": {
                        "span_id": eval_data.get("associated_span_id"),
                        "span_data": eval_data.get("span_data"),
                    },
                    "queued_at": eval_data.get("queued_at"),
                }
            )

        self.api_client.send_evaluation_runs_batch(evaluation_entries)

    def shutdown(self, timeout_millis: int = 30000) -> None:
        """Shutdown the exporter; nothing is held open, so this is a no-op."""
        return None

    def force_flush(self, timeout_millis: int = 30000) -> bool:
        """Report success immediately; the exporter buffers nothing itself."""
        return True
|
@@ -0,0 +1,234 @@
|
|
1
|
+
"""
|
2
|
+
Custom OpenTelemetry span processor for Judgment API.
|
3
|
+
|
4
|
+
This processor uses BatchSpanProcessor to handle batching and export
|
5
|
+
of TraceSpan objects converted to OpenTelemetry format.
|
6
|
+
"""
|
7
|
+
|
8
|
+
from __future__ import annotations
|
9
|
+
|
10
|
+
import threading
|
11
|
+
from typing import Any, Dict, Optional
|
12
|
+
|
13
|
+
from opentelemetry.context import Context
|
14
|
+
from opentelemetry.sdk.trace import ReadableSpan
|
15
|
+
from opentelemetry.sdk.trace.export import BatchSpanProcessor, SpanProcessor
|
16
|
+
from opentelemetry.trace import Span, Status, StatusCode, SpanContext, TraceFlags
|
17
|
+
from opentelemetry.trace.span import TraceState, INVALID_SPAN_CONTEXT
|
18
|
+
from opentelemetry.util.types import Attributes
|
19
|
+
|
20
|
+
from judgeval.common.logger import judgeval_logger
|
21
|
+
from judgeval.common.tracer.otel_exporter import JudgmentAPISpanExporter
|
22
|
+
from judgeval.common.tracer.span_processor import SpanProcessorBase
|
23
|
+
from judgeval.common.tracer.span_transformer import SpanTransformer
|
24
|
+
from judgeval.data import TraceSpan
|
25
|
+
from judgeval.evaluation_run import EvaluationRun
|
26
|
+
|
27
|
+
|
28
|
+
class SimpleReadableSpan(ReadableSpan):
    """
    Simple ReadableSpan implementation that wraps TraceSpan data.

    The parent ``ReadableSpan.__init__`` is intentionally not called; only
    the properties defined below are backed by data. All Judgment-specific
    fields travel in ``attributes`` (see
    ``SpanTransformer.trace_span_to_otel_attributes``).
    """

    def __init__(self, trace_span: TraceSpan, span_state: str = "completed"):
        """
        Build the span view.

        Args:
            trace_span: Judgment span to expose through the OpenTelemetry API.
            span_state: Lifecycle label stored in the attributes; an end time
                is only computed when it equals ``"completed"``.
        """
        self._name = trace_span.function
        self._span_id = trace_span.span_id
        self._trace_id = trace_span.trace_id

        # created_at is scaled by 1e9, i.e. it is assumed to be epoch seconds
        # while OpenTelemetry timestamps are integer nanoseconds.
        self._start_time = (
            int(trace_span.created_at * 1_000_000_000)
            if trace_span.created_at
            else None
        )
        self._end_time: Optional[int] = None

        if (
            span_state == "completed"
            and trace_span.duration is not None
            and self._start_time is not None
        ):
            self._end_time = self._start_time + int(trace_span.duration * 1_000_000_000)

        self._status = (
            Status(StatusCode.ERROR) if trace_span.error else Status(StatusCode.OK)
        )

        self._attributes = SpanTransformer.trace_span_to_otel_attributes(
            trace_span, span_state
        )

        self._context = self._build_context(trace_span)

        self._parent: Optional[SpanContext] = None
        self._events: list[Any] = []
        self._links: list[Any] = []
        self._resource: Optional[Any] = None
        self._instrumentation_info: Optional[Any] = None

    @staticmethod
    def _build_context(trace_span: TraceSpan) -> SpanContext:
        """
        Derive a sampled ``SpanContext`` from the span's hex/UUID style ids.

        Ids are parsed as hexadecimal after removing dashes; a missing id maps
        to 0. If parsing fails a warning is logged and both ids fall back to 0.
        The context always keeps the sampled flag: ``BatchSpanProcessor.on_end``
        discards spans whose context is not sampled, so falling back to
        ``INVALID_SPAN_CONTEXT`` (unsampled) would silently drop the update.
        """
        try:
            trace_id_int = (
                int(trace_span.trace_id.replace("-", ""), 16)
                if trace_span.trace_id
                else 0
            )
            span_id_int = (
                int(trace_span.span_id.replace("-", ""), 16)
                if trace_span.span_id
                else 0
            )
        except (ValueError, TypeError) as e:
            judgeval_logger.warning(f"Failed to create proper SpanContext: {e}")
            trace_id_int = 0
            span_id_int = 0

        return SpanContext(
            trace_id=trace_id_int,
            span_id=span_id_int,
            is_remote=False,
            trace_flags=TraceFlags(0x01),
            trace_state=TraceState(),
        )

    @property
    def name(self) -> str:
        """Span name, taken from ``TraceSpan.function``."""
        return self._name

    @property
    def context(self) -> SpanContext:
        """Span context built from the TraceSpan ids."""
        return self._context

    @property
    def parent(self) -> Optional[SpanContext]:
        """Parent context; always ``None`` for wrapped TraceSpans."""
        return self._parent

    @property
    def start_time(self) -> Optional[int]:
        """Start time in nanoseconds, or ``None`` when ``created_at`` is falsy."""
        return self._start_time

    @property
    def end_time(self) -> Optional[int]:
        """End time in nanoseconds; only set for completed spans with a duration."""
        return self._end_time

    @property
    def status(self) -> Status:
        """``ERROR`` when the TraceSpan carries an error, otherwise ``OK``."""
        return self._status

    @property
    def attributes(self) -> Optional[Attributes]:
        """Mutable attribute dict holding the ``judgment.*`` fields."""
        return self._attributes

    @property
    def events(self):
        """Span events; always an empty list."""
        return self._events

    @property
    def links(self):
        """Span links; always an empty list."""
        return self._links

    @property
    def resource(self) -> Optional[Any]:
        """Resource; always ``None``."""
        return self._resource

    @property
    def instrumentation_info(self) -> Optional[Any]:
        """Instrumentation info; always ``None``."""
        return self._instrumentation_info
|
130
|
+
|
131
|
+
|
132
|
+
class JudgmentSpanProcessor(SpanProcessor, SpanProcessorBase):
    """
    Span processor that converts TraceSpan objects to OpenTelemetry format
    and hands them to an internal BatchSpanProcessor for export.

    Non-terminal spans are remembered in a cache (guarded by a re-entrant
    lock) so that they can be re-sent by ``flush_pending_spans``.
    """

    def __init__(
        self,
        judgment_api_key: str,
        organization_id: str,
        batch_size: int = 50,
        flush_interval: float = 1.0,
        max_queue_size: int = 2048,
        export_timeout: int = 30000,
    ):
        """
        Args:
            judgment_api_key: API key passed to the exporter.
            organization_id: Organization id passed to the exporter.
            batch_size: Maximum number of spans per export batch.
            flush_interval: Delay between scheduled exports, in seconds.
            max_queue_size: Capacity of the batch processor queue.
            export_timeout: Export timeout in milliseconds.
        """
        self.judgment_api_key = judgment_api_key
        self.organization_id = organization_id

        self._span_cache: Dict[str, TraceSpan] = {}
        self._span_states: Dict[str, str] = {}
        self._cache_lock = threading.RLock()

        exporter = JudgmentAPISpanExporter(
            judgment_api_key=judgment_api_key,
            organization_id=organization_id,
        )
        self.batch_processor = BatchSpanProcessor(
            exporter,
            max_queue_size=max_queue_size,
            # BatchSpanProcessor takes milliseconds; flush_interval is seconds.
            schedule_delay_millis=int(flush_interval * 1000),
            max_export_batch_size=batch_size,
            export_timeout_millis=export_timeout,
        )

    def on_start(self, span: Span, parent_context: Optional[Context] = None) -> None:
        """Forward the start notification to the batch processor."""
        self.batch_processor.on_start(span, parent_context)

    def on_end(self, span: ReadableSpan) -> None:
        """Forward a finished OpenTelemetry span to the batch processor."""
        self.batch_processor.on_end(span)

    def queue_span_update(self, span: TraceSpan, span_state: str = "input") -> None:
        """
        Queue the current state of ``span`` for export.

        The span's update id is advanced first (set to its ending number for
        ``"completed"``, incremented otherwise). The span is cached, sent,
        and then evicted again if the state is ``"completed"`` or ``"error"``.
        """
        if span_state == "completed":
            span.set_update_id_to_ending_number()
        else:
            span.increment_update_id()

        is_terminal = span_state in ("completed", "error")

        with self._cache_lock:
            cache_key = span.span_id

            self._span_cache[cache_key] = span
            self._span_states[cache_key] = span_state

            self._send_span_update(span, span_state)

            if is_terminal:
                self._span_cache.pop(cache_key, None)
                self._span_states.pop(cache_key, None)

    def _send_span_update(self, span: TraceSpan, span_state: str) -> None:
        """Wrap ``span`` as a ReadableSpan and enqueue it on the batch processor."""
        self.batch_processor.on_end(SimpleReadableSpan(span, span_state))

    def flush_pending_spans(self) -> None:
        """Re-send every cached span with its last recorded state."""
        with self._cache_lock:
            for cache_key, cached_span in self._span_cache.items():
                last_state = self._span_states.get(cache_key, "input")
                self._send_span_update(cached_span, last_state)

    def queue_evaluation_run(
        self, evaluation_run: EvaluationRun, span_id: str, span_data: TraceSpan
    ) -> None:
        """
        Queue an evaluation run for export.

        The run is carried as a span built from ``span_data`` whose attributes
        are extended with the evaluation-run attributes.
        """
        eval_attributes = SpanTransformer.evaluation_run_to_otel_attributes(
            evaluation_run, span_id, span_data
        )

        carrier_span = SimpleReadableSpan(span_data, "evaluation_run")
        carrier_span._attributes.update(eval_attributes)

        self.batch_processor.on_end(carrier_span)

    def shutdown(self) -> None:
        """Flush cached spans (best effort), stop the batch processor, clear caches."""
        try:
            self.flush_pending_spans()
        except Exception as e:
            judgeval_logger.warning(
                f"Error flushing pending spans during shutdown: {e}"
            )

        self.batch_processor.shutdown()

        with self._cache_lock:
            self._span_cache.clear()
            self._span_states.clear()

    def force_flush(self, timeout_millis: int = 30000) -> bool:
        """
        Re-send cached spans (best effort) and flush the batch processor.

        Returns:
            The result of ``BatchSpanProcessor.force_flush``.
        """
        try:
            self.flush_pending_spans()
        except Exception as e:
            judgeval_logger.warning(f"Error flushing pending spans: {e}")

        flushed = self.batch_processor.force_flush(timeout_millis)
        return flushed
|
@@ -0,0 +1,37 @@
|
|
1
|
+
"""
|
2
|
+
Base class for span processors with default no-op implementations.
|
3
|
+
|
4
|
+
This eliminates the need for optional typing and null checks.
|
5
|
+
When monitoring is disabled, we use this base class directly.
|
6
|
+
When monitoring is enabled, we use JudgmentSpanProcessor which overrides the methods.
|
7
|
+
"""
|
8
|
+
|
9
|
+
from judgeval.data import TraceSpan
|
10
|
+
from judgeval.evaluation_run import EvaluationRun
|
11
|
+
|
12
|
+
|
13
|
+
class SpanProcessorBase:
    """
    No-op span processor used as the common base for Judgment processors.

    Every method has a harmless default, so callers can invoke the processor
    unconditionally instead of carrying an optional reference and null
    checks. JudgmentSpanProcessor overrides these methods when monitoring
    is enabled.
    """

    def queue_span_update(self, span: TraceSpan, span_state: str = "input") -> None:
        """Accept a span update and discard it."""
        return None

    def queue_evaluation_run(
        self, evaluation_run: EvaluationRun, span_id: str, span_data: TraceSpan
    ) -> None:
        """Accept an evaluation run and discard it."""
        return None

    def flush_pending_spans(self) -> None:
        """Nothing is ever pending; do nothing."""
        return None

    def force_flush(self, timeout_millis: int = 30000) -> bool:
        """Report a successful flush, since there is nothing to flush."""
        return True

    def shutdown(self) -> None:
        """Nothing to release; do nothing."""
        return None
|
@@ -0,0 +1,211 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import json
|
4
|
+
import time
|
5
|
+
import uuid
|
6
|
+
from datetime import datetime, timezone
|
7
|
+
from typing import Any, Dict, Optional, Union
|
8
|
+
|
9
|
+
from opentelemetry.sdk.trace import ReadableSpan
|
10
|
+
from pydantic import BaseModel
|
11
|
+
|
12
|
+
from judgeval.data import TraceSpan
|
13
|
+
from judgeval.evaluation_run import EvaluationRun
|
14
|
+
|
15
|
+
|
16
|
+
class SpanTransformer:
    """
    Stateless converters between Judgment objects and OpenTelemetry span
    attributes.

    Outgoing data is flattened into ``judgment.<field>`` attributes (complex
    values are JSON-encoded); incoming attributes are decoded back into the
    dict payloads sent to the Judgment API.
    """

    @staticmethod
    def _needs_json_serialization(value: Any) -> bool:
        """
        Tell whether ``value`` must be JSON-encoded before being stored as an
        attribute.

        ``None`` and str/int/float/bool are stored as-is. Containers and
        pydantic models are always encoded. Any other value is encoded only
        when ``json.dumps`` cannot handle it directly.
        """
        if value is None:
            return False

        simple_types = (str, int, float, bool)
        if isinstance(value, simple_types):
            return False

        complex_types = (dict, list, tuple, set, BaseModel)
        if isinstance(value, complex_types):
            return True

        try:
            json.dumps(value)
            return False
        except (TypeError, ValueError):
            return True

    @staticmethod
    def _safe_json_handle(obj: Any, serialize: bool = True) -> Any:
        """
        JSON-encode or JSON-decode ``obj`` without raising.

        Args:
            obj: Value to encode, or string to decode.
            serialize: ``True`` to encode, ``False`` to decode.

        Returns:
            When encoding: a JSON string (``None`` stays ``None``); values
            that are not natively serializable are stringified.
            When decoding: the parsed value, or ``obj`` unchanged if it is
            not a string or not valid JSON.
        """
        if serialize:
            if obj is None:
                return None
            try:
                return json.dumps(obj, default=str)
            except Exception:
                # Last resort: encode the string form of the whole object.
                return json.dumps(str(obj))
        else:
            if not isinstance(obj, str):
                return obj
            try:
                return json.loads(obj)
            except (json.JSONDecodeError, TypeError):
                return obj

    @staticmethod
    def _format_timestamp(timestamp: Optional[Union[float, int, str]]) -> str:
        """
        Return ``timestamp`` as an ISO-8601 string.

        ``None`` yields the current UTC time, strings are returned unchanged,
        and numbers are interpreted as epoch seconds in UTC. If the number
        cannot be converted (NaN, infinity, or outside the platform's
        supported range) the current UTC time is used instead.
        """
        if timestamp is None:
            return datetime.now(timezone.utc).isoformat()

        if isinstance(timestamp, str):
            return timestamp

        try:
            dt = datetime.fromtimestamp(timestamp, tz=timezone.utc)
            return dt.isoformat()
        except (ValueError, OSError, OverflowError):
            # fromtimestamp raises OverflowError (not ValueError) for
            # infinity and for values that do not fit the platform time_t.
            return datetime.now(timezone.utc).isoformat()

    @staticmethod
    def trace_span_to_otel_attributes(
        trace_span: TraceSpan, span_state: str = "completed"
    ) -> Dict[str, Any]:
        """
        Flatten a TraceSpan into ``judgment.<field>`` attributes.

        Fields whose dumped value is ``None`` are omitted. ``created_at`` is
        formatted as a timestamp string, ``expected_tools`` and ``usage`` are
        re-dumped from the model and JSON-encoded, and other complex values
        are JSON-encoded. ``judgment.span_state`` is always set and
        ``judgment.span_type`` defaults to ``"span"``.
        """
        serialized_data = trace_span.model_dump()
        attributes: Dict[str, Any] = {}

        for field_name, value in serialized_data.items():
            if value is None:
                continue

            attr_name = f"judgment.{field_name}"

            if field_name == "created_at":
                attributes[attr_name] = SpanTransformer._format_timestamp(value)
            elif field_name == "expected_tools" and value:
                attributes[attr_name] = SpanTransformer._safe_json_handle(
                    [tool.model_dump() for tool in trace_span.expected_tools]
                )
            elif field_name == "usage" and value:
                attributes[attr_name] = SpanTransformer._safe_json_handle(
                    trace_span.usage.model_dump()
                )
            elif SpanTransformer._needs_json_serialization(value):
                attributes[attr_name] = SpanTransformer._safe_json_handle(value)
            else:
                attributes[attr_name] = value

        attributes["judgment.span_state"] = span_state
        if not attributes.get("judgment.span_type"):
            attributes["judgment.span_type"] = "span"

        return attributes

    @staticmethod
    def otel_attributes_to_judgment_data(attributes: Dict[str, Any]) -> Dict[str, Any]:
        """
        Decode ``judgment.*`` attributes back into a plain field dict.

        Attributes without the ``judgment.`` prefix are ignored and the prefix
        is stripped from the remaining keys. Every string value is passed
        through ``json.loads``; note that a plain string which happens to be
        valid JSON (for example ``"42"`` or ``"true"``) therefore comes back
        as the decoded JSON value rather than as the original string.
        """
        prefix = "judgment."
        judgment_data: Dict[str, Any] = {}

        for key, value in attributes.items():
            if not key.startswith(prefix):
                continue

            field_name = key[len(prefix):]

            if isinstance(value, str):
                deserialized = SpanTransformer._safe_json_handle(value, serialize=False)
                judgment_data[field_name] = deserialized
            else:
                judgment_data[field_name] = value

        return judgment_data

    @staticmethod
    def otel_span_to_judgment_format(span: ReadableSpan) -> Dict[str, Any]:
        """
        Build the Judgment API span payload from an OpenTelemetry span.

        Missing ids are replaced by fresh UUIDs, a missing duration is
        derived from the span's start/end times (nanoseconds to seconds), and
        a missing ``created_at`` falls back to the span start time or the
        current time. ``queued_at`` is stamped with the current time.

        Returns:
            ``{"type": "span", "data": {...}}``.
        """
        attributes = span.attributes or {}
        judgment_data = SpanTransformer.otel_attributes_to_judgment_data(attributes)

        duration = judgment_data.get("duration")
        if duration is None and span.end_time and span.start_time:
            duration = (span.end_time - span.start_time) / 1_000_000_000

        span_id = judgment_data.get("span_id") or str(uuid.uuid4())
        trace_id = judgment_data.get("trace_id") or str(uuid.uuid4())

        created_at = judgment_data.get("created_at")
        if not created_at:
            created_at = (
                span.start_time / 1_000_000_000 if span.start_time else time.time()
            )

        return {
            "type": "span",
            "data": {
                "span_id": span_id,
                "trace_id": trace_id,
                "function": span.name,
                "depth": judgment_data.get("depth", 0),
                "created_at": SpanTransformer._format_timestamp(created_at),
                "parent_span_id": judgment_data.get("parent_span_id"),
                "span_type": judgment_data.get("span_type", "span"),
                "inputs": judgment_data.get("inputs"),
                "error": judgment_data.get("error"),
                "output": judgment_data.get("output"),
                "usage": judgment_data.get("usage"),
                "duration": duration,
                "expected_tools": judgment_data.get("expected_tools"),
                "additional_metadata": judgment_data.get("additional_metadata"),
                "has_evaluation": judgment_data.get("has_evaluation", False),
                "agent_name": judgment_data.get("agent_name"),
                "state_before": judgment_data.get("state_before"),
                "state_after": judgment_data.get("state_after"),
                "update_id": judgment_data.get("update_id", 1),
                "span_state": judgment_data.get("span_state", "completed"),
                "queued_at": time.time(),
            },
        }

    @staticmethod
    def evaluation_run_to_otel_attributes(
        evaluation_run: EvaluationRun, span_id: str, span_data: TraceSpan
    ) -> Dict[str, Any]:
        """
        Flatten an evaluation run into ``judgment.*`` attributes.

        The result marks the span as an evaluation run, records the
        associated span id and its JSON-encoded data, and adds every
        non-``None`` field of the dumped evaluation run (complex values are
        JSON-encoded).
        """
        attributes = {
            "judgment.evaluation_run": True,
            "judgment.associated_span_id": span_id,
            "judgment.span_data": SpanTransformer._safe_json_handle(
                span_data.model_dump()
            ),
        }

        eval_data = evaluation_run.model_dump()
        for key, value in eval_data.items():
            if value is None:
                continue

            attr_name = f"judgment.{key}"
            if SpanTransformer._needs_json_serialization(value):
                attributes[attr_name] = SpanTransformer._safe_json_handle(value)
            else:
                attributes[attr_name] = value

        return attributes

    @staticmethod
    def otel_span_to_evaluation_run_format(span: ReadableSpan) -> Dict[str, Any]:
        """
        Build the evaluation-run payload from an OpenTelemetry span.

        A missing associated span id is replaced by a fresh UUID and
        ``queued_at`` is stamped with the current time.

        Returns:
            ``{"type": "evaluation_run", "data": {...}}``.
        """
        attributes = span.attributes or {}
        judgment_data = SpanTransformer.otel_attributes_to_judgment_data(attributes)

        associated_span_id = judgment_data.get("associated_span_id") or str(
            uuid.uuid4()
        )

        eval_run_data = {
            key: value
            for key, value in judgment_data.items()
            if key not in ["associated_span_id", "span_data", "evaluation_run"]
        }

        eval_run_data["associated_span_id"] = associated_span_id
        eval_run_data["span_data"] = judgment_data.get("span_data")
        eval_run_data["queued_at"] = time.time()

        return {
            "type": "evaluation_run",
            "data": eval_run_data,
        }
|
@@ -0,0 +1,92 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
from typing import List, Optional, TYPE_CHECKING
|
3
|
+
|
4
|
+
if TYPE_CHECKING:
|
5
|
+
from judgeval.common.tracer import Tracer
|
6
|
+
|
7
|
+
from judgeval.common.logger import judgeval_logger
|
8
|
+
from judgeval.common.api import JudgmentApiClient
|
9
|
+
from rich import print as rprint
|
10
|
+
|
11
|
+
|
12
|
+
class TraceManagerClient:
    """
    Thin client around the trace endpoints of the Judgment API.

    Supported operations:
    - fetch a trace by id
    - upsert (save) a trace
    - delete one trace, a batch of traces, or a whole project
    """

    def __init__(
        self,
        judgment_api_key: str,
        organization_id: str,
        tracer: Optional[Tracer] = None,
    ):
        """
        Args:
            judgment_api_key: API key used to build the API client.
            organization_id: Organization id used to build the API client.
            tracer: Optional tracer; consulted by ``upsert_trace`` for its
                S3 settings.
        """
        self.api_client = JudgmentApiClient(judgment_api_key, organization_id)
        self.tracer = tracer

    def fetch_trace(self, trace_id: str):
        """Return the trace with the given id as reported by the API."""
        response = self.api_client.fetch_trace(trace_id)
        return response

    def upsert_trace(
        self,
        trace_data: dict,
        offline_mode: bool = False,
        show_link: bool = True,
        final_save: bool = True,
    ):
        """
        Upsert a trace to the Judgment API (always overwrites if it exists).

        Args:
            trace_data: The trace data to upsert
            offline_mode: Whether running in offline mode
            show_link: Whether to print the UI link (for live tracing)
            final_save: Whether this is the final save (controls S3 saving)

        Returns:
            dict: Server response containing UI URL and other metadata
        """
        server_response = self.api_client.upsert_trace(trace_data)

        wants_s3_copy = self.tracer and self.tracer.use_s3 and final_save
        if wants_s3_copy:
            # S3 is a secondary copy: a failure here is logged, never raised.
            try:
                s3_key = self.tracer.s3_storage.save_trace(
                    trace_data=trace_data,
                    trace_id=trace_data["trace_id"],
                    project_name=trace_data["project_name"],
                )
            except Exception as e:
                judgeval_logger.warning(f"Failed to save trace to S3: {str(e)}")
            else:
                judgeval_logger.info(f"Trace also saved to S3 at key: {s3_key}")

        should_print_link = (
            not offline_mode and show_link and "ui_results_url" in server_response
        )
        if should_print_link:
            pretty_str = f"\n🔍 You can view your trace data here: [rgb(106,0,255)][link={server_response['ui_results_url']}]View Trace[/link]\n"
            rprint(pretty_str)

        return server_response

    def delete_trace(self, trace_id: str):
        """Delete a single trace from the database."""
        response = self.api_client.delete_trace(trace_id)
        return response

    def delete_traces(self, trace_ids: List[str]):
        """Delete a batch of traces from the database."""
        response = self.api_client.delete_traces(trace_ids)
        return response

    def delete_project(self, project_name: str):
        """
        Delete a project from the server, which also deletes all evaluations
        and traces associated with the project.
        """
        response = self.api_client.delete_project(project_name)
        return response
|