judgeval 0.0.55__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. judgeval/common/api/__init__.py +3 -0
  2. judgeval/common/api/api.py +352 -0
  3. judgeval/common/api/constants.py +165 -0
  4. judgeval/common/storage/__init__.py +6 -0
  5. judgeval/common/tracer/__init__.py +31 -0
  6. judgeval/common/tracer/constants.py +22 -0
  7. judgeval/common/tracer/core.py +1916 -0
  8. judgeval/common/tracer/otel_exporter.py +108 -0
  9. judgeval/common/tracer/otel_span_processor.py +234 -0
  10. judgeval/common/tracer/span_processor.py +37 -0
  11. judgeval/common/tracer/span_transformer.py +211 -0
  12. judgeval/common/tracer/trace_manager.py +92 -0
  13. judgeval/common/utils.py +2 -2
  14. judgeval/constants.py +3 -30
  15. judgeval/data/datasets/eval_dataset_client.py +29 -156
  16. judgeval/data/judgment_types.py +4 -12
  17. judgeval/data/result.py +1 -1
  18. judgeval/data/scorer_data.py +2 -2
  19. judgeval/data/scripts/openapi_transform.py +1 -1
  20. judgeval/data/trace.py +66 -1
  21. judgeval/data/trace_run.py +0 -3
  22. judgeval/evaluation_run.py +0 -2
  23. judgeval/integrations/langgraph.py +43 -164
  24. judgeval/judgment_client.py +17 -211
  25. judgeval/run_evaluation.py +216 -611
  26. judgeval/scorers/__init__.py +2 -6
  27. judgeval/scorers/base_scorer.py +4 -23
  28. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +3 -3
  29. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +215 -0
  30. judgeval/scorers/score.py +2 -1
  31. judgeval/scorers/utils.py +1 -13
  32. judgeval/utils/requests.py +21 -0
  33. judgeval-0.2.0.dist-info/METADATA +202 -0
  34. {judgeval-0.0.55.dist-info → judgeval-0.2.0.dist-info}/RECORD +37 -29
  35. judgeval/common/tracer.py +0 -3215
  36. judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +0 -73
  37. judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
  38. judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
  39. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -53
  40. judgeval-0.0.55.dist-info/METADATA +0 -1384
  41. /judgeval/common/{s3_storage.py → storage/s3_storage.py} +0 -0
  42. {judgeval-0.0.55.dist-info → judgeval-0.2.0.dist-info}/WHEEL +0 -0
  43. {judgeval-0.0.55.dist-info → judgeval-0.2.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -0,0 +1,108 @@
1
+ """
2
+ Custom OpenTelemetry exporter for Judgment API.
3
+
4
+ This exporter sends spans to the Judgment API using the existing format.
5
+ The BatchSpanProcessor handles all batching, threading, and retry logic.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Any, Dict, List, Sequence
11
+
12
+ from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult
13
+ from opentelemetry.sdk.trace import ReadableSpan
14
+
15
+ from judgeval.common.tracer.span_transformer import SpanTransformer
16
+ from judgeval.common.logger import judgeval_logger
17
+ from judgeval.common.api.api import JudgmentApiClient
18
+
19
+
20
class JudgmentAPISpanExporter(SpanExporter):
    """
    OpenTelemetry ``SpanExporter`` that forwards spans to the Judgment API.

    Every exported batch is converted through ``SpanTransformer`` and split
    into two groups -- regular spans and evaluation runs -- which are posted
    through separate ``JudgmentApiClient`` calls. All work here is
    synchronous; the exporter is meant to be driven by a
    ``BatchSpanProcessor``, which owns the batching and threading.
    """

    def __init__(
        self,
        judgment_api_key: str,
        organization_id: str,
    ):
        """Build the API client that every export call goes through."""
        self.api_client = JudgmentApiClient(judgment_api_key, organization_id)

    def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
        """
        Convert a batch of spans and send it to the Judgment API.

        Returns ``SpanExportResult.SUCCESS`` for an empty batch or when all
        sends complete, and ``SpanExportResult.FAILURE`` (after logging) when
        any conversion or send raises.
        """
        if not spans:
            return SpanExportResult.SUCCESS

        regular_payloads: List[Dict[str, Any]] = []
        evaluation_payloads: List[Dict[str, Any]] = []

        try:
            for otel_span in spans:
                payload = self._convert_span_to_judgment_format(otel_span)

                # The marker attribute decides which endpoint the payload
                # is routed to.
                is_evaluation_run = otel_span.attributes and otel_span.attributes.get(
                    "judgment.evaluation_run"
                )
                bucket = evaluation_payloads if is_evaluation_run else regular_payloads
                bucket.append(payload)

            if regular_payloads:
                self._send_spans_batch(regular_payloads)

            if evaluation_payloads:
                self._send_evaluation_runs_batch(evaluation_payloads)
        except Exception as e:
            judgeval_logger.error(f"Error in JudgmentAPISpanExporter.export: {e}")
            return SpanExportResult.FAILURE

        return SpanExportResult.SUCCESS

    def _convert_span_to_judgment_format(self, span: ReadableSpan) -> Dict[str, Any]:
        """Pick the evaluation-run or plain-span transformer for ``span``."""
        attrs = span.attributes
        if attrs and attrs.get("judgment.evaluation_run"):
            return SpanTransformer.otel_span_to_evaluation_run_format(span)
        return SpanTransformer.otel_span_to_judgment_format(span)

    def _send_spans_batch(self, spans: List[Dict[str, Any]]):
        """Post the ``"data"`` member of each converted span in one call."""
        self.api_client.send_spans_batch([item["data"] for item in spans])

    def _send_evaluation_runs_batch(self, eval_runs: List[Dict[str, Any]]):
        """
        Post converted evaluation runs in one call.

        Each item's ``"data"`` dict is reshaped into an entry with three keys:
        ``evaluation_run`` (all fields except the three bookkeeping keys),
        ``associated_span`` (span id plus span data) and ``queued_at``.
        """
        bookkeeping_keys = ["associated_span_id", "span_data", "queued_at"]

        def _to_entry(eval_run: Dict[str, Any]) -> Dict[str, Any]:
            data = eval_run["data"]
            run_fields = {
                name: field
                for name, field in data.items()
                if name not in bookkeeping_keys
            }
            return {
                "evaluation_run": run_fields,
                "associated_span": {
                    "span_id": data.get("associated_span_id"),
                    "span_data": data.get("span_data"),
                },
                "queued_at": data.get("queued_at"),
            }

        evaluation_entries = [_to_entry(eval_run) for eval_run in eval_runs]
        self.api_client.send_evaluation_runs_batch(evaluation_entries)

    def shutdown(self, timeout_millis: int = 30000) -> None:
        """Shutdown the exporter; nothing is held here, so this is a no-op."""
        return None

    def force_flush(self, timeout_millis: int = 30000) -> bool:
        """Report success immediately; the exporter buffers nothing itself."""
        return True
@@ -0,0 +1,234 @@
1
+ """
2
+ Custom OpenTelemetry span processor for Judgment API.
3
+
4
+ This processor uses BatchSpanProcessor to handle batching and export
5
+ of TraceSpan objects converted to OpenTelemetry format.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import threading
11
+ from typing import Any, Dict, Optional
12
+
13
+ from opentelemetry.context import Context
14
+ from opentelemetry.sdk.trace import ReadableSpan
15
+ from opentelemetry.sdk.trace.export import BatchSpanProcessor, SpanProcessor
16
+ from opentelemetry.trace import Span, Status, StatusCode, SpanContext, TraceFlags
17
+ from opentelemetry.trace.span import TraceState, INVALID_SPAN_CONTEXT
18
+ from opentelemetry.util.types import Attributes
19
+
20
+ from judgeval.common.logger import judgeval_logger
21
+ from judgeval.common.tracer.otel_exporter import JudgmentAPISpanExporter
22
+ from judgeval.common.tracer.span_processor import SpanProcessorBase
23
+ from judgeval.common.tracer.span_transformer import SpanTransformer
24
+ from judgeval.data import TraceSpan
25
+ from judgeval.evaluation_run import EvaluationRun
26
+
27
+
28
class SimpleReadableSpan(ReadableSpan):
    """
    Minimal ``ReadableSpan`` built from a ``TraceSpan`` snapshot.

    The constructor copies what it needs out of the ``TraceSpan`` into private
    fields, and the read-only properties below expose those fields.
    """

    def __init__(self, trace_span: TraceSpan, span_state: str = "completed"):
        """
        Populate the span from ``trace_span``.

        Args:
            trace_span: Source of the name, ids, timing, error flag and
                attributes.
            span_state: State label passed on to the attribute builder; an
                end time is only computed when it equals ``"completed"``.
        """
        self._name = trace_span.function
        self._span_id = trace_span.span_id
        self._trace_id = trace_span.trace_id

        # ``created_at`` and ``duration`` are scaled by 1e9 and truncated to
        # int (presumably seconds -> nanoseconds; confirm against TraceSpan).
        created_at = trace_span.created_at
        self._start_time = int(created_at * 1_000_000_000) if created_at else None

        self._end_time: Optional[int] = None
        has_timing = trace_span.duration is not None and self._start_time is not None
        if span_state == "completed" and has_timing:
            elapsed = int(trace_span.duration * 1_000_000_000)
            self._end_time = self._start_time + elapsed

        if trace_span.error:
            self._status = Status(StatusCode.ERROR)
        else:
            self._status = Status(StatusCode.OK)

        self._attributes = SpanTransformer.trace_span_to_otel_attributes(
            trace_span, span_state
        )

        try:
            # Ids are parsed as hexadecimal after dropping dashes; a falsy id
            # maps to 0.
            raw_trace_id = trace_span.trace_id
            numeric_trace_id = (
                int(raw_trace_id.replace("-", ""), 16) if raw_trace_id else 0
            )
            raw_span_id = trace_span.span_id
            numeric_span_id = (
                int(raw_span_id.replace("-", ""), 16) if raw_span_id else 0
            )

            self._context = SpanContext(
                trace_id=numeric_trace_id,
                span_id=numeric_span_id,
                is_remote=False,
                trace_flags=TraceFlags(0x01),
                trace_state=TraceState(),
            )
        except (ValueError, TypeError) as e:
            # An unparsable id degrades to the invalid-context sentinel
            # instead of aborting span construction.
            judgeval_logger.warning(f"Failed to create proper SpanContext: {e}")
            self._context = INVALID_SPAN_CONTEXT

        # Fields that are never derived from the TraceSpan.
        self._parent: Optional[SpanContext] = None
        self._events: list[Any] = []
        self._links: list[Any] = []
        self._resource: Optional[Any] = None
        self._instrumentation_info: Optional[Any] = None

    @property
    def name(self) -> str:
        """Span name, taken from ``TraceSpan.function``."""
        return self._name

    @property
    def context(self) -> SpanContext:
        """Span context built in ``__init__`` (or the invalid sentinel)."""
        return self._context

    @property
    def parent(self) -> Optional[SpanContext]:
        """Parent context; always ``None`` for this wrapper."""
        return self._parent

    @property
    def start_time(self) -> Optional[int]:
        """Scaled ``created_at`` value, or ``None`` when it was falsy."""
        return self._start_time

    @property
    def end_time(self) -> Optional[int]:
        """Start time plus scaled duration, or ``None`` when not computed."""
        return self._end_time

    @property
    def status(self) -> Status:
        """``ERROR`` when the TraceSpan carried an error, otherwise ``OK``."""
        return self._status

    @property
    def attributes(self) -> Optional[Attributes]:
        """Attribute mapping produced by ``SpanTransformer``."""
        return self._attributes

    @property
    def events(self):
        """Event list; starts empty."""
        return self._events

    @property
    def links(self):
        """Link list; starts empty."""
        return self._links

    @property
    def resource(self) -> Optional[Any]:
        """Resource placeholder; ``None`` unless set externally."""
        return self._resource

    @property
    def instrumentation_info(self) -> Optional[Any]:
        """Instrumentation-info placeholder; ``None`` unless set externally."""
        return self._instrumentation_info
130
+
131
+
132
class JudgmentSpanProcessor(SpanProcessor, SpanProcessorBase):
    """
    Span processor that wraps ``TraceSpan`` objects in ``SimpleReadableSpan``
    and hands them to an internal ``BatchSpanProcessor`` for export through
    ``JudgmentAPISpanExporter``.
    """

    def __init__(
        self,
        judgment_api_key: str,
        organization_id: str,
        batch_size: int = 50,
        flush_interval: float = 1.0,
        max_queue_size: int = 2048,
        export_timeout: int = 30000,
    ):
        """
        Args:
            judgment_api_key: Credential forwarded to the exporter.
            organization_id: Organization forwarded to the exporter.
            batch_size: Passed as ``max_export_batch_size``.
            flush_interval: Multiplied by 1000 and passed as
                ``schedule_delay_millis``.
            max_queue_size: Passed as ``max_queue_size``.
            export_timeout: Passed as ``export_timeout_millis``.
        """
        self.judgment_api_key = judgment_api_key
        self.organization_id = organization_id

        # Latest snapshot and state per span id, guarded by a re-entrant lock.
        self._span_cache: Dict[str, TraceSpan] = {}
        self._span_states: Dict[str, str] = {}
        self._cache_lock = threading.RLock()

        exporter = JudgmentAPISpanExporter(
            judgment_api_key=judgment_api_key,
            organization_id=organization_id,
        )
        self.batch_processor = BatchSpanProcessor(
            exporter,
            max_queue_size=max_queue_size,
            schedule_delay_millis=int(flush_interval * 1000),
            max_export_batch_size=batch_size,
            export_timeout_millis=export_timeout,
        )

    def on_start(self, span: Span, parent_context: Optional[Context] = None) -> None:
        """Delegate the start hook to the batch processor."""
        self.batch_processor.on_start(span, parent_context)

    def on_end(self, span: ReadableSpan) -> None:
        """Delegate the end hook to the batch processor."""
        self.batch_processor.on_end(span)

    def queue_span_update(self, span: TraceSpan, span_state: str = "input") -> None:
        """
        Record ``span`` under its id, enqueue it for export, and drop it from
        the cache again when the state is terminal.

        The span's update id is set to its ending number for ``"completed"``
        and incremented for every other state before anything is queued.
        """
        if span_state == "completed":
            span.set_update_id_to_ending_number()
        else:
            span.increment_update_id()

        is_terminal = span_state in ("completed", "error")

        with self._cache_lock:
            key = span.span_id
            self._span_cache[key] = span
            self._span_states[key] = span_state

            self._send_span_update(span, span_state)

            if is_terminal:
                # Terminal spans need no later re-flush.
                self._span_cache.pop(key, None)
                self._span_states.pop(key, None)

    def _send_span_update(self, span: TraceSpan, span_state: str) -> None:
        """Wrap ``span`` as a readable span and push it into the batch queue."""
        self.batch_processor.on_end(SimpleReadableSpan(span, span_state))

    def flush_pending_spans(self) -> None:
        """Re-enqueue every cached span with its last recorded state."""
        with self._cache_lock:
            for cached_id, cached_span in self._span_cache.items():
                # Fall back to "input" when no state was recorded for the id.
                state = self._span_states.get(cached_id, "input")
                self._send_span_update(cached_span, state)

    def queue_evaluation_run(
        self, evaluation_run: EvaluationRun, span_id: str, span_data: TraceSpan
    ) -> None:
        """
        Enqueue an evaluation run as a readable span.

        The span is built from ``span_data`` with state ``"evaluation_run"``
        and then has the evaluation-run attributes merged over its own.
        """
        eval_attributes = SpanTransformer.evaluation_run_to_otel_attributes(
            evaluation_run, span_id, span_data
        )

        carrier = SimpleReadableSpan(span_data, "evaluation_run")
        carrier._attributes.update(eval_attributes)

        self.batch_processor.on_end(carrier)

    def shutdown(self) -> None:
        """
        Flush cached spans (best effort), shut the batch processor down, and
        empty both caches.
        """
        try:
            self.flush_pending_spans()
        except Exception as e:
            judgeval_logger.warning(
                f"Error flushing pending spans during shutdown: {e}"
            )

        self.batch_processor.shutdown()

        with self._cache_lock:
            self._span_cache.clear()
            self._span_states.clear()

    def force_flush(self, timeout_millis: int = 30000) -> bool:
        """
        Flush cached spans (best effort) and return the batch processor's
        ``force_flush`` result for ``timeout_millis``.
        """
        try:
            self.flush_pending_spans()
        except Exception as e:
            judgeval_logger.warning(f"Error flushing pending spans: {e}")

        return self.batch_processor.force_flush(timeout_millis)
@@ -0,0 +1,37 @@
1
+ """
2
+ Base class for span processors with default no-op implementations.
3
+
4
+ This eliminates the need for optional typing and null checks.
5
+ When monitoring is disabled, we use this base class directly.
6
+ When monitoring is enabled, we use JudgmentSpanProcessor which overrides the methods.
7
+ """
8
+
9
+ from judgeval.data import TraceSpan
10
+ from judgeval.evaluation_run import EvaluationRun
11
+
12
+
13
class SpanProcessorBase:
    """
    No-op span processor that defines the Judgment processor interface.

    Every method does nothing (``force_flush`` reports success), so callers
    can hold an instance unconditionally rather than guarding against a
    missing processor. ``JudgmentSpanProcessor`` overrides these methods when
    monitoring is enabled.
    """

    def queue_span_update(self, span: "TraceSpan", span_state: str = "input") -> None:
        """Ignore the span update."""
        return None

    def queue_evaluation_run(
        self, evaluation_run: "EvaluationRun", span_id: str, span_data: "TraceSpan"
    ) -> None:
        """Ignore the evaluation run."""
        return None

    def flush_pending_spans(self) -> None:
        """Nothing is ever pending, so there is nothing to flush."""
        return None

    def force_flush(self, timeout_millis: int = 30000) -> bool:
        """Report a successful flush without doing any work."""
        return True

    def shutdown(self) -> None:
        """Nothing to release."""
        return None
@@ -0,0 +1,211 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import time
5
+ import uuid
6
+ from datetime import datetime, timezone
7
+ from typing import Any, Dict, Optional, Union
8
+
9
+ from opentelemetry.sdk.trace import ReadableSpan
10
+ from pydantic import BaseModel
11
+
12
+ from judgeval.data import TraceSpan
13
+ from judgeval.evaluation_run import EvaluationRun
14
+
15
+
16
class SpanTransformer:
    """
    Stateless converters between ``TraceSpan`` / ``EvaluationRun`` objects,
    flat ``judgment.*`` attribute dictionaries, and the payload dictionaries
    sent to the Judgment API.
    """

    # Namespace prefix carried by every attribute key this class produces.
    _ATTRIBUTE_PREFIX = "judgment."

    @staticmethod
    def _needs_json_serialization(value: Any) -> bool:
        """
        Tell whether ``value`` must be JSON-encoded before use as an attribute.

        ``None`` and plain scalars (str, int, float, bool) do not; containers
        and pydantic models do. Anything else needs encoding exactly when
        ``json.dumps`` rejects it.
        """
        if value is None:
            return False

        if isinstance(value, (str, int, float, bool)):
            return False

        if isinstance(value, (dict, list, tuple, set, BaseModel)):
            return True

        try:
            json.dumps(value)
            return False
        except (TypeError, ValueError):
            return True

    @staticmethod
    def _safe_json_handle(obj: Any, serialize: bool = True) -> Any:
        """
        Encode to, or decode from, JSON without ever raising.

        Args:
            obj: Value to encode (``serialize=True``) or candidate JSON text
                to decode (``serialize=False``).
            serialize: Direction of the conversion.

        Returns:
            When serializing: ``None`` for ``None``, otherwise a JSON string
            (values JSON cannot represent are stringified; if that still
            fails the whole object is stringified). When deserializing: the
            parsed value, or ``obj`` unchanged if it is not a string or is
            not valid JSON.
        """
        if serialize:
            if obj is None:
                return None
            try:
                return json.dumps(obj, default=str)
            except Exception:
                return json.dumps(str(obj))

        if not isinstance(obj, str):
            return obj
        try:
            # NOTE(review): any string that happens to be valid JSON (e.g.
            # "42" or "true") is decoded to a non-string value here.
            return json.loads(obj)
        except (json.JSONDecodeError, TypeError):
            return obj

    @staticmethod
    def _format_timestamp(timestamp: Optional[Union[float, int, str]]) -> str:
        """
        Return ``timestamp`` as an ISO-8601 string.

        Strings are returned unchanged. Numbers are interpreted as seconds
        since the epoch in UTC. ``None`` and numbers that cannot be converted
        fall back to the current UTC time.
        """
        if timestamp is None:
            return datetime.now(timezone.utc).isoformat()

        if isinstance(timestamp, str):
            return timestamp

        try:
            return datetime.fromtimestamp(timestamp, tz=timezone.utc).isoformat()
        except (ValueError, OverflowError, OSError):
            # fromtimestamp raises OverflowError (not ValueError) for
            # infinite or out-of-range values; treat it like the other
            # conversion failures.
            return datetime.now(timezone.utc).isoformat()

    @staticmethod
    def trace_span_to_otel_attributes(
        trace_span: "TraceSpan", span_state: str = "completed"
    ) -> Dict[str, Any]:
        """
        Flatten a ``TraceSpan`` into ``judgment.<field>`` attributes.

        ``None`` fields are skipped, ``created_at`` is formatted as a
        timestamp string, and values that need it are JSON-encoded.
        ``judgment.span_state`` is always set to ``span_state``, and
        ``judgment.span_type`` defaults to ``"span"`` when absent or falsy.
        """
        serialized_data = trace_span.model_dump()
        attributes: Dict[str, Any] = {}

        for field_name, value in serialized_data.items():
            if value is None:
                continue

            attr_name = f"judgment.{field_name}"

            if field_name == "created_at":
                attributes[attr_name] = SpanTransformer._format_timestamp(value)
            elif field_name == "expected_tools" and value:
                attributes[attr_name] = SpanTransformer._safe_json_handle(
                    [tool.model_dump() for tool in trace_span.expected_tools]
                )
            elif field_name == "usage" and value:
                attributes[attr_name] = SpanTransformer._safe_json_handle(
                    trace_span.usage.model_dump()
                )
            elif SpanTransformer._needs_json_serialization(value):
                attributes[attr_name] = SpanTransformer._safe_json_handle(value)
            else:
                attributes[attr_name] = value

        attributes["judgment.span_state"] = span_state
        if not attributes.get("judgment.span_type"):
            attributes["judgment.span_type"] = "span"

        return attributes

    @staticmethod
    def otel_attributes_to_judgment_data(attributes: Dict[str, Any]) -> Dict[str, Any]:
        """
        Invert the flattening: keep only ``judgment.*`` keys, strip the
        prefix, and JSON-decode string values where possible.
        """
        prefix = SpanTransformer._ATTRIBUTE_PREFIX
        judgment_data: Dict[str, Any] = {}

        for key, value in attributes.items():
            if not key.startswith(prefix):
                continue

            field_name = key[len(prefix):]

            if isinstance(value, str):
                judgment_data[field_name] = SpanTransformer._safe_json_handle(
                    value, serialize=False
                )
            else:
                judgment_data[field_name] = value

        return judgment_data

    @staticmethod
    def otel_span_to_judgment_format(span: "ReadableSpan") -> Dict[str, Any]:
        """
        Build the ``{"type": "span", "data": {...}}`` payload for a span.

        Missing values are filled in: duration from the span's start/end
        times, ids with fresh UUIDs, and ``created_at`` from the span's start
        time or the current time. ``queued_at`` is stamped with the current
        time.
        """
        attributes = span.attributes or {}
        judgment_data = SpanTransformer.otel_attributes_to_judgment_data(attributes)

        duration = judgment_data.get("duration")
        if duration is None and span.end_time and span.start_time:
            # Span times are divided by 1e9 to get the duration unit used in
            # the payload.
            duration = (span.end_time - span.start_time) / 1_000_000_000

        span_id = judgment_data.get("span_id") or str(uuid.uuid4())
        trace_id = judgment_data.get("trace_id") or str(uuid.uuid4())

        created_at = judgment_data.get("created_at")
        if not created_at:
            created_at = (
                span.start_time / 1_000_000_000 if span.start_time else time.time()
            )

        return {
            "type": "span",
            "data": {
                "span_id": span_id,
                "trace_id": trace_id,
                "function": span.name,
                "depth": judgment_data.get("depth", 0),
                "created_at": SpanTransformer._format_timestamp(created_at),
                "parent_span_id": judgment_data.get("parent_span_id"),
                "span_type": judgment_data.get("span_type", "span"),
                "inputs": judgment_data.get("inputs"),
                "error": judgment_data.get("error"),
                "output": judgment_data.get("output"),
                "usage": judgment_data.get("usage"),
                "duration": duration,
                "expected_tools": judgment_data.get("expected_tools"),
                "additional_metadata": judgment_data.get("additional_metadata"),
                "has_evaluation": judgment_data.get("has_evaluation", False),
                "agent_name": judgment_data.get("agent_name"),
                "state_before": judgment_data.get("state_before"),
                "state_after": judgment_data.get("state_after"),
                "update_id": judgment_data.get("update_id", 1),
                "span_state": judgment_data.get("span_state", "completed"),
                "queued_at": time.time(),
            },
        }

    @staticmethod
    def evaluation_run_to_otel_attributes(
        evaluation_run: "EvaluationRun", span_id: str, span_data: "TraceSpan"
    ) -> Dict[str, Any]:
        """
        Flatten an evaluation run into ``judgment.*`` attributes.

        The result always carries the ``judgment.evaluation_run`` marker, the
        associated span id and the JSON-encoded span data. The run's own
        non-``None`` fields are then added, so a run field with the same name
        overwrites one of those three.
        """
        attributes = {
            "judgment.evaluation_run": True,
            "judgment.associated_span_id": span_id,
            "judgment.span_data": SpanTransformer._safe_json_handle(
                span_data.model_dump()
            ),
        }

        eval_data = evaluation_run.model_dump()
        for key, value in eval_data.items():
            if value is None:
                continue

            attr_name = f"judgment.{key}"
            if SpanTransformer._needs_json_serialization(value):
                attributes[attr_name] = SpanTransformer._safe_json_handle(value)
            else:
                attributes[attr_name] = value

        return attributes

    @staticmethod
    def otel_span_to_evaluation_run_format(span: "ReadableSpan") -> Dict[str, Any]:
        """
        Build the ``{"type": "evaluation_run", "data": {...}}`` payload.

        ``data`` holds every decoded ``judgment.*`` field except the marker,
        plus ``associated_span_id`` (a fresh UUID if missing), ``span_data``
        and a ``queued_at`` stamp of the current time.
        """
        attributes = span.attributes or {}
        judgment_data = SpanTransformer.otel_attributes_to_judgment_data(attributes)

        associated_span_id = judgment_data.get("associated_span_id") or str(
            uuid.uuid4()
        )

        eval_run_data = {
            key: value
            for key, value in judgment_data.items()
            if key not in ["associated_span_id", "span_data", "evaluation_run"]
        }

        eval_run_data["associated_span_id"] = associated_span_id
        eval_run_data["span_data"] = judgment_data.get("span_data")
        eval_run_data["queued_at"] = time.time()

        return {
            "type": "evaluation_run",
            "data": eval_run_data,
        }
@@ -0,0 +1,92 @@
1
+ from __future__ import annotations
2
+ from typing import List, Optional, TYPE_CHECKING
3
+
4
+ if TYPE_CHECKING:
5
+ from judgeval.common.tracer import Tracer
6
+
7
+ from judgeval.common.logger import judgeval_logger
8
+ from judgeval.common.api import JudgmentApiClient
9
+ from rich import print as rprint
10
+
11
+
12
class TraceManagerClient:
    """
    Thin client over ``JudgmentApiClient`` for trace endpoints.

    Supported operations:
        - fetch a trace by id
        - upsert (save) a trace, optionally mirroring it to S3
        - delete one trace, several traces, or a whole project
    """

    def __init__(
        self,
        judgment_api_key: str,
        organization_id: str,
        tracer: "Optional[Tracer]" = None,
    ):
        """
        Args:
            judgment_api_key: Credential for the API client.
            organization_id: Organization for the API client.
            tracer: Optional tracer whose ``use_s3`` / ``s3_storage`` are
                consulted when upserting.
        """
        self.api_client = JudgmentApiClient(judgment_api_key, organization_id)
        self.tracer = tracer

    def fetch_trace(self, trace_id: str):
        """Return the API client's response for the trace with ``trace_id``."""
        return self.api_client.fetch_trace(trace_id)

    def upsert_trace(
        self,
        trace_data: dict,
        offline_mode: bool = False,
        show_link: bool = True,
        final_save: bool = True,
    ):
        """
        Upsert a trace through the API client and return the server response.

        Args:
            trace_data: The trace data to upsert.
            offline_mode: When true, the UI link is never printed.
            show_link: Whether to print the UI link (for live tracing).
            final_save: Whether this is the final save; S3 mirroring only
                happens on the final save.

        Returns:
            The server response, which may contain ``ui_results_url``.
        """
        server_response = self.api_client.upsert_trace(trace_data)

        tracer = self.tracer
        if tracer and tracer.use_s3 and final_save:
            # S3 mirroring is best effort: a failure is logged and never
            # propagated to the caller.
            try:
                s3_key = tracer.s3_storage.save_trace(
                    trace_data=trace_data,
                    trace_id=trace_data["trace_id"],
                    project_name=trace_data["project_name"],
                )
            except Exception as e:
                judgeval_logger.warning(f"Failed to save trace to S3: {e!s}")
            else:
                judgeval_logger.info(f"Trace also saved to S3 at key: {s3_key}")

        wants_link = not offline_mode and show_link
        if wants_link and "ui_results_url" in server_response:
            ui_url = server_response["ui_results_url"]
            rprint(
                f"\n🔍 You can view your trace data here: [rgb(106,0,255)][link={ui_url}]View Trace[/link]\n"
            )

        return server_response

    def delete_trace(self, trace_id: str):
        """Delete the trace with ``trace_id`` via the API client."""
        return self.api_client.delete_trace(trace_id)

    def delete_traces(self, trace_ids: List[str]):
        """Delete a batch of traces via the API client."""
        return self.api_client.delete_traces(trace_ids)

    def delete_project(self, project_name: str):
        """
        Delete a project via the API client; per the API this also deletes
        all evaluations and traces associated with the project.
        """
        return self.api_client.delete_project(project_name)