judgeval-0.7.1-py3-none-any.whl → judgeval-0.9.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96):
  1. judgeval/__init__.py +139 -12
  2. judgeval/api/__init__.py +501 -0
  3. judgeval/api/api_types.py +344 -0
  4. judgeval/cli.py +2 -4
  5. judgeval/constants.py +10 -26
  6. judgeval/data/evaluation_run.py +49 -26
  7. judgeval/data/example.py +2 -2
  8. judgeval/data/judgment_types.py +266 -82
  9. judgeval/data/result.py +4 -5
  10. judgeval/data/scorer_data.py +4 -2
  11. judgeval/data/tool.py +2 -2
  12. judgeval/data/trace.py +7 -50
  13. judgeval/data/trace_run.py +7 -4
  14. judgeval/{dataset.py → dataset/__init__.py} +43 -28
  15. judgeval/env.py +67 -0
  16. judgeval/{run_evaluation.py → evaluation/__init__.py} +29 -95
  17. judgeval/exceptions.py +27 -0
  18. judgeval/integrations/langgraph/__init__.py +788 -0
  19. judgeval/judges/__init__.py +2 -2
  20. judgeval/judges/litellm_judge.py +75 -15
  21. judgeval/judges/together_judge.py +86 -18
  22. judgeval/judges/utils.py +7 -21
  23. judgeval/{common/logger.py → logger.py} +8 -6
  24. judgeval/scorers/__init__.py +0 -4
  25. judgeval/scorers/agent_scorer.py +3 -7
  26. judgeval/scorers/api_scorer.py +8 -13
  27. judgeval/scorers/base_scorer.py +52 -32
  28. judgeval/scorers/example_scorer.py +1 -3
  29. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -14
  30. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +45 -20
  31. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +2 -2
  32. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +3 -3
  33. judgeval/scorers/score.py +21 -31
  34. judgeval/scorers/trace_api_scorer.py +5 -0
  35. judgeval/scorers/utils.py +1 -103
  36. judgeval/tracer/__init__.py +1075 -2
  37. judgeval/tracer/constants.py +1 -0
  38. judgeval/tracer/exporters/__init__.py +37 -0
  39. judgeval/tracer/exporters/s3.py +119 -0
  40. judgeval/tracer/exporters/store.py +43 -0
  41. judgeval/tracer/exporters/utils.py +32 -0
  42. judgeval/tracer/keys.py +67 -0
  43. judgeval/tracer/llm/__init__.py +1233 -0
  44. judgeval/{common/tracer → tracer/llm}/providers.py +5 -10
  45. judgeval/{local_eval_queue.py → tracer/local_eval_queue.py} +15 -10
  46. judgeval/tracer/managers.py +188 -0
  47. judgeval/tracer/processors/__init__.py +181 -0
  48. judgeval/tracer/utils.py +20 -0
  49. judgeval/trainer/__init__.py +5 -0
  50. judgeval/{common/trainer → trainer}/config.py +12 -9
  51. judgeval/{common/trainer → trainer}/console.py +2 -9
  52. judgeval/{common/trainer → trainer}/trainable_model.py +12 -7
  53. judgeval/{common/trainer → trainer}/trainer.py +119 -17
  54. judgeval/utils/async_utils.py +2 -3
  55. judgeval/utils/decorators.py +24 -0
  56. judgeval/utils/file_utils.py +37 -4
  57. judgeval/utils/guards.py +32 -0
  58. judgeval/utils/meta.py +14 -0
  59. judgeval/{common/api/json_encoder.py → utils/serialize.py} +7 -1
  60. judgeval/utils/testing.py +88 -0
  61. judgeval/utils/url.py +10 -0
  62. judgeval/{version_check.py → utils/version_check.py} +3 -3
  63. judgeval/version.py +5 -0
  64. judgeval/warnings.py +4 -0
  65. {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/METADATA +12 -14
  66. judgeval-0.9.0.dist-info/RECORD +80 -0
  67. judgeval/clients.py +0 -35
  68. judgeval/common/__init__.py +0 -13
  69. judgeval/common/api/__init__.py +0 -3
  70. judgeval/common/api/api.py +0 -375
  71. judgeval/common/api/constants.py +0 -186
  72. judgeval/common/exceptions.py +0 -27
  73. judgeval/common/storage/__init__.py +0 -6
  74. judgeval/common/storage/s3_storage.py +0 -97
  75. judgeval/common/tracer/__init__.py +0 -31
  76. judgeval/common/tracer/constants.py +0 -22
  77. judgeval/common/tracer/core.py +0 -2427
  78. judgeval/common/tracer/otel_exporter.py +0 -108
  79. judgeval/common/tracer/otel_span_processor.py +0 -188
  80. judgeval/common/tracer/span_processor.py +0 -37
  81. judgeval/common/tracer/span_transformer.py +0 -207
  82. judgeval/common/tracer/trace_manager.py +0 -101
  83. judgeval/common/trainer/__init__.py +0 -5
  84. judgeval/common/utils.py +0 -948
  85. judgeval/integrations/langgraph.py +0 -844
  86. judgeval/judges/mixture_of_judges.py +0 -287
  87. judgeval/judgment_client.py +0 -267
  88. judgeval/rules.py +0 -521
  89. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
  90. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
  91. judgeval/utils/alerts.py +0 -93
  92. judgeval/utils/requests.py +0 -50
  93. judgeval-0.7.1.dist-info/RECORD +0 -82
  94. {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/WHEEL +0 -0
  95. {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/entry_points.txt +0 -0
  96. {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/licenses/LICENSE.md +0 -0
--- judgeval/common/tracer/otel_exporter.py (file removed)
@@ -1,108 +0,0 @@
1
- """
2
- Custom OpenTelemetry exporter for Judgment API.
3
-
4
- This exporter sends spans to the Judgment API using the existing format.
5
- The BatchSpanProcessor handles all batching, threading, and retry logic.
6
- """
7
-
8
- from __future__ import annotations
9
-
10
- from typing import Any, Dict, List, Sequence
11
-
12
- from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult
13
- from opentelemetry.sdk.trace import ReadableSpan
14
-
15
- from judgeval.common.tracer.span_transformer import SpanTransformer
16
- from judgeval.common.logger import judgeval_logger
17
- from judgeval.common.api.api import JudgmentApiClient
18
-
19
-
20
- class JudgmentAPISpanExporter(SpanExporter):
21
- """
22
- Custom OpenTelemetry exporter that sends spans to Judgment API.
23
-
24
- This exporter is used by BatchSpanProcessor which handles all the
25
- batching, threading, and retry logic for us.
26
- """
27
-
28
- def __init__(
29
- self,
30
- judgment_api_key: str,
31
- organization_id: str,
32
- ):
33
- self.api_client = JudgmentApiClient(judgment_api_key, organization_id)
34
-
35
- def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
36
- """
37
- Export spans to Judgment API.
38
-
39
- This method is called by BatchSpanProcessor with a batch of spans.
40
- We send them synchronously since BatchSpanProcessor handles threading.
41
- """
42
- if not spans:
43
- return SpanExportResult.SUCCESS
44
-
45
- try:
46
- spans_data = []
47
- eval_runs_data = []
48
-
49
- for span in spans:
50
- span_data = self._convert_span_to_judgment_format(span)
51
-
52
- if span.attributes and span.attributes.get("judgment.evaluation_run"):
53
- eval_runs_data.append(span_data)
54
- else:
55
- spans_data.append(span_data)
56
-
57
- if spans_data:
58
- self._send_spans_batch(spans_data)
59
-
60
- if eval_runs_data:
61
- self._send_evaluation_runs_batch(eval_runs_data)
62
-
63
- return SpanExportResult.SUCCESS
64
-
65
- except Exception as e:
66
- judgeval_logger.error(f"Error in JudgmentAPISpanExporter.export: {e}")
67
- return SpanExportResult.FAILURE
68
-
69
- def _convert_span_to_judgment_format(self, span: ReadableSpan) -> Dict[str, Any]:
70
- """Convert OpenTelemetry span to existing Judgment API format."""
71
- if span.attributes and span.attributes.get("judgment.evaluation_run"):
72
- return SpanTransformer.otel_span_to_evaluation_run_format(span)
73
- else:
74
- return SpanTransformer.otel_span_to_judgment_format(span)
75
-
76
- def _send_spans_batch(self, spans: List[Dict[str, Any]]):
77
- """Send a batch of spans to the spans endpoint."""
78
- spans_data = [span["data"] for span in spans]
79
- self.api_client.send_spans_batch(spans_data)
80
-
81
- def _send_evaluation_runs_batch(self, eval_runs: List[Dict[str, Any]]):
82
- """Send a batch of evaluation runs to the evaluation runs endpoint."""
83
- evaluation_entries = []
84
- for eval_run in eval_runs:
85
- eval_data = eval_run["data"]
86
- entry = {
87
- "evaluation_run": {
88
- key: value
89
- for key, value in eval_data.items()
90
- if key not in ["associated_span_id", "span_data", "queued_at"]
91
- },
92
- "associated_span": {
93
- "span_id": eval_data.get("associated_span_id"),
94
- "span_data": eval_data.get("span_data"),
95
- },
96
- "queued_at": eval_data.get("queued_at"),
97
- }
98
- evaluation_entries.append(entry)
99
-
100
- self.api_client.send_evaluation_runs_batch(evaluation_entries)
101
-
102
- def shutdown(self, timeout_millis: int = 30000) -> None:
103
- """Shutdown the exporter."""
104
- pass
105
-
106
- def force_flush(self, timeout_millis: int = 30000) -> bool:
107
- """Force flush any pending requests."""
108
- return True
--- judgeval/common/tracer/otel_span_processor.py (file removed)
@@ -1,188 +0,0 @@
1
- """
2
- Custom OpenTelemetry span processor for Judgment API.
3
-
4
- This processor uses BatchSpanProcessor to handle batching and export
5
- of TraceSpan objects converted to OpenTelemetry format.
6
- """
7
-
8
- from __future__ import annotations
9
-
10
- import threading
11
- from typing import Any, Dict, Optional
12
-
13
- from opentelemetry.context import Context
14
- from opentelemetry.sdk.trace import ReadableSpan, Span
15
- from opentelemetry.sdk.trace.export import BatchSpanProcessor, SpanProcessor
16
- from opentelemetry.trace import Status, StatusCode, SpanContext, TraceFlags
17
- from opentelemetry.trace.span import TraceState, INVALID_SPAN_CONTEXT
18
-
19
- from judgeval.common.logger import judgeval_logger
20
- from judgeval.common.tracer.otel_exporter import JudgmentAPISpanExporter
21
- from judgeval.common.tracer.span_processor import SpanProcessorBase
22
- from judgeval.common.tracer.span_transformer import SpanTransformer
23
- from judgeval.data import TraceSpan
24
- from judgeval.data.evaluation_run import EvaluationRun
25
-
26
-
27
- class SimpleReadableSpan(ReadableSpan):
28
- """Simple ReadableSpan implementation that wraps TraceSpan data."""
29
-
30
- def __init__(self, trace_span: TraceSpan, span_state: str = "completed"):
31
- self._name = trace_span.function
32
- self._span_id = trace_span.span_id
33
- self._trace_id = trace_span.trace_id
34
-
35
- self._start_time = (
36
- int(trace_span.created_at * 1_000_000_000)
37
- if trace_span.created_at
38
- else None
39
- )
40
- self._end_time: Optional[int] = None
41
-
42
- if (
43
- span_state == "completed"
44
- and trace_span.duration is not None
45
- and self._start_time is not None
46
- ):
47
- self._end_time = self._start_time + int(trace_span.duration * 1_000_000_000)
48
-
49
- self._status = (
50
- Status(StatusCode.ERROR) if trace_span.error else Status(StatusCode.OK)
51
- )
52
-
53
- self._attributes: Dict[str, Any] = (
54
- SpanTransformer.trace_span_to_otel_attributes(trace_span, span_state)
55
- )
56
-
57
- try:
58
- trace_id_int = (
59
- int(trace_span.trace_id.replace("-", ""), 16)
60
- if trace_span.trace_id
61
- else 0
62
- )
63
- span_id_int = (
64
- int(trace_span.span_id.replace("-", ""), 16)
65
- if trace_span.span_id
66
- else 0
67
- )
68
-
69
- self._context = SpanContext(
70
- trace_id=trace_id_int,
71
- span_id=span_id_int,
72
- is_remote=False,
73
- trace_flags=TraceFlags(0x01),
74
- trace_state=TraceState(),
75
- )
76
- except (ValueError, TypeError) as e:
77
- judgeval_logger.warning(f"Failed to create proper SpanContext: {e}")
78
- self._context = INVALID_SPAN_CONTEXT
79
-
80
- self._parent: Optional[SpanContext] = None
81
- self._events: list[Any] = []
82
- self._links: list[Any] = []
83
- self._instrumentation_info: Optional[Any] = None
84
-
85
-
86
- class JudgmentSpanProcessor(SpanProcessor, SpanProcessorBase):
87
- """
88
- Span processor that converts TraceSpan objects to OpenTelemetry format
89
- and uses BatchSpanProcessor for export.
90
- """
91
-
92
- def __init__(
93
- self,
94
- judgment_api_key: str,
95
- organization_id: str,
96
- batch_size: int = 50,
97
- flush_interval: float = 1.0,
98
- max_queue_size: int = 2048,
99
- export_timeout: int = 30000,
100
- ):
101
- self.judgment_api_key = judgment_api_key
102
- self.organization_id = organization_id
103
-
104
- self._span_cache: Dict[str, TraceSpan] = {}
105
- self._span_states: Dict[str, str] = {}
106
- self._cache_lock = threading.RLock()
107
-
108
- self.batch_processor = BatchSpanProcessor(
109
- JudgmentAPISpanExporter(
110
- judgment_api_key=judgment_api_key,
111
- organization_id=organization_id,
112
- ),
113
- max_queue_size=max_queue_size,
114
- schedule_delay_millis=int(flush_interval * 1000),
115
- max_export_batch_size=batch_size,
116
- export_timeout_millis=export_timeout,
117
- )
118
-
119
- def on_start(self, span: Span, parent_context: Optional[Context] = None) -> None:
120
- self.batch_processor.on_start(span, parent_context)
121
-
122
- def on_end(self, span: ReadableSpan) -> None:
123
- self.batch_processor.on_end(span)
124
-
125
- def queue_span_update(self, span: TraceSpan, span_state: str = "input") -> None:
126
- if span_state == "completed":
127
- span.set_update_id_to_ending_number()
128
- else:
129
- span.increment_update_id()
130
-
131
- with self._cache_lock:
132
- span_id = span.span_id
133
-
134
- self._span_cache[span_id] = span
135
- self._span_states[span_id] = span_state
136
-
137
- self._send_span_update(span, span_state)
138
-
139
- if span_state == "completed" or span_state == "error":
140
- self._span_cache.pop(span_id, None)
141
- self._span_states.pop(span_id, None)
142
-
143
- def _send_span_update(self, span: TraceSpan, span_state: str) -> None:
144
- readable_span = SimpleReadableSpan(span, span_state)
145
- self.batch_processor.on_end(readable_span)
146
-
147
- def flush_pending_spans(self) -> None:
148
- with self._cache_lock:
149
- if not self._span_cache:
150
- return
151
-
152
- for span_id, span in self._span_cache.items():
153
- span_state = self._span_states.get(span_id, "input")
154
- self._send_span_update(span, span_state)
155
-
156
- def queue_evaluation_run(
157
- self, evaluation_run: EvaluationRun, span_id: str, span_data: TraceSpan
158
- ) -> None:
159
- attributes = SpanTransformer.evaluation_run_to_otel_attributes(
160
- evaluation_run, span_id, span_data
161
- )
162
-
163
- readable_span = SimpleReadableSpan(span_data, "evaluation_run")
164
- readable_span._attributes.update(attributes)
165
-
166
- self.batch_processor.on_end(readable_span)
167
-
168
- def shutdown(self) -> None:
169
- try:
170
- self.flush_pending_spans()
171
- except Exception as e:
172
- judgeval_logger.warning(
173
- f"Error flushing pending spans during shutdown: {e}"
174
- )
175
-
176
- self.batch_processor.shutdown()
177
-
178
- with self._cache_lock:
179
- self._span_cache.clear()
180
- self._span_states.clear()
181
-
182
- def force_flush(self, timeout_millis: int = 30000) -> bool:
183
- try:
184
- self.flush_pending_spans()
185
- except Exception as e:
186
- judgeval_logger.warning(f"Error flushing pending spans: {e}")
187
-
188
- return self.batch_processor.force_flush(timeout_millis)
--- judgeval/common/tracer/span_processor.py (file removed)
@@ -1,37 +0,0 @@
1
- """
2
- Base class for span processors with default no-op implementations.
3
-
4
- This eliminates the need for optional typing and null checks.
5
- When monitoring is disabled, we use this base class directly.
6
- When monitoring is enabled, we use JudgmentSpanProcessor which overrides the methods.
7
- """
8
-
9
- from judgeval.data import TraceSpan
10
- from judgeval.data.evaluation_run import EvaluationRun
11
-
12
-
13
- class SpanProcessorBase:
14
- """
15
- Base class for Judgment span processors with default no-op implementations.
16
-
17
- This eliminates the need for optional typing and null checks.
18
- When monitoring is disabled, we use this base class directly.
19
- When monitoring is enabled, we use JudgmentSpanProcessor which overrides the methods.
20
- """
21
-
22
- def queue_span_update(self, span: TraceSpan, span_state: str = "input") -> None:
23
- pass
24
-
25
- def queue_evaluation_run(
26
- self, evaluation_run: EvaluationRun, span_id: str, span_data: TraceSpan
27
- ) -> None:
28
- pass
29
-
30
- def flush_pending_spans(self) -> None:
31
- pass
32
-
33
- def force_flush(self, timeout_millis: int = 30000) -> bool:
34
- return True
35
-
36
- def shutdown(self) -> None:
37
- pass
--- judgeval/common/tracer/span_transformer.py (file removed)
@@ -1,207 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import time
4
- import uuid
5
- import orjson
6
- from datetime import datetime, timezone
7
- from typing import Any, Dict, Mapping, Optional, Union
8
-
9
- from opentelemetry.sdk.trace import ReadableSpan
10
- from pydantic import BaseModel
11
-
12
- from judgeval.common.api.json_encoder import json_encoder
13
- from judgeval.data import TraceSpan
14
- from judgeval.data.evaluation_run import EvaluationRun
15
-
16
-
17
- class SpanTransformer:
18
- @staticmethod
19
- def _needs_json_serialization(value: Any) -> bool:
20
- """
21
- Check if the value needs JSON serialization.
22
- Returns True if the value is complex and needs serialization.
23
- """
24
- if value is None:
25
- return False
26
-
27
- # Basic JSON-serializable types don't need serialization
28
- if isinstance(value, (str, int, float, bool)):
29
- return False
30
-
31
- complex_types = (dict, list, tuple, set, BaseModel)
32
- if isinstance(value, complex_types):
33
- return True
34
-
35
- try:
36
- orjson.dumps(value)
37
- return False
38
- except (TypeError, ValueError):
39
- return True
40
-
41
- @staticmethod
42
- def _safe_deserialize(obj: Any) -> Any:
43
- if not isinstance(obj, str):
44
- return obj
45
- try:
46
- return orjson.loads(obj)
47
- except (orjson.JSONDecodeError, TypeError):
48
- return obj
49
-
50
- @staticmethod
51
- def _format_timestamp(timestamp: Optional[Union[float, int, str]]) -> str:
52
- if timestamp is None:
53
- return datetime.now(timezone.utc).isoformat()
54
-
55
- if isinstance(timestamp, str):
56
- return timestamp
57
-
58
- try:
59
- dt = datetime.fromtimestamp(timestamp, tz=timezone.utc)
60
- return dt.isoformat()
61
- except (ValueError, OSError):
62
- return datetime.now(timezone.utc).isoformat()
63
-
64
- @staticmethod
65
- def trace_span_to_otel_attributes(
66
- trace_span: TraceSpan, span_state: str = "completed"
67
- ) -> Dict[str, Any]:
68
- serialized_data = trace_span.model_dump()
69
- attributes: Dict[str, Any] = {}
70
-
71
- for field_name, value in serialized_data.items():
72
- if value is None:
73
- continue
74
-
75
- attr_name = f"judgment.{field_name}"
76
-
77
- if field_name == "created_at":
78
- attributes[attr_name] = SpanTransformer._format_timestamp(value)
79
- elif field_name == "expected_tools" and value:
80
- attributes[attr_name] = json_encoder(
81
- [tool.model_dump() for tool in trace_span.expected_tools]
82
- )
83
- elif field_name == "usage" and value:
84
- attributes[attr_name] = json_encoder(trace_span.usage)
85
- elif SpanTransformer._needs_json_serialization(value):
86
- attributes[attr_name] = json_encoder(value)
87
- else:
88
- attributes[attr_name] = value
89
-
90
- attributes["judgment.span_state"] = span_state
91
- if not attributes.get("judgment.span_type"):
92
- attributes["judgment.span_type"] = "span"
93
-
94
- return attributes
95
-
96
- @staticmethod
97
- def otel_attributes_to_judgment_data(
98
- attributes: Mapping[str, Any],
99
- ) -> Dict[str, Any]:
100
- judgment_data: Dict[str, Any] = {}
101
-
102
- for key, value in attributes.items():
103
- if not key.startswith("judgment."):
104
- continue
105
-
106
- field_name = key[9:]
107
-
108
- if isinstance(value, str):
109
- deserialized = SpanTransformer._safe_deserialize(value)
110
- judgment_data[field_name] = deserialized
111
- else:
112
- judgment_data[field_name] = value
113
-
114
- return judgment_data
115
-
116
- @staticmethod
117
- def otel_span_to_judgment_format(span: ReadableSpan) -> Dict[str, Any]:
118
- attributes = span.attributes or {}
119
- judgment_data = SpanTransformer.otel_attributes_to_judgment_data(attributes)
120
-
121
- duration = judgment_data.get("duration")
122
- if duration is None and span.end_time and span.start_time:
123
- duration = (span.end_time - span.start_time) / 1_000_000_000
124
-
125
- span_id = judgment_data.get("span_id") or str(uuid.uuid4())
126
- trace_id = judgment_data.get("trace_id") or str(uuid.uuid4())
127
-
128
- created_at = judgment_data.get("created_at")
129
- if not created_at:
130
- created_at = (
131
- span.start_time / 1_000_000_000 if span.start_time else time.time()
132
- )
133
-
134
- return {
135
- "type": "span",
136
- "data": {
137
- "span_id": span_id,
138
- "trace_id": trace_id,
139
- "function": span.name,
140
- "depth": judgment_data.get("depth", 0),
141
- "created_at": SpanTransformer._format_timestamp(created_at),
142
- "parent_span_id": judgment_data.get("parent_span_id"),
143
- "span_type": judgment_data.get("span_type", "span"),
144
- "inputs": judgment_data.get("inputs"),
145
- "error": judgment_data.get("error"),
146
- "output": judgment_data.get("output"),
147
- "usage": judgment_data.get("usage"),
148
- "duration": duration,
149
- "expected_tools": judgment_data.get("expected_tools"),
150
- "additional_metadata": judgment_data.get("additional_metadata"),
151
- "has_evaluation": judgment_data.get("has_evaluation", False),
152
- "agent_name": judgment_data.get("agent_name"),
153
- "class_name": judgment_data.get("class_name"),
154
- "state_before": judgment_data.get("state_before"),
155
- "state_after": judgment_data.get("state_after"),
156
- "update_id": judgment_data.get("update_id", 1),
157
- "span_state": judgment_data.get("span_state", "completed"),
158
- "queued_at": time.time(),
159
- },
160
- }
161
-
162
- @staticmethod
163
- def evaluation_run_to_otel_attributes(
164
- evaluation_run: EvaluationRun, span_id: str, span_data: TraceSpan
165
- ) -> Dict[str, Any]:
166
- attributes = {
167
- "judgment.evaluation_run": True,
168
- "judgment.associated_span_id": span_id,
169
- "judgment.span_data": json_encoder(span_data),
170
- }
171
-
172
- eval_data = evaluation_run.model_dump()
173
- for key, value in eval_data.items():
174
- if value is None:
175
- continue
176
-
177
- attr_name = f"judgment.{key}"
178
- if SpanTransformer._needs_json_serialization(value):
179
- attributes[attr_name] = json_encoder(value)
180
- else:
181
- attributes[attr_name] = value
182
-
183
- return attributes
184
-
185
- @staticmethod
186
- def otel_span_to_evaluation_run_format(span: ReadableSpan) -> Dict[str, Any]:
187
- attributes = span.attributes or {}
188
- judgment_data = SpanTransformer.otel_attributes_to_judgment_data(attributes)
189
-
190
- associated_span_id = judgment_data.get("associated_span_id") or str(
191
- uuid.uuid4()
192
- )
193
-
194
- eval_run_data = {
195
- key: value
196
- for key, value in judgment_data.items()
197
- if key not in ["associated_span_id", "span_data", "evaluation_run"]
198
- }
199
-
200
- eval_run_data["associated_span_id"] = associated_span_id
201
- eval_run_data["span_data"] = judgment_data.get("span_data")
202
- eval_run_data["queued_at"] = time.time()
203
-
204
- return {
205
- "type": "evaluation_run",
206
- "data": eval_run_data,
207
- }
--- judgeval/common/tracer/trace_manager.py (file removed)
@@ -1,101 +0,0 @@
1
- from __future__ import annotations
2
- from typing import List, Optional, TYPE_CHECKING
3
-
4
- if TYPE_CHECKING:
5
- from judgeval.common.tracer import Tracer
6
-
7
- from judgeval.common.logger import judgeval_logger
8
- from judgeval.common.api import JudgmentApiClient
9
- from rich import print as rprint
10
-
11
-
12
- class TraceManagerClient:
13
- """
14
- Client for handling trace endpoints with the Judgment API
15
-
16
-
17
- Operations include:
18
- - Fetching a trace by id
19
- - Saving a trace
20
- - Deleting a trace
21
- """
22
-
23
- def __init__(
24
- self,
25
- judgment_api_key: str,
26
- organization_id: str,
27
- tracer: Optional[Tracer] = None,
28
- ):
29
- self.api_client = JudgmentApiClient(judgment_api_key, organization_id)
30
- self.tracer = tracer
31
-
32
- def fetch_trace(self, trace_id: str):
33
- """
34
- Fetch a trace by its id
35
- """
36
- return self.api_client.fetch_trace(trace_id)
37
-
38
- def upsert_trace(
39
- self,
40
- trace_data: dict,
41
- offline_mode: bool = False,
42
- show_link: bool = True,
43
- final_save: bool = True,
44
- ):
45
- """
46
- Upserts a trace to the Judgment API (always overwrites if exists).
47
-
48
- Args:
49
- trace_data: The trace data to upsert
50
- offline_mode: Whether running in offline mode
51
- show_link: Whether to show the UI link (for live tracing)
52
- final_save: Whether this is the final save (controls S3 saving)
53
-
54
- Returns:
55
- dict: Server response containing UI URL and other metadata
56
- """
57
-
58
- if self.tracer and self.tracer.use_s3 and final_save:
59
- try:
60
- s3_key = self.tracer.s3_storage.save_trace(
61
- trace_data=trace_data,
62
- trace_id=trace_data["trace_id"],
63
- project_name=trace_data["project_name"],
64
- )
65
- judgeval_logger.info(f"Trace also saved to S3 at key: {s3_key}")
66
- except Exception as e:
67
- judgeval_logger.warning(f"Failed to save trace to S3: {str(e)}")
68
-
69
- trace_data.pop("trace_spans", None)
70
- trace_data.pop("evaluation_runs", None)
71
-
72
- server_response = self.api_client.upsert_trace(trace_data)
73
-
74
- if (
75
- not offline_mode
76
- and show_link
77
- and "ui_results_url" in server_response
78
- and self.tracer.show_trace_urls
79
- ):
80
- pretty_str = f"\n🔍 You can view your trace data here: [rgb(106,0,255)][link={server_response['ui_results_url']}]View Trace[/link]\n"
81
- rprint(pretty_str)
82
-
83
- return server_response
84
-
85
- def delete_trace(self, trace_id: str):
86
- """
87
- Delete a trace from the database.
88
- """
89
- return self.api_client.delete_trace(trace_id)
90
-
91
- def delete_traces(self, trace_ids: List[str]):
92
- """
93
- Delete a batch of traces from the database.
94
- """
95
- return self.api_client.delete_traces(trace_ids)
96
-
97
- def delete_project(self, project_name: str):
98
- """
99
- Deletes a project from the server. Which also deletes all evaluations and traces associated with the project.
100
- """
101
- return self.api_client.delete_project(project_name)
--- judgeval/common/trainer/__init__.py (file removed)
@@ -1,5 +0,0 @@
1
- from .trainer import JudgmentTrainer
2
- from .config import TrainerConfig, ModelConfig
3
- from .trainable_model import TrainableModel
4
-
5
- __all__ = ["JudgmentTrainer", "TrainerConfig", "ModelConfig", "TrainableModel"]