judgeval 0.0.29__py3-none-any.whl → 0.0.31__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
- judgeval/common/tracer.py +93 -55
- judgeval/constants.py +4 -2
- judgeval/data/__init__.py +4 -0
- judgeval/data/custom_example.py +18 -0
- judgeval/data/datasets/eval_dataset_client.py +62 -3
- judgeval/data/example.py +1 -0
- judgeval/data/result.py +7 -6
- judgeval/data/sequence.py +59 -0
- judgeval/data/sequence_run.py +42 -0
- judgeval/evaluation_run.py +12 -7
- judgeval/integrations/langgraph.py +89 -72
- judgeval/judgment_client.py +77 -14
- judgeval/run_evaluation.py +87 -13
- judgeval/scorers/__init__.py +2 -0
- judgeval/scorers/judgeval_scorer.py +3 -0
- judgeval/scorers/judgeval_scorers/__init__.py +7 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -1
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +21 -0
- judgeval/scorers/score.py +6 -5
- {judgeval-0.0.29.dist-info → judgeval-0.0.31.dist-info}/METADATA +1 -1
- {judgeval-0.0.29.dist-info → judgeval-0.0.31.dist-info}/RECORD +23 -20
- judgeval/data/custom_api_example.py +0 -91
- {judgeval-0.0.29.dist-info → judgeval-0.0.31.dist-info}/WHEEL +0 -0
- {judgeval-0.0.29.dist-info → judgeval-0.0.31.dist-info}/licenses/LICENSE.md +0 -0
```diff
@@ -22,9 +22,8 @@ from langchain_core.documents import Document
 class JudgevalCallbackHandler(BaseCallbackHandler):
     def __init__(self, tracer: Tracer):
         self.tracer = tracer
-        self.trace_client = tracer.get_current_trace() if tracer.get_current_trace() else None
         self.previous_spans = []  # stack of previous spans
-        self.
+        self.created_trace = False
 
         # Attributes for users to access
         self.previous_node = None
@@ -33,43 +32,58 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         self.executed_tools = []
 
     def start_span(self, name: str, span_type: SpanType = "span"):
+        current_trace = self.tracer.get_current_trace()
         start_time = time.time()
+
+        # Generate a unique ID for *this specific span invocation*
+        span_id = str(uuid.uuid4())
+
+        parent_span_id = current_trace.get_current_span()
+        token = current_trace.set_current_span(span_id)  # Set *this* span's ID as the current one
+
+        current_depth = 0
+        if parent_span_id and parent_span_id in current_trace._span_depths:
+            current_depth = current_trace._span_depths[parent_span_id] + 1
 
+        current_trace._span_depths[span_id] = current_depth  # Store depth by span_id
         # Record span entry
-
+        current_trace.add_entry(TraceEntry(
             type="enter",
+            span_id=span_id,
+            trace_id=current_trace.trace_id,
+            parent_span_id=parent_span_id,
             function=name,
-            depth=
+            depth=current_depth,
             message=name,
-
+            created_at=start_time,
             span_type=span_type
         ))
 
-        self.
-        self.previous_spans.append(self.trace_client._current_span)
-        self.trace_client._current_span = name
+        self.previous_spans.append(token)
         self._start_time = start_time
 
-    def end_span(self,
-        self.
+    def end_span(self, span_type: SpanType = "span"):
+        current_trace = self.tracer.get_current_trace()
         duration = time.time() - self._start_time
+        span_id = current_trace.get_current_span()
+        exit_depth = current_trace._span_depths.get(span_id, 0)  # Get depth using this span's ID
 
         # Record span exit
-
+        current_trace.add_entry(TraceEntry(
             type="exit",
-
-
-
-
+            span_id=span_id,
+            trace_id=current_trace.trace_id,
+            depth=exit_depth,
+            created_at=time.time(),
             duration=duration,
             span_type=span_type
         ))
-
-
-        if self.trace_client.tracer.depth == 0:
+        current_trace.reset_current_span(self.previous_spans.pop())
+        if exit_depth == 0:
             # Save the trace if we are the root, this is when users dont use any @observe decorators
-
-            self.
+            trace_id, trace_data = current_trace.save(overwrite=True)
+            self._trace_id = trace_id
+            current_trace = None
 
     def on_retriever_start(
         self,
@@ -85,9 +99,9 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         name = "RETRIEVER_CALL"
         if serialized and "name" in serialized:
             name = f"RETRIEVER_{serialized['name'].upper()}"
-
+        current_trace = self.tracer.get_current_trace()
         self.start_span(name, span_type="retriever")
-
+        current_trace.record_input({
             'query': query,
             'tags': tags,
             'metadata': metadata,
@@ -103,6 +117,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         **kwargs: Any
     ) -> Any:
         # Process the retrieved documents into a format suitable for logging
+        current_trace = self.tracer.get_current_trace()
         doc_summary = []
         for i, doc in enumerate(documents):
             # Extract key information from each document
@@ -114,13 +129,13 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
             doc_summary.append(doc_data)
 
         # Record the document data
-
+        current_trace.record_output({
             "document_count": len(documents),
             "documents": doc_summary
         })
 
         # End the retriever span
-        self.end_span(
+        self.end_span(span_type="retriever")
 
     def on_chain_start(
         self,
@@ -134,29 +149,26 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         **kwargs: Any
     ) -> None:
         # If the user doesnt use any @observe decorators, the first action in LangGraph workflows seems tohave this attribute, so we intialize our trace client here
+        current_trace = self.tracer.get_current_trace()
         if kwargs.get('name') == 'LangGraph':
-            if not
+            if not current_trace:
+                self.created_trace = True
                 trace_id = str(uuid.uuid4())
                 project = self.tracer.project_name
-                trace = TraceClient(self.tracer, trace_id,
-                self.
-                self.
-                # Only save empty trace for the root call
-                self.trace_client.save(empty_save=True, overwrite=False)
-
-            self.start_span("LangGraph", span_type="Main Function")
+                trace = TraceClient(self.tracer, trace_id, "Langgraph", project_name=project, overwrite=False, rules=self.tracer.rules, enable_monitoring=self.tracer.enable_monitoring, enable_evaluations=self.tracer.enable_evaluations)
+                self.tracer.set_current_trace(trace)
+            self.start_span("LangGraph", span_type="Main Function")
 
-
-        if node
-
-
-
-
-
-
-
-            self.previous_node = node
+        node = metadata.get("langgraph_node")
+        if node != None and node != self.previous_node:
+            self.start_span(node, span_type="node")
+            self.executed_node_tools.append(node)
+            self.executed_nodes.append(node)
+            current_trace.record_input({
+                'args': inputs,
+                'kwargs': kwargs
+            })
+        self.previous_node = node
 
     def on_chain_end(
         self,
@@ -167,14 +179,13 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         tags: Optional[List[str]] = None,
         **kwargs: Any,
     ) -> Any:
-
-        self.finished = True
+        current_trace = self.tracer.get_current_trace()
         if tags is not None and any("graph:step" in tag for tag in tags):
-
-            self.end_span(
+            current_trace.record_output(outputs)
+            self.end_span(span_type="node")
 
-
-
+        if self.created_trace and (outputs == "__end__" or (not kwargs and not tags)):
+            self.end_span(span_type="Main Function")
 
     def on_chain_error(
         self,
@@ -184,9 +195,9 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         parent_run_id: Optional[UUID] = None,
         **kwargs: Any,
     ) -> Any:
-
-
-        self.end_span(
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_output(error)
+        self.end_span(span_type="node")
 
     def on_tool_start(
         self,
@@ -199,19 +210,21 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
     ):
         name = serialized["name"]
         self.start_span(name, span_type="tool")
+        current_trace = self.tracer.get_current_trace()
         if name:
             # Track tool execution
-
+            current_trace.executed_tools.append(name)
             node_tool = f"{self.previous_node}:{name}" if self.previous_node else name
-
-
-
-
-
+            current_trace.executed_node_tools.append(node_tool)
+            current_trace.record_input({
+                'args': input_str,
+                'kwargs': kwargs
+            })
 
     def on_tool_end(self, output: Any, *, run_id: UUID, parent_run_id: Optional[UUID] = None, **kwargs: Any) -> Any:
-        self.
-
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_output(output)
+        self.end_span(span_type="tool")
 
     def on_tool_error(
         self,
@@ -221,9 +234,9 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         parent_run_id: Optional[UUID] = None,
         **kwargs: Any,
     ) -> Any:
-
-
-        self.end_span(
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_output(error)
+        self.end_span(span_type="tool")
 
     def on_agent_action(
         self,
@@ -233,7 +246,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         parent_run_id: Optional[UUID] = None,
         **kwargs: Any,
     ) -> Any:
-
+        pass
 
     def on_agent_finish(
         self,
@@ -243,7 +256,8 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         parent_run_id: Optional[UUID] = None,
         **kwargs: Any,
     ) -> Any:
-
+
+        pass
 
     def on_llm_start(
         self,
@@ -256,14 +270,16 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
     ) -> Any:
         name = "LLM call"
         self.start_span(name, span_type="llm")
-        self.
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_input({
             'args': prompts,
             'kwargs': kwargs
         })
 
     def on_llm_end(self, response: LLMResult, *, run_id: UUID, parent_run_id: Optional[UUID] = None, **kwargs: Any):
-        self.
-
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_output(response.generations[0][0].text)
+        self.end_span(span_type="llm")
 
     def on_llm_error(
         self,
@@ -273,9 +289,9 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         parent_run_id: Optional[UUID] = None,
         **kwargs: Any,
     ) -> Any:
-
-
-        self.end_span(
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_output(error)
+        self.end_span(span_type="llm")
 
     def on_chat_model_start(
         self,
@@ -297,7 +313,8 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         name = "LLM call"
 
         self.start_span(name, span_type="llm")
-        self.
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_input({
             'args': str(messages),
             'kwargs': kwargs
         })
```
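The callback handler changes above (judgeval/integrations/langgraph.py) make the handler resolve the active trace through `tracer.get_current_trace()` on every callback, and create and save its own trace when a LangGraph run starts without any `@observe` decorator. A minimal usage sketch, assuming a configured `Tracer` and a compiled LangGraph graph; `build_graph`, the `Tracer` constructor arguments, and the input payload are placeholders, not part of this diff:

```python
from judgeval.common.tracer import Tracer
from judgeval.integrations.langgraph import JudgevalCallbackHandler

tracer = Tracer(project_name="my_project")   # assumed constructor arguments
handler = JudgevalCallbackHandler(tracer)

graph = build_graph()  # any compiled LangGraph graph (placeholder)

# Standard LangChain/LangGraph callback wiring: on_chain_start sees
# name == "LangGraph", finds no current trace, and creates one itself.
result = graph.invoke({"input": "hello"}, config={"callbacks": [handler]})

# Attributes the handler tracks during the run (per the diff above)
print(handler.executed_nodes, handler.executed_node_tools)
```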
judgeval/judgment_client.py
CHANGED
```diff
@@ -10,6 +10,8 @@ from judgeval.data.datasets import EvalDataset, EvalDatasetClient
 from judgeval.data import (
     ScoringResult,
     Example,
+    CustomExample,
+    Sequence,
 )
 from judgeval.scorers import (
     APIJudgmentScorer,
@@ -20,8 +22,10 @@ from judgeval.scorers import (
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
     run_eval,
-    assert_test
+    assert_test,
+    run_sequence_eval
 )
+from judgeval.data.sequence_run import SequenceRun
 from judgeval.judges import JudgevalJudge
 from judgeval.constants import (
     JUDGMENT_EVAL_FETCH_API_URL,
@@ -78,15 +82,71 @@ class JudgmentClient(metaclass=SingletonMeta):
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
         override: bool = False,
+        append: bool = False,
         use_judgment: bool = True,
         ignore_errors: bool = True,
         rules: Optional[List[Rule]] = None
     ) -> List[ScoringResult]:
-        return self.run_evaluation(examples, scorers, model, aggregator, metadata, log_results, project_name, eval_run_name, override, use_judgment, ignore_errors, True, rules)
+        return self.run_evaluation(examples, scorers, model, aggregator, metadata, log_results, project_name, eval_run_name, override, append, use_judgment, ignore_errors, True, rules)
+
+    def run_sequence_evaluation(
+        self,
+        sequences: List[Sequence],
+        model: Union[str, List[str], JudgevalJudge],
+        aggregator: Optional[str] = None,
+        project_name: str = "default_project",
+        eval_run_name: str = "default_eval_sequence",
+        use_judgment: bool = True,
+        log_results: bool = True,
+        override: bool = False,
+        ignore_errors: bool = True,
+        rules: Optional[List[Rule]] = None
+    ) -> List[ScoringResult]:
+        try:
+            if rules:
+                loaded_rules = []
+                for rule in rules:
+                    try:
+                        processed_conditions = []
+                        for condition in rule.conditions:
+                            # Convert metric if it's a ScorerWrapper
+                            if isinstance(condition.metric, ScorerWrapper):
+                                try:
+                                    condition_copy = condition.model_copy()
+                                    condition_copy.metric = condition.metric.load_implementation(use_judgment=use_judgment)
+                                    processed_conditions.append(condition_copy)
+                                except Exception as e:
+                                    raise ValueError(f"Failed to convert ScorerWrapper to implementation in rule '{rule.name}', condition metric '{condition.metric}': {str(e)}")
+                            else:
+                                processed_conditions.append(condition)
+
+                        # Create new rule with processed conditions
+                        new_rule = rule.model_copy()
+                        new_rule.conditions = processed_conditions
+                        loaded_rules.append(new_rule)
+                    except Exception as e:
+                        raise ValueError(f"Failed to process rule '{rule.name}': {str(e)}")
+
+            sequence_run = SequenceRun(
+                project_name=project_name,
+                eval_name=eval_run_name,
+                sequences=sequences,
+                model=model,
+                aggregator=aggregator,
+                log_results=log_results,
+                judgment_api_key=self.judgment_api_key,
+                organization_id=self.organization_id
+            )
+
+            return run_sequence_eval(sequence_run, override, ignore_errors, use_judgment)
+        except ValueError as e:
+            raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: \n{str(e)}")
+        except Exception as e:
+            raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
 
     def run_evaluation(
         self,
-        examples: List[Example],
+        examples: Union[List[Example], List[CustomExample]],
         scorers: List[Union[ScorerWrapper, JudgevalScorer]],
         model: Union[str, List[str], JudgevalJudge],
         aggregator: Optional[str] = None,
@@ -95,6 +155,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
         override: bool = False,
+        append: bool = False,
         use_judgment: bool = True,
         ignore_errors: bool = True,
         async_execution: bool = False,
@@ -104,7 +165,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         Executes an evaluation of `Example`s using one or more `Scorer`s
 
         Args:
-            examples (List[Example]): The examples to evaluate
+            examples (Union[List[Example], List[CustomExample]]): The examples to evaluate
             scorers (List[Union[ScorerWrapper, JudgevalScorer]]): A list of scorers to use for evaluation
             model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
             aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
@@ -120,6 +181,9 @@ class JudgmentClient(metaclass=SingletonMeta):
         Returns:
             List[ScoringResult]: The results of the evaluation
         """
+        if override and append:
+            raise ValueError("Cannot set both override and append to True. Please choose one.")
+
         try:
             # Load appropriate implementations for all scorers
             loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = []
@@ -161,9 +225,9 @@ class JudgmentClient(metaclass=SingletonMeta):
                        loaded_rules.append(new_rule)
                    except Exception as e:
                        raise ValueError(f"Failed to process rule '{rule.name}': {str(e)}")
-
            eval = EvaluationRun(
                log_results=log_results,
+                append=append,
                project_name=project_name,
                eval_name=eval_run_name,
                examples=examples,
@@ -180,7 +244,7 @@ class JudgmentClient(metaclass=SingletonMeta):
             raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
         except Exception as e:
             raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
-
+
     def evaluate_dataset(
         self,
         dataset: EvalDataset,
@@ -292,6 +356,12 @@ class JudgmentClient(metaclass=SingletonMeta):
         dataset.judgment_api_key = self.judgment_api_key
         return self.eval_dataset_client.push(dataset, alias, project_name, overwrite)
 
+    def append_dataset(self, alias: str, examples: List[Example], project_name: str) -> bool:
+        """
+        Appends an `EvalDataset` to the Judgment platform for storage.
+        """
+        return self.eval_dataset_client.append(alias, examples, project_name)
+
     def pull_dataset(self, alias: str, project_name: str) -> EvalDataset:
         """
         Retrieves a saved `EvalDataset` from the Judgment platform.
@@ -355,14 +425,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         if eval_run.status_code != requests.codes.ok:
             raise ValueError(f"Error fetching eval results: {eval_run.json()}")
 
-
-        for result in eval_run.json():
-            result_id = result.get("id", "")
-            result_data = result.get("result", dict())
-            filtered_result = {k: v for k, v in result_data.items() if k in ScoringResult.__annotations__}
-            eval_run_result[0]["id"] = result_id
-            eval_run_result[0]["results"] = [ScoringResult(**filtered_result)]
-        return eval_run_result
+        return eval_run.json()
 
     def delete_eval(self, project_name: str, eval_run_names: List[str]) -> bool:
         """
```
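Taken together, the client changes add an `append` flag alongside `override`, a new `run_sequence_evaluation` entry point, and an `append_dataset` helper. A rough usage sketch of the new `JudgmentClient` surface; `examples`, `scorers`, `sequences`, and the judge model name are placeholders, and the client is assumed to pick up credentials from its usual configuration:

```python
from judgeval.judgment_client import JudgmentClient

client = JudgmentClient()  # credentials assumed to be configured elsewhere

# `append=True` adds results to an existing run; combining it with
# `override=True` now raises a ValueError.
results = client.run_evaluation(
    examples=examples,            # List[Example] or List[CustomExample] (placeholder)
    scorers=scorers,              # placeholder list of scorers
    model="gpt-4o",               # placeholder judge model
    eval_run_name="nightly_run",
    append=True,
)

# New sequence-level evaluation; `sequences` is a List[Sequence] whose schema
# lives in judgeval/data/sequence.py, which this diff only summarizes.
seq_results = client.run_sequence_evaluation(
    sequences=sequences,
    model="gpt-4o",
    eval_run_name="sequence_run",
)

# New dataset helper: append examples to an existing dataset alias.
client.append_dataset(alias="my_dataset", examples=examples, project_name="default_project")
```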
judgeval/run_evaluation.py
CHANGED
```diff
@@ -4,14 +4,15 @@ import time
 import sys
 import itertools
 import threading
-from typing import List, Dict, Any
+from typing import List, Dict, Any, Union
 from datetime import datetime
 from rich import print as rprint
 
 from judgeval.data import (
     ScorerData,
     ScoringResult,
-    Example
+    Example,
+    CustomExample
 )
 from judgeval.scorers import (
     JudgevalScorer,
@@ -22,6 +23,7 @@ from judgeval.scorers.score import a_execute_scoring
 from judgeval.constants import (
     ROOT_API,
     JUDGMENT_EVAL_API_URL,
+    JUDGMENT_SEQUENCE_EVAL_API_URL,
     JUDGMENT_EVAL_LOG_API_URL,
     MAX_CONCURRENT_EVALUATIONS,
     JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL
@@ -34,7 +36,7 @@ from judgeval.common.logger import (
     example_logging_context
 )
 from judgeval.evaluation_run import EvaluationRun
-
+from judgeval.data.sequence_run import SequenceRun
 
 def send_to_rabbitmq(evaluation_run: EvaluationRun) -> None:
     """
@@ -91,6 +93,36 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
         raise JudgmentAPIError(error_message)
     return response_data
 
+def execute_api_sequence_eval(sequence_run: SequenceRun) -> List[Dict]:
+    """
+    Executes an evaluation of a list of `Example`s using one or more `JudgmentScorer`s via the Judgment API.
+    """
+
+    try:
+        # submit API request to execute evals
+        payload = sequence_run.model_dump(warnings=False)
+        response = requests.post(
+            JUDGMENT_SEQUENCE_EVAL_API_URL,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {sequence_run.judgment_api_key}",
+                "X-Organization-Id": sequence_run.organization_id
+            },
+            json=payload,
+            verify=True
+        )
+        response_data = response.json()
+    except Exception as e:
+        error(f"Error: {e}")
+        details = response.json().get("detail", "No details provided")
+        raise JudgmentAPIError("An error occurred while executing the Judgment API request: " + details)
+    # Check if the response status code is not 2XX
+    # Add check for the duplicate eval run name
+    if not response.ok:
+        error_message = response_data.get('detail', 'An unknown error occurred.')
+        error(f"Error: {error_message=}")
+        raise JudgmentAPIError(error_message)
+    return response_data
 
 def merge_results(api_results: List[ScoringResult], local_results: List[ScoringResult]) -> List[ScoringResult]:
     """
@@ -197,8 +229,8 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
     )
 
     if response.status_code == 409:
-        error(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name or set the `
-        raise ValueError(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name or set the `
+        error(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true.")
+        raise ValueError(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true.")
 
     if not response.ok:
         response_data = response.json()
@@ -211,7 +243,7 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
         raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
 
 
-def log_evaluation_results(merged_results: List[ScoringResult],
+def log_evaluation_results(merged_results: List[ScoringResult], run: Union[EvaluationRun, SequenceRun]) -> str:
     """
     Logs evaluation results to the Judgment API database.
 
@@ -228,13 +260,12 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
             JUDGMENT_EVAL_LOG_API_URL,
             headers={
                 "Content-Type": "application/json",
-                "Authorization": f"Bearer {
-                "X-Organization-Id":
+                "Authorization": f"Bearer {run.judgment_api_key}",
+                "X-Organization-Id": run.organization_id
             },
             json={
-                "results": [result.
-                "
-                "eval_name": evaluation_run.eval_name,
+                "results": [result.model_dump(warnings=False) for result in merged_results],
+                "run": run.model_dump(warnings=False)
             },
             verify=True
         )
@@ -303,6 +334,42 @@ def check_examples(examples: List[Example], scorers: List[APIJudgmentScorer]) ->
             # Example ID (usually random UUID) does not provide any helpful information for the user but printing the entire example is overdoing it
             print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
 
+def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True, async_execution: bool = False) -> List[ScoringResult]:
+    # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
+    if not override and sequence_run.log_results:
+        check_eval_run_name_exists(
+            sequence_run.eval_name,
+            sequence_run.project_name,
+            sequence_run.judgment_api_key,
+            sequence_run.organization_id
+        )
+
+    # Execute evaluation using Judgment API
+    info("Starting API evaluation")
+    try:  # execute an EvaluationRun with just JudgmentScorers
+        debug("Sending request to Judgment API")
+        response_data: List[Dict] = run_with_spinner("Running Sequence Evaluation: ", execute_api_sequence_eval, sequence_run)
+
+        info(f"Received {len(response_data['results'])} results from API")
+    except JudgmentAPIError as e:
+        error(f"An error occurred while executing the Judgment API request: {str(e)}")
+        raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
+    except ValueError as e:
+        raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: {str(e)}")
+
+    # Convert the response data to `ScoringResult` objects
+    debug("Processing API results")
+    api_results = []
+    for result in response_data["results"]:
+        api_results.append(ScoringResult(**result))
+
+    # TODO: allow for custom scorer on sequences
+    if sequence_run.log_results:
+        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, api_results, sequence_run)
+        rprint(pretty_str)
+
+
+
 def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_errors: bool = True, async_execution: bool = False) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s
@@ -329,7 +396,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
     """
 
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and evaluation_run.log_results:
+    if not override and evaluation_run.log_results and not evaluation_run.append:
        check_eval_run_name_exists(
            evaluation_run.eval_name,
            evaluation_run.project_name,
@@ -373,12 +440,20 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
             local_scorers.append(scorer)
             debug(f"Added local scorer: {type(scorer).__name__}")
 
+    custom_example_check = [scorer.custom_example for scorer in local_scorers]
+    if any(custom_example_check) and not all(custom_example_check):
+        error("All scorers must be custom scorers if using custom examples")
+        raise ValueError("All scorers must be custom scorers if using custom examples")
+
     debug(f"Found {len(judgment_scorers)} judgment scorers and {len(local_scorers)} local scorers")
 
     api_results: List[ScoringResult] = []
     local_results: List[ScoringResult] = []
 
     if async_execution:
+        if len(local_scorers) > 0:
+            error("Local scorers are not supported in async execution")
+
         check_examples(evaluation_run.examples, evaluation_run.scorers)
         info("Starting async evaluation")
         payload = evaluation_run.model_dump(warnings=False)
@@ -396,7 +471,6 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_error
     else:
         if judgment_scorers:
             # Execute evaluation using Judgment API
-            check_examples(evaluation_run.examples, evaluation_run.scorers)
             info("Starting API evaluation")
             debug(f"Creating API evaluation run with {len(judgment_scorers)} scorers")
             try:  # execute an EvaluationRun with just JudgmentScorers
```
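For callers that bypass `JudgmentClient`, the new `run_sequence_eval` can be driven directly with a `SequenceRun`, mirroring how `run_eval` takes an `EvaluationRun`. A sketch using only the fields the client passes in this diff; `sequences`, `api_key`, and `org_id` are placeholders, and `SequenceRun`'s full schema lives in judgeval/data/sequence_run.py, which this diff only summarizes:

```python
from judgeval.data.sequence_run import SequenceRun
from judgeval.run_evaluation import run_sequence_eval

sequence_run = SequenceRun(
    project_name="default_project",
    eval_name="sequence_smoke_test",
    sequences=sequences,          # List[Sequence], built elsewhere (placeholder)
    model="gpt-4o",               # placeholder judge model
    aggregator=None,
    log_results=True,
    judgment_api_key=api_key,     # placeholder credentials
    organization_id=org_id,
)

# Checks for a duplicate run name (unless override=True), calls the new
# JUDGMENT_SEQUENCE_EVAL_API_URL endpoint, and logs results when log_results=True.
run_sequence_eval(sequence_run, override=False)
```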
judgeval/scorers/__init__.py
CHANGED
```diff
@@ -17,6 +17,7 @@ from judgeval.scorers.judgeval_scorers import (
     ComparisonScorer,
     InstructionAdherenceScorer,
     GroundednessScorer,
+    DerailmentScorer,
 )
 
 __all__ = [
@@ -39,4 +40,5 @@ __all__ = [
     "ComparisonScorer",
     "InstructionAdherenceScorer",
     "GroundednessScorer",
+    "DerailmentScorer",
 ]
```