judgeval 0.0.30__py3-none-any.whl → 0.0.32__py3-none-any.whl
This diff shows the changes between package versions that have been publicly released to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- judgeval/__init__.py +3 -1
- judgeval/common/tracer.py +352 -117
- judgeval/constants.py +5 -3
- judgeval/data/__init__.py +4 -0
- judgeval/data/custom_example.py +18 -0
- judgeval/data/datasets/dataset.py +5 -1
- judgeval/data/datasets/eval_dataset_client.py +64 -5
- judgeval/data/example.py +1 -0
- judgeval/data/result.py +7 -6
- judgeval/data/sequence.py +55 -0
- judgeval/data/sequence_run.py +44 -0
- judgeval/evaluation_run.py +12 -7
- judgeval/integrations/langgraph.py +89 -72
- judgeval/judgment_client.py +70 -68
- judgeval/run_evaluation.py +87 -13
- judgeval/scorers/__init__.py +2 -0
- judgeval/scorers/judgeval_scorer.py +3 -0
- judgeval/scorers/judgeval_scorers/__init__.py +7 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -1
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +21 -0
- judgeval/scorers/score.py +6 -5
- judgeval/version_check.py +22 -0
- {judgeval-0.0.30.dist-info → judgeval-0.0.32.dist-info}/METADATA +1 -1
- {judgeval-0.0.30.dist-info → judgeval-0.0.32.dist-info}/RECORD +26 -22
- judgeval/data/custom_api_example.py +0 -91
- {judgeval-0.0.30.dist-info → judgeval-0.0.32.dist-info}/WHEEL +0 -0
- {judgeval-0.0.30.dist-info → judgeval-0.0.32.dist-info}/licenses/LICENSE.md +0 -0
judgeval/integrations/langgraph.py
CHANGED
@@ -22,9 +22,8 @@ from langchain_core.documents import Document
 class JudgevalCallbackHandler(BaseCallbackHandler):
     def __init__(self, tracer: Tracer):
         self.tracer = tracer
-        self.trace_client = tracer.get_current_trace() if tracer.get_current_trace() else None
         self.previous_spans = [] # stack of previous spans
-        self.
+        self.created_trace = False

         # Attributes for users to access
         self.previous_node = None
@@ -33,43 +32,58 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         self.executed_tools = []

     def start_span(self, name: str, span_type: SpanType = "span"):
+        current_trace = self.tracer.get_current_trace()
         start_time = time.time()
+
+        # Generate a unique ID for *this specific span invocation*
+        span_id = str(uuid.uuid4())
+
+        parent_span_id = current_trace.get_current_span()
+        token = current_trace.set_current_span(span_id) # Set *this* span's ID as the current one
+
+        current_depth = 0
+        if parent_span_id and parent_span_id in current_trace._span_depths:
+            current_depth = current_trace._span_depths[parent_span_id] + 1

+        current_trace._span_depths[span_id] = current_depth # Store depth by span_id
         # Record span entry
-
+        current_trace.add_entry(TraceEntry(
             type="enter",
+            span_id=span_id,
+            trace_id=current_trace.trace_id,
+            parent_span_id=parent_span_id,
             function=name,
-            depth=
+            depth=current_depth,
             message=name,
-
+            created_at=start_time,
             span_type=span_type
         ))

-        self.
-        self.previous_spans.append(self.trace_client._current_span)
-        self.trace_client._current_span = name
+        self.previous_spans.append(token)
         self._start_time = start_time

-    def end_span(self,
-        self.
+    def end_span(self, span_type: SpanType = "span"):
+        current_trace = self.tracer.get_current_trace()
         duration = time.time() - self._start_time
+        span_id = current_trace.get_current_span()
+        exit_depth = current_trace._span_depths.get(span_id, 0) # Get depth using this span's ID

         # Record span exit
-
+        current_trace.add_entry(TraceEntry(
             type="exit",
-
-
-
-
+            span_id=span_id,
+            trace_id=current_trace.trace_id,
+            depth=exit_depth,
+            created_at=time.time(),
             duration=duration,
             span_type=span_type
         ))
-
-
-        if self.trace_client.tracer.depth == 0:
+        current_trace.reset_current_span(self.previous_spans.pop())
+        if exit_depth == 0:
             # Save the trace if we are the root, this is when users dont use any @observe decorators
-
-            self.
+            trace_id, trace_data = current_trace.save(overwrite=True)
+            self._trace_id = trace_id
+            current_trace = None

     def on_retriever_start(
         self,
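The rewritten start_span/end_span above stop mutating a shared trace_client and instead push the token returned by current_trace.set_current_span(span_id) onto previous_spans, popping it back into reset_current_span() on exit. The TraceClient internals are not part of this diff; the standalone sketch below only illustrates the contextvars-style token pattern that this save/restore logic appears to follow, and none of its names are judgeval APIs.

# Illustration only: how set/reset tokens nest, in the spirit of the new start_span/end_span.
import contextvars
import uuid
from typing import Optional

current_span: contextvars.ContextVar[Optional[str]] = contextvars.ContextVar("current_span", default=None)

def enter_span() -> contextvars.Token:
    span_id = str(uuid.uuid4())
    token = current_span.set(span_id)   # analogous to current_trace.set_current_span(span_id)
    return token

def exit_span(token: contextvars.Token) -> None:
    current_span.reset(token)           # analogous to current_trace.reset_current_span(token)

outer = enter_span()
inner = enter_span()
exit_span(inner)   # the outer span becomes "current" again
exit_span(outer)   # back to no active span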
@@ -85,9 +99,9 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         name = "RETRIEVER_CALL"
         if serialized and "name" in serialized:
             name = f"RETRIEVER_{serialized['name'].upper()}"
-
+        current_trace = self.tracer.get_current_trace()
         self.start_span(name, span_type="retriever")
-
+        current_trace.record_input({
             'query': query,
             'tags': tags,
             'metadata': metadata,
@@ -103,6 +117,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         **kwargs: Any
     ) -> Any:
         # Process the retrieved documents into a format suitable for logging
+        current_trace = self.tracer.get_current_trace()
         doc_summary = []
         for i, doc in enumerate(documents):
             # Extract key information from each document
@@ -114,13 +129,13 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
             doc_summary.append(doc_data)

         # Record the document data
-
+        current_trace.record_output({
             "document_count": len(documents),
             "documents": doc_summary
         })

         # End the retriever span
-        self.end_span(
+        self.end_span(span_type="retriever")

     def on_chain_start(
         self,
@@ -134,29 +149,26 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         **kwargs: Any
     ) -> None:
         # If the user doesnt use any @observe decorators, the first action in LangGraph workflows seems tohave this attribute, so we intialize our trace client here
+        current_trace = self.tracer.get_current_trace()
         if kwargs.get('name') == 'LangGraph':
-            if not
+            if not current_trace:
+                self.created_trace = True
                 trace_id = str(uuid.uuid4())
                 project = self.tracer.project_name
-                trace = TraceClient(self.tracer, trace_id,
-                self.
-                self.
-                # Only save empty trace for the root call
-                self.trace_client.save(empty_save=True, overwrite=False)
-
-            self.start_span("LangGraph", span_type="Main Function")
+                trace = TraceClient(self.tracer, trace_id, "Langgraph", project_name=project, overwrite=False, rules=self.tracer.rules, enable_monitoring=self.tracer.enable_monitoring, enable_evaluations=self.tracer.enable_evaluations)
+                self.tracer.set_current_trace(trace)
+                self.start_span("LangGraph", span_type="Main Function")

-
-        if node
-
-
-
-
-
-
-
-
-            self.previous_node = node
+        node = metadata.get("langgraph_node")
+        if node != None and node != self.previous_node:
+            self.start_span(node, span_type="node")
+            self.executed_node_tools.append(node)
+            self.executed_nodes.append(node)
+            current_trace.record_input({
+                'args': inputs,
+                'kwargs': kwargs
+            })
+        self.previous_node = node

     def on_chain_end(
         self,
@@ -167,14 +179,13 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         tags: Optional[List[str]] = None,
         **kwargs: Any,
     ) -> Any:
-
-        self.finished = True
+        current_trace = self.tracer.get_current_trace()
         if tags is not None and any("graph:step" in tag for tag in tags):
-
-            self.end_span(
+            current_trace.record_output(outputs)
+            self.end_span(span_type="node")

-
-
+        if self.created_trace and (outputs == "__end__" or (not kwargs and not tags)):
+            self.end_span(span_type="Main Function")

     def on_chain_error(
         self,
@@ -184,9 +195,9 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         parent_run_id: Optional[UUID] = None,
         **kwargs: Any,
     ) -> Any:
-
-
-        self.end_span(
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_output(error)
+        self.end_span(span_type="node")

     def on_tool_start(
         self,
@@ -199,19 +210,21 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
     ):
         name = serialized["name"]
         self.start_span(name, span_type="tool")
+        current_trace = self.tracer.get_current_trace()
         if name:
             # Track tool execution
-
+            current_trace.executed_tools.append(name)
             node_tool = f"{self.previous_node}:{name}" if self.previous_node else name
-
-
-
-
-
+            current_trace.executed_node_tools.append(node_tool)
+            current_trace.record_input({
+                'args': input_str,
+                'kwargs': kwargs
+            })

     def on_tool_end(self, output: Any, *, run_id: UUID, parent_run_id: Optional[UUID] = None, **kwargs: Any) -> Any:
-        self.
-
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_output(output)
+        self.end_span(span_type="tool")

     def on_tool_error(
         self,
|
|
221
234
|
parent_run_id: Optional[UUID] = None,
|
222
235
|
**kwargs: Any,
|
223
236
|
) -> Any:
|
224
|
-
|
225
|
-
|
226
|
-
self.end_span(
|
237
|
+
current_trace = self.tracer.get_current_trace()
|
238
|
+
current_trace.record_output(error)
|
239
|
+
self.end_span(span_type="tool")
|
227
240
|
|
228
241
|
def on_agent_action(
|
229
242
|
self,
|
@@ -233,7 +246,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
|
|
233
246
|
parent_run_id: Optional[UUID] = None,
|
234
247
|
**kwargs: Any,
|
235
248
|
) -> Any:
|
236
|
-
|
249
|
+
pass
|
237
250
|
|
238
251
|
def on_agent_finish(
|
239
252
|
self,
|
@@ -243,7 +256,8 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         parent_run_id: Optional[UUID] = None,
         **kwargs: Any,
     ) -> Any:
-
+
+        pass

     def on_llm_start(
         self,
@@ -256,14 +270,16 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
     ) -> Any:
         name = "LLM call"
         self.start_span(name, span_type="llm")
-        self.
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_input({
             'args': prompts,
             'kwargs': kwargs
         })

     def on_llm_end(self, response: LLMResult, *, run_id: UUID, parent_run_id: Optional[UUID] = None, **kwargs: Any):
-        self.
-
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_output(response.generations[0][0].text)
+        self.end_span(span_type="llm")

     def on_llm_error(
         self,
@@ -273,9 +289,9 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         parent_run_id: Optional[UUID] = None,
         **kwargs: Any,
     ) -> Any:
-
-
-        self.end_span(
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_output(error)
+        self.end_span(span_type="llm")

     def on_chat_model_start(
         self,
@@ -297,7 +313,8 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         name = "LLM call"

         self.start_span(name, span_type="llm")
-        self.
+        current_trace = self.tracer.get_current_trace()
+        current_trace.record_input({
             'args': str(messages),
             'kwargs': kwargs
         })
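Taken together, the handler changes above mean a trace is now created lazily on the first LangGraph chain start when no @observe-decorated caller has opened one, and the handler exposes the nodes and tools it saw as attributes. A minimal usage sketch follows, assuming a compiled LangGraph graph named `graph` and a Tracer constructed with only a project name; the class names and attribute names come from the hunks above, the Tracer constructor arguments and graph inputs are assumptions.

# Minimal sketch: attach the callback handler to a LangGraph run (assumes `graph` exists).
from judgeval.common.tracer import Tracer
from judgeval.integrations.langgraph import JudgevalCallbackHandler

tracer = Tracer(project_name="my_project")        # assumed constructor arguments
handler = JudgevalCallbackHandler(tracer)

# Standard LangChain callback wiring; the handler opens its own trace if none is active.
result = graph.invoke({"messages": []}, config={"callbacks": [handler]})

print(handler.executed_nodes)        # nodes visited, recorded in on_chain_start
print(handler.executed_node_tools)   # node/tool pairs recorded during the run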
judgeval/judgment_client.py
CHANGED
@@ -10,6 +10,8 @@ from judgeval.data.datasets import EvalDataset, EvalDatasetClient
 from judgeval.data import (
     ScoringResult,
     Example,
+    CustomExample,
+    Sequence,
 )
 from judgeval.scorers import (
     APIJudgmentScorer,
@@ -20,8 +22,10 @@ from judgeval.scorers import (
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
     run_eval,
-    assert_test
+    assert_test,
+    run_sequence_eval
 )
+from judgeval.data.sequence_run import SequenceRun
 from judgeval.judges import JudgevalJudge
 from judgeval.constants import (
     JUDGMENT_EVAL_FETCH_API_URL,
@@ -78,66 +82,58 @@ class JudgmentClient(metaclass=SingletonMeta):
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
         override: bool = False,
+        append: bool = False,
         use_judgment: bool = True,
         ignore_errors: bool = True,
         rules: Optional[List[Rule]] = None
     ) -> List[ScoringResult]:
-        return self.run_evaluation(examples, scorers, model, aggregator, metadata, log_results, project_name, eval_run_name, override, use_judgment, ignore_errors, True, rules)
+        return self.run_evaluation(examples, scorers, model, aggregator, metadata, log_results, project_name, eval_run_name, override, append, use_judgment, ignore_errors, True, rules)

-    def
-        self,
-
-        scorers: List[Union[ScorerWrapper, JudgevalScorer]],
+    def run_sequence_evaluation(
+        self,
+        sequences: List[Sequence],
         model: Union[str, List[str], JudgevalJudge],
+        scorers: List[Union[ScorerWrapper, JudgevalScorer]],
         aggregator: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-        log_results: bool = True,
         project_name: str = "default_project",
-        eval_run_name: str = "
-        override: bool = False,
+        eval_run_name: str = "default_eval_sequence",
         use_judgment: bool = True,
+        log_results: bool = True,
+        append: bool = False,
+        override: bool = False,
         ignore_errors: bool = True,
-        async_execution: bool = False,
         rules: Optional[List[Rule]] = None
     ) -> List[ScoringResult]:
-        """
-        Executes an evaluation of `Example`s using one or more `Scorer`s
-
-        Args:
-            examples (List[Example]): The examples to evaluate
-            scorers (List[Union[ScorerWrapper, JudgevalScorer]]): A list of scorers to use for evaluation
-            model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
-            aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
-            metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run
-            log_results (bool): Whether to log the results to the Judgment API
-            project_name (str): The name of the project the evaluation results belong to
-            eval_run_name (str): A name for this evaluation run
-            override (bool): Whether to override an existing evaluation run with the same name
-            use_judgment (bool): Whether to use Judgment API for evaluation
-            ignore_errors (bool): Whether to ignore errors during evaluation (safely handled)
-            rules (Optional[List[Rule]]): Rules to evaluate against scoring results
-
-        Returns:
-            List[ScoringResult]: The results of the evaluation
-        """
         try:
-
-            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = []
+            loaded_scorers = []
             for scorer in scorers:
                 try:
                     if isinstance(scorer, ScorerWrapper):
-                        loaded_scorers.append(scorer.load_implementation(
+                        loaded_scorers.append(scorer.load_implementation())
                     else:
                         loaded_scorers.append(scorer)
                 except Exception as e:
                     raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")

-
-
-
+            def get_all_sequences(root: Sequence) -> List[Sequence]:
+                all_sequences = [root]
+
+                for item in root.items:
+                    if isinstance(item, Sequence):
+                        all_sequences.extend(get_all_sequences(item))
+
+                return all_sequences
+
+            def flatten_sequence_list(sequences: List[Sequence]) -> List[Sequence]:
+                flattened = []
+                for seq in sequences:
+                    flattened.extend(get_all_sequences(seq))
+                return flattened
+
+            flattened_sequences = flatten_sequence_list(sequences)
+            for sequence in flattened_sequences:
+                sequence.scorers = loaded_scorers

-            # Convert ScorerWrapper in rules to their implementations
-            loaded_rules = None
             if rules:
                 loaded_rules = []
                 for rule in rules:
|
|
161
157
|
loaded_rules.append(new_rule)
|
162
158
|
except Exception as e:
|
163
159
|
raise ValueError(f"Failed to process rule '{rule.name}': {str(e)}")
|
164
|
-
|
165
|
-
|
166
|
-
log_results=log_results,
|
160
|
+
|
161
|
+
sequence_run = SequenceRun(
|
167
162
|
project_name=project_name,
|
168
163
|
eval_name=eval_run_name,
|
169
|
-
|
170
|
-
scorers=loaded_scorers,
|
164
|
+
sequences=sequences,
|
171
165
|
model=model,
|
172
166
|
aggregator=aggregator,
|
173
|
-
|
167
|
+
log_results=log_results,
|
168
|
+
append=append,
|
174
169
|
judgment_api_key=self.judgment_api_key,
|
175
|
-
rules=loaded_rules,
|
176
170
|
organization_id=self.organization_id
|
177
171
|
)
|
178
|
-
return
|
172
|
+
return run_sequence_eval(sequence_run, override, ignore_errors, use_judgment)
|
179
173
|
except ValueError as e:
|
180
|
-
raise ValueError(f"Please check your
|
174
|
+
raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: \n{str(e)}")
|
181
175
|
except Exception as e:
|
182
176
|
raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
|
183
|
-
|
184
|
-
def
|
177
|
+
|
178
|
+
def run_evaluation(
|
185
179
|
self,
|
186
|
-
|
180
|
+
examples: Union[List[Example], List[CustomExample]],
|
187
181
|
scorers: List[Union[ScorerWrapper, JudgevalScorer]],
|
188
182
|
model: Union[str, List[str], JudgevalJudge],
|
189
183
|
aggregator: Optional[str] = None,
|
190
184
|
metadata: Optional[Dict[str, Any]] = None,
|
191
|
-
project_name: str = "",
|
192
|
-
eval_run_name: str = "",
|
193
185
|
log_results: bool = True,
|
186
|
+
project_name: str = "default_project",
|
187
|
+
eval_run_name: str = "default_eval_run",
|
188
|
+
override: bool = False,
|
189
|
+
append: bool = False,
|
194
190
|
use_judgment: bool = True,
|
191
|
+
ignore_errors: bool = True,
|
192
|
+
async_execution: bool = False,
|
195
193
|
rules: Optional[List[Rule]] = None
|
196
194
|
) -> List[ScoringResult]:
|
197
195
|
"""
|
198
|
-
Executes an evaluation of
|
196
|
+
Executes an evaluation of `Example`s using one or more `Scorer`s
|
199
197
|
|
200
198
|
Args:
|
201
|
-
|
199
|
+
examples (Union[List[Example], List[CustomExample]]): The examples to evaluate
|
202
200
|
scorers (List[Union[ScorerWrapper, JudgevalScorer]]): A list of scorers to use for evaluation
|
203
201
|
model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
|
204
202
|
aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
|
205
203
|
metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run
|
204
|
+
log_results (bool): Whether to log the results to the Judgment API
|
206
205
|
project_name (str): The name of the project the evaluation results belong to
|
207
206
|
eval_run_name (str): A name for this evaluation run
|
208
|
-
|
207
|
+
override (bool): Whether to override an existing evaluation run with the same name
|
209
208
|
use_judgment (bool): Whether to use Judgment API for evaluation
|
209
|
+
ignore_errors (bool): Whether to ignore errors during evaluation (safely handled)
|
210
210
|
rules (Optional[List[Rule]]): Rules to evaluate against scoring results
|
211
211
|
|
212
212
|
Returns:
|
213
213
|
List[ScoringResult]: The results of the evaluation
|
214
214
|
"""
|
215
|
+
if override and append:
|
216
|
+
raise ValueError("Cannot set both override and append to True. Please choose one.")
|
217
|
+
|
215
218
|
try:
|
216
219
|
# Load appropriate implementations for all scorers
|
217
220
|
loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = []
|
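run_evaluation now takes an append flag alongside override, and the two are mutually exclusive: setting both raises the ValueError shown above. A hedged sketch of the new flags follows; the parameter names come from the signature in this hunk, while the Example fields and the scorer used are assumptions.

# Sketch only: appending results to an existing run instead of overriding it.
from judgeval.judgment_client import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import AnswerRelevancyScorer  # assumed export

client = JudgmentClient()
results = client.run_evaluation(
    examples=[Example(input="What is judgeval?", actual_output="An evaluation SDK.")],  # assumed fields
    scorers=[AnswerRelevancyScorer(threshold=0.5)],  # assumed constructor
    model="gpt-4o",
    project_name="default_project",
    eval_run_name="default_eval_run",
    append=True,      # add results to an existing run with this name
    override=False,   # append=True together with override=True raises ValueError
)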
@@ -253,12 +256,12 @@ class JudgmentClient(metaclass=SingletonMeta):
                         loaded_rules.append(new_rule)
                     except Exception as e:
                         raise ValueError(f"Failed to process rule '{rule.name}': {str(e)}")
-
-            evaluation_run = EvaluationRun(
+            eval = EvaluationRun(
                 log_results=log_results,
+                append=append,
                 project_name=project_name,
                 eval_name=eval_run_name,
-                examples=
+                examples=examples,
                 scorers=loaded_scorers,
                 model=model,
                 aggregator=aggregator,
@@ -267,7 +270,7 @@ class JudgmentClient(metaclass=SingletonMeta):
                 rules=loaded_rules,
                 organization_id=self.organization_id
             )
-            return run_eval(
+            return run_eval(eval, override, ignore_errors=ignore_errors, async_execution=async_execution)
         except ValueError as e:
             raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
         except Exception as e:
@@ -292,6 +295,12 @@ class JudgmentClient(metaclass=SingletonMeta):
         dataset.judgment_api_key = self.judgment_api_key
         return self.eval_dataset_client.push(dataset, alias, project_name, overwrite)

+    def append_dataset(self, alias: str, examples: List[Example], project_name: str) -> bool:
+        """
+        Appends an `EvalDataset` to the Judgment platform for storage.
+        """
+        return self.eval_dataset_client.append(alias, examples, project_name)
+
     def pull_dataset(self, alias: str, project_name: str) -> EvalDataset:
         """
         Retrieves a saved `EvalDataset` from the Judgment platform.
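The new append_dataset call adds examples to an already-stored dataset rather than overwriting it via push. A hedged sketch follows; the method signature comes from the hunk above, while the alias and Example fields are illustrative assumptions.

# Sketch only: appending examples to an existing dataset on the Judgment platform.
from judgeval.judgment_client import JudgmentClient
from judgeval.data import Example

client = JudgmentClient()
ok = client.append_dataset(
    alias="my-dataset",                                               # hypothetical dataset alias
    examples=[Example(input="New case", actual_output="New answer")], # assumed fields
    project_name="default_project",
)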
@@ -355,14 +364,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         if eval_run.status_code != requests.codes.ok:
             raise ValueError(f"Error fetching eval results: {eval_run.json()}")

-
-        for result in eval_run.json():
-            result_id = result.get("id", "")
-            result_data = result.get("result", dict())
-            filtered_result = {k: v for k, v in result_data.items() if k in ScoringResult.__annotations__}
-            eval_run_result[0]["id"] = result_id
-            eval_run_result[0]["results"] = [ScoringResult(**filtered_result)]
-        return eval_run_result
+        return eval_run.json()

     def delete_eval(self, project_name: str, eval_run_names: List[str]) -> bool:
         """