judgeval 0.0.52__py3-none-any.whl → 0.0.54__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/logger.py +46 -199
- judgeval/common/s3_storage.py +2 -6
- judgeval/common/tracer.py +182 -262
- judgeval/common/utils.py +16 -36
- judgeval/constants.py +14 -20
- judgeval/data/__init__.py +0 -2
- judgeval/data/datasets/dataset.py +6 -10
- judgeval/data/datasets/eval_dataset_client.py +25 -27
- judgeval/data/example.py +5 -138
- judgeval/data/judgment_types.py +214 -0
- judgeval/data/result.py +7 -25
- judgeval/data/scorer_data.py +28 -40
- judgeval/data/scripts/fix_default_factory.py +23 -0
- judgeval/data/scripts/openapi_transform.py +123 -0
- judgeval/data/tool.py +3 -54
- judgeval/data/trace.py +31 -50
- judgeval/data/trace_run.py +3 -3
- judgeval/evaluation_run.py +16 -23
- judgeval/integrations/langgraph.py +11 -12
- judgeval/judges/litellm_judge.py +3 -6
- judgeval/judges/mixture_of_judges.py +8 -25
- judgeval/judges/together_judge.py +3 -6
- judgeval/judgment_client.py +22 -24
- judgeval/rules.py +7 -19
- judgeval/run_evaluation.py +79 -242
- judgeval/scorers/__init__.py +4 -20
- judgeval/scorers/agent_scorer.py +21 -0
- judgeval/scorers/api_scorer.py +28 -38
- judgeval/scorers/base_scorer.py +98 -0
- judgeval/scorers/example_scorer.py +19 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +10 -17
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +9 -24
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +16 -68
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +4 -12
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +10 -17
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +18 -14
- judgeval/scorers/score.py +45 -330
- judgeval/scorers/utils.py +6 -88
- judgeval/utils/file_utils.py +4 -6
- judgeval/version_check.py +3 -2
- {judgeval-0.0.52.dist-info → judgeval-0.0.54.dist-info}/METADATA +6 -5
- judgeval-0.0.54.dist-info/RECORD +65 -0
- judgeval/data/custom_example.py +0 -19
- judgeval/scorers/judgeval_scorer.py +0 -177
- judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +0 -45
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -29
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -29
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -32
- judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +0 -28
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -38
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -27
- judgeval/scorers/prompt_scorer.py +0 -296
- judgeval-0.0.52.dist-info/RECORD +0 -69
- {judgeval-0.0.52.dist-info → judgeval-0.0.54.dist-info}/WHEEL +0 -0
- {judgeval-0.0.52.dist-info → judgeval-0.0.54.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/score.py
CHANGED
@@ -1,303 +1,72 @@
 """
-Infrastructure for executing evaluations of `Example`s using one or more `
+Infrastructure for executing evaluations of `Example`s using one or more `BaseScorer`s.
 """

 import asyncio
 import time
 from tqdm.asyncio import tqdm_asyncio
 from typing import List, Union, Optional, Callable
-from rich.progress import Progress, SpinnerColumn, TextColumn

 from judgeval.data import (
     Example,
-    CustomExample,
     ScoringResult,
     generate_scoring_result,
     create_scorer_data,
 )
-from judgeval.scorers import
-from judgeval.scorers.utils import clone_scorers
-from judgeval.common.
-from judgeval.common.logger import example_logging_context, debug, error, warning, info
+from judgeval.scorers import BaseScorer
+from judgeval.scorers.utils import clone_scorers
+from judgeval.common.logger import judgeval_logger
 from judgeval.judges import JudgevalJudge


 async def safe_a_score_example(
-    scorer:
+    scorer: BaseScorer,
     example: Example,
-    ignore_errors: bool,
-    skip_on_missing_params: bool,
 ):
     """
     Scoring task function when not using a progress indicator!
-    "Safely" scores an `Example` using a `
+    "Safely" scores an `Example` using a `BaseScorer` by gracefully handling any exceptions that may occur.

     Args:
-        scorer (
+        scorer (BaseScorer): The `BaseScorer` to use for scoring the example.
         example (Example): The `Example` to be scored.

         ignore_errors (bool): Whether to ignore errors during the evaluation.
             If set to false, any error will be raised and stop the evaluation.
-            If set to true, the error will be stored in the `error` attribute of the `
+            If set to true, the error will be stored in the `error` attribute of the `BaseScorer` and the `success` attribute will be set to False.

         skip_on_missing_params (bool): Whether to skip the test case if required parameters are missing.
     """
-    debug(f"Starting safe_a_score_example for example {example.example_id}")
     try:
-        await scorer.a_score_example(example
-
-    except MissingTestCaseParamsError as e:
-        if (
-            skip_on_missing_params
-        ):  # Skip the example if the scorer requires parameters that are missing
-            with example_logging_context(example.created_at, example.example_id):
-                warning(
-                    f"Skipping example {example.example_id} due to missing parameters"
-                )
-            scorer.skipped = True
-            return
-        else:
-            if (
-                ignore_errors
-            ):  # Gracefully handle the error, does not stop the evaluation
-                scorer.error = str(e)
-                scorer.success = False
-                with example_logging_context(example.created_at, example.example_id):
-                    warning(
-                        f"Ignoring errors for example {example.example_id}: {str(e)} due to missing parameters"
-                    )
-            else:  # Raise the error and stop the evaluation
-                with example_logging_context(example.created_at, example.example_id):
-                    error(
-                        f"Stopping example {example.example_id}: {str(e)} due to missing parameters"
-                    )
-                raise
-    except TypeError:  # in case a_score_example does not accept _show_indicator
-        try:
-            await scorer.a_score_example(example)
-        except MissingTestCaseParamsError as e:
-            if skip_on_missing_params:
-                scorer.skipped = True
-                with example_logging_context(example.created_at, example.example_id):
-                    warning(
-                        f"Skipping example {example.example_id} due to missing parameters"
-                    )
-                return
-            else:
-                if ignore_errors:
-                    scorer.error = str(e)
-                    scorer.success = False
-                    with example_logging_context(
-                        example.created_at, example.example_id
-                    ):
-                        warning(
-                            f"Ignoring errors for example {example.example_id}: {str(e)} due to missing parameters"
-                        )
-                else:
-                    with example_logging_context(
-                        example.created_at, example.example_id
-                    ):
-                        error(
-                            f"Stopping example {example.example_id}: {str(e)} due to missing parameters"
-                        )
-                    raise
+        scorer.score = await scorer.a_score_example(example)
+        scorer.success = scorer.success_check()
     except Exception as e:
-
-
-
-
-                warning(f"Ignoring errors for example {example.example_id}: {str(e)}")
-        else:
-            with example_logging_context(example.created_at, example.example_id):
-                error(f"Stopping example {example.example_id}: {str(e)}")
-            raise
-
-
-async def score_task(
-    task_id: int,
-    progress: Progress,
-    scorer: JudgevalScorer,
-    example: Example,
-    ignore_errors: bool = True,
-    skip_on_missing_params: bool = True,
-):
-    """
-    Task function for asynchronously measuring a given example using a JudgevalScorer.
-
-    Args:
-        task_id (int): The ID of the task being measured.
-        progress (Progress): An instance of the Progress class to track task progress.
-        scorer (JudgevalScorer): An instance of the JudgevalScorer class used to score the example.
-        example (Example): The example to be scored.
-        ignore_errors (bool, optional): Whether to ignore errors during scoring. Defaults to True.
-        skip_on_missing_params (bool, optional): Whether to skip scoring if there are missing parameters. Defaults to True.
-
-    Raises:
-        MissingTestCaseParamsError: If required test case parameters are missing and skip_on_missing_params is False.
-        Exception: If an unexpected error occurs and ignore_errors is False.
-
-    Returns:
-        None
-    """
-    while not progress.finished:
-        start_time = time.perf_counter()
-
-        try:
-            await scorer.a_score_example(example, _show_indicator=False)
-            finish_text = "Completed"
-        except MissingTestCaseParamsError as e:
-            if skip_on_missing_params:
-                scorer.skipped = True
-                with example_logging_context(example.created_at, example.example_id):
-                    debug(
-                        f"Skipping example {example.example_id} due to missing parameters"
-                    )
-                return
-            else:
-                if ignore_errors:
-                    scorer.error = str(e)
-                    scorer.success = False  # Override success
-                    finish_text = "Failed"
-                else:
-                    with example_logging_context(
-                        example.created_at, example.example_id
-                    ):
-                        error(
-                            f"Stopping example {example.example_id}: {str(e)} due to missing parameters"
-                        )
-                    raise
-        except TypeError:
-            try:
-                await scorer.a_score_example(example)
-                finish_text = "Completed"
-            except MissingTestCaseParamsError as e:
-                if skip_on_missing_params:
-                    scorer.skipped = True
-                    with example_logging_context(
-                        example.created_at, example.example_id
-                    ):
-                        debug(
-                            f"Skipping example {example.example_id} due to missing parameters"
-                        )
-                    return
-                else:
-                    if ignore_errors:
-                        scorer.error = str(e)
-                        scorer.success = False  # Override success
-                        finish_text = "Failed"
-                    else:
-                        with example_logging_context(
-                            example.created_at, example.example_id
-                        ):
-                            error(
-                                f"Stopping example {example.example_id}: {str(e)} due to missing parameters"
-                            )
-                        raise
-        except Exception as e:
-            if ignore_errors:
-                scorer.error = str(e)
-                scorer.success = False  # Override success
-                finish_text = "Failed"
-                with example_logging_context(example.created_at, example.example_id):
-                    warning(
-                        f"Ignoring errors for example {example.example_id}: {str(e)}"
-                    )
-            else:
-                with example_logging_context(example.created_at, example.example_id):
-                    error(f"Stopping example {example.example_id}: {str(e)}")
-                raise
-
-        end_time = time.perf_counter()
-        time_taken = format(end_time - start_time, ".2f")
-        progress.update(task_id, advance=100)  # Mark task as complete
-        progress.update(
-            task_id,
-            description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]{finish_text}! ({time_taken}s)",
-        )
-        break
-
-
-async def score_with_indicator(
-    scorers: List[JudgevalScorer],
-    example: Example,
-    ignore_errors: bool,
-    skip_on_missing_params: bool,
-    show_indicator: bool,
-):
-    """
-    Scores an example using a list of JudgevalScorers, optionally displaying a progress indicator.
-
-    Args:
-        scorers (List[JudgevalScorer]): A list of JudgevalScorer objects to evaluate the example.
-        example (Example): The example to be scored.
-        ignore_errors (bool): If True, errors during scoring will be ignored.
-        skip_on_missing_params (bool): If True, scoring will be skipped if required parameters are missing.
-        show_indicator (bool): If True, a progress indicator will be displayed during scoring.
-
-    Returns:
-        None
-
-    Raises:
-        Any exceptions raised by the scoring functions, unless `ignore_errors` is True.
-    """
-    if show_indicator:
-        with Progress(
-            SpinnerColumn(style="rgb(106,0,255)"),
-            TextColumn("[progress.description]{task.description}"),
-            transient=True,
-        ) as progress:
-            tasks = []
-            for scorer in scorers:
-                task_id = progress.add_task(
-                    description=scorer_console_msg(scorer, async_mode=True),
-                    total=100,
-                )  # Add task to progress bar
-                tasks.append(
-                    score_task(
-                        task_id,
-                        progress,
-                        scorer,
-                        example,
-                        ignore_errors,
-                        skip_on_missing_params,
-                    )  # Create and execute task to score the example with a single scorer
-                )
-            await asyncio.gather(*tasks)
-    else:
-        tasks = [
-            safe_a_score_example(scorer, example, ignore_errors, skip_on_missing_params)
-            for scorer in scorers
-        ]
-
-        await asyncio.gather(*tasks)
+        judgeval_logger.error(f"Error during scoring: {str(e)}")
+        scorer.error = str(e)
+        scorer.success = False
+        return


 async def a_execute_scoring(
-    examples:
-    scorers: List[
+    examples: List[Example],
+    scorers: List[BaseScorer],
     model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
-    ignore_errors: bool =
-    skip_on_missing_params: bool = True,
-    show_indicator: bool = True,
+    ignore_errors: bool = False,
     throttle_value: int = 0,
     max_concurrent: int = 100,
-    verbose_mode: Optional[bool] = None,
-    _use_bar_indicator: bool = True,
 ) -> List[ScoringResult]:
     """
-    Executes evaluations of `Example`s asynchronously using one or more `
-    Each `Example` will be evaluated by all of the `
+    Executes evaluations of `Example`s asynchronously using one or more `BaseScorer`s.
+    Each `Example` will be evaluated by all of the `BaseScorer`s in the `scorers` list.

     Args:
-        examples (
-        scorers (List[
+        examples (List[Example]): A list of `Example` objects to be evaluated.
+        scorers (List[BaseScorer]): A list of `BaseScorer` objects to evaluate the examples.
         model (Union[str, List[str], JudgevalJudge]): The model to use for evaluation.
         ignore_errors (bool): Whether to ignore errors during evaluation.
-        skip_on_missing_params (bool): Whether to skip evaluation if parameters are missing.
-        show_indicator (bool): Whether to show a progress indicator.
         throttle_value (int): The amount of time to wait between starting each task.
         max_concurrent (int): The maximum number of concurrent tasks.
-
+
         _use_bar_indicator (bool): Whether to use a progress bar indicator.

     Returns:
@@ -311,69 +80,31 @@ async def a_execute_scoring(
         try:
             return await func(*args, **kwargs)
         except Exception as e:
-
+            judgeval_logger.error(f"Error executing function: {e}")
             if kwargs.get("ignore_errors", False):
                 # Simply return None when ignoring errors, as expected by the test
                 return None
             # If we're not ignoring errors, propagate the exception
             raise

-    if verbose_mode is not None:
-        for scorer in scorers:
-            scorer.verbose_mode = verbose_mode
-
     # Add model to scorers
     for scorer in scorers:
         scorer._add_model(model)

     scoring_results: List[ScoringResult] = [None for _ in examples]
     tasks = []
-    cloned_scorers: List[
-
-
-
-
-
-
-
-        ) as pbar:
-            for i, ex in enumerate(examples):
-                with example_logging_context(ex.created_at, ex.example_id):
-                    debug(f"Starting scoring for example {ex.example_id}")
-                    debug(f"Input: {ex.input}")
-                    debug(f"Using {len(scorers)} scorers")
-                    for scorer in scorers:
-                        debug(f"Using scorer: {type(scorer).__name__}")
-                        if hasattr(scorer, "threshold"):
-                            debug(f"Scorer threshold: {scorer.threshold}")
-                        if hasattr(scorer, "model"):
-                            debug(f"Scorer model: {type(scorer.model).__name__}")
-                if isinstance(ex, Example) or isinstance(ex, CustomExample):
-                    if len(scorers) == 0:
-                        pbar.update(1)
-                        continue
-
-                    cloned_scorers = clone_scorers(scorers)
-                    task = execute_with_semaphore(
-                        func=a_eval_examples_helper,
-                        scorers=cloned_scorers,
-                        example=ex,
-                        scoring_results=scoring_results,
-                        score_index=i,
-                        ignore_errors=ignore_errors,
-                        skip_on_missing_params=skip_on_missing_params,
-                        show_indicator=show_indicator,
-                        _use_bar_indicator=_use_bar_indicator,
-                        pbar=pbar,
-                    )
-                    tasks.append(asyncio.create_task(task))
-
-                await asyncio.sleep(throttle_value)
-            await asyncio.gather(*tasks)
-    else:
+    cloned_scorers: List[BaseScorer]
+
+    with tqdm_asyncio(
+        desc=f"Evaluating {len(examples)} example(s) in parallel",
+        unit="Example",
+        total=len(examples),
+        bar_format="{desc}: |{bar}|{percentage:3.0f}% ({n_fmt}/{total_fmt}) [Time Taken: {elapsed}, {rate_fmt}{postfix}]",
+    ) as pbar:
         for i, ex in enumerate(examples):
-            if isinstance(ex, Example)
+            if isinstance(ex, Example):
                 if len(scorers) == 0:
+                    pbar.update(1)
                     continue

                 cloned_scorers = clone_scorers(scorers)
@@ -384,11 +115,9 @@ async def a_execute_scoring(
                     scoring_results=scoring_results,
                     score_index=i,
                     ignore_errors=ignore_errors,
-
-                    _use_bar_indicator=_use_bar_indicator,
-                    show_indicator=show_indicator,
+                    pbar=pbar,
                 )
-                tasks.append(asyncio.create_task(
+                tasks.append(asyncio.create_task(task))

             await asyncio.sleep(throttle_value)
         await asyncio.gather(*tasks)
@@ -396,48 +125,33 @@ async def a_execute_scoring(


 async def a_eval_examples_helper(
-    scorers: List[
-    example:
+    scorers: List[BaseScorer],
+    example: Example,
     scoring_results: List[ScoringResult],
     score_index: int,
     ignore_errors: bool,
-    skip_on_missing_params: bool,
-    show_indicator: bool,
-    _use_bar_indicator: bool,
     pbar: Optional[tqdm_asyncio] = None,
 ) -> None:
     """
     Evaluate a single example asynchronously using a list of scorers.

     Args:
-        scorers (List[
+        scorers (List[BaseScorer]): List of BaseScorer objects to evaluate the example.
         example (Example): The example to be evaluated.
         scoring_results (List[ScoringResult]): List to store the scoring results.
         score_index (int): Index at which the result should be stored in scoring_results.
         ignore_errors (bool): Flag to indicate whether to ignore errors during scoring.
-        skip_on_missing_params (bool): Flag to indicate whether to skip scoring if parameters are missing.
-        show_indicator (bool): Flag to indicate whether to show a progress indicator.
-        _use_bar_indicator (bool): Flag to indicate whether to use a bar indicator for progress.
         pbar (Optional[tqdm_asyncio]): Optional progress bar for tracking progress.
     Returns:
         None
     """

-    show_metrics_indicator = show_indicator and not _use_bar_indicator
-
-    for scorer in scorers:
-        scorer.skipped = False
-        scorer.error = None  # Reset scorer error
-
     # scoring the Example
     scoring_start_time = time.perf_counter()
-
-
-
-
-        ignore_errors=ignore_errors,
-        show_indicator=show_metrics_indicator,
-    )  # execute the scoring functions of each scorer on the example
+
+    tasks = [safe_a_score_example(scorer, example) for scorer in scorers]
+
+    await asyncio.gather(*tasks)

     # Now that all the scoring functions of each scorer have executed, we collect
     # the results and update the ScoringResult with the scorer data
@@ -450,8 +164,9 @@ async def a_eval_examples_helper(
         scorer_data = create_scorer_data(
             scorer
         )  # Fetch scorer data from completed scorer evaluation
-
-
+        for s in scorer_data:
+            success = success and s.success
+        scorer_data_list.extend(scorer_data)

     scoring_end_time = time.perf_counter()
     run_duration = scoring_end_time - scoring_start_time
judgeval/scorers/utils.py
CHANGED
@@ -6,19 +6,15 @@ import asyncio
 import nest_asyncio
 import inspect
 import json
-import sys
 import re
-from contextlib import contextmanager
-from rich.progress import Progress, SpinnerColumn, TextColumn
-from rich.console import Console
 from typing import List, Optional

-from judgeval.scorers import
+from judgeval.scorers import BaseScorer
 from judgeval.data import Example, ExampleParams
 from judgeval.scorers.exceptions import MissingExampleParamsError


-def clone_scorers(scorers: List[
+def clone_scorers(scorers: List[BaseScorer]) -> List[BaseScorer]:
     """
     Creates duplicates of the scorers passed as argument.
     """
@@ -32,60 +28,14 @@ def clone_scorers(scorers: List[JudgevalScorer]) -> List[JudgevalScorer]:
         valid_args = {key: args[key] for key in valid_params if key in args}

         cloned_scorer = scorer_class(**valid_args)
-        # kinda hacky, but in case the class inheriting from
+        # kinda hacky, but in case the class inheriting from BaseScorer doesn't have `model` in its __init__,
         # we need to explicitly include it here so that we can add the judge model to the cloned scorer
         cloned_scorer._add_model(model=args.get("model"))
         cloned_scorers.append(cloned_scorer)
     return cloned_scorers


-def
-    scorer: JudgevalScorer,
-    async_mode: Optional[bool] = None,
-):
-    """
-    Renders a message to be displayed to console when a scorer is being executed.
-    """
-    if async_mode is None:
-        run_async = scorer.async_mode
-    else:
-        run_async = async_mode
-
-    return f"🔨 Executing Judgment's [rgb(106,0,255)]{scorer.__name__} Scorer[/rgb(106,0,255)]! \
-        [rgb(55,65,81)](using {scorer.evaluation_model}, async_mode={run_async})...[/rgb(55,65,81)]"
-
-
-@contextmanager
-def scorer_progress_meter(
-    scorer: JudgevalScorer,
-    async_mode: Optional[bool] = None,
-    display_meter: bool = True,
-    total: int = 100,
-    transient: bool = True,
-):
-    """
-    Context manager to display a progress indicator (spinner) while a scorer is being run.
-    """
-    console = Console(file=sys.stderr)
-    if display_meter:
-        with Progress(
-            SpinnerColumn(style="rgb(106,0,255)"),
-            TextColumn("[progress.description]{task.description}"),
-            console=console,
-            transient=transient,
-        ) as progress:
-            progress.add_task(
-                description=scorer_console_msg(scorer, async_mode),
-                total=total,
-            )
-            yield
-    else:
-        yield
-
-
-def parse_response_json(
-    llm_response: str, scorer: Optional[JudgevalScorer] = None
-) -> dict:
+def parse_response_json(llm_response: str, scorer: Optional[BaseScorer] = None) -> dict:
     """
     Extracts JSON output from an LLM response and returns it as a dictionary.

@@ -93,7 +43,7 @@ def parse_response_json(

     Args:
         llm_response (str): The response from an LLM.
-        scorer (
+        scorer (BaseScorer, optional): The scorer object to forward errors to (if any).
     """
     start = llm_response.find("{")  # opening bracket
     end = llm_response.rfind("}") + 1  # closing bracket
@@ -120,38 +70,6 @@ def parse_response_json(
         raise Exception(f"An unexpected error occurred: {str(e)}")


-def print_verbose_logs(metric: str, logs: str):
-    print("*" * 50)
-    print(f"{metric} Verbose Logs")
-    print("*" * 50)
-    print("")
-    print(logs)
-    print("")
-    print("=" * 70)
-
-
-def create_verbose_logs(metric: JudgevalScorer, steps: List[str]) -> str:
-    """
-    Creates verbose logs for a scorer object.
-
-    Args:
-        metric (JudgevalScorer): The scorer object.
-        steps (List[str]): The steps to be included in the verbose logs.
-
-    Returns:
-        str: The verbose logs (Concatenated steps).
-    """
-
-    verbose_logs = ""
-    for i in range(len(steps) - 1):
-        verbose_logs += steps[i]
-        if i < len(steps) - 2:  # don't add new line for penultimate step
-            verbose_logs += " \n \n"
-    if metric.verbose_mode:
-        print_verbose_logs(metric.__name__, verbose_logs + f"\n \n{steps[-1]}")
-    return verbose_logs
-
-
 def get_or_create_event_loop() -> asyncio.AbstractEventLoop:
     """
     Get or create an asyncio event loop.
@@ -186,7 +104,7 @@ def get_or_create_event_loop() -> asyncio.AbstractEventLoop:
 def check_example_params(
     example: Example,
     example_params: List[ExampleParams],
-    scorer:
+    scorer: BaseScorer,
 ):
     if isinstance(example, Example) is False:
         error_str = f"in check_example_params(): Expected example to be of type 'Example', but got {type(example)}"
judgeval/utils/file_utils.py
CHANGED
@@ -1,16 +1,15 @@
 import yaml
 from typing import List
-from judgeval.common.logger import
+from judgeval.common.logger import judgeval_logger

 from judgeval.data import Example


 def get_examples_from_yaml(file_path: str) -> List[Example] | None:
-    debug(f"Loading dataset from YAML file: {file_path}")
     """
     Adds examples from a YAML file.

-    The format of the YAML file is expected to be a dictionary with one key: "examples".
+    The format of the YAML file is expected to be a dictionary with one key: "examples".
     The value of the key is a list of dictionaries, where each dictionary represents an example.

     The YAML file is expected to have the following format:
@@ -42,12 +41,11 @@ def get_examples_from_yaml(file_path: str) -> List[Example] | None:
             raise ValueError("The YAML file is empty.")
         examples = payload.get("examples", [])
     except FileNotFoundError:
-        error(f"YAML file not found: {file_path}")
+        judgeval_logger.error(f"YAML file not found: {file_path}")
         raise FileNotFoundError(f"The file {file_path} was not found.")
     except yaml.YAMLError:
-        error(f"Invalid YAML file: {file_path}")
+        judgeval_logger.error(f"Invalid YAML file: {file_path}")
         raise ValueError(f"The file {file_path} is not a valid YAML file.")

-    info(f"Added {len(examples)} examples from YAML")
     new_examples = [Example(**e) for e in examples]
     return new_examples
judgeval/version_check.py
CHANGED
@@ -1,6 +1,7 @@
 import importlib.metadata
 from judgeval.utils.requests import requests
 import threading
+from judgeval.common.logger import judgeval_logger


 def check_latest_version(package_name: str = "judgeval"):
@@ -13,8 +14,8 @@ def check_latest_version(package_name: str = "judgeval"):
         latest_version = response.json()["info"]["version"]

         if current_version != latest_version:
-
-            f"
+            judgeval_logger.warning(
+                f"UPDATE AVAILABLE: You are using '{package_name}=={current_version}', "
                 f"but the latest version is '{latest_version}'. While this version is still supported, "
                 f"we recommend upgrading to avoid potential issues or missing features: "
                 f"`pip install --upgrade {package_name}`"