judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of judgeval might be problematic.
- judgeval/__init__.py +177 -12
- judgeval/api/__init__.py +519 -0
- judgeval/api/api_types.py +407 -0
- judgeval/cli.py +79 -0
- judgeval/constants.py +76 -47
- judgeval/data/__init__.py +3 -3
- judgeval/data/evaluation_run.py +125 -0
- judgeval/data/example.py +15 -56
- judgeval/data/judgment_types.py +450 -0
- judgeval/data/result.py +29 -73
- judgeval/data/scorer_data.py +29 -62
- judgeval/data/scripts/fix_default_factory.py +23 -0
- judgeval/data/scripts/openapi_transform.py +123 -0
- judgeval/data/trace.py +121 -0
- judgeval/dataset/__init__.py +264 -0
- judgeval/env.py +52 -0
- judgeval/evaluation/__init__.py +344 -0
- judgeval/exceptions.py +27 -0
- judgeval/integrations/langgraph/__init__.py +13 -0
- judgeval/integrations/openlit/__init__.py +50 -0
- judgeval/judges/__init__.py +2 -3
- judgeval/judges/base_judge.py +2 -3
- judgeval/judges/litellm_judge.py +100 -20
- judgeval/judges/together_judge.py +101 -20
- judgeval/judges/utils.py +20 -24
- judgeval/logger.py +62 -0
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/__init__.py +18 -25
- judgeval/scorers/agent_scorer.py +17 -0
- judgeval/scorers/api_scorer.py +45 -41
- judgeval/scorers/base_scorer.py +83 -38
- judgeval/scorers/example_scorer.py +17 -0
- judgeval/scorers/exceptions.py +1 -0
- judgeval/scorers/judgeval_scorers/__init__.py +0 -148
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
- judgeval/scorers/score.py +77 -306
- judgeval/scorers/utils.py +4 -199
- judgeval/tracer/__init__.py +1122 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +40 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +59 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +63 -0
- judgeval/tracer/llm/__init__.py +7 -0
- judgeval/tracer/llm/config.py +78 -0
- judgeval/tracer/llm/constants.py +9 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/config.py +6 -0
- judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
- judgeval/tracer/llm/llm_google/__init__.py +3 -0
- judgeval/tracer/llm/llm_google/config.py +6 -0
- judgeval/tracer/llm/llm_google/generate_content.py +127 -0
- judgeval/tracer/llm/llm_google/wrapper.py +30 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
- judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
- judgeval/tracer/llm/llm_openai/config.py +6 -0
- judgeval/tracer/llm/llm_openai/responses.py +506 -0
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
- judgeval/tracer/llm/llm_together/__init__.py +3 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
- judgeval/tracer/llm/llm_together/config.py +6 -0
- judgeval/tracer/llm/llm_together/wrapper.py +52 -0
- judgeval/tracer/llm/providers.py +19 -0
- judgeval/tracer/managers.py +167 -0
- judgeval/tracer/processors/__init__.py +220 -0
- judgeval/tracer/utils.py +19 -0
- judgeval/trainer/__init__.py +14 -0
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +128 -0
- judgeval/trainer/console.py +144 -0
- judgeval/trainer/fireworks_trainer.py +396 -0
- judgeval/trainer/trainable_model.py +243 -0
- judgeval/trainer/trainer.py +70 -0
- judgeval/utils/async_utils.py +39 -0
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +37 -0
- judgeval/utils/decorators/use_once.py +13 -0
- judgeval/utils/file_utils.py +97 -0
- judgeval/utils/guards.py +36 -0
- judgeval/utils/meta.py +27 -0
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +253 -0
- judgeval/utils/testing.py +70 -0
- judgeval/utils/url.py +10 -0
- judgeval/utils/version_check.py +28 -0
- judgeval/utils/wrappers/README.md +3 -0
- judgeval/utils/wrappers/__init__.py +15 -0
- judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
- judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
- judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
- judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
- judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
- judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
- judgeval/utils/wrappers/py.typed +0 -0
- judgeval/utils/wrappers/utils.py +35 -0
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- judgeval-0.22.2.dist-info/METADATA +265 -0
- judgeval-0.22.2.dist-info/RECORD +112 -0
- judgeval-0.22.2.dist-info/entry_points.txt +2 -0
- judgeval/clients.py +0 -39
- judgeval/common/__init__.py +0 -8
- judgeval/common/exceptions.py +0 -28
- judgeval/common/logger.py +0 -189
- judgeval/common/tracer.py +0 -798
- judgeval/common/utils.py +0 -763
- judgeval/data/api_example.py +0 -111
- judgeval/data/datasets/__init__.py +0 -5
- judgeval/data/datasets/dataset.py +0 -286
- judgeval/data/datasets/eval_dataset_client.py +0 -193
- judgeval/data/datasets/ground_truth.py +0 -54
- judgeval/data/datasets/utils.py +0 -74
- judgeval/evaluation_run.py +0 -132
- judgeval/judges/mixture_of_judges.py +0 -248
- judgeval/judgment_client.py +0 -354
- judgeval/run_evaluation.py +0 -439
- judgeval/scorers/judgeval_scorer.py +0 -140
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
- judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
- judgeval/scorers/prompt_scorer.py +0 -439
- judgeval-0.0.11.dist-info/METADATA +0 -36
- judgeval-0.0.11.dist-info/RECORD +0 -84
- {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
- {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
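The listing above implies a broad package restructuring: the `judgeval/common/*` modules are removed in favor of top-level modules (`judgeval/logger.py`, `judgeval/tracer/`), and `judgeval/data/datasets/` is replaced by `judgeval/dataset/`. As a rough, hedged sketch based only on the file paths above, import sites would move along these lines; `judgeval_logger` is confirmed by the score.py diff below, while `Tracer` and `Dataset` are assumed symbol names used purely for illustration.

```python
# Illustrative import-path moves implied by the file list (0.0.11 -> 0.22.2).
# Old locations (files removed in 0.22.2):
#   from judgeval.common.logger import debug, error, warning, info
#   from judgeval.common.tracer import Tracer      # symbol name assumed
#   from judgeval.data.datasets import Dataset     # symbol name assumed
# New locations (files added in 0.22.2):
from judgeval.logger import judgeval_logger  # judgeval/logger.py (used in score.py below)
from judgeval.tracer import Tracer           # judgeval/tracer/__init__.py (symbol assumed)
from judgeval.dataset import Dataset         # judgeval/dataset/__init__.py (symbol assumed)
```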
judgeval/scorers/score.py CHANGED

@@ -1,301 +1,104 @@
 """
-Infrastructure for executing evaluations of `Example`s using one or more `
+Infrastructure for executing evaluations of `Example`s using one or more `ExampleScorer`s.
 """

-
 import asyncio
-import time
+import time
 from tqdm.asyncio import tqdm_asyncio
 from typing import List, Union, Optional, Callable
-from rich.progress import Progress, SpinnerColumn, TextColumn

 from judgeval.data import (
-    Example,
+    Example,
     ScoringResult,
     generate_scoring_result,
-    create_process_example,
     create_scorer_data,
 )
-from judgeval.scorers import
-from judgeval.scorers.utils import clone_scorers
-from judgeval.
-from judgeval.common.logger import example_logging_context, debug, error, warning, info
+from judgeval.scorers.example_scorer import ExampleScorer
+from judgeval.scorers.utils import clone_scorers
+from judgeval.logger import judgeval_logger
 from judgeval.judges import JudgevalJudge
+from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
+

 async def safe_a_score_example(
-    scorer:
+    scorer: ExampleScorer,
     example: Example,
-    ignore_errors: bool,
-    skip_on_missing_params: bool,
 ):
     """
     Scoring task function when not using a progress indicator!
-    "Safely" scores an `Example` using a `
+    "Safely" scores an `Example` using a `ExampleScorer` by gracefully handling any exceptions that may occur.

     Args:
-        scorer (
+        scorer (ExampleScorer): The `ExampleScorer` to use for scoring the example.
         example (Example): The `Example` to be scored.
-
-        ignore_errors (bool): Whether to ignore errors during the evaluation.
-            If set to false, any error will be raised and stop the evaluation.
-            If set to true, the error will be stored in the `error` attribute of the `JudgevalScorer` and the `success` attribute will be set to False.
-
-        skip_on_missing_params (bool): Whether to skip the test case if required parameters are missing.
     """
-    debug(f"Starting safe_a_score_example for example {example.example_id}")
     try:
-        await scorer.a_score_example(example
-
-
-
-
-
-
-
+        score = await scorer.a_score_example(example)
+        if score is None:
+            raise Exception("a_score_example need to return a score")
+        elif score < 0:
+            judgeval_logger.warning("score cannot be less than 0 , setting to 0")
+            score = 0
+        elif score > 1:
+            judgeval_logger.warning("score cannot be greater than 1 , setting to 1")
+            score = 1
         else:
-
-
-            scorer.success = False
-            with example_logging_context(example.timestamp, example.example_id):
-                warning(f"Ignoring errors for example {example.example_id}: {str(e)} due to missing parameters")
-        else:  # Raise the error and stop the evaluation
-            with example_logging_context(example.timestamp, example.example_id):
-                error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
-            raise
-    except TypeError:  # in case a_score_example does not accept _show_indicator
-        try:
-            await scorer.a_score_example(example)
-        except MissingTestCaseParamsError as e:
-            if skip_on_missing_params:
-                scorer.skipped = True
-                with example_logging_context(example.timestamp, example.example_id):
-                    warning(f"Skipping example {example.example_id} due to missing parameters")
-                return
-            else:
-                if ignore_errors:
-                    scorer.error = str(e)
-                    scorer.success = False
-                    with example_logging_context(example.timestamp, example.example_id):
-                        warning(f"Ignoring errors for example {example.example_id}: {str(e)} due to missing parameters")
-                else:
-                    with example_logging_context(example.timestamp, example.example_id):
-                        error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
-                    raise
+            scorer.score = score
+            scorer.success = scorer.success_check()
     except Exception as e:
-
-
-
-
-
-        else:
-            with example_logging_context(example.timestamp, example.example_id):
-                error(f"Stopping example {example.example_id}: {str(e)}")
-            raise
-
-
-async def score_task(
-    task_id: int,
-    progress: Progress,
-    scorer: JudgevalScorer,
-    example: Example,
-    ignore_errors: bool = True,
-    skip_on_missing_params: bool = True,
-):
-    """
-    Task function for asynchronously measuring a given example using a JudgevalScorer.
-
-    Args:
-        task_id (int): The ID of the task being measured.
-        progress (Progress): An instance of the Progress class to track task progress.
-        scorer (JudgevalScorer): An instance of the JudgevalScorer class used to score the example.
-        example (Example): The example to be scored.
-        ignore_errors (bool, optional): Whether to ignore errors during scoring. Defaults to True.
-        skip_on_missing_params (bool, optional): Whether to skip scoring if there are missing parameters. Defaults to True.
-
-    Raises:
-        MissingTestCaseParamsError: If required test case parameters are missing and skip_on_missing_params is False.
-        Exception: If an unexpected error occurs and ignore_errors is False.
-
-    Returns:
-        None
-    """
-    while not progress.finished:
-        start_time = time.perf_counter()
-
-        try:
-            await scorer.a_score_example(example, _show_indicator=False)
-            finish_text = "Completed"
-        except MissingTestCaseParamsError as e:
-            if skip_on_missing_params:
-                scorer.skipped = True
-                with example_logging_context(example.timestamp, example.example_id):
-                    debug(f"Skipping example {example.example_id} due to missing parameters")
-                return
-            else:
-                if ignore_errors:
-                    scorer.error = str(e)
-                    scorer.success = False  # Override success
-                    finish_text = "Failed"
-                else:
-                    with example_logging_context(example.timestamp, example.example_id):
-                        error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
-                    raise
-        except TypeError:
-            try:
-                await scorer.a_score_example(example)
-                finish_text = "Completed"
-            except MissingTestCaseParamsError as e:
-                if skip_on_missing_params:
-                    scorer.skipped = True
-                    with example_logging_context(example.timestamp, example.example_id):
-                        debug(f"Skipping example {example.example_id} due to missing parameters")
-                    return
-                else:
-                    if ignore_errors:
-                        scorer.error = str(e)
-                        scorer.success = False  # Override success
-                        finish_text = "Failed"
-                    else:
-                        with example_logging_context(example.timestamp, example.example_id):
-                            error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
-                        raise
-        except Exception as e:
-            if ignore_errors:
-                scorer.error = str(e)
-                scorer.success = False  # Override success
-                finish_text = "Failed"
-                with example_logging_context(example.timestamp, example.example_id):
-                    warning(f"Ignoring errors for example {example.example_id}: {str(e)}")
-            else:
-                with example_logging_context(example.timestamp, example.example_id):
-                    error(f"Stopping example {example.example_id}: {str(e)}")
-                raise
-
-        end_time = time.perf_counter()
-        time_taken = format(end_time - start_time, ".2f")
-        progress.update(task_id, advance=100)  # Mark task as complete
-        progress.update(
-            task_id,
-            description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]{finish_text}! ({time_taken}s)",
-        )
-        break
-
-
-async def score_with_indicator(
-    scorers: List[JudgevalScorer],
-    example: Example,
-    ignore_errors: bool,
-    skip_on_missing_params: bool,
-    show_indicator: bool,
-):
-    """
-    Scores an example using a list of JudgevalScorers, optionally displaying a progress indicator.
-
-    Args:
-        scorers (List[JudgevalScorer]): A list of JudgevalScorer objects to evaluate the example.
-        example (Example): The example to be scored.
-        ignore_errors (bool): If True, errors during scoring will be ignored.
-        skip_on_missing_params (bool): If True, scoring will be skipped if required parameters are missing.
-        show_indicator (bool): If True, a progress indicator will be displayed during scoring.
-
-    Returns:
-        None
-
-    Raises:
-        Any exceptions raised by the scoring functions, unless `ignore_errors` is True.
-    """
-    if show_indicator:
-        with Progress(
-            SpinnerColumn(style="rgb(106,0,255)"),
-            TextColumn("[progress.description]{task.description}"),
-            transient=True,
-        ) as progress:
-            tasks = []
-            for scorer in scorers:
-                task_id = progress.add_task(
-                    description=scorer_console_msg(
-                        scorer, async_mode=True
-                    ),
-                    total=100,
-                )  # Add task to progress bar
-                tasks.append(
-                    score_task(
-                        task_id,
-                        progress,
-                        scorer,
-                        example,
-                        ignore_errors,
-                        skip_on_missing_params,
-                    )  # Create and execute task to score the example with a single scorer
-                )
-            await asyncio.gather(*tasks)
-    else:
-        tasks = [
-            safe_a_score_example(
-                scorer, example, ignore_errors, skip_on_missing_params
-            )
-            for scorer in scorers
-        ]
-
-        await asyncio.gather(*tasks)
+        judgeval_logger.error(f"Error during scoring: {str(e)}")
+        scorer.error = str(e)
+        scorer.success = False
+        scorer.score = 0
+        return


 async def a_execute_scoring(
     examples: List[Example],
-    scorers: List[
-    model: Optional[Union[str, List[str], JudgevalJudge]] =
-    ignore_errors: bool =
-    skip_on_missing_params: bool = True,
-    show_indicator: bool = True,
+    scorers: List[ExampleScorer],
+    model: Optional[Union[str, List[str], JudgevalJudge]] = JUDGMENT_DEFAULT_GPT_MODEL,
+    ignore_errors: bool = False,
     throttle_value: int = 0,
     max_concurrent: int = 100,
-
-    _use_bar_indicator: bool = True,
+    show_progress: bool = True,
 ) -> List[ScoringResult]:
     """
-    Executes evaluations of `Example`s asynchronously using one or more `
-    Each `Example` will be evaluated by all of the `
+    Executes evaluations of `Example`s asynchronously using one or more `ExampleScorer`s.
+    Each `Example` will be evaluated by all of the `ExampleScorer`s in the `scorers` list.

     Args:
         examples (List[Example]): A list of `Example` objects to be evaluated.
-        scorers (List[
+        scorers (List[ExampleScorer]): A list of `ExampleScorer` objects to evaluate the examples.
         model (Union[str, List[str], JudgevalJudge]): The model to use for evaluation.
         ignore_errors (bool): Whether to ignore errors during evaluation.
-        skip_on_missing_params (bool): Whether to skip evaluation if parameters are missing.
-        show_indicator (bool): Whether to show a progress indicator.
         throttle_value (int): The amount of time to wait between starting each task.
         max_concurrent (int): The maximum number of concurrent tasks.
-
-        _use_bar_indicator (bool): Whether to use a progress bar indicator.
+        show_progress (bool): Whether to show the progress bar indicator.

     Returns:
         List[ScoringResult]: A list of `ScoringResult` objects containing the evaluation results.
     """
+
     semaphore = asyncio.Semaphore(max_concurrent)

     async def execute_with_semaphore(func: Callable, *args, **kwargs):
-
-
+        async with semaphore:
+            try:
                 return await func(*args, **kwargs)
-
-
-
-
-
-                raise
-
-    if verbose_mode is not None:
-        for scorer in scorers:
-            scorer.verbose_mode = verbose_mode
+            except Exception as e:
+                judgeval_logger.error(f"Error executing function: {e}")
+                if kwargs.get("ignore_errors", False):
+                    return None
+                raise

-    # Add model to scorers
     for scorer in scorers:
-        scorer.
+        if not scorer.model and isinstance(model, str):
+            scorer._add_model(model)

-    scoring_results: List[ScoringResult] = [None for _ in examples]
+    scoring_results: List[Optional[ScoringResult]] = [None for _ in examples]
     tasks = []

-    if
+    if show_progress:
         with tqdm_asyncio(
             desc=f"Evaluating {len(examples)} example(s) in parallel",
             unit="Example",
@@ -303,24 +106,12 @@ async def a_execute_scoring(
             bar_format="{desc}: |{bar}|{percentage:3.0f}% ({n_fmt}/{total_fmt}) [Time Taken: {elapsed}, {rate_fmt}{postfix}]",
         ) as pbar:
             for i, ex in enumerate(examples):
-                with example_logging_context(ex.timestamp, ex.example_id):
-                    debug(f"Starting scoring for example {ex.example_id}")
-                    debug(f"Input: {ex.input}")
-                    debug(f"Using {len(scorers)} scorers")
-                    for scorer in scorers:
-                        debug(f"Using scorer: {type(scorer).__name__}")
-                        if hasattr(scorer, 'threshold'):
-                            debug(f"Scorer threshold: {scorer.threshold}")
-                        if hasattr(scorer, 'model'):
-                            debug(f"Scorer model: {type(scorer.model).__name__}")
                 if isinstance(ex, Example):
                     if len(scorers) == 0:
                         pbar.update(1)
                         continue
-
-                    cloned_scorers
-                        scorers
-                    )
+
+                    cloned_scorers = clone_scorers(scorers)  # type: ignore
                     task = execute_with_semaphore(
                         func=a_eval_examples_helper,
                         scorers=cloned_scorers,
@@ -328,9 +119,6 @@ async def a_execute_scoring(
                         scoring_results=scoring_results,
                         score_index=i,
                         ignore_errors=ignore_errors,
-                        skip_on_missing_params=skip_on_missing_params,
-                        show_indicator=show_indicator,
-                        _use_bar_indicator=_use_bar_indicator,
                         pbar=pbar,
                     )
                     tasks.append(asyncio.create_task(task))
@@ -343,9 +131,7 @@ async def a_execute_scoring(
                 if len(scorers) == 0:
                     continue

-                cloned_scorers
-                    scorers
-                )
+                cloned_scorers = clone_scorers(scorers)  # type: ignore
                 task = execute_with_semaphore(
                     func=a_eval_examples_helper,
                     scorers=cloned_scorers,
@@ -353,75 +139,60 @@
                     scoring_results=scoring_results,
                     score_index=i,
                     ignore_errors=ignore_errors,
-
-                    _use_bar_indicator=_use_bar_indicator,
-                    show_indicator=show_indicator,
+                    pbar=None,
                 )
-                tasks.append(asyncio.create_task(
+                tasks.append(asyncio.create_task(task))

                 await asyncio.sleep(throttle_value)
     await asyncio.gather(*tasks)
-    return scoring_results
+    return [result for result in scoring_results if result is not None]


 async def a_eval_examples_helper(
-    scorers: List[
+    scorers: List[ExampleScorer],
     example: Example,
-    scoring_results: List[ScoringResult],
+    scoring_results: List[Optional[ScoringResult]],
     score_index: int,
     ignore_errors: bool,
-    skip_on_missing_params: bool,
-    show_indicator: bool,
-    _use_bar_indicator: bool,
     pbar: Optional[tqdm_asyncio] = None,
-
+) -> None:
     """
     Evaluate a single example asynchronously using a list of scorers.
-
+
     Args:
-        scorers (List[
+        scorers (List[ExampleScorer]): List of ExampleScorer objects to evaluate the example.
         example (Example): The example to be evaluated.
         scoring_results (List[ScoringResult]): List to store the scoring results.
         score_index (int): Index at which the result should be stored in scoring_results.
         ignore_errors (bool): Flag to indicate whether to ignore errors during scoring.
-        skip_on_missing_params (bool): Flag to indicate whether to skip scoring if parameters are missing.
-        show_indicator (bool): Flag to indicate whether to show a progress indicator.
-        _use_bar_indicator (bool): Flag to indicate whether to use a bar indicator for progress.
         pbar (Optional[tqdm_asyncio]): Optional progress bar for tracking progress.
     Returns:
         None
     """
-    show_metrics_indicator = show_indicator and not _use_bar_indicator
-
-    for scorer in scorers:
-        scorer.skipped = False
-        scorer.error = None  # Reset scorer error

-    # scoring the Example
-    process_example = create_process_example(example)  # Creates process example to track progress
     scoring_start_time = time.perf_counter()
-    await score_with_indicator(
-        scorers=scorers,
-        example=example,
-        skip_on_missing_params=skip_on_missing_params,
-        ignore_errors=ignore_errors,
-        show_indicator=show_metrics_indicator,
-    )  # execute the scoring functions of each scorer on the example

-
-
+    tasks = [safe_a_score_example(scorer, example) for scorer in scorers]
+
+    await asyncio.gather(*tasks)
+
+    success = True
+    scorer_data_list = []
     for scorer in scorers:
-
-        if getattr(scorer, 'skipped', False):
+        if getattr(scorer, "skipped", False):
             continue
-        scorer_data = create_scorer_data(scorer)
-
-
-
-
-
-
-
+        scorer_data = create_scorer_data(scorer)
+        for s in scorer_data:
+            success = success and s.success
+        scorer_data_list.extend(scorer_data)
+
+    scoring_end_time = time.perf_counter()
+    run_duration = scoring_end_time - scoring_start_time
+
+    scoring_result = generate_scoring_result(
+        example, scorer_data_list, run_duration, success
+    )
+    scoring_results[score_index] = scoring_result

     if pbar is not None:
         pbar.update(1)