judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release.
This version of judgeval might be problematic.
- judgeval/__init__.py +177 -12
- judgeval/api/__init__.py +519 -0
- judgeval/api/api_types.py +407 -0
- judgeval/cli.py +79 -0
- judgeval/constants.py +76 -47
- judgeval/data/__init__.py +3 -3
- judgeval/data/evaluation_run.py +125 -0
- judgeval/data/example.py +15 -56
- judgeval/data/judgment_types.py +450 -0
- judgeval/data/result.py +29 -73
- judgeval/data/scorer_data.py +29 -62
- judgeval/data/scripts/fix_default_factory.py +23 -0
- judgeval/data/scripts/openapi_transform.py +123 -0
- judgeval/data/trace.py +121 -0
- judgeval/dataset/__init__.py +264 -0
- judgeval/env.py +52 -0
- judgeval/evaluation/__init__.py +344 -0
- judgeval/exceptions.py +27 -0
- judgeval/integrations/langgraph/__init__.py +13 -0
- judgeval/integrations/openlit/__init__.py +50 -0
- judgeval/judges/__init__.py +2 -3
- judgeval/judges/base_judge.py +2 -3
- judgeval/judges/litellm_judge.py +100 -20
- judgeval/judges/together_judge.py +101 -20
- judgeval/judges/utils.py +20 -24
- judgeval/logger.py +62 -0
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/__init__.py +18 -25
- judgeval/scorers/agent_scorer.py +17 -0
- judgeval/scorers/api_scorer.py +45 -41
- judgeval/scorers/base_scorer.py +83 -38
- judgeval/scorers/example_scorer.py +17 -0
- judgeval/scorers/exceptions.py +1 -0
- judgeval/scorers/judgeval_scorers/__init__.py +0 -148
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
- judgeval/scorers/score.py +77 -306
- judgeval/scorers/utils.py +4 -199
- judgeval/tracer/__init__.py +1122 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +40 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +59 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +63 -0
- judgeval/tracer/llm/__init__.py +7 -0
- judgeval/tracer/llm/config.py +78 -0
- judgeval/tracer/llm/constants.py +9 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/config.py +6 -0
- judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
- judgeval/tracer/llm/llm_google/__init__.py +3 -0
- judgeval/tracer/llm/llm_google/config.py +6 -0
- judgeval/tracer/llm/llm_google/generate_content.py +127 -0
- judgeval/tracer/llm/llm_google/wrapper.py +30 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
- judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
- judgeval/tracer/llm/llm_openai/config.py +6 -0
- judgeval/tracer/llm/llm_openai/responses.py +506 -0
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
- judgeval/tracer/llm/llm_together/__init__.py +3 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
- judgeval/tracer/llm/llm_together/config.py +6 -0
- judgeval/tracer/llm/llm_together/wrapper.py +52 -0
- judgeval/tracer/llm/providers.py +19 -0
- judgeval/tracer/managers.py +167 -0
- judgeval/tracer/processors/__init__.py +220 -0
- judgeval/tracer/utils.py +19 -0
- judgeval/trainer/__init__.py +14 -0
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +128 -0
- judgeval/trainer/console.py +144 -0
- judgeval/trainer/fireworks_trainer.py +396 -0
- judgeval/trainer/trainable_model.py +243 -0
- judgeval/trainer/trainer.py +70 -0
- judgeval/utils/async_utils.py +39 -0
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +37 -0
- judgeval/utils/decorators/use_once.py +13 -0
- judgeval/utils/file_utils.py +97 -0
- judgeval/utils/guards.py +36 -0
- judgeval/utils/meta.py +27 -0
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +253 -0
- judgeval/utils/testing.py +70 -0
- judgeval/utils/url.py +10 -0
- judgeval/utils/version_check.py +28 -0
- judgeval/utils/wrappers/README.md +3 -0
- judgeval/utils/wrappers/__init__.py +15 -0
- judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
- judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
- judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
- judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
- judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
- judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
- judgeval/utils/wrappers/py.typed +0 -0
- judgeval/utils/wrappers/utils.py +35 -0
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- judgeval-0.22.2.dist-info/METADATA +265 -0
- judgeval-0.22.2.dist-info/RECORD +112 -0
- judgeval-0.22.2.dist-info/entry_points.txt +2 -0
- judgeval/clients.py +0 -39
- judgeval/common/__init__.py +0 -8
- judgeval/common/exceptions.py +0 -28
- judgeval/common/logger.py +0 -189
- judgeval/common/tracer.py +0 -798
- judgeval/common/utils.py +0 -763
- judgeval/data/api_example.py +0 -111
- judgeval/data/datasets/__init__.py +0 -5
- judgeval/data/datasets/dataset.py +0 -286
- judgeval/data/datasets/eval_dataset_client.py +0 -193
- judgeval/data/datasets/ground_truth.py +0 -54
- judgeval/data/datasets/utils.py +0 -74
- judgeval/evaluation_run.py +0 -132
- judgeval/judges/mixture_of_judges.py +0 -248
- judgeval/judgment_client.py +0 -354
- judgeval/run_evaluation.py +0 -439
- judgeval/scorers/judgeval_scorer.py +0 -140
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
- judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
- judgeval/scorers/prompt_scorer.py +0 -439
- judgeval-0.0.11.dist-info/METADATA +0 -36
- judgeval-0.0.11.dist-info/RECORD +0 -84
- {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
- {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
judgeval/run_evaluation.py
DELETED
@@ -1,439 +0,0 @@
-import asyncio
-import requests
-from typing import List, Dict
-from datetime import datetime
-from rich import print as rprint
-
-from judgeval.data import (
-    Example,
-    ScorerData,
-    ScoringResult
-)
-from judgeval.scorers import (
-    JudgevalScorer,
-    APIJudgmentScorer,
-    ClassifierScorer
-)
-from judgeval.scorers.score import a_execute_scoring
-
-from judgeval.constants import (
-    ROOT_API,
-    JUDGMENT_EVAL_API_URL,
-    JUDGMENT_EVAL_LOG_API_URL,
-)
-from judgeval.common.exceptions import JudgmentAPIError
-from judgeval.evaluation_run import EvaluationRun
-from judgeval.common.logger import (
-    enable_logging,
-    debug,
-    info,
-    error,
-    example_logging_context
-)
-
-
-def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
-    """
-    Executes an evaluation of a list of `Example`s using one or more `JudgmentScorer`s via the Judgment API.
-
-    Args:
-        evaluation_run (EvaluationRun): The evaluation run object containing the examples, scorers, and metadata
-
-    Returns:
-        List[Dict]: The results of the evaluation. Each result is a dictionary containing the fields of a `ScoringResult`
-                    object.
-    """
-
-    try:
-        # submit API request to execute evals
-        payload = evaluation_run.model_dump(warnings=False)
-        response = requests.post(JUDGMENT_EVAL_API_URL, json=payload)
-        response_data = response.json()
-    except Exception as e:
-        error(f"Error: {e}")
-        details = response.json().get("detail", "No details provided")
-        raise JudgmentAPIError("An error occurred while executing the Judgment API request: " + details)
-    # Check if the response status code is not 2XX
-    # Add check for the duplicate eval run name
-    if not response.ok:
-        error_message = response_data.get('detail', 'An unknown error occurred.')
-        error(f"Error: {error_message=}")
-        raise JudgmentAPIError(error_message)
-    return response_data
-
-
-def merge_results(api_results: List[ScoringResult], local_results: List[ScoringResult]) -> List[ScoringResult]:
-    """
-    When executing scorers that come from both the Judgment API and local scorers, we're left with
-    results for each type of scorer. This function merges the results from the API and local evaluations,
-    grouped by example. In particular, we merge the `scorers_data` field of each `ScoringResult` object.
-
-    Args:
-        api_results (List[ScoringResult]): The `ScoringResult`s from the API evaluation
-        local_results (List[ScoringResult]): The `ScoringResult`s from the local evaluation
-
-    Returns:
-        List[ScoringResult]: The merged `ScoringResult`s (updated `scorers_data` field)
-    """
-    # No merge required
-    if not local_results and api_results:
-        return api_results
-    if not api_results and local_results:
-        return local_results
-
-    if len(api_results) != len(local_results):
-        # Results should be of same length because each ScoringResult is a 1-1 mapping to an Example
-        raise ValueError(f"The number of API and local results do not match: {len(api_results)} vs {len(local_results)}")
-
-    # Each ScoringResult in api and local have all the same fields besides `scorers_data`
-    for api_result, local_result in zip(api_results, local_results):
-        if api_result.input != local_result.input:
-            raise ValueError("The API and local results are not aligned.")
-        if api_result.actual_output != local_result.actual_output:
-            raise ValueError("The API and local results are not aligned.")
-        if api_result.expected_output != local_result.expected_output:
-            raise ValueError("The API and local results are not aligned.")
-        if api_result.context != local_result.context:
-            raise ValueError("The API and local results are not aligned.")
-        if api_result.retrieval_context != local_result.retrieval_context:
-            raise ValueError("The API and local results are not aligned.")
-        if api_result.additional_metadata != local_result.additional_metadata:
-            raise ValueError("The API and local results are not aligned.")
-        if api_result.tools_called != local_result.tools_called:
-            raise ValueError("The API and local results are not aligned.")
-        if api_result.expected_tools != local_result.expected_tools:
-            raise ValueError("The API and local results are not aligned.")
-
-
-        # Merge ScorerData from the API and local scorers together
-        api_scorer_data = api_result.scorers_data
-        local_scorer_data = local_result.scorers_data
-        if api_scorer_data is None and local_scorer_data is not None:
-            api_result.scorers_data = local_scorer_data
-
-        if api_scorer_data is not None and local_scorer_data is not None:
-            api_result.scorers_data = api_scorer_data + local_scorer_data
-
-    return api_results
-
-
-def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResult]:
-    """
-    Checks if any `ScoringResult` objects are missing `scorers_data`.
-
-    If any are missing, logs an error and returns the results.
-    """
-    for i, result in enumerate(results):
-        if not result.scorers_data:
-            error(
-                f"Scorer data is missing for example {i}. "
-                "This is usually caused when the example does not contain "
-                "the fields required by the scorer. "
-                "Check that your example contains the fields required by the scorers. "
-                "TODO add docs link here for reference."
-            )
-    return results
-
-
-def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_key: str) -> None:
-    """
-    Checks if an evaluation run name already exists for a given project.
-
-    Args:
-        eval_name (str): Name of the evaluation run
-        project_name (str): Name of the project
-        judgment_api_key (str): API key for authentication
-
-    Raises:
-        ValueError: If the evaluation run name already exists
-        JudgmentAPIError: If there's an API error during the check
-    """
-    try:
-        response = requests.post(
-            f"{ROOT_API}/eval-run-name-exists/",
-            json={
-                "eval_name": eval_name,
-                "project_name": project_name,
-                "judgment_api_key": judgment_api_key,
-            }
-        )
-
-        if response.status_code == 409:
-            error(f"Evaluation run name '{eval_name}' already exists for this project")
-            raise ValueError(f"Evaluation run name '{eval_name}' already exists for this project")
-
-        if not response.ok:
-            response_data = response.json()
-            error_message = response_data.get('detail', 'An unknown error occurred.')
-            error(f"Error checking eval run name: {error_message}")
-            raise JudgmentAPIError(error_message)
-
-    except requests.exceptions.RequestException as e:
-        error(f"Failed to check if eval run name exists: {str(e)}")
-        raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
-
-
-def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run: EvaluationRun) -> None:
-    """
-    Logs evaluation results to the Judgment API database.
-
-    Args:
-        merged_results (List[ScoringResult]): The results to log
-        evaluation_run (EvaluationRun): The evaluation run containing project info and API key
-
-    Raises:
-        JudgmentAPIError: If there's an API error during logging
-        ValueError: If there's a validation error with the results
-    """
-    try:
-        res = requests.post(
-            JUDGMENT_EVAL_LOG_API_URL,
-            json={
-                "results": [result.to_dict() for result in merged_results],
-                "judgment_api_key": evaluation_run.judgment_api_key,
-                "project_name": evaluation_run.project_name,
-                "eval_name": evaluation_run.eval_name,
-            }
-        )
-
-        if not res.ok:
-            response_data = res.json()
-            error_message = response_data.get('detail', 'An unknown error occurred.')
-            error(f"Error {res.status_code}: {error_message}")
-            raise JudgmentAPIError(error_message)
-
-        if "ui_results_url" in res.json():
-            rprint(f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)]{res.json()['ui_results_url']}[/]\n")
-
-    except requests.exceptions.RequestException as e:
-        error(f"Request failed while saving evaluation results to DB: {str(e)}")
-        raise JudgmentAPIError(f"Request failed while saving evaluation results to DB: {str(e)}")
-    except Exception as e:
-        error(f"Failed to save evaluation results to DB: {str(e)}")
-        raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
-
-
-def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
-    """
-    Executes an evaluation of `Example`s using one or more `Scorer`s
-
-    Args:
-        evaluation_run (EvaluationRun): Stores example and evaluation together for running
-
-        Args:
-            project_name (str): The name of the project the evaluation results belong to
-            eval_name (str): The name of the evaluation run
-            examples (List[Example]): The examples to evaluate
-            scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
-            model (str): The model used as a judge when using LLM as a Judge
-            aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
-            metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
-            judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
-            log_results (bool): Whether to log the results to the Judgment API
-
-
-    Returns:
-        List[ScoringResult]: The results of the evaluation. Each result is a dictionary containing the fields of a `ScoringResult` object.
-    """
-
-    # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and evaluation_run.log_results:
-        check_eval_run_name_exists(
-            evaluation_run.eval_name,
-            evaluation_run.project_name,
-            evaluation_run.judgment_api_key
-        )
-
-    # Set example IDs if not already set
-    debug("Initializing examples with IDs and timestamps")
-    for idx, example in enumerate(evaluation_run.examples):
-        if example.example_id is None:
-            example.example_id = idx
-            debug(f"Set example ID {idx} for input: {example.input[:50]}...")
-        example.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        with example_logging_context(example.timestamp, example.example_id):
-            debug(f"Initialized example {example.example_id}")
-            debug(f"Input: {example.input}")
-            debug(f"Actual output: {example.actual_output}")
-            if example.expected_output:
-                debug(f"Expected output: {example.expected_output}")
-            if example.context:
-                debug(f"Context: {example.context}")
-            if example.retrieval_context:
-                debug(f"Retrieval context: {example.retrieval_context}")
-            if example.additional_metadata:
-                debug(f"Additional metadata: {example.additional_metadata}")
-            if example.tools_called:
-                debug(f"Tools called: {example.tools_called}")
-            if example.expected_tools:
-                debug(f"Expected tools: {example.expected_tools}")
-
-    debug(f"Starting evaluation run with {len(evaluation_run.examples)} examples")
-
-    # Group APIJudgmentScorers and JudgevalScorers, then evaluate them in parallel
-    debug("Grouping scorers by type")
-    judgment_scorers: List[APIJudgmentScorer] = []
-    local_scorers: List[JudgevalScorer] = []
-    for scorer in evaluation_run.scorers:
-        if isinstance(scorer, (APIJudgmentScorer, ClassifierScorer)):
-            judgment_scorers.append(scorer)
-            debug(f"Added judgment scorer: {type(scorer).__name__}")
-        else:
-            local_scorers.append(scorer)
-            debug(f"Added local scorer: {type(scorer).__name__}")
-
-    debug(f"Found {len(judgment_scorers)} judgment scorers and {len(local_scorers)} local scorers")
-
-    api_results: List[ScoringResult] = []
-    local_results: List[ScoringResult] = []
-
-    # Execute evaluation using Judgment API
-    if judgment_scorers:
-        info("Starting API evaluation")
-        debug(f"Creating API evaluation run with {len(judgment_scorers)} scorers")
-        try:  # execute an EvaluationRun with just JudgmentScorers
-            api_evaluation_run: EvaluationRun = EvaluationRun(
-                eval_name=evaluation_run.eval_name,
-                project_name=evaluation_run.project_name,
-                examples=evaluation_run.examples,
-                scorers=judgment_scorers,
-                model=evaluation_run.model,
-                aggregator=evaluation_run.aggregator,
-                metadata=evaluation_run.metadata,
-                judgment_api_key=evaluation_run.judgment_api_key,
-                log_results=evaluation_run.log_results
-            )
-            debug("Sending request to Judgment API")
-            response_data: List[Dict] = execute_api_eval(api_evaluation_run)  # Dicts are `ScoringResult` objs
-            info(f"Received {len(response_data['results'])} results from API")
-        except JudgmentAPIError as e:
-            error(f"An error occurred while executing the Judgment API request: {str(e)}")
-            raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
-        except ValueError as e:
-            raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: {str(e)}")
-
-        # Convert the response data to `ScoringResult` objects
-        debug("Processing API results")
-        for idx, result in enumerate(response_data["results"]):
-            with example_logging_context(evaluation_run.examples[idx].timestamp, evaluation_run.examples[idx].example_id):
-                for scorer in judgment_scorers:
-                    debug(f"Processing API result for example {idx} and scorer {scorer.score_type}")
-            # filter for key-value pairs that are used to initialize ScoringResult
-            # there may be some stuff in here that doesn't belong in ScoringResult
-            # TODO: come back and refactor this to have ScoringResult take in **kwargs
-            filtered_result = {k: v for k, v in result.items() if k in ScoringResult.__annotations__}
-
-            # Convert scorers_data dicts to ScorerData objects
-            if "scorers_data" in filtered_result and filtered_result["scorers_data"]:
-                filtered_result["scorers_data"] = [
-                    ScorerData(**scorer_dict)
-                    for scorer_dict in filtered_result["scorers_data"]
-                ]
-
-            api_results.append(ScoringResult(**filtered_result))
-
-    # Run local evals
-    if local_scorers:  # List[JudgevalScorer]
-        info("Starting local evaluation")
-        for example in evaluation_run.examples:
-            with example_logging_context(example.timestamp, example.example_id):
-                debug(f"Processing example {example.example_id}: {example.input}")
-
-        results: List[ScoringResult] = asyncio.run(
-            a_execute_scoring(
-                evaluation_run.examples,
-                local_scorers,
-                model=evaluation_run.model,
-                ignore_errors=True,
-                skip_on_missing_params=True,
-                show_indicator=True,
-                _use_bar_indicator=True,
-                throttle_value=0,
-                max_concurrent=100,
-            )
-        )
-        local_results = results
-        info(f"Local evaluation complete with {len(local_results)} results")
-
-    # Aggregate the ScorerData from the API and local evaluations
-    debug("Merging API and local results")
-    merged_results: List[ScoringResult] = merge_results(api_results, local_results)
-    merged_results = check_missing_scorer_data(merged_results)
-
-    info(f"Successfully merged {len(merged_results)} results")
-
-    if evaluation_run.log_results:
-        log_evaluation_results(merged_results, evaluation_run)
-
-    for i, result in enumerate(merged_results):
-        if not result.scorers_data:  # none of the scorers could be executed on this example
-            info(f"None of the scorers could be executed on example {i}. This is usually because the Example is missing the fields needed by the scorers. Try checking that the Example has the necessary fields for your scorers.")
-    return merged_results
-
-def assert_test(scoring_results: List[ScoringResult]) -> None:
-    """
-    Collects all failed scorers from the scoring results.
-
-    Args:
-        ScoringResults (List[ScoringResult]): List of scoring results to check
-
-    Returns:
-        None. Raises exceptions for any failed test cases.
-    """
-    failed_cases: List[ScorerData] = []
-
-    for result in scoring_results:
-        if not result.success:
-
-            # Create a test case context with all relevant fields
-            test_case = {
-                'input': result.input,
-                'actual_output': result.actual_output,
-                'expected_output': result.expected_output,
-                'context': result.context,
-                'retrieval_context': result.retrieval_context,
-                'additional_metadata': result.additional_metadata,
-                'tools_called': result.tools_called,
-                'expected_tools': result.expected_tools,
-                'eval_run_name': result.eval_run_name,
-                'failed_scorers': []
-            }
-            if result.scorers_data:
-                # If the result was not successful, check each scorer_data
-                for scorer_data in result.scorers_data:
-                    if not scorer_data.success:
-                        test_case['failed_scorers'].append(scorer_data)
-            failed_cases.append(test_case)
-
-    if failed_cases:
-        error_msg = f"The following test cases failed: \n"
-        for fail_case in failed_cases:
-            error_msg += f"\nInput: {fail_case['input']}\n"
-            error_msg += f"Actual Output: {fail_case['actual_output']}\n"
-            error_msg += f"Expected Output: {fail_case['expected_output']}\n"
-            error_msg += f"Context: {fail_case['context']}\n"
-            error_msg += f"Retrieval Context: {fail_case['retrieval_context']}\n"
-            error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
-            error_msg += f"Tools Called: {fail_case['tools_called']}\n"
-            error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
-            error_msg += f"Eval Run Name: {fail_case['eval_run_name']}\n"
-
-            for fail_scorer in fail_case['failed_scorers']:
-
-                error_msg += (
-                    f"\nScorer Name: {fail_scorer.name}\n"
-                    f"Threshold: {fail_scorer.threshold}\n"
-                    f"Success: {fail_scorer.success}\n"
-                    f"Score: {fail_scorer.score}\n"
-                    f"Reason: {fail_scorer.reason}\n"
-                    f"Strict Mode: {fail_scorer.strict_mode}\n"
-                    f"Evaluation Model: {fail_scorer.evaluation_model}\n"
-                    f"Error: {fail_scorer.error}\n"
-                    f"Evaluation Cost: {fail_scorer.evaluation_cost}\n"
-                    f"Verbose Logs: {fail_scorer.verbose_logs}\n"
-                    f"Additional Metadata: {fail_scorer.additional_metadata}\n"
-                )
-            error_msg += "-"*100
-
-        raise AssertionError(error_msg)
-
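For context, a minimal sketch of how this removed 0.0.x module was typically driven: build an EvaluationRun from examples and scorers, hand it to run_eval, and optionally gate tests with assert_test. Only the EvaluationRun fields and the run_eval/assert_test signatures come from the deleted code above; the example payload, run and project names, threshold, and judge model string are hypothetical placeholders.

# Illustrative sketch only; exercises the 0.0.x API removed in this diff.
from judgeval.data import Example
from judgeval.evaluation_run import EvaluationRun
from judgeval.run_evaluation import run_eval, assert_test
from judgeval.scorers.judgeval_scorers.api_scorers.hallucination import HallucinationScorer

example = Example(                                   # hypothetical example payload
    input="What is the capital of France?",
    actual_output="Paris is the capital of France.",
    context=["Paris is the capital of France."],
)

evaluation_run = EvaluationRun(
    eval_name="demo-run",                            # placeholder run/project names
    project_name="demo-project",
    examples=[example],
    scorers=[HallucinationScorer(threshold=0.5)],    # any mix of APIJudgmentScorer / JudgevalScorer
    model="gpt-4o",                                  # assumed judge model string
    log_results=True,
    judgment_api_key="...",                          # elided
)

results = run_eval(evaluation_run, override=False)   # List[ScoringResult], API + local merged
assert_test(results)                                 # raises AssertionError if any scorer failed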
judgeval/scorers/judgeval_scorer.py
DELETED
@@ -1,140 +0,0 @@
-"""
-Judgeval Scorer class
-
-Enables client to create custom scorers that do not fall under any of the ready-made Judgment scorers.
-To create a custom scorer, extend this class and implement the `score_example`, `a_score_example`, and `success_check` methods.
-"""
-
-from typing import Optional, Dict, Union, List
-from abc import abstractmethod
-
-from judgeval.common.logger import debug, info, warning, error
-from judgeval.judges import JudgevalJudge
-from judgeval.judges.utils import create_judge
-
-
-class JudgevalScorer:
-    """
-    Base class for scorers in `judgeval`.
-
-    In practice, you should not implement this class unless you are creating a custom scorer.
-    Judgeval offers 10+ default scorers that you can use out of the box.
-
-    If you want to create a scorer that does not fall under any of the ready-made Judgment scorers,
-    you can create a custom scorer by extending this class.
-    """
-    score_type: str  # name of your new scorer
-    threshold: float  # The threshold to pass a test while using this scorer as a scorer
-    score: Optional[float] = None  # The float score of the scorer run on the test case
-    score_breakdown: Dict = None
-    reason: Optional[str] = None  # The reason for the score when evaluating the test case
-    success: Optional[bool] = None  # Whether the test case passed or failed
-    evaluation_model: Optional[str] = None  # The model used to evaluate the test case
-    strict_mode: bool = False  # Whether to run the scorer in strict mode
-    async_mode: bool = True  # Whether to run the scorer in async mode
-    verbose_mode: bool = True  # Whether to run the scorer in verbose mode
-    include_reason: bool = False  # Whether to include the reason in the output
-    error: Optional[str] = None  # The error message if the scorer failed
-    evaluation_cost: Optional[float] = None  # The cost of running the scorer
-    verbose_logs: Optional[str] = None  # The verbose logs of the scorer
-    additional_metadata: Optional[Dict] = None  # Additional metadata for the scorer
-
-    def __init__(
-        self,
-        score_type: str,
-        threshold: float,
-        score: Optional[float] = None,
-        score_breakdown: Optional[Dict] = None,
-        reason: Optional[str] = None,
-        success: Optional[bool] = None,
-        evaluation_model: Optional[str] = None,
-        strict_mode: bool = False,
-        async_mode: bool = True,
-        verbose_mode: bool = True,
-        include_reason: bool = False,
-        error: Optional[str] = None,
-        evaluation_cost: Optional[float] = None,
-        verbose_logs: Optional[str] = None,
-        additional_metadata: Optional[Dict] = None
-    ):
-        debug(f"Initializing JudgevalScorer with score_type={score_type}, threshold={threshold}")
-        if not 0 <= threshold <= 1:
-            raise ValueError("Threshold must be between 0 and 1")
-        if strict_mode:
-            warning("Strict mode enabled - scoring will be more rigorous")
-        info(f"JudgevalScorer initialized with evaluation_model: {evaluation_model}")
-        self.score_type = score_type
-        self.threshold = threshold
-        self.score = score
-        self.score_breakdown = score_breakdown
-        self.reason = reason
-        self.success = success
-        self.evaluation_model = evaluation_model
-        self.strict_mode = strict_mode
-        self.async_mode = async_mode
-        self.verbose_mode = verbose_mode
-        self.include_reason = include_reason
-        self.error = error
-        self.evaluation_cost = evaluation_cost
-        self.verbose_logs = verbose_logs
-        self.additional_metadata = additional_metadata
-
-    def _add_model(self, model: Optional[Union[str, List[str], JudgevalJudge]] = None):
-        """
-        Adds the evaluation model to the JudgevalScorer instance
-
-        This method is used at eval time
-        """
-        self.model, self.using_native_model = create_judge(model)
-        self.evaluation_model = self.model.get_model_name()
-
-    @abstractmethod
-    def score_example(self, example, *args, **kwargs) -> float:
-        """
-        Measures the score on a single example
-        """
-        warning("Attempting to call unimplemented score_example method")
-        error("score_example method not implemented")
-        raise NotImplementedError("You must implement the `score` method in your custom scorer")
-
-    @abstractmethod
-    async def a_score_example(self, example, *args, **kwargs) -> float:
-        """
-        Asynchronously measures the score on a single example
-        """
-        warning("Attempting to call unimplemented a_score_example method")
-        error("a_score_example method not implemented")
-        raise NotImplementedError("You must implement the `a_score` method in your custom scorer")
-
-    @abstractmethod
-    def _success_check(self) -> bool:
-        """
-        For unit testing, determines whether the test case passes or fails
-        """
-        warning("Attempting to call unimplemented success_check method")
-        error("success_check method not implemented")
-        raise NotImplementedError("You must implement the `passes` method in your custom scorer")
-
-    def __str__(self):
-        debug("Converting JudgevalScorer instance to string representation")
-        if self.error:
-            warning(f"JudgevalScorer contains error: {self.error}")
-        info(f"JudgevalScorer status - success: {self.success}, score: {self.score}")
-        attributes = {
-            "score_type": self.score_type,
-            "threshold": self.threshold,
-            "score": self.score,
-            "score_breakdown": self.score_breakdown,
-            "reason": self.reason,
-            "success": self.success,
-            "evaluation_model": self.evaluation_model,
-            "strict_mode": self.strict_mode,
-            "async_mode": self.async_mode,
-            "verbose_mode": self.verbose_mode,
-            "include_reason": self.include_reason,
-            "error": self.error,
-            "evaluation_cost": self.evaluation_cost,
-            "verbose_logs": self.verbose_logs,
-            "additional_metadata": self.additional_metadata,
-        }
-        return f"JudgevalScorer({attributes})"
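For readers tracing the removed extension point: custom scorers subclassed JudgevalScorer and filled in the three abstract hooks shown above. The sketch below is illustrative only; the length-ratio scoring rule is invented, and only the base-class constructor, the hook names, and the `from judgeval.scorers import JudgevalScorer` import (used by the deleted run_evaluation.py) come from the removed code.

# Illustrative custom scorer against the removed JudgevalScorer base class.
from judgeval.scorers import JudgevalScorer


class LengthRatioScorer(JudgevalScorer):
    """Toy scorer: ratio of shorter to longer output length, passing at or above `threshold`."""

    def __init__(self, threshold: float = 0.5):
        super().__init__(score_type="Length Ratio", threshold=threshold)

    def score_example(self, example, *args, **kwargs) -> float:
        expected = len(example.expected_output or "")
        actual = len(example.actual_output or "")
        # Ratio in [0, 1]; the max(..., 1) guard avoids division by zero.
        self.score = min(expected, actual) / max(expected, actual, 1)
        self.success = self.score >= self.threshold
        return self.score

    async def a_score_example(self, example, *args, **kwargs) -> float:
        # Nothing awaits here; delegate to the sync path.
        return self.score_example(example, *args, **kwargs)

    def _success_check(self) -> bool:
        return bool(self.success)

Because such an instance is not an APIJudgmentScorer, the deleted run_eval above would have grouped it with the local scorers and dispatched it through a_execute_scoring.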
judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py
DELETED
@@ -1,19 +0,0 @@
-"""
-`judgeval` contextual precision scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
-
-
-class ContextualPrecisionScorer(APIJudgmentScorer):
-    def __init__(self, threshold: float):
-        super().__init__(threshold=threshold, score_type=APIScorer.CONTEXTUAL_PRECISION)
-
-    @property
-    def __name__(self):
-        return "Contextual Precision"
judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py
DELETED
@@ -1,19 +0,0 @@
-"""
-`judgeval` contextual recall scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
-
-
-class ContextualRecallScorer(APIJudgmentScorer):
-    def __init__(self, threshold: float):
-        super().__init__(threshold=threshold, score_type=APIScorer.CONTEXTUAL_RECALL)
-
-    @property
-    def __name__(self):
-        return "Contextual Recall"
judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py
DELETED
@@ -1,22 +0,0 @@
-"""
-`judgeval` contextual relevancy scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
-
-
-class ContextualRelevancyScorer(APIJudgmentScorer):
-    """
-    Scorer that checks if the output of a model is relevant to the retrieval context
-    """
-    def __init__(self, threshold: float):
-        super().__init__(threshold=threshold, score_type=APIScorer.CONTEXTUAL_RELEVANCY)
-
-    @property
-    def __name__(self):
-        return "Contextual Relevancy"
judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py
DELETED
@@ -1,19 +0,0 @@
-"""
-`judgeval` hallucination scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
-
-
-class HallucinationScorer(APIJudgmentScorer):
-    def __init__(self, threshold: float):
-        super().__init__(threshold=threshold, score_type=APIScorer.HALLUCINATION)
-
-    @property
-    def __name__(self):
-        return "Hallucination"
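The four scorer modules above were thin wrappers that only bound a threshold to an APIScorer score type; a rough sketch of how they were combined before removal follows (threshold values are arbitrary placeholders).

# Illustrative use of the removed 0.0.x API scorer wrappers.
from judgeval.scorers.judgeval_scorers.api_scorers.contextual_precision import ContextualPrecisionScorer
from judgeval.scorers.judgeval_scorers.api_scorers.contextual_recall import ContextualRecallScorer
from judgeval.scorers.judgeval_scorers.api_scorers.contextual_relevancy import ContextualRelevancyScorer
from judgeval.scorers.judgeval_scorers.api_scorers.hallucination import HallucinationScorer

scorers = [
    ContextualPrecisionScorer(threshold=0.7),   # placeholder thresholds
    ContextualRecallScorer(threshold=0.7),
    ContextualRelevancyScorer(threshold=0.7),
    HallucinationScorer(threshold=0.5),
]
# Each instance only carries (score_type, threshold); the scoring itself ran on the Judgment API,
# so the deleted run_eval grouped these as `judgment_scorers` and sent them in a single API evaluation.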