judgeval 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +0 -71
- judgeval/clients.py +14 -3
- judgeval/common/tracer.py +57 -31
- judgeval/constants.py +1 -0
- judgeval/data/__init__.py +2 -1
- judgeval/data/scorer_data.py +2 -2
- judgeval/evaluation_run.py +16 -15
- judgeval/judges/__init__.py +2 -2
- judgeval/judges/base_judge.py +1 -1
- judgeval/judges/litellm_judge.py +2 -2
- judgeval/judges/mixture_of_judges.py +2 -2
- judgeval/judges/together_judge.py +2 -2
- judgeval/judges/utils.py +4 -4
- judgeval/judgment_client.py +67 -15
- judgeval/run_evaluation.py +79 -14
- judgeval/scorers/__init__.py +8 -4
- judgeval/scorers/api_scorer.py +64 -0
- judgeval/scorers/base_scorer.py +3 -2
- judgeval/scorers/exceptions.py +11 -0
- judgeval/scorers/{custom_scorer.py → judgeval_scorer.py} +9 -5
- judgeval/scorers/judgeval_scorers/__init__.py +132 -9
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +23 -0
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +19 -0
- judgeval/scorers/judgeval_scorers/{answer_relevancy.py → api_scorers/answer_relevancy.py} +2 -2
- judgeval/scorers/judgeval_scorers/{contextual_precision.py → api_scorers/contextual_precision.py} +2 -2
- judgeval/scorers/judgeval_scorers/{contextual_recall.py → api_scorers/contextual_recall.py} +2 -2
- judgeval/scorers/judgeval_scorers/{contextual_relevancy.py → api_scorers/contextual_relevancy.py} +2 -2
- judgeval/scorers/judgeval_scorers/{faithfulness.py → api_scorers/faithfulness.py} +2 -2
- judgeval/scorers/judgeval_scorers/{hallucination.py → api_scorers/hallucination.py} +2 -2
- judgeval/scorers/judgeval_scorers/{json_correctness.py → api_scorers/json_correctness.py} +7 -7
- judgeval/scorers/judgeval_scorers/{summarization.py → api_scorers/summarization.py} +2 -2
- judgeval/scorers/judgeval_scorers/{tool_correctness.py → api_scorers/tool_correctness.py} +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +24 -0
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +4 -0
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +272 -0
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +169 -0
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +4 -0
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +292 -0
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +174 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +3 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +259 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +106 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +3 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +249 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +142 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +3 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +240 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +121 -0
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +3 -0
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +318 -0
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +265 -0
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +3 -0
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +258 -0
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +104 -0
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +127 -0
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +3 -0
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +247 -0
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +541 -0
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +3 -0
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +151 -0
- judgeval/scorers/prompt_scorer.py +4 -4
- judgeval/scorers/score.py +14 -14
- judgeval/scorers/utils.py +40 -6
- {judgeval-0.0.3.dist-info → judgeval-0.0.5.dist-info}/METADATA +1 -1
- judgeval-0.0.5.dist-info/RECORD +78 -0
- judgeval-0.0.3.dist-info/RECORD +0 -46
- {judgeval-0.0.3.dist-info → judgeval-0.0.5.dist-info}/WHEEL +0 -0
- {judgeval-0.0.3.dist-info → judgeval-0.0.5.dist-info}/licenses/LICENSE.md +0 -0
judgeval/run_evaluation.py
CHANGED
@@ -10,8 +10,8 @@ from judgeval.data import (
     ScoringResult
 )
 from judgeval.scorers import (
-    …
-    …
+    JudgevalScorer,
+    APIJudgmentScorer,
     ClassifierScorer
 )
 from judgeval.scorers.score import a_execute_scoring
@@ -64,7 +64,7 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
 
 def merge_results(api_results: List[ScoringResult], local_results: List[ScoringResult]) -> List[ScoringResult]:
     """
-    When executing scorers that come from both the Judgment API and …
+    When executing scorers that come from both the Judgment API and local scorers, we're left with
     results for each type of scorer. This function merges the results from the API and local evaluations,
     grouped by example. In particular, we merge the `scorers_data` field of each `ScoringResult` object.
 
@@ -127,6 +127,7 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResul
     )
     return results
 
+
 def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_key: str) -> None:
     """
     Checks if an evaluation run name already exists for a given project.
@@ -164,6 +165,7 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
        error(f"Failed to check if eval run name exists: {str(e)}")
        raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
 
+
 def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run: EvaluationRun) -> None:
     """
     Logs evaluation results to the Judgment API database.
@@ -203,6 +205,7 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
        error(f"Failed to save evaluation results to DB: {str(e)}")
        raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
 
+
 def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s
@@ -214,7 +217,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
        project_name (str): The name of the project the evaluation results belong to
        eval_name (str): The name of the evaluation run
        examples (List[Example]): The examples to evaluate
-       scorers (List[Union[JudgmentScorer, …
+       scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
        model (str): The model used as a judge when using LLM as a Judge
        aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
        metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
@@ -254,19 +257,19 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
 
    debug(f"Starting evaluation run with {len(evaluation_run.examples)} examples")
 
-   # Group …
+   # Group APIJudgmentScorers and JudgevalScorers, then evaluate them in parallel
    debug("Grouping scorers by type")
-   judgment_scorers: List[…
-   …
+   judgment_scorers: List[APIJudgmentScorer] = []
+   local_scorers: List[JudgevalScorer] = []
    for scorer in evaluation_run.scorers:
-       if isinstance(scorer, (…
+       if isinstance(scorer, (APIJudgmentScorer, ClassifierScorer)):
            judgment_scorers.append(scorer)
            debug(f"Added judgment scorer: {type(scorer).__name__}")
        else:
-           …
-           debug(f"Added …
+           local_scorers.append(scorer)
+           debug(f"Added local scorer: {type(scorer).__name__}")
 
-   debug(f"Found {len(judgment_scorers)} judgment scorers and {len(…
+   debug(f"Found {len(judgment_scorers)} judgment scorers and {len(local_scorers)} local scorers")
 
    api_results: List[ScoringResult] = []
    local_results: List[ScoringResult] = []
@@ -288,7 +291,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
                log_results=evaluation_run.log_results
            )
            debug("Sending request to Judgment API")
-           response_data: List[Dict] = execute_api_eval(api_evaluation_run)  # …
+           response_data: List[Dict] = execute_api_eval(api_evaluation_run)  # Dicts are `ScoringResult` objs
            info(f"Received {len(response_data['results'])} results from API")
        except JudgmentAPIError as e:
            error(f"An error occurred while executing the Judgment API request: {str(e)}")
@@ -317,7 +320,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
            api_results.append(ScoringResult(**filtered_result))
 
    # Run local evals
-   if …
+   if local_scorers:  # List[JudgevalScorer]
        info("Starting local evaluation")
        for example in evaluation_run.examples:
            with example_logging_context(example.timestamp, example.example_id):
@@ -326,7 +329,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
        results: List[ScoringResult] = asyncio.run(
            a_execute_scoring(
                evaluation_run.examples,
-               …
+               local_scorers,
                model=evaluation_run.model,
                ignore_errors=True,
                skip_on_missing_params=True,
@@ -353,3 +356,65 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
        if not result.scorers_data:  # none of the scorers could be executed on this example
            info(f"None of the scorers could be executed on example {i}. This is usually because the Example is missing the fields needed by the scorers. Try checking that the Example has the necessary fields for your scorers.")
    return merged_results
+
+def assert_test(scoring_results: List[ScoringResult]) -> None:
+    """
+    Collects all failed scorers from the scoring results.
+
+    Args:
+        ScoringResults (List[ScoringResult]): List of scoring results to check
+
+    Returns:
+        None. Raises exceptions for any failed test cases.
+    """
+    failed_cases: List[ScorerData] = []
+
+    for result in scoring_results:
+        if not result.success:
+
+            # Create a test case context with all relevant fields
+            test_case = {
+                'input': result.input,
+                'actual_output': result.actual_output,
+                'expected_output': result.expected_output,
+                'context': result.context,
+                'retrieval_context': result.retrieval_context,
+                'eval_run_name': result.eval_run_name,
+                'failed_scorers': []
+            }
+            if result.scorers_data:
+                # If the result was not successful, check each scorer_data
+                for scorer_data in result.scorers_data:
+                    if not scorer_data.success:
+                        test_case['failed_scorers'].append(scorer_data)
+            failed_cases.append(test_case)
+
+    if failed_cases:
+        error_msg = f"The following test cases failed: \n"
+        for fail_case in failed_cases:
+            error_msg += f"\nInput: {fail_case['input']}\n"
+            error_msg += f"Actual Output: {fail_case['actual_output']}\n"
+            error_msg += f"Expected Output: {fail_case['expected_output']}\n"
+            error_msg += f"Context: {fail_case['context']}\n"
+            error_msg += f"Retrieval Context: {fail_case['retrieval_context']}\n"
+            error_msg += f"Eval Run Name: {fail_case['eval_run_name']}\n"
+
+            for fail_scorer in fail_case['failed_scorers']:
+
+                error_msg += (
+                    f"\nScorer Name: {fail_scorer.name}\n"
+                    f"Threshold: {fail_scorer.threshold}\n"
+                    f"Success: {fail_scorer.success}\n"
+                    f"Score: {fail_scorer.score}\n"
+                    f"Reason: {fail_scorer.reason}\n"
+                    f"Strict Mode: {fail_scorer.strict_mode}\n"
+                    f"Evaluation Model: {fail_scorer.evaluation_model}\n"
+                    f"Error: {fail_scorer.error}\n"
+                    f"Evaluation Cost: {fail_scorer.evaluation_cost}\n"
+                    f"Verbose Logs: {fail_scorer.verbose_logs}\n"
+                    f"Additional Metadata: {fail_scorer.additional_metadata}\n"
+                )
+            error_msg += "-"*100
+
+        raise AssertionError(error_msg)
+
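To see how the new `assert_test` helper is meant to pair with `run_eval`, here is a rough usage sketch. The field and parameter names follow the diff above, but the exact `Example`/`EvaluationRun` constructor signatures, import locations, and the judge model name are assumptions rather than something this diff confirms:

```python
# Hypothetical usage sketch -- anything marked "assumed" is not confirmed by this diff.
from judgeval.data import Example                    # assumed export location
from judgeval.evaluation_run import EvaluationRun    # assumed export location
from judgeval.run_evaluation import run_eval, assert_test
from judgeval.scorers.judgeval_scorers.api_scorers import FaithfulnessScorer

example = Example(
    input="What is the capital of France?",
    actual_output="Paris is the capital of France.",
    retrieval_context=["Paris is the capital and largest city of France."],
)

run = EvaluationRun(
    project_name="demo-project",
    eval_name="smoke-test",
    examples=[example],
    scorers=[FaithfulnessScorer(threshold=0.7)],  # APIJudgmentScorer subclass -> routed to the Judgment API
    model="gpt-4o",                               # judge model name is an assumption
    log_results=False,
)

results = run_eval(run)  # List[ScoringResult], merged from API and local scorers
assert_test(results)     # raises AssertionError describing every failed scorer, if any
```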
judgeval/scorers/__init__.py
CHANGED
@@ -1,5 +1,5 @@
-from judgeval.scorers.…
-from judgeval.scorers.…
+from judgeval.scorers.api_scorer import APIJudgmentScorer
+from judgeval.scorers.judgeval_scorer import JudgevalScorer
 from judgeval.scorers.prompt_scorer import PromptScorer, ClassifierScorer
 from judgeval.scorers.judgeval_scorers import (
     ToolCorrectnessScorer,
@@ -11,11 +11,13 @@ from judgeval.scorers.judgeval_scorers import (
     ContextualPrecisionScorer,
     ContextualRecallScorer,
     AnswerRelevancyScorer,
+    ScorerWrapper,
+    AnswerCorrectnessScorer,
 )
 
 __all__ = [
-    "…
-    "…
+    "APIJudgmentScorer",
+    "JudgevalScorer",
     "PromptScorer",
     "ClassifierScorer",
     "ToolCorrectnessScorer",
@@ -27,4 +29,6 @@ __all__ = [
     "ContextualPrecisionScorer",
     "ContextualRecallScorer",
     "AnswerRelevancyScorer",
+    "ScorerWrapper",
+    "AnswerCorrectnessScorer",
 ]
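For anyone upgrading from 0.0.3, the top-level import surface changes with this file. A short sketch of the new names follows (the 0.0.3 names these replace are truncated in the diff above, so they are not restated here):

```python
# New names exported by judgeval.scorers in 0.0.5, per the __all__ list above.
from judgeval.scorers import (
    APIJudgmentScorer,        # ready-made scorers executed through the Judgment API
    JudgevalScorer,           # base class for custom, locally executed scorers
    ScorerWrapper,            # lazily resolves to an API or local implementation
    AnswerCorrectnessScorer,  # newly exported scorer
)
```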
judgeval/scorers/api_scorer.py
ADDED
@@ -0,0 +1,64 @@
+"""
+Judgment Scorer class.
+
+Scores `Example`s using ready-made Judgment evaluators.
+"""
+
+from pydantic import BaseModel, field_validator
+from judgeval.common.logger import debug, info, warning, error
+
+from judgeval.constants import APIScorer
+
+
+class APIJudgmentScorer(BaseModel):
+    """
+    Class for ready-made, "out-of-the-box" scorer that uses Judgment evaluators to score `Example`s.
+
+    Args:
+        score_type (APIScorer): The Judgment metric to use for scoring `Example`s
+        threshold (float): A value between 0 and 1 that determines the scoring threshold
+    """
+    threshold: float
+    score_type: APIScorer
+
+    @field_validator('threshold')
+    def validate_threshold(cls, v):
+        """
+        Validates that the threshold is between 0 and 1 inclusive.
+        """
+        if not 0 <= v <= 1:
+            error(f"Threshold must be between 0 and 1, got: {v}")
+            raise ValueError(f"Threshold must be between 0 and 1, got: {v}")
+        return v
+
+    @field_validator('score_type')
+    def convert_to_enum_value(cls, v):
+        """
+        Validates that the `score_type` is a valid `JudgmentMetric` enum value.
+        Converts string values to `JudgmentMetric` enum values.
+        """
+        debug(f"Attempting to convert score_type value: {v}")
+        if isinstance(v, APIScorer):
+            info(f"Using existing JudgmentMetric: {v.value}")
+            return v.value
+        elif isinstance(v, str):
+            debug(f"Converting string value to JudgmentMetric enum: {v}")
+            return APIScorer[v.upper()].value
+        error(f"Invalid score_type value: {v}")
+        raise ValueError(f"Invalid value for score_type: {v}")
+
+    def __str__(self):
+        return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"
+
+    def to_dict(self) -> dict:
+        """
+        Converts the scorer configuration to a dictionary format.
+
+        Returns:
+            dict: A dictionary containing the scorer's configuration
+        """
+        return {
+            "score_type": self.score_type,
+            "threshold": self.threshold
+        }
+
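As a quick illustration of how the new `APIJudgmentScorer` base class behaves, here is a hedged sketch based only on the validators shown above; the underlying string values of the `APIScorer` enum members are not visible in this diff:

```python
# Sketch based on api_scorer.py above; APIScorer.FAITHFULNESS is used elsewhere in this
# diff, but its enum value (and therefore the printed strings) is an assumption.
from judgeval.constants import APIScorer
from judgeval.scorers.api_scorer import APIJudgmentScorer

scorer = APIJudgmentScorer(threshold=0.8, score_type=APIScorer.FAITHFULNESS)
print(scorer.to_dict())  # {'score_type': <enum value>, 'threshold': 0.8}

# Strings are upper-cased and looked up on the APIScorer enum by the field validator:
scorer = APIJudgmentScorer(threshold=0.5, score_type="faithfulness")

# Thresholds outside [0, 1] are rejected (pydantic wraps the ValueError in a ValidationError):
try:
    APIJudgmentScorer(threshold=1.5, score_type="faithfulness")
except Exception as exc:
    print(exc)
```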
judgeval/scorers/base_scorer.py
CHANGED
@@ -10,7 +10,7 @@ from judgeval.common.logger import debug, info, warning, error
 from judgeval.constants import APIScorer
 
 
-class JudgmentScorer(BaseModel):
+class APIJudgmentScorer(BaseModel):
     """
     Class for ready-made, "out-of-the-box" scorer that uses Judgment evaluators to score `Example`s.
 
@@ -48,4 +48,5 @@ class JudgmentScorer(BaseModel):
         raise ValueError(f"Invalid value for score_type: {v}")
 
     def __str__(self):
-        return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"
+        return f"JudgmentScorer(score_type={self.score_type}, threshold={self.threshold})"
+
judgeval/scorers/{custom_scorer.py → judgeval_scorer.py}
RENAMED
@@ -9,15 +9,19 @@ from typing import Optional, Dict, Union, List
 from abc import abstractmethod
 
 from judgeval.common.logger import debug, info, warning, error
-from judgeval.judges import …
+from judgeval.judges import JudgevalJudge
 from judgeval.judges.utils import create_judge
 
 
-class CustomScorer:
+class JudgevalScorer:
     """
+    Base class for scorers in `judgeval`.
+
+    In practice, you should not implement this class unless you are creating a custom scorer.
+    Judgeval offers 10+ default scorers that you can use out of the box.
+
     If you want to create a scorer that does not fall under any of the ready-made Judgment scorers,
-    you can create a custom scorer by extending this class.…
-    where none of Judgment's scorers are suitable.
+    you can create a custom scorer by extending this class.
     """
     score_type: str  # name of your new scorer
     threshold: float  # The threshold to pass a test while using this scorer as a scorer
@@ -73,7 +77,7 @@ class CustomScorer:
         self.verbose_logs = verbose_logs
         self.additional_metadata = additional_metadata
 
-    def _add_model(self, model: Optional[Union[str, List[str], …
+    def _add_model(self, model: Optional[Union[str, List[str], JudgevalJudge]] = None):
         """
         Adds the evaluation model to the CustomScorer instance
 
judgeval/scorers/judgeval_scorers/__init__.py
CHANGED
@@ -1,12 +1,135 @@
-from …
-from …
-
-
-from judgeval.scorers.judgeval_scorers.…
-
-
-
-
+from typing import Type, Optional, Any
+from functools import wraps
+
+# Import implementations
+from judgeval.scorers.judgeval_scorers.api_scorers import (
+    ToolCorrectnessScorer as APIToolCorrectnessScorer,
+    JSONCorrectnessScorer as APIJSONCorrectnessScorer,
+    SummarizationScorer as APISummarizationScorer,
+    HallucinationScorer as APIHallucinationScorer,
+    FaithfulnessScorer as APIFaithfulnessScorer,
+    ContextualRelevancyScorer as APIContextualRelevancyScorer,
+    ContextualPrecisionScorer as APIContextualPrecisionScorer,
+    ContextualRecallScorer as APIContextualRecallScorer,
+    AnswerRelevancyScorer as APIAnswerRelevancyScorer,
+    AnswerCorrectnessScorer as APIAnswerCorrectnessScorer,
+)
+
+from judgeval.scorers.judgeval_scorers.local_implementations import (
+    AnswerRelevancyScorer as LocalAnswerRelevancyScorer,
+    ContextualPrecisionScorer as LocalContextualPrecisionScorer,
+    ContextualRecallScorer as LocalContextualRecallScorer,
+    ContextualRelevancyScorer as LocalContextualRelevancyScorer,
+    FaithfulnessScorer as LocalFaithfulnessScorer,
+    JsonCorrectnessScorer as LocalJsonCorrectnessScorer,
+    ToolCorrectnessScorer as LocalToolCorrectnessScorer,
+    HallucinationScorer as LocalHallucinationScorer,
+    SummarizationScorer as LocalSummarizationScorer,
+    AnswerCorrectnessScorer as LocalAnswerCorrectnessScorer
+)
+
+class ScorerWrapper:
+    """
+    Wrapper class that can dynamically load either API or local implementation of a scorer.
+    """
+    def __init__(self, api_implementation: Type, local_implementation: Optional[Type] = None):
+        self.api_implementation = api_implementation
+        self.local_implementation = local_implementation
+        self._instance = None
+        self._init_args = None
+        self._init_kwargs = None
+
+    def __call__(self, *args, **kwargs):
+        """Store initialization arguments for later use when implementation is loaded"""
+        self._init_args = args
+        self._init_kwargs = kwargs
+        return self
+
+    def load_implementation(self, use_judgment: bool = True) -> Any:
+        """
+        Load the appropriate implementation based on the use_judgment flag.
+
+        Args:
+            use_judgment (bool): If True, use API implementation. If False, use local implementation.
+
+        Returns:
+            Instance of the appropriate implementation
+
+        Raises:
+            ValueError: If local implementation is requested but not available
+        """
+        if self._instance is not None:
+            return self._instance
+
+        if use_judgment:
+            implementation = self.api_implementation
+        else:
+            if self.local_implementation is None:
+                raise ValueError("No local implementation available for this scorer")
+            implementation = self.local_implementation
+
+        args = self._init_args or ()
+        kwargs = self._init_kwargs or {}
+        self._instance = implementation(*args, **kwargs)
+        return self._instance
+
+    def __getattr__(self, name):
+        """Defer all attribute access to the loaded implementation"""
+        if self._instance is None:
+            raise RuntimeError("Implementation not loaded. Call load_implementation() first")
+        return getattr(self._instance, name)
+
+# Create wrapped versions of all scorers
+
+AnswerCorrectnessScorer = ScorerWrapper(
+    api_implementation=APIAnswerCorrectnessScorer,
+    local_implementation=LocalAnswerCorrectnessScorer
+)
+
+AnswerRelevancyScorer = ScorerWrapper(
+    api_implementation=APIAnswerRelevancyScorer,
+    local_implementation=LocalAnswerRelevancyScorer
+)
+
+ToolCorrectnessScorer = ScorerWrapper(
+    api_implementation=APIToolCorrectnessScorer,
+    local_implementation=LocalToolCorrectnessScorer
+)
+
+JSONCorrectnessScorer = ScorerWrapper(
+    api_implementation=APIJSONCorrectnessScorer,
+    local_implementation=LocalJsonCorrectnessScorer
+)
+
+SummarizationScorer = ScorerWrapper(
+    api_implementation=APISummarizationScorer,
+    local_implementation=LocalSummarizationScorer
+)
+
+HallucinationScorer = ScorerWrapper(
+    api_implementation=APIHallucinationScorer,
+    local_implementation=LocalHallucinationScorer
+)
+
+FaithfulnessScorer = ScorerWrapper(
+    api_implementation=APIFaithfulnessScorer,
+    local_implementation=LocalFaithfulnessScorer
+)
+
+ContextualRelevancyScorer = ScorerWrapper(
+    api_implementation=APIContextualRelevancyScorer,
+    local_implementation=LocalContextualRelevancyScorer
+)
+
+ContextualPrecisionScorer = ScorerWrapper(
+    api_implementation=APIContextualPrecisionScorer,
+    local_implementation=LocalContextualPrecisionScorer
+)
+
+ContextualRecallScorer = ScorerWrapper(
+    api_implementation=APIContextualRecallScorer,
+    local_implementation=LocalContextualRecallScorer
+)
 
 __all__ = [
     "ToolCorrectnessScorer",
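To make the `ScorerWrapper` indirection concrete, a small sketch of the intended flow follows. Names like `FaithfulnessScorer` are now module-level wrapper instances, so calling them only records constructor arguments; this example is illustrative and not part of the diff:

```python
# Usage sketch for ScorerWrapper as defined above; not part of the package diff.
from judgeval.scorers.judgeval_scorers import FaithfulnessScorer

wrapper = FaithfulnessScorer(threshold=0.7)  # __call__ only stores the init args
impl = wrapper.load_implementation()         # use_judgment=True -> API implementation
# wrapper.load_implementation(use_judgment=False) would build the local
# implementation instead, when one is registered.

print(impl.threshold)     # 0.7
print(wrapper.threshold)  # forwarded to the loaded instance via __getattr__

# The built instance is cached on the wrapper, so later load_implementation()
# calls return the same object regardless of the flag.
```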
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py
ADDED
@@ -0,0 +1,23 @@
+from judgeval.scorers.judgeval_scorers.api_scorers.tool_correctness import ToolCorrectnessScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.json_correctness import JSONCorrectnessScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.summarization import SummarizationScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.hallucination import HallucinationScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.faithfulness import FaithfulnessScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.contextual_relevancy import ContextualRelevancyScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.contextual_precision import ContextualPrecisionScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.contextual_recall import ContextualRecallScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.answer_relevancy import AnswerRelevancyScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.answer_correctness import AnswerCorrectnessScorer
+
+__all__ = [
+    "ToolCorrectnessScorer",
+    "JSONCorrectnessScorer",
+    "SummarizationScorer",
+    "HallucinationScorer",
+    "FaithfulnessScorer",
+    "ContextualRelevancyScorer",
+    "ContextualPrecisionScorer",
+    "ContextualRecallScorer",
+    "AnswerRelevancyScorer",
+    "AnswerCorrectnessScorer",
+]
judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py
ADDED
@@ -0,0 +1,19 @@
+"""
+`judgeval` answer relevancy scorer
+
+TODO add link to docs page for this scorer
+
+"""
+
+# Internal imports
+from judgeval.scorers.api_scorer import APIJudgmentScorer
+from judgeval.constants import APIScorer
+
+
+class AnswerCorrectnessScorer(APIJudgmentScorer):
+    def __init__(self, threshold: float):
+        super().__init__(threshold=threshold, score_type=APIScorer.ANSWER_CORRECTNESS)
+
+    @property
+    def __name__(self):
+        return "Answer Correctness"
judgeval/scorers/judgeval_scorers/{answer_relevancy.py → api_scorers/answer_relevancy.py}
RENAMED
@@ -6,11 +6,11 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.…
+from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 
 
-class AnswerRelevancyScorer(…
+class AnswerRelevancyScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(threshold=threshold, score_type=APIScorer.ANSWER_RELEVANCY)
 
judgeval/scorers/judgeval_scorers/{contextual_precision.py → api_scorers/contextual_precision.py}
RENAMED
@@ -6,11 +6,11 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.…
+from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 
 
-class ContextualPrecisionScorer(…
+class ContextualPrecisionScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(threshold=threshold, score_type=APIScorer.CONTEXTUAL_PRECISION)
 
judgeval/scorers/judgeval_scorers/{contextual_recall.py → api_scorers/contextual_recall.py}
RENAMED
@@ -6,11 +6,11 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.…
+from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 
 
-class ContextualRecallScorer(…
+class ContextualRecallScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(threshold=threshold, score_type=APIScorer.CONTEXTUAL_RECALL)
 
judgeval/scorers/judgeval_scorers/{contextual_relevancy.py → api_scorers/contextual_relevancy.py}
RENAMED
@@ -6,11 +6,11 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.…
+from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 
 
-class ContextualRelevancyScorer(…
+class ContextualRelevancyScorer(APIJudgmentScorer):
     """
     Scorer that checks if the output of a model is relevant to the retrieval context
     """
judgeval/scorers/judgeval_scorers/{faithfulness.py → api_scorers/faithfulness.py}
RENAMED
@@ -6,11 +6,11 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.…
+from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 
 
-class FaithfulnessScorer(…
+class FaithfulnessScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(threshold=threshold, score_type=APIScorer.FAITHFULNESS)
 
judgeval/scorers/judgeval_scorers/{hallucination.py → api_scorers/hallucination.py}
RENAMED
@@ -6,11 +6,11 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.…
+from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 
 
-class HallucinationScorer(…
+class HallucinationScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(threshold=threshold, score_type=APIScorer.HALLUCINATION)
 
judgeval/scorers/judgeval_scorers/{json_correctness.py → api_scorers/json_correctness.py}
RENAMED
@@ -9,23 +9,23 @@ TODO add link to docs page for this scorer
 # External imports
 from pydantic import BaseModel, Field
 # Internal imports
-from judgeval.scorers.…
+from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 
 
-class JSONCorrectnessScorer(…
+class JSONCorrectnessScorer(APIJudgmentScorer):
     json_schema: BaseModel = Field(None, exclude=True)
 
     def __init__(self, threshold: float, json_schema: BaseModel):
         super().__init__(threshold=threshold, score_type=APIScorer.JSON_CORRECTNESS)
         object.__setattr__(self, 'json_schema', json_schema)
-
+
     def to_dict(self):
-        …
-        …
-        "…
-        "kwargs": {"json_schema": self.json_schema.model_json_schema()}
+        base_dict = super().to_dict()  # Get the parent class's dictionary
+        base_dict["kwargs"] = {
+            "json_schema": self.json_schema.model_json_schema()
         }
+        return base_dict
 
     @property
     def __name__(self):
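A sketch of what the reworked `to_dict()` now produces for the JSON correctness scorer. The schema class below is made up for illustration; the scorer stores the schema via `object.__setattr__`, so whether a model class or an instance is expected is not pinned down by this diff:

```python
# Illustrative only; JSONCorrectnessScorer comes from this diff, CityAnswer is invented.
from pydantic import BaseModel
from judgeval.scorers.judgeval_scorers.api_scorers import JSONCorrectnessScorer

class CityAnswer(BaseModel):
    city: str
    population: int

scorer = JSONCorrectnessScorer(
    threshold=1.0,
    json_schema=CityAnswer(city="Paris", population=2_100_000),
)
print(scorer.to_dict())
# -> {'score_type': <enum value>, 'threshold': 1.0,
#     'kwargs': {'json_schema': { ...CityAnswer's JSON schema... }}}
```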
judgeval/scorers/judgeval_scorers/{summarization.py → api_scorers/summarization.py}
RENAMED
@@ -6,11 +6,11 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.…
+from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 
 
-class SummarizationScorer(…
+class SummarizationScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(threshold=threshold, score_type=APIScorer.SUMMARIZATION)
 