judgeval 0.0.17__py3-none-any.whl → 0.0.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +1 -3
- judgeval/clients.py +0 -6
- judgeval/common/logger.py +0 -1
- judgeval/common/tracer.py +250 -42
- judgeval/common/utils.py +9 -5
- judgeval/constants.py +6 -1
- judgeval/data/__init__.py +2 -0
- judgeval/data/api_example.py +2 -2
- judgeval/data/datasets/__init__.py +1 -2
- judgeval/data/datasets/dataset.py +4 -5
- judgeval/data/datasets/eval_dataset_client.py +1 -2
- judgeval/data/datasets/utils.py +1 -2
- judgeval/data/example.py +72 -17
- judgeval/data/scorer_data.py +1 -1
- judgeval/evaluation_run.py +2 -2
- judgeval/judges/__init__.py +0 -1
- judgeval/judges/base_judge.py +1 -1
- judgeval/judges/mixture_of_judges.py +7 -2
- judgeval/judgment_client.py +8 -4
- judgeval/rules.py +2 -4
- judgeval/run_evaluation.py +2 -5
- judgeval/scorers/__init__.py +6 -0
- judgeval/scorers/api_scorer.py +12 -6
- judgeval/scorers/base_scorer.py +12 -6
- judgeval/scorers/judgeval_scorer.py +7 -3
- judgeval/scorers/judgeval_scorers/__init__.py +24 -3
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +6 -0
- judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +35 -0
- judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +19 -0
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +19 -0
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +4 -1
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -1
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +7 -6
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py +0 -0
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py +161 -0
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/prompts.py +222 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +1 -8
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +7 -6
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/instruction_adherence.py +232 -0
- judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/prompt.py +102 -0
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +7 -7
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +7 -6
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +1 -2
- judgeval/scorers/prompt_scorer.py +7 -5
- judgeval/scorers/utils.py +1 -1
- {judgeval-0.0.17.dist-info → judgeval-0.0.18.dist-info}/METADATA +1 -1
- {judgeval-0.0.17.dist-info → judgeval-0.0.18.dist-info}/RECORD +56 -48
- /judgeval/data/{datasets/ground_truth.py → ground_truth.py} +0 -0
- {judgeval-0.0.17.dist-info → judgeval-0.0.18.dist-info}/WHEEL +0 -0
- {judgeval-0.0.17.dist-info → judgeval-0.0.18.dist-info}/licenses/LICENSE.md +0 -0
judgeval/evaluation_run.py
CHANGED
@@ -111,7 +111,7 @@ class EvaluationRun(BaseModel):
         # Check if model is string or list of strings
         if isinstance(v, str):
             if v not in ACCEPTABLE_MODELS:
-                raise ValueError(f"Model name {v} not recognized.")
+                raise ValueError(f"Model name {v} not recognized. Please select a valid model name.)")
             return v

         if isinstance(v, list):
@@ -119,7 +119,7 @@ class EvaluationRun(BaseModel):
                raise ValueError("When providing a list of models, all elements must be strings")
            for m in v:
                if m not in ACCEPTABLE_MODELS:
-                    raise ValueError(f"Model name {m} not recognized.")
+                    raise ValueError(f"Model name {m} not recognized. Please select a valid model name.")
            return v
        raise ValueError(f"Model must be one of: string, list of strings, or JudgevalJudge instance. Received type {type(v)}.")

judgeval/judges/__init__.py
CHANGED
judgeval/judges/base_judge.py
CHANGED
judgeval/judges/mixture_of_judges.py
CHANGED
@@ -5,9 +5,14 @@ Enables client to use multiple models to generate responses and then aggregate t
 """
 from judgeval import *
 import pydantic
-from typing import List, Union, Mapping
+from typing import List, Union, Mapping
 from judgeval.judges import JudgevalJudge
-from judgeval.common.utils import
+from judgeval.common.utils import (
+    get_completion_multiple_models,
+    get_chat_completion,
+    aget_completion_multiple_models,
+    aget_chat_completion
+)
 from judgeval.common.logger import debug, error

 def build_dynamic_mixture_prompt(

judgeval/judgment_client.py
CHANGED
@@ -6,17 +6,17 @@ from typing import Optional, List, Dict, Any, Union
 import requests

 from judgeval.constants import ROOT_API
-from judgeval.data.datasets import EvalDataset, EvalDatasetClient
+from judgeval.data.datasets import EvalDataset, EvalDatasetClient
 from judgeval.data import (
     ScoringResult,
-    Example
+    Example,
+    GroundTruthExample
 )
 from judgeval.scorers import (
     APIJudgmentScorer,
     JudgevalScorer,
     ClassifierScorer,
     ScorerWrapper,
-    score,
 )
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
@@ -24,7 +24,11 @@ from judgeval.run_evaluation import (
     assert_test
 )
 from judgeval.judges import JudgevalJudge
-from judgeval.constants import
+from judgeval.constants import (
+    JUDGMENT_EVAL_FETCH_API_URL,
+    JUDGMENT_EVAL_DELETE_API_URL,
+    JUDGMENT_EVAL_DELETE_PROJECT_API_URL
+)
 from judgeval.common.exceptions import JudgmentAPIError
 from pydantic import BaseModel
 from judgeval.rules import Rule

judgeval/rules.py
CHANGED
@@ -5,14 +5,12 @@ Rules system for Judgeval that enables alerts based on metric thresholds.
 from typing import Dict, List, Optional, Union, Any, Set, Tuple
 from pydantic import BaseModel, Field, field_validator, ConfigDict
 from enum import Enum
-from datetime import datetime
 import asyncio
 from concurrent.futures import ThreadPoolExecutor
 import time
-import uuid
+import uuid

-from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
-from judgeval.scorers.judgeval_scorers import ScorerWrapper  # Import from the correct module
+from judgeval.scorers import APIJudgmentScorer, JudgevalScorer, ScorerWrapper

 class AlertStatus(str, Enum):
     """Status of an alert evaluation."""

judgeval/run_evaluation.py
CHANGED
@@ -5,7 +5,6 @@ from datetime import datetime
 from rich import print as rprint

 from judgeval.data import (
-    Example,
     ScorerData,
     ScoringResult
 )
@@ -25,13 +24,11 @@ from judgeval.constants import (
 from judgeval.common.exceptions import JudgmentAPIError
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.common.logger import (
-    enable_logging,
     debug,
     info,
     error,
     example_logging_context
 )
-from judgeval.rules import RulesEngine, Rule, AlertResult, AlertStatus


 def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
@@ -174,8 +171,8 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
     )

     if response.status_code == 409:
-        error(f"
-        raise ValueError(f"
+        error(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name or set the `override` flag to true.")
+        raise ValueError(f"Eval run name '{eval_name}' already exists for this project. Please choose a different name or set the `override` flag to true.")

     if not response.ok:
         response_data = response.json()

judgeval/scorers/__init__.py
CHANGED
@@ -14,6 +14,9 @@ from judgeval.scorers.judgeval_scorers import (
     ScorerWrapper,
     AnswerCorrectnessScorer,
     Text2SQLScorer,
+    ComparisonScorer,
+    InstructionAdherenceScorer,
+    GroundednessScorer,
 )

 __all__ = [
@@ -33,4 +36,7 @@ __all__ = [
     "ScorerWrapper",
     "AnswerCorrectnessScorer",
     "Text2SQLScorer",
+    "ComparisonScorer",
+    "InstructionAdherenceScorer",
+    "GroundednessScorer",
 ]

judgeval/scorers/api_scorer.py
CHANGED
@@ -7,7 +7,7 @@ Scores `Example`s using ready-made Judgment evaluators.
 from pydantic import BaseModel, field_validator
 from judgeval.common.logger import debug, info, warning, error

-from judgeval.constants import APIScorer
+from judgeval.constants import APIScorer, UNBOUNDED_SCORERS


 class APIJudgmentScorer(BaseModel):
@@ -18,17 +18,23 @@ class APIJudgmentScorer(BaseModel):
         score_type (APIScorer): The Judgment metric to use for scoring `Example`s
         threshold (float): A value between 0 and 1 that determines the scoring threshold
     """
-    threshold: float
     score_type: APIScorer
+    threshold: float

     @field_validator('threshold')
-    def validate_threshold(cls, v):
+    def validate_threshold(cls, v, info):
         """
         Validates that the threshold is between 0 and 1 inclusive.
         """
-
-
-
+        score_type = info.data.get('score_type')
+        if score_type in UNBOUNDED_SCORERS:
+            if v < 0:
+                error(f"Threshold for {score_type} must be greater than 0, got: {v}")
+                raise ValueError(f"Threshold for {score_type} must be greater than 0, got: {v}")
+        else:
+            if not 0 <= v <= 1:
+                error(f"Threshold for {score_type} must be between 0 and 1, got: {v}")
+                raise ValueError(f"Threshold for {score_type} must be between 0 and 1, got: {v}")
         return v

     @field_validator('score_type')

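The key to this change is the field reordering plus the extra `info` argument: pydantic validates fields in declaration order, so moving `score_type` above `threshold` makes it available in `info.data` by the time the threshold validator runs. A minimal standalone sketch of the same pattern, using illustrative metric names and bounds rather than judgeval's actual `APIScorer`/`UNBOUNDED_SCORERS` definitions:

```python
from enum import Enum
from pydantic import BaseModel, ValidationError, field_validator

class Metric(str, Enum):
    COMPARISON = "comparison"   # "unbounded": the score counts differences, so any non-negative threshold is valid
    RELEVANCY = "relevancy"     # bounded: the threshold is a fraction between 0 and 1

UNBOUNDED = {Metric.COMPARISON}

class Scorer(BaseModel):
    score_type: Metric   # declared before threshold so it is validated first
    threshold: float

    @field_validator("threshold")
    @classmethod
    def check_threshold(cls, v, info):
        # info.data only holds fields validated so far, hence the field reordering in the diff
        score_type = info.data.get("score_type")
        if score_type in UNBOUNDED:
            if v < 0:
                raise ValueError(f"Threshold for {score_type} must be >= 0, got {v}")
        elif not 0 <= v <= 1:
            raise ValueError(f"Threshold for {score_type} must be between 0 and 1, got {v}")
        return v

print(Scorer(score_type=Metric.COMPARISON, threshold=3))   # accepted: unbounded metric
try:
    Scorer(score_type=Metric.RELEVANCY, threshold=3)        # rejected: bounded metric
except ValidationError as exc:
    print(exc)
```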
judgeval/scorers/base_scorer.py
CHANGED
@@ -7,7 +7,7 @@ Scores `Example`s using ready-made Judgment evaluators.
 from pydantic import BaseModel, field_validator
 from judgeval.common.logger import debug, info, warning, error

-from judgeval.constants import APIScorer
+from judgeval.constants import APIScorer, UNBOUNDED_SCORERS


 class APIJudgmentScorer(BaseModel):
@@ -18,17 +18,23 @@ class APIJudgmentScorer(BaseModel):
         score_type (APIScorer): The Judgment metric to use for scoring `Example`s
         threshold (float): A value between 0 and 1 that determines the scoring threshold
     """
-    threshold: float
     score_type: APIScorer
+    threshold: float

     @field_validator('threshold')
-    def validate_threshold(cls, v):
+    def validate_threshold(cls, v, info):
         """
         Validates that the threshold is between 0 and 1 inclusive.
         """
-
-
-
+        score_type = info.data.get('score_type')
+        if score_type in UNBOUNDED_SCORERS:
+            if v < 0:
+                error(f"Threshold for {score_type} must be greater than 0, got: {v}")
+                raise ValueError(f"Threshold for {score_type} must be greater than 0, got: {v}")
+        else:
+            if not 0 <= v <= 1:
+                error(f"Threshold for {score_type} must be between 0 and 1, got: {v}")
+                raise ValueError(f"Threshold for {score_type} must be between 0 and 1, got: {v}")
         return v

     @field_validator('score_type')

judgeval/scorers/judgeval_scorer.py
CHANGED
@@ -11,7 +11,7 @@ from abc import abstractmethod
 from judgeval.common.logger import debug, info, warning, error
 from judgeval.judges import JudgevalJudge
 from judgeval.judges.utils import create_judge
-
+from judgeval.constants import UNBOUNDED_SCORERS

 class JudgevalScorer:
     """
@@ -58,8 +58,12 @@ class JudgevalScorer:
         additional_metadata: Optional[Dict] = None
     ):
         debug(f"Initializing JudgevalScorer with score_type={score_type}, threshold={threshold}")
-        if
-
+        if score_type in UNBOUNDED_SCORERS:
+            if threshold < 0:
+                raise ValueError(f"Threshold for {score_type} must be greater than 0, got: {threshold}")
+        else:
+            if not 0 <= threshold <= 1:
+                raise ValueError(f"Threshold for {score_type} must be between 0 and 1, got: {threshold}")
         if strict_mode:
             warning("Strict mode enabled - scoring will be more rigorous")
         info(f"JudgevalScorer initialized with evaluation_model: {evaluation_model}")

judgeval/scorers/judgeval_scorers/__init__.py
CHANGED
@@ -1,5 +1,4 @@
 from typing import Type, Optional, Any
-from functools import wraps

 # Import implementations
 from judgeval.scorers.judgeval_scorers.api_scorers import (
@@ -12,7 +11,10 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (
     ContextualPrecisionScorer as APIContextualPrecisionScorer,
     ContextualRecallScorer as APIContextualRecallScorer,
     AnswerRelevancyScorer as APIAnswerRelevancyScorer,
-    AnswerCorrectnessScorer as APIAnswerCorrectnessScorer,
+    AnswerCorrectnessScorer as APIAnswerCorrectnessScorer,
+    ComparisonScorer as APIComparisonScorer,
+    InstructionAdherenceScorer as APIInstructionAdherenceScorer,
+    GroundednessScorer as APIGroundednessScorer,
 )

 from judgeval.scorers.judgeval_scorers.local_implementations import (
@@ -25,7 +27,9 @@ from judgeval.scorers.judgeval_scorers.local_implementations import (
     ToolCorrectnessScorer as LocalToolCorrectnessScorer,
     HallucinationScorer as LocalHallucinationScorer,
     SummarizationScorer as LocalSummarizationScorer,
-    AnswerCorrectnessScorer as LocalAnswerCorrectnessScorer
+    AnswerCorrectnessScorer as LocalAnswerCorrectnessScorer,
+    ComparisonScorer as LocalComparisonScorer,
+    InstructionAdherenceScorer as LocalInstructionAdherenceScorer,
 )

 from judgeval.scorers.judgeval_scorers.classifiers import Text2SQLScorer
@@ -134,6 +138,21 @@ ContextualRecallScorer = ScorerWrapper(
     local_implementation=LocalContextualRecallScorer
 )

+InstructionAdherenceScorer = ScorerWrapper(
+    api_implementation=APIInstructionAdherenceScorer,
+    local_implementation=LocalInstructionAdherenceScorer
+)
+
+def ComparisonScorer(threshold: float, criteria: str, description: str):
+    return ScorerWrapper(
+        api_implementation=APIComparisonScorer,
+        local_implementation=LocalComparisonScorer
+    )(threshold=threshold, criteria=criteria, description=description)
+
+GroundednessScorer = ScorerWrapper(
+    api_implementation=APIGroundednessScorer,
+)
+
 __all__ = [
     "ToolCorrectnessScorer",
     "JSONCorrectnessScorer",
@@ -145,4 +164,6 @@ __all__ = [
     "ContextualRecallScorer",
     "AnswerRelevancyScorer",
     "Text2SQLScorer",
+    "ComparisonScorer",
+    "GroundednessScorer",
 ]

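Note how the three new scorers are wired differently: `InstructionAdherenceScorer` and `GroundednessScorer` are `ScorerWrapper` instances (Groundedness gets no local implementation), while `ComparisonScorer` is a factory function because it needs per-instance `criteria` and `description` arguments. A hedged usage sketch based on the factory signature above; how the returned wrapper behaves downstream (e.g. when passed to an evaluation run) is an assumption, not shown in this diff:

```python
# Sketch only: the call signature comes from the diff above, but the semantics of the
# returned wrapper object are an assumption.
from judgeval.scorers import ComparisonScorer

tone_scorer = ComparisonScorer(
    threshold=2,   # the comparison score counts differences, so this tolerates up to 2
    criteria="tone",
    description="The response should match the reference answer's tone and formality.",
)
```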
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py
CHANGED
@@ -8,6 +8,9 @@ from judgeval.scorers.judgeval_scorers.api_scorers.contextual_precision import C
 from judgeval.scorers.judgeval_scorers.api_scorers.contextual_recall import ContextualRecallScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.answer_relevancy import AnswerRelevancyScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.answer_correctness import AnswerCorrectnessScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.comparison import ComparisonScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import InstructionAdherenceScorer
+from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer

 __all__ = [
     "ToolCorrectnessScorer",
@@ -20,4 +23,7 @@ __all__ = [
     "ContextualRecallScorer",
     "AnswerRelevancyScorer",
     "AnswerCorrectnessScorer",
+    "ComparisonScorer",
+    "InstructionAdherenceScorer",
+    "GroundednessScorer",
 ]

judgeval/scorers/judgeval_scorers/api_scorers/comparison.py
ADDED
@@ -0,0 +1,35 @@
+"""
+`judgeval` comparison scorer
+
+TODO add link to docs page for this scorer
+
+"""
+
+# Internal imports
+from judgeval.scorers.api_scorer import APIJudgmentScorer
+from judgeval.constants import APIScorer
+from typing import Optional, Dict
+
+class ComparisonScorer(APIJudgmentScorer):
+    kwargs: Optional[Dict] = None
+
+    def __init__(self, threshold: float, criteria: str, description: str):
+        super().__init__(threshold=threshold, score_type=APIScorer.COMPARISON)
+        self.kwargs = {"criteria": criteria, "description": description}
+
+    @property
+    def __name__(self):
+        return f"Comparison-{self.kwargs['criteria']}"
+
+    def to_dict(self) -> dict:
+        """
+        Converts the scorer configuration to a dictionary format.
+
+        Returns:
+            dict: A dictionary containing the scorer's configuration
+        """
+        return {
+            "score_type": self.score_type,
+            "threshold": self.threshold,
+            "kwargs": self.kwargs
+        }

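Because `criteria` and `description` are stored in the loose `kwargs` field rather than as declared model fields, they reach the backend through `to_dict()`. A short sketch of the resulting payload, based only on the class above (the printed enum repr and values are illustrative):

```python
from judgeval.scorers.judgeval_scorers.api_scorers.comparison import ComparisonScorer

scorer = ComparisonScorer(
    threshold=1,
    criteria="formatting",
    description="Bullet lists and headers should match the expected output.",
)
print(scorer.__name__)    # Comparison-formatting
print(scorer.to_dict())
# {'score_type': <APIScorer.COMPARISON: ...>, 'threshold': 1.0,
#  'kwargs': {'criteria': 'formatting', 'description': 'Bullet lists and headers should match the expected output.'}}
```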
judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py
ADDED
@@ -0,0 +1,19 @@
+"""
+`judgeval` Groundedness scorer
+
+TODO add link to docs page for this scorer
+
+"""
+
+# Internal imports
+from judgeval.scorers.api_scorer import APIJudgmentScorer
+from judgeval.constants import APIScorer
+
+
+class GroundednessScorer(APIJudgmentScorer):
+    def __init__(self, threshold: float):
+        super().__init__(threshold=threshold, score_type=APIScorer.GROUNDEDNESS)
+
+    @property
+    def __name__(self):
+        return "Groundedness"

judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py
ADDED
@@ -0,0 +1,19 @@
+"""
+`judgeval` instruction adherence scorer
+
+TODO add link to docs page for this scorer
+
+"""
+
+# Internal imports
+from judgeval.scorers.api_scorer import APIJudgmentScorer
+from judgeval.constants import APIScorer
+
+
+class InstructionAdherenceScorer(APIJudgmentScorer):
+    def __init__(self, threshold: float):
+        super().__init__(threshold=threshold, score_type=APIScorer.INSTRUCTION_ADHERENCE)
+
+    @property
+    def __name__(self):
+        return "Instruction Adherence"

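Both of these scorers follow the same minimal pattern: the constructor takes only a threshold and pins the `score_type`, and `__name__` supplies the display name. A brief sketch; the threshold values are illustrative and assume these two metrics are bounded to [0, 1]:

```python
from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import GroundednessScorer
from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import InstructionAdherenceScorer

groundedness = GroundednessScorer(threshold=0.8)       # assumed bounded score in [0, 1]
adherence = InstructionAdherenceScorer(threshold=0.9)
print(groundedness.__name__, "/", adherence.__name__)  # Groundedness / Instruction Adherence
```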
judgeval/scorers/judgeval_scorers/local_implementations/__init__.py
CHANGED
@@ -8,11 +8,13 @@ from judgeval.scorers.judgeval_scorers.local_implementations.tool_correctness.to
 from judgeval.scorers.judgeval_scorers.local_implementations.hallucination.hallucination_scorer import HallucinationScorer
 from judgeval.scorers.judgeval_scorers.local_implementations.summarization.summarization_scorer import SummarizationScorer
 from judgeval.scorers.judgeval_scorers.local_implementations.answer_correctness.answer_correctness_scorer import AnswerCorrectnessScorer
-
+from judgeval.scorers.judgeval_scorers.local_implementations.comparison.comparison_scorer import ComparisonScorer
+from judgeval.scorers.judgeval_scorers.local_implementations.instruction_adherence.instruction_adherence import InstructionAdherenceScorer

 __all__ = [
     "AnswerCorrectnessScorer",
     "AnswerRelevancyScorer",
+    "ComparisonScorer",
     "ContextualPrecisionScorer",
     "ContextualRecallScorer",
     "ContextualRelevancyScorer",
@@ -21,4 +23,5 @@ __all__ = [
     "ToolCorrectnessScorer",
     "HallucinationScorer",
     "SummarizationScorer",
+    "InstructionAdherenceScorer",
 ]

judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py
CHANGED
@@ -1,12 +1,13 @@
 from typing import Optional, List, Union, Tuple

 from judgeval.constants import APIScorer
-from judgeval.scorers.utils import (
-
-
-
-
-
+from judgeval.scorers.utils import (
+    get_or_create_event_loop,
+    scorer_progress_meter,
+    create_verbose_logs,
+    parse_response_json,
+    check_example_params
+)
 from judgeval.scorers import JudgevalScorer
 from judgeval.judges import JudgevalJudge
 from judgeval.judges.utils import create_judge

judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py
File without changes
judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py
ADDED
@@ -0,0 +1,161 @@
+from typing import Optional, Union, List
+from pydantic import BaseModel
+
+from judgeval.constants import APIScorer
+from judgeval.scorers import JudgevalScorer
+from judgeval.judges import JudgevalJudge
+from judgeval.judges.utils import create_judge
+from judgeval.data import Example, ExampleParams
+from judgeval.scorers.utils import (
+    get_or_create_event_loop,
+    scorer_progress_meter,
+    create_verbose_logs,
+    parse_response_json,
+    check_example_params
+)
+from .prompts import ComparisonTemplate
+
+required_params = [
+    ExampleParams.INPUT,
+    ExampleParams.ACTUAL_OUTPUT,
+    ExampleParams.EXPECTED_OUTPUT,
+]
+
+class ComparisonDifference(BaseModel):
+    actual_output_sentence: str
+    expected_output_sentence: str
+    reason: str
+
+class ComparisonDifferences(BaseModel):
+    differences: List[ComparisonDifference]
+
+class ComparisonScorer(JudgevalScorer):
+    def __init__(
+        self,
+        criteria: str,
+        description: str,
+        threshold: float = 1,
+        model: Optional[Union[str, JudgevalJudge]] = None,
+        include_reason: bool = True,
+        async_mode: bool = True,
+        verbose_mode: bool = False,
+    ):
+        super().__init__(
+            score_type=APIScorer.COMPARISON,
+            threshold=threshold,
+            evaluation_model=None,
+            include_reason=include_reason,
+            async_mode=async_mode,
+            verbose_mode=verbose_mode
+        )
+        self.model, self.using_native_model = create_judge(model)
+        self.evaluation_model = self.model.get_model_name()
+        self.criteria = criteria
+        self.description = description
+
+    def score_example(
+        self,
+        example: Example,
+        _show_indicator: bool = True,
+    ) -> float:
+        check_example_params(example, required_params, self)
+
+        with scorer_progress_meter(self, display_meter=_show_indicator):
+            if self.async_mode:
+                loop = get_or_create_event_loop()
+                loop.run_until_complete(
+                    self.a_score_example(
+                        example,
+                        _show_indicator=False
+                    )
+                )
+            else:
+                self.differences = self._find_differences(example)
+                self.score = len(self.differences)
+                self.reason = str(self.differences)
+                self.success = self.score <= self.threshold
+                self.verbose_logs = create_verbose_logs(
+                    self,
+                    steps=[
+                        f"Score: {self.score}\nReason: {self.reason}",
+                    ],
+                )
+
+            return len(self.differences)
+
+    async def a_score_example(
+        self,
+        example: Example,
+        _show_indicator: bool = True
+    ) -> float:
+        check_example_params(example, required_params, self)
+
+        with scorer_progress_meter(
+            self, async_mode=True, display_meter=_show_indicator
+        ):
+            self.differences = self.a_find_differences(example)
+            self.score = len(self.differences)
+            self.reason = str(self.differences)
+            self.success = self.score <= self.threshold
+            self.verbose_logs = create_verbose_logs(
+                self,
+                steps=[
+                    f"Score: {self.score}\nReason: {self.reason}",
+                ],
+            )
+
+            return self.score
+
+    def _find_differences(self, example: Example) -> float:
+        prompt = ComparisonTemplate.find_differences(
+            criteria=self.criteria,
+            description=self.description,
+            actual_output=example.actual_output,
+            expected_output=example.expected_output
+        )
+        if self.using_native_model:
+            res = self.model.generate(prompt)
+            data = parse_response_json(res, self)
+            return data["differences"]
+        else:
+            try:
+                res: ComparisonDifferences = self.model.generate(prompt, schema=ComparisonDifferences)
+                return res.differences
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = parse_response_json(res, self)
+                return data["differences"]
+
+    async def a_find_differences(self, example: Example) -> float:
+        prompt = ComparisonTemplate.find_differences(
+            criteria=self.criteria,
+            description=self.description,
+            actual_output=example.actual_output,
+            expected_output=example.expected_output
+        )
+        if self.using_native_model:
+            res = await self.model.a_generate(prompt)
+            data = parse_response_json(res, self)
+            return data["differences"]
+        else:
+            try:
+                res: ComparisonDifferences = await self.model.a_generate(prompt, schema=ComparisonDifferences)
+                return res.differences
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = parse_response_json(res, self)
+                return data["differences"]
+
+    def _success_check(self) -> bool:
+        if self.error is not None:
+            self.success = False
+        else:
+            try:
+                self.success = self.score <= self.threshold
+            except:
+                self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        return f"Comparison - {self.criteria}"
