judgeval 0.0.44__py3-none-any.whl → 0.0.46__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries. Removed (`-`) lines may appear truncated or blank in this diff view.
- judgeval/__init__.py +5 -4
- judgeval/clients.py +6 -6
- judgeval/common/__init__.py +7 -2
- judgeval/common/exceptions.py +2 -3
- judgeval/common/logger.py +74 -49
- judgeval/common/s3_storage.py +30 -23
- judgeval/common/tracer.py +1273 -939
- judgeval/common/utils.py +416 -244
- judgeval/constants.py +73 -61
- judgeval/data/__init__.py +1 -1
- judgeval/data/custom_example.py +3 -2
- judgeval/data/datasets/dataset.py +80 -54
- judgeval/data/datasets/eval_dataset_client.py +131 -181
- judgeval/data/example.py +67 -43
- judgeval/data/result.py +11 -9
- judgeval/data/scorer_data.py +4 -2
- judgeval/data/tool.py +25 -16
- judgeval/data/trace.py +57 -29
- judgeval/data/trace_run.py +5 -11
- judgeval/evaluation_run.py +22 -82
- judgeval/integrations/langgraph.py +546 -184
- judgeval/judges/base_judge.py +1 -2
- judgeval/judges/litellm_judge.py +33 -11
- judgeval/judges/mixture_of_judges.py +128 -78
- judgeval/judges/together_judge.py +22 -9
- judgeval/judges/utils.py +14 -5
- judgeval/judgment_client.py +259 -271
- judgeval/rules.py +169 -142
- judgeval/run_evaluation.py +462 -305
- judgeval/scorers/api_scorer.py +20 -11
- judgeval/scorers/exceptions.py +1 -0
- judgeval/scorers/judgeval_scorer.py +77 -58
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +46 -15
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +12 -11
- judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +7 -5
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +5 -2
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +2 -1
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +17 -8
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +8 -9
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +5 -5
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +5 -2
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +9 -10
- judgeval/scorers/prompt_scorer.py +48 -37
- judgeval/scorers/score.py +86 -53
- judgeval/scorers/utils.py +11 -7
- judgeval/tracer/__init__.py +1 -1
- judgeval/utils/alerts.py +23 -12
- judgeval/utils/{data_utils.py → file_utils.py} +5 -9
- judgeval/utils/requests.py +29 -0
- judgeval/version_check.py +5 -2
- {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/METADATA +79 -135
- judgeval-0.0.46.dist-info/RECORD +69 -0
- judgeval-0.0.44.dist-info/RECORD +0 -68
- {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/WHEEL +0 -0
- {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/api_scorer.py
CHANGED
```diff
@@ -6,7 +6,7 @@ Scores `Example`s using ready-made Judgment evaluators.
 
 from pydantic import BaseModel, field_validator
 from typing import List
-from judgeval.common.logger import debug, info,
+from judgeval.common.logger import debug, info, error
 from judgeval.data import ExampleParams
 from judgeval.constants import APIScorer, UNBOUNDED_SCORERS
 
@@ -19,27 +19,34 @@ class APIJudgmentScorer(BaseModel):
         score_type (APIScorer): The Judgment metric to use for scoring `Example`s
         threshold (float): A value between 0 and 1 that determines the scoring threshold
     """
+
     score_type: APIScorer
     threshold: float
-    required_params: List[
+    required_params: List[
+        ExampleParams
+    ] = []  # List of the required parameters on examples for the scorer
 
-    @field_validator(
+    @field_validator("threshold")
    def validate_threshold(cls, v, info):
        """
        Validates that the threshold is between 0 and 1 inclusive.
        """
-        score_type = info.data.get(
+        score_type = info.data.get("score_type")
        if score_type in UNBOUNDED_SCORERS:
            if v < 0:
                error(f"Threshold for {score_type} must be greater than 0, got: {v}")
-                raise ValueError(
+                raise ValueError(
+                    f"Threshold for {score_type} must be greater than 0, got: {v}"
+                )
        else:
            if not 0 <= v <= 1:
                error(f"Threshold for {score_type} must be between 0 and 1, got: {v}")
-                raise ValueError(
+                raise ValueError(
+                    f"Threshold for {score_type} must be between 0 and 1, got: {v}"
+                )
        return v
 
-    @field_validator(
+    @field_validator("score_type")
    def convert_to_enum_value(cls, v):
        """
        Validates that the `score_type` is a valid `APIScorer` enum value.
@@ -61,11 +68,13 @@ class APIJudgmentScorer(BaseModel):
    def to_dict(self) -> dict:
        """
        Converts the scorer configuration to a dictionary format.
-
+
        Returns:
            dict: A dictionary containing the scorer's configuration
        """
        return {
-            "score_type": str(
-
-
+            "score_type": str(
+                self.score_type.value
+            ),  # Convert enum to string for serialization
+            "threshold": self.threshold,
+        }
```
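As a rough illustration of the validation above, here is a minimal sketch of how the threshold check surfaces to callers, assuming the subclasses shown later in this diff (e.g. `FaithfulnessScorer`) take a single `threshold` argument as shown; the serialized enum value in the comment is an assumption.

```python
# Sketch: bounded scorers must have a threshold in [0, 1]; only scorers listed
# in UNBOUNDED_SCORERS accept larger values.
from judgeval.scorers.judgeval_scorers.api_scorers.faithfulness import (
    FaithfulnessScorer,
)

scorer = FaithfulnessScorer(threshold=0.8)  # within [0, 1], passes validation
print(scorer.to_dict())  # e.g. {"score_type": "faithfulness", "threshold": 0.8} (enum value assumed)

try:
    FaithfulnessScorer(threshold=1.5)  # bounded scorer, rejects values above 1
except ValueError as err:  # pydantic surfaces the validator's ValueError
    print(err)
```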
judgeval/scorers/judgeval_scorer.py
CHANGED
```diff
@@ -13,21 +13,26 @@ from judgeval.judges import JudgevalJudge
 from judgeval.judges.utils import create_judge
 from judgeval.constants import UNBOUNDED_SCORERS
 from judgeval.data.example import ExampleParams
+
+
 class JudgevalScorer:
     """
     Base class for scorers in `judgeval`.
 
     In practice, you should not implement this class unless you are creating a custom scorer.
     Judgeval offers 10+ default scorers that you can use out of the box.
-
+
     If you want to create a scorer that does not fall under any of the ready-made Judgment scorers,
     you can create a custom scorer by extending this class.
     """
+
     score_type: str  # name of your new scorer
     threshold: float  # The threshold to pass a test while using this scorer as a scorer
     score: Optional[float] = None  # The float score of the scorer run on the test case
-    score_breakdown: Dict = None
-    reason: Optional[str] =
+    score_breakdown: Optional[Dict] = None
+    reason: Optional[str] = (
+        None  # The reason for the score when evaluating the test case
+    )
     success: Optional[bool] = None  # Whether the test case passed or failed
     evaluation_model: Optional[str] = None  # The model used to evaluate the test case
     strict_mode: bool = False  # Whether to run the scorer in strict mode
@@ -39,61 +44,67 @@ class JudgevalScorer:
     evaluation_cost: Optional[float] = None  # The cost of running the scorer
     verbose_logs: Optional[str] = None  # The verbose logs of the scorer
     additional_metadata: Optional[Dict] = None  # Additional metadata for the scorer
-    required_params: Optional[List[ExampleParams]] =
-
-
+    required_params: Optional[List[ExampleParams]] = (
+        None  # The required parameters for the scorer
+    )
 
     def __init__(
-        self,
-        score_type: str,
-        threshold: float,
-        score: Optional[float] = None,
-        score_breakdown: Optional[Dict] = None,
-        reason: Optional[str] = None,
-        success: Optional[bool] = None,
-        evaluation_model: Optional[str] = None,
+        self,
+        score_type: str,
+        threshold: float,
+        score: Optional[float] = None,
+        score_breakdown: Optional[Dict] = None,
+        reason: Optional[str] = None,
+        success: Optional[bool] = None,
+        evaluation_model: Optional[str] = None,
         required_params: Optional[List[ExampleParams]] = None,
-        strict_mode: bool = False,
-        async_mode: bool = True,
-        verbose_mode: bool = True,
-        include_reason: bool = False,
+        strict_mode: bool = False,
+        async_mode: bool = True,
+        verbose_mode: bool = True,
+        include_reason: bool = False,
         custom_example: bool = False,
-        error: Optional[str] = None,
-        evaluation_cost: Optional[float] = None,
-        verbose_logs: Optional[str] = None,
-        additional_metadata: Optional[Dict] = None
-… (old lines 65-92 removed; their content is truncated in this diff view)
+        error: Optional[str] = None,
+        evaluation_cost: Optional[float] = None,
+        verbose_logs: Optional[str] = None,
+        additional_metadata: Optional[Dict] = None,
+    ):
+        debug(
+            f"Initializing JudgevalScorer with score_type={score_type}, threshold={threshold}"
+        )
+        if score_type in UNBOUNDED_SCORERS:
+            if threshold < 0:
+                raise ValueError(
+                    f"Threshold for {score_type} must be greater than 0, got: {threshold}"
+                )
+        else:
+            if not 0 <= threshold <= 1:
+                raise ValueError(
+                    f"Threshold for {score_type} must be between 0 and 1, got: {threshold}"
+                )
+        if strict_mode:
+            warning("Strict mode enabled - scoring will be more rigorous")
+        info(f"JudgevalScorer initialized with evaluation_model: {evaluation_model}")
+        self.score_type = score_type
+        self.threshold = threshold
+        self.score = score
+        self.score_breakdown = score_breakdown
+        self.reason = reason
+        self.success = success
+        self.evaluation_model = evaluation_model
+        self.strict_mode = strict_mode
+        self.async_mode = async_mode
+        self.verbose_mode = verbose_mode
+        self.include_reason = include_reason
+        self.custom_example = custom_example
+        self.error = error
+        self.evaluation_cost = evaluation_cost
+        self.verbose_logs = verbose_logs
+        self.additional_metadata = additional_metadata
+        self.required_params = required_params
 
     def _add_model(self, model: Optional[Union[str, List[str], JudgevalJudge]] = None):
         """
-        Adds the evaluation model to the JudgevalScorer instance
+        Adds the evaluation model to the JudgevalScorer instance
 
         This method is used at eval time
         """
@@ -107,7 +118,9 @@ class JudgevalScorer:
         """
         warning("Attempting to call unimplemented score_example method")
         error("score_example method not implemented")
-        raise NotImplementedError(
+        raise NotImplementedError(
+            "You must implement the `score` method in your custom scorer"
+        )
 
     @abstractmethod
     async def a_score_example(self, example, *args, **kwargs) -> float:
@@ -116,8 +129,10 @@ class JudgevalScorer:
         """
         warning("Attempting to call unimplemented a_score_example method")
         error("a_score_example method not implemented")
-        raise NotImplementedError(
-
+        raise NotImplementedError(
+            "You must implement the `a_score` method in your custom scorer"
+        )
+
     @abstractmethod
     def _success_check(self) -> bool:
         """
@@ -125,7 +140,9 @@ class JudgevalScorer:
         """
         warning("Attempting to call unimplemented success_check method")
         error("_success_check method not implemented")
-        raise NotImplementedError(
+        raise NotImplementedError(
+            "You must implement the `_success_check` method in your custom scorer"
+        )
 
     def __str__(self):
         debug("Converting JudgevalScorer instance to string representation")
@@ -150,9 +167,11 @@ class JudgevalScorer:
             "additional_metadata": self.additional_metadata,
         }
         return f"JudgevalScorer({attributes})"
-
+
     def to_dict(self):
         return {
-            "score_type": str(
-
+            "score_type": str(
+                self.score_type
+            ),  # Convert enum to string for serialization
+            "threshold": self.threshold,
         }
```
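For context on the `NotImplementedError` messages above, here is a minimal sketch of a custom scorer built on this base class; the `example.actual_output` attribute name is an assumption inferred from the `ExampleParams` members used throughout this diff.

```python
# Sketch of a toy custom scorer extending JudgevalScorer.
from judgeval.scorers.judgeval_scorer import JudgevalScorer


class ContainsGreetingScorer(JudgevalScorer):
    """Passes when the model output contains a greeting."""

    def __init__(self, threshold: float = 1.0):
        super().__init__(score_type="Contains Greeting", threshold=threshold)

    def score_example(self, example, *args, **kwargs) -> float:
        # `actual_output` is a hypothetical field name; see ExampleParams.ACTUAL_OUTPUT above.
        self.score = 1.0 if "hello" in example.actual_output.lower() else 0.0
        self.success = self.score >= self.threshold
        return self.score

    async def a_score_example(self, example, *args, **kwargs) -> float:
        return self.score_example(example, *args, **kwargs)

    def _success_check(self) -> bool:
        return bool(self.success)
```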
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py
CHANGED
```diff
@@ -1,20 +1,51 @@
-from judgeval.scorers.judgeval_scorers.api_scorers.execution_order import
-
-
-from judgeval.scorers.judgeval_scorers.api_scorers.
-
-
-from judgeval.scorers.judgeval_scorers.api_scorers.
-
-
-from judgeval.scorers.judgeval_scorers.api_scorers.
+from judgeval.scorers.judgeval_scorers.api_scorers.execution_order import (
+    ExecutionOrderScorer,
+)
+from judgeval.scorers.judgeval_scorers.api_scorers.json_correctness import (
+    JSONCorrectnessScorer,
+)
+from judgeval.scorers.judgeval_scorers.api_scorers.summarization import (
+    SummarizationScorer,
+)
+from judgeval.scorers.judgeval_scorers.api_scorers.hallucination import (
+    HallucinationScorer,
+)
+from judgeval.scorers.judgeval_scorers.api_scorers.faithfulness import (
+    FaithfulnessScorer,
+)
+from judgeval.scorers.judgeval_scorers.api_scorers.contextual_relevancy import (
+    ContextualRelevancyScorer,
+)
+from judgeval.scorers.judgeval_scorers.api_scorers.contextual_precision import (
+    ContextualPrecisionScorer,
+)
+from judgeval.scorers.judgeval_scorers.api_scorers.contextual_recall import (
+    ContextualRecallScorer,
+)
+from judgeval.scorers.judgeval_scorers.api_scorers.answer_relevancy import (
+    AnswerRelevancyScorer,
+)
+from judgeval.scorers.judgeval_scorers.api_scorers.answer_correctness import (
+    AnswerCorrectnessScorer,
+)
 from judgeval.scorers.judgeval_scorers.api_scorers.comparison import ComparisonScorer
-from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import
-
-
+from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import (
+    InstructionAdherenceScorer,
+)
+from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import (
+    GroundednessScorer,
+)
+from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import (
+    DerailmentScorer,
+)
 from judgeval.scorers.judgeval_scorers.api_scorers.tool_order import ToolOrderScorer
-from judgeval.scorers.judgeval_scorers.api_scorers.classifier_scorer import
-
+from judgeval.scorers.judgeval_scorers.api_scorers.classifier_scorer import (
+    ClassifierScorer,
+)
+from judgeval.scorers.judgeval_scorers.api_scorers.tool_dependency import (
+    ToolDependencyScorer,
+)
+
 __all__ = [
     "ExecutionOrderScorer",
     "JSONCorrectnessScorer",
```
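The reformatted imports only change line wrapping (Black-style parentheses); the re-exported names are unchanged. A minimal usage sketch, using module paths taken verbatim from this diff and the single-argument constructors shown below:

```python
# The api_scorers package re-exports every scorer listed in __all__ above.
from judgeval.scorers.judgeval_scorers.api_scorers import (
    AnswerRelevancyScorer,
    FaithfulnessScorer,
)

scorers = [
    AnswerRelevancyScorer(threshold=0.7),
    FaithfulnessScorer(threshold=0.9),
]
```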
judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py
CHANGED
```diff
@@ -10,16 +10,17 @@ from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from judgeval.data import ExampleParams
 
+
 class AnswerCorrectnessScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.ANSWER_CORRECTNESS,
             required_params=[
                 ExampleParams.INPUT,
                 ExampleParams.ACTUAL_OUTPUT,
                 ExampleParams.EXPECTED_OUTPUT,
-            ]
+            ],
         )
 
     @property
```
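A minimal sketch of what the `required_params` list above implies for callers; the `Example` keyword fields are assumptions based on the `ExampleParams` members named in this diff.

```python
# Sketch: this scorer needs input, actual_output, and expected_output on the Example.
from judgeval.data import Example  # assumed re-export of the Example model
from judgeval.scorers.judgeval_scorers.api_scorers.answer_correctness import (
    AnswerCorrectnessScorer,
)

example = Example(
    input="What is the capital of France?",
    actual_output="Paris is the capital of France.",
    expected_output="Paris",
)
scorer = AnswerCorrectnessScorer(threshold=0.7)
```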
judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py
CHANGED
```diff
@@ -10,15 +10,16 @@ from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from judgeval.data import ExampleParams
 
+
 class AnswerRelevancyScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.ANSWER_RELEVANCY,
             required_params=[
                 ExampleParams.INPUT,
                 ExampleParams.ACTUAL_OUTPUT,
-            ]
+            ],
         )
 
     @property
```
judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py
CHANGED
```diff
@@ -1,11 +1,11 @@
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-from typing import List, Mapping, Optional
-
+from typing import List, Mapping, Optional
+
 
 class ClassifierScorer(APIJudgmentScorer):
     """
-    In the Judgment backend, this scorer is implemented as a PromptScorer that takes
+    In the Judgment backend, this scorer is implemented as a PromptScorer that takes
     1. a system role that may involve the Example object
     2. options for scores on the example
 
@@ -14,7 +14,7 @@ class ClassifierScorer(APIJudgmentScorer):
     ex:
     system_role = "You are a judge that evaluates whether the response is positive or negative. The response is: {example.actual_output}"
     options = {"positive": 1, "negative": 0}
-
+
     Args:
         name (str): The name of the scorer
         slug (str): A unique identifier for the scorer
@@ -25,14 +25,15 @@ class ClassifierScorer(APIJudgmentScorer):
         strict_mode (bool): Whether to use strict mode (default: False)
         verbose_mode (bool): Whether to include verbose logging (default: False)
     """
+
     name: Optional[str] = None
     slug: Optional[str] = None
     conversation: Optional[List[dict]] = None
     options: Optional[Mapping[str, float]] = None
     verbose_mode: bool = False
     strict_mode: bool = False
-    include_reason: bool = True
-    async_mode: bool = True
+    include_reason: bool = True
+    async_mode: bool = True
     threshold: float = 0.5
 
     def __init__(
@@ -65,26 +66,26 @@ class ClassifierScorer(APIJudgmentScorer):
         Updates the name of the scorer.
         """
         self.name = name
-
+
     def update_threshold(self, threshold: float):
         """
         Updates the threshold of the scorer.
         """
         self.threshold = threshold
-
+
     def update_conversation(self, conversation: List[dict]):
         """
         Updates the conversation with the new conversation.
-
+
         Sample conversation:
         [{'role': 'system', 'content': "Did the chatbot answer the user's question in a kind way?: {{actual_output}}."}]
         """
         self.conversation = conversation
-
+
     def update_options(self, options: Mapping[str, float]):
         """
         Updates the options with the new options.
-
+
         Sample options:
         {"yes": 1, "no": 0}
         """
```
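A sketch of the `ClassifierScorer` surface visible in this diff. Its full `__init__` signature is not shown here, so constructing it from the pydantic fields by keyword is an assumption; the conversation and options samples come from the docstrings above.

```python
from judgeval.scorers.judgeval_scorers.api_scorers.classifier_scorer import (
    ClassifierScorer,
)

# Assumed keyword construction from the declared fields (name, slug, conversation, options).
scorer = ClassifierScorer(
    name="Kindness Check",
    slug="kindness-check",
    conversation=[
        {
            "role": "system",
            "content": "Did the chatbot answer the user's question in a kind way?: {{actual_output}}.",
        }
    ],
    options={"yes": 1, "no": 0},
)

# The update_* methods are taken verbatim from the diff above.
scorer.update_threshold(0.5)
scorer.update_options({"yes": 1.0, "partially": 0.5, "no": 0.0})
```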
judgeval/scorers/judgeval_scorers/api_scorers/comparison.py
CHANGED
```diff
@@ -10,34 +10,36 @@ from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from typing import Optional, Dict
 from judgeval.data import ExampleParams
+
+
 class ComparisonScorer(APIJudgmentScorer):
     kwargs: Optional[Dict] = None
 
     def __init__(self, threshold: float, criteria: str, description: str):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.COMPARISON,
             required_params=[
                 ExampleParams.INPUT,
                 ExampleParams.ACTUAL_OUTPUT,
                 ExampleParams.EXPECTED_OUTPUT,
-            ]
+            ],
         )
         self.kwargs = {"criteria": criteria, "description": description}
 
     @property
     def __name__(self):
         return f"Comparison-{self.kwargs['criteria']}"
-
+
     def to_dict(self) -> dict:
         """
         Converts the scorer configuration to a dictionary format.
-
+
         Returns:
             dict: A dictionary containing the scorer's configuration
         """
         return {
             "score_type": self.score_type,
             "threshold": self.threshold,
-            "kwargs": self.kwargs
+            "kwargs": self.kwargs,
         }
```
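A minimal sketch of the `ComparisonScorer` signature and `to_dict()` output shown above; the criteria and description strings are illustrative.

```python
from judgeval.scorers.judgeval_scorers.api_scorers.comparison import ComparisonScorer

scorer = ComparisonScorer(
    threshold=0.5,
    criteria="tone",
    description="Which response has the more professional tone?",
)
print(scorer.__name__)   # "Comparison-tone"
print(scorer.to_dict())  # includes "kwargs": {"criteria": ..., "description": ...}
```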
judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py
CHANGED
```diff
@@ -10,17 +10,18 @@ from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from judgeval.data import ExampleParams
 
+
 class ContextualPrecisionScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.CONTEXTUAL_PRECISION,
             required_params=[
                 ExampleParams.INPUT,
                 ExampleParams.ACTUAL_OUTPUT,
                 ExampleParams.RETRIEVAL_CONTEXT,
                 ExampleParams.EXPECTED_OUTPUT,
-            ]
+            ],
         )
 
     @property
```
judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py
CHANGED
```diff
@@ -14,15 +14,16 @@ from judgeval.data import ExampleParams
 class ContextualRecallScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.CONTEXTUAL_RECALL,
             required_params=[
                 ExampleParams.INPUT,
                 ExampleParams.ACTUAL_OUTPUT,
                 ExampleParams.EXPECTED_OUTPUT,
                 ExampleParams.RETRIEVAL_CONTEXT,
-            ]
+            ],
         )
+
     @property
     def __name__(self):
         return "Contextual Recall"
```
judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py
CHANGED
```diff
@@ -10,20 +10,23 @@ from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from judgeval.data import ExampleParams
 
+
 class ContextualRelevancyScorer(APIJudgmentScorer):
     """
     Scorer that checks if the output of a model is relevant to the retrieval context
     """
+
     def __init__(self, threshold: float):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.CONTEXTUAL_RELEVANCY,
             required_params=[
                 ExampleParams.INPUT,
                 ExampleParams.ACTUAL_OUTPUT,
                 ExampleParams.RETRIEVAL_CONTEXT,
-            ]
+            ],
         )
+
     @property
     def __name__(self):
         return "Contextual Relevancy"
```
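The contextual (RAG) scorers above all require `RETRIEVAL_CONTEXT`, so an `Example` must carry its retrieved chunks. A minimal sketch, assuming `Example` accepts `retrieval_context` as a list of strings (field names inferred from the `ExampleParams` members in this diff):

```python
from judgeval.data import Example  # assumed re-export of the Example model
from judgeval.scorers.judgeval_scorers.api_scorers.contextual_relevancy import (
    ContextualRelevancyScorer,
)

example = Example(
    input="When was the Eiffel Tower built?",
    actual_output="It was completed in 1889.",
    retrieval_context=["The Eiffel Tower was completed in 1889 for the World's Fair."],
)
scorer = ContextualRelevancyScorer(threshold=0.7)
```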
judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py
CHANGED
```diff
@@ -9,10 +9,11 @@ TODO add link to docs page for this scorer
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 
+
 class DerailmentScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.DERAILMENT,
         )
 
```
judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py
CHANGED
```diff
@@ -8,22 +8,31 @@ TODO add link to docs page for this scorer
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-from typing import Optional, Dict
+from typing import Optional, Dict
 from judgeval.data import ExampleParams
 
+
 class ExecutionOrderScorer(APIJudgmentScorer):
     kwargs: Optional[Dict] = None
 
-    def __init__(
+    def __init__(
+        self,
+        threshold: float,
+        should_exact_match: bool = False,
+        should_consider_ordering: bool = False,
+    ):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.EXECUTION_ORDER,
             required_params=[
                 ExampleParams.ACTUAL_OUTPUT,
                 ExampleParams.EXPECTED_OUTPUT,
-            ]
+            ],
         )
-        self.kwargs = {
+        self.kwargs = {
+            "should_exact_match": should_exact_match,
+            "should_consider_ordering": should_consider_ordering,
+        }
 
     @property
     def __name__(self):
@@ -32,12 +41,12 @@ class ExecutionOrderScorer(APIJudgmentScorer):
     def to_dict(self) -> dict:
         """
         Converts the scorer configuration to a dictionary format.
-
+
         Returns:
             dict: A dictionary containing the scorer's configuration
         """
         return {
             "score_type": self.score_type,
             "threshold": self.threshold,
-            "kwargs": self.kwargs
-        }
+            "kwargs": self.kwargs,
+        }
```
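A minimal sketch of the expanded `ExecutionOrderScorer` signature above and the kwargs it serializes; the threshold value is illustrative.

```python
from judgeval.scorers.judgeval_scorers.api_scorers.execution_order import (
    ExecutionOrderScorer,
)

scorer = ExecutionOrderScorer(
    threshold=1.0,
    should_exact_match=True,
    should_consider_ordering=True,
)
print(scorer.to_dict())
# {"score_type": ..., "threshold": 1.0,
#  "kwargs": {"should_exact_match": True, "should_consider_ordering": True}}
```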
judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py
CHANGED
```diff
@@ -10,16 +10,17 @@ from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from judgeval.data import ExampleParams
 
+
 class FaithfulnessScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
         super().__init__(
-            threshold=threshold,
+            threshold=threshold,
             score_type=APIScorer.FAITHFULNESS,
             required_params=[
                 ExampleParams.INPUT,
                 ExampleParams.ACTUAL_OUTPUT,
                 ExampleParams.RETRIEVAL_CONTEXT,
-            ]
+            ],
         )
 
     @property
```
|