judgeval 0.0.52__py3-none-any.whl → 0.0.54__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/logger.py +46 -199
- judgeval/common/s3_storage.py +2 -6
- judgeval/common/tracer.py +182 -262
- judgeval/common/utils.py +16 -36
- judgeval/constants.py +14 -20
- judgeval/data/__init__.py +0 -2
- judgeval/data/datasets/dataset.py +6 -10
- judgeval/data/datasets/eval_dataset_client.py +25 -27
- judgeval/data/example.py +5 -138
- judgeval/data/judgment_types.py +214 -0
- judgeval/data/result.py +7 -25
- judgeval/data/scorer_data.py +28 -40
- judgeval/data/scripts/fix_default_factory.py +23 -0
- judgeval/data/scripts/openapi_transform.py +123 -0
- judgeval/data/tool.py +3 -54
- judgeval/data/trace.py +31 -50
- judgeval/data/trace_run.py +3 -3
- judgeval/evaluation_run.py +16 -23
- judgeval/integrations/langgraph.py +11 -12
- judgeval/judges/litellm_judge.py +3 -6
- judgeval/judges/mixture_of_judges.py +8 -25
- judgeval/judges/together_judge.py +3 -6
- judgeval/judgment_client.py +22 -24
- judgeval/rules.py +7 -19
- judgeval/run_evaluation.py +79 -242
- judgeval/scorers/__init__.py +4 -20
- judgeval/scorers/agent_scorer.py +21 -0
- judgeval/scorers/api_scorer.py +28 -38
- judgeval/scorers/base_scorer.py +98 -0
- judgeval/scorers/example_scorer.py +19 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +10 -17
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +9 -24
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +16 -68
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +4 -12
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +10 -17
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +18 -14
- judgeval/scorers/score.py +45 -330
- judgeval/scorers/utils.py +6 -88
- judgeval/utils/file_utils.py +4 -6
- judgeval/version_check.py +3 -2
- {judgeval-0.0.52.dist-info → judgeval-0.0.54.dist-info}/METADATA +6 -5
- judgeval-0.0.54.dist-info/RECORD +65 -0
- judgeval/data/custom_example.py +0 -19
- judgeval/scorers/judgeval_scorer.py +0 -177
- judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +0 -45
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -29
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -29
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -32
- judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +0 -28
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -38
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -27
- judgeval/scorers/prompt_scorer.py +0 -296
- judgeval-0.0.52.dist-info/RECORD +0 -69
- {judgeval-0.0.52.dist-info → judgeval-0.0.54.dist-info}/WHEEL +0 -0
- {judgeval-0.0.52.dist-info → judgeval-0.0.54.dist-info}/licenses/LICENSE.md +0 -0
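Taken together, the file list shows the scorer hierarchy being rebuilt: APIJudgmentScorer and the old JudgevalScorer give way to APIScorerConfig and BaseScorer (plus the new AgentScorer and ExampleScorer), while the comparison, contextual, groundedness, JSON-correctness, summarization, and prompt scorers are removed. A hedged, import-level sketch of the migration, using only module paths visible in this diff (call-site changes elsewhere are not shown):

# Hedged sketch of the import-level migration (0.0.52 -> 0.0.54).
# Old names, removed in 0.0.54:
#   from judgeval.scorers.api_scorer import APIJudgmentScorer
#   from judgeval.constants import APIScorer
#   from judgeval.scorers.judgeval_scorer import JudgevalScorer  # class name assumed from module name
# New names in 0.0.54:
from judgeval.scorers.api_scorer import APIScorerConfig    # config sent to the Judgment server
from judgeval.constants import APIScorerType               # renamed enum (was APIScorer)
from judgeval.scorers.base_scorer import BaseScorer        # new base for custom scorers
from judgeval.scorers.example_scorer import ExampleScorer  # per-example custom scorers
from judgeval.scorers.agent_scorer import AgentScorer      # per-trace custom scorers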
judgeval/scorers/agent_scorer.py ADDED
@@ -0,0 +1,21 @@
+from judgeval.scorers.base_scorer import BaseScorer
+from judgeval.data import Trace
+from typing import List, Optional
+from abc import abstractmethod
+
+from judgeval.common.logger import warning, error
+
+
+class AgentScorer(BaseScorer):
+    @abstractmethod
+    async def a_score_trace(
+        self, trace: Trace, tools: Optional[List] = None, *args, **kwargs
+    ) -> float:
+        """
+        Asynchronously measures the score on a trace
+        """
+        warning("Attempting to call unimplemented a_score_trace method")
+        error("a_score_trace method not implemented")
+        raise NotImplementedError(
+            "You must implement the `a_score_trace` method in your custom scorer"
+        )
judgeval/scorers/api_scorer.py CHANGED
@@ -6,27 +6,35 @@ Scores `Example`s using ready-made Judgment evaluators.
 
 from pydantic import BaseModel, field_validator
 from typing import List
-from judgeval.common.logger import debug, info, error
 from judgeval.data import ExampleParams
-from judgeval.constants import APIScorer, UNBOUNDED_SCORERS
+from judgeval.constants import APIScorerType, UNBOUNDED_SCORERS
+from judgeval.common.logger import judgeval_logger
 
 
-class APIJudgmentScorer(BaseModel):
+class APIScorerConfig(BaseModel):
     """
-
+    Scorer config that is used to send to our Judgment server.
 
     Args:
         score_type (APIScorer): The Judgment metric to use for scoring `Example`s
+        name (str): The name of the scorer, usually this is the same as the score_type
         threshold (float): A value between 0 and 1 that determines the scoring threshold
+        strict_mode (bool): Whether to use strict mode for the scorer
+        required_params (List[ExampleParams]): List of the required parameters on examples for the scorer
+        kwargs (dict): Additional keyword arguments to pass to the scorer
     """
 
-    score_type:
-
+    score_type: APIScorerType
+    name: str = ""
+    threshold: float = 0.5
+    strict_mode: bool = False
     required_params: List[
         ExampleParams
-    ] = []  #
+    ] = []  # This is used to check if the example has the required parameters before running the scorer
+    kwargs: dict = {}
 
     @field_validator("threshold")
+    @classmethod
     def validate_threshold(cls, v, info):
        """
        Validates that the threshold is between 0 and 1 inclusive.
@@ -34,47 +42,29 @@ class APIJudgmentScorer(BaseModel):
         score_type = info.data.get("score_type")
         if score_type in UNBOUNDED_SCORERS:
             if v < 0:
-                error(f"Threshold for {score_type} must be greater than 0, got: {v}")
+                judgeval_logger.error(
+                    f"Threshold for {score_type} must be greater than 0, got: {v}"
+                )
                 raise ValueError(
                     f"Threshold for {score_type} must be greater than 0, got: {v}"
                 )
         else:
             if not 0 <= v <= 1:
-                error(f"Threshold for {score_type} must be between 0 and 1, got: {v}")
+                judgeval_logger.error(
+                    f"Threshold for {score_type} must be between 0 and 1, got: {v}"
+                )
                 raise ValueError(
                     f"Threshold for {score_type} must be between 0 and 1, got: {v}"
                 )
         return v
 
-    @field_validator("
-
-
-
-
-
-
-        if isinstance(v, APIScorer):
-            info(f"Using existing APIScorer: {v}")
-            return v
-        elif isinstance(v, str):
-            debug(f"Converting string value to APIScorer enum: {v}")
-            return APIScorer[v.upper()]
-        error(f"Invalid score_type value: {v}")
-        raise ValueError(f"Invalid value for score_type: {v}")
+    @field_validator("name", mode="after")
+    @classmethod
+    def set_name_to_score_type_if_none(cls, v, info):
+        """Set name to score_type if not provided"""
+        if v is None:
+            return info.data.get("score_type")
+        return v
 
     def __str__(self):
         return f"JudgmentScorer(score_type={self.score_type.value}, threshold={self.threshold})"
-
-    def to_dict(self) -> dict:
-        """
-        Converts the scorer configuration to a dictionary format.
-
-        Returns:
-            dict: A dictionary containing the scorer's configuration
-        """
-        return {
-            "score_type": str(
-                self.score_type.value
-            ),  # Convert enum to string for serialization
-            "threshold": self.threshold,
-        }
judgeval/scorers/base_scorer.py ADDED
@@ -0,0 +1,98 @@
+"""
+Base class for all scorers.
+"""
+
+from typing import Dict, Optional
+
+from pydantic import BaseModel
+
+
+from judgeval.judges.utils import create_judge
+from typing import Any
+from pydantic import model_validator, Field
+from judgeval.common.logger import judgeval_logger
+
+
+class BaseScorer(BaseModel):
+    """
+    If you want to create a scorer that does not fall under any of the ready-made Judgment scorers,
+    you can create a custom scorer by extending this class. This is best used for special use cases
+    where none of Judgment's scorers are suitable.
+    """
+
+    score_type: str  # type of your scorer (Faithfulness, PromptScorer)
+    threshold: float = (
+        0.5  # The threshold to pass a test while using this scorer as a scorer
+    )
+    name: Optional[str] = (
+        None  # name of your scorer (Faithfulness, PromptScorer-randomslug)
+    )
+    score: Optional[float] = None  # The float score of the scorer run on the test case
+    score_breakdown: Optional[Dict] = None
+    reason: Optional[str] = ""
+    using_native_model: Optional[bool] = None  # Whether the model is a native model
+    success: Optional[bool] = None  # Whether the test case passed or failed
+    model: Optional[Any] = Field(
+        default=None, exclude=True
+    )  # The model used to evaluate the test case
+    evaluation_model: Optional[str] = None  # The model used to evaluate the test case
+    strict_mode: bool = False  # Whether to run the scorer in strict mode
+    error: Optional[str] = None  # The error message if the scorer failed
+    additional_metadata: Optional[Dict] = None  # Additional metadata for the scorer
+    user: Optional[str] = None  # The user ID of the scorer
+
+    @model_validator(mode="before")
+    @classmethod
+    def enforce_strict_threshold(cls, data: dict):
+        if data.get("strict_mode"):
+            data["threshold"] = 1.0
+        return data
+
+    @model_validator(mode="after")
+    @classmethod
+    def default_name(cls, m: "BaseScorer") -> "BaseScorer":
+        if not m.name:
+            # Try to use the class name if it exists and is not empty
+            class_name = getattr(m, "__class__", None)
+            if class_name and getattr(m.__class__, "__name__", None):
+                m.name = m.__class__.__name__
+            else:
+                m.name = m.score_type
+        return m
+
+    def _add_model(self, model: str):
+        """
+        Adds the evaluation model to the BaseScorer instance
+
+        This method is used at eval time
+        """
+        self.model, self.using_native_model = create_judge(model)
+        self.evaluation_model = self.model.get_model_name()
+
+    def success_check(self) -> bool:
+        """
+        For unit testing, determines whether the test case passes or fails
+        """
+        if self.error:
+            return False
+        if self.score is None:
+            return False
+        return self.score >= self.threshold
+
+    def __str__(self):
+        if self.error:
+            judgeval_logger.warning(f"BaseScorer contains error: {self.error}")
+        attributes = {
+            "score_type": self.score_type,
+            "threshold": self.threshold,
+            "score": self.score,
+            "score_breakdown": self.score_breakdown,
+            "reason": self.reason,
+            "success": self.success,
+            "model": self.model,
+            "evaluation_model": self.evaluation_model,
+            "strict_mode": self.strict_mode,
+            "error": self.error,
+            "additional_metadata": self.additional_metadata,
+        }
+        return f"BaseScorer({attributes})"
judgeval/scorers/example_scorer.py ADDED
@@ -0,0 +1,19 @@
+from judgeval.scorers.base_scorer import BaseScorer
+from judgeval.data import Example
+from typing import List
+from pydantic import Field
+from judgeval.common.logger import judgeval_logger
+
+
+class ExampleScorer(BaseScorer):
+    score_type: str = "Custom"  # default to custom score type
+    required_params: List[str] = Field(default_factory=list)
+
+    async def a_score_example(self, example: Example, *args, **kwargs) -> float:
+        """
+        Asynchronously measures the score on a single example
+        """
+        judgeval_logger.error("a_score_example method not implemented")
+        raise NotImplementedError(
+            "You must implement the `a_score_example` method in your custom scorer"
+        )
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py CHANGED
@@ -1,40 +1,21 @@
 from judgeval.scorers.judgeval_scorers.api_scorers.execution_order import (
     ExecutionOrderScorer,
 )
-from judgeval.scorers.judgeval_scorers.api_scorers.json_correctness import (
-    JSONCorrectnessScorer,
-)
-from judgeval.scorers.judgeval_scorers.api_scorers.summarization import (
-    SummarizationScorer,
-)
 from judgeval.scorers.judgeval_scorers.api_scorers.hallucination import (
     HallucinationScorer,
 )
 from judgeval.scorers.judgeval_scorers.api_scorers.faithfulness import (
     FaithfulnessScorer,
 )
-from judgeval.scorers.judgeval_scorers.api_scorers.contextual_relevancy import (
-    ContextualRelevancyScorer,
-)
-from judgeval.scorers.judgeval_scorers.api_scorers.contextual_precision import (
-    ContextualPrecisionScorer,
-)
-from judgeval.scorers.judgeval_scorers.api_scorers.contextual_recall import (
-    ContextualRecallScorer,
-)
 from judgeval.scorers.judgeval_scorers.api_scorers.answer_relevancy import (
     AnswerRelevancyScorer,
 )
 from judgeval.scorers.judgeval_scorers.api_scorers.answer_correctness import (
     AnswerCorrectnessScorer,
 )
-from judgeval.scorers.judgeval_scorers.api_scorers.comparison import ComparisonScorer
 from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import (
     InstructionAdherenceScorer,
 )
-from judgeval.scorers.judgeval_scorers.api_scorers.groundedness import (
-    GroundednessScorer,
-)
 from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import (
     DerailmentScorer,
 )
@@ -57,7 +38,6 @@ __all__ = [
     "ContextualRecallScorer",
     "AnswerRelevancyScorer",
     "AnswerCorrectnessScorer",
-    "ComparisonScorer",
     "InstructionAdherenceScorer",
     "GroundednessScorer",
     "DerailmentScorer",
judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py CHANGED
@@ -6,23 +6,16 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
+from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
+from typing import List
 
 
-class AnswerCorrectnessScorer(APIJudgmentScorer):
-
-
-
-
-
-
-                ExampleParams.ACTUAL_OUTPUT,
-                ExampleParams.EXPECTED_OUTPUT,
-            ],
-        )
-
-    @property
-    def __name__(self):
-        return "Answer Correctness"
+class AnswerCorrectnessScorer(APIScorerConfig):
+    score_type: APIScorerType = APIScorerType.ANSWER_CORRECTNESS
+    required_params: List[ExampleParams] = [
+        ExampleParams.INPUT,
+        ExampleParams.ACTUAL_OUTPUT,
+        ExampleParams.EXPECTED_OUTPUT,
+    ]
judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py CHANGED
@@ -1,27 +1,12 @@
-
-
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
+from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
+from typing import List
 
 
-class AnswerRelevancyScorer(APIJudgmentScorer):
-
-
-
-
-
-                ExampleParams.INPUT,
-                ExampleParams.ACTUAL_OUTPUT,
-            ],
-        )
-
-    @property
-    def __name__(self):
-        return "Answer Relevancy"
+class AnswerRelevancyScorer(APIScorerConfig):
+    score_type: APIScorerType = APIScorerType.ANSWER_RELEVANCY
+    required_params: List[ExampleParams] = [
+        ExampleParams.INPUT,
+        ExampleParams.ACTUAL_OUTPUT,
+    ]
judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py CHANGED
@@ -1,9 +1,9 @@
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
-from typing import List, Mapping, Optional
+from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.constants import APIScorerType
+from typing import List, Mapping, Optional, Dict, Any
 
 
-class ClassifierScorer(APIJudgmentScorer):
+class ClassifierScorer(APIScorerConfig):
     """
     In the Judgment backend, this scorer is implemented as a PromptScorer that takes
     1. a system role that may involve the Example object
@@ -16,50 +16,15 @@ class ClassifierScorer(APIJudgmentScorer):
     options = {"positive": 1, "negative": 0}
 
     Args:
-        name (str): The name of the scorer
         slug (str): A unique identifier for the scorer
         conversation (List[dict]): The conversation template with placeholders (e.g., {{actual_output}})
         options (Mapping[str, float]): A mapping of classification options to their corresponding scores
-        threshold (float): The threshold for determining success (default: 0.5)
-        include_reason (bool): Whether to include reasoning in the response (default: True)
-        strict_mode (bool): Whether to use strict mode (default: False)
-        verbose_mode (bool): Whether to include verbose logging (default: False)
     """
 
-    name: Optional[str] = None
     slug: Optional[str] = None
     conversation: Optional[List[dict]] = None
    options: Optional[Mapping[str, float]] = None
-
-    strict_mode: bool = False
-    include_reason: bool = True
-    async_mode: bool = True
-    threshold: float = 0.5
-
-    def __init__(
-        self,
-        name: str,
-        slug: str,
-        conversation: List[dict],
-        options: Mapping[str, float],
-        threshold: float = 0.5,
-        include_reason: bool = True,
-        strict_mode: bool = False,
-        verbose_mode: bool = False,
-        async_mode: bool = True,
-    ):
-        super().__init__(
-            threshold=threshold,
-            score_type=APIScorer.CLASSIFIER,
-        )
-        self.name = name
-        self.verbose_mode = verbose_mode
-        self.strict_mode = strict_mode
-        self.include_reason = include_reason
-        self.slug = slug
-        self.conversation = conversation
-        self.options = options
-        self.async_mode = async_mode
+    score_type: APIScorerType = APIScorerType.PROMPT_SCORER
 
     def update_name(self, name: str):
         """
@@ -94,32 +59,15 @@ class ClassifierScorer(APIJudgmentScorer):
     def __str__(self):
         return f"ClassifierScorer(name={self.name}, slug={self.slug}, conversation={self.conversation}, threshold={self.threshold}, options={self.options})"
 
-
-
-
-
-
-
-
-
-
-    #     "options": self.options,
-    #     "threshold": self.threshold,
-    #     "include_reason": self.include_reason,
-    #     "async_mode": self.async_mode,
-    #     "strict_mode": self.strict_mode,
-    #     "verbose_mode": self.verbose_mode,
-    # }
-
-    def to_dict(self) -> dict:
-        return {
-            "name": self.name,
-            "score_type": self.name,
-            "conversation": self.conversation,
-            "options": self.options,
-            "threshold": self.threshold,
-            "include_reason": self.include_reason,
-            "async_mode": self.async_mode,
-            "strict_mode": self.strict_mode,
-            "verbose_mode": self.verbose_mode,
+    def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
+        base = super().model_dump(*args, **kwargs)
+        base_fields = set(APIScorerConfig.model_fields.keys())
+        all_fields = set(self.__class__.model_fields.keys())
+
+        extra_fields = all_fields - base_fields - {"kwargs"}
+
+        base["kwargs"] = {
+            k: getattr(self, k) for k in extra_fields if getattr(self, k) is not None
         }
+
+        return base
judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py CHANGED
@@ -6,17 +6,9 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
+from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.constants import APIScorerType
 
 
-class DerailmentScorer(APIJudgmentScorer):
-
-        super().__init__(
-            threshold=threshold,
-            score_type=APIScorer.DERAILMENT,
-        )
-
-    @property
-    def __name__(self):
-        return "Derailment"
+class DerailmentScorer(APIScorerConfig):
+    score_type: APIScorerType = APIScorerType.DERAILMENT
judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py CHANGED
@@ -6,13 +6,13 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
+from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.constants import APIScorerType
 from typing import Optional, Dict
 from judgeval.data import ExampleParams
 
 
-class ExecutionOrderScorer(APIJudgmentScorer):
+class ExecutionOrderScorer(APIScorerConfig):
     kwargs: Optional[Dict] = None
 
     def __init__(
@@ -23,7 +23,7 @@ class ExecutionOrderScorer(APIJudgmentScorer):
     ):
         super().__init__(
             threshold=threshold,
-            score_type=APIScorer.EXECUTION_ORDER,
+            score_type=APIScorerType.EXECUTION_ORDER,
             required_params=[
                 ExampleParams.ACTUAL_OUTPUT,
                 ExampleParams.EXPECTED_OUTPUT,
judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py CHANGED
@@ -6,23 +6,16 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
+from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
+from typing import List
 
 
-class FaithfulnessScorer(APIJudgmentScorer):
-
-
-
-
-
-
-                ExampleParams.ACTUAL_OUTPUT,
-                ExampleParams.RETRIEVAL_CONTEXT,
-            ],
-        )
-
-    @property
-    def __name__(self):
-        return "Faithfulness"
+class FaithfulnessScorer(APIScorerConfig):
+    score_type: APIScorerType = APIScorerType.FAITHFULNESS
+    required_params: List[ExampleParams] = [
+        ExampleParams.INPUT,
+        ExampleParams.ACTUAL_OUTPUT,
+        ExampleParams.RETRIEVAL_CONTEXT,
+    ]
judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py CHANGED
@@ -6,16 +6,16 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
+from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
 
 
-class HallucinationScorer(APIJudgmentScorer):
+class HallucinationScorer(APIScorerConfig):
     def __init__(self, threshold: float):
         super().__init__(
             threshold=threshold,
-            score_type=APIScorer.HALLUCINATION,
+            score_type=APIScorerType.HALLUCINATION,
             required_params=[
                 ExampleParams.INPUT,
                 ExampleParams.ACTUAL_OUTPUT,
judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py CHANGED
@@ -6,16 +6,16 @@ TODO add link to docs page for this scorer
 """
 
 # Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
+from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.constants import APIScorerType
 from judgeval.data import ExampleParams
 
 
-class InstructionAdherenceScorer(APIJudgmentScorer):
+class InstructionAdherenceScorer(APIScorerConfig):
     def __init__(self, threshold: float):
         super().__init__(
             threshold=threshold,
-            score_type=APIScorer.INSTRUCTION_ADHERENCE,
+            score_type=APIScorerType.INSTRUCTION_ADHERENCE,
             required_params=[
                 ExampleParams.INPUT,
                 ExampleParams.ACTUAL_OUTPUT,
judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py CHANGED
@@ -3,16 +3,16 @@
 """
 
 # Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
+from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.constants import APIScorerType
 from typing import Optional, Dict
 
 
-class ToolDependencyScorer(APIJudgmentScorer):
+class ToolDependencyScorer(APIScorerConfig):
     kwargs: Optional[Dict] = None
 
     def __init__(self, threshold: float = 1.0, enable_param_checking: bool = True):
-        super().__init__(threshold=threshold, score_type=APIScorer.TOOL_DEPENDENCY)
+        super().__init__(threshold=threshold, score_type=APIScorerType.TOOL_DEPENDENCY)
         self.kwargs = {"enable_param_checking": enable_param_checking}
 
     @property
judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py CHANGED
@@ -3,21 +3,25 @@
 """
 
 # Internal imports
-from judgeval.scorers.api_scorer import APIJudgmentScorer
-from judgeval.constants import APIScorer
-from typing import
+from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.constants import APIScorerType
+from typing import Dict, Any
 
 
-class ToolOrderScorer(APIJudgmentScorer):
-
+class ToolOrderScorer(APIScorerConfig):
+    score_type: APIScorerType = APIScorerType.TOOL_ORDER
+    threshold: float = 1.0
+    exact_match: bool = False
 
-    def
-        super().
-
-
-        )
-        self.kwargs = {"exact_match": exact_match}
+    def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
+        base = super().model_dump(*args, **kwargs)
+        base_fields = set(APIScorerConfig.model_fields.keys())
+        all_fields = set(self.__class__.model_fields.keys())
 
-
-
-
+        extra_fields = all_fields - base_fields - {"kwargs"}
+
+        base["kwargs"] = {
+            k: getattr(self, k) for k in extra_fields if getattr(self, k) is not None
+        }
+
+        return base
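ToolOrderScorer follows the same pattern, with exact_match promoted from a constructor argument to a pydantic field; a usage sketch:

from judgeval.scorers.judgeval_scorers.api_scorers.tool_order import ToolOrderScorer

scorer = ToolOrderScorer(exact_match=True)  # threshold defaults to 1.0
payload = scorer.model_dump()
# payload["kwargs"] == {"exact_match": True}, packed by the model_dump override above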