judgeval 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +139 -12
- judgeval/api/__init__.py +501 -0
- judgeval/api/api_types.py +344 -0
- judgeval/cli.py +2 -4
- judgeval/constants.py +10 -26
- judgeval/data/evaluation_run.py +49 -26
- judgeval/data/example.py +2 -2
- judgeval/data/judgment_types.py +266 -82
- judgeval/data/result.py +4 -5
- judgeval/data/scorer_data.py +4 -2
- judgeval/data/tool.py +2 -2
- judgeval/data/trace.py +7 -50
- judgeval/data/trace_run.py +7 -4
- judgeval/{dataset.py → dataset/__init__.py} +43 -28
- judgeval/env.py +67 -0
- judgeval/{run_evaluation.py → evaluation/__init__.py} +29 -95
- judgeval/exceptions.py +27 -0
- judgeval/integrations/langgraph/__init__.py +788 -0
- judgeval/judges/__init__.py +2 -2
- judgeval/judges/litellm_judge.py +75 -15
- judgeval/judges/together_judge.py +86 -18
- judgeval/judges/utils.py +7 -21
- judgeval/{common/logger.py → logger.py} +8 -6
- judgeval/scorers/__init__.py +0 -4
- judgeval/scorers/agent_scorer.py +3 -7
- judgeval/scorers/api_scorer.py +8 -13
- judgeval/scorers/base_scorer.py +52 -32
- judgeval/scorers/example_scorer.py +1 -3
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -14
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +45 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +3 -3
- judgeval/scorers/score.py +21 -31
- judgeval/scorers/trace_api_scorer.py +5 -0
- judgeval/scorers/utils.py +1 -103
- judgeval/tracer/__init__.py +1075 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +37 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +43 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +67 -0
- judgeval/tracer/llm/__init__.py +1233 -0
- judgeval/{common/tracer → tracer/llm}/providers.py +5 -10
- judgeval/{local_eval_queue.py → tracer/local_eval_queue.py} +15 -10
- judgeval/tracer/managers.py +188 -0
- judgeval/tracer/processors/__init__.py +181 -0
- judgeval/tracer/utils.py +20 -0
- judgeval/trainer/__init__.py +5 -0
- judgeval/{common/trainer → trainer}/config.py +12 -9
- judgeval/{common/trainer → trainer}/console.py +2 -9
- judgeval/{common/trainer → trainer}/trainable_model.py +12 -7
- judgeval/{common/trainer → trainer}/trainer.py +119 -17
- judgeval/utils/async_utils.py +2 -3
- judgeval/utils/decorators.py +24 -0
- judgeval/utils/file_utils.py +37 -4
- judgeval/utils/guards.py +32 -0
- judgeval/utils/meta.py +14 -0
- judgeval/{common/api/json_encoder.py → utils/serialize.py} +7 -1
- judgeval/utils/testing.py +88 -0
- judgeval/utils/url.py +10 -0
- judgeval/{version_check.py → utils/version_check.py} +3 -3
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/METADATA +12 -14
- judgeval-0.9.0.dist-info/RECORD +80 -0
- judgeval/clients.py +0 -35
- judgeval/common/__init__.py +0 -13
- judgeval/common/api/__init__.py +0 -3
- judgeval/common/api/api.py +0 -375
- judgeval/common/api/constants.py +0 -186
- judgeval/common/exceptions.py +0 -27
- judgeval/common/storage/__init__.py +0 -6
- judgeval/common/storage/s3_storage.py +0 -97
- judgeval/common/tracer/__init__.py +0 -31
- judgeval/common/tracer/constants.py +0 -22
- judgeval/common/tracer/core.py +0 -2427
- judgeval/common/tracer/otel_exporter.py +0 -108
- judgeval/common/tracer/otel_span_processor.py +0 -188
- judgeval/common/tracer/span_processor.py +0 -37
- judgeval/common/tracer/span_transformer.py +0 -207
- judgeval/common/tracer/trace_manager.py +0 -101
- judgeval/common/trainer/__init__.py +0 -5
- judgeval/common/utils.py +0 -948
- judgeval/integrations/langgraph.py +0 -844
- judgeval/judges/mixture_of_judges.py +0 -287
- judgeval/judgment_client.py +0 -267
- judgeval/rules.py +0 -521
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
- judgeval/utils/alerts.py +0 -93
- judgeval/utils/requests.py +0 -50
- judgeval-0.8.0.dist-info/RECORD +0 -82
- {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/WHEEL +0 -0
- {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/entry_points.txt +0 -0
- {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/judges/__init__.py
CHANGED
@@ -1,6 +1,6 @@
 from judgeval.judges.base_judge import JudgevalJudge
 from judgeval.judges.litellm_judge import LiteLLMJudge
 from judgeval.judges.together_judge import TogetherJudge
-from judgeval.judges.mixture_of_judges import MixtureOfJudges

-
+
+__all__ = ["JudgevalJudge", "LiteLLMJudge", "TogetherJudge"]
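For downstream code, the visible effect is that MixtureOfJudges is no longer exported from the judges package. A minimal sketch of the imports that remain valid in 0.9.0, based only on the `__all__` shown above:

# MixtureOfJudges has been removed; these are the judge classes still exported in 0.9.0.
from judgeval.judges import JudgevalJudge, LiteLLMJudge, TogetherJudge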
judgeval/judges/litellm_judge.py
CHANGED
@@ -1,21 +1,77 @@
 import pydantic
-from typing import List, Union, Mapping
+from typing import Dict, List, Union, Mapping, Any

+from judgeval.constants import ACCEPTABLE_MODELS
 from judgeval.judges import JudgevalJudge
-from judgeval.
-
-
-
-
-
+from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
+
+try:
+    import litellm
+except ImportError:
+    raise ImportError(
+        "Litellm is not installed and required for the litellm judge. Please install it with `pip install litellm`."
+    )
+
+
+def fetch_litellm_api_response(
+    model: str,
+    messages: List[Dict[str, str]],
+    response_format: Union[Dict[str, Any], None] = None,
+) -> str:
+    if response_format is not None:
+        response = litellm.completion(
+            model=model,
+            messages=messages,
+            response_format=response_format,
+        )
+    else:
+        response = litellm.completion(
+            model=model,
+            messages=messages,
+        )
+
+    content = response.choices[0].message.content  # type: ignore[attr-defined]
+    if content is None:
+        raise ValueError("Received empty response from litellm")
+    return content
+
+
+async def afetch_litellm_api_response(
+    model: str,
+    messages: List[Dict[str, str]],
+    response_format: Union[Dict[str, Any], None] = None,
+) -> str:
+    if not messages:
+        raise ValueError("Messages cannot be empty")
+
+    if model not in ACCEPTABLE_MODELS:
+        raise ValueError(
+            f"Model {model} is not in the list of supported models: {ACCEPTABLE_MODELS}."
+        )
+
+    if response_format is not None:
+        response = await litellm.acompletion(
+            model=model, messages=messages, response_format=response_format
+        )
+    else:
+        response = await litellm.acompletion(
+            model=model,
+            messages=messages,
+        )
+
+    content = response.choices[0].message.content  # type: ignore[attr-defined]
+    if content is None:
+        raise ValueError("Received empty response from litellm")
+    return content
+

 BASE_CONVERSATION = [
     {"role": "system", "content": "You are a helpful assistant."},
-]
+]


 class LiteLLMJudge(JudgevalJudge):
-    def __init__(self, model: str =
+    def __init__(self, model: str = JUDGMENT_DEFAULT_GPT_MODEL, **kwargs):
         self.model = model
         self.kwargs = kwargs
         super().__init__(model_name=model)
@@ -25,17 +81,19 @@ class LiteLLMJudge(JudgevalJudge):
         input: Union[str, List[Mapping[str, str]]],
         schema: Union[pydantic.BaseModel, None] = None,
     ) -> str:
+        response_format = schema.model_json_schema() if schema else None
+
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
             return fetch_litellm_api_response(
-                model=self.model, messages=convo, response_format=
+                model=self.model, messages=convo, response_format=response_format
             )
         elif isinstance(input, list):
+            messages = [dict(msg) for msg in input]
             return fetch_litellm_api_response(
-                model=self.model, messages=
+                model=self.model, messages=messages, response_format=response_format
             )
         else:
-            judgeval_logger.error(f"Invalid input type received: {type(input)}")
             raise TypeError(
                 f"Input must be a string or a list of dictionaries. Input type of: {type(input)}"
             )
@@ -45,19 +103,21 @@ class LiteLLMJudge(JudgevalJudge):
         input: Union[str, List[Mapping[str, str]]],
         schema: Union[pydantic.BaseModel, None] = None,
     ) -> str:
+        response_format = schema.model_json_schema() if schema else None
+
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
             response = await afetch_litellm_api_response(
-                model=self.model, messages=convo, response_format=
+                model=self.model, messages=convo, response_format=response_format
            )
            return response
        elif isinstance(input, list):
+            messages = [dict(msg) for msg in input]
            response = await afetch_litellm_api_response(
-                model=self.model, messages=
+                model=self.model, messages=messages, response_format=response_format
            )
            return response
        else:
-            judgeval_logger.error(f"Invalid input type received: {type(input)}")
            raise TypeError(
                f"Input must be a string or a list of dictionaries. Input type of: {type(input)}"
            )
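The rewritten module inlines its own litellm fetch helpers and converts an optional pydantic schema into a JSON-schema response_format before calling them. A minimal usage sketch under assumptions not stated in the diff (the Verdict schema and the model name are illustrative, and litellm plus the relevant provider credentials are assumed to be configured):

import pydantic
from judgeval.judges import LiteLLMJudge

class Verdict(pydantic.BaseModel):
    score: float
    reason: str

# Model name is illustrative; omitting it falls back to JUDGMENT_DEFAULT_GPT_MODEL.
judge = LiteLLMJudge(model="gpt-4o-mini")
raw = judge.generate(
    "Rate how faithful the answer is to the context on a 0-1 scale.",
    schema=Verdict,  # converted internally via Verdict.model_json_schema()
)
print(raw)  # JSON text shaped like the Verdict schema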
judgeval/judges/together_judge.py
CHANGED
@@ -3,15 +3,77 @@ Implementation of using TogetherAI inference for judges.
 """

 from pydantic import BaseModel
-from typing import List, Union
-
+from typing import Dict, List, Union, Any, cast
 from judgeval.judges import JudgevalJudge
-from judgeval.
-
-
+from judgeval.logger import judgeval_logger
+from judgeval.env import (
+    JUDGMENT_DEFAULT_TOGETHER_MODEL,
+    TOGETHERAI_API_KEY,
+    TOGETHER_API_KEY,
 )
-
-
+
+together_api_key = TOGETHERAI_API_KEY or TOGETHER_API_KEY
+if together_api_key:
+    try:
+        from together import Together, AsyncTogether  # type: ignore[import-untyped]
+
+        together_client = Together(api_key=together_api_key)
+        async_together_client = AsyncTogether(api_key=together_api_key)
+    except Exception:
+        pass
+
+
+def fetch_together_api_response(
+    model: str,
+    messages: List[Dict[str, str]],
+    response_format: Union[Dict[str, Any], None] = None,
+) -> str:
+    if not messages:
+        raise ValueError("Messages cannot be empty")
+
+    if response_format is not None:
+        response = together_client.chat.completions.create(
+            model=model,
+            messages=messages,
+            response_format=response_format,
+        )
+    else:
+        response = together_client.chat.completions.create(
+            model=model,
+            messages=messages,
+        )
+
+    content = response.choices[0].message.content  # type: ignore[attr-defined]
+    if content is None:
+        raise ValueError("Received empty response from TogetherAI")
+    return cast(str, content)
+
+
+async def afetch_together_api_response(
+    model: str,
+    messages: List[Dict[str, str]],
+    response_format: Union[Dict[str, Any], None] = None,
+) -> str:
+    if not messages:
+        raise ValueError("Messages cannot be empty")
+
+    if response_format is not None:
+        response = await async_together_client.chat.completions.create(
+            model=model,
+            messages=messages,
+            response_format=response_format,
+        )
+    else:
+        response = await async_together_client.chat.completions.create(
+            model=model,
+            messages=messages,
+        )
+
+    content = response.choices[0].message.content  # type: ignore[attr-defined]
+    if content is None:
+        raise ValueError("Received empty response from TogetherAI")
+    return cast(str, content)
+

 BASE_CONVERSATION = [
     {"role": "system", "content": "You are a helpful assistant."},
@@ -19,46 +81,52 @@ BASE_CONVERSATION = [


 class TogetherJudge(JudgevalJudge):
-    def __init__(self, model: str =
+    def __init__(self, model: str = JUDGMENT_DEFAULT_TOGETHER_MODEL, **kwargs):
         self.model = model
         self.kwargs = kwargs
         super().__init__(model_name=model)

-    # TODO: Fix cost for generate and a_generate
     def generate(
-        self,
+        self,
+        input: Union[str, List[Dict[str, str]]],
+        schema: Union[BaseModel, None] = None,
     ) -> str:
+        response_format = schema.model_json_schema() if schema else None
+
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
             return fetch_together_api_response(
-                self.model, convo, response_format=
+                self.model, convo, response_format=response_format
             )
         elif isinstance(input, list):
-
+            messages = [dict(msg) for msg in input]
             return fetch_together_api_response(
-                self.model,
+                self.model, messages, response_format=response_format
             )
         else:
             judgeval_logger.error(f"Invalid input type received: {type(input)}")
             raise TypeError("Input must be a string or a list of dictionaries.")

     async def a_generate(
-        self,
+        self,
+        input: Union[str, List[Dict[str, str]]],
+        schema: Union[BaseModel, None] = None,
     ) -> str:
+        response_format = schema.model_json_schema() if schema else None
+
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
             res = await afetch_together_api_response(
-                self.model, convo, response_format=
+                self.model, convo, response_format=response_format
             )
             return res
         elif isinstance(input, list):
-
+            messages = [dict(msg) for msg in input]
             res = await afetch_together_api_response(
-                self.model,
+                self.model, messages, response_format=response_format
             )
             return res
         else:
-            judgeval_logger.error(f"Invalid input type received: {type(input)}")
             raise TypeError("Input must be a string or a list of dictionaries.")

     def load_model(self) -> str:
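The Together clients are now created at import time, and only when TOGETHERAI_API_KEY or TOGETHER_API_KEY is present, so the key must be exported before judgeval.judges is imported. A minimal usage sketch under assumptions not in the diff (the `together` package is installed, a key is set, and the default model is acceptable):

import asyncio
from judgeval.judges import TogetherJudge

# Uses JUDGMENT_DEFAULT_TOGETHER_MODEL unless a model name is passed explicitly.
judge = TogetherJudge()

async def main() -> None:
    answer = await judge.a_generate(
        [
            {"role": "system", "content": "You are a strict grader."},
            {"role": "user", "content": "Answer yes or no: is 'Paris is in France' correct?"},
        ]
    )
    print(answer)

asyncio.run(main())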
judgeval/judges/utils.py
CHANGED
@@ -3,22 +3,21 @@ This module contains utility functions for judge models.
 """

 import litellm
-from typing import Optional, Union, Tuple
+from typing import Optional, Union, Tuple

-from judgeval.
-from judgeval.judges import JudgevalJudge, LiteLLMJudge, TogetherJudge
-from judgeval.
+from judgeval.exceptions import InvalidJudgeModelError
+from judgeval.judges import JudgevalJudge, LiteLLMJudge, TogetherJudge
+from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
 from judgeval.constants import (
     TOGETHER_SUPPORTED_MODELS,
     JUDGMENT_SUPPORTED_MODELS,
-    ACCEPTABLE_MODELS,
 )

 LITELLM_SUPPORTED_MODELS = set(litellm.model_list)


 def create_judge(
-    model: Optional[Union[str,
+    model: Optional[Union[str, JudgevalJudge]] = None,
 ) -> Tuple[JudgevalJudge, bool]:
     """
     Creates a judge model from string(s) or a judgeval judge object.
@@ -31,28 +30,15 @@ def create_judge(
     If no model is provided, uses GPT4o as the default judge.
     """
     if model is None:  # default option
-        return LiteLLMJudge(model=
+        return LiteLLMJudge(model=JUDGMENT_DEFAULT_GPT_MODEL), True
     if not isinstance(model, (str, list, JudgevalJudge)):
         raise InvalidJudgeModelError(
             f"Model must be a string, list of strings, or a judgeval judge object. Got: {type(model)} instead."
         )
     # If model is already a valid judge type, return it and mark native
-    if isinstance(model, (JudgevalJudge, LiteLLMJudge, TogetherJudge
+    if isinstance(model, (JudgevalJudge, LiteLLMJudge, TogetherJudge)):
         return model, True

-    # Either string or List[str]
-    if isinstance(model, list):
-        for m in model:
-            if m in JUDGMENT_SUPPORTED_MODELS:
-                raise NotImplementedError(
-                    """Judgment models are not yet supported for local scoring.
-                    Please either set the `use_judgment` flag to True or use
-                    non-Judgment models."""
-                )
-            if m not in ACCEPTABLE_MODELS:
-                raise InvalidJudgeModelError(f"Invalid judge model chosen: {m}")
-        return MixtureOfJudges(models=model), True
-    # If model is a string, check that it corresponds to a valid model
     if model in LITELLM_SUPPORTED_MODELS:
         return LiteLLMJudge(model=model), True
     if model in TOGETHER_SUPPORTED_MODELS:
judgeval/{common/logger.py → logger.py}
CHANGED
@@ -1,10 +1,9 @@
-# logger.py
-
 import logging
 import sys
-import os

-
+from judgeval.env import JUDGMENT_NO_COLOR
+from judgeval.utils.decorators import use_once
+
 RESET = "\033[0m"
 RED = "\033[31m"
 YELLOW = "\033[33m"
@@ -38,8 +37,9 @@ class ColorFormatter(logging.Formatter):
         return message


+@use_once
 def _setup_judgeval_logger():
-    use_color = sys.stdout.isatty() and
+    use_color = sys.stdout.isatty() and JUDGMENT_NO_COLOR is None
     handler = logging.StreamHandler(sys.stdout)
     handler.setLevel(logging.DEBUG)
     handler.setFormatter(
@@ -56,5 +56,7 @@ def _setup_judgeval_logger():
     return logger


-# Global logger you can import elsewhere
 judgeval_logger = _setup_judgeval_logger()
+
+
+__all__ = ("judgeval_logger",)
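The logger moved from judgeval.common.logger to judgeval.logger, its setup is guarded by the use_once decorator, and color output is controlled by the JUDGMENT_NO_COLOR setting read from judgeval.env at import time. A minimal sketch of the updated import path:

# Old path: from judgeval.common.logger import judgeval_logger  (removed in 0.9.0)
from judgeval.logger import judgeval_logger

judgeval_logger.info("evaluation started")  # colorized only when stdout is a TTY
# Set JUDGMENT_NO_COLOR before importing judgeval to disable ANSI colors.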
judgeval/scorers/__init__.py
CHANGED
@@ -1,8 +1,6 @@
 from judgeval.scorers.api_scorer import APIScorerConfig
 from judgeval.scorers.base_scorer import BaseScorer
 from judgeval.scorers.judgeval_scorers.api_scorers import (
-    ExecutionOrderScorer,
-    HallucinationScorer,
     FaithfulnessScorer,
     AnswerRelevancyScorer,
     AnswerCorrectnessScorer,
@@ -17,8 +15,6 @@ __all__ = [
     "APIScorerConfig",
     "BaseScorer",
     "PromptScorer",
-    "ExecutionOrderScorer",
-    "HallucinationScorer",
     "FaithfulnessScorer",
     "AnswerRelevancyScorer",
     "AnswerCorrectnessScorer",
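ExecutionOrderScorer and HallucinationScorer are dropped from the public scorers namespace, so imports of them must be removed from calling code. The names shown in this diff stay importable:

# Still valid in 0.9.0 (ExecutionOrderScorer and HallucinationScorer are gone).
from judgeval.scorers import (
    APIScorerConfig,
    BaseScorer,
    PromptScorer,
    FaithfulnessScorer,
    AnswerRelevancyScorer,
    AnswerCorrectnessScorer,
)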
judgeval/scorers/agent_scorer.py
CHANGED
@@ -1,21 +1,17 @@
 from judgeval.scorers.base_scorer import BaseScorer
-from judgeval.data import Trace
+from judgeval.data.judgment_types import Trace as JudgmentTrace
 from typing import List, Optional
 from abc import abstractmethod

-from judgeval.common.logger import warning, error

-
-class AgentScorer(BaseScorer):
+class TraceScorer(BaseScorer):
     @abstractmethod
     async def a_score_trace(
-        self, trace:
+        self, trace: JudgmentTrace, tools: Optional[List] = None, *args, **kwargs
     ) -> float:
         """
         Asynchronously measures the score on a trace
         """
-        warning("Attempting to call unimplemented a_score_trace method")
-        error("a_score_trace method not implemented")
         raise NotImplementedError(
             "You must implement the `a_score_trace` method in your custom scorer"
         )
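AgentScorer is renamed to TraceScorer, and a_score_trace is now typed against the generated Trace judgment type and accepts a tools list. A hypothetical subclass, purely for illustration (the class, its scoring rule, and the trace_spans attribute it probes are assumptions, not part of the diff):

from typing import List, Optional

from judgeval.data.judgment_types import Trace as JudgmentTrace
from judgeval.scorers.agent_scorer import TraceScorer


class SpanPresenceScorer(TraceScorer):
    score_type: str = "SpanPresence"  # illustrative scorer type

    async def a_score_trace(
        self, trace: JudgmentTrace, tools: Optional[List] = None, *args, **kwargs
    ) -> float:
        # Illustrative rule: pass when the trace recorded any spans at all.
        spans = getattr(trace, "trace_spans", None) or []
        return 1.0 if spans else 0.0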
judgeval/scorers/api_scorer.py
CHANGED
@@ -4,11 +4,12 @@ Judgment Scorer class.
 Scores `Example`s using ready-made Judgment evaluators.
 """

+from __future__ import annotations
+
 from pydantic import BaseModel, field_validator
 from typing import List
-from judgeval.
-from judgeval.
-from judgeval.common.logger import judgeval_logger
+from judgeval.constants import UNBOUNDED_SCORERS, APIScorerType
+from judgeval.data.example import ExampleParams


 class APIScorerConfig(BaseModel):
@@ -28,9 +29,10 @@ class APIScorerConfig(BaseModel):
     name: str = ""
     threshold: float = 0.5
     strict_mode: bool = False
-
-
-    ] = []
+
+    # This is used to check if the example has the required parameters before running the scorer
+    required_params: List[ExampleParams] = []
+
     kwargs: dict = {}

     @field_validator("threshold")
@@ -42,17 +44,11 @@ class APIScorerConfig(BaseModel):
         score_type = info.data.get("score_type")
         if score_type in UNBOUNDED_SCORERS:
             if v < 0:
-                judgeval_logger.error(
-                    f"Threshold for {score_type} must be greater than 0, got: {v}"
-                )
                 raise ValueError(
                     f"Threshold for {score_type} must be greater than 0, got: {v}"
                 )
         else:
             if not 0 <= v <= 1:
-                judgeval_logger.error(
-                    f"Threshold for {score_type} must be between 0 and 1, got: {v}"
-                )
                 raise ValueError(
                     f"Threshold for {score_type} must be between 0 and 1, got: {v}"
                 )
@@ -61,7 +57,6 @@ class APIScorerConfig(BaseModel):
     @field_validator("name", mode="after")
     @classmethod
     def set_name_to_score_type_if_none(cls, v, info):
-        """Set name to score_type if not provided"""
         if v is None:
             return info.data.get("score_type")
         return v
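The threshold validator now raises directly instead of logging first, but the bounds themselves are unchanged. A standalone paraphrase of the rule for illustration (this helper is not part of judgeval):

def validate_threshold(threshold: float, unbounded: bool) -> float:
    """Mirror of APIScorerConfig's threshold check, written out for clarity."""
    if unbounded:
        # Scorers in UNBOUNDED_SCORERS only need a non-negative threshold.
        if threshold < 0:
            raise ValueError(f"Threshold must be greater than 0, got: {threshold}")
    elif not 0 <= threshold <= 1:
        # Every other scorer keeps its threshold inside [0, 1].
        raise ValueError(f"Threshold must be between 0 and 1, got: {threshold}")
    return threshold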
judgeval/scorers/base_scorer.py
CHANGED
@@ -2,6 +2,7 @@
 Base class for all scorers.
 """

+from __future__ import annotations
 from typing import Dict, Optional

 from pydantic import BaseModel
@@ -19,44 +20,63 @@ class BaseScorer(BaseModel):
     where none of Judgment's scorers are suitable.
     """

-
-
-
-
-
-
-    )
-
-
+    # type of your scorer (Faithfulness, PromptScorer)
+    score_type: str
+
+    # The threshold to pass a test while using this scorer as a scorer
+    threshold: float = 0.5
+
+    # name of your scorer (Faithfulness, PromptScorer-randomslug)
+    name: Optional[str] = None
+
+    # The name of the class of the scorer
+    class_name: Optional[str] = None
+
+    # The float score of the scorer run on the test case
+    score: Optional[float] = None
+
     score_breakdown: Optional[Dict] = None
     reason: Optional[str] = ""
-
-
-
-
-
-
-
-
-
-
-
+
+    # Whether the model is a native model
+    using_native_model: Optional[bool] = None
+
+    # Whether the test case passed or failed
+    success: Optional[bool] = None
+
+    # The name of the model used to evaluate the test case
+    model: Optional[str] = None
+
+    # The model used to evaluate the test case
+    model_client: Optional[Any] = Field(default=None, exclude=True)
+
+    # Whether to run the scorer in strict mode
+    strict_mode: bool = False
+
+    # The error message if the scorer failed
+    error: Optional[str] = None
+
+    # Additional metadata for the scorer
+    additional_metadata: Optional[Dict] = None
+
+    # The user ID of the scorer
+    user: Optional[str] = None
+
+    # Whether the scorer is hosted on the server
+    server_hosted: bool = False

     @model_validator(mode="after")
-
-
-
-
-        return data
+    def enforce_strict_threshold(self):
+        if self.strict_mode:
+            self.threshold = 1.0
+        return self

     @model_validator(mode="after")
-
-
-
-
-
-        m.name = m.class_name
-        return m
+    def default_name(self):
+        self.class_name = self.__class__.__name__
+        if not self.name:
+            self.name = self.class_name
+        return self

     def _add_model(self, model: str):
         """
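The previously collapsed field block is now a set of explicit, commented fields, and the two model validators pin threshold to 1.0 under strict_mode and default name to the class name. A small sketch of that behavior, instantiating the base model directly just to show the validators (field values are illustrative):

from judgeval.scorers.base_scorer import BaseScorer

scorer = BaseScorer(score_type="Faithfulness", strict_mode=True, threshold=0.7)
print(scorer.threshold)   # 1.0 -- strict_mode overrides the supplied threshold
print(scorer.class_name)  # "BaseScorer"
print(scorer.name)        # "BaseScorer" -- defaulted because no name was given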
judgeval/scorers/example_scorer.py
CHANGED
@@ -2,18 +2,16 @@ from judgeval.scorers.base_scorer import BaseScorer
 from judgeval.data import Example
 from typing import List
 from pydantic import Field
-from judgeval.common.logger import judgeval_logger


 class ExampleScorer(BaseScorer):
-    score_type: str = "Custom"
+    score_type: str = "Custom"
     required_params: List[str] = Field(default_factory=list)

     async def a_score_example(self, example: Example, *args, **kwargs) -> float:
         """
         Asynchronously measures the score on a single example
         """
-        judgeval_logger.error("a_score_example method not implemented")
         raise NotImplementedError(
             "You must implement the `a_score_example` method in your custom scorer"
         )
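Custom scorers still subclass ExampleScorer and implement a_score_example; the base class simply no longer logs before raising NotImplementedError. A hypothetical subclass for illustration (the class and the actual_output / expected_output comparison are assumptions about the Example fields, not part of this diff):

from judgeval.data import Example
from judgeval.scorers.example_scorer import ExampleScorer


class ExactMatchScorer(ExampleScorer):
    name: str = "Exact Match"  # illustrative

    async def a_score_example(self, example: Example, *args, **kwargs) -> float:
        # Illustrative rule: full credit only when the output matches exactly.
        return 1.0 if example.actual_output == example.expected_output else 0.0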
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py
CHANGED
@@ -1,9 +1,3 @@
-from judgeval.scorers.judgeval_scorers.api_scorers.execution_order import (
-    ExecutionOrderScorer,
-)
-from judgeval.scorers.judgeval_scorers.api_scorers.hallucination import (
-    HallucinationScorer,
-)
 from judgeval.scorers.judgeval_scorers.api_scorers.faithfulness import (
     FaithfulnessScorer,
 )
@@ -28,18 +22,10 @@ from judgeval.scorers.judgeval_scorers.api_scorers.tool_dependency import (
 )

 __all__ = [
-    "ExecutionOrderScorer",
-    "JSONCorrectnessScorer",
-    "SummarizationScorer",
-    "HallucinationScorer",
     "FaithfulnessScorer",
-    "ContextualRelevancyScorer",
-    "ContextualPrecisionScorer",
-    "ContextualRecallScorer",
     "AnswerRelevancyScorer",
     "AnswerCorrectnessScorer",
     "InstructionAdherenceScorer",
-    "GroundednessScorer",
     "DerailmentScorer",
     "ToolOrderScorer",
     "PromptScorer",