judgeval 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +0 -71
- judgeval/clients.py +14 -3
- judgeval/common/tracer.py +57 -31
- judgeval/constants.py +1 -0
- judgeval/data/__init__.py +2 -1
- judgeval/data/scorer_data.py +2 -2
- judgeval/evaluation_run.py +16 -15
- judgeval/judges/__init__.py +2 -2
- judgeval/judges/base_judge.py +1 -1
- judgeval/judges/litellm_judge.py +2 -2
- judgeval/judges/mixture_of_judges.py +2 -2
- judgeval/judges/together_judge.py +2 -2
- judgeval/judges/utils.py +4 -4
- judgeval/judgment_client.py +67 -15
- judgeval/run_evaluation.py +79 -14
- judgeval/scorers/__init__.py +8 -4
- judgeval/scorers/api_scorer.py +64 -0
- judgeval/scorers/base_scorer.py +3 -2
- judgeval/scorers/exceptions.py +11 -0
- judgeval/scorers/{custom_scorer.py → judgeval_scorer.py} +9 -5
- judgeval/scorers/judgeval_scorers/__init__.py +132 -9
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +23 -0
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +19 -0
- judgeval/scorers/judgeval_scorers/{answer_relevancy.py → api_scorers/answer_relevancy.py} +2 -2
- judgeval/scorers/judgeval_scorers/{contextual_precision.py → api_scorers/contextual_precision.py} +2 -2
- judgeval/scorers/judgeval_scorers/{contextual_recall.py → api_scorers/contextual_recall.py} +2 -2
- judgeval/scorers/judgeval_scorers/{contextual_relevancy.py → api_scorers/contextual_relevancy.py} +2 -2
- judgeval/scorers/judgeval_scorers/{faithfulness.py → api_scorers/faithfulness.py} +2 -2
- judgeval/scorers/judgeval_scorers/{hallucination.py → api_scorers/hallucination.py} +2 -2
- judgeval/scorers/judgeval_scorers/{json_correctness.py → api_scorers/json_correctness.py} +7 -7
- judgeval/scorers/judgeval_scorers/{summarization.py → api_scorers/summarization.py} +2 -2
- judgeval/scorers/judgeval_scorers/{tool_correctness.py → api_scorers/tool_correctness.py} +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +24 -0
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +4 -0
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +272 -0
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +169 -0
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +4 -0
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +292 -0
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +174 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +3 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +259 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +106 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +3 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +249 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +142 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +3 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +240 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +121 -0
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +3 -0
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +318 -0
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +265 -0
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +3 -0
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +258 -0
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +104 -0
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +127 -0
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +3 -0
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +247 -0
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +541 -0
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +3 -0
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +151 -0
- judgeval/scorers/prompt_scorer.py +4 -4
- judgeval/scorers/score.py +14 -14
- judgeval/scorers/utils.py +40 -6
- {judgeval-0.0.3.dist-info → judgeval-0.0.5.dist-info}/METADATA +1 -1
- judgeval-0.0.5.dist-info/RECORD +78 -0
- judgeval-0.0.3.dist-info/RECORD +0 -46
- {judgeval-0.0.3.dist-info → judgeval-0.0.5.dist-info}/WHEEL +0 -0
- {judgeval-0.0.3.dist-info → judgeval-0.0.5.dist-info}/licenses/LICENSE.md +0 -0
judgeval/__init__.py
CHANGED
```diff
@@ -1,42 +1,4 @@
 # Import key components that should be publicly accessible
-from judgeval.common.utils import (
-    get_chat_completion,
-    aget_chat_completion,
-    get_completion_multiple_models,
-    aget_completion_multiple_models
-)
-from judgeval.data import (
-    Example,
-    ProcessExample,
-    ScorerData,
-    ScoringResult,
-)
-from judgeval.data.datasets import (
-    EvalDataset,
-    GroundTruthExample
-)
-
-from judgeval.judges import (
-    judgevalJudge,
-    LiteLLMJudge,
-    TogetherJudge,
-    MixtureOfJudges
-)
-from judgeval.scorers import (
-    JudgmentScorer,
-    CustomScorer,
-    PromptScorer,
-    ClassifierScorer,
-    ToolCorrectnessScorer,
-    JSONCorrectnessScorer,
-    SummarizationScorer,
-    HallucinationScorer,
-    FaithfulnessScorer,
-    ContextualRelevancyScorer,
-    ContextualPrecisionScorer,
-    ContextualRecallScorer,
-    AnswerRelevancyScorer
-)
 from judgeval.clients import client, langfuse, together_client
 from judgeval.judgment_client import JudgmentClient
 
@@ -46,38 +8,5 @@ __all__ = [
     'langfuse',
     'together_client',
 
-    # # Common utilities
-    # 'get_chat_completion',
-    # 'aget_chat_completion',
-    # 'get_completion_multiple_models',
-    # 'aget_completion_multiple_models',
-
-    # # Data classes
-    # 'Example',
-    # 'ProcessExample',
-    # 'ScorerData',
-    # 'ScoringResult',
-
-    # # Judges
-    # 'judgevalJudge',
-    # 'LiteLLMJudge',
-    # 'TogetherJudge',
-    # 'MixtureOfJudges',
-
-    # # Scorers
-    # 'JudgmentScorer',
-    # 'CustomScorer',
-    # 'PromptScorer',
-    # 'ClassifierScorer',
-    # 'ToolCorrectnessScorer',
-    # 'JSONCorrectnessScorer',
-    # 'SummarizationScorer',
-    # 'HallucinationScorer',
-    # 'FaithfulnessScorer',
-    # 'ContextualRelevancyScorer',
-    # 'ContextualPrecisionScorer',
-    # 'ContextualRecallScorer',
-    # 'AnswerRelevancyScorer',
-
     'JudgmentClient',
 ]
```
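In 0.0.5 the package root no longer re-exports the utility, data, judge, and scorer names; only the clients and `JudgmentClient` remain at the top level. A minimal sketch of how imports would look against the 0.0.5 layout shown in this diff (illustrative, not an official migration guide):

```python
# Names still exported from the package root (per the new __init__.py):
from judgeval import JudgmentClient, client, langfuse, together_client

# Names that were previously re-exported now come from their submodules:
from judgeval.data import Example, ScorerData, ScoringResult
from judgeval.judges import LiteLLMJudge, TogetherJudge, MixtureOfJudges
```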
judgeval/clients.py
CHANGED
```diff
@@ -2,18 +2,29 @@ import os
 from dotenv import load_dotenv
 from openai import OpenAI
 from langfuse import Langfuse
+from typing import Optional
 from together import Together, AsyncTogether
 
 PATH_TO_DOTENV = os.path.join(os.path.dirname(__file__), ".env")
 load_dotenv(dotenv_path=PATH_TO_DOTENV)
 
-# Initialize clients
+# Initialize required clients
 client = OpenAI()
 langfuse = Langfuse(
     secret_key=os.getenv("LANGFUSE_SECRET_KEY"),
     public_key=os.getenv("LANGFUSE_PUBLIC_KEY"),
     host=os.getenv("LANGFUSE_HOST"),
 )
-
-
+
+# Initialize optional Together clients
+together_client: Optional['Together'] = None
+async_together_client: Optional['AsyncTogether'] = None
+
+# Only initialize Together clients if API key is available
+if os.getenv("TOGETHERAI_API_KEY"):
+    try:
+        together_client = Together(api_key=os.getenv("TOGETHERAI_API_KEY"))
+        async_together_client = AsyncTogether(api_key=os.getenv("TOGETHERAI_API_KEY"))
+    except Exception:
+        pass
 
```
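Since `together_client` and `async_together_client` are now `Optional` and stay `None` when `TOGETHERAI_API_KEY` is missing (or client construction fails), downstream code has to guard their use. A minimal sketch assuming the Together SDK's chat-completions interface; the call shape and model name are illustrative, not taken from this diff:

```python
from typing import Optional

from judgeval.clients import together_client


def complete_with_together(prompt: str) -> Optional[str]:
    # judgeval.clients leaves together_client as None when no
    # TOGETHERAI_API_KEY is configured, so check before calling it.
    if together_client is None:
        return None
    # Assumed Together SDK call shape; verify against the SDK version you have installed.
    response = together_client.chat.completions.create(
        model="meta-llama/Llama-3-8b-chat-hf",
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content
```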
judgeval/common/tracer.py
CHANGED
```diff
@@ -7,16 +7,7 @@ import functools
 import requests
 import uuid
 from contextlib import contextmanager
-from typing import (
-    Optional,
-    Any,
-    List,
-    Literal,
-    Tuple,
-    Generator,
-    TypeAlias,
-    Union
-)
+from typing import Optional, Any, List, Literal, Tuple, Generator, TypeAlias, Union
 from dataclasses import dataclass, field
 from datetime import datetime
 from openai import OpenAI
@@ -33,7 +24,7 @@ from http import HTTPStatus
 from judgeval.constants import JUDGMENT_TRACES_SAVE_API_URL
 from judgeval.judgment_client import JudgmentClient
 from judgeval.data import Example
-from judgeval.scorers import
+from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
 from judgeval.data.result import ScoringResult
 
 # Define type aliases for better code readability and maintainability
@@ -76,16 +67,42 @@ class TraceEntry:
         elif self.type == "evaluation":
             print(f"{indent}Evaluation: {self.evaluation_result} ({self.duration:.3f}s)")
 
-    def
-    """
+    def _serialize_inputs(self) -> dict:
+        """Helper method to serialize input data safely.
+
+        Returns a dict with serializable versions of inputs, converting non-serializable
+        objects to None with a warning.
+        """
+        serialized_inputs = {}
+        for key, value in self.inputs.items():
+            if isinstance(value, BaseModel):
+                serialized_inputs[key] = value.model_dump()
+            elif isinstance(value, (list, tuple)):
+                # Handle lists/tuples of arguments
+                serialized_inputs[key] = [
+                    item.model_dump() if isinstance(item, BaseModel)
+                    else None if not self._is_json_serializable(item)
+                    else item
+                    for item in value
+                ]
+            else:
+                if self._is_json_serializable(value):
+                    serialized_inputs[key] = value
+                else:
+                    warnings.warn(f"Input '{key}' for function {self.function} is not JSON serializable. Setting to None.")
+                    serialized_inputs[key] = None
+        return serialized_inputs
+
+    def _is_json_serializable(self, obj: Any) -> bool:
+        """Helper method to check if an object is JSON serializable."""
         try:
-
+            json.dumps(obj)
+            return True
         except (TypeError, OverflowError, ValueError):
-
-            warnings.warn(f"Output for function {self.function} is not JSON serializable. Setting to None.")
-            output = None
+            return False
 
-
+    def to_dict(self) -> dict:
+        """Convert the trace entry to a dictionary format for storage/transmission."""
         return {
             "type": self.type,
             "function": self.function,
@@ -93,8 +110,8 @@ class TraceEntry:
             "message": self.message,
             "timestamp": self.timestamp,
             "duration": self.duration,
-            "output":
-            "inputs": self.
+            "output": self._serialize_output(),
+            "inputs": self._serialize_inputs(),
             "evaluation_result": [result.to_dict() for result in self.evaluation_result] if self.evaluation_result else None,
             "span_type": self.span_type
         }
@@ -104,18 +121,22 @@ class TraceEntry:
 
         Handles special cases:
         - Pydantic models are converted using model_dump()
-        -
+        - Non-serializable objects return None with a warning
         """
         if isinstance(self.output, BaseModel):
             return self.output.model_dump()
 
-
-
-
+        try:
+            # Try to serialize the output to verify it's JSON compatible
+            json.dumps(self.output)
+            return self.output
+        except (TypeError, OverflowError, ValueError):
+            warnings.warn(f"Output for function {self.function} is not JSON serializable. Setting to None.")
+            return None
 
 class TraceClient:
     """Client for managing a single trace context"""
-    def __init__(self, tracer, trace_id: str, name: str, project_name: str = "default_project"):
+    def __init__(self, tracer, trace_id: str, name: str, project_name: str = "default_project", overwrite: bool = False):
         self.tracer = tracer
         self.trace_id = trace_id
         self.name = name
@@ -125,6 +146,7 @@ class TraceClient:
         self.start_time = time.time()
         self.span_type = None
         self._current_span: Optional[TraceEntry] = None
+        self.overwrite = overwrite
 
     @contextmanager
     def span(self, name: str, span_type: SpanType = "span"):
@@ -165,7 +187,7 @@ class TraceClient:
 
     async def async_evaluate(
         self,
-        scorers: List[Union[
+        scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
         input: Optional[str] = None,
         actual_output: Optional[str] = None,
         expected_output: Optional[str] = None,
@@ -175,7 +197,7 @@ class TraceClient:
         expected_tools: Optional[List[str]] = None,
        additional_metadata: Optional[Dict[str, Any]] = None,
         model: Optional[str] = None,
-        log_results: Optional[bool] =
+        log_results: Optional[bool] = True,
     ):
         start_time = time.time() # Record start time
         example = Example(
@@ -195,9 +217,13 @@ class TraceClient:
             model=model,
             metadata={},
             log_results=log_results,
-            project_name=
-            eval_run_name=
-
+            project_name=self.project_name,
+            eval_run_name=(
+                f"{self.name.capitalize()}-"
+                f"{self._current_span}-"
+                f"[{','.join(scorer.load_implementation().score_type.capitalize() for scorer in scorers)}]"
+            ),
+            override=self.overwrite
         )
 
         self.record_evaluation(scoring_results, start_time) # Pass start_time to record_evaluation
@@ -393,7 +419,7 @@ class Tracer:
     def trace(self, name: str, project_name: str = "default_project", overwrite: bool = False) -> Generator[TraceClient, None, None]:
         """Start a new trace context using a context manager"""
         trace_id = str(uuid.uuid4())
-        trace = TraceClient(self, trace_id, name, project_name=project_name)
+        trace = TraceClient(self, trace_id, name, project_name=project_name, overwrite=overwrite)
         prev_trace = self._current_trace
         self._current_trace = trace
 
```
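The new `_serialize_inputs` and `_serialize_output` helpers degrade anything `json.dumps` rejects to `None` with a warning instead of letting trace saving fail. A standalone sketch of that check, mirroring the `_is_json_serializable` logic in the hunk above:

```python
import json
import warnings
from typing import Any


def is_json_serializable(obj: Any) -> bool:
    """Return True only if json.dumps accepts the object (same check as the tracer)."""
    try:
        json.dumps(obj)
        return True
    except (TypeError, OverflowError, ValueError):
        return False


def serialize_value(name: str, value: Any) -> Any:
    """Keep JSON-compatible values; degrade everything else to None with a warning."""
    if is_json_serializable(value):
        return value
    warnings.warn(f"Value '{name}' is not JSON serializable. Setting to None.")
    return None


print(serialize_value("score", {"faithfulness": 0.9}))  # kept as-is
print(serialize_value("handle", object()))              # -> None, with a warning
```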
judgeval/constants.py
CHANGED
```diff
@@ -15,6 +15,7 @@ class APIScorer(str, Enum):
     """
     FAITHFULNESS = "faithfulness"
     ANSWER_RELEVANCY = "answer_relevancy"
+    ANSWER_CORRECTNESS = "answer_correctness"
     HALLUCINATION = "hallucination"
     SUMMARIZATION = "summarization"
     CONTEXTUAL_RECALL = "contextual_recall"
```
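The `APIScorer` enum gains an `ANSWER_CORRECTNESS` member alongside the existing score types. A quick illustration of how such a string-valued enum behaves (only a subset of members is reproduced here):

```python
from enum import Enum


class APIScorer(str, Enum):
    # Subset of the members shown in this diff.
    FAITHFULNESS = "faithfulness"
    ANSWER_RELEVANCY = "answer_relevancy"
    ANSWER_CORRECTNESS = "answer_correctness"


# Because the enum subclasses str, members compare equal to their string values.
assert APIScorer.ANSWER_CORRECTNESS == "answer_correctness"
print(list(APIScorer))
```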
judgeval/data/__init__.py
CHANGED
```diff
@@ -1,10 +1,11 @@
-from judgeval.data.example import Example
+from judgeval.data.example import Example, ExampleParams
 from judgeval.data.api_example import ProcessExample, create_process_example
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
 
 __all__ = [
     "Example",
+    "ExampleParams",
     "ProcessExample",
     "create_process_example",
     "ScorerData",
```
judgeval/data/scorer_data.py
CHANGED
```diff
@@ -7,7 +7,7 @@ ScorerData holds the information related to a single, completed Scorer evaluation
 from typing import List, Union, Optional, Dict
 from pydantic import BaseModel, Field
 
-from judgeval.scorers import
+from judgeval.scorers import JudgevalScorer
 
 class ScorerData(BaseModel):
     """
@@ -47,7 +47,7 @@ class ScorerData(BaseModel):
     }
 
 
-def create_scorer_data(scorer:
+def create_scorer_data(scorer: JudgevalScorer) -> ScorerData:
     """
     After a `scorer` is run, it contains information about the example that was evaluated
     using the scorer. For example, after computing Faithfulness, the `scorer` object will contain
```
judgeval/evaluation_run.py
CHANGED
```diff
@@ -2,11 +2,10 @@ from typing import List, Optional, Dict, Any, Union
 from pydantic import BaseModel, field_validator
 
 from judgeval.data import Example
-from judgeval.scorers import
-from judgeval.judges import judgevalJudge
+from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
 from judgeval.constants import ACCEPTABLE_MODELS
 from judgeval.common.logger import debug, error
-
+from judgeval.judges import JudgevalJudge
 
 class EvaluationRun(BaseModel):
     """
@@ -28,8 +27,8 @@ class EvaluationRun(BaseModel):
     project_name: Optional[str] = None
     eval_name: Optional[str] = None
     examples: List[Example]
-    scorers: List[Union[
-    model: Union[str, List[str],
+    scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
+    model: Union[str, List[str], JudgevalJudge]
     aggregator: Optional[str] = None
     metadata: Optional[Dict[str, Any]] = None
     # API Key will be "" until user calls client.run_eval(), then API Key will be set
@@ -39,8 +38,9 @@ class EvaluationRun(BaseModel):
         data = super().model_dump(**kwargs)
 
         data["scorers"] = [
-            scorer.to_dict()
-            if hasattr(scorer, "
+            scorer.to_dict() if hasattr(scorer, "to_dict")
+            else scorer.model_dump() if hasattr(scorer, "model_dump")
+            else {"score_type": scorer.score_type, "threshold": scorer.threshold}
             for scorer in self.scorers
         ]
         return data
@@ -81,7 +81,7 @@ class EvaluationRun(BaseModel):
         if not v:
             raise ValueError("Scorers cannot be empty.")
         for s in v:
-            if not isinstance(s,
+            if not isinstance(s, APIJudgmentScorer) and not isinstance(s, JudgevalScorer):
                 raise ValueError(f"Invalid type for Scorer: {type(s)}")
         return v
 
@@ -89,20 +89,21 @@ class EvaluationRun(BaseModel):
     def validate_model(cls, v, values):
         if not v:
             raise ValueError("Model cannot be empty.")
+
         # Check if model is a judgevalJudge
-        if isinstance(v,
-            # Verify all scorers are
+        if isinstance(v, JudgevalJudge):
+            # Verify all scorers are JudgevalScorer when using judgevalJudge
             scorers = values.data.get('scorers', [])
-            if not all(isinstance(s,
-                raise ValueError("When using a judgevalJudge model, all scorers must be
+            if not all(isinstance(s, JudgevalScorer) for s in scorers):
+                raise ValueError("When using a judgevalJudge model, all scorers must be JudgevalScorer type")
             return v
-
+
         # Check if model is string or list of strings
         if isinstance(v, str):
             if v not in ACCEPTABLE_MODELS:
                 raise ValueError(f"Model name {v} not recognized.")
             return v
-
+
         if isinstance(v, list):
             if not all(isinstance(m, str) for m in v):
                 raise ValueError("When providing a list of models, all elements must be strings")
@@ -110,7 +111,7 @@ class EvaluationRun(BaseModel):
                 if m not in ACCEPTABLE_MODELS:
                     raise ValueError(f"Model name {m} not recognized.")
             return v
-        raise ValueError(f"Model must be one of: string, list of strings, or
+        raise ValueError(f"Model must be one of: string, list of strings, or JudgevalJudge instance. Received type {type(v)}.")
 
     @field_validator('aggregator', mode='before')
     def validate_aggregator(cls, v, values):
```
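`EvaluationRun.model_dump` now serializes each scorer by preferring `to_dict()`, then falling back to pydantic's `model_dump()`, then to a minimal `{score_type, threshold}` dict. A small sketch of that fallback chain in isolation (the stand-in scorer class is illustrative, not the library's):

```python
from typing import Any


class MinimalScorer:
    """Stand-in with only the attributes the final fallback needs."""

    def __init__(self, score_type: str, threshold: float):
        self.score_type = score_type
        self.threshold = threshold


def serialize_scorer(scorer: Any) -> dict:
    # Same precedence as the new EvaluationRun.model_dump:
    # to_dict() first, then model_dump(), then a minimal dict.
    if hasattr(scorer, "to_dict"):
        return scorer.to_dict()
    if hasattr(scorer, "model_dump"):
        return scorer.model_dump()
    return {"score_type": scorer.score_type, "threshold": scorer.threshold}


print(serialize_scorer(MinimalScorer("faithfulness", 0.7)))
# {'score_type': 'faithfulness', 'threshold': 0.7}
```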
judgeval/judges/__init__.py
CHANGED
```diff
@@ -1,7 +1,7 @@
 from pydantic import BaseModel
-from judgeval.judges.base_judge import
+from judgeval.judges.base_judge import JudgevalJudge
 from judgeval.judges.litellm_judge import LiteLLMJudge
 from judgeval.judges.together_judge import TogetherJudge
 from judgeval.judges.mixture_of_judges import MixtureOfJudges
 
-__all__ = ["
+__all__ = ["JudgevalJudge", "LiteLLMJudge", "TogetherJudge", "MixtureOfJudges"]
```
judgeval/judges/base_judge.py
CHANGED
```diff
@@ -6,7 +6,7 @@ from abc import ABC, abstractmethod
 from typing import Optional, List
 
 
-class
+class JudgevalJudge(ABC):
     def __init__(self, model_name: Optional[str] = None, *args, **kwargs):
         self.model_name = model_name
         self.model = self.load_model(*args, **kwargs)
```
judgeval/judges/litellm_judge.py
CHANGED
```diff
@@ -2,7 +2,7 @@ import pydantic
 from typing import List, Union, Mapping
 
 from judgeval import *
-from judgeval.judges import
+from judgeval.judges import JudgevalJudge
 from judgeval.common.utils import afetch_litellm_api_response, fetch_litellm_api_response
 from judgeval.common.logger import debug, error
 
@@ -11,7 +11,7 @@ BASE_CONVERSATION = [
 ] # for string inputs, we need to add the user query to a base conversation, since LiteLLM only accepts a list of dictionaries as a chat history
 
 
-class LiteLLMJudge(
+class LiteLLMJudge(JudgevalJudge):
     def __init__(self, model: str = "gpt-4o-mini", **kwargs):
         debug(f"Initializing LiteLLMJudge with model={model}")
         self.model = model
```
judgeval/judges/mixture_of_judges.py
CHANGED
```diff
@@ -6,7 +6,7 @@ Enables client to use multiple models to generate responses and then aggregate t
 from judgeval import *
 import pydantic
 from typing import List, Union, Mapping, Dict
-from judgeval.judges import
+from judgeval.judges import JudgevalJudge
 from judgeval.common.utils import get_completion_multiple_models, get_chat_completion, aget_completion_multiple_models, aget_chat_completion
 from judgeval.common.logger import debug, error
 
@@ -115,7 +115,7 @@ def build_dynamic_mixture_prompt(
 BASE_CONVERSATION = [
     {"role": "system", "content": "You are a helpful assistant."},
 ] # for string inputs, we need to add the user query to a base conversation, since LiteLLM only accepts a list of dictionaries as a chat history
-class MixtureOfJudges(
+class MixtureOfJudges(JudgevalJudge):
     """
     IMPORTANT: When supplying custom prompts and conversation histories for aggregation, supply them in the following format:
     in kwargs:
```
judgeval/judges/together_judge.py
CHANGED
```diff
@@ -6,14 +6,14 @@ from pydantic import BaseModel
 from typing import List, Union, Mapping
 from judgeval.common.logger import debug, error
 
-from judgeval.judges import
+from judgeval.judges import JudgevalJudge
 from judgeval.common.utils import fetch_together_api_response, afetch_together_api_response
 
 BASE_CONVERSATION = [
     {"role": "system", "content": "You are a helpful assistant."},
 ]
 
-class TogetherJudge(
+class TogetherJudge(JudgevalJudge):
     def __init__(self, model: str = "QWEN", **kwargs):
         debug(f"Initializing TogetherJudge with model={model}")
         self.model = model
```
judgeval/judges/utils.py
CHANGED
```diff
@@ -5,13 +5,13 @@ import litellm
 from typing import Optional, Union, Tuple, List
 
 from judgeval.common.exceptions import InvalidJudgeModelError
-from judgeval.judges import
+from judgeval.judges import JudgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges
 from judgeval.constants import TOGETHER_SUPPORTED_MODELS
 
 LITELLM_SUPPORTED_MODELS = set(litellm.model_list)
 
 def create_judge(
-    model: Optional[Union[str, List[str],
+    model: Optional[Union[str, List[str], JudgevalJudge]] = None) -> Tuple[JudgevalJudge, bool]:
     """
     Creates a judge model from string(s) or a judgeval judge object.
 
@@ -24,10 +24,10 @@ def create_judge(
     """
     if model is None: # default option
         return LiteLLMJudge(model="gpt-4o"), True
-    if not isinstance(model, (str, list,
+    if not isinstance(model, (str, list, JudgevalJudge)):
         raise InvalidJudgeModelError(f"Model must be a string, list of strings, or a judgeval judge object. Got: {type(model)} instead.")
     # If model is already a valid judge type, return it and mark native
-    if isinstance(model, (
+    if isinstance(model, (JudgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges)):
         return model, True
 
     # Either string or List[str]
```
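`create_judge` still accepts a string, a list of strings, or an existing judge object and returns a `(judge, is_native)` tuple; the type checks now use `JudgevalJudge`. A hedged usage sketch based only on the signatures visible in this hunk:

```python
from judgeval.judges import TogetherJudge
from judgeval.judges.utils import create_judge

# No model given: falls back to a LiteLLM judge on gpt-4o.
judge, is_native = create_judge()

# A string (or list of strings) is validated against the supported model lists.
judge, is_native = create_judge("gpt-4o-mini")

# An existing judge instance is returned unchanged and marked native.
judge, is_native = create_judge(TogetherJudge())
```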
judgeval/judgment_client.py
CHANGED
```diff
@@ -7,11 +7,22 @@ import requests
 
 from judgeval.constants import ROOT_API
 from judgeval.data.datasets import EvalDataset
-from judgeval.data import
-
-
+from judgeval.data import (
+    ScoringResult,
+    Example
+)
+from judgeval.scorers import (
+    APIJudgmentScorer,
+    JudgevalScorer,
+    ClassifierScorer,
+    ScorerWrapper
+)
 from judgeval.evaluation_run import EvaluationRun
-from judgeval.run_evaluation import
+from judgeval.run_evaluation import (
+    run_eval,
+    assert_test
+)
+from judgeval.judges import JudgevalJudge
 from judgeval.constants import JUDGMENT_EVAL_FETCH_API_URL
 from judgeval.common.exceptions import JudgmentAPIError
 from pydantic import BaseModel
@@ -37,25 +48,32 @@ class JudgmentClient:
     def run_evaluation(
         self,
         examples: List[Example],
-        scorers: List[Union[
-        model: Union[str, List[str],
+        scorers: List[Union[ScorerWrapper, JudgevalScorer]],
+        model: Union[str, List[str], JudgevalJudge],
         aggregator: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
-        log_results: bool =
-        project_name: str = "",
-        eval_run_name: str = "",
+        log_results: bool = True,
+        project_name: str = "default_project",
+        eval_run_name: str = "default_eval_run",
         override: bool = False,
+        use_judgment: bool = True
     ) -> List[ScoringResult]:
         """
         Executes an evaluation of `Example`s using one or more `Scorer`s
         """
         try:
+            # Load appropriate implementations for all scorers
+            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = [
+                scorer.load_implementation(use_judgment=use_judgment) if isinstance(scorer, ScorerWrapper) else scorer
+                for scorer in scorers
+            ]
+
             eval = EvaluationRun(
                 log_results=log_results,
                 project_name=project_name,
                 eval_name=eval_run_name,
                 examples=examples,
-                scorers=
+                scorers=loaded_scorers,
                 model=model,
                 aggregator=aggregator,
                 metadata=metadata,
@@ -68,24 +86,31 @@ class JudgmentClient:
     def evaluate_dataset(
         self,
         dataset: EvalDataset,
-        scorers: List[Union[
-        model: Union[str, List[str]],
+        scorers: List[Union[ScorerWrapper, JudgevalScorer]],
+        model: Union[str, List[str], JudgevalJudge],
         aggregator: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
         project_name: str = "",
         eval_run_name: str = "",
-        log_results: bool = False
+        log_results: bool = False,
+        use_judgment: bool = True
     ) -> List[ScoringResult]:
         """
         Executes an evaluation of a `EvalDataset` using one or more `Scorer`s
         """
         try:
+            # Load appropriate implementations for all scorers
+            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = [
+                scorer.load_implementation(use_judgment=use_judgment) if isinstance(scorer, ScorerWrapper) else scorer
+                for scorer in scorers
+            ]
+
             evaluation_run = EvaluationRun(
                 log_results=log_results,
                 project_name=project_name,
                 eval_name=eval_run_name,
                 examples=dataset.examples,
-                scorers=
+                scorers=loaded_scorers,
                 model=model,
                 aggregator=aggregator,
                 metadata=metadata,
@@ -241,4 +266,31 @@ class JudgmentClient:
             raise JudgmentAPIError(f"Failed to save classifier scorer: {response.json().get('detail', '')}")
 
         return response.json()["slug"]
-
+
+
+    def assert_test(
+        self,
+        examples: List[Example],
+        scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
+        model: Union[str, List[str], JudgevalJudge],
+        aggregator: Optional[str] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+        log_results: bool = False,
+        project_name: str = "",
+        eval_run_name: str = "",
+        override: bool = False,
+    ) -> None:
+
+        results = self.run_evaluation(
+            examples=examples,
+            scorers=scorers,
+            model=model,
+            aggregator=aggregator,
+            metadata=metadata,
+            log_results=log_results,
+            project_name=project_name,
+            eval_run_name=eval_run_name,
+            override=override
+        )
+
+        assert_test(results)
```