judgeval 0.0.31__py3-none-any.whl → 0.0.34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +3 -1
- judgeval/common/s3_storage.py +93 -0
- judgeval/common/tracer.py +869 -183
- judgeval/constants.py +1 -1
- judgeval/data/datasets/dataset.py +5 -1
- judgeval/data/datasets/eval_dataset_client.py +2 -2
- judgeval/data/sequence.py +16 -26
- judgeval/data/sequence_run.py +2 -0
- judgeval/judgment_client.py +44 -166
- judgeval/rules.py +4 -7
- judgeval/run_evaluation.py +2 -2
- judgeval/scorers/__init__.py +4 -4
- judgeval/scorers/judgeval_scorers/__init__.py +0 -176
- judgeval/version_check.py +22 -0
- {judgeval-0.0.31.dist-info → judgeval-0.0.34.dist-info}/METADATA +15 -2
- judgeval-0.0.34.dist-info/RECORD +63 -0
- judgeval/scorers/base_scorer.py +0 -58
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -27
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -276
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py +0 -0
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py +0 -161
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/prompts.py +0 -222
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
- judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/execution_order/execution_order.py +0 -156
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -318
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -264
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
- judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/instruction_adherence.py +0 -232
- judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/prompt.py +0 -102
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -551
- judgeval-0.0.31.dist-info/RECORD +0 -96
- {judgeval-0.0.31.dist-info → judgeval-0.0.34.dist-info}/WHEEL +0 -0
- {judgeval-0.0.31.dist-info → judgeval-0.0.34.dist-info}/licenses/LICENSE.md +0 -0
judgeval/constants.py
CHANGED
@@ -43,7 +43,7 @@ JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
 JUDGMENT_SEQUENCE_EVAL_API_URL = f"{ROOT_API}/evaluate_sequence/"
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
 JUDGMENT_DATASETS_APPEND_API_URL = f"{ROOT_API}/datasets/insert_examples/"
-JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/
+JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull_for_judgeval/"
 JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
 JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
 JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_project/"
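For orientation, a minimal sketch of what the renamed pull constant resolves to; ROOT_API is defined elsewhere in judgeval/constants.py, and the host below is a placeholder, not the real Judgment endpoint.

# Illustrative only: ROOT_API is a placeholder value here.
ROOT_API = "https://api.example.com"
JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull_for_judgeval/"
print(JUDGMENT_DATASETS_PULL_API_URL)  # https://api.example.com/datasets/pull_for_judgeval/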
judgeval/data/datasets/dataset.py
CHANGED
@@ -7,12 +7,13 @@ import yaml
 from dataclasses import dataclass, field
 from typing import List, Union, Literal
 
-from judgeval.data import Example
+from judgeval.data import Example, Sequence
 from judgeval.common.logger import debug, error, warning, info
 
 @dataclass
 class EvalDataset:
     examples: List[Example]
+    sequences: List[Sequence]
     _alias: Union[str, None] = field(default=None)
     _id: Union[str, None] = field(default=None)
     judgment_api_key: str = field(default="")
@@ -21,11 +22,13 @@ class EvalDataset:
         judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"),
         organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
         examples: List[Example] = [],
+        sequences: List[Sequence] = []
     ):
         debug(f"Initializing EvalDataset with {len(examples)} examples")
         if not judgment_api_key:
            warning("No judgment_api_key provided")
         self.examples = examples
+        self.sequences = sequences
         self._alias = None
         self._id = None
         self.judgment_api_key = judgment_api_key
@@ -309,6 +312,7 @@ class EvalDataset:
        return (
            f"{self.__class__.__name__}("
            f"examples={self.examples}, "
+           f"sequences={self.sequences}, "
            f"_alias={self._alias}, "
            f"_id={self._id}"
            f")"
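EvalDataset now carries sequences alongside examples. A minimal usage sketch follows; the Example/Sequence constructor fields used here (input, actual_output, items) are illustrative assumptions, since they are not spelled out in this diff.

from judgeval.data import Example, Sequence
from judgeval.data.datasets import EvalDataset

# Field names on Example/Sequence below are assumptions for illustration.
example = Example(input="What is the capital of France?", actual_output="Paris")
sequence = Sequence(items=[example])  # items may also nest other Sequence objects

dataset = EvalDataset(
    judgment_api_key="...",   # defaults to the JUDGMENT_API_KEY env var
    organization_id="...",    # defaults to the JUDGMENT_ORG_ID env var
    examples=[example],
    sequences=[sequence],
)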
judgeval/data/datasets/eval_dataset_client.py
CHANGED
@@ -13,7 +13,7 @@ from judgeval.constants import (
     JUDGMENT_DATASETS_INSERT_API_URL,
     JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
 )
-from judgeval.data import Example
+from judgeval.data import Example, Sequence
 from judgeval.data.datasets import EvalDataset
 
 
@@ -201,8 +201,8 @@ class EvalDatasetClient:
 
            info(f"Successfully pulled dataset with alias '{alias}'")
            payload = response.json()
-
            dataset.examples = [Example(**e) for e in payload.get("examples", [])]
+           dataset.sequences = [Sequence(**s) for s in payload.get("sequences", [])]
            dataset._alias = payload.get("alias")
            dataset._id = payload.get("id")
            progress.update(
judgeval/data/sequence.py
CHANGED
@@ -1,7 +1,7 @@
 from pydantic import BaseModel, Field, field_validator, model_validator
 from typing import List, Optional, Union, Any
 from judgeval.data.example import Example
-from judgeval.scorers import
+from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
 from uuid import uuid4
 from datetime import datetime, timezone
 
@@ -16,42 +16,32 @@ class Sequence(BaseModel):
     scorers: Optional[Any] = None
     parent_sequence_id: Optional[str] = None
     sequence_order: Optional[int] = 0
+    root_sequence_id: Optional[str] = None
+    inputs: Optional[str] = None
+    output: Optional[str] = None
 
     @field_validator("scorers")
     def validate_scorer(cls, v):
-        loaded_scorers = []
         for scorer in v or []:
-
-
-
-            else:
-                loaded_scorers.append(scorer)
-        except Exception as e:
-            raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
-        return loaded_scorers
+            if not isinstance(scorer, APIJudgmentScorer) and not isinstance(scorer, JudgevalScorer):
+                raise ValueError(f"Invalid scorer type: {type(scorer)}")
+        return v
 
-    @model_validator(mode=
-    def
-    """Recursively set
-
-
-
-        # Recurse into deeper nested sequences
-        item.set_parent_sequence_ids()
-        return self
+    @model_validator(mode="after")
+    def populate_sequence_metadata(self) -> "Sequence":
+        """Recursively set parent_sequence_id, root_sequence_id, and sequence_order."""
+        # If root_sequence_id isn't already set, assign it to self
+        if self.root_sequence_id is None:
+            self.root_sequence_id = self.sequence_id
 
-    @model_validator(mode='after')
-    def set_parent_and_order(self) -> "Sequence":
-        """Set parent_sequence_id and sequence_order for all items."""
         for idx, item in enumerate(self.items):
-            # Set sequence_order for both Example and Sequence objects
             item.sequence_order = idx
-
             if isinstance(item, Sequence):
                 item.parent_sequence_id = self.sequence_id
-                item.
+                item.root_sequence_id = self.root_sequence_id
+                item.populate_sequence_metadata()
         return self
-
+
     class Config:
         arbitrary_types_allowed = True
 
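A sketch of what the new populate_sequence_metadata validator implies for nested sequences; the Example constructor fields are assumed for illustration, while the propagation itself follows the validator shown above.

from judgeval.data import Example, Sequence

child = Sequence(items=[Example(input="step 2", actual_output="...")])
root = Sequence(items=[Example(input="step 1", actual_output="..."), child])

# After construction, the model validator has propagated the metadata:
assert root.root_sequence_id == root.sequence_id     # a root points at itself
assert child.parent_sequence_id == root.sequence_id  # set while iterating root.items
assert child.root_sequence_id == root.sequence_id    # inherited from the root
assert child.sequence_order == 1                     # index of child within root.items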
judgeval/data/sequence_run.py
CHANGED
@@ -21,6 +21,7 @@ class SequenceRun(BaseModel):
         metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
         judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
         rules (Optional[List[Rule]]): Rules to evaluate against scoring results
+        append (Optional[bool]): Whether to append to existing evaluation results
     """
 
     # The user will specify whether they want log_results when they call run_eval
@@ -33,6 +34,7 @@ class SequenceRun(BaseModel):
     aggregator: Optional[str] = None
     metadata: Optional[Dict[str, Any]] = None
     trace_span_id: Optional[str] = None
+    append: Optional[bool] = False
     # API Key will be "" until user calls client.run_eval(), then API Key will be set
     judgment_api_key: Optional[str] = ""
     override: Optional[bool] = False
judgeval/judgment_client.py
CHANGED
@@ -17,7 +17,6 @@ from judgeval.scorers import (
     APIJudgmentScorer,
     JudgevalScorer,
     ClassifierScorer,
-    ScorerWrapper,
 )
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
@@ -74,7 +73,7 @@ class JudgmentClient(metaclass=SingletonMeta):
     def a_run_evaluation(
         self,
         examples: List[Example],
-        scorers: List[Union[
+        scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
         model: Union[str, List[str], JudgevalJudge],
         aggregator: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
@@ -83,49 +82,57 @@ class JudgmentClient(metaclass=SingletonMeta):
         eval_run_name: str = "default_eval_run",
         override: bool = False,
         append: bool = False,
-        use_judgment: bool = True,
         ignore_errors: bool = True,
         rules: Optional[List[Rule]] = None
     ) -> List[ScoringResult]:
-        return self.run_evaluation(
+        return self.run_evaluation(
+            examples=examples,
+            scorers=scorers,
+            model=model,
+            aggregator=aggregator,
+            metadata=metadata,
+            log_results=log_results,
+            project_name=project_name,
+            eval_run_name=eval_run_name,
+            override=override,
+            append=append,
+            ignore_errors=ignore_errors,
+            rules=rules
+        )
 
     def run_sequence_evaluation(
         self,
         sequences: List[Sequence],
         model: Union[str, List[str], JudgevalJudge],
+        scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
         aggregator: Optional[str] = None,
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_sequence",
-        use_judgment: bool = True,
         log_results: bool = True,
+        append: bool = False,
         override: bool = False,
         ignore_errors: bool = True,
         rules: Optional[List[Rule]] = None
     ) -> List[ScoringResult]:
         try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            new_rule = rule.model_copy()
-            new_rule.conditions = processed_conditions
-            loaded_rules.append(new_rule)
-        except Exception as e:
-            raise ValueError(f"Failed to process rule '{rule.name}': {str(e)}")
+            def get_all_sequences(root: Sequence) -> List[Sequence]:
+                all_sequences = [root]
+
+                for item in root.items:
+                    if isinstance(item, Sequence):
+                        all_sequences.extend(get_all_sequences(item))
+
+                return all_sequences
+
+            def flatten_sequence_list(sequences: List[Sequence]) -> List[Sequence]:
+                flattened = []
+                for seq in sequences:
+                    flattened.extend(get_all_sequences(seq))
+                return flattened
+
+            flattened_sequences = flatten_sequence_list(sequences)
+            for sequence in flattened_sequences:
+                sequence.scorers = scorers
 
            sequence_run = SequenceRun(
                project_name=project_name,
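run_sequence_evaluation now takes scorers directly and fans them out to every nested sequence via the flattening helpers above. A hedged usage sketch follows; the top-level import path, judge model string, and scorer arguments are illustrative assumptions.

from judgeval import JudgmentClient              # assumed top-level export
from judgeval.scorers import AnswerRelevancyScorer

client = JudgmentClient()
results = client.run_sequence_evaluation(
    sequences=[root],                            # `root` from the sequence sketch above
    model="gpt-4o",                              # placeholder judge model
    scorers=[AnswerRelevancyScorer(threshold=0.5)],
    project_name="default_project",
    eval_run_name="sequence_run_demo",
    append=False,
)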
@@ -134,11 +141,11 @@ class JudgmentClient(metaclass=SingletonMeta):
                model=model,
                aggregator=aggregator,
                log_results=log_results,
+               append=append,
                judgment_api_key=self.judgment_api_key,
                organization_id=self.organization_id
            )
-
-           return run_sequence_eval(sequence_run, override, ignore_errors, use_judgment)
+           return run_sequence_eval(sequence_run, override, ignore_errors)
        except ValueError as e:
            raise ValueError(f"Please check your SequenceRun object, one or more fields are invalid: \n{str(e)}")
        except Exception as e:
@@ -147,7 +154,7 @@ class JudgmentClient(metaclass=SingletonMeta):
     def run_evaluation(
         self,
         examples: Union[List[Example], List[CustomExample]],
-        scorers: List[Union[
+        scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
         model: Union[str, List[str], JudgevalJudge],
         aggregator: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
@@ -156,7 +163,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         eval_run_name: str = "default_eval_run",
         override: bool = False,
         append: bool = False,
-        use_judgment: bool = True,
         ignore_errors: bool = True,
         async_execution: bool = False,
         rules: Optional[List[Rule]] = None
@@ -166,7 +172,7 @@ class JudgmentClient(metaclass=SingletonMeta):
 
         Args:
             examples (Union[List[Example], List[CustomExample]]): The examples to evaluate
-            scorers (List[Union[
+            scorers (List[Union[APIJudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
             model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
             aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
             metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run
@@ -174,7 +180,6 @@ class JudgmentClient(metaclass=SingletonMeta):
             project_name (str): The name of the project the evaluation results belong to
             eval_run_name (str): A name for this evaluation run
             override (bool): Whether to override an existing evaluation run with the same name
-            use_judgment (bool): Whether to use Judgment API for evaluation
             ignore_errors (bool): Whether to ignore errors during evaluation (safely handled)
             rules (Optional[List[Rule]]): Rules to evaluate against scoring results
 
@@ -185,58 +190,21 @@
            raise ValueError("Cannot set both override and append to True. Please choose one.")
 
        try:
-
-           loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = []
-           for scorer in scorers:
-               try:
-                   if isinstance(scorer, ScorerWrapper):
-                       loaded_scorers.append(scorer.load_implementation(use_judgment=use_judgment))
-                   else:
-                       loaded_scorers.append(scorer)
-               except Exception as e:
-                   raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
-
-           # Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
-           if rules and any(isinstance(scorer, JudgevalScorer) for scorer in loaded_scorers):
+           if rules and any(isinstance(scorer, JudgevalScorer) for scorer in scorers):
               raise ValueError("Cannot use Judgeval scorers (only API scorers) when using rules. Please either remove rules or use only APIJudgmentScorer types.")
 
-           # Convert ScorerWrapper in rules to their implementations
-           loaded_rules = None
-           if rules:
-               loaded_rules = []
-               for rule in rules:
-                   try:
-                       processed_conditions = []
-                       for condition in rule.conditions:
-                           # Convert metric if it's a ScorerWrapper
-                           if isinstance(condition.metric, ScorerWrapper):
-                               try:
-                                   condition_copy = condition.model_copy()
-                                   condition_copy.metric = condition.metric.load_implementation(use_judgment=use_judgment)
-                                   processed_conditions.append(condition_copy)
-                               except Exception as e:
-                                   raise ValueError(f"Failed to convert ScorerWrapper to implementation in rule '{rule.name}', condition metric '{condition.metric}': {str(e)}")
-                           else:
-                               processed_conditions.append(condition)
-
-                       # Create new rule with processed conditions
-                       new_rule = rule.model_copy()
-                       new_rule.conditions = processed_conditions
-                       loaded_rules.append(new_rule)
-                   except Exception as e:
-                       raise ValueError(f"Failed to process rule '{rule.name}': {str(e)}")
            eval = EvaluationRun(
               log_results=log_results,
               append=append,
               project_name=project_name,
               eval_name=eval_run_name,
               examples=examples,
-              scorers=
+              scorers=scorers,
               model=model,
               aggregator=aggregator,
               metadata=metadata,
               judgment_api_key=self.judgment_api_key,
-              rules=
+              rules=rules,
               organization_id=self.organization_id
            )
            return run_eval(eval, override, ignore_errors=ignore_errors, async_execution=async_execution)
@@ -244,98 +212,6 @@
            raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
        except Exception as e:
            raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
-
-    def evaluate_dataset(
-        self,
-        dataset: EvalDataset,
-        scorers: List[Union[ScorerWrapper, JudgevalScorer]],
-        model: Union[str, List[str], JudgevalJudge],
-        aggregator: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-        project_name: str = "",
-        eval_run_name: str = "",
-        log_results: bool = True,
-        use_judgment: bool = True,
-        rules: Optional[List[Rule]] = None
-    ) -> List[ScoringResult]:
-        """
-        Executes an evaluation of a `EvalDataset` using one or more `Scorer`s
-
-        Args:
-            dataset (EvalDataset): The dataset containing examples to evaluate
-            scorers (List[Union[ScorerWrapper, JudgevalScorer]]): A list of scorers to use for evaluation
-            model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
-            aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
-            metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run
-            project_name (str): The name of the project the evaluation results belong to
-            eval_run_name (str): A name for this evaluation run
-            log_results (bool): Whether to log the results to the Judgment API
-            use_judgment (bool): Whether to use Judgment API for evaluation
-            rules (Optional[List[Rule]]): Rules to evaluate against scoring results
-
-        Returns:
-            List[ScoringResult]: The results of the evaluation
-        """
-        try:
-            # Load appropriate implementations for all scorers
-            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = []
-            for scorer in scorers:
-                try:
-                    if isinstance(scorer, ScorerWrapper):
-                        loaded_scorers.append(scorer.load_implementation(use_judgment=use_judgment))
-                    else:
-                        loaded_scorers.append(scorer)
-                except Exception as e:
-                    raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
-
-            # Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
-            if rules and any(isinstance(scorer, JudgevalScorer) for scorer in loaded_scorers):
-                raise ValueError("Cannot use Judgeval scorers (only API scorers) when using rules. Please either remove rules or use only APIJudgmentScorer types.")
-
-            # Convert ScorerWrapper in rules to their implementations
-            loaded_rules = None
-            if rules:
-                loaded_rules = []
-                for rule in rules:
-                    try:
-                        processed_conditions = []
-                        for condition in rule.conditions:
-                            # Convert metric if it's a ScorerWrapper
-                            if isinstance(condition.metric, ScorerWrapper):
-                                try:
-                                    condition_copy = condition.model_copy()
-                                    condition_copy.metric = condition.metric.load_implementation(use_judgment=use_judgment)
-                                    processed_conditions.append(condition_copy)
-                                except Exception as e:
-                                    raise ValueError(f"Failed to convert ScorerWrapper to implementation in rule '{rule.name}', condition metric '{condition.metric}': {str(e)}")
-                            else:
-                                processed_conditions.append(condition)
-
-                        # Create new rule with processed conditions
-                        new_rule = rule.model_copy()
-                        new_rule.conditions = processed_conditions
-                        loaded_rules.append(new_rule)
-                    except Exception as e:
-                        raise ValueError(f"Failed to process rule '{rule.name}': {str(e)}")
-
-            evaluation_run = EvaluationRun(
-                log_results=log_results,
-                project_name=project_name,
-                eval_name=eval_run_name,
-                examples=dataset.examples,
-                scorers=loaded_scorers,
-                model=model,
-                aggregator=aggregator,
-                metadata=metadata,
-                judgment_api_key=self.judgment_api_key,
-                rules=loaded_rules,
-                organization_id=self.organization_id
-            )
-            return run_eval(evaluation_run)
-        except ValueError as e:
-            raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
-        except Exception as e:
-            raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
 
     def create_dataset(self) -> EvalDataset:
         return self.eval_dataset_client.create_dataset()
@@ -566,6 +442,8 @@
            raise JudgmentAPIError(f"Failed to fetch classifier scorer '{slug}': {response.json().get('detail', '')}")
 
        scorer_config = response.json()
+       created_at = scorer_config.pop("created_at")
+       updated_at = scorer_config.pop("updated_at")
 
        try:
            return ClassifierScorer(**scorer_config)
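Since evaluate_dataset() is removed in 0.0.34 and the ScorerWrapper loading step is gone, the equivalent call passes dataset.examples and the scorers straight to run_evaluation. A hedged migration sketch follows; the FaithfulnessScorer export and the judge model string are assumptions, and `client` and `dataset` are from the earlier sketches.

from judgeval.scorers import FaithfulnessScorer  # assumed to still be re-exported

results = client.run_evaluation(
    examples=dataset.examples,
    scorers=[FaithfulnessScorer(threshold=0.7)],
    model="gpt-4o",                              # placeholder judge model
    project_name="default_project",
    eval_run_name="dataset_eval",
    append=False,                                # or True to add results to an existing run
)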
judgeval/rules.py
CHANGED
@@ -10,7 +10,7 @@ from concurrent.futures import ThreadPoolExecutor
 import time
 import uuid
 
-from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
+from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
 
 class AlertStatus(str, Enum):
     """Status of an alert evaluation."""
@@ -23,22 +23,19 @@ class Condition(BaseModel):
 
     Example:
     {
-        "metric": FaithfulnessScorer(threshold=0.7) # Must be a scorer object: APIJudgmentScorer, JudgevalScorer
+        "metric": FaithfulnessScorer(threshold=0.7) # Must be a scorer object: APIJudgmentScorer, JudgevalScorer
     }
 
     The Condition class uses the scorer's threshold and success function internally.
     """
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
-    metric: Union[APIJudgmentScorer, JudgevalScorer
+    metric: Union[APIJudgmentScorer, JudgevalScorer]
 
     @property
     def metric_name(self) -> str:
         """Get the name of the metric for lookups in scores dictionary."""
-        if
-            # Handle ScorerWrapper case specifically
-            return self.metric.scorer.score_type if hasattr(self.metric.scorer, 'score_type') else str(self.metric.scorer)
-        elif hasattr(self.metric, 'score_type'):
+        if hasattr(self.metric, 'score_type'):
             # Handle APIJudgmentScorer and JudgevalScorer which have score_type
             return self.metric.score_type
         elif hasattr(self.metric, '__name__'):
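With ScorerWrapper gone, a Condition's metric must be a concrete scorer object, as the updated docstring shows. A hedged sketch of wiring a rule into an evaluation follows; Rule fields beyond name and conditions are not shown in this diff, so the constructor call is an assumption.

from judgeval.rules import Rule, Condition
from judgeval.scorers import FaithfulnessScorer   # scorer and threshold taken from the docstring above

condition = Condition(metric=FaithfulnessScorer(threshold=0.7))
rule = Rule(name="faithfulness_floor", conditions=[condition])  # other Rule fields omitted

# Rules may only be combined with API scorers; JudgevalScorer + rules raises a ValueError.
results = client.run_evaluation(
    examples=dataset.examples,
    scorers=[FaithfulnessScorer(threshold=0.7)],
    model="gpt-4o",
    rules=[rule],
)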
judgeval/run_evaluation.py
CHANGED
@@ -334,9 +334,9 @@ def check_examples(examples: List[Example], scorers: List[APIJudgmentScorer]) ->
                # Example ID (usually random UUID) does not provide any helpful information for the user but printing the entire example is overdoing it
                print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
 
-def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True
+def run_sequence_eval(sequence_run: SequenceRun, override: bool = False, ignore_errors: bool = True) -> List[ScoringResult]:
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and sequence_run.log_results:
+    if not override and sequence_run.log_results and not sequence_run.append:
         check_eval_run_name_exists(
             sequence_run.eval_name,
             sequence_run.project_name,
judgeval/scorers/__init__.py
CHANGED
@@ -1,7 +1,7 @@
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.scorers.judgeval_scorer import JudgevalScorer
 from judgeval.scorers.prompt_scorer import PromptScorer, ClassifierScorer
-from judgeval.scorers.judgeval_scorers import (
+from judgeval.scorers.judgeval_scorers.api_scorers import (
     ExecutionOrderScorer,
     JSONCorrectnessScorer,
     SummarizationScorer,
@@ -11,14 +11,15 @@ from judgeval.scorers.judgeval_scorers import (
     ContextualPrecisionScorer,
     ContextualRecallScorer,
     AnswerRelevancyScorer,
-    ScorerWrapper,
     AnswerCorrectnessScorer,
-    Text2SQLScorer,
     ComparisonScorer,
     InstructionAdherenceScorer,
     GroundednessScorer,
     DerailmentScorer,
 )
+from judgeval.scorers.judgeval_scorers.classifiers import (
+    Text2SQLScorer,
+)
 
 __all__ = [
     "APIJudgmentScorer",
@@ -34,7 +35,6 @@ __all__ = [
     "ContextualPrecisionScorer",
     "ContextualRecallScorer",
     "AnswerRelevancyScorer",
-    "ScorerWrapper",
     "AnswerCorrectnessScorer",
     "Text2SQLScorer",
     "ComparisonScorer",