judgeval 0.0.13__py3-none-any.whl → 0.0.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +126 -59
- judgeval/common/utils.py +12 -13
- judgeval/constants.py +61 -10
- judgeval/data/datasets/dataset.py +3 -2
- judgeval/data/datasets/eval_dataset_client.py +25 -14
- judgeval/data/example.py +8 -1
- judgeval/evaluation_run.py +9 -0
- judgeval/judges/together_judge.py +1 -1
- judgeval/judges/utils.py +1 -1
- judgeval/judgment_client.py +163 -28
- judgeval/rules.py +384 -0
- judgeval/run_evaluation.py +32 -14
- judgeval/scorers/api_scorer.py +11 -12
- judgeval/scorers/base_scorer.py +1 -1
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -1
- judgeval/utils/alerts.py +43 -0
- {judgeval-0.0.13.dist-info → judgeval-0.0.15.dist-info}/METADATA +1 -1
- {judgeval-0.0.13.dist-info → judgeval-0.0.15.dist-info}/RECORD +20 -18
- {judgeval-0.0.13.dist-info → judgeval-0.0.15.dist-info}/WHEEL +0 -0
- {judgeval-0.0.13.dist-info → judgeval-0.0.15.dist-info}/licenses/LICENSE.md +0 -0
judgeval/data/example.py
CHANGED
@@ -5,7 +5,7 @@ Classes for representing examples in a dataset.
 
 from typing import TypeVar, Optional, Any, Dict, List
 from uuid import uuid4
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, field_validator
 from enum import Enum
 from datetime import datetime
 import time
@@ -40,6 +40,13 @@ class Example(BaseModel):
     timestamp: Optional[str] = None
     trace_id: Optional[str] = None
 
+    @field_validator('input', 'actual_output', mode='before')
+    def convert_to_str(cls, value):
+        try:
+            return str(value)
+        except Exception:
+            return repr(value)
+
     def __init__(self, **data):
         if 'example_id' not in data:
             data['example_id'] = str(uuid4())
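The new `field_validator` coerces `input` and `actual_output` to strings before Pydantic validation runs, so callers can pass ints, dicts, or other objects without triggering a validation error. A minimal sketch of that behavior, using an illustrative stand-in model rather than the real `Example` class (which has more fields than this hunk shows):

from typing import Optional
from pydantic import BaseModel, field_validator

class Demo(BaseModel):  # illustrative stand-in for judgeval.data.Example
    input: Optional[str] = None
    actual_output: Optional[str] = None

    @field_validator('input', 'actual_output', mode='before')
    def convert_to_str(cls, value):
        try:
            return str(value)
        except Exception:
            return repr(value)

# Non-string values are converted rather than rejected:
print(Demo(input=42, actual_output={"answer": 4}).input)  # prints 42, stored as the str "42"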
judgeval/evaluation_run.py
CHANGED
@@ -6,6 +6,7 @@ from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
 from judgeval.constants import ACCEPTABLE_MODELS
 from judgeval.common.logger import debug, error
 from judgeval.judges import JudgevalJudge
+from judgeval.rules import Rule
 
 class EvaluationRun(BaseModel):
     """
@@ -20,10 +21,12 @@ class EvaluationRun(BaseModel):
     aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
     metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
     judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
+    rules (Optional[List[Rule]]): Rules to evaluate against scoring results
     """
 
     # The user will specify whether they want log_results when they call run_eval
     log_results: bool = False  # NOTE: log_results has to be set first because it is used to validate project_name and eval_name
+    organization_id: Optional[str] = None
     project_name: Optional[str] = None
     eval_name: Optional[str] = None
     examples: List[Example]
@@ -34,6 +37,7 @@ class EvaluationRun(BaseModel):
     # API Key will be "" until user calls client.run_eval(), then API Key will be set
     judgment_api_key: Optional[str] = ""
    override: Optional[bool] = False
+    rules: Optional[List[Rule]] = None
 
     def model_dump(self, **kwargs):
         data = super().model_dump(**kwargs)
@@ -44,6 +48,11 @@ class EvaluationRun(BaseModel):
             else {"score_type": scorer.score_type, "threshold": scorer.threshold}
             for scorer in self.scorers
         ]
+
+        if self.rules:
+            # Process rules to ensure proper serialization
+            data["rules"] = [rule.model_dump() for rule in self.rules]
+
         return data
 
     @field_validator('log_results', mode='before')
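The rules branch added to `model_dump` follows a standard Pydantic pattern: explicitly dump nested models into the serialized payload. A standalone sketch of the pattern; `Gate` and `Run` below are illustrative stand-ins, not judgeval classes, since the new `Rule` model itself is defined in judgeval/rules.py and not shown in this diff:

from typing import List, Optional
from pydantic import BaseModel

class Gate(BaseModel):  # stand-in for judgeval.rules.Rule
    name: str

class Run(BaseModel):  # stand-in for EvaluationRun
    rules: Optional[List[Gate]] = None

    def model_dump(self, **kwargs):
        data = super().model_dump(**kwargs)
        if self.rules:
            # mirror the diff: serialize each rule to a plain dict in the payload
            data["rules"] = [rule.model_dump() for rule in self.rules]
        return data

print(Run(rules=[Gate(name="quality_gate")]).model_dump())
# -> {'rules': [{'name': 'quality_gate'}]}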
judgeval/judges/together_judge.py
CHANGED
@@ -14,7 +14,7 @@ BASE_CONVERSATION = [
 ]
 
 class TogetherJudge(JudgevalJudge):
-    def __init__(self, model: str = "
+    def __init__(self, model: str = "Qwen/Qwen2.5-72B-Instruct-Turbo", **kwargs):
         debug(f"Initializing TogetherJudge with model={model}")
         self.model = model
         self.kwargs = kwargs
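For reference, a usage sketch of the changed default; the import path follows the module's file path, and the alternate model string is a hypothetical example, not something this diff specifies:

from judgeval.judges.together_judge import TogetherJudge

judge = TogetherJudge()  # 0.0.15 default: "Qwen/Qwen2.5-72B-Instruct-Turbo"
custom = TogetherJudge(model="meta-llama/Llama-3.3-70B-Instruct-Turbo")  # hypothetical override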
judgeval/judges/utils.py
CHANGED
@@ -39,7 +39,7 @@ def create_judge(
                 Please either set the `use_judgment` flag to True or use
                 non-Judgment models."""
             )
-        if m not in
+        if m not in ACCEPTABLE_MODELS:
            raise InvalidJudgeModelError(f"Invalid judge model chosen: {m}")
        return MixtureOfJudges(models=model), True
    # If model is a string, check that it corresponds to a valid model
judgeval/judgment_client.py
CHANGED
@@ -15,7 +15,8 @@ from judgeval.scorers import (
     APIJudgmentScorer,
     JudgevalScorer,
     ClassifierScorer,
-    ScorerWrapper
+    ScorerWrapper,
+    score,
 )
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
@@ -26,6 +27,7 @@ from judgeval.judges import JudgevalJudge
 from judgeval.constants import JUDGMENT_EVAL_FETCH_API_URL, JUDGMENT_EVAL_DELETE_API_URL, JUDGMENT_EVAL_DELETE_PROJECT_API_URL
 from judgeval.common.exceptions import JudgmentAPIError
 from pydantic import BaseModel
+from judgeval.rules import Rule
 
 class EvalRunRequestBody(BaseModel):
     eval_name: str
@@ -34,9 +36,10 @@ class EvalRunRequestBody(BaseModel):
 
 
 class JudgmentClient:
-    def __init__(self, judgment_api_key: str = os.getenv("JUDGMENT_API_KEY")):
+    def __init__(self, judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"), organization_id: str = os.getenv("JUDGMENT_ORG_ID")):
         self.judgment_api_key = judgment_api_key
-        self.
+        self.organization_id = organization_id
+        self.eval_dataset_client = EvalDatasetClient(judgment_api_key, organization_id)
 
         # Verify API key is valid
         result, response = self._validate_api_key()
@@ -57,17 +60,69 @@ class JudgmentClient:
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
         override: bool = False,
-        use_judgment: bool = True
+        use_judgment: bool = True,
+        rules: Optional[List[Rule]] = None
     ) -> List[ScoringResult]:
         """
         Executes an evaluation of `Example`s using one or more `Scorer`s
+
+        Args:
+            examples (List[Example]): The examples to evaluate
+            scorers (List[Union[ScorerWrapper, JudgevalScorer]]): A list of scorers to use for evaluation
+            model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
+            aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
+            metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run
+            log_results (bool): Whether to log the results to the Judgment API
+            project_name (str): The name of the project the evaluation results belong to
+            eval_run_name (str): A name for this evaluation run
+            override (bool): Whether to override an existing evaluation run with the same name
+            use_judgment (bool): Whether to use Judgment API for evaluation
+            rules (Optional[List[Rule]]): Rules to evaluate against scoring results
+
+        Returns:
+            List[ScoringResult]: The results of the evaluation
         """
         try:
             # Load appropriate implementations for all scorers
-            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = [
-
-
-
+            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = []
+            for scorer in scorers:
+                try:
+                    if isinstance(scorer, ScorerWrapper):
+                        loaded_scorers.append(scorer.load_implementation(use_judgment=use_judgment))
+                    else:
+                        loaded_scorers.append(scorer)
+                except Exception as e:
+                    raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
+
+            # Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
+            if rules and any(isinstance(scorer, JudgevalScorer) for scorer in loaded_scorers):
+                raise ValueError("Cannot use Judgeval scorers (only API scorers) when using rules. Please either remove rules or use only APIJudgmentScorer types.")
+
+            # Convert ScorerWrapper in rules to their implementations
+            loaded_rules = None
+            if rules:
+                loaded_rules = []
+                for rule in rules:
+                    try:
+                        processed_conditions = []
+                        for condition in rule.conditions:
+                            # Convert metric if it's a ScorerWrapper
+                            if isinstance(condition.metric, ScorerWrapper):
+                                try:
+                                    condition_copy = condition.model_copy()
+                                    condition_copy.metric = condition.metric.load_implementation(use_judgment=use_judgment)
+                                    processed_conditions.append(condition_copy)
+                                except Exception as e:
+                                    raise ValueError(f"Failed to convert ScorerWrapper to implementation in rule '{rule.name}', condition metric '{condition.metric}': {str(e)}")
+                            else:
+                                processed_conditions.append(condition)
+
+                        # Create new rule with processed conditions
+                        new_rule = rule.model_copy()
+                        new_rule.conditions = processed_conditions
+                        loaded_rules.append(new_rule)
+                    except Exception as e:
+                        raise ValueError(f"Failed to process rule '{rule.name}': {str(e)}")
 
             eval = EvaluationRun(
                 log_results=log_results,
@@ -78,11 +133,15 @@ class JudgmentClient:
                 model=model,
                 aggregator=aggregator,
                 metadata=metadata,
-                judgment_api_key=self.judgment_api_key
+                judgment_api_key=self.judgment_api_key,
+                rules=loaded_rules,
+                organization_id=self.organization_id
             )
             return run_eval(eval, override)
         except ValueError as e:
             raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
+        except Exception as e:
+            raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
 
     def evaluate_dataset(
         self,
@@ -94,17 +153,68 @@ class JudgmentClient:
         project_name: str = "",
         eval_run_name: str = "",
         log_results: bool = False,
-        use_judgment: bool = True
+        use_judgment: bool = True,
+        rules: Optional[List[Rule]] = None
     ) -> List[ScoringResult]:
         """
         Executes an evaluation of a `EvalDataset` using one or more `Scorer`s
+
+        Args:
+            dataset (EvalDataset): The dataset containing examples to evaluate
+            scorers (List[Union[ScorerWrapper, JudgevalScorer]]): A list of scorers to use for evaluation
+            model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
+            aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
+            metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run
+            project_name (str): The name of the project the evaluation results belong to
+            eval_run_name (str): A name for this evaluation run
+            log_results (bool): Whether to log the results to the Judgment API
+            use_judgment (bool): Whether to use Judgment API for evaluation
+            rules (Optional[List[Rule]]): Rules to evaluate against scoring results
+
+        Returns:
+            List[ScoringResult]: The results of the evaluation
         """
         try:
             # Load appropriate implementations for all scorers
-            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = [
-
-
-
+            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = []
+            for scorer in scorers:
+                try:
+                    if isinstance(scorer, ScorerWrapper):
+                        loaded_scorers.append(scorer.load_implementation(use_judgment=use_judgment))
+                    else:
+                        loaded_scorers.append(scorer)
+                except Exception as e:
+                    raise ValueError(f"Failed to load implementation for scorer {scorer}: {str(e)}")
+
+            # Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
+            if rules and any(isinstance(scorer, JudgevalScorer) for scorer in loaded_scorers):
+                raise ValueError("Cannot use Judgeval scorers (only API scorers) when using rules. Please either remove rules or use only APIJudgmentScorer types.")
+
+            # Convert ScorerWrapper in rules to their implementations
+            loaded_rules = None
+            if rules:
+                loaded_rules = []
+                for rule in rules:
+                    try:
+                        processed_conditions = []
+                        for condition in rule.conditions:
+                            # Convert metric if it's a ScorerWrapper
+                            if isinstance(condition.metric, ScorerWrapper):
+                                try:
+                                    condition_copy = condition.model_copy()
+                                    condition_copy.metric = condition.metric.load_implementation(use_judgment=use_judgment)
+                                    processed_conditions.append(condition_copy)
+                                except Exception as e:
+                                    raise ValueError(f"Failed to convert ScorerWrapper to implementation in rule '{rule.name}', condition metric '{condition.metric}': {str(e)}")
+                            else:
+                                processed_conditions.append(condition)
+
+                        # Create new rule with processed conditions
+                        new_rule = rule.model_copy()
+                        new_rule.conditions = processed_conditions
+                        loaded_rules.append(new_rule)
+                    except Exception as e:
+                        raise ValueError(f"Failed to process rule '{rule.name}': {str(e)}")
 
             evaluation_run = EvaluationRun(
                 log_results=log_results,
@@ -115,11 +225,15 @@ class JudgmentClient:
                 model=model,
                 aggregator=aggregator,
                 metadata=metadata,
-                judgment_api_key=self.judgment_api_key
+                judgment_api_key=self.judgment_api_key,
+                rules=loaded_rules,
+                organization_id=self.organization_id
             )
             return run_eval(evaluation_run)
         except ValueError as e:
             raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
+        except Exception as e:
+            raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
 
     def create_dataset(self) -> EvalDataset:
         return self.eval_dataset_client.create_dataset()
@@ -189,9 +303,11 @@ class JudgmentClient:
         eval_run = requests.post(JUDGMENT_EVAL_FETCH_API_URL,
                                  headers={
                                      "Content-Type": "application/json",
-                                     "Authorization": f"Bearer {self.judgment_api_key}"
+                                     "Authorization": f"Bearer {self.judgment_api_key}",
+                                     "X-Organization-Id": self.organization_id
                                  },
-                                 json=eval_run_request_body.model_dump()
+                                 json=eval_run_request_body.model_dump(),
+                                 verify=False)
         if eval_run.status_code != requests.codes.ok:
             raise ValueError(f"Error fetching eval results: {eval_run.json()}")
 
@@ -222,7 +338,8 @@ class JudgmentClient:
                                json=eval_run_request_body.model_dump(),
                                headers={
                                    "Content-Type": "application/json",
-                                   "Authorization": f"Bearer {self.judgment_api_key}"
+                                   "Authorization": f"Bearer {self.judgment_api_key}",
+                                   "X-Organization-Id": self.organization_id
                                })
         if response.status_code != requests.codes.ok:
             raise ValueError(f"Error deleting eval results: {response.json()}")
@@ -241,11 +358,12 @@ class JudgmentClient:
         response = requests.delete(JUDGMENT_EVAL_DELETE_PROJECT_API_URL,
                                    json={
                                        "project_name": project_name,
-                                       "judgment_api_key": self.judgment_api_key
+                                       "judgment_api_key": self.judgment_api_key,
                                    },
                                    headers={
                                        "Content-Type": "application/json",
-                                       "Authorization": f"Bearer {self.judgment_api_key}"
+                                       "Authorization": f"Bearer {self.judgment_api_key}",
+                                       "X-Organization-Id": self.organization_id
                                    })
         if response.status_code != requests.codes.ok:
             raise ValueError(f"Error deleting eval results: {response.json()}")
@@ -261,7 +379,8 @@ class JudgmentClient:
                 "Content-Type": "application/json",
                 "Authorization": f"Bearer {self.judgment_api_key}",
             },
-            json={}  # Empty body now
+            json={},  # Empty body now
+            verify=False
         )
         if response.status_code == 200:
             return True, response.json()
@@ -283,7 +402,6 @@ class JudgmentClient:
         """
         request_body = {
             "slug": slug,
-            # "judgment_api_key": self.judgment_api_key
         }
 
         response = requests.post(
@@ -291,8 +409,10 @@ class JudgmentClient:
             json=request_body,
             headers={
                 "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}"
-
+                "Authorization": f"Bearer {self.judgment_api_key}",
+                "X-Organization-Id": self.organization_id
+            },
+            verify=False
         )
 
         if response.status_code == 500:
@@ -325,7 +445,6 @@ class JudgmentClient:
             "name": scorer.name,
             "conversation": scorer.conversation,
             "options": scorer.options,
-            # "judgment_api_key": self.judgment_api_key,
             "slug": slug
         }
 
@@ -334,8 +453,10 @@ class JudgmentClient:
             json=request_body,
             headers={
                 "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}"
-
+                "Authorization": f"Bearer {self.judgment_api_key}",
+                "X-Organization-Id": self.organization_id
+            },
+            verify=False
         )
 
         if response.status_code == 500:
@@ -358,9 +479,22 @@ class JudgmentClient:
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
         override: bool = False,
+        rules: Optional[List[Rule]] = None
     ) -> None:
         """
         Asserts a test by running the evaluation and checking the results for success
+
+        Args:
+            examples (List[Example]): The examples to evaluate
+            scorers (List[Union[APIJudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
+            model (Union[str, List[str], JudgevalJudge]): The model used as a judge when using LLM as a Judge
+            aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
+            metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run
+            log_results (bool): Whether to log the results to the Judgment API
+            project_name (str): The name of the project the evaluation results belong to
+            eval_run_name (str): A name for this evaluation run
+            override (bool): Whether to override an existing evaluation run with the same name
+            rules (Optional[List[Rule]]): Rules to evaluate against scoring results
         """
         results = self.run_evaluation(
             examples=examples,
@@ -371,7 +505,8 @@ class JudgmentClient:
             log_results=log_results,
             project_name=project_name,
             eval_run_name=eval_run_name,
-            override=override
+            override=override,
+            rules=rules
         )
 
         assert_test(results)
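Taken together, the judgment_client.py changes add an organization scope and an optional rules gate to the public entry points. A minimal usage sketch based only on what this diff shows; how a `Rule` is constructed lives in the new judgeval/rules.py, which is not part of this diff, so it is left out rather than guessed:

import os
from judgeval.judgment_client import JudgmentClient

# New in 0.0.15: the client also carries an organization id, defaulting to the
# JUDGMENT_ORG_ID environment variable, and sends it as the X-Organization-Id
# header on API calls. A valid JUDGMENT_API_KEY is still required, since the
# constructor validates the key against the Judgment API.
client = JudgmentClient(
    judgment_api_key=os.getenv("JUDGMENT_API_KEY"),
    organization_id=os.getenv("JUDGMENT_ORG_ID"),
)

# run_evaluation(), evaluate_dataset(), and assert_test() now accept an optional
# rules=[...] argument (a list of judgeval.rules.Rule objects). Per the diff,
# rules may only be combined with API scorers; mixing them with local
# JudgevalScorer implementations raises a ValueError before the run starts.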