judgeval 0.0.9__py3-none-any.whl → 0.0.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,193 @@
1
+
2
+ from typing import Optional
3
+ import requests
4
+ from rich.progress import Progress, SpinnerColumn, TextColumn
5
+
6
+ from judgeval.common.logger import debug, error, warning, info
7
+ from judgeval.constants import (
8
+ JUDGMENT_DATASETS_PUSH_API_URL,
9
+ JUDGMENT_DATASETS_PULL_API_URL,
10
+ JUDGMENT_DATASETS_PULL_ALL_API_URL
11
+ )
12
+ from judgeval.data import Example
13
+ from judgeval.data.datasets import EvalDataset
14
+ from judgeval.data.datasets.ground_truth import GroundTruthExample
15
+
16
+
17
+
18
+
19
+ class EvalDatasetClient:
20
+ def __init__(self, judgment_api_key: str):
21
+ self.judgment_api_key = judgment_api_key
22
+
23
+ def create_dataset(self) -> EvalDataset:
24
+ return EvalDataset(judgment_api_key=self.judgment_api_key)
25
+
26
+ def push(self, dataset: EvalDataset, alias: str, overwrite: Optional[bool] = False) -> bool:
27
+ debug(f"Pushing dataset with alias '{alias}' (overwrite={overwrite})")
28
+ if overwrite:
29
+ warning(f"Overwrite enabled for alias '{alias}'")
30
+ """
31
+ Pushes the dataset to Judgment platform
32
+
33
+ Mock request:
34
+ dataset = {
35
+ "alias": alias,
36
+ "ground_truths": [...],
37
+ "examples": [...],
38
+ "overwrite": overwrite
39
+ } ==>
40
+ {
41
+ "_alias": alias,
42
+ "_id": "..." # ID of the dataset
43
+ }
44
+ """
45
+ with Progress(
46
+ SpinnerColumn(style="rgb(106,0,255)"),
47
+ TextColumn("[progress.description]{task.description}"),
48
+ transient=False,
49
+ ) as progress:
50
+ task_id = progress.add_task(
51
+ f"Pushing [rgb(106,0,255)]'{alias}' to Judgment...",
52
+ total=100,
53
+ )
54
+ content = {
55
+ "alias": alias,
56
+ "ground_truths": [g.to_dict() for g in dataset.ground_truths],
57
+ "examples": [e.to_dict() for e in dataset.examples],
58
+ "overwrite": overwrite,
59
+ "judgment_api_key": dataset.judgment_api_key
60
+ }
61
+ try:
62
+ response = requests.post(
63
+ JUDGMENT_DATASETS_PUSH_API_URL,
64
+ json=content
65
+ )
66
+ if response.status_code == 500:
67
+ error(f"Server error during push: {content.get('message')}")
68
+ return False
69
+ response.raise_for_status()
70
+ except requests.exceptions.HTTPError as err:
71
+ if response.status_code == 422:
72
+ error(f"Validation error during push: {err.response.json()}")
73
+ else:
74
+ error(f"HTTP error during push: {err}")
75
+
76
+ info(f"Successfully pushed dataset with alias '{alias}'")
77
+ payload = response.json()
78
+ dataset._alias = payload.get("_alias")
79
+ dataset._id = payload.get("_id")
80
+ progress.update(
81
+ task_id,
82
+ description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
83
+ )
84
+ return True
85
+
86
+ def pull(self, alias: str) -> EvalDataset:
87
+ debug(f"Pulling dataset with alias '{alias}'")
88
+ """
89
+ Pulls the dataset from Judgment platform
90
+
91
+ Mock request:
92
+ {
93
+ "alias": alias,
94
+ "user_id": user_id
95
+ }
96
+ ==>
97
+ {
98
+ "ground_truths": [...],
99
+ "examples": [...],
100
+ "_alias": alias,
101
+ "_id": "..." # ID of the dataset
102
+ }
103
+ """
104
+ # Make a POST request to the Judgment API to get the dataset
105
+ dataset = self.create_dataset()
106
+
107
+ with Progress(
108
+ SpinnerColumn(style="rgb(106,0,255)"),
109
+ TextColumn("[progress.description]{task.description}"),
110
+ transient=False,
111
+ ) as progress:
112
+ task_id = progress.add_task(
113
+ f"Pulling [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] from Judgment...",
114
+ total=100,
115
+ )
116
+ request_body = {
117
+ "alias": alias,
118
+ "judgment_api_key": self.judgment_api_key
119
+ }
120
+
121
+ try:
122
+ response = requests.post(
123
+ JUDGMENT_DATASETS_PULL_API_URL,
124
+ json=request_body
125
+ )
126
+ response.raise_for_status()
127
+ except requests.exceptions.RequestException as e:
128
+ error(f"Error pulling dataset: {str(e)}")
129
+ raise
130
+
131
+ info(f"Successfully pulled dataset with alias '{alias}'")
132
+ payload = response.json()
133
+ dataset.ground_truths = [GroundTruthExample(**g) for g in payload.get("ground_truths", [])]
134
+ dataset.examples = [Example(**e) for e in payload.get("examples", [])]
135
+ dataset._alias = payload.get("_alias")
136
+ dataset._id = payload.get("_id")
137
+ progress.update(
138
+ task_id,
139
+ description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
140
+ )
141
+
142
+ return dataset
143
+
144
+ def pull_all_user_dataset_stats(self) -> dict:
145
+ debug(f"Pulling user datasets stats for user_id: {self.judgment_api_key}'")
146
+ """
147
+ Pulls the user datasets stats from Judgment platform
148
+
149
+ Mock request:
150
+ {
151
+ "user_id": user_id
152
+ }
153
+ ==>
154
+ {
155
+ "test_dataset_1": {"examples_count": len(dataset1.examples), "ground_truths_count": len(dataset1.ground_truths)},
156
+ "test_dataset_2": {"examples_count": len(dataset2.examples), "ground_truths_count": len(dataset2.ground_truths)},
157
+ ...
158
+ }
159
+ """
160
+ # Make a POST request to the Judgment API to get the dataset
161
+
162
+ with Progress(
163
+ SpinnerColumn(style="rgb(106,0,255)"),
164
+ TextColumn("[progress.description]{task.description}"),
165
+ transient=False,
166
+ ) as progress:
167
+ task_id = progress.add_task(
168
+ f"Pulling [rgb(106,0,255)]' datasets'[/rgb(106,0,255)] from Judgment...",
169
+ total=100,
170
+ )
171
+ request_body = {
172
+ "judgment_api_key": self.judgment_api_key
173
+ }
174
+
175
+ try:
176
+ response = requests.post(
177
+ JUDGMENT_DATASETS_PULL_ALL_API_URL,
178
+ json=request_body
179
+ )
180
+ response.raise_for_status()
181
+ except requests.exceptions.RequestException as e:
182
+ error(f"Error pulling dataset: {str(e)}")
183
+ raise
184
+
185
+ info(f"Successfully pulled datasets for userid: {self.judgment_api_key}'")
186
+ payload = response.json()
187
+
188
+ progress.update(
189
+ task_id,
190
+ description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
191
+ )
192
+
193
+ return payload
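
For orientation, here is a minimal usage sketch of the new EvalDatasetClient. The API key, alias, and Example field names below are illustrative placeholders rather than values taken from this diff:

from judgeval.data import Example
from judgeval.data.datasets import EvalDatasetClient

client = EvalDatasetClient(judgment_api_key="YOUR_JUDGMENT_API_KEY")  # hypothetical key

dataset = client.create_dataset()
# The Example keyword arguments here are assumptions about the Example schema.
dataset.examples = [Example(input="What is the capital of France?", actual_output="Paris")]

pushed = client.push(dataset, alias="capitals-demo", overwrite=False)  # True on success
pulled = client.pull("capitals-demo")                                  # EvalDataset with examples and ground_truths
stats = client.pull_all_user_dataset_stats()                           # {alias: {"examples_count": ..., "ground_truths_count": ...}}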
judgeval/data/result.py CHANGED
@@ -1,5 +1,5 @@
1
1
  from dataclasses import dataclass
2
- from typing import List, Union, Optional
2
+ from typing import List, Union, Optional, Dict, Any
3
3
 
4
4
  from judgeval.data import ScorerData, ProcessExample
5
5
 
@@ -18,6 +18,9 @@ class ScoringResult:
18
18
  expected_output (Optional[str]): The expected output of the example
19
19
  context (Optional[List[str]]): The context of the example
20
20
  retrieval_context (Optional[List[str]]): The retrieval context of the example
21
+ additional_metadata (Optional[Dict[str, Any]]): The additional metadata of the example
22
+ tools_called (Optional[List[str]]): The tools called by the example
23
+ expected_tools (Optional[List[str]]): The expected tools of the example
21
24
  trace_id (Optional[str]): The trace id of the example
22
25
 
23
26
  """
@@ -31,6 +34,9 @@ class ScoringResult:
31
34
  expected_output: Optional[str] = None
32
35
  context: Optional[List[str]] = None
33
36
  retrieval_context: Optional[List[str]] = None
37
+ additional_metadata: Optional[Dict[str, Any]] = None
38
+ tools_called: Optional[List[str]] = None
39
+ expected_tools: Optional[List[str]] = None
34
40
  trace_id: Optional[str] = None
35
41
 
36
42
  example_id: Optional[str] = None
@@ -46,6 +52,9 @@ class ScoringResult:
46
52
  "expected_output": self.expected_output,
47
53
  "context": self.context,
48
54
  "retrieval_context": self.retrieval_context,
55
+ "additional_metadata": self.additional_metadata,
56
+ "tools_called": self.tools_called,
57
+ "expected_tools": self.expected_tools,
49
58
  "trace_id": self.trace_id,
50
59
  "example_id": self.example_id
51
60
  }
@@ -59,6 +68,9 @@ class ScoringResult:
59
68
  expected_output={self.expected_output}, \
60
69
  context={self.context}, \
61
70
  retrieval_context={self.retrieval_context}, \
71
+ additional_metadata={self.additional_metadata}, \
72
+ tools_called={self.tools_called}, \
73
+ expected_tools={self.expected_tools}, \
62
74
  trace_id={self.trace_id})"
63
75
 
64
76
 
@@ -79,5 +91,8 @@ def generate_scoring_result(
79
91
  expected_output=process_example.expected_output,
80
92
  context=process_example.context,
81
93
  retrieval_context=process_example.retrieval_context,
94
+ additional_metadata=process_example.additional_metadata,
95
+ tools_called=process_example.tools_called,
96
+ expected_tools=process_example.expected_tools,
82
97
  trace_id=process_example.trace_id
83
98
  )
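
A small sketch of the three new ScoringResult fields in use. The success and scorers_data arguments are assumptions about the leading fields of the dataclass, which sit outside this hunk:

from judgeval.data import ScoringResult

result = ScoringResult(
    success=True,                          # assumed leading field
    scorers_data=[],                       # assumed leading field
    additional_metadata={"claims": 3},     # new optional field, defaults to None
    tools_called=["web_search"],           # new optional field
    expected_tools=["web_search"],         # new optional field
)
print(result.to_dict()["expected_tools"])  # ['web_search']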
@@ -15,7 +15,7 @@ class EvaluationRun(BaseModel):
15
15
  project_name (str): The name of the project the evaluation results belong to
16
16
  eval_name (str): A name for this evaluation run
17
17
  examples (List[Example]): The examples to evaluate
18
- scorers (List[Union[JudgmentScorer, CustomScorer]]): A list of scorers to use for evaluation
18
+ scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
19
19
  model (str): The model used as a judge when using LLM as a Judge
20
20
  aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
21
21
  metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
@@ -33,6 +33,7 @@ class EvaluationRun(BaseModel):
33
33
  metadata: Optional[Dict[str, Any]] = None
34
34
  # API Key will be "" until user calls client.run_eval(), then API Key will be set
35
35
  judgment_api_key: Optional[str] = ""
36
+ override: Optional[bool] = False
36
37
 
37
38
  def model_dump(self, **kwargs):
38
39
  data = super().model_dump(**kwargs)
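
A sketch of an EvaluationRun carrying the new override flag. The import path and the examples/scorers placeholders are assumptions; the field names follow the model documented above:

from judgeval.evaluation_run import EvaluationRun  # import path assumed

run = EvaluationRun(
    project_name="demo-project",
    eval_name="nightly-run",
    examples=examples,       # your List[Example]
    scorers=scorers,         # JudgmentScorer / JudgevalScorer instances
    model="gpt-4o-mini",
    override=True,           # new optional field (defaults to False)
)
print(run.model_dump()["override"])  # True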
judgeval/judges/utils.py CHANGED
@@ -6,7 +6,7 @@ from typing import Optional, Union, Tuple, List
6
6
 
7
7
  from judgeval.common.exceptions import InvalidJudgeModelError
8
8
  from judgeval.judges import JudgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges
9
- from judgeval.constants import TOGETHER_SUPPORTED_MODELS
9
+ from judgeval.constants import TOGETHER_SUPPORTED_MODELS, JUDGMENT_SUPPORTED_MODELS, ACCEPTABLE_MODELS
10
10
 
11
11
  LITELLM_SUPPORTED_MODELS = set(litellm.model_list)
12
12
 
@@ -33,7 +33,13 @@ def create_judge(
33
33
  # Either string or List[str]
34
34
  if isinstance(model, list):
35
35
  for m in model:
36
- if m not in TOGETHER_SUPPORTED_MODELS and m not in LITELLM_SUPPORTED_MODELS:
36
+ if m in JUDGMENT_SUPPORTED_MODELS:
37
+ raise NotImplementedError(
38
+ """Judgment models are not yet supported for local scoring.
39
+ Please either set the `use_judgment` flag to True or use
40
+ non-Judgment models."""
41
+ )
42
+ if m not in LITELLM_SUPPORTED_MODELS and m not in TOGETHER_SUPPORTED_MODELS:
37
43
  raise InvalidJudgeModelError(f"Invalid judge model chosen: {m}")
38
44
  return MixtureOfJudges(models=model), True
39
45
  # If model is a string, check that it corresponds to a valid model
@@ -41,5 +47,11 @@ def create_judge(
41
47
  return LiteLLMJudge(model=model), True
42
48
  if model in TOGETHER_SUPPORTED_MODELS:
43
49
  return TogetherJudge(model=model), True
50
+ if model in JUDGMENT_SUPPORTED_MODELS:
51
+ raise NotImplementedError(
52
+ """Judgment models are not yet supported for local scoring.
53
+ Please either set the `use_judgment` flag to True or use
54
+ non-Judgment models."""
55
+ )
44
56
  else:
45
57
  raise InvalidJudgeModelError(f"Invalid judge model chosen: {model}")
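
To illustrate the new branches, a sketch of how create_judge now behaves. The model names are placeholders and assume membership in the LiteLLM list:

from judgeval.judges.utils import create_judge
from judgeval.common.exceptions import InvalidJudgeModelError

judge, _ = create_judge("gpt-4o-mini")                # supported string -> (LiteLLMJudge, True)
mixture, _ = create_judge(["gpt-4o-mini", "gpt-4o"])  # list of supported models -> (MixtureOfJudges, True)

try:
    create_judge("not-a-real-model")
except InvalidJudgeModelError as err:
    print(err)  # Invalid judge model chosen: not-a-real-model

# A Judgment-hosted model name now raises NotImplementedError instead,
# pointing callers at the `use_judgment` flag.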
@@ -6,7 +6,7 @@ from typing import Optional, List, Dict, Any, Union
6
6
  import requests
7
7
 
8
8
  from judgeval.constants import ROOT_API
9
- from judgeval.data.datasets import EvalDataset
9
+ from judgeval.data.datasets import EvalDataset, EvalDatasetClient
10
10
  from judgeval.data import (
11
11
  ScoringResult,
12
12
  Example
@@ -23,7 +23,7 @@ from judgeval.run_evaluation import (
23
23
  assert_test
24
24
  )
25
25
  from judgeval.judges import JudgevalJudge
26
- from judgeval.constants import JUDGMENT_EVAL_FETCH_API_URL
26
+ from judgeval.constants import JUDGMENT_EVAL_FETCH_API_URL, JUDGMENT_EVAL_DELETE_API_URL, JUDGMENT_EVAL_DELETE_PROJECT_API_URL
27
27
  from judgeval.common.exceptions import JudgmentAPIError
28
28
  from pydantic import BaseModel
29
29
 
@@ -36,6 +36,7 @@ class EvalRunRequestBody(BaseModel):
36
36
  class JudgmentClient:
37
37
  def __init__(self, judgment_api_key: str = os.getenv("JUDGMENT_API_KEY")):
38
38
  self.judgment_api_key = judgment_api_key
39
+ self.eval_dataset_client = EvalDatasetClient(judgment_api_key)
39
40
 
40
41
  # Verify API key is valid
41
42
  result, response = self._validate_api_key()
@@ -121,7 +122,7 @@ class JudgmentClient:
121
122
  raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
122
123
 
123
124
  def create_dataset(self) -> EvalDataset:
124
- return EvalDataset(judgment_api_key=self.judgment_api_key)
125
+ return self.eval_dataset_client.create_dataset()
125
126
 
126
127
  def push_dataset(self, alias: str, dataset: EvalDataset, overwrite: Optional[bool] = False) -> bool:
127
128
  """
@@ -137,7 +138,7 @@ class JudgmentClient:
137
138
  """
138
139
  # Set judgment_api_key just in case it was not set
139
140
  dataset.judgment_api_key = self.judgment_api_key
140
- return dataset.push(alias, overwrite)
141
+ return self.eval_dataset_client.push(dataset, alias, overwrite)
141
142
 
142
143
  def pull_dataset(self, alias: str) -> EvalDataset:
143
144
  """
@@ -149,9 +150,20 @@ class JudgmentClient:
149
150
  Returns:
150
151
  EvalDataset: The retrieved dataset
151
152
  """
152
- dataset = EvalDataset(judgment_api_key=self.judgment_api_key)
153
- dataset.pull(alias)
154
- return dataset
153
+ return self.eval_dataset_client.pull(alias)
154
+
155
+ def pull_all_user_dataset_stats(self) -> dict:
156
+ """
157
+ Retrieves all dataset stats from the Judgment platform for the user.
158
+
159
+ Args:
160
+ None
161
+
162
+ Returns:
163
+ dict: Each dataset alias mapped to its examples_count and ground_truths_count
164
+ """
165
+ return self.eval_dataset_client.pull_all_user_dataset_stats()
166
+
155
167
 
156
168
  # Maybe add option where you can pass in the EvaluationRun object and it will pull the eval results from the backend
157
169
  def pull_eval(self, project_name: str, eval_run_name: str) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
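
Taken together, the dataset methods on JudgmentClient now delegate to EvalDatasetClient. A usage sketch follows; the import path and alias are assumptions, and JUDGMENT_API_KEY is assumed to be set in the environment:

from judgeval.judgment_client import JudgmentClient  # import path assumed

client = JudgmentClient()                             # falls back to the JUDGMENT_API_KEY env var
ds = client.create_dataset()
client.push_dataset(alias="capitals-demo", dataset=ds, overwrite=True)
same_ds = client.pull_dataset("capitals-demo")
stats = client.pull_all_user_dataset_stats()          # e.g. {"capitals-demo": {"examples_count": 1, "ground_truths_count": 0}}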
@@ -182,6 +194,51 @@ class JudgmentClient:
182
194
  eval_run_result[0]["id"] = result_id
183
195
  eval_run_result[0]["results"] = [ScoringResult(**filtered_result)]
184
196
  return eval_run_result
197
+
198
+ def delete_eval(self, project_name: str, eval_run_name: str) -> bool:
199
+ """
200
+ Deletes an evaluation from the server by project and run name.
201
+
202
+ Args:
203
+ project_name (str): Name of the project
204
+ eval_run_name (str): Name of the evaluation run
205
+
206
+ Returns:
207
+ bool: Whether the evaluation was successfully deleted
208
+ """
209
+ eval_run_request_body = EvalRunRequestBody(project_name=project_name,
210
+ eval_name=eval_run_name,
211
+ judgment_api_key=self.judgment_api_key)
212
+ response = requests.delete(JUDGMENT_EVAL_DELETE_API_URL,
213
+ json=eval_run_request_body.model_dump(),
214
+ headers={
215
+ "Content-Type": "application/json",
216
+ })
217
+ if response.status_code != requests.codes.ok:
218
+ raise ValueError(f"Error deleting eval results: {response.json()}")
219
+ return response.json()
220
+
221
+ def delete_project_evals(self, project_name: str) -> bool:
222
+ """
223
+ Deletes all evaluations from the server for a given project.
224
+
225
+ Args:
226
+ project_name (str): Name of the project
227
+
228
+ Returns:
229
+ bool: Whether the evaluations were successfully deleted
230
+ """
231
+ response = requests.delete(JUDGMENT_EVAL_DELETE_PROJECT_API_URL,
232
+ json={
233
+ "project_name": project_name,
234
+ "judgment_api_key": self.judgment_api_key
235
+ },
236
+ headers={
237
+ "Content-Type": "application/json",
238
+ })
239
+ if response.status_code != requests.codes.ok:
240
+ raise ValueError(f"Error deleting eval results: {response.json()}")
241
+ return response.json()
185
242
 
186
243
  def _validate_api_key(self):
187
244
  """
@@ -97,6 +97,13 @@ def merge_results(api_results: List[ScoringResult], local_results: List[ScoringR
97
97
  raise ValueError("The API and local results are not aligned.")
98
98
  if api_result.retrieval_context != local_result.retrieval_context:
99
99
  raise ValueError("The API and local results are not aligned.")
100
+ if api_result.additional_metadata != local_result.additional_metadata:
101
+ raise ValueError("The API and local results are not aligned.")
102
+ if api_result.tools_called != local_result.tools_called:
103
+ raise ValueError("The API and local results are not aligned.")
104
+ if api_result.expected_tools != local_result.expected_tools:
105
+ raise ValueError("The API and local results are not aligned.")
106
+
100
107
 
101
108
  # Merge ScorerData from the API and local scorers together
102
109
  api_scorer_data = api_result.scorers_data
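
The added comparisons extend the existing field-by-field alignment check. Written as a loop, the check is equivalent to the sketch below; the earlier fields in the tuple are assumed, since the checks above this hunk are not shown:

_ALIGNMENT_FIELDS = (
    "input", "actual_output", "expected_output", "context",  # assumed earlier checks
    "retrieval_context", "additional_metadata", "tools_called", "expected_tools",
)

def _check_aligned(api_result, local_result):
    # Raise if any compared field differs between the API-side and local-side result.
    for field in _ALIGNMENT_FIELDS:
        if getattr(api_result, field, None) != getattr(local_result, field, None):
            raise ValueError("The API and local results are not aligned.")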
@@ -254,6 +261,12 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
254
261
  debug(f"Context: {example.context}")
255
262
  if example.retrieval_context:
256
263
  debug(f"Retrieval context: {example.retrieval_context}")
264
+ if example.additional_metadata:
265
+ debug(f"Additional metadata: {example.additional_metadata}")
266
+ if example.tools_called:
267
+ debug(f"Tools called: {example.tools_called}")
268
+ if example.expected_tools:
269
+ debug(f"Expected tools: {example.expected_tools}")
257
270
 
258
271
  debug(f"Starting evaluation run with {len(evaluation_run.examples)} examples")
259
272
 
@@ -379,6 +392,9 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
379
392
  'expected_output': result.expected_output,
380
393
  'context': result.context,
381
394
  'retrieval_context': result.retrieval_context,
395
+ 'additional_metadata': result.additional_metadata,
396
+ 'tools_called': result.tools_called,
397
+ 'expected_tools': result.expected_tools,
382
398
  'eval_run_name': result.eval_run_name,
383
399
  'failed_scorers': []
384
400
  }
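
For context, a sketch of an Example populated with the three fields that the new debug lines and the failure report reference. The input and actual_output field names are assumptions about the Example schema:

from judgeval.data import Example

ex = Example(
    input="Summarize the support ticket",            # assumed field name
    actual_output="Customer cannot log in on iOS.",  # assumed field name
    additional_metadata={"source": "zendesk"},
    tools_called=["ticket_lookup"],
    expected_tools=["ticket_lookup"],
)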
@@ -397,6 +413,9 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
397
413
  error_msg += f"Expected Output: {fail_case['expected_output']}\n"
398
414
  error_msg += f"Context: {fail_case['context']}\n"
399
415
  error_msg += f"Retrieval Context: {fail_case['retrieval_context']}\n"
416
+ error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
417
+ error_msg += f"Tools Called: {fail_case['tools_called']}\n"
418
+ error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
400
419
  error_msg += f"Eval Run Name: {fail_case['eval_run_name']}\n"
401
420
 
402
421
  for fail_scorer in fail_case['failed_scorers']:
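
assert_test then surfaces those fields when a scorer fails. A minimal sketch, assuming results came from a prior run_eval call:

from judgeval.run_evaluation import assert_test

# `results` is a List[ScoringResult]; on any failing example, assert_test raises
# with the report built above, which now includes the Additional Metadata,
# Tools Called, and Expected Tools lines.
assert_test(results)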
@@ -1,5 +1,5 @@
1
1
  """
2
- Custom Scorer class
2
+ Judgeval Scorer class
3
3
 
4
4
  Enables client to create custom scorers that do not fall under any of the ready-made Judgment scorers.
5
5
  To create a custom scorer, extend this class and implement the `score_example`, `a_score_example`, and `success_check` methods.
@@ -57,12 +57,12 @@ class JudgevalScorer:
57
57
  verbose_logs: Optional[str] = None,
58
58
  additional_metadata: Optional[Dict] = None
59
59
  ):
60
- debug(f"Initializing CustomScorer with score_type={score_type}, threshold={threshold}")
60
+ debug(f"Initializing JudgevalScorer with score_type={score_type}, threshold={threshold}")
61
61
  if not 0 <= threshold <= 1:
62
62
  raise ValueError("Threshold must be between 0 and 1")
63
63
  if strict_mode:
64
64
  warning("Strict mode enabled - scoring will be more rigorous")
65
- info(f"CustomScorer initialized with evaluation_model: {evaluation_model}")
65
+ info(f"JudgevalScorer initialized with evaluation_model: {evaluation_model}")
66
66
  self.score_type = score_type
67
67
  self.threshold = threshold
68
68
  self.score = score
@@ -81,7 +81,7 @@ class JudgevalScorer:
81
81
 
82
82
  def _add_model(self, model: Optional[Union[str, List[str], JudgevalJudge]] = None):
83
83
  """
84
- Adds the evaluation model to the CustomScorer instance
84
+ Adds the evaluation model to the JudgevalScorer instance
85
85
 
86
86
  This method is used at eval time
87
87
  """
@@ -116,10 +116,10 @@ class JudgevalScorer:
116
116
  raise NotImplementedError("You must implement the `passes` method in your custom scorer")
117
117
 
118
118
  def __str__(self):
119
- debug("Converting CustomScorer instance to string representation")
119
+ debug("Converting JudgevalScorer instance to string representation")
120
120
  if self.error:
121
- warning(f"CustomScorer contains error: {self.error}")
122
- info(f"CustomScorer status - success: {self.success}, score: {self.score}")
121
+ warning(f"JudgevalScorer contains error: {self.error}")
122
+ info(f"JudgevalScorer status - success: {self.success}, score: {self.score}")
123
123
  attributes = {
124
124
  "score_type": self.score_type,
125
125
  "threshold": self.threshold,
@@ -137,4 +137,4 @@ class JudgevalScorer:
137
137
  "verbose_logs": self.verbose_logs,
138
138
  "additional_metadata": self.additional_metadata,
139
139
  }
140
- return f"CustomScorer({attributes})"
140
+ return f"JudgevalScorer({attributes})"
@@ -2,7 +2,7 @@
2
2
  Code for the local implementation of the Faithfulness metric.
3
3
  """
4
4
  from typing import List, Optional, Union
5
-
5
+ from pprint import pprint
6
6
  from judgeval.constants import APIScorer
7
7
  from judgeval.data import (
8
8
  Example,
@@ -114,11 +114,13 @@ class FaithfulnessScorer(JudgevalScorer):
114
114
  ):
115
115
  self.claims = await self._a_generate_claims(example.actual_output)
116
116
 
117
+
117
118
  if self.additional_metadata is None:
118
119
  self.additional_metadata = {}
119
120
  self.additional_metadata["claims"] = self.claims
120
121
 
121
122
  self.verdicts = await self._a_generate_verdicts(example.retrieval_context)
123
+
122
124
  self.additional_metadata["verdicts"] = [v.model_dump() for v in self.verdicts] # Add verdicts generated to metadata
123
125
 
124
126
  self.score = self._calculate_score()
@@ -129,10 +129,13 @@ JSON:
129
129
  def create_verdicts(claims, retrieval_context):
130
130
  return f"""==== TASK INSTRUCTIONS ====
131
131
  You will be provided with a list of claims from an LLM's output text, accompanied by the retrieval documents that the LLM used to generate the output.
132
- Your task is to determine whether EACH claim is factually consistent with the retrieval context ("yes", "no", or "idk").
133
- ONLY choose 'no' if the retrieval context DIRECTLY CONTRADICTS the claims. YOU SHOULD NEVER USE YOUR PRIOR KNOWLEDGE IN YOUR JUDGMENT.
132
+ I'm pretty sure that many of the claims are factually contradictory to the retrieval context, but I want you to double check that I'm right.
133
+ For each claim, choose one of ("yes", "no", or "idk") to represent whether the claim is correct based on the retrieval context.
134
+ YOU SHOULD be very scrutinous--if any part of the claim is contradicted by the retrieval context, you should choose "no". Think really hard about finding the contradictions, since they can be subtle!
135
+
136
+ Choose 'no' if the retrieval context CONTRADICTS the claims. YOU SHOULD NEVER USE YOUR PRIOR KNOWLEDGE IN YOUR JUDGMENT.
134
137
  Claims made using vague, suggestive, or speculative language such as 'may have', 'possibility due to', do NOT count as a contradiction.
135
- Claims that are not justified by the retrieval context due to a lack of information MUST BE ANSWERED with 'idk'.
138
+ Claims that are fuzzy based on lack of information MUST BE ANSWERED with 'idk'.
136
139
 
137
140
  ==== FORMATTING YOUR ANSWER ====
138
141
  Please return your answer in JSON format, with the 'verdicts' key as a list of JSON objects. Each JSON object should have 2 fields: 'verdict' and 'reason'.
@@ -72,7 +72,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
72
72
  strict_mode=strict_mode,
73
73
  verbose_mode=verbose_mode,
74
74
  )
75
- # Then initialize CustomScorer
75
+ # Then initialize JudgevalScorer
76
76
  JudgevalScorer.__init__(
77
77
  self,
78
78
  score_type=name,
@@ -309,7 +309,7 @@ class ClassifierScorer(PromptScorer):
309
309
  strict_mode=strict_mode,
310
310
  verbose_mode=verbose_mode,
311
311
  )
312
- # Then initialize CustomScorer
312
+ # Then initialize JudgevalScorer
313
313
  JudgevalScorer.__init__(
314
314
  self,
315
315
  score_type=name,
judgeval/scorers/score.py CHANGED
@@ -1,5 +1,5 @@
1
1
  """
2
- Infrastructure for executing evaluations of `Example`s using one or more `CustomScorer`s.
2
+ Infrastructure for executing evaluations of `Example`s using one or more `JudgevalScorer`s.
3
3
  """
4
4
 
5
5
 
@@ -30,15 +30,15 @@ async def safe_a_score_example(
30
30
  ):
31
31
  """
32
32
  Scoring task function when not using a progress indicator!
33
- "Safely" scores an `Example` using a `CustomScorer` by gracefully handling any exceptions that may occur.
33
+ "Safely" scores an `Example` using a `JudgevalScorer` by gracefully handling any exceptions that may occur.
34
34
 
35
35
  Args:
36
- scorer (CustomScorer): The `CustomScorer` to use for scoring the example.
36
+ scorer (JudgevalScorer): The `JudgevalScorer` to use for scoring the example.
37
37
  example (Example): The `Example` to be scored.
38
38
 
39
39
  ignore_errors (bool): Whether to ignore errors during the evaluation.
40
40
  If set to false, any error will be raised and stop the evaluation.
41
- If set to true, the error will be stored in the `error` attribute of the `CustomScorer` and the `success` attribute will be set to False.
41
+ If set to true, the error will be stored in the `error` attribute of the `JudgevalScorer` and the `success` attribute will be set to False.
42
42
 
43
43
  skip_on_missing_params (bool): Whether to skip the test case if required parameters are missing.
44
44
  """
@@ -102,12 +102,12 @@ async def score_task(
102
102
  skip_on_missing_params: bool = True,
103
103
  ):
104
104
  """
105
- Task function for asynchronously measuring a given example using a custom scorer.
105
+ Task function for asynchronously measuring a given example using a JudgevalScorer.
106
106
 
107
107
  Args:
108
108
  task_id (int): The ID of the task being measured.
109
109
  progress (Progress): An instance of the Progress class to track task progress.
110
- scorer (CustomScorer): An instance of the CustomScorer class used to score the example.
110
+ scorer (JudgevalScorer): An instance of the JudgevalScorer class used to score the example.
111
111
  example (Example): The example to be scored.
112
112
  ignore_errors (bool, optional): Whether to ignore errors during scoring. Defaults to True.
113
113
  skip_on_missing_params (bool, optional): Whether to skip scoring if there are missing parameters. Defaults to True.
@@ -189,10 +189,10 @@ async def score_with_indicator(
189
189
  show_indicator: bool,
190
190
  ):
191
191
  """
192
- Scores an example using a list of custom scorers, optionally displaying a progress indicator.
192
+ Scores an example using a list of JudgevalScorers, optionally displaying a progress indicator.
193
193
 
194
194
  Args:
195
- scorers (List[CustomScorer]): A list of custom scorer objects to evaluate the example.
195
+ scorers (List[JudgevalScorer]): A list of JudgevalScorer objects to evaluate the example.
196
196
  example (Example): The example to be scored.
197
197
  ignore_errors (bool): If True, errors during scoring will be ignored.
198
198
  skip_on_missing_params (bool): If True, scoring will be skipped if required parameters are missing.
@@ -253,8 +253,8 @@ async def a_execute_scoring(
253
253
  _use_bar_indicator: bool = True,
254
254
  ) -> List[ScoringResult]:
255
255
  """
256
- Executes evaluations of `Example`s asynchronously using one or more `CustomScorer`s.
257
- Each `Example` will be evaluated by all of the `CustomScorer`s in the `scorers` list.
256
+ Executes evaluations of `Example`s asynchronously using one or more `JudgevalScorer`s.
257
+ Each `Example` will be evaluated by all of the `JudgevalScorer`s in the `scorers` list.
258
258
 
259
259
  Args:
260
260
  examples (List[Example]): A list of `Example` objects to be evaluated.
@@ -379,7 +379,7 @@ async def a_eval_examples_helper(
379
379
  Evaluate a single example asynchronously using a list of scorers.
380
380
 
381
381
  Args:
382
- scorers (List[CustomScorer]): List of CustomScorer objects to evaluate the example.
382
+ scorers (List[JudgevalScorer]): List of JudgevalScorer objects to evaluate the example.
383
383
  example (Example): The example to be evaluated.
384
384
  scoring_results (List[ScoringResult]): List to store the scoring results.
385
385
  score_index (int): Index at which the result should be stored in scoring_results.
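
Finally, a sketch of driving these helpers end to end. The parameter names beyond examples in a_execute_scoring are assumptions, so check the real signature before relying on this:

import asyncio
from judgeval.scorers.score import a_execute_scoring

async def main():
    results = await a_execute_scoring(
        examples=[ex],                      # Example from the earlier sketch
        scorers=[KeywordScorer("log in")],  # JudgevalScorer subclass from the earlier sketch
    )
    for r in results:
        print(r.to_dict())

asyncio.run(main())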