judgeval-0.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. judgeval/__init__.py +83 -0
  2. judgeval/clients.py +19 -0
  3. judgeval/common/__init__.py +8 -0
  4. judgeval/common/exceptions.py +28 -0
  5. judgeval/common/logger.py +189 -0
  6. judgeval/common/tracer.py +587 -0
  7. judgeval/common/utils.py +763 -0
  8. judgeval/constants.py +55 -0
  9. judgeval/data/__init__.py +14 -0
  10. judgeval/data/api_example.py +111 -0
  11. judgeval/data/datasets/__init__.py +4 -0
  12. judgeval/data/datasets/dataset.py +407 -0
  13. judgeval/data/datasets/ground_truth.py +54 -0
  14. judgeval/data/datasets/utils.py +74 -0
  15. judgeval/data/example.py +76 -0
  16. judgeval/data/result.py +83 -0
  17. judgeval/data/scorer_data.py +86 -0
  18. judgeval/evaluation_run.py +130 -0
  19. judgeval/judges/__init__.py +7 -0
  20. judgeval/judges/base_judge.py +44 -0
  21. judgeval/judges/litellm_judge.py +49 -0
  22. judgeval/judges/mixture_of_judges.py +248 -0
  23. judgeval/judges/together_judge.py +55 -0
  24. judgeval/judges/utils.py +45 -0
  25. judgeval/judgment_client.py +244 -0
  26. judgeval/run_evaluation.py +355 -0
  27. judgeval/scorers/__init__.py +30 -0
  28. judgeval/scorers/base_scorer.py +51 -0
  29. judgeval/scorers/custom_scorer.py +134 -0
  30. judgeval/scorers/judgeval_scorers/__init__.py +21 -0
  31. judgeval/scorers/judgeval_scorers/answer_relevancy.py +19 -0
  32. judgeval/scorers/judgeval_scorers/contextual_precision.py +19 -0
  33. judgeval/scorers/judgeval_scorers/contextual_recall.py +19 -0
  34. judgeval/scorers/judgeval_scorers/contextual_relevancy.py +22 -0
  35. judgeval/scorers/judgeval_scorers/faithfulness.py +19 -0
  36. judgeval/scorers/judgeval_scorers/hallucination.py +19 -0
  37. judgeval/scorers/judgeval_scorers/json_correctness.py +32 -0
  38. judgeval/scorers/judgeval_scorers/summarization.py +20 -0
  39. judgeval/scorers/judgeval_scorers/tool_correctness.py +19 -0
  40. judgeval/scorers/prompt_scorer.py +439 -0
  41. judgeval/scorers/score.py +427 -0
  42. judgeval/scorers/utils.py +175 -0
  43. judgeval-0.0.1.dist-info/METADATA +40 -0
  44. judgeval-0.0.1.dist-info/RECORD +46 -0
  45. judgeval-0.0.1.dist-info/WHEEL +4 -0
  46. judgeval-0.0.1.dist-info/licenses/LICENSE.md +202 -0
judgeval/scorers/judgeval_scorers/faithfulness.py
@@ -0,0 +1,19 @@
+ """
+ `judgeval` faithfulness scorer
+
+ TODO add link to docs page for this scorer
+
+ """
+
+ # Internal imports
+ from judgeval.scorers.base_scorer import JudgmentScorer
+ from judgeval.constants import APIScorer
+
+
+ class FaithfulnessScorer(JudgmentScorer):
+     def __init__(self, threshold: float):
+         super().__init__(threshold=threshold, score_type=APIScorer.FAITHFULNESS)
+
+     @property
+     def __name__(self):
+         return "Faithfulness"
judgeval/scorers/judgeval_scorers/hallucination.py
@@ -0,0 +1,19 @@
+ """
+ `judgeval` hallucination scorer
+
+ TODO add link to docs page for this scorer
+
+ """
+
+ # Internal imports
+ from judgeval.scorers.base_scorer import JudgmentScorer
+ from judgeval.constants import APIScorer
+
+
+ class HallucinationScorer(JudgmentScorer):
+     def __init__(self, threshold: float):
+         super().__init__(threshold=threshold, score_type=APIScorer.HALLUCINATION)
+
+     @property
+     def __name__(self):
+         return "Hallucination"
judgeval/scorers/judgeval_scorers/json_correctness.py
@@ -0,0 +1,32 @@
+ """
+ `judgeval` JSON correctness scorer
+
+ TODO add link to docs page for this scorer
+
+ """
+
+
+ # External imports
+ from pydantic import BaseModel, Field
+ # Internal imports
+ from judgeval.scorers.base_scorer import JudgmentScorer
+ from judgeval.constants import APIScorer
+
+
+ class JSONCorrectnessScorer(JudgmentScorer):
+     json_schema: BaseModel = Field(None, exclude=True)
+
+     def __init__(self, threshold: float, json_schema: BaseModel):
+         super().__init__(threshold=threshold, score_type=APIScorer.JSON_CORRECTNESS)
+         object.__setattr__(self, 'json_schema', json_schema)
+
+     def to_dict(self):
+         return {
+             "score_type": self.score_type,
+             "threshold": self.threshold,
+             "kwargs": {"json_schema": self.json_schema.model_json_schema()}
+         }
+
+     @property
+     def __name__(self):
+         return "JSON Correctness"
judgeval/scorers/judgeval_scorers/summarization.py
@@ -0,0 +1,20 @@
+ """
+ `judgeval` summarization scorer
+
+ TODO add link to docs page for this scorer
+
+ """
+
+ # Internal imports
+ from judgeval.scorers.base_scorer import JudgmentScorer
+ from judgeval.constants import APIScorer
+
+
+ class SummarizationScorer(JudgmentScorer):
+     def __init__(self, threshold: float):
+         super().__init__(threshold=threshold, score_type=APIScorer.SUMMARIZATION)
+
+     @property
+     def __name__(self):
+         return "Summarization"
+
judgeval/scorers/judgeval_scorers/tool_correctness.py
@@ -0,0 +1,19 @@
+ """
+ `judgeval` tool correctness scorer
+
+ TODO add link to docs page for this scorer
+
+ """
+
+ # Internal imports
+ from judgeval.scorers.base_scorer import JudgmentScorer
+ from judgeval.constants import APIScorer
+
+
+ class ToolCorrectnessScorer(JudgmentScorer):
+     def __init__(self, threshold: float):
+         super().__init__(threshold=threshold, score_type=APIScorer.TOOL_CORRECTNESS)
+
+     @property
+     def __name__(self):
+         return "Tool Correctness"
judgeval/scorers/prompt_scorer.py
@@ -0,0 +1,439 @@
+ """
+ Code that implements a prompt-based scorer for evaluating examples.
+
+ The PromptScorer class is a base class that can be used to create custom scoring metrics using LLM prompts.
+ To implement a subclass of PromptScorer, you need to implement the following methods:
+ - build_measure_prompt(): builds the conversation prompt that is sent to the LLM judge
+ - build_schema(): defines the expected response schema from the LLM
+ - process_response(): parses the response from the LLM judge
+ - success_check(): determines whether the evaluation was successful
+
+ The core idea of PromptScorer is to provide a flexible way to create custom scoring metrics
+ by leveraging LLM judges to evaluate examples. The scorer constructs a prompt, sends it to
+ the judge, and parses the structured response to determine a score.
+
+ For example, the SentimentScorer subclass uses PromptScorer to detect negative sentiment in responses
+ by prompting an LLM to rate the negativity on a 1-5 scale and provide a reason for the rating.
+
+ The PromptScorer supports both synchronous and asynchronous evaluation modes, includes optional
+ reason fields in responses, and can operate in strict mode with higher thresholds.
+
+ NOTE: When implementing build_measure_prompt and build_schema:
+ - The prompt should guide the LLM to generate a response matching your schema
+ - The schema should include "score" and optionally "reason" fields
+ - The score field type and range should match your scoring criteria
+ - The reason field provides explanatory context for the score
+ """
+
+ from abc import abstractmethod
+ from typing import List, Optional, Union, Tuple, Any, Mapping
+ from pydantic import BaseModel, model_serializer, Field
+
+ from judgeval.data import Example
+ from judgeval.scorers import CustomScorer
+ from judgeval.scorers.utils import (scorer_progress_meter,
+                                     parse_response_json,
+                                     get_or_create_event_loop,
+                                     create_verbose_logs)
+
+
+ class ReasonScore(BaseModel):
+     reason: str
+     score: float
+
+
+ class PromptScorer(CustomScorer, BaseModel):
+     name: str
+     score_type: str
+     threshold: float = Field(default=0.5)
+     using_native_model: bool = Field(default=True)
+
+     # DO NOT SET THESE FIELDS MANUALLY, THEY ARE SET BY THE SCORE_EXAMPLE METHOD
+     _response: Optional[dict] = None
+     _result: Optional[float] = None
+
+     def __init__(
+         self,
+         name: str,
+         threshold: float = 0.5,
+         include_reason: bool = True,
+         async_mode: bool = True,
+         strict_mode: bool = False,
+         verbose_mode: bool = False,
+     ):
+         # Initialize BaseModel first
+         BaseModel.__init__(
+             self,
+             name=name,
+             score_type=name,
+             threshold=1 if strict_mode else threshold,
+             include_reason=include_reason,
+             async_mode=async_mode,
+             strict_mode=strict_mode,
+             verbose_mode=verbose_mode,
+         )
+         # Then initialize CustomScorer
+         CustomScorer.__init__(
+             self,
+             score_type=name,
+             threshold=1 if strict_mode else threshold,
+             include_reason=include_reason,
+             async_mode=async_mode,
+             strict_mode=strict_mode,
+             verbose_mode=verbose_mode,
+         )
+
+     def score_example(
+         self,
+         example: Example,
+         _show_indicator: bool = True
+     ) -> float:
+         """
+         Synchronous method for scoring an example using the prompt criteria.
+         """
+         with scorer_progress_meter(self, display_meter=_show_indicator):
+             if self.async_mode:
+                 loop = get_or_create_event_loop()
+                 loop.run_until_complete(
+                     self.a_score_example(example, _show_indicator=False)
+                 )
+             else:
+                 result, reason = self.evaluate(example)
+                 self.reason = reason
+                 self._result = result
+                 self.verbose_logs = create_verbose_logs(
+                     self,
+                     steps=[
+                         f"Results: {self._result}\nReason: {self.reason}",
+                     ],
+                 )
+             return result
+
+     async def a_score_example(
+         self,
+         example: Example,
+         _show_indicator: bool = True,
+     ) -> float:
+         """
+         Async method for scoring an example using the prompt criteria.
+         """
+         with scorer_progress_meter(self, display_meter=_show_indicator):
+             result, reason = await self.a_evaluate(example)
+             self.reason = reason
+             self._result = result
+             self.verbose_logs = create_verbose_logs(
+                 self,
+                 steps=[
+                     f"Results: {self._result}\nReason: {self.reason}",
+                 ],
+             )
+             return result
+
+     def evaluate(self, example: Example) -> Tuple[Any, str]:
+         """
+         Synchronous helper method for evaluating an example using the prompt criteria.
+
+         Builds a custom prompt using `build_measure_prompt` and sends it to the judge model
+         for evaluation. The result is then parsed as JSON and returned.
+
+         NOTE: It is assumed that the model response will be JSON and contain a "score" and "reason" field.
+         """
+         prompt = self._build_measure_prompt(example)
+         if self.using_native_model:
+             res = self.model.generate(prompt)
+             response = parse_response_json(res, self)
+             result, reason = self._process_response(response)
+             return result, reason
+         else:
+             raise NotImplementedError("Non-native judge models are not supported in synchronous mode yet.")
+
+     async def a_evaluate(self, example: Example) -> Tuple[Any, str]:
+         """
+         Asynchronous helper method for evaluating an example using the prompt criteria.
+
+         Builds a custom prompt using `build_measure_prompt` and sends it to the judge model
+         for evaluation. The result is then parsed as JSON and returned.
+
+         NOTE: It is assumed that the model response will be JSON and contain a "score" and "reason" field.
+         """
+         judge_prompt = self._build_measure_prompt(example)
+         schema = self._build_schema()
+         prompt = self._enforce_prompt_format(judge_prompt=judge_prompt, schema=schema)
+         if self.using_native_model:
+             res = await self.model.a_generate(prompt)
+             response = parse_response_json(res, self)
+             self._response = response
+
+             result, reason = self._process_response(response)
+             self.score = result
+             self.reason = reason
+             self._response = response
+             return result, reason
+         else:
+             raise NotImplementedError("Non-native judge models are not supported in async mode yet.")
+
+     # TODO: can we make this take *args and **kwargs? How does that work with a_evaluate() since we'd have to pass the same args
+     @abstractmethod
+     def _build_measure_prompt(self, example: Example) -> List[dict]:
+         # builds the prompt that is sent to the model inside of the `score_example()` method
+         # returns either a string prompt or a conversation prompt of the form [{"role": "system", "content": "..."}, ...]
+
+         """
+         This function creates the prompt that the judge model uses to evaluate examples.
+
+         The prompt is typically a set of instructions that the judge model uses to evaluate the example.
+
+         This function returns a conversation prompt of the form
+         [{"role": "system", "content": "..."}, {"role": "user", "content": "..."}]
+
+         A basic version of implementing this function could be as follows:
+         SYSTEM_ROLE = ...
+         return [
+             {"role": "system", "content": SYSTEM_ROLE},
+             {"role": "user", "content": f"Response: {example.actual_output}\n\nYour judgment: "}
+         ]
+         """
+         pass
+
+     # TODO: does this need to take *args and **kwargs? How does that work with a_evaluate() since we'd have to pass the same args
+     @abstractmethod
+     def _build_schema(self) -> dict:
+         """
+         This function returns a dictionary that represents the schema of the JSON response that the judge model should return.
+
+         The keys of the dictionary are the expected keys in the response, and the values are the types of the corresponding values.
+
+         Example: If you want to have the judge model return a score and a reason, you would write:
+         return {"score": int, "reason": str}
+         """
+         pass
+
+     def _enforce_prompt_format(self, judge_prompt: List[dict], schema: dict):
+         """
+         Formats the final prompt to the judge model.
+
+         This function takes a list of dictionaries (`judge_prompt`) and a schema dictionary (`schema`),
+         and appends a schema enforcement prompt to the content of the first dictionary in the list, which is assumed to be the system prompt.
+         The schema enforcement prompt instructs the judge model to provide its response in a specific JSON format.
+
+         Args:
+             judge_prompt (List[dict]): A list of dictionaries representing the judge prompt.
+                 Each dictionary should contain a "content" key.
+             schema (dict): A dictionary representing the schema. The keys are the expected keys in the response,
+                 and the values are the types of the corresponding values.
+
+         Returns:
+             List[dict]: The modified judge prompt with the schema enforcement prompt appended to the content
+             of the first dictionary.
+
+         Raises:
+             TypeError: If `judge_prompt` is not a list of dictionaries.
+
+         Example:
+             judge_prompt = [{"content": "Please evaluate the following:"}]
+             schema = {"score": int, "comments": str}
+             formatted_prompt = format_measure_prompt(judge_prompt, schema)
+             # formatted_prompt[0]["content"] will include the schema enforcement prompt
+         """
+         SCHEMA_ENFORCEMENT_PROMPT = "\n\nPlease provide your response in the following JSON format: {"
+         if isinstance(judge_prompt, list) and all(isinstance(item, dict) for item in judge_prompt):
+             # create formatting string for schema enforcement
+             # schema is a map between key and type of the value
+             for key, key_type in schema.items():
+                 SCHEMA_ENFORCEMENT_PROMPT += f'"{key}": <{key}> ({key_type.__name__}), '
+             SCHEMA_ENFORCEMENT_PROMPT = SCHEMA_ENFORCEMENT_PROMPT[:-2] + "}"  # remove trailing comma and space
+             judge_prompt[0]["content"] += SCHEMA_ENFORCEMENT_PROMPT
+             return judge_prompt
+         else:
+             raise TypeError(f"Prompt must be a list of dictionaries. Got {type(judge_prompt)} instead.")
+
+     @abstractmethod
+     def _process_response(self, response: dict):
+         """
+         Customizable method for processing the response from the judge model.
+
+         You can add any additional logic to parse the JSON response here and return the result and reason for decision.
+
+         If you don't need a reason for the decision, you can simply return (score, None).
+
+         Example:
+             score = response["score"]
+             reason = response["reason"]
+             return score, reason
+         """
+         pass
+
+     @abstractmethod
+     def _success_check(self, **kwargs) -> bool:
+         """
+         Determines whether or not the PromptScorer should consider the evaluation of a single example successful.
+         """
+         pass
+
+     @property
+     def __name__(self):
+         return self.name
+
+
+ class ClassifierScorer(PromptScorer):
+
+     """
+     This is a PromptScorer that takes
+     1. a system role that may involve the Example object
+     2. options for scores on the example
+
+     and uses a judge to execute the evaluation from the system role and classify into one of the options
+
+     ex:
+     system_role = "You are a judge that evaluates whether the response is positive or negative. The response is: {example.actual_output}"
+     options = {"positive": 1, "negative": 0}
+     """
+
+     conversation: List[dict]
+     options: Mapping[str, float]
+
+     def __init__(self, name: str, slug: str, conversation: List[dict], options: Mapping[str, float],
+                  threshold: float = 0.5, include_reason: bool = True,
+                  async_mode: bool = True, strict_mode: bool = False, verbose_mode: bool = False):
+         # Initialize BaseModel first with all fields
+         BaseModel.__init__(
+             self,
+             name=name,
+             slug=slug,
+             score_type=name,
+             conversation=conversation,
+             options=options,
+             threshold=threshold,
+             include_reason=include_reason,
+             async_mode=async_mode,
+             strict_mode=strict_mode,
+             verbose_mode=verbose_mode,
+         )
+         # Then initialize CustomScorer
+         CustomScorer.__init__(
+             self,
+             score_type=name,
+             threshold=threshold,
+             include_reason=include_reason,
+             async_mode=async_mode,
+             strict_mode=strict_mode,
+             verbose_mode=verbose_mode,
+         )
+
+     def _build_measure_prompt(self, example: Example) -> List[dict]:
+         """
+         Builds the measure prompt for the classifier scorer.
+
+         Args:
+             example (Example): The example to build the prompt for
+
+         Returns:
+             List[dict]: The measure prompt for the classifier scorer
+         """
+         replacement_words = {
+             "{{actual_output}}": example.actual_output,
+             "{{expected_output}}": example.expected_output,
+             "{{context}}": example.context,
+             "{{retrieval_context}}": example.retrieval_context,
+             "{{tools_called}}": example.tools_called,
+             "{{expected_tools}}": example.expected_tools,
+         }
+         # Make a copy of the conversation to avoid modifying the original
+         conversation_copy = [dict(message) for message in self.conversation]
+
+         # Only replace if double brackets are found in the content
+         for message in conversation_copy:
+             content = message["content"]
+             if "{{" in content:
+                 for key, value in replacement_words.items():
+                     if key in content:
+                         message["content"] = content.replace(key, str(value))
+         return conversation_copy
+
+     def _build_schema(self) -> dict:
+         return self.options
+
+     def _enforce_prompt_format(self, judge_prompt: List[dict], schema: dict) -> List[dict]:
+         """
+         Enforces the judge model to choose an option from the schema.
+
+         We want the model to choose an option from the schema and a reason for the choice.
+         """
+         options = list(schema.keys())
+         options_str = ", ".join(options)
+
+         system_role = judge_prompt[0]["content"]
+         system_role += (
+             f"\n\nYou must choose one of the following options: {options_str}. "
+             "Format your response as a JSON object with two fields:\n"
+             "1. 'choice': Your selected option (must be one of the provided choices)\n"
+             "2. 'reason': A brief explanation for why you made this choice\n\n"
+             "Example response format:\n"
+             "{\n"
+             ' "choice": "<one of the valid options>",\n'
+             ' "reason": "<your explanation>"\n'
+             "}"
+         )
+
+         judge_prompt[0]["content"] = system_role
+         return judge_prompt
+
+     def _process_response(self, response: dict) -> Tuple[float, str]:
+         choice = response.get("choice")
+         if choice not in self.options:
+             raise ValueError(f"Invalid choice: {choice}. Expected one of: {self.options.keys()}")
+         reason = response.get("reason", "No reason could be found in model response.")
+         return self.options[choice], reason
+
+     def _success_check(self, **kwargs) -> bool:
+         return self.score >= self.threshold
+
+     def update_name(self, name: str):
+         """
+         Updates the name of the scorer.
+         """
+         self.name = name
+
+     def update_threshold(self, threshold: float):
+         """
+         Updates the threshold of the scorer.
+         """
+         self.threshold = threshold
+
+     def update_conversation(self, conversation: List[dict]):
+         """
+         Updates the conversation with the new conversation.
+
+         Sample conversation:
+         [{'role': 'system', 'content': "Did the chatbot answer the user's question in a kind way?: {{actual_output}}."}]
+         """
+         self.conversation = conversation
+
+     def update_options(self, options: Mapping[str, float]):
+         """
+         Updates the options with the new options.
+
+         Sample options:
+         {"yes": 1, "no": 0}
+         """
+         self.options = options
+
+     def __str__(self):
+         return f"ClassifierScorer(name={self.name}, slug={self.slug}, conversation={self.conversation}, threshold={self.threshold}, options={self.options})"
+
+     @model_serializer
+     def serialize_model(self) -> dict:
+         """
+         Defines how the ClassifierScorer should be serialized when model_dump() is called.
+         """
+         return {
+             "name": self.name,
+             "score_type": self.score_type,
+             "conversation": self.conversation,
+             "options": self.options,
+             "threshold": self.threshold,
+             "include_reason": self.include_reason,
+             "async_mode": self.async_mode,
+             "strict_mode": self.strict_mode,
+             "verbose_mode": self.verbose_mode,
+         }
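
Taken together, the module docstring and the abstract methods above define the subclassing contract: build a chat-style prompt, declare the expected JSON schema, parse the judge's response, and decide success. Below is a sketch of the SentimentScorer the docstring alludes to, plus a ClassifierScorer construction matching its __init__ signature. The prompt wording, the 1-5 scale, and the slug value are illustrative assumptions; only the hook signatures and constructor parameters come from the code above, and runtime behavior depends on the CustomScorer and Example classes not shown in this diff.

from typing import List, Tuple

from judgeval.data import Example
from judgeval.scorers.prompt_scorer import PromptScorer, ClassifierScorer


class SentimentScorer(PromptScorer):
    """Rates how negative a response is on a 1-5 scale (illustrative example)."""

    def _build_measure_prompt(self, example: Example) -> List[dict]:
        # Conversation-style prompt, as described in the PromptScorer docstring
        return [
            {"role": "system", "content": "Rate the negativity of the response from 1 (not negative) to 5 (very negative)."},
            {"role": "user", "content": f"Response: {example.actual_output}\n\nYour judgment: "},
        ]

    def _build_schema(self) -> dict:
        # Keys and types are appended to the system prompt by _enforce_prompt_format()
        return {"score": int, "reason": str}

    def _process_response(self, response: dict) -> Tuple[int, str]:
        return response["score"], response.get("reason", "")

    def _success_check(self, **kwargs) -> bool:
        # Lower negativity is better; treat scores at or below the threshold as a pass
        return self.score <= self.threshold


# ClassifierScorer needs no subclassing: a templated conversation plus an option-to-score map.
# Field names mirror the __init__ signature above; the concrete values are hypothetical.
tone_scorer = ClassifierScorer(
    name="Tone",
    slug="tone-classifier",
    conversation=[{"role": "system",
                   "content": "Is this response positive or negative? Response: {{actual_output}}"}],
    options={"positive": 1.0, "negative": 0.0},
)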