judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of judgeval might be problematic.

Files changed (171)
  1. judgeval/__init__.py +177 -12
  2. judgeval/api/__init__.py +519 -0
  3. judgeval/api/api_types.py +407 -0
  4. judgeval/cli.py +79 -0
  5. judgeval/constants.py +76 -47
  6. judgeval/data/__init__.py +3 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +15 -56
  9. judgeval/data/judgment_types.py +450 -0
  10. judgeval/data/result.py +29 -73
  11. judgeval/data/scorer_data.py +29 -62
  12. judgeval/data/scripts/fix_default_factory.py +23 -0
  13. judgeval/data/scripts/openapi_transform.py +123 -0
  14. judgeval/data/trace.py +121 -0
  15. judgeval/dataset/__init__.py +264 -0
  16. judgeval/env.py +52 -0
  17. judgeval/evaluation/__init__.py +344 -0
  18. judgeval/exceptions.py +27 -0
  19. judgeval/integrations/langgraph/__init__.py +13 -0
  20. judgeval/integrations/openlit/__init__.py +50 -0
  21. judgeval/judges/__init__.py +2 -3
  22. judgeval/judges/base_judge.py +2 -3
  23. judgeval/judges/litellm_judge.py +100 -20
  24. judgeval/judges/together_judge.py +101 -20
  25. judgeval/judges/utils.py +20 -24
  26. judgeval/logger.py +62 -0
  27. judgeval/prompt/__init__.py +330 -0
  28. judgeval/scorers/__init__.py +18 -25
  29. judgeval/scorers/agent_scorer.py +17 -0
  30. judgeval/scorers/api_scorer.py +45 -41
  31. judgeval/scorers/base_scorer.py +83 -38
  32. judgeval/scorers/example_scorer.py +17 -0
  33. judgeval/scorers/exceptions.py +1 -0
  34. judgeval/scorers/judgeval_scorers/__init__.py +0 -148
  35. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
  36. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
  37. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
  38. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
  39. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
  40. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
  41. judgeval/scorers/score.py +77 -306
  42. judgeval/scorers/utils.py +4 -199
  43. judgeval/tracer/__init__.py +1122 -2
  44. judgeval/tracer/constants.py +1 -0
  45. judgeval/tracer/exporters/__init__.py +40 -0
  46. judgeval/tracer/exporters/s3.py +119 -0
  47. judgeval/tracer/exporters/store.py +59 -0
  48. judgeval/tracer/exporters/utils.py +32 -0
  49. judgeval/tracer/keys.py +63 -0
  50. judgeval/tracer/llm/__init__.py +7 -0
  51. judgeval/tracer/llm/config.py +78 -0
  52. judgeval/tracer/llm/constants.py +9 -0
  53. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  54. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  55. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  56. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  57. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  58. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  59. judgeval/tracer/llm/llm_google/config.py +6 -0
  60. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  61. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  62. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  63. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  64. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  65. judgeval/tracer/llm/llm_openai/config.py +6 -0
  66. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  67. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  68. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  69. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  70. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  71. judgeval/tracer/llm/llm_together/config.py +6 -0
  72. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  73. judgeval/tracer/llm/providers.py +19 -0
  74. judgeval/tracer/managers.py +167 -0
  75. judgeval/tracer/processors/__init__.py +220 -0
  76. judgeval/tracer/utils.py +19 -0
  77. judgeval/trainer/__init__.py +14 -0
  78. judgeval/trainer/base_trainer.py +122 -0
  79. judgeval/trainer/config.py +128 -0
  80. judgeval/trainer/console.py +144 -0
  81. judgeval/trainer/fireworks_trainer.py +396 -0
  82. judgeval/trainer/trainable_model.py +243 -0
  83. judgeval/trainer/trainer.py +70 -0
  84. judgeval/utils/async_utils.py +39 -0
  85. judgeval/utils/decorators/__init__.py +0 -0
  86. judgeval/utils/decorators/dont_throw.py +37 -0
  87. judgeval/utils/decorators/use_once.py +13 -0
  88. judgeval/utils/file_utils.py +97 -0
  89. judgeval/utils/guards.py +36 -0
  90. judgeval/utils/meta.py +27 -0
  91. judgeval/utils/project.py +15 -0
  92. judgeval/utils/serialize.py +253 -0
  93. judgeval/utils/testing.py +70 -0
  94. judgeval/utils/url.py +10 -0
  95. judgeval/utils/version_check.py +28 -0
  96. judgeval/utils/wrappers/README.md +3 -0
  97. judgeval/utils/wrappers/__init__.py +15 -0
  98. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  99. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  100. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  101. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  102. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  103. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  104. judgeval/utils/wrappers/py.typed +0 -0
  105. judgeval/utils/wrappers/utils.py +35 -0
  106. judgeval/version.py +5 -0
  107. judgeval/warnings.py +4 -0
  108. judgeval-0.22.2.dist-info/METADATA +265 -0
  109. judgeval-0.22.2.dist-info/RECORD +112 -0
  110. judgeval-0.22.2.dist-info/entry_points.txt +2 -0
  111. judgeval/clients.py +0 -39
  112. judgeval/common/__init__.py +0 -8
  113. judgeval/common/exceptions.py +0 -28
  114. judgeval/common/logger.py +0 -189
  115. judgeval/common/tracer.py +0 -798
  116. judgeval/common/utils.py +0 -763
  117. judgeval/data/api_example.py +0 -111
  118. judgeval/data/datasets/__init__.py +0 -5
  119. judgeval/data/datasets/dataset.py +0 -286
  120. judgeval/data/datasets/eval_dataset_client.py +0 -193
  121. judgeval/data/datasets/ground_truth.py +0 -54
  122. judgeval/data/datasets/utils.py +0 -74
  123. judgeval/evaluation_run.py +0 -132
  124. judgeval/judges/mixture_of_judges.py +0 -248
  125. judgeval/judgment_client.py +0 -354
  126. judgeval/run_evaluation.py +0 -439
  127. judgeval/scorers/judgeval_scorer.py +0 -140
  128. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
  129. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
  130. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
  131. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
  132. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
  133. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
  134. judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
  135. judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
  136. judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
  137. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
  138. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
  139. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
  140. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
  141. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
  142. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
  143. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
  144. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
  145. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
  146. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
  147. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
  148. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
  149. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
  150. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
  151. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
  152. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
  153. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
  154. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
  155. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
  156. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
  157. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
  158. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
  159. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
  160. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
  161. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
  162. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
  163. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
  164. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
  165. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
  166. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
  167. judgeval/scorers/prompt_scorer.py +0 -439
  168. judgeval-0.0.11.dist-info/METADATA +0 -36
  169. judgeval-0.0.11.dist-info/RECORD +0 -84
  170. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
  171. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0

judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py (deleted)
@@ -1,32 +0,0 @@
- """
- `judgeval` JSON correctness scorer
-
- TODO add link to docs page for this scorer
-
- """
-
-
- # External imports
- from pydantic import BaseModel, Field
- # Internal imports
- from judgeval.scorers.api_scorer import APIJudgmentScorer
- from judgeval.constants import APIScorer
-
-
- class JSONCorrectnessScorer(APIJudgmentScorer):
-     json_schema: BaseModel = Field(None, exclude=True)
-
-     def __init__(self, threshold: float, json_schema: BaseModel):
-         super().__init__(threshold=threshold, score_type=APIScorer.JSON_CORRECTNESS)
-         object.__setattr__(self, 'json_schema', json_schema)
-
-     def to_dict(self):
-         base_dict = super().to_dict() # Get the parent class's dictionary
-         base_dict["kwargs"] = {
-             "json_schema": self.json_schema.model_json_schema()
-         }
-         return base_dict
-
-     @property
-     def __name__(self):
-         return "JSON Correctness"

judgeval/scorers/judgeval_scorers/api_scorers/summarization.py (deleted)
@@ -1,20 +0,0 @@
- """
- `judgeval` summarization scorer
-
- TODO add link to docs page for this scorer
-
- """
-
- # Internal imports
- from judgeval.scorers.api_scorer import APIJudgmentScorer
- from judgeval.constants import APIScorer
-
-
- class SummarizationScorer(APIJudgmentScorer):
-     def __init__(self, threshold: float):
-         super().__init__(threshold=threshold, score_type=APIScorer.SUMMARIZATION)
-
-     @property
-     def __name__(self):
-         return "Summarization"
-

judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py (deleted)
@@ -1,19 +0,0 @@
- """
- `judgeval` tool correctness scorer
-
- TODO add link to docs page for this scorer
-
- """
-
- # Internal imports
- from judgeval.scorers.api_scorer import APIJudgmentScorer
- from judgeval.constants import APIScorer
-
-
- class ToolCorrectnessScorer(APIJudgmentScorer):
-     def __init__(self, threshold: float):
-         super().__init__(threshold=threshold, score_type=APIScorer.TOOL_CORRECTNESS)
-
-     @property
-     def __name__(self):
-         return "Tool Correctness"

judgeval/scorers/judgeval_scorers/classifiers/__init__.py (deleted)
@@ -1,3 +0,0 @@
- from .text2sql import Text2SQLScorer
-
- __all__ = ["Text2SQLScorer"]

judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py (deleted)
@@ -1,3 +0,0 @@
- from .text2sql_scorer import Text2SQLScorer
-
- __all__ = ["Text2SQLScorer"]

judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py (deleted)
@@ -1,54 +0,0 @@
- """
- ClassifierScorer implementation for basic Text-to-SQL evaluation.
-
- Takes a natural language query, a corresponding LLM-generated SQL query, and a table schema + (optional) metadata.
- Determines if the LLM-generated SQL query is valid and works for the natural language query.
- """
- from judgeval.scorers import ClassifierScorer
-
- Text2SQLScorer = ClassifierScorer(
-     "Text to SQL",
-     slug="text2sql-1010101010",
-     threshold=1.0,
-     conversation=[{
-         "role": "system",
-         "content": """You will be given a natural language query, a corresponding LLM-generated SQL query, and a table schema + (optional) metadata.
-
- ** TASK INSTRUCTIONS **
- Your task is to decide whether the LLM generated SQL query properly filters for what the natural language query is asking, based on the table schema + (optional) metadata.
- Additionally, you should check if the SQL query is valid based on the table schema (checking for syntax errors, false column names, etc.)
-
- ** TIPS **
- - Look for correct references to the table schema for column names, table names, etc.
- - Check that the SQL query can be executed; make sure JOINs, GROUP BYs, ORDER BYs, etc. are valid with respect to the table schema.
- - Check that aggregation functions (COUNT, SUM, AVG, etc.) are used appropriately with GROUP BY clauses
- - Verify that WHERE conditions use the correct operators and data types for comparisons
- - Ensure LIMIT and OFFSET clauses make sense for the query's purpose
- - Check that JOINs use the correct keys and maintain referential integrity
- - Verify that ORDER BY clauses use valid column names and sort directions
- - Check for proper handling of NULL values where relevant
- - Ensure subqueries are properly constructed and correlated when needed
- - EVEN IF THE QUERY IS VALID, IF IT DOESN'T WORK FOR THE NATURAL LANGUAGE QUERY, YOU SHOULD CHOOSE "N" AS THE ANSWER.
-
- ** FORMATTING YOUR ANSWER **
- If the SQL query is valid and works for the natural language query, choose option "Y" and otherwise "N". Provide a justification for your decision; if you choose "N", explain what about the LLM-generated SQL query is incorrect, or explain why it doesn't address the natural language query.
- IF YOUR JUSTIFICATION SHOWS THAT THE SQL QUERY IS VALID AND WORKS FOR THE NATURAL LANGUAGE QUERY, YOU SHOULD CHOOSE "Y" AS THE ANSWER.
- IF THE SQL QUERY IS INVALID, YOU SHOULD CHOOSE "N" AS THE ANSWER.
-
- ** YOUR TURN **
- Natural language query:
- {{input}}
-
- LLM generated SQL query:
- {{actual_output}}
-
- Table schema:
- {{context}}
- """
-     }],
-     options={
-         "Y": 1.0,
-         "N": 0.0
-     }
- )
-
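
For orientation: the removed Text2SQLScorer is a ClassifierScorer, so the judge model answers with one of the configured options and that choice becomes the score. The snippet below is a standalone restatement of that mapping under the options and threshold configured above; it is illustrative only and does not call judgeval.

    def classifier_score(choice: str, options: dict, threshold: float):
        # Maps the judge's chosen option to its configured score, e.g. "Y" -> 1.0, "N" -> 0.0
        score = options[choice]
        return score, score >= threshold  # with threshold=1.0, only a "Y" verdict passes

    score, passed = classifier_score("N", {"Y": 1.0, "N": 0.0}, threshold=1.0)
    # score == 0.0, passed is False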

judgeval/scorers/judgeval_scorers/local_implementations/__init__.py (deleted)
@@ -1,24 +0,0 @@
- from judgeval.scorers.judgeval_scorers.local_implementations.answer_relevancy.answer_relevancy_scorer import AnswerRelevancyScorer
- from judgeval.scorers.judgeval_scorers.local_implementations.contextual_precision.contextual_precision_scorer import ContextualPrecisionScorer
- from judgeval.scorers.judgeval_scorers.local_implementations.contextual_recall.contextual_recall_scorer import ContextualRecallScorer
- from judgeval.scorers.judgeval_scorers.local_implementations.contextual_relevancy.contextual_relevancy_scorer import ContextualRelevancyScorer
- from judgeval.scorers.judgeval_scorers.local_implementations.faithfulness.faithfulness_scorer import FaithfulnessScorer
- from judgeval.scorers.judgeval_scorers.local_implementations.json_correctness.json_correctness_scorer import JsonCorrectnessScorer
- from judgeval.scorers.judgeval_scorers.local_implementations.tool_correctness.tool_correctness_scorer import ToolCorrectnessScorer
- from judgeval.scorers.judgeval_scorers.local_implementations.hallucination.hallucination_scorer import HallucinationScorer
- from judgeval.scorers.judgeval_scorers.local_implementations.summarization.summarization_scorer import SummarizationScorer
- from judgeval.scorers.judgeval_scorers.local_implementations.answer_correctness.answer_correctness_scorer import AnswerCorrectnessScorer
-
-
- __all__ = [
-     "AnswerCorrectnessScorer",
-     "AnswerRelevancyScorer",
-     "ContextualPrecisionScorer",
-     "ContextualRecallScorer",
-     "ContextualRelevancyScorer",
-     "FaithfulnessScorer",
-     "JsonCorrectnessScorer",
-     "ToolCorrectnessScorer",
-     "HallucinationScorer",
-     "SummarizationScorer",
- ]

judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py (deleted)
@@ -1,4 +0,0 @@
- from judgeval.scorers.judgeval_scorers.local_implementations.answer_correctness.answer_correctness_scorer import AnswerCorrectnessScorer
-
-
- __all__ = ["AnswerCorrectnessScorer"]

judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py (deleted)
@@ -1,277 +0,0 @@
- from typing import Optional, List, Union, Tuple
- from pydantic import BaseModel
-
- from judgeval.constants import APIScorer
- from judgeval.judges import JudgevalJudge
- from judgeval.judges.utils import create_judge
- from judgeval.data import Example, ExampleParams
- from judgeval.scorers import JudgevalScorer
- from judgeval.scorers.utils import (
-     get_or_create_event_loop,
-     parse_response_json,
-     scorer_progress_meter,
-     create_verbose_logs,
-     check_example_params,
- )
- from .prompts import (
-     ACVerdict,
-     AnswerCorrectnessTemplate,
-     Statements,
-     Verdicts,
-     Reason,
- )
-
-
- required_params = [
-     ExampleParams.INPUT,
-     ExampleParams.ACTUAL_OUTPUT,
-     ExampleParams.EXPECTED_OUTPUT,
- ]
-
-
- class AnswerCorrectnessScorer(JudgevalScorer):
-     def __init__(
-         self,
-         threshold: float = 0.5,
-         model: Optional[Union[str, JudgevalJudge]] = None,
-         include_reason: bool = True,
-         async_mode: bool = True,
-         strict_mode: bool = False,
-         verbose_mode: bool = False
-     ):
-         super().__init__(
-             score_type=APIScorer.ANSWER_CORRECTNESS,
-             threshold=1 if strict_mode else threshold,
-             evaluation_model=None,
-             include_reason=include_reason,
-             async_mode=async_mode,
-             strict_mode=strict_mode,
-             verbose_mode=verbose_mode
-         )
-         self.model, self.using_native_model = create_judge(model)
-         self.evaluation_model = self.model.get_model_name()
-
-     async def _a_get_statements(self, expected_output: str) -> List[str]:
-         prompt = AnswerCorrectnessTemplate.deduce_statements(
-             expected_output=expected_output,
-         )
-         if self.using_native_model:
-             res = await self.model.a_generate(prompt)
-             data = parse_response_json(res, self)
-             return data["statements"]
-         else:
-             try:
-                 res: Statements = await self.model.a_generate(
-                     prompt, schema=Statements
-                 )
-                 return res.statements
-             except TypeError:
-                 res = await self.model.a_generate(prompt)
-                 data = parse_response_json(res, self)
-                 return data["statements"]
-
-     def _get_statements(self, expected_output: str) -> List[str]:
-         prompt = AnswerCorrectnessTemplate.deduce_statements(
-             expected_output=expected_output,
-         )
-         if self.using_native_model:
-             res = self.model.generate(prompt)
-             data = parse_response_json(res, self)
-             return data["statements"]
-         else:
-             try:
-                 res: Statements = self.model.generate(
-                     prompt, schema=Statements
-                 )
-                 return res.statements
-             except TypeError:
-                 res = self.model.generate(prompt)
-                 data = parse_response_json(res, self)
-                 return data["statements"]
-
-     async def _a_get_verdicts(self, actual_output: str) -> List[ACVerdict]:
-         if len(self.statements) == 0:
-             return []
-
-         prompt = AnswerCorrectnessTemplate.generate_verdicts(
-             actual_output=actual_output,
-             statements=self.statements,
-         )
-
-         if self.using_native_model:
-             res = await self.model.a_generate(prompt)
-             data = parse_response_json(res, self)
-             return [ACVerdict(**item) for item in data["verdicts"]]
-         else:
-             try:
-                 res: Verdicts = await self.model.a_generate(prompt, schema=Verdicts)
-                 return [item for item in res.verdicts]
-             except TypeError:
-                 res = await self.model.a_generate(prompt)
-                 data = parse_response_json(res, self)
-                 return [ACVerdict(**item) for item in data["verdicts"]]
-
-     def _get_verdicts(self, actual_output: str) -> List[ACVerdict]:
-         if len(self.statements) == 0:
-             return []
-
-         prompt = AnswerCorrectnessTemplate.generate_verdicts(
-             actual_output=actual_output,
-             statements=self.statements,
-         )
-
-         if self.using_native_model:
-             res = self.model.generate(prompt)
-             data = parse_response_json(res, self)
-             return [ACVerdict(**item) for item in data["verdicts"]]
-         else:
-             try:
-                 res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-                 return [item for item in res.verdicts]
-             except TypeError:
-                 res = self.model.generate(prompt)
-                 data = parse_response_json(res, self)
-                 return [ACVerdict(**item) for item in data["verdicts"]]
-
-     async def _a_get_reason(self) -> str:
-         if self.include_reason is False:
-             return None
-
-         incorrect_statements: List[Tuple[str, str]] = []
-         for idx, verdict in enumerate(self.verdicts):
-             if verdict.verdict.strip().lower() == "no":
-                 incorrect_statements.append((self.statements[idx], verdict.reason))
-
-         prompt = AnswerCorrectnessTemplate.generate_reason(
-             incorrect_statements=incorrect_statements,
-             score=format(self.score, ".2f"),
-         )
-         if self.using_native_model:
-             res = await self.model.a_generate(prompt)
-             data = parse_response_json(res, self)
-             return data["reason"]
-         else:
-             try:
-                 res: Reason = await self.model.a_generate(
-                     prompt=prompt, schema=Reason
-                 )
-                 return res.reason
-             except TypeError:
-                 res = await self.model.a_generate(prompt)
-                 data = parse_response_json(res, self)
-                 return data["reason"]
-
-     def _get_reason(self) -> str:
-         if self.include_reason is False:
-             return None
-
-         incorrect_statements: List[Tuple[str, str]] = []
-         for idx, verdict in enumerate(self.verdicts):
-             if verdict.verdict.strip().lower() == "no":
-                 incorrect_statements.append((self.statements[idx], verdict.reason))
-
-         prompt = AnswerCorrectnessTemplate.generate_reason(
-             incorrect_statements=incorrect_statements,
-             score=format(self.score, ".2f"),
-         )
-         if self.using_native_model:
-             res = self.model.generate(prompt)
-             data = parse_response_json(res, self)
-             return data["reason"]
-         else:
-             try:
-                 res: Reason = self.model.generate(
-                     prompt=prompt, schema=Reason
-                 )
-                 return res.reason
-             except TypeError:
-                 res = self.model.generate(prompt)
-                 data = parse_response_json(res, self)
-                 return data["reason"]
-
-     def _compute_score(self) -> float:
-         number_of_verdicts = len(self.verdicts)
-         if number_of_verdicts == 0:
-             return 1
-
-         correct_count = 0
-         for verdict in self.verdicts:
-             if verdict.verdict.strip().lower() == "yes":
-                 correct_count += 1
-
-         score = correct_count / number_of_verdicts
-         return 0 if self.strict_mode and score < self.threshold else score
-
-     def score_example(
-         self,
-         example: Example,
-         _show_indicator: bool = True,
-     ) -> float:
-         check_example_params(example, required_params, self)
-
-         with scorer_progress_meter(self, display_meter=_show_indicator):
-             try:
-                 if self.async_mode:
-                     loop = get_or_create_event_loop()
-                     loop.run_until_complete(
-                         self.a_score_example(example, _show_indicator=False)
-                     )
-                 else:
-                     self.statements = self._get_statements(example.expected_output)
-                     self.verdicts = self._get_verdicts(example.actual_output)
-                     self.score = self._compute_score()
-                     self.reason = self._get_reason()
-                     self.success = self.score >= self.threshold
-                     self.verbose_logs = create_verbose_logs(
-                         self,
-                         steps=[
-                             f"Statements:\n{self.statements}",
-                             f"Verdicts:\n{[v.model_dump() for v in self.verdicts]}",
-                             f"Score: {self.score}\nReason: {self.reason}",
-                         ],
-                     )
-                 return self.score
-             except Exception as e:
-                 print(f"Error in score_example for AnswerCorrectnessScorer: {e}")
-                 raise
-
-     async def a_score_example(
-         self,
-         example: Example,
-         _show_indicator: bool = True,
-     ) -> float:
-         check_example_params(example, required_params, self)
-
-         with scorer_progress_meter(self, async_mode=True, display_meter=_show_indicator):
-             try:
-                 self.statements: List[str] = await self._a_get_statements(example.expected_output)
-                 self.verdicts: List[ACVerdict] = await self._a_get_verdicts(example.actual_output)
-                 self.score = self._compute_score()
-                 self.reason = await self._a_get_reason()
-                 self.success = self.score >= self.threshold
-                 self.verbose_logs = create_verbose_logs(
-                     self,
-                     steps=[
-                         f"Statements:\n{self.statements}",
-                         f"Verdicts:\n{[v.model_dump() for v in self.verdicts]}",
-                         f"Score: {self.score}\nReason: {self.reason}",
-                     ],
-                 )
-                 return self.score
-             except Exception as e:
-                 print(f"Error in a_score_example for AnswerCorrectnessScorer: {e}")
-                 raise
-
-     def _success_check(self) -> bool:
-         if self.error is not None:
-             self.success = False
-         else:
-             try:
-                 self.success = self.score >= self.threshold
-             except:
-                 self.success = False
-         return self.success
-
-     @property
-     def __name__(self):
-         return "Answer Correctness"

judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py (deleted)
@@ -1,169 +0,0 @@
- """
- Util prompts for AnswerCorrectnessScorer
- """
-
- from typing import List, Optional, Tuple
- from pydantic import BaseModel, Field
-
-
- # BaseModels to enforce formatting in LLM JSON response
- class Statements(BaseModel):
-     statements: List[str]
-
-
- class ACVerdict(BaseModel):
-     verdict: str
-     reason: str
-
-
- class Verdicts(BaseModel):
-     verdicts: List[ACVerdict]
-
-
- class Reason(BaseModel):
-     reason: str
-
-
- class AnswerCorrectnessTemplate:
-     @staticmethod
-     def deduce_statements(expected_output):
-         return f"""You will be presented with a piece of text. Your task is to break down the text and generate a list of statements contained within the text. Single words and ambiguous phrases should be considered statements.
-
- ===== START OF EXAMPLES =====
- Example 1:
- Example text: The weather is sunny today. Temperature is 75 degrees. Don't forget your sunscreen!
-
- Output:
- {{
- "statements": ["The weather is sunny today", "Temperature is 75 degrees", "Don't forget your sunscreen!"]
- }}
-
- Example 2:
- Example text: I love pizza. It has cheese and tomato sauce and the crust is crispy.
-
- Output:
- {{
- "statements": ["I love pizza", "It has cheese and tomato sauce", "The crust is crispy"]
- }}
- ===== END OF EXAMPLES =====
-
-
- **
- IMPORTANT: Please return your answer in valid JSON format, with the "statements" key mapping to a list of strings. No words or explanation is needed.
- **
-
- ==== START OF INPUT ====
- Text:
- {expected_output}
- ==== END OF INPUT ====
-
- ==== YOUR ANSWER ====
- JSON:
- """
-
-     @staticmethod
-     def generate_verdicts(statements, actual_output):
-         return f"""You will be provided with:\n
- - a list of statements from a text that we will refer to as expected output
- - a text that we will refer to as actual output\n
-
- Your task is to determine whether each statement from the expected output is correct/consistent with the actual output text.
- More specifically, you should generate a JSON object with the key "verdicts". "verdicts" will map to a list of nested JSON objects with two keys: `verdict` and `reason`.
- The "reason" key should provide an explanation for your choice, regardless of which verdict you select. Try providing quotes from the text(s) to justify your answer where possible.
- The "verdict" key be EXACTLY EITHER "yes" or "no". You should select "yes" if the statement is correct/consistent based on the actual output and "no" otherwise.
-
- ==== OUTPUT FORMATTING ====
- IMPORTANT: Please make sure to only return in JSON format, with the "verdicts" key mapping to a list of JSON objects. Each JSON object should contain keys "verdict" (one of "yes" or "no") and "reason" (str).
-
- ==== START OF EXAMPLES ====
- Example input 1: What's the capital of France?
- Example expected output statements 1: ["Paris is the capital of France", "It is located in northern France", "The city has a population of over 2 million"]
- Example actual output 1: "Paris is the capital city of France. It is situated in the northern part of the country and has over 2 million residents."
- Example JSON 1:
- {{
- "verdicts": [
- {{
- "verdict": "yes",
- "reason": "The actual output directly states 'Paris is the capital city of France', which matches the statement"
- }},
- {{
- "verdict": "yes",
- "reason": "The actual output confirms this by saying it is 'situated in the northern part of the country'"
- }},
- {{
- "verdict": "yes",
- "reason": "The actual output mentions the city 'has over 2 million residents', matching the population statement"
- }}
- ]
- }}
-
- Example input 2: What is the largest planet in our solar system?
- Example expected output statements 2: ["Jupiter is the largest planet", "It is a gas giant", "Jupiter has 79 known moons", "The Great Red Spot is a storm on Jupiter"]
- Example actual output 2: "Jupiter is the biggest planet in the solar system. It is made mostly of gas. The planet has many moons orbiting it."
- Example JSON 2:
- {{
- "verdicts": [
- {{
- "verdict": "yes",
- "reason": "The actual output confirms 'Jupiter is the biggest planet', which is equivalent to it being the largest"
- }},
- {{
- "verdict": "yes",
- "reason": "The actual output states it is 'made mostly of gas', indicating it is a gas giant"
- }},
- {{
- "verdict": "no",
- "reason": "While the actual output mentions Jupiter has 'many moons', it does not specify the exact number of 79 known moons"
- }},
- {{
- "verdict": "no",
- "reason": "The actual output makes no mention of the Great Red Spot or any storms on Jupiter"
- }}
- ]
- }}
- ==== END OF EXAMPLES ====
-
- ** LASTLY **
- Since you are tasked to choose a verdict for each statement, the number of "verdicts" SHOULD BE EXACTLY EQUAL to the number of "statements".
-
-
- ==== YOUR TURN =====
-
- Statements:
- {statements}
-
- Actual output:
- {actual_output}
-
- JSON:
- """
-
-     @staticmethod
-     def generate_reason(incorrect_statements: List[Tuple[str, str]], score: float):
-         incorrect_statements = "\n".join([f"statement: {statement}\nreason: {reason}\n------" for statement, reason in incorrect_statements])
-         return f"""==== TASK INSTRUCTIONS ====\nYou will provided with two inputs: an answer correctness score and a list of inconsistent/incorrect statements made in a model's output (with the reason why it's irrelevant). Your task is to provide a CLEAR and CONCISE reason for the answer correctness score.
- For context, there were a list of statements generated from an expected output. The model's actual output was then compared to the expected output, and we collected a list of claims made in the expected output that were either incorrect or inconsistent with the actual output.
- The score represents how well the model's output matches the expected output.
- You should explain why the score is not higher, but also include why its current score is fair.
- The incorrect statements represent parts of the model output that are incorrect or inconsistent with the expected output. The incorrect statement will be paired with the reason why it's incorrect.
- If there are no incorrect statements, instead respond with a positive remark with an upbeat encouraging tone (but don't overblow the kind attitude).
-
-
- ==== FORMATTING YOUR ANSWER ====
- IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
- Example JSON:
- {{
- "reason": "The score is <answer_relevancy_score> because <your_reason>."
- }}
-
- ==== YOUR TURN ====
- ---- ANSWER CORRECTNESS SCORE ----
- {score}
-
- ---- INCORRECT STATEMENTS ----
- {incorrect_statements}
-
- ---- YOUR RESPONSE ----
- JSON:
- """
-

judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py (deleted)
@@ -1,4 +0,0 @@
- from judgeval.scorers.judgeval_scorers.local_implementations.answer_relevancy.answer_relevancy_scorer import AnswerRelevancyScorer
-
-
- __all__ = ["AnswerRelevancyScorer"]