judgeval-0.0.32-py3-none-any.whl → judgeval-0.0.33-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- judgeval/common/s3_storage.py +93 -0
- judgeval/common/tracer.py +612 -123
- judgeval/data/sequence.py +4 -10
- judgeval/judgment_client.py +25 -86
- judgeval/rules.py +4 -7
- judgeval/run_evaluation.py +1 -1
- judgeval/scorers/__init__.py +4 -4
- judgeval/scorers/judgeval_scorers/__init__.py +0 -176
- {judgeval-0.0.32.dist-info → judgeval-0.0.33.dist-info}/METADATA +15 -2
- judgeval-0.0.33.dist-info/RECORD +63 -0
- judgeval/scorers/base_scorer.py +0 -58
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -27
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -276
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py +0 -0
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py +0 -161
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/prompts.py +0 -222
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
- judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/execution_order/execution_order.py +0 -156
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -318
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -264
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
- judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/instruction_adherence.py +0 -232
- judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/prompt.py +0 -102
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -551
- judgeval-0.0.32.dist-info/RECORD +0 -97
- {judgeval-0.0.32.dist-info → judgeval-0.0.33.dist-info}/WHEEL +0 -0
- {judgeval-0.0.32.dist-info → judgeval-0.0.33.dist-info}/licenses/LICENSE.md +0 -0
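Most of the new code lands in judgeval/common/tracer.py (+612 −123), together with the new judgeval/common/s3_storage.py, while the entire tree of local scorer implementations under judgeval/scorers/judgeval_scorers/local_implementations/ is removed. For orientation, a minimal sketch of the tracer entry point this diff touches is shown below; it follows judgeval's documented usage around these releases rather than anything verified against 0.0.33, so the exact import path and parameters should be treated as assumptions.

```python
# Minimal sketch, assuming judgeval's documented Tracer usage around these
# releases; parameter names and behavior in 0.0.33 are not verified here.
from judgeval.common.tracer import Tracer

judgment = Tracer(project_name="my_project")  # expects JUDGMENT_API_KEY in the environment

@judgment.observe(span_type="function")
def answer_question(question: str) -> str:
    # Work done inside an @observe-decorated function is recorded as a span
    # in the trace for this project.
    return f"echo: {question}"
```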
--- a/judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py
+++ /dev/null
@@ -1,247 +0,0 @@
-from pydantic import BaseModel, Field
-from typing import List, Optional
-from enum import Enum
-
-
-class ScoreType(Enum):
-    CONTRADICTION = "Contradiction"
-    INFO_COVERAGE = "Info Coverage"
-
-
-class ContradictionVerdict(BaseModel):
-    # yes, no, or idk
-    verdict: str
-    reason: Optional[str] = Field(default=None)
-
-
-class InfoCoverageVerdict(BaseModel):
-    summary_verdict: str
-    original_verdict: str
-    question: str = Field(default=None)
-
-
-class Verdicts(BaseModel):
-    verdicts: List[ContradictionVerdict]
-
-
-class Questions(BaseModel):
-    questions: List[str]
-
-
-class Answers(BaseModel):
-    answers: List[str]
-
-
-class Reason(BaseModel):
-    reason: str
-
-
-class SummarizationTemplate:
-    @staticmethod
-    def generate_reason(contradictions, redundancies, questions, score):
-        return f"""==== TASK INSTRUCTIONS ====
-An LLM has been tasked to summarize a text. You will be provided with the following:
-1) information in the LLM's summary contradicting the original text
-2) extra information in the LLM's summary not mentioned in the original text
-3) [Optional] questions that cannot be answered by the LLM's summary
-4) the summarization score, which is a 0-1 score indicating how good the summary is to the original text (higher the better)
-
-YOUR TASK is to use this info to explain how well the LLM performed at summarizing the text.
-Please CLEARLY and CONCISELY justify the score based on the provided information.
-
-==== FORMATTING YOUR ANSWER ====
-Please make sure to only return in JSON format, with the 'reason' key providing the reason.
-Example JSON:
-{{
-    "reason": "The score is <summarization_score> because <your_reason>."
-}}
-
-For 'None' values in contradictions, extra information, or questions that the original text can answer but not the summary, DON'T mention anything and instead offer some praise.
-
-==== EXAMPLES ====
----- START OF EXAMPLE 1 ----
-Example Contradictions:
-["The text claims Marie Curie won the Nobel Prize in Chemistry in 1903, but she actually won it in Physics that year", "The summary states she worked alone, but the original text mentions she collaborated with her husband Pierre"]
-
-Example Extra Information:
-["The summary mentions she taught at Oxford University, but this is not mentioned in the original text"]
-
-Example Questions Original Text Can Answer But Summary Cannot:
-["What other awards did Marie Curie receive besides the Nobel Prize?"]
-
-Example Score: 0.65
-
-Example Response:
-{{
-    "reason": "The score of 0.65 reflects issues with factual accuracy and coverage. The summary contains two factual errors about Curie's Nobel Prize field and her collaboration status, while also making unverified claims about Oxford. The summary also fails to address key questions about her other awards."
-}}
----- END OF EXAMPLE 1 ----
----- START OF EXAMPLE 2 ----
-Example Contradictions:
-["The summary states Shakespeare wrote 40 plays, but the original text clearly states he wrote 37 plays"]
-
-Example Extra Information:
-["The summary claims Shakespeare attended Oxford University, but this is not mentioned anywhere in the original text"]
-
-Example Questions Original Text Can Answer But Summary Cannot:
-None
-
-Example Score: 0.82
-
-Example Response:
-{{
-    "reason": "The score of 0.82 reflects a generally good summary with a few issues. While the summary contains one factual error about the number of Shakespeare's plays and makes an unverified claim about Oxford attendance, it successfully covers the key information from the original text without missing any important details."
-}}
----- END OF EXAMPLE 2 ----
-
-==== YOUR TURN ====
-Summarization Score:
-{score}
-
-Contradicting Information in the original text:
-{contradictions}
-
-Extra Information not mentioned in the original text:
-{redundancies}
-"""
-
-    @staticmethod
-    def generate_answers(questions, text):
-        return f"""==== TASK INSTRUCTIONS ====
-You will be provided with a passage of text and an accompanying list of questions.
-Your task is to determine whether the provided text contains sufficient information to answer each question by choosing 'yes' or 'no' for each question.
-To clarify, you should choose 'yes' if the provided text contains sufficient information to answer the question, and 'no' otherwise.
-
-==== FORMATTING YOUR ANSWER ====
-You should generate a JSON with key 'answers', which is a list of strings that determines whether the provided text contains sufficient information to answer EACH question.
-Since you are determining a verdict for each question, the length of 'answers' SHOULD BE STRICTLY EQUAL to that of the questions.
-
-==== EXAMPLES ====
----- START OF EXAMPLE 1 ----
-Example Text: The Eiffel Tower was completed in 1889 for the World's Fair in Paris. It stands 324 meters tall and was named after engineer Gustave Eiffel.
-Example Questions: ["Does the text contain information about when the Eiffel Tower was built?"]
-Example Answers:
-{{
-    "answers": ["yes"]
-}}
----- END OF EXAMPLE 1 ----
----- START OF EXAMPLE 2 ----
-Example Text: "The Statue of Liberty was a gift from France to the United States. It was dedicated in 1886 and stands on Liberty Island in New York Harbor."
-Example Questions: ["Does the text mention who gave the Statue of Liberty?", "Does the text indicate where the statue is located?"]
-Example Answers:
-{{
-    "answers": ["yes", "yes"]
-}}
----- END OF EXAMPLE 2 ----
-===== END OF EXAMPLES ======
-
-==== YOUR TURN ====
-Text:
-{text}
-
-Questions:
-{questions}
-
-JSON:
-"""
-
-    @staticmethod
-    def generate_questions(text, n):
-        return f"""==== TASK INSTRUCTIONS ====
-Based on the given text, generate {n} closed-ended questions that can be answered with either a 'yes' or 'no'.
-The questions generated should ALWAYS result in a 'yes' based on the given text.
-
-==== FORMATTING YOUR ANSWER ====
-Only return a JSON with a 'questions' key, which is a list of strings. The questions need to be closed ended, meaning they are answered with either 'yes' or 'no'.
-Remember that for this task, we should be able to use the given text to answer 'yes' for each question you generate.
-
-==== EXAMPLES ====
----- START OF EXAMPLE 1 ----
-Example Text: "Einstein won the Nobel Prize for his discovery of the photoelectric effect. Einstein won the Nobel Prize in 1968. Einstein is a German Scientist."
-N = 2 questions
-
-Example Answers:
-{{
-    "questions": ["Is there enough information about Einstein's nationality?", "Is there enough information to know Einstein's Nobel Prize year?"]
-}}
----- END OF EXAMPLE 1 ----
----- START OF EXAMPLE 2 ----
-Example Text: "The Great Wall of China was built over many centuries by different Chinese dynasties. Construction began more than 2,000 years ago and continued through multiple dynasties. The wall stretches for thousands of miles across China's northern borders."
-N = 2 questions
-
-Example Answers:
-{{
-    "questions": ["Does the text provide information about when construction of the Great Wall began?", "Is there information about the Great Wall's location relative to China?"]
-}}
----- END OF EXAMPLE 2 ----
-===== END OF EXAMPLES ======
-
-==== YOUR TURN ====
-Text:
-{text}
-
-N = {n}
-
-JSON:
-"""
-
-    @staticmethod
-    def generate_contradiction_verdicts(original_text, summary_claims):
-        return f"""==== TASK INSTRUCTIONS ====
-
-You will be provided with a text and a list of summary claims. The list of claims is drawn from a summary of the original text.
-Your task is to determine whether each claim is factually consistent with the original text.
-
-NOTE: You should NOT use your prior knowledge in your judgment. It does NOT matter if the claim is correct; we're just interested in whether the claim is factually consistent with the original text.
-Claims that is not backed up due to a lack of information/is not mentioned in the summary MUST be answered 'idk'.
-Claims made using vague, suggestive, speculative language such as 'may have', 'possibility due to', does NOT count as a contradiction.
-
-==== FORMATTING YOUR ANSWER ====
-You should format your answer JSON with a key 'verdicts', which is a list of JSON objects. Each JSON object corresponds to a claim in the summary claims, and should have 2 fields: 'verdict' and 'reason'.
-The 'verdict' key should be EXACTLY one of 'yes', 'no', or 'idk', which represents whether the given summary claim agrees with the original text.
-The 'reason' key should be a string that provides a justification for the verdict. You should reference the original text in your reason where appropriate.
-
-Since you are determining a verdict for each claim, the length of 'verdicts' SHOULD BE EXACTLY EQUAL to that of the summary claims.
-
-
-==== EXAMPLE ====
-Example Original Text: "Einstein won the Nobel Prize for his discovery of the photoelectric effect. Einstein won the Nobel Prize in 1968. Einstein is a German Scientist."
-Example Summary Claims: ["Barack Obama is a caucasian male.", "Zurich is a city in London", "Einstein won the Nobel Prize for the discovery of the photoelectric effect which may have contributed to his fame.", "Einstein won the Nobel Prize in 1969 for his discovery of the photoelectric effect.", "Einstein was a Germen chef."]
-
-Example:
-{{
-    "verdicts": [
-        {{
-            "verdict": "idk",
-            "reason": "The original text does not mention Barack Obama at all, let alone his racial features."
-        }},
-        {{
-            "verdict": "idk",
-            "reason": "The original text does not mention Zurich, nor does it mention Zurich being in London"
-        }},
-        {{
-            "verdict": "yes",
-            "reason": "The original text directly states that Einstein won the Nobel Prize for his discovery of the photoelectric effect, which matches this claim."
-        }},
-        {{
-            "verdict": "no",
-            "reason": "The summary claims Einstein won the Nobel Prize in 1969, which is untrue as the original text states it is 1968 instead."
-        }},
-        {{
-            "verdict": "no",
-            "reason": "The summary claims Einstein is a Germen chef, which is not correct as the original text states he was a German scientist instead."
-        }}
-    ]
-}}
-===== END OF EXAMPLE ======
-
-
-==== YOUR TURN ====
-Original Text:
-{original_text}
-
-Summary Claims:
-{summary_claims}
-
-JSON:
-"""