judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (171)
  1. judgeval/__init__.py +177 -12
  2. judgeval/api/__init__.py +519 -0
  3. judgeval/api/api_types.py +407 -0
  4. judgeval/cli.py +79 -0
  5. judgeval/constants.py +76 -47
  6. judgeval/data/__init__.py +3 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +15 -56
  9. judgeval/data/judgment_types.py +450 -0
  10. judgeval/data/result.py +29 -73
  11. judgeval/data/scorer_data.py +29 -62
  12. judgeval/data/scripts/fix_default_factory.py +23 -0
  13. judgeval/data/scripts/openapi_transform.py +123 -0
  14. judgeval/data/trace.py +121 -0
  15. judgeval/dataset/__init__.py +264 -0
  16. judgeval/env.py +52 -0
  17. judgeval/evaluation/__init__.py +344 -0
  18. judgeval/exceptions.py +27 -0
  19. judgeval/integrations/langgraph/__init__.py +13 -0
  20. judgeval/integrations/openlit/__init__.py +50 -0
  21. judgeval/judges/__init__.py +2 -3
  22. judgeval/judges/base_judge.py +2 -3
  23. judgeval/judges/litellm_judge.py +100 -20
  24. judgeval/judges/together_judge.py +101 -20
  25. judgeval/judges/utils.py +20 -24
  26. judgeval/logger.py +62 -0
  27. judgeval/prompt/__init__.py +330 -0
  28. judgeval/scorers/__init__.py +18 -25
  29. judgeval/scorers/agent_scorer.py +17 -0
  30. judgeval/scorers/api_scorer.py +45 -41
  31. judgeval/scorers/base_scorer.py +83 -38
  32. judgeval/scorers/example_scorer.py +17 -0
  33. judgeval/scorers/exceptions.py +1 -0
  34. judgeval/scorers/judgeval_scorers/__init__.py +0 -148
  35. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
  36. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
  37. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
  38. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
  39. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
  40. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
  41. judgeval/scorers/score.py +77 -306
  42. judgeval/scorers/utils.py +4 -199
  43. judgeval/tracer/__init__.py +1122 -2
  44. judgeval/tracer/constants.py +1 -0
  45. judgeval/tracer/exporters/__init__.py +40 -0
  46. judgeval/tracer/exporters/s3.py +119 -0
  47. judgeval/tracer/exporters/store.py +59 -0
  48. judgeval/tracer/exporters/utils.py +32 -0
  49. judgeval/tracer/keys.py +63 -0
  50. judgeval/tracer/llm/__init__.py +7 -0
  51. judgeval/tracer/llm/config.py +78 -0
  52. judgeval/tracer/llm/constants.py +9 -0
  53. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  54. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  55. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  56. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  57. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  58. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  59. judgeval/tracer/llm/llm_google/config.py +6 -0
  60. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  61. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  62. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  63. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  64. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  65. judgeval/tracer/llm/llm_openai/config.py +6 -0
  66. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  67. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  68. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  69. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  70. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  71. judgeval/tracer/llm/llm_together/config.py +6 -0
  72. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  73. judgeval/tracer/llm/providers.py +19 -0
  74. judgeval/tracer/managers.py +167 -0
  75. judgeval/tracer/processors/__init__.py +220 -0
  76. judgeval/tracer/utils.py +19 -0
  77. judgeval/trainer/__init__.py +14 -0
  78. judgeval/trainer/base_trainer.py +122 -0
  79. judgeval/trainer/config.py +128 -0
  80. judgeval/trainer/console.py +144 -0
  81. judgeval/trainer/fireworks_trainer.py +396 -0
  82. judgeval/trainer/trainable_model.py +243 -0
  83. judgeval/trainer/trainer.py +70 -0
  84. judgeval/utils/async_utils.py +39 -0
  85. judgeval/utils/decorators/__init__.py +0 -0
  86. judgeval/utils/decorators/dont_throw.py +37 -0
  87. judgeval/utils/decorators/use_once.py +13 -0
  88. judgeval/utils/file_utils.py +97 -0
  89. judgeval/utils/guards.py +36 -0
  90. judgeval/utils/meta.py +27 -0
  91. judgeval/utils/project.py +15 -0
  92. judgeval/utils/serialize.py +253 -0
  93. judgeval/utils/testing.py +70 -0
  94. judgeval/utils/url.py +10 -0
  95. judgeval/utils/version_check.py +28 -0
  96. judgeval/utils/wrappers/README.md +3 -0
  97. judgeval/utils/wrappers/__init__.py +15 -0
  98. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  99. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  100. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  101. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  102. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  103. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  104. judgeval/utils/wrappers/py.typed +0 -0
  105. judgeval/utils/wrappers/utils.py +35 -0
  106. judgeval/version.py +5 -0
  107. judgeval/warnings.py +4 -0
  108. judgeval-0.22.2.dist-info/METADATA +265 -0
  109. judgeval-0.22.2.dist-info/RECORD +112 -0
  110. judgeval-0.22.2.dist-info/entry_points.txt +2 -0
  111. judgeval/clients.py +0 -39
  112. judgeval/common/__init__.py +0 -8
  113. judgeval/common/exceptions.py +0 -28
  114. judgeval/common/logger.py +0 -189
  115. judgeval/common/tracer.py +0 -798
  116. judgeval/common/utils.py +0 -763
  117. judgeval/data/api_example.py +0 -111
  118. judgeval/data/datasets/__init__.py +0 -5
  119. judgeval/data/datasets/dataset.py +0 -286
  120. judgeval/data/datasets/eval_dataset_client.py +0 -193
  121. judgeval/data/datasets/ground_truth.py +0 -54
  122. judgeval/data/datasets/utils.py +0 -74
  123. judgeval/evaluation_run.py +0 -132
  124. judgeval/judges/mixture_of_judges.py +0 -248
  125. judgeval/judgment_client.py +0 -354
  126. judgeval/run_evaluation.py +0 -439
  127. judgeval/scorers/judgeval_scorer.py +0 -140
  128. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
  129. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
  130. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
  131. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
  132. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
  133. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
  134. judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
  135. judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
  136. judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
  137. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
  138. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
  139. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
  140. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
  141. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
  142. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
  143. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
  144. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
  145. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
  146. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
  147. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
  148. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
  149. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
  150. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
  151. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
  152. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
  153. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
  154. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
  155. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
  156. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
  157. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
  158. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
  159. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
  160. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
  161. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
  162. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
  163. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
  164. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
  165. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
  166. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
  167. judgeval/scorers/prompt_scorer.py +0 -439
  168. judgeval-0.0.11.dist-info/METADATA +0 -36
  169. judgeval-0.0.11.dist-info/RECORD +0 -84
  170. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
  171. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py
@@ -1,325 +0,0 @@
- """
- Code for the local implementation of the Faithfulness metric.
- """
- from typing import List, Optional, Union
- from pprint import pprint
- from judgeval.constants import APIScorer
- from judgeval.data import (
-     Example,
-     ExampleParams
- )
- from judgeval.scorers import JudgevalScorer
- from judgeval.scorers.utils import (
-     get_or_create_event_loop,
-     check_example_params
- )
- from judgeval.judges.utils import create_judge
- from judgeval.judges import JudgevalJudge
- from judgeval.scorers.utils import (
-     scorer_progress_meter,
-     create_verbose_logs,
-     parse_response_json
- )
- from judgeval.scorers.judgeval_scorers.local_implementations.faithfulness.prompts import (
-     FaithfulnessTemplate,
-     FaithfulnessVerdict,
-     Verdicts,
-     Reason,
-     Claims,
- )
-
-
- required_params = [
-     ExampleParams.INPUT,
-     ExampleParams.ACTUAL_OUTPUT,
-     ExampleParams.RETRIEVAL_CONTEXT,
- ]
-
-
- class FaithfulnessScorer(JudgevalScorer):
-     def __init__(
-         self,
-         threshold: float = 0.5,
-         model: Optional[Union[str, JudgevalJudge]] = None,
-         include_reason: bool = True,
-         async_mode: bool = True,
-         strict_mode: bool = False,
-         verbose_mode: bool = False,
-         user: Optional[str] = None
-     ):
-         super().__init__(
-             score_type=APIScorer.FAITHFULNESS,
-             threshold=1 if strict_mode else threshold,
-             evaluation_model=None,
-             include_reason=include_reason,
-             async_mode=async_mode,
-             strict_mode=strict_mode,
-             verbose_mode=verbose_mode
-         )
-         self.user = user
-         self.model, self.using_native_model = create_judge(model)
-         self.using_native_model = True  # NOTE: SETTING THIS FOR LITELLM and TOGETHER usage
-         self.evaluation_model = self.model.get_model_name()
-
-     def score_example(
-         self,
-         example: Example,
-         all_claims: bool = False,
-         _show_indicator: bool = True,
-     ) -> float:
-         check_example_params(example, required_params, self)
-
-         with scorer_progress_meter(self, display_meter=_show_indicator):
-             if self.async_mode:
-                 loop = get_or_create_event_loop()
-                 loop.run_until_complete(
-                     self.a_score_example(
-                         example,
-                         all_claims=all_claims,
-                         _show_indicator=False
-                     )
-                 )
-             else:
-                 self.claims = self._generate_claims(example.actual_output, all_claims=all_claims)
-                 if self.additional_metadata is None:
-                     self.additional_metadata = {}
-                 self.additional_metadata["claims"] = self.claims  # Add claims generated to metadata
-
-                 self.verdicts = self._generate_verdicts(example.retrieval_context)
-                 self.additional_metadata["verdicts"] = [v.model_dump() for v in self.verdicts]  # Add verdicts generated to metadata
-
-                 self.score = self._calculate_score()
-                 self.reason = self._generate_reason()
-                 self.success = self.score >= self.threshold
-                 self.verbose_logs = create_verbose_logs(
-                     self,
-                     steps=[
-                         f"Claims:\n{self.claims}",
-                         f"Verdicts:\n{self.verdicts}",
-                         f"Score: {self.score}\nReason: {self.reason}",
-                     ],
-                 )
-
-             return self.score
-
-     async def a_score_example(
-         self,
-         example: Example,
-         _show_indicator: bool = True
-     ) -> float:
-         check_example_params(example, required_params, self)
-
-         with scorer_progress_meter(
-             self, async_mode=True, display_meter=_show_indicator
-         ):
-             self.claims = await self._a_generate_claims(example.actual_output)
-
-
-             if self.additional_metadata is None:
-                 self.additional_metadata = {}
-             self.additional_metadata["claims"] = self.claims
-
-             self.verdicts = await self._a_generate_verdicts(example.retrieval_context)
-
-             self.additional_metadata["verdicts"] = [v.model_dump() for v in self.verdicts]  # Add verdicts generated to metadata
-
-             self.score = self._calculate_score()
-             self.reason = await self._a_generate_reason()
-             self.success = self.score >= self.threshold
-             self.verbose_logs = create_verbose_logs(
-                 self,
-                 steps=[
-                     f"Claims:\n{self.claims}",
-                     f"Verdicts:\n{self.verdicts}",
-                     f"Score: {self.score}\nReason: {self.reason}",
-                 ],
-             )
-
-             return self.score
-
-     async def _a_generate_reason(self) -> str:
-         if self.include_reason is False:
-             return None
-
-         contradictions = []
-         for verdict in self.verdicts:
-             if verdict.verdict.strip().lower() == "no":
-                 contradictions.append(verdict.model_dump())
-
-         prompt: dict = FaithfulnessTemplate.justify_reason(
-             contradictions=contradictions,
-             score=format(self.score, ".2f"),
-         )
-         if self.using_native_model:
-             res = await self.model.a_generate(prompt)
-             data = parse_response_json(res, self)
-             return data["reason"]
-         else:
-             try:
-                 res: Reason = await self.model.a_generate(prompt, schema=Reason)
-                 return res.reason
-             except TypeError:
-                 res = await self.model.a_generate(prompt)
-                 data = parse_response_json(res, self)
-                 return data["reason"]
-
-     def _generate_reason(self) -> str:
-         if self.include_reason is False:
-             return None
-
-         contradictions = []
-         for verdict in self.verdicts:
-             if verdict.verdict.strip().lower() == "no":
-                 contradictions.append(verdict.reason)
-
-         prompt: dict = FaithfulnessTemplate.justify_reason(
-             contradictions=contradictions,
-             score=format(self.score, ".2f"),
-         )
-
-         if self.using_native_model:
-             res = self.model.generate(prompt)
-             data = parse_response_json(res, self)
-             return data["reason"]
-         else:
-             try:
-                 res: Reason = self.model.generate(prompt, schema=Reason)
-                 return res.reason
-             except TypeError:
-                 res = self.model.generate(prompt)
-                 data = parse_response_json(res, self)
-                 return data["reason"]
-
-     async def _a_generate_verdicts(self, retrieval_context: str) -> List[FaithfulnessVerdict]:
-         if len(self.claims) == 0:
-             return []
-
-         verdicts: List[FaithfulnessVerdict] = []
-
-         claims = [
-             claim["claim"] for claim in self.claims
-         ]  # We only need the claims, not the quotes involved
-
-         prompt = FaithfulnessTemplate.create_verdicts(
-             claims=claims,
-             retrieval_context=retrieval_context,
-         )
-         if self.using_native_model:
-             res = await self.model.a_generate(prompt)
-             data = parse_response_json(res, self)
-             verdicts = [
-                 FaithfulnessVerdict(**item) for item in data["verdicts"]
-             ]
-             return verdicts
-         else:
-             try:
-                 res: Verdicts = await self.model.generate(
-                     prompt, schema=Verdicts
-                 )
-                 verdicts = [item for item in res.verdicts]
-                 return verdicts
-             except TypeError:
-                 res = await self.model.a_generate(prompt)
-                 data = parse_response_json(res, self)
-                 verdicts = [
-                     FaithfulnessVerdict(**item) for item in data["verdicts"]
-                 ]
-                 return verdicts
-
-     def _generate_verdicts(self, retrieval_context: str) -> List[FaithfulnessVerdict]:
-         if len(self.claims) == 0:
-             return []
-
-         verdicts: List[FaithfulnessVerdict] = []
-
-         claims = [
-             claim["claim"] for claim in self.claims
-         ]  # We only need the claims, not the quotes involved
-
-         prompt = FaithfulnessTemplate.create_verdicts(
-             claims=claims,
-             retrieval_context=retrieval_context,
-         )
-         if self.using_native_model:
-             res = self.model.generate(prompt)
-             data = parse_response_json(res, self)
-             verdicts = [
-                 FaithfulnessVerdict(**item) for item in data["verdicts"]
-             ]
-             return verdicts
-         else:
-             try:
-                 res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-                 verdicts = [item for item in res.verdicts]
-                 return verdicts
-             except TypeError:
-                 res = self.model.generate(prompt)
-                 data = parse_response_json(res, self)
-                 verdicts = [
-                     FaithfulnessVerdict(**item) for item in data["verdicts"]
-                 ]
-                 return verdicts
-
-     async def _a_generate_claims(self, actual_output: str) -> List[str]:
-         prompt = FaithfulnessTemplate.find_claims(text=actual_output)
-         if self.using_native_model:
-             res = await self.model.a_generate(prompt)
-             data = parse_response_json(res, self)
-             return data["claims"]
-         else:
-             try:
-                 res: Claims = await self.model.a_generate(prompt, schema=Claims)
-                 return res.claims
-             except TypeError:
-                 res = await self.model.a_generate(prompt)
-                 data = parse_response_json(res, self)
-                 return data["claims"]
-
-     def _generate_claims(self, actual_output: str, all_claims: bool = False) -> List[str]:
-         prompt = FaithfulnessTemplate.find_claims(text=actual_output)
-         if self.using_native_model:
-             res = self.model.generate(prompt)
-             data = parse_response_json(res, self)
-             return data["claims"]
-         else:
-             try:
-                 res: Claims = self.model.generate(prompt, schema=Claims)
-                 return res.claims
-             except TypeError:
-                 res = self.model.generate(prompt)
-                 data = parse_response_json(res, self)
-                 return data["claims"]
-
-     def _calculate_score(self) -> float:
-         number_of_verdicts = len(self.verdicts)
-         if number_of_verdicts == 0:
-             return 1
-
-         faithfulness_count = 0
-         for verdict in self.verdicts:
-             if verdict.verdict.strip().lower() != "no":
-                 faithfulness_count += 1
-
-         score = faithfulness_count / number_of_verdicts
-         return 0 if self.strict_mode and score < self.threshold else score
-
-     def _success_check(self) -> bool:
-         if self.error is not None:
-             self.success = False
-         else:
-             try:
-                 self.success = self.score >= self.threshold
-             except:
-                 self.success = False
-         return self.success
-
-     def get_claims(self):
-         return self.claims
-
-     def get_verdicts(self):
-         return self.verdicts
-
-     @property
-     def __name__(self):
-         return "Faithfulness"
-
judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py
@@ -1,268 +0,0 @@
- from typing import List, Optional
- from pydantic import BaseModel, Field
-
-
- class FaithfulnessVerdict(BaseModel):
-     verdict: str
-     reason: Optional[str] = Field(default=None)
-
-
- class Verdicts(BaseModel):
-     verdicts: List[FaithfulnessVerdict]
-
-
- class Truths(BaseModel):
-     truths: List[str]
-
-
- class Claims(BaseModel):
-     claims: List[str]
-
-
- class Reason(BaseModel):
-     reason: str
-
-
- class FaithfulnessTemplate:
-     @staticmethod
-     def find_claims(text):
-         return f"""==== TASK INSTRUCTIONS ====
- You will be provided with a passage of text. Based on the text, your task is to generate a comprehensive list of ALL CLAIMS that can be inferred from the text.
- For every claim that you derive from the text, provide the source of the claim via quoting the original text. Please try to extract EVERY CLAIM that is in the original text; priortize generating the most claims rather than being concise.
- You should NOT include any prior knowledge, and take the text at face value when extracting claims.
-
- ==== FORMATTING YOUR ANSWER ====
- Please return your answer in JSON format, with the "claims" key as a list of JSON objects with keys "claim" and "quote". No words or explanation beyond the output JSON is needed.
-
- ==== EXAMPLES ====
-
- ---- START OF EXAMPLE 1 ----
- Example Text:
- "Einstein won the nobel prize in 1968 for his discovery of the photoelectric effect."
-
- Example JSON:
- {{
- "claims": [
- {{
- "claim": "Einstein won the nobel prize for his discovery of the photoelectric effect.",
- "quote": "Einstein won the nobel prize in 1968 for his discovery of the photoelectric effect."
- }},
- {{
- "claim": "Einstein won the nobel prize in 1968.",
- "quote": "Einstein won the nobel prize in 1968 for his discovery of the photoelectric effect."
- }}
- ]
- }}
- ---- END OF EXAMPLE 1 ----
-
- ---- START OF EXAMPLE 2 ----
- Example Text: "The Wright brothers successfully flew the first powered airplane on December 17, 1903, in Kitty Hawk, North Carolina."
-
- {{
- "claims": [
- {{
- "claim": "The Wright brothers flew the first powered airplane.",
- "quote": "The Wright brothers successfully flew the first powered airplane on December 17, 1903, in Kitty Hawk, North Carolina."
- }},
- {{
- "claim": "The Wright brothers made their flight in Kitty Hawk, North Carolina.",
- "quote": "The Wright brothers successfully flew the first powered airplane on December 17, 1903, in Kitty Hawk, North Carolina."
- }},
- {{
- "claim": "The first powered airplane flight occurred on December 17, 1903.",
- "quote": "The Wright brothers successfully flew the first powered airplane on December 17, 1903, in Kitty Hawk, North Carolina."
- }}
- ]
- }}
- ---- END OF EXAMPLE 2 ----
-
- ---- START OF EXAMPLE 3 ----
- Example Text:
- "The Great Wall of China was built over many centuries by different Chinese dynasties. Construction began more than 2,000 years ago during the Warring States period. The most famous sections were built during the Ming Dynasty. The wall stretches for thousands of miles across northern China and was primarily built for military defense."
-
- Example JSON:
- {{
- "claims": [
- {{
- "claim": "The Great Wall of China was built by multiple Chinese dynasties",
- "quote": "The Great Wall of China was built over many centuries by different Chinese dynasties."
- }},
- {{
- "claim": "Construction of the Great Wall began over 2,000 years ago",
- "quote": "Construction began more than 2,000 years ago during the Warring States period."
- }},
- {{
- "claim": "Construction started during the Warring States period",
- "quote": "Construction began more than 2,000 years ago during the Warring States period."
- }},
- {{
- "claim": "The most well-known parts of the wall were constructed during the Ming Dynasty",
- "quote": "The most famous sections were built during the Ming Dynasty."
- }},
- {{
- "claim": "The Great Wall extends for thousands of miles",
- "quote": "The wall stretches for thousands of miles across northern China"
- }},
- {{
- "claim": "The wall is located in northern China",
- "quote": "The wall stretches for thousands of miles across northern China"
- }},
- {{
- "claim": "The Great Wall was constructed for defensive military purposes",
- "quote": "was primarily built for military defense."
- }}
- ]
- }}
- ---- END OF EXAMPLE 3 ----
-
- ==== END OF EXAMPLES ====
-
- ==== YOUR TURN ====
-
- Example Text:
- {text}
-
- JSON:
- """
-
-     @staticmethod
-     def create_verdicts(claims, retrieval_context):
-         return f"""==== TASK INSTRUCTIONS ====
- You will be provided with a list of claims from an LLM's output text, accompanied by the retrieval documents that the LLM used to generate the output.
- I'm pretty sure that many of the claims are factually contradictory to the retrieval context, but I want you to double check that I'm right.
- For each claim, choose one of ("yes", "no", or "idk") to represent whether the claim is correct based on the retrieval context.
- YOU SHOULD be very scrutinous--if any part of the claim is contradicted by the retrieval context, you should choose "no". Think really hard about finding the contradictions, since they can be subtle!
-
- Choose 'no' if the retrieval context CONTRADICTS the claims. YOU SHOULD NEVER USE YOUR PRIOR KNOWLEDGE IN YOUR JUDGMENT.
- Claims made using vague, suggestive, or speculative language such as 'may have', 'possibility due to', do NOT count as a contradiction.
- Claims that are fuzzy based on lack of information MUST BE ANSWERED with 'idk'.
-
- ==== FORMATTING YOUR ANSWER ====
- Please return your answer in JSON format, with the 'verdicts' key as a list of JSON objects. Each JSON object should have 2 fields: 'verdict' and 'reason'.
- The 'verdict' key should be either 'yes', 'no', or 'idk', which states WHETHER THE CLAIM AGREES with the context.
- The 'reason' key should be a string explaining why the claim is 'yes', 'no', or 'idk'. Make specific reference to the retrieval context to justify your verdict.
-
- ==== EXAMPLES ====
- ---- START OF EXAMPLE 1 ----
- Example retrieval contexts: "Einstein won the Nobel Prize for his discovery of the photoelectric effect. Einstein won the Nobel Prize in 1968. Einstein is a German Scientist."
- Example claims: ["Barack Obama is a caucasian male.", "Zurich is a city in London", "Einstein won the Nobel Prize for the discovery of the photoelectric effect which may have contributed to his fame.", "Einstein won the Nobel Prize in 1969 for his discovery of the photoelectric effect.", "Einstein was a Germen chef."]
-
- Example JSON:
- {{
- "verdicts": [
- {{
- "verdict": "idk",
- "reason": "The claim about Barack Obama's ethnicity cannot be verified from the given retrieval context as it contains no information about Barack Obama."
- }},
- {{
- "verdict": "idk",
- "reason": "The claim about Zurich being a city in London cannot be verified from the given retrieval context as it contains no information about Zurich or London."
- }},
- {{
- "verdict": "yes",
- "reason": "The retrieval context confirms that Einstein won the Nobel Prize for discovering the photoelectric effect."
- }},
- {{
- "verdict": "no",
- "reason": "The actual output claims Einstein won the Nobel Prize in 1969, which is untrue as the retrieval context states it is 1968 instead."
- }},
- {{
- "verdict": "no",
- "reason": "The actual output claims Einstein is a Germen chef, which is not correct as the retrieval context states he was a German scientist instead."
- }}
- ]
- }}
- ---- END OF EXAMPLE 1 ----
- ---- START OF EXAMPLE 2 ----
- Example retrieval contexts: "The Great Wall of China was built over many centuries by different Chinese dynasties. Construction began more than 2,000 years ago. The wall stretches for thousands of miles across China's northern borders. Most of the existing wall was built during the Ming Dynasty."
- Example claims: ["The Great Wall was built in a single year.", "The Great Wall may have taken centuries to complete.", "The Great Wall was built by the Romans.", "The Great Wall is located in China's northern region.", "The Great Wall is 100 meters long."]
-
- Example JSON:
- {{
- "verdicts": [
- {{
- "verdict": "no",
- "reason": "The claim that the Great Wall was built in a single year directly contradicts the retrieval context, which states it was built over many centuries."
- }},
- {{
- "verdict": "yes",
- "reason": "The retrieval context confirms that the Great Wall was built over many centuries by different Chinese dynasties."
- }},
- {{
- "verdict": "no",
- "reason": "The claim states the Romans built the wall, which contradicts the retrieval context that specifies it was built by Chinese dynasties."
- }},
- {{
- "verdict": "yes",
- "reason": "The retrieval context explicitly states that the wall stretches across China's northern borders."
- }},
- {{
- "verdict": "no",
- "reason": "The claim that the wall is 100 meters long contradicts the retrieval context which states it stretches for thousands of miles."
- }}
- ]
- }}
- ---- END OF EXAMPLE 2 ----
- ==== END OF EXAMPLES ====
-
- ==== YOUR TURN ====
- Retrieval Contexts:
- {retrieval_context}
-
- Claims:
- {claims}
-
- JSON:
- """
-
-     @staticmethod
-     def justify_reason(score, contradictions):
-         return f"""==== TASK INSTRUCTIONS ====
- You will be provided with a list of contradictions and a faithfulness score.
- The list of contradictions will be references to statements made by a RAG generator that contradicted one or more document(s) from the retrieval context.
- - To clarify, the LLM produced an `actual output` that contained claims that contradicted the `retrieval context` used to produce the output.
- The faithfulness score is a 0 - 1 float (1 is highest) indicating how factually consistent the RAG generator's output is to the retrieval context.
-
- Your task is to CLEARLY and CONCISELY summarize the contradictions to justify the score.
- If there are no contradictions, just say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).
- Your reason MUST use information from the contradictions in your reason.
-
-
- ==== FORMATTING YOUR ANSWER ====
- Please make sure to only return in JSON format, with the 'reason' key providing the reason.
-
- Example JSON:
- {{
- "reason": "The score is <faithfulness_score> because <your_reason>."
- }}
-
- ==== EXAMPLE ====
- Example Contradictions:
- [
- {{
- "verdict": "no",
- "reason": "The output claims Marie Curie was born in Russia, but the context clearly states she was born in Warsaw, Poland."
- }},
- {{
- "verdict": "no",
- "reason": "The output states Marie Curie discovered uranium, but the context indicates she discovered radium and polonium."
- }}
- ]
-
- Example Faithfulness Score:
- 0.60
-
- Example Response:
- {{
- "reason": "The score is 0.60 because the output made two significant factual errors: incorrectly stating Marie Curie was born in Russia instead of Poland, and wrongly attributing the discovery of uranium to her instead of radium and polonium."
- }}
-
- ==== YOUR TURN ====
- Faithfulness Score:
- {score}
-
- Contradictions:
- {contradictions}
-
- JSON:
- """
judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py
@@ -1,3 +0,0 @@
- from judgeval.scorers.judgeval_scorers.local_implementations.hallucination.hallucination_scorer import HallucinationScorer
-
- __all__ = ["HallucinationScorer"]