judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of judgeval might be problematic.

Files changed (171)
  1. judgeval/__init__.py +177 -12
  2. judgeval/api/__init__.py +519 -0
  3. judgeval/api/api_types.py +407 -0
  4. judgeval/cli.py +79 -0
  5. judgeval/constants.py +76 -47
  6. judgeval/data/__init__.py +3 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +15 -56
  9. judgeval/data/judgment_types.py +450 -0
  10. judgeval/data/result.py +29 -73
  11. judgeval/data/scorer_data.py +29 -62
  12. judgeval/data/scripts/fix_default_factory.py +23 -0
  13. judgeval/data/scripts/openapi_transform.py +123 -0
  14. judgeval/data/trace.py +121 -0
  15. judgeval/dataset/__init__.py +264 -0
  16. judgeval/env.py +52 -0
  17. judgeval/evaluation/__init__.py +344 -0
  18. judgeval/exceptions.py +27 -0
  19. judgeval/integrations/langgraph/__init__.py +13 -0
  20. judgeval/integrations/openlit/__init__.py +50 -0
  21. judgeval/judges/__init__.py +2 -3
  22. judgeval/judges/base_judge.py +2 -3
  23. judgeval/judges/litellm_judge.py +100 -20
  24. judgeval/judges/together_judge.py +101 -20
  25. judgeval/judges/utils.py +20 -24
  26. judgeval/logger.py +62 -0
  27. judgeval/prompt/__init__.py +330 -0
  28. judgeval/scorers/__init__.py +18 -25
  29. judgeval/scorers/agent_scorer.py +17 -0
  30. judgeval/scorers/api_scorer.py +45 -41
  31. judgeval/scorers/base_scorer.py +83 -38
  32. judgeval/scorers/example_scorer.py +17 -0
  33. judgeval/scorers/exceptions.py +1 -0
  34. judgeval/scorers/judgeval_scorers/__init__.py +0 -148
  35. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
  36. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
  37. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
  38. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
  39. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
  40. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
  41. judgeval/scorers/score.py +77 -306
  42. judgeval/scorers/utils.py +4 -199
  43. judgeval/tracer/__init__.py +1122 -2
  44. judgeval/tracer/constants.py +1 -0
  45. judgeval/tracer/exporters/__init__.py +40 -0
  46. judgeval/tracer/exporters/s3.py +119 -0
  47. judgeval/tracer/exporters/store.py +59 -0
  48. judgeval/tracer/exporters/utils.py +32 -0
  49. judgeval/tracer/keys.py +63 -0
  50. judgeval/tracer/llm/__init__.py +7 -0
  51. judgeval/tracer/llm/config.py +78 -0
  52. judgeval/tracer/llm/constants.py +9 -0
  53. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  54. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  55. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  56. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  57. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  58. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  59. judgeval/tracer/llm/llm_google/config.py +6 -0
  60. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  61. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  62. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  63. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  64. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  65. judgeval/tracer/llm/llm_openai/config.py +6 -0
  66. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  67. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  68. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  69. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  70. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  71. judgeval/tracer/llm/llm_together/config.py +6 -0
  72. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  73. judgeval/tracer/llm/providers.py +19 -0
  74. judgeval/tracer/managers.py +167 -0
  75. judgeval/tracer/processors/__init__.py +220 -0
  76. judgeval/tracer/utils.py +19 -0
  77. judgeval/trainer/__init__.py +14 -0
  78. judgeval/trainer/base_trainer.py +122 -0
  79. judgeval/trainer/config.py +128 -0
  80. judgeval/trainer/console.py +144 -0
  81. judgeval/trainer/fireworks_trainer.py +396 -0
  82. judgeval/trainer/trainable_model.py +243 -0
  83. judgeval/trainer/trainer.py +70 -0
  84. judgeval/utils/async_utils.py +39 -0
  85. judgeval/utils/decorators/__init__.py +0 -0
  86. judgeval/utils/decorators/dont_throw.py +37 -0
  87. judgeval/utils/decorators/use_once.py +13 -0
  88. judgeval/utils/file_utils.py +97 -0
  89. judgeval/utils/guards.py +36 -0
  90. judgeval/utils/meta.py +27 -0
  91. judgeval/utils/project.py +15 -0
  92. judgeval/utils/serialize.py +253 -0
  93. judgeval/utils/testing.py +70 -0
  94. judgeval/utils/url.py +10 -0
  95. judgeval/utils/version_check.py +28 -0
  96. judgeval/utils/wrappers/README.md +3 -0
  97. judgeval/utils/wrappers/__init__.py +15 -0
  98. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  99. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  100. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  101. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  102. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  103. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  104. judgeval/utils/wrappers/py.typed +0 -0
  105. judgeval/utils/wrappers/utils.py +35 -0
  106. judgeval/version.py +5 -0
  107. judgeval/warnings.py +4 -0
  108. judgeval-0.22.2.dist-info/METADATA +265 -0
  109. judgeval-0.22.2.dist-info/RECORD +112 -0
  110. judgeval-0.22.2.dist-info/entry_points.txt +2 -0
  111. judgeval/clients.py +0 -39
  112. judgeval/common/__init__.py +0 -8
  113. judgeval/common/exceptions.py +0 -28
  114. judgeval/common/logger.py +0 -189
  115. judgeval/common/tracer.py +0 -798
  116. judgeval/common/utils.py +0 -763
  117. judgeval/data/api_example.py +0 -111
  118. judgeval/data/datasets/__init__.py +0 -5
  119. judgeval/data/datasets/dataset.py +0 -286
  120. judgeval/data/datasets/eval_dataset_client.py +0 -193
  121. judgeval/data/datasets/ground_truth.py +0 -54
  122. judgeval/data/datasets/utils.py +0 -74
  123. judgeval/evaluation_run.py +0 -132
  124. judgeval/judges/mixture_of_judges.py +0 -248
  125. judgeval/judgment_client.py +0 -354
  126. judgeval/run_evaluation.py +0 -439
  127. judgeval/scorers/judgeval_scorer.py +0 -140
  128. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
  129. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
  130. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
  131. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
  132. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
  133. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
  134. judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
  135. judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
  136. judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
  137. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
  138. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
  139. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
  140. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
  141. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
  142. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
  143. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
  144. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
  145. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
  146. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
  147. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
  148. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
  149. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
  150. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
  151. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
  152. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
  153. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
  154. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
  155. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
  156. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
  157. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
  158. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
  159. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
  160. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
  161. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
  162. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
  163. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
  164. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
  165. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
  166. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
  167. judgeval/scorers/prompt_scorer.py +0 -439
  168. judgeval-0.0.11.dist-info/METADATA +0 -36
  169. judgeval-0.0.11.dist-info/RECORD +0 -84
  170. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
  171. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
@@ -1,263 +0,0 @@
- """
- Metric that evaluates hallucinations in model outputs
-
- The hallucination metric determines whether your LLM generates factually correct information by comparing
- the actual_output to the provided context.
-
- If you're looking to evaluate hallucination for a RAG system, refer to the faithfulness metric instead.
-
- The HallucinationMetric uses an LLM to determine, for each context in contexts, whether there are any
- contradictions to the actual_output.
-
- Although extremely similar to the FaithfulnessMetric, the HallucinationMetric is calculated differently
- since it uses contexts as the source of truth instead. Since contexts is the ideal segment of your
- knowledge base relevant to a specific input, the degree of hallucination can be measured by the degree
- of which the contexts is disagreed upon.
-
- Faithfulness is measuring the number of statements in output that agree with contexts.
- Hallucination is measuring the fraction of contexts that agree with output (do not contradict == agree)
- """
-
- from typing import Optional, Union, List
-
- from judgeval.constants import APIScorer
- from judgeval.scorers.utils import (get_or_create_event_loop,
-                                     scorer_progress_meter,
-                                     create_verbose_logs,
-                                     parse_response_json,
-                                     check_example_params,
-                                     )
- from judgeval.scorers import JudgevalScorer
- from judgeval.judges import JudgevalJudge
- from judgeval.judges.utils import create_judge
- from judgeval.data import Example, ExampleParams
- from judgeval.scorers.judgeval_scorers.local_implementations.hallucination.prompts import *
-
-
- required_params = [
-     ExampleParams.INPUT,
-     ExampleParams.ACTUAL_OUTPUT,
-     ExampleParams.CONTEXT,
- ]
-
-
- class HallucinationScorer(JudgevalScorer):
-     def __init__(
-         self,
-         threshold: float = 0.5,
-         model: Optional[Union[str, JudgevalJudge]] = None,
-         include_reason: bool = True,
-         async_mode: bool = False,
-         strict_mode: bool = False,
-         verbose_mode: bool = False,
-     ):
-         super().__init__(
-             score_type=APIScorer.HALLUCINATION,
-             threshold=1 if strict_mode else threshold,
-             evaluation_model=None,
-             include_reason=include_reason,
-             async_mode=async_mode,
-             strict_mode=strict_mode,
-             verbose_mode=verbose_mode
-         )
-         self.model, self.using_native_model = create_judge(model)
-         self.evaluation_model = self.model.get_model_name()
-
-     def score_example(
-         self,
-         example: Example,
-         _show_indicator: bool = True,
-     ) -> float:
-         check_example_params(example, required_params, self)
-
-         with scorer_progress_meter(self, display_meter=_show_indicator):
-             if self.async_mode:
-                 loop = get_or_create_event_loop()
-                 loop.run_until_complete(
-                     self.a_score_example(example, _show_indicator=False)
-                 )
-             else:
-                 self.verdicts: List[HallucinationVerdict] = (
-                     self._generate_verdicts(
-                         example.actual_output, example.context
-                     )
-                 )
-                 self.score = self._calculate_score()
-                 self.reason = self._generate_reason()
-                 self.success = self.score <= self.threshold
-                 self.verbose_logs = create_verbose_logs(
-                     self,
-                     steps=[
-                         f"Verdicts:\n{[v.model_dump() for v in self.verdicts]}",
-                         f"Score: {self.score}\nReason: {self.reason}",
-                     ],
-                 )
-
-             return self.score
-
-     async def a_score_example(
-         self,
-         example: Example,
-         _show_indicator: bool = True,
-     ) -> float:
-         check_example_params(example, required_params, self)
-
-         with scorer_progress_meter(
-             self, async_mode=True, display_meter=_show_indicator
-         ):
-             self.verdicts: List[HallucinationVerdict] = (
-                 await self._a_generate_verdicts(
-                     example.actual_output, example.context
-                 )
-             )
-             self.score = self._calculate_score()
-             self.reason = await self._a_generate_reason()
-             self.success = self.score <= self.threshold
-             self.verbose_logs = create_verbose_logs(
-                 self,
-                 steps=[
-                     f"Verdicts:\n{[v.model_dump() for v in self.verdicts]}",
-                     f"Score: {self.score}\nReason: {self.reason}",
-                 ],
-             )
-
-             return self.score
-
-     async def _a_generate_reason(self):
-         if self.include_reason is False:
-             return None
-
-         contradictions = []
-         for verdict in self.verdicts:
-             if verdict.verdict.strip().lower() == "no":
-                 contradictions.append(verdict.reason)
-
-         prompt: dict = HallucinationTemplate.generate_reason(
-             contradictions=contradictions,
-             score=format(self.score, ".2f"),
-         )
-
-         if self.using_native_model:
-             res = await self.model.a_generate(prompt)
-             data = parse_response_json(res, self)
-             return data["reason"]
-         else:
-             try:
-                 res: Reason = await self.model.a_generate(prompt, schema=Reason)
-                 return res.reason
-             except TypeError:
-                 res = await self.model.a_generate(prompt)
-                 data = parse_response_json(res, self)
-                 return data["reason"]
-
-     def _generate_reason(self):
-         if self.include_reason is False:
-             return None
-
-         factual_alignments = []
-         contradictions = []
-         for verdict in self.verdicts:
-             if verdict.verdict.strip().lower() == "no":
-                 contradictions.append(verdict.reason)
-
-         prompt: dict = HallucinationTemplate.generate_reason(
-             factual_alignments=factual_alignments,
-             contradictions=contradictions,
-             score=format(self.score, ".2f"),
-         )
-
-         if self.using_native_model:
-             res = self.model.generate(prompt)
-             data = parse_response_json(res, self)
-             return data["reason"]
-         else:
-             try:
-                 res: Reason = self.model.generate(prompt, schema=Reason)
-                 return res.reason
-             except TypeError:
-                 res = self.model.generate(prompt)
-                 data = parse_response_json(res, self)
-                 return data["reason"]
-
-     async def _a_generate_verdicts(
-         self, actual_output: str, contexts: List[str]
-     ) -> List[HallucinationVerdict]:
-         verdicts: List[HallucinationVerdict] = []
-         prompt = HallucinationTemplate.generate_verdicts(
-             actual_output=actual_output, contexts=contexts
-         )
-         if self.using_native_model:
-             res = await self.model.a_generate(prompt)
-             data = parse_response_json(res, self)
-             verdicts = [
-                 HallucinationVerdict(**item) for item in data["verdicts"]
-             ]
-             return verdicts
-         else:
-             try:
-                 res: Verdicts = await self.model.a_generate(
-                     prompt, schema=Verdicts
-                 )
-                 verdicts = [item for item in res.verdicts]
-                 return verdicts
-             except TypeError:
-                 res = await self.model.a_generate(prompt)
-                 data = parse_response_json(res, self)
-                 verdicts = [
-                     HallucinationVerdict(**item) for item in data["verdicts"]
-                 ]
-                 return verdicts
-
-     def _generate_verdicts(
-         self, actual_output: str, contexts: List[str]
-     ) -> List[HallucinationVerdict]:
-         verdicts: List[HallucinationVerdict] = []
-         prompt = HallucinationTemplate.generate_verdicts(
-             actual_output=actual_output, contexts=contexts
-         )
-         if self.using_native_model:
-             res = self.model.generate(prompt)
-             data = parse_response_json(res, self)
-             verdicts = [
-                 HallucinationVerdict(**item) for item in data["verdicts"]
-             ]
-             return verdicts
-         else:
-             try:
-                 res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-                 verdicts = [item for item in res.verdicts]
-                 return verdicts
-             except TypeError:
-                 res = self.model.generate(prompt)
-                 data = parse_response_json(res, self)
-                 verdicts = [
-                     HallucinationVerdict(**item) for item in data["verdicts"]
-                 ]
-                 return verdicts
-
-     def _calculate_score(self) -> float:
-         number_of_verdicts = len(self.verdicts)
-         if number_of_verdicts == 0:
-             return 0
-
-         hallucination_count = 0
-         for verdict in self.verdicts:
-             if verdict.verdict.strip().lower() == "no":
-                 hallucination_count += 1
-
-         score = hallucination_count / number_of_verdicts
-         return 1 if self.strict_mode and score > self.threshold else score
-
-     def _success_check(self) -> bool:
-         if self.error is not None:
-             self.success = False
-         else:
-             try:
-                 self.success = self.score <= self.threshold
-             except:
-                 self.success = False
-         return self.success
-
-     @property
-     def __name__(self):
-         return "Hallucination"
@@ -1,104 +0,0 @@
- from typing import List, Optional
- from pydantic import BaseModel, Field
-
-
- class HallucinationVerdict(BaseModel):
-     verdict: str
-     reason: str
-
-
- class Verdicts(BaseModel):
-     verdicts: List[HallucinationVerdict]
-
-
- class Reason(BaseModel):
-     reason: str
-
-
- class HallucinationTemplate:
-     @staticmethod
-     def generate_verdicts(actual_output, contexts):
-         return f"""==== TASK INSTRUCTIONS ====
- You will be provided with an `actual output` (the response of an LLM to a particular query) and `contexts` (ground truth contextual information from a knowledge base).
- Your task is to take each context in contexts and determine whether the `actual output` factually agrees with the context.
-
- Additional notes:
- You should NOT use any prior knowledge you have in your decision making process; take each context at face value.
- Since you will determine a verdict for EACH context, the number of 'verdicts' is EXACTLY EQUAL TO the number of contexts.
- You should be lenient in your judgment when the actual output lacks detail with respect to the context segment; you should ONLY provide a 'no' answer if the context contradicts the actual output.
-
- ==== FORMATTING INSTRUCTIONS ====
- You should return a JSON object with a key 'verdicts', which is a list of JSON objects. Each JSON object corresponds to a context in `contexts`, and should have 2 fields: 'verdict' and 'reason'.
- The 'verdict' key should be EXACTLY one of 'yes' or 'no', representing whether the `actual output` factually agrees with the context segment.
- The 'reason' is the justification for the verdict. If your verdict is 'no', try to provide a correction in the reason.
-
- ==== EXAMPLE ====
- Example contexts: ["Einstein won the Nobel Prize for his discovery of the photoelectric effect.", "Einstein won the Nobel Prize in 1968."]
- Example actual output: "Einstein won the Nobel Prize in 1969 for his discovery of the photoelectric effect."
-
- Example:
- {{
-     "verdicts": [
-         {{
-             "verdict": "yes",
-             "reason": "The actual output agrees with the provided context which states that Einstein won the Nobel Prize for his discovery of the photoelectric effect."
-         }},
-         {{
-             "verdict": "no",
-             "reason": "The actual output contradicts the provided context which states that Einstein won the Nobel Prize in 1968, not 1969."
-         }}
-     ]
- }}
-
- ==== YOUR TURN ====
- Contexts:
- {contexts}
-
- Actual Output:
- {actual_output}
-
- JSON:
- """
-
-     @staticmethod
-     def generate_reason(contradictions, score):
-         return f"""==== TASK INSTRUCTIONS ====
- An LLM has been provided with a list of `contexts` (ground truth contextual information from a knowledge base) and `actual output` (the response of an LLM to a particular query).
- You will be provided with a list of `contradictions`, which are factual discrepancies between the context segments and the actual output.
- Additionally, you will be provided with a hallucination score, which is a float (0 - 1, where 0 is the best score) indicating the fraction of context segments that contradict the actual output.
-
- Your task is to provide a CLEAR and CONCISE reason for the hallucination score.
- If the hallucination score is 0 (no contradictions), you should instead respond with a positive remark with an upbeat encouraging tone (but don't overblow the kind attitude).
-
- ==== FORMATTING INSTRUCTIONS ====
- Please make sure to only return in JSON format, with the 'reason' key providing the reason.
- Example JSON:
- {{
-     "reason": "The score is <hallucination_score> because <your_reason>."
- }}
-
- ==== EXAMPLE ====
- Example Contradictions:
- [
-     "The actual output claims Einstein won the Nobel Prize in 1969, which contradicts the context stating he won it in 1968.",
-     "The actual output states Einstein was a chemist, but the context indicates he was a physicist.",
-     "The actual output claims Einstein was born in Switzerland, while the context states he was born in Germany."
- ]
-
- Example Hallucination Score:
- 0.75
-
- Example Response:
- {{
-     "reason": "The score is 0.75 because the actual output made multiple factual errors: incorrectly stating Einstein's Nobel Prize year (1969 vs 1968), his profession (chemist vs physicist), and birthplace (Switzerland vs Germany)."
- }}
-
- ==== YOUR TURN ====
- Contradictions:
- {contradictions}
-
- Hallucination Score:
- {score}
-
- JSON:
- """
@@ -1,5 +0,0 @@
- from judgeval.scorers.judgeval_scorers.local_implementations.json_correctness.json_correctness_scorer import JsonCorrectnessScorer
-
- __all__ = [
-     "JsonCorrectnessScorer",
- ]
@@ -1,134 +0,0 @@
- from typing import List, Optional, Union, Any
- from pydantic import BaseModel, ValidationError, create_model
-
- from judgeval.constants import APIScorer
- from judgeval.judges import JudgevalJudge
- from judgeval.judges.utils import create_judge
- from judgeval.scorers.utils import (get_or_create_event_loop,
-                                     scorer_progress_meter,
-                                     create_verbose_logs,
-                                     parse_response_json,
-                                     check_example_params
-                                     )
- from judgeval.scorers import JudgevalScorer
- from judgeval.data import Example, ExampleParams
-
-
- required_params = [
-     ExampleParams.INPUT,
-     ExampleParams.ACTUAL_OUTPUT
- ]
-
-
- class JsonCorrectnessScorer(JudgevalScorer):
-
-     def __init__(
-         self,
-         json_schema: Union[BaseModel, dict],
-         model: Optional[Union[str, JudgevalJudge]] = None,
-         threshold: float = 0.5,
-         async_mode: bool = True,
-         strict_mode: bool = False,
-         verbose_mode: bool = False,
-         user: Optional[str] = None
-     ):
-         super().__init__(
-             score_type=APIScorer.JSON_CORRECTNESS,
-             threshold=1 if strict_mode else threshold,
-             evaluation_model=None,
-             include_reason=False,
-             async_mode=async_mode,
-             strict_mode=strict_mode,
-             verbose_mode=verbose_mode
-         )
-         self.user = user
-         self.model, self.using_native_model = create_judge(model)
-         self.evaluation_model = self.model.get_model_name()
-
-         if isinstance(json_schema, dict):
-             # Convert to BaseModel
-             fields = {
-                 key: (str if prop["type"] == "string" else Any, ...)
-                 for key, prop in json_schema["properties"].items()
-             }
-
-             # Dynamically create the model
-             DynamicModel = create_model(json_schema["title"], **fields)
-
-             self.json_schema = DynamicModel
-         else:
-             self.json_schema = json_schema
-
-     def score_example(self, example: Example, _show_indicator: bool = True) -> float:
-         check_example_params(example, required_params, self)
-         with scorer_progress_meter(
-             self,
-             async_mode=self.async_mode,
-             display_meter=_show_indicator,
-         ):
-             if self.async_mode:
-                 loop = get_or_create_event_loop()
-                 loop.run_until_complete(
-                     self.a_measure(example, _show_indicator=False)
-                 )
-             else:
-                 valid_json = True
-                 try:
-                     self.json_schema.model_validate_json(
-                         example.actual_output
-                     )
-                 except ValidationError as e:
-                     valid_json = False
-
-                 self.score = 1.0 if valid_json else 0
-                 self.success = self.score >= self.threshold
-                 self.verbose_logs = create_verbose_logs(
-                     self,
-                     steps=[
-                         f"LLM outputed Json:\n{example.actual_output}",
-                         f"Score: {self.score}",
-                     ],
-                 )
-
-             return self.score
-
-     async def a_score_example(self, example: Example, _show_indicator: bool = True) -> float:
-         check_example_params(example, required_params, self)
-         with scorer_progress_meter(
-             self,
-             async_mode=self.async_mode,
-             display_meter=_show_indicator,
-         ):
-             valid_json = True
-             try:
-                 self.json_schema.model_validate_json(
-                     example.actual_output
-                 )
-             except ValidationError as e:
-                 valid_json = False
-
-             self.score = 1.0 if valid_json else 0
-             self.success = self.score >= self.threshold
-             self.verbose_logs = create_verbose_logs(
-                 self,
-                 steps=[
-                     f"LLM outputed Json:\n{example.actual_output}",
-                     f"Score: {self.score}",
-                 ],
-             )
-             return self.score
-
-     def _success_check(self):
-         if self.error is not None:
-             self.success = False
-         else:
-             try:
-                 self.success = self.score >= self.threshold
-             except:
-                 self.success = False
-         return self.success
-
-     @property
-     def __name__(self):
-         return "JSON Correctness"
-
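
The schema handling in the removed JsonCorrectnessScorer is easy to reproduce in isolation: a JSON-schema-like dict is turned into a Pydantic model with create_model, and validation success or failure becomes a 1.0/0.0 score. A minimal sketch under those assumptions (the Person schema and json_correctness helper below are hypothetical, not part of the judgeval API):

from typing import Any
from pydantic import ValidationError, create_model

# Hypothetical JSON schema; only string properties get a concrete type, as in the removed scorer.
json_schema = {
    "title": "Person",
    "properties": {"name": {"type": "string"}, "age": {"type": "integer"}},
}

fields = {
    key: (str if prop["type"] == "string" else Any, ...)
    for key, prop in json_schema["properties"].items()
}
DynamicModel = create_model(json_schema["title"], **fields)

def json_correctness(output: str) -> float:
    """1.0 if the output parses and validates against the schema, else 0.0."""
    try:
        DynamicModel.model_validate_json(output)
        return 1.0
    except ValidationError:
        return 0.0

print(json_correctness('{"name": "Ada", "age": 36}'))  # 1.0
print(json_correctness('{"name": "Ada"}'))             # 0.0 (missing required field)
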
@@ -1,3 +0,0 @@
- from .summarization_scorer import SummarizationScorer
-
- __all__ = ["SummarizationScorer"]