judgeval 0.0.32__py3-none-any.whl → 0.0.33__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. judgeval/common/s3_storage.py +93 -0
  2. judgeval/common/tracer.py +612 -123
  3. judgeval/data/sequence.py +4 -10
  4. judgeval/judgment_client.py +25 -86
  5. judgeval/rules.py +4 -7
  6. judgeval/run_evaluation.py +1 -1
  7. judgeval/scorers/__init__.py +4 -4
  8. judgeval/scorers/judgeval_scorers/__init__.py +0 -176
  9. {judgeval-0.0.32.dist-info → judgeval-0.0.33.dist-info}/METADATA +15 -2
  10. judgeval-0.0.33.dist-info/RECORD +63 -0
  11. judgeval/scorers/base_scorer.py +0 -58
  12. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -27
  13. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
  14. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -276
  15. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
  16. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
  17. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
  18. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
  19. judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py +0 -0
  20. judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py +0 -161
  21. judgeval/scorers/judgeval_scorers/local_implementations/comparison/prompts.py +0 -222
  22. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
  23. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
  24. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
  25. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
  26. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
  27. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
  28. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
  29. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
  30. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
  31. judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py +0 -3
  32. judgeval/scorers/judgeval_scorers/local_implementations/execution_order/execution_order.py +0 -156
  33. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
  34. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -318
  35. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
  36. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
  37. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -264
  38. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
  39. judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/instruction_adherence.py +0 -232
  40. judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/prompt.py +0 -102
  41. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
  42. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
  43. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
  44. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
  45. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -551
  46. judgeval-0.0.32.dist-info/RECORD +0 -97
  47. {judgeval-0.0.32.dist-info → judgeval-0.0.33.dist-info}/WHEEL +0 -0
  48. {judgeval-0.0.32.dist-info → judgeval-0.0.33.dist-info}/licenses/LICENSE.md +0 -0
@@ -1,232 +0,0 @@
- from typing import Optional, List, Union, Tuple
- from pydantic import BaseModel
-
- from judgeval.constants import APIScorer
- from judgeval.scorers.utils import (get_or_create_event_loop,
-                                     scorer_progress_meter,
-                                     create_verbose_logs,
-                                     parse_response_json,
-                                     check_example_params
-                                     )
- from judgeval.scorers import JudgevalScorer
- from judgeval.judges import JudgevalJudge
- from judgeval.judges.utils import create_judge
- from judgeval.data import Example, ExampleParams
- from judgeval.scorers.judgeval_scorers.local_implementations.instruction_adherence.prompt import (
-     InstructionAdherenceTemplate,
- )
- required_params = [
-     ExampleParams.INPUT,
-     ExampleParams.ACTUAL_OUTPUT,
- ]
-
- class Instructions(BaseModel):
-     instructions: List[str]
-
- class Verdict(BaseModel):
-     instruction: str
-     score: float
-     reason: str
-
- class ListOfVerdicts(BaseModel):
-     verdicts: List[Verdict]
-
- class InstructionAdherenceScorer(JudgevalScorer):
-     def __init__(
-         self,
-         threshold: float = 0.5,
-         model: Optional[Union[str, JudgevalJudge]] = None,
-         include_reason: bool = True,
-         async_mode: bool = True,
-         strict_mode: bool = False,
-         verbose_mode: bool = False,
-     ):
-         super().__init__(
-             score_type=APIScorer.INSTRUCTION_ADHERENCE,
-             threshold=1 if strict_mode else threshold,
-             evaluation_model=None,
-             include_reason=include_reason,
-             async_mode=async_mode,
-             strict_mode=strict_mode,
-             verbose_mode=verbose_mode
-         )
-         self.model, self.using_native_model = create_judge(model)
-         self.evaluation_model = self.model.get_model_name()
-
-     def score_example(
-         self,
-         example: Example,
-         _show_indicator: bool = True,
-     ) -> float:
-         check_example_params(example, required_params, self)
-
-         with scorer_progress_meter(self, display_meter=_show_indicator):
-             try:
-                 if self.async_mode:
-                     loop = get_or_create_event_loop()
-                     loop.run_until_complete(
-                         self.a_score_example(example, _show_indicator=False)
-                     )
-                 else:
-                     self.instructions: List[str] = self._get_instructions(example.input)
-                     self.verdicts: List[Verdict] = (
-                         self._get_verdicts(self.instructions, example.actual_output)
-                     )
-                     self.score = self._compute_score()
-                     self.reason = str(self.verdicts)
-                     self.success = self.score >= self.threshold
-                     self.verbose_logs = create_verbose_logs(
-                         self,
-                         steps=[
-                             f"Instructions:\n{self.instructions}",
-                             f"Score: {self.score}\nReason: {self.reason}",
-                         ],
-                     )
-                 return self.score
-             except Exception as e:
-                 raise
-
-     async def a_score_example(
-         self,
-         example: Example,
-         _show_indicator: bool = True,
-     ) -> float:
-         check_example_params(example, required_params, self)
-         try:
-             with scorer_progress_meter(
-                 self, async_mode=True, display_meter=_show_indicator
-             ):
-                 self.instructions: List[str] = await self._a_get_instructions(example.input)
-                 self.verdicts: List[Verdict] = (
-                     await self._a_get_verdicts(self.instructions, example.actual_output)
-                 )
-                 self.score = self._compute_score()
-                 self.reason = str(self.verdicts)
-                 self.success = self.score >= self.threshold
-                 self.verbose_logs = create_verbose_logs(
-                     self,
-                     steps=[
-                         f"Instructions:\n{self.instructions}",
-                         f"Score: {self.score}\nReason: {self.reason}",
-                     ],
-                 )
-                 return self.score
-         except Exception as e:
-             raise e
-
-
-     async def _a_get_verdicts(
-         self, instructions: List[str], actual_output: str
-     ) -> List[Verdict]:
-         if len(instructions) == 0:
-             return []
-
-         prompt = InstructionAdherenceTemplate.generate_verdicts(
-             instructions=instructions,
-             actual_output=actual_output,
-         )
-         if self.using_native_model:
-             res = await self.model.a_generate(prompt)
-             data = parse_response_json(res, self)
-             return [
-                 Verdict(**item) for item in data["verdicts"]
-             ]
-         else:
-             try:
-                 res: List[Verdict] = await self.model.a_generate(
-                     prompt, schema=List[Verdict]
-                 )
-                 return res
-             except TypeError:
-                 res = await self.model.a_generate(prompt)
-                 data = parse_response_json(res, self)
-                 return [
-                     Verdict(**item) for item in data["verdicts"]
-                 ]
-
-     def _get_verdicts(self, instructions: List[str], actual_output: str) -> List[Verdict]:
-         if len(instructions) == 0:
-             return []
-
-         prompt = InstructionAdherenceTemplate.generate_verdicts(
-             instructions=instructions,
-             actual_output=actual_output,
-         )
-         if self.using_native_model:
-             res = self.model.generate(prompt)
-             data = parse_response_json(res, self)
-             return [Verdict(**item) for item in data["verdicts"]]
-         else:
-             try:
-                 res: List[Verdict] = self.model.generate(prompt, schema=List[Verdict])
-                 return res
-             except TypeError:
-                 res = self.model.generate(prompt)
-                 data = parse_response_json(res, self)
-                 return [
-                     Verdict(**item) for item in data["verdicts"]
-                 ]
-
-     async def _a_get_instructions(
-         self,
-         input: str,
-     ) -> List[str]:
-         prompt = InstructionAdherenceTemplate.get_instructions(
-             input=input,
-         )
-         if self.using_native_model:
-             res = await self.model.a_generate(prompt)
-             data = parse_response_json(res, self)
-             return data["instructions"]
-         else:
-             try:
-                 res: List[str] = await self.model.a_generate(
-                     prompt, schema=List[str]
-                 )
-                 return res
-             except TypeError:
-                 res = await self.model.a_generate(prompt)
-                 data = parse_response_json(res, self)
-                 return data["instructions"]
-
-     def _get_instructions(
-         self,
-         input: str,
-     ) -> List[str]:
-         prompt = InstructionAdherenceTemplate.get_instructions(
-             input=input,
-         )
-         if self.using_native_model:
-             res = self.model.generate(prompt)
-             data = parse_response_json(res, self)
-             return data["instructions"]
-         else:
-             try:
-                 res: List[str] = self.model.generate(prompt, schema=List[str])
-                 return res
-             except TypeError:
-                 res = self.model.generate(prompt)
-                 data = parse_response_json(res, self)
-                 return data["instructions"]
-
-     def _compute_score(self):
-         if len(self.verdicts) == 0:
-             return 1
-         score = 0
-         for verdict in self.verdicts:
-             score += verdict.score
-         return score / len(self.verdicts)
-
-     def success_check(self) -> bool:
-         if self.error is not None:
-             self.success = False
-         else:
-             try:
-                 self.success = self.score >= self.threshold
-             except:
-                 self.success = False
-         return self.success
-
-     @property
-     def __name__(self):
-         return "Instruction Adherence"
@@ -1,102 +0,0 @@
- """
- Util prompts for InstructionAdherenceScorer
- """
-
- from typing import List, Optional, Tuple
- from pydantic import BaseModel, Field
-
-
- class InstructionAdherenceTemplate:
-     @staticmethod
-     def get_instructions(input):
-         return f"""You will be presented with a piece of text. Your task is to break down the text and generate a list of the instructions contained within the text.
-
- ===== START OF EXAMPLES =====
- Example 1:
- Example text: Hello my name is John Doe. I like cars. Write two poems about the weather and create a joke. Also what is 5 + 5?
-
- Output:
- {{
-     "instructions": ["Write two poem about the weather", "Create a joke", "What is 5 + 5?"]
- }}
- ===== END OF EXAMPLES =====
-
-
- **
- IMPORTANT: Please return your answer in valid JSON format, with the "instructions" key mapping to a list of strings. No words or explanation is needed.
- **
-
- ==== START OF INPUT ====
- Text:
- {input}
- ==== END OF INPUT ====
-
- ==== YOUR ANSWER ====
- JSON:
- """
-
-     @staticmethod
-     def generate_verdicts(instructions, actual_output):
-         return f"""
- You will be presented with a list of instructions and a piece of text. For each instruction, determine if the instruction was completed in the text. There are 3 categories: either completed, partially completed, or not completed. The scores for these will be 1, 0.5, and 0 respectively.
- Go through each instruction and provide score for each instruction as well as the reasoning for that score.
-
- ==== FORMATTING YOUR ANSWER ====
- Please return your answer in JSON format, with a list of JSON objects with keys "instruction", "score", and "reason". No words or explanation beyond the output JSON is needed.
-
-
- ===== START OF EXAMPLES =====
- Example 1:
- instructions: ["Write two poems about the weather", "Create a joke", "What is 5 + 5?"]
- output: Poem 1: The Sun's Embrace
- The sun climbs high, a golden flame,
- It whispers warmth, it calls my name.
- The sky, a canvas, blue and clear,
- A perfect day for cars, my dear.
-
- The asphalt hums beneath the wheels,
- A symphony of speed it feels.
- The weather smiles, no clouds in sight,
- A driver's joy, pure delight.
-
- Poem 2: The Storm's Dance
- A sunlit meadow, alive with whispers of wind, where daisies dance and hope begins again. Each petal holds a promise—bright, unbruised— a symphony of light that cannot be refused.
-
- Joke
- Why dont cars ever get cold in the winter?
- Because they have radiators!
-
- Math Answer
- 5 + 5 = 10
-
- YOUR JSON OUTPUT:
- {{
-     [
-         {{
-             "instruction": "Write two poem about the weather",
-             "score": 0.5,
-             "reason": "The output contained one poem about the weather, but the other poem was not about the weather."
-         }},
-         {{
-             "instruction": "Create a joke",
-             "score": 1,
-             "reason": "There was a joke created in the output."
-         }},
-         {{
-             "instruction": "What is 5 + 5?",
-             "score": 1,
-             "reason": "The answer to the math question was provided in the output."
-         }}
-     ]
- }}
- ===== END OF EXAMPLES =====
-
- ==== START OF INPUT ====
- instructions: {instructions}
- output: {actual_output}
- ==== END OF INPUT ====
-
- ==== YOUR ANSWER ====
- JSON:
- """
-
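The JSON that generate_verdicts asks the judge model for is what the scorer parses back into Verdict objects (the parsing code reads a top-level "verdicts" key) and then averages in _compute_score: 1, 0.5, or 0 per instruction, with 1 returned when no instructions are found. A self-contained sketch of that aggregation, using an illustrative response payload:

import json
from typing import List
from pydantic import BaseModel


class Verdict(BaseModel):  # mirrors the Verdict model in the removed scorer
    instruction: str
    score: float
    reason: str


# Illustrative judge response in the shape the parsing code expects
raw = '{"verdicts": [{"instruction": "Write two poems about the weather", "score": 0.5, "reason": "Only one poem is about the weather."}, {"instruction": "Create a joke", "score": 1, "reason": "A joke is present."}]}'

verdicts: List[Verdict] = [Verdict(**item) for item in json.loads(raw)["verdicts"]]
final = sum(v.score for v in verdicts) / len(verdicts) if verdicts else 1  # same rule as _compute_score
print(final)  # 0.75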
@@ -1,5 +0,0 @@
- from judgeval.scorers.judgeval_scorers.local_implementations.json_correctness.json_correctness_scorer import JsonCorrectnessScorer
-
- __all__ = [
-     "JsonCorrectnessScorer",
- ]
@@ -1,134 +0,0 @@
- from typing import Optional, Union, Any
- from pydantic import BaseModel, ValidationError, create_model
-
- from judgeval.constants import APIScorer
- from judgeval.judges import JudgevalJudge
- from judgeval.judges.utils import create_judge
- from judgeval.scorers.utils import (
-     get_or_create_event_loop,
-     scorer_progress_meter,
-     create_verbose_logs,
-     check_example_params
- )
- from judgeval.scorers import JudgevalScorer
- from judgeval.data import Example, ExampleParams
-
-
- required_params = [
-     ExampleParams.INPUT,
-     ExampleParams.ACTUAL_OUTPUT
- ]
-
-
- class JsonCorrectnessScorer(JudgevalScorer):
-
-     def __init__(
-         self,
-         json_schema: Union[BaseModel, dict],
-         model: Optional[Union[str, JudgevalJudge]] = None,
-         threshold: float = 0.5,
-         async_mode: bool = True,
-         strict_mode: bool = False,
-         verbose_mode: bool = False,
-         user: Optional[str] = None
-     ):
-         super().__init__(
-             score_type=APIScorer.JSON_CORRECTNESS,
-             threshold=1 if strict_mode else threshold,
-             evaluation_model=None,
-             include_reason=False,
-             async_mode=async_mode,
-             strict_mode=strict_mode,
-             verbose_mode=verbose_mode
-         )
-         self.user = user
-         self.model, self.using_native_model = create_judge(model)
-         self.evaluation_model = self.model.get_model_name()
-
-         if isinstance(json_schema, dict):
-             # Convert to BaseModel
-             fields = {
-                 key: (str if prop["type"] == "string" else Any, ...)
-                 for key, prop in json_schema["properties"].items()
-             }
-
-             # Dynamically create the model
-             DynamicModel = create_model(json_schema["title"], **fields)
-
-             self.json_schema = DynamicModel
-         else:
-             self.json_schema = json_schema
-
-     def score_example(self, example: Example, _show_indicator: bool = True) -> float:
-         check_example_params(example, required_params, self)
-         with scorer_progress_meter(
-             self,
-             async_mode=self.async_mode,
-             display_meter=_show_indicator,
-         ):
-             if self.async_mode:
-                 loop = get_or_create_event_loop()
-                 loop.run_until_complete(
-                     self.a_measure(example, _show_indicator=False)
-                 )
-             else:
-                 valid_json = True
-                 try:
-                     self.json_schema.model_validate_json(
-                         example.actual_output
-                     )
-                 except ValidationError as e:
-                     valid_json = False
-
-                 self.score = 1.0 if valid_json else 0
-                 self.success = self.score >= self.threshold
-                 self.verbose_logs = create_verbose_logs(
-                     self,
-                     steps=[
-                         f"LLM outputed Json:\n{example.actual_output}",
-                         f"Score: {self.score}",
-                     ],
-                 )
-
-             return self.score
-
-     async def a_score_example(self, example: Example, _show_indicator: bool = True) -> float:
-         check_example_params(example, required_params, self)
-         with scorer_progress_meter(
-             self,
-             async_mode=self.async_mode,
-             display_meter=_show_indicator,
-         ):
-             valid_json = True
-             try:
-                 self.json_schema.model_validate_json(
-                     example.actual_output
-                 )
-             except ValidationError as e:
-                 valid_json = False
-
-             self.score = 1.0 if valid_json else 0
-             self.success = self.score >= self.threshold
-             self.verbose_logs = create_verbose_logs(
-                 self,
-                 steps=[
-                     f"LLM outputed Json:\n{example.actual_output}",
-                     f"Score: {self.score}",
-                 ],
-             )
-             return self.score
-
-     def _success_check(self):
-         if self.error is not None:
-             self.success = False
-         else:
-             try:
-                 self.success = self.score >= self.threshold
-             except:
-                 self.success = False
-         return self.success
-
-     @property
-     def __name__(self):
-         return "JSON Correctness"
-
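The schema handling in the removed JsonCorrectnessScorer is plain pydantic: a JSON-schema-like dict is turned into a model class with create_model, and the candidate output is checked with model_validate_json, so the score is simply 1.0 or 0. A self-contained sketch of that conversion; the schema and output strings here are made up for illustration.

from typing import Any

from pydantic import ValidationError, create_model

# Illustrative JSON-schema-style dict (not from the package)
json_schema = {
    "title": "Person",
    "properties": {"name": {"type": "string"}, "age": {"type": "integer"}},
}

# Same conversion rule as the scorer: "string" -> str, anything else -> Any
fields = {
    key: (str if prop["type"] == "string" else Any, ...)
    for key, prop in json_schema["properties"].items()
}
DynamicModel = create_model(json_schema["title"], **fields)

try:
    DynamicModel.model_validate_json('{"name": "Ada", "age": 36}')
    score = 1.0
except ValidationError:
    score = 0.0
print(score)  # 1.0, since the output parses against the generated model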
@@ -1,3 +0,0 @@
- from .summarization_scorer import SummarizationScorer
-
- __all__ = ["SummarizationScorer"]