judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of judgeval has been flagged for possible issues.

Files changed (171)
  1. judgeval/__init__.py +177 -12
  2. judgeval/api/__init__.py +519 -0
  3. judgeval/api/api_types.py +407 -0
  4. judgeval/cli.py +79 -0
  5. judgeval/constants.py +76 -47
  6. judgeval/data/__init__.py +3 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +15 -56
  9. judgeval/data/judgment_types.py +450 -0
  10. judgeval/data/result.py +29 -73
  11. judgeval/data/scorer_data.py +29 -62
  12. judgeval/data/scripts/fix_default_factory.py +23 -0
  13. judgeval/data/scripts/openapi_transform.py +123 -0
  14. judgeval/data/trace.py +121 -0
  15. judgeval/dataset/__init__.py +264 -0
  16. judgeval/env.py +52 -0
  17. judgeval/evaluation/__init__.py +344 -0
  18. judgeval/exceptions.py +27 -0
  19. judgeval/integrations/langgraph/__init__.py +13 -0
  20. judgeval/integrations/openlit/__init__.py +50 -0
  21. judgeval/judges/__init__.py +2 -3
  22. judgeval/judges/base_judge.py +2 -3
  23. judgeval/judges/litellm_judge.py +100 -20
  24. judgeval/judges/together_judge.py +101 -20
  25. judgeval/judges/utils.py +20 -24
  26. judgeval/logger.py +62 -0
  27. judgeval/prompt/__init__.py +330 -0
  28. judgeval/scorers/__init__.py +18 -25
  29. judgeval/scorers/agent_scorer.py +17 -0
  30. judgeval/scorers/api_scorer.py +45 -41
  31. judgeval/scorers/base_scorer.py +83 -38
  32. judgeval/scorers/example_scorer.py +17 -0
  33. judgeval/scorers/exceptions.py +1 -0
  34. judgeval/scorers/judgeval_scorers/__init__.py +0 -148
  35. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
  36. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
  37. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
  38. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
  39. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
  40. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
  41. judgeval/scorers/score.py +77 -306
  42. judgeval/scorers/utils.py +4 -199
  43. judgeval/tracer/__init__.py +1122 -2
  44. judgeval/tracer/constants.py +1 -0
  45. judgeval/tracer/exporters/__init__.py +40 -0
  46. judgeval/tracer/exporters/s3.py +119 -0
  47. judgeval/tracer/exporters/store.py +59 -0
  48. judgeval/tracer/exporters/utils.py +32 -0
  49. judgeval/tracer/keys.py +63 -0
  50. judgeval/tracer/llm/__init__.py +7 -0
  51. judgeval/tracer/llm/config.py +78 -0
  52. judgeval/tracer/llm/constants.py +9 -0
  53. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  54. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  55. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  56. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  57. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  58. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  59. judgeval/tracer/llm/llm_google/config.py +6 -0
  60. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  61. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  62. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  63. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  64. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  65. judgeval/tracer/llm/llm_openai/config.py +6 -0
  66. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  67. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  68. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  69. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  70. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  71. judgeval/tracer/llm/llm_together/config.py +6 -0
  72. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  73. judgeval/tracer/llm/providers.py +19 -0
  74. judgeval/tracer/managers.py +167 -0
  75. judgeval/tracer/processors/__init__.py +220 -0
  76. judgeval/tracer/utils.py +19 -0
  77. judgeval/trainer/__init__.py +14 -0
  78. judgeval/trainer/base_trainer.py +122 -0
  79. judgeval/trainer/config.py +128 -0
  80. judgeval/trainer/console.py +144 -0
  81. judgeval/trainer/fireworks_trainer.py +396 -0
  82. judgeval/trainer/trainable_model.py +243 -0
  83. judgeval/trainer/trainer.py +70 -0
  84. judgeval/utils/async_utils.py +39 -0
  85. judgeval/utils/decorators/__init__.py +0 -0
  86. judgeval/utils/decorators/dont_throw.py +37 -0
  87. judgeval/utils/decorators/use_once.py +13 -0
  88. judgeval/utils/file_utils.py +97 -0
  89. judgeval/utils/guards.py +36 -0
  90. judgeval/utils/meta.py +27 -0
  91. judgeval/utils/project.py +15 -0
  92. judgeval/utils/serialize.py +253 -0
  93. judgeval/utils/testing.py +70 -0
  94. judgeval/utils/url.py +10 -0
  95. judgeval/utils/version_check.py +28 -0
  96. judgeval/utils/wrappers/README.md +3 -0
  97. judgeval/utils/wrappers/__init__.py +15 -0
  98. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  99. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  100. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  101. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  102. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  103. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  104. judgeval/utils/wrappers/py.typed +0 -0
  105. judgeval/utils/wrappers/utils.py +35 -0
  106. judgeval/version.py +5 -0
  107. judgeval/warnings.py +4 -0
  108. judgeval-0.22.2.dist-info/METADATA +265 -0
  109. judgeval-0.22.2.dist-info/RECORD +112 -0
  110. judgeval-0.22.2.dist-info/entry_points.txt +2 -0
  111. judgeval/clients.py +0 -39
  112. judgeval/common/__init__.py +0 -8
  113. judgeval/common/exceptions.py +0 -28
  114. judgeval/common/logger.py +0 -189
  115. judgeval/common/tracer.py +0 -798
  116. judgeval/common/utils.py +0 -763
  117. judgeval/data/api_example.py +0 -111
  118. judgeval/data/datasets/__init__.py +0 -5
  119. judgeval/data/datasets/dataset.py +0 -286
  120. judgeval/data/datasets/eval_dataset_client.py +0 -193
  121. judgeval/data/datasets/ground_truth.py +0 -54
  122. judgeval/data/datasets/utils.py +0 -74
  123. judgeval/evaluation_run.py +0 -132
  124. judgeval/judges/mixture_of_judges.py +0 -248
  125. judgeval/judgment_client.py +0 -354
  126. judgeval/run_evaluation.py +0 -439
  127. judgeval/scorers/judgeval_scorer.py +0 -140
  128. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
  129. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
  130. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
  131. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
  132. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
  133. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
  134. judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
  135. judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
  136. judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
  137. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
  138. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
  139. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
  140. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
  141. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
  142. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
  143. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
  144. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
  145. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
  146. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
  147. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
  148. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
  149. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
  150. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
  151. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
  152. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
  153. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
  154. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
  155. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
  156. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
  157. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
  158. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
  159. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
  160. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
  161. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
  162. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
  163. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
  164. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
  165. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
  166. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
  167. judgeval/scorers/prompt_scorer.py +0 -439
  168. judgeval-0.0.11.dist-info/METADATA +0 -36
  169. judgeval-0.0.11.dist-info/RECORD +0 -84
  170. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
  171. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
@@ -1,157 +0,0 @@
- from typing import List, Union
-
- from judgeval.constants import APIScorer
- from judgeval.scorers.utils import (
-     scorer_progress_meter,
-     create_verbose_logs,
-     parse_response_json,
-     check_example_params
- )
- from judgeval.data import Example, ExampleParams
- from judgeval.scorers import JudgevalScorer
-
-
- required_params = [
-     ExampleParams.INPUT,
-     ExampleParams.ACTUAL_OUTPUT,
-     ExampleParams.EXPECTED_TOOLS,
-     ExampleParams.TOOLS_CALLED,
- ]
-
-
- def get_lcs(seq1, seq2):
-     m, n = len(seq1), len(seq2)
-     dp = [[0] * (n + 1) for _ in range(m + 1)]
-
-     for i in range(1, m + 1):
-         for j in range(1, n + 1):
-             if seq1[i - 1] == seq2[j - 1]:
-                 dp[i][j] = dp[i - 1][j - 1] + 1
-             else:
-                 dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
-
-     # Reconstruct the LCS
-     lcs = []
-     i, j = m, n
-     while i > 0 and j > 0:
-         if seq1[i - 1] == seq2[j - 1]:
-             lcs.append(seq1[i - 1])
-             i -= 1
-             j -= 1
-         elif dp[i - 1][j] > dp[i][j - 1]:
-             i -= 1
-         else:
-             j -= 1
-
-     return lcs[::-1]
-
-
- class ToolCorrectnessScorer(JudgevalScorer):
-     def __init__(
-         self,
-         threshold: float = 0.5,
-         include_reason: bool = True,
-         strict_mode: bool = False,
-         verbose_mode: bool = False,
-         should_exact_match: bool = False,
-         should_consider_ordering: bool = False,
-     ):
-         super().__init__(
-             score_type=APIScorer.TOOL_CORRECTNESS,
-             threshold=1 if strict_mode else threshold,
-             evaluation_model=None,
-             include_reason=include_reason,
-             async_mode=False,
-             strict_mode=strict_mode,
-             verbose_mode=verbose_mode
-         )
-         self.should_exact_match = should_exact_match
-         self.should_consider_ordering = should_consider_ordering
-
-     def measure(
-         self,
-         example: Example,
-         _show_indicator: bool = True,
-     ) -> float:
-         check_example_params(example, required_params, self)
-
-         with scorer_progress_meter(self, display_meter=_show_indicator):
-             self.tools_called: List[str] = example.tools_called
-             self.expected_tools: List[str] = example.expected_tools
-             self.score = self._calculate_score()
-             self.reason = self._generate_reason()
-             self.success = self.score >= self.threshold
-             self.verbose_logs = create_verbose_logs(
-                 self,
-                 steps=[
-                     f"Expected Tools:\n{self.expected_tools}",
-                     f"Tools Called:\n{self.tools_called}",
-                     f"Score: {self.score}\nReason: {self.reason}",
-                 ],
-             )
-             return self.score
-
-     async def a_measure(
-         self, test_case: Example, _show_indicator: bool = True
-     ) -> float:
-         check_example_params(test_case, required_params, self)
-         return self.measure(test_case, _show_indicator=_show_indicator)
-
-     def _generate_reason(self):
-         if self.should_exact_match:
-             return f"{'Exact match' if self.tools_called == self.expected_tools else 'Not an exact match'}: expected {self.expected_tools}, called {self.tools_called}."
-
-         elif self.should_consider_ordering:
-             lcs = get_lcs(self.expected_tools, self.tools_called)
-             missing = set(self.expected_tools) - set(self.tools_called)
-             out_of_order = set(self.expected_tools) - set(lcs)
-
-             if len(lcs) == len(self.expected_tools):
-                 return f"Correct ordering: all expected tools {self.expected_tools} were called in the correct order."
-             else:
-                 issues = []
-                 if missing:
-                     issues.append(f"missing tools {list(missing)}")
-                 if out_of_order:
-                     issues.append(f"out-of-order tools {list(out_of_order)}")
-                 return f"Incorrect tool usage: {' and '.join(issues)}; expected {self.expected_tools}, called {self.tools_called}."
-
-         else:
-             used_expected = set(self.tools_called).intersection(
-                 set(self.expected_tools)
-             )
-             missing = set(self.expected_tools) - used_expected
-
-             if len(used_expected) == len(self.expected_tools):
-                 return f"All expected tools {self.expected_tools} were called (order not considered)."
-             else:
-                 return f"Incomplete tool usage: missing tools {list(missing)}; expected {self.expected_tools}, called {self.tools_called}."
-
-     def _calculate_score(self):
-         if self.should_exact_match:
-             return 1.0 if self.tools_called == self.expected_tools else 0.0
-
-         elif self.should_consider_ordering:
-             longest_common_subsequence = get_lcs(
-                 self.expected_tools, self.tools_called
-             )
-             score = len(longest_common_subsequence) / len(self.expected_tools)
-
-         else:
-             used_expected_tools = set(self.tools_called).intersection(
-                 set(self.expected_tools)
-             )
-             score = len(used_expected_tools) / len(self.expected_tools)
-         return 0 if self.strict_mode and score < self.threshold else score
-
-     def _success_check(self) -> bool:
-         try:
-             self.success = self.score >= self.threshold
-         except:
-             self.success = False
-         return self.success
-
-     @property
-     def __name__(self):
-         return "Tool Correctness"
-
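Note on the removed ToolCorrectnessScorer above (judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py, -157 lines): its three modes score a run differently. Exact match compares the two tool lists directly, ordering mode scores len(LCS) / len(expected_tools), and the default mode scores the set intersection. A minimal standalone sketch of that arithmetic (the lcs_len helper and the sample tool names are illustrative, not code from the package):

# Illustrative sketch only: reproduces the scoring arithmetic of the removed
# ToolCorrectnessScorer. lcs_len and the sample tool names are hypothetical.
def lcs_len(a, b):
    # Length of the longest common subsequence (same DP as the deleted get_lcs,
    # but only the length is needed for the score).
    dp = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
    for i, x in enumerate(a, 1):
        for j, y in enumerate(b, 1):
            dp[i][j] = dp[i - 1][j - 1] + 1 if x == y else max(dp[i - 1][j], dp[i][j - 1])
    return dp[-1][-1]

expected = ["search", "summarize", "send_email"]
called = ["search", "send_email", "summarize"]

exact_score = 1.0 if called == expected else 0.0                    # 0.0: order differs
ordering_score = lcs_len(expected, called) / len(expected)          # 2/3: longest in-order run is 2 tools
coverage_score = len(set(called) & set(expected)) / len(expected)   # 1.0: every expected tool was called

In strict mode the removed code additionally clamps the ordering and coverage scores to 0 whenever they fall below the threshold.
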
@@ -1,439 +0,0 @@
- """
- Code that implements a prompt-based scorer for evaluating examples.
-
- The PromptScorer class is a base class that can be used to create custom scoring metrics using LLM prompts.
- To implement a subclass of PromptScorer, you need to implement the following methods:
- build_measure_prompt(): builds the conversation prompt that is sent to the LLM judge
- build_schema(): defines the expected response schema from the LLM
- process_response(): parses the response from the LLM judge
- success_check(): determines whether the evaluation was successful
-
- The core idea of PromptScorer is to provide a flexible way to create custom scoring metrics
- by leveraging LLM judges to evaluate examples. The scorer constructs a prompt, sends it to
- the judge, and parses the structured response to determine a score.
-
- For example, the SentimentScorer subclass uses PromptScorer to detect negative sentiment in responses
- by prompting an LLM to rate the negativity on a 1-5 scale and provide a reason for the rating.
-
- The PromptScorer supports both synchronous and asynchronous evaluation modes, includes optional
- reason fields in responses, and can operate in strict mode with higher thresholds.
-
- NOTE: When implementing build_measure_prompt and build_schema:
- The prompt should guide the LLM to generate a response matching your schema
- The schema should include "score" and optionally "reason" fields
- The score field type and range should match your scoring criteria
- The reason field provides explanatory context for the score
- """
-
- from abc import abstractmethod
- from typing import List, Optional, Union, Tuple, Any, Mapping
- from pydantic import BaseModel, model_serializer, Field
-
- from judgeval.data import Example
- from judgeval.scorers import JudgevalScorer
- from judgeval.scorers.utils import (scorer_progress_meter,
-                                     parse_response_json,
-                                     get_or_create_event_loop,
-                                     create_verbose_logs)
-
-
- class ReasonScore(BaseModel):
-     reason: str
-     score: float
-
-
- class PromptScorer(JudgevalScorer, BaseModel):
-     name: str
-     score_type: str
-     threshold: float = Field(default=0.5)
-     using_native_model: bool = Field(default=True)
-
-     # DO NOT SET THESE FIELDS MANUALLY, THEY ARE SET BY THE SCORE_EXAMPLE METHOD
-     _response: Optional[dict] = None
-     _result: Optional[float] = None
-
-     def __init__(
-         self,
-         name: str,
-         threshold: float = 0.5,
-         include_reason: bool = True,
-         async_mode: bool = True,
-         strict_mode: bool = False,
-         verbose_mode: bool = False,
-     ):
-         # Initialize BaseModel first
-         BaseModel.__init__(
-             self,
-             name=name,
-             score_type=name,
-             threshold=1 if strict_mode else threshold,
-             include_reason=include_reason,
-             async_mode=async_mode,
-             strict_mode=strict_mode,
-             verbose_mode=verbose_mode,
-         )
-         # Then initialize JudgevalScorer
-         JudgevalScorer.__init__(
-             self,
-             score_type=name,
-             threshold=1 if strict_mode else threshold,
-             include_reason=include_reason,
-             async_mode=async_mode,
-             strict_mode=strict_mode,
-             verbose_mode=verbose_mode,
-         )
-
-     def score_example(
-         self,
-         example: Example,
-         _show_indicator: bool = True
-     ) -> float:
-         """
-         Synchronous method for scoring an example using the prompt criteria.
-         """
-         with scorer_progress_meter(self, display_meter=_show_indicator):
-             if self.async_mode:
-                 loop = get_or_create_event_loop()
-                 loop.run_until_complete(
-                     self.a_score_example(example, _show_indicator=False)
-                 )
-             else:
-                 result, reason = self.evaluate(example)
-                 self.reason = reason
-                 self._result = result
-                 self.verbose_logs = create_verbose_logs(
-                     self,
-                     steps=[
-                         f"Results: {self._result}\nReason: {self.reason}",
-                     ],
-                 )
-             return result
-
-     async def a_score_example(
-         self,
-         example: Example,
-         _show_indicator: bool = True,
-     ) -> float:
-         """
-         Async method for scoring an example using the prompt criteria.
-         """
-         with scorer_progress_meter(self, display_meter=_show_indicator):
-             result, reason = await self.a_evaluate(example)
-             self.reason = reason
-             self._result = result
-             self.verbose_logs = create_verbose_logs(
-                 self,
-                 steps=[
-                     f"Results: {self._result}\nReason: {self.reason}",
-                 ],
-             )
-             return result
-
-     def evaluate(self, example: Example) -> Tuple[Any, str]:
-         """
-         Synchronous helper method for evaluating an example using the prompt criteria.
-
-         Builds a custom prompt using `build_measure_prompt` and sends it to the judge model
-         for evaluation. The result is then parsed as JSON and returned.
-
-         NOTE: It is assumed that the model response will be JSON and contain a "score" and "reason" field.
-         """
-         prompt = self._build_measure_prompt(example)
-         if self.using_native_model:
-             res = self.model.generate(prompt)
-             response = parse_response_json(res, self)
-             result, reason = self._process_response(response)
-             return result, reason
-         else:
-             raise NotImplementedError("Non-native judge models are not supported in synchronous mode yet.")
-
-     async def a_evaluate(self, example: Example) -> Tuple[Any, str]:
-         """
-         Asynchronous helper method for evaluating an example using the prompt criteria.
-
-         Builds a custom prompt using `build_measure_prompt` and sends it to the judge model
-         for evaluation. The result is then parsed as JSON and returned.
-
-         NOTE: It is assumed that the model response will be JSON and contain a "score" and "reason" field.
-         """
-         judge_prompt = self._build_measure_prompt(example)
-         schema = self._build_schema()
-         prompt = self._enforce_prompt_format(judge_prompt=judge_prompt, schema=schema)
-         if self.using_native_model:
-             res = await self.model.a_generate(prompt)
-             response = parse_response_json(res, self)
-             self._response = response
-
-             result, reason = self._process_response(response)
-             self.score = result
-             self.reason = reason
-             self._response = response
-             return result, reason
-         else:
-             raise NotImplementedError("Non-native judge models are not supported in async mode yet.")
-
-     # TODO: can we make this take *args and **kwargs? How does that work with a_evaluate() since we'd have to pass the same args
-     @abstractmethod
-     def _build_measure_prompt(self, example: Example) -> List[dict]:
-         # builds the prompt that is sent to the model inside of the `score_example()` method
-         # returns either a string prompt or a conversation prompt of the form [{"role": "system", "content": "..."}, ...]
-
-         """
-         This function creates the prompt that the judge model uses to evaluate examples.
-
-         The prompt is typically a set of instructions that the judge model uses to evaluate the example.
-
-         This function returns a conversation prompt of the form
-         [{"role": "system", "content": "..."}, {"role": "user", "content": "..."}]
-
-         A basic version of implementing this function could be as follows:
-         SYSTEM_ROLE = ...
-         return [
-             {"role": "system", "content": SYSTEM_ROLE},
-             {"role": "user", "content": f"Response: {example.actual_output}\n\nYour judgment: "}
-         ]
-         """
-         pass
-
-     # TODO: does this need to take *args and **kwargs? How does that work with a_evaluate() since we'd have to pass the same args
-     @abstractmethod
-     def _build_schema(self) -> dict:
-         """
-         This function returns a dictionary that represents the schema of the JSON response that the judge model should return.
-
-         The keys of the dictionary are the expected keys in the response, and the values are the types of the corresponding values.
-
-         Example: If you want to have the judge model return a score and a reason, you would write:
-         return {"score": int, "reason": str}
-         """
-         pass
-
-     def _enforce_prompt_format(self, judge_prompt: List[dict], schema: dict):
-         """
-         Formats the final prompt to the judge model.
-
-         This function takes a list of dictionaries (`judge_prompt`) and a schema dictionary (`schema`),
-         and appends a schema enforcement prompt to the content of the first dictionary in the list, which is assumed to be the system prompt.
-         The schema enforcement prompt instructs the judge model to provide its response in a specific JSON format.
-
-         Args:
-             judge_prompt (List[dict]): A list of dictionaries representing the judge prompt.
-                 Each dictionary should contain a "content" key.
-             schema (dict): A dictionary representing the schema. The keys are the expected keys in the response,
-                 and the values are the types of the corresponding values.
-
-         Returns:
-             List[dict]: The modified judge prompt with the schema enforcement prompt appended to the content
-                 of the first dictionary.
-
-         Raises:
-             TypeError: If `judge_prompt` is not a list of dictionaries.
-
-         Example:
-             judge_prompt = [{"content": "Please evaluate the following:"}]
-             schema = {"score": int, "comments": str}
-             formatted_prompt = format_measure_prompt(judge_prompt, schema)
-             # formatted_prompt[0]["content"] will include the schema enforcement prompt
-         """
-         SCHEMA_ENFORCEMENT_PROMPT = "\n\nPlease provide your response in the following JSON format: {"
-         if isinstance(judge_prompt, list) and all(isinstance(item, dict) for item in judge_prompt):
-             # create formatting string for schema enforcement
-             # schema is a map between key and type of the value
-             for key, key_type in schema.items():
-                 SCHEMA_ENFORCEMENT_PROMPT += f'"{key}": <{key}> ({key_type.__name__}), '
-             SCHEMA_ENFORCEMENT_PROMPT = SCHEMA_ENFORCEMENT_PROMPT[:-2] + "}" # remove trailing comma and space
-             judge_prompt[0]["content"] += SCHEMA_ENFORCEMENT_PROMPT
-             return judge_prompt
-         else:
-             raise TypeError(f"Prompt must be a list of dictionaries. Got {type(judge_prompt)} instead.")
-
-     @abstractmethod
-     def _process_response(self, response: dict):
-         """
-         Customizable method for processing the response from the judge model.
-
-         You can add any additional logic to parse the JSON response here and return the result and reason for decision.
-
-         If you don't need a reason for the decision, you can simply return (score, None).
-
-         Example:
-             score = response["score"]
-             reason = response["reason"]
-             return score, reason
-         """
-         pass
-
-     @abstractmethod
-     def _success_check(self, **kwargs) -> bool:
-         """
-         Determines whether or not the PromptScorer should consider the evaluation of a single example successful.
-         """
-         pass
-
-     @property
-     def __name__(self):
-         return self.name
-
-
- class ClassifierScorer(PromptScorer):
-
-     """
-     This is a PromptScorer that takes
-     1. a system role that may involve the Example object
-     2. options for scores on the example
-
-     and uses a judge to execute the evaluation from the system role and classify into one of the options
-
-     ex:
-     system_role = "You are a judge that evaluates whether the response is positive or negative. The response is: {example.actual_output}"
-     options = {"positive": 1, "negative": 0}
-     """
-
-     conversation: List[dict]
-     options: Mapping[str, float]
-
-     def __init__(self, name: str, slug: str, conversation: List[dict], options: Mapping[str, float],
-                  threshold: float = 0.5, include_reason: bool = True,
-                  async_mode: bool = True, strict_mode: bool = False, verbose_mode: bool = False):
-         # Initialize BaseModel first with all fields
-         BaseModel.__init__(
-             self,
-             name=name,
-             slug=slug,
-             score_type=name,
-             conversation=conversation,
-             options=options,
-             threshold=threshold,
-             include_reason=include_reason,
-             async_mode=async_mode,
-             strict_mode=strict_mode,
-             verbose_mode=verbose_mode,
-         )
-         # Then initialize JudgevalScorer
-         JudgevalScorer.__init__(
-             self,
-             score_type=name,
-             threshold=threshold,
-             include_reason=include_reason,
-             async_mode=async_mode,
-             strict_mode=strict_mode,
-             verbose_mode=verbose_mode,
-         )
-
-     def _build_measure_prompt(self, example: Example) -> List[dict]:
-         """
-         Builds the measure prompt for the classifier scorer.
-
-         Args:
-             example (Example): The example to build the prompt for
-
-         Returns:
-             List[dict]: The measure prompt for the classifier scorer
-         """
-         replacement_words = {
-             "{{actual_output}}": example.actual_output,
-             "{{expected_output}}": example.expected_output,
-             "{{context}}": example.context,
-             "{{retrieval_context}}": example.retrieval_context,
-             "{{tools_called}}": example.tools_called,
-             "{{expected_tools}}": example.expected_tools,
-         }
-         # Make a copy of the conversation to avoid modifying the original
-         conversation_copy = [dict(message) for message in self.conversation]
-
-         # Only replace if double brackets are found in the content
-         for message in conversation_copy:
-             content = message["content"]
-             if "{{" in content:
-                 for key, value in replacement_words.items():
-                     if key in content:
-                         message["content"] = content.replace(key, str(value))
-         return conversation_copy
-
-     def _build_schema(self) -> dict:
-         return self.options
-
-     def _enforce_prompt_format(self, judge_prompt: List[dict], schema: dict) -> List[dict]:
-         """
-         Enforces the judge model to choose an option from the schema.
-
-         We want the model to choose an option from the schema and a reason for the choice.
-         """
-         options = list(schema.keys())
-         options_str = ", ".join(options)
-
-         system_role = judge_prompt[0]["content"]
-         system_role += (
-             f"\n\nYou must choose one of the following options: {options_str}. "
-             "Format your response as a JSON object with two fields:\n"
-             "1. 'choice': Your selected option (must be one of the provided choices)\n"
-             "2. 'reason': A brief explanation for why you made this choice\n\n"
-             "Example response format:\n"
-             "{\n"
-             ' "choice": "<one of the valid options>",\n'
-             ' "reason": "<your explanation>"\n'
-             "}"
-         )
-
-         judge_prompt[0]["content"] = system_role
-         return judge_prompt
-
-     def _process_response(self, response: dict) -> Tuple[float, str]:
-         choice = response.get("choice")
-         if choice not in self.options:
-             raise ValueError(f"Invalid choice: {choice}. Expected one of: {self.options.keys()}")
-         reason = response.get("reason", "No reason could be found in model response.")
-         return self.options[choice], reason
-
-     def _success_check(self, **kwargs) -> bool:
-         return self.score >= self.threshold
-
-     def update_name(self, name: str):
-         """
-         Updates the name of the scorer.
-         """
-         self.name = name
-
-     def update_threshold(self, threshold: float):
-         """
-         Updates the threshold of the scorer.
-         """
-         self.threshold = threshold
-
-     def update_conversation(self, conversation: List[dict]):
-         """
-         Updates the conversation with the new conversation.
-
-         Sample conversation:
-         [{'role': 'system', 'content': "Did the chatbot answer the user's question in a kind way?: {{actual_output}}."}]
-         """
-         self.conversation = conversation
-
-     def update_options(self, options: Mapping[str, float]):
-         """
-         Updates the options with the new options.
-
-         Sample options:
-         {"yes": 1, "no": 0}
-         """
-         self.options = options
-
-     def __str__(self):
-         return f"ClassifierScorer(name={self.name}, slug={self.slug}, conversation={self.conversation}, threshold={self.threshold}, options={self.options})"
-
-     @model_serializer
-     def serialize_model(self) -> dict:
-         """
-         Defines how the ClassifierScorer should be serialized when model_dump() is called.
-         """
-         return {
-             "name": self.name,
-             "score_type": self.score_type,
-             "conversation": self.conversation,
-             "options": self.options,
-             "threshold": self.threshold,
-             "include_reason": self.include_reason,
-             "async_mode": self.async_mode,
-             "strict_mode": self.strict_mode,
-             "verbose_mode": self.verbose_mode,
-         }
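
The module docstring at the top of the removed judgeval/scorers/prompt_scorer.py describes subclassing PromptScorer by implementing the prompt-building, schema, response-parsing, and success-check hooks, and mentions a SentimentScorer as an example. A rough sketch of what such a subclass could have looked like against this old 0.0.11 API (the class body below is a reconstruction for illustration, not code shipped in either wheel):

from typing import List, Tuple

from judgeval.data import Example
from judgeval.scorers.prompt_scorer import PromptScorer  # module removed in 0.22.2


class SentimentScorer(PromptScorer):
    """Hypothetical scorer that rates how negative a response is on a 1-5 scale."""

    def _build_measure_prompt(self, example: Example) -> List[dict]:
        # Conversation prompt sent to the judge model.
        return [
            {
                "role": "system",
                "content": "Rate how negative the response is from 1 (not negative) "
                           "to 5 (very negative) and briefly explain your rating.",
            },
            {"role": "user", "content": f"Response: {example.actual_output}\n\nYour judgment: "},
        ]

    def _build_schema(self) -> dict:
        # Keys the judge must return, and their expected types.
        return {"score": int, "reason": str}

    def _process_response(self, response: dict) -> Tuple[float, str]:
        return response["score"], response.get("reason", "")

    def _success_check(self, **kwargs) -> bool:
        # One possible convention: pass when the judged negativity stays at or
        # below the configured threshold.
        return self.score is not None and self.score <= self.threshold

Instantiation would then follow the base __init__ shown above, e.g. SentimentScorer(name="Sentiment Scorer", threshold=2), with score_example() or a_score_example() driving the prompt construction, JSON-schema enforcement, and response parsing loop.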
@@ -1,36 +0,0 @@
- Metadata-Version: 2.4
- Name: judgeval
- Version: 0.0.11
- Summary: Judgeval Package
- Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
- Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
- Author-email: Andrew Li <andrew@judgmentlabs.ai>, Alex Shan <alex@judgmentlabs.ai>, Joseph Camyre <joseph@judgmentlabs.ai>
- License-Expression: Apache-2.0
- License-File: LICENSE.md
- Classifier: Operating System :: OS Independent
- Classifier: Programming Language :: Python :: 3
- Requires-Python: >=3.11
- Requires-Dist: anthropic
- Requires-Dist: fastapi
- Requires-Dist: litellm
- Requires-Dist: nest-asyncio
- Requires-Dist: openai
- Requires-Dist: pandas
- Requires-Dist: pika
- Requires-Dist: python-dotenv==1.0.1
- Requires-Dist: requests
- Requires-Dist: supabase
- Requires-Dist: together
- Requires-Dist: uvicorn
- Provides-Extra: dev
- Requires-Dist: langfuse==2.50.3; extra == 'dev'
- Requires-Dist: patronus; extra == 'dev'
- Requires-Dist: pytest-asyncio>=0.25.0; extra == 'dev'
- Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
- Requires-Dist: pytest>=8.3.4; extra == 'dev'
- Requires-Dist: tavily-python; extra == 'dev'
- Description-Content-Type: text/markdown
-
- # judgeval
-
- Judgeval is an open-source evaluation framework for multi-agent LLM workflows, for both real-time and offline evaluations.