judgeval 0.0.44__py3-none-any.whl → 0.0.46__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. judgeval/__init__.py +5 -4
  2. judgeval/clients.py +6 -6
  3. judgeval/common/__init__.py +7 -2
  4. judgeval/common/exceptions.py +2 -3
  5. judgeval/common/logger.py +74 -49
  6. judgeval/common/s3_storage.py +30 -23
  7. judgeval/common/tracer.py +1273 -939
  8. judgeval/common/utils.py +416 -244
  9. judgeval/constants.py +73 -61
  10. judgeval/data/__init__.py +1 -1
  11. judgeval/data/custom_example.py +3 -2
  12. judgeval/data/datasets/dataset.py +80 -54
  13. judgeval/data/datasets/eval_dataset_client.py +131 -181
  14. judgeval/data/example.py +67 -43
  15. judgeval/data/result.py +11 -9
  16. judgeval/data/scorer_data.py +4 -2
  17. judgeval/data/tool.py +25 -16
  18. judgeval/data/trace.py +57 -29
  19. judgeval/data/trace_run.py +5 -11
  20. judgeval/evaluation_run.py +22 -82
  21. judgeval/integrations/langgraph.py +546 -184
  22. judgeval/judges/base_judge.py +1 -2
  23. judgeval/judges/litellm_judge.py +33 -11
  24. judgeval/judges/mixture_of_judges.py +128 -78
  25. judgeval/judges/together_judge.py +22 -9
  26. judgeval/judges/utils.py +14 -5
  27. judgeval/judgment_client.py +259 -271
  28. judgeval/rules.py +169 -142
  29. judgeval/run_evaluation.py +462 -305
  30. judgeval/scorers/api_scorer.py +20 -11
  31. judgeval/scorers/exceptions.py +1 -0
  32. judgeval/scorers/judgeval_scorer.py +77 -58
  33. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +46 -15
  34. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +3 -2
  35. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +3 -2
  36. judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +12 -11
  37. judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +7 -5
  38. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +3 -2
  39. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +3 -2
  40. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +5 -2
  41. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +2 -1
  42. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +17 -8
  43. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +3 -2
  44. judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +3 -2
  45. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +3 -2
  46. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +3 -2
  47. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +8 -9
  48. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +4 -4
  49. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +5 -5
  50. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +5 -2
  51. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +9 -10
  52. judgeval/scorers/prompt_scorer.py +48 -37
  53. judgeval/scorers/score.py +86 -53
  54. judgeval/scorers/utils.py +11 -7
  55. judgeval/tracer/__init__.py +1 -1
  56. judgeval/utils/alerts.py +23 -12
  57. judgeval/utils/{data_utils.py → file_utils.py} +5 -9
  58. judgeval/utils/requests.py +29 -0
  59. judgeval/version_check.py +5 -2
  60. {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/METADATA +79 -135
  61. judgeval-0.0.46.dist-info/RECORD +69 -0
  62. judgeval-0.0.44.dist-info/RECORD +0 -68
  63. {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/WHEEL +0 -0
  64. {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/licenses/LICENSE.md +0 -0
judgeval/judges/base_judge.py CHANGED
@@ -37,8 +37,7 @@ class JudgevalJudge(ABC):
         A string.
         """
         pass
-
+
     @abstractmethod
     def get_model_name(self, *args, **kwargs) -> str:
         pass
-
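Nothing functional changed in the base class, but this hunk shows the abstract surface that every judge below implements. A minimal sketch of a custom judge against that interface, as far as it can be inferred from the hunks in this diff (the full class may declare more than is visible here):

from judgeval.judges import JudgevalJudge


class EchoJudge(JudgevalJudge):
    """Toy judge for illustration; echoes its input back."""

    def __init__(self, model: str = "echo-v0", **kwargs):
        self.model = model
        super().__init__(model_name=model)  # same super() call as the judges below

    def generate(self, input, **kwargs) -> str:
        return str(input)

    async def a_generate(self, input, **kwargs) -> str:
        return str(input)

    def load_model(self):
        return self.model

    def get_model_name(self) -> str:
        return self.model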
judgeval/judges/litellm_judge.py CHANGED
@@ -1,9 +1,11 @@
 import pydantic
 from typing import List, Union, Mapping
 
-from judgeval import *
 from judgeval.judges import JudgevalJudge
-from judgeval.common.utils import afetch_litellm_api_response, fetch_litellm_api_response
+from judgeval.common.utils import (
+    afetch_litellm_api_response,
+    fetch_litellm_api_response,
+)
 from judgeval.common.logger import debug, error
 
 BASE_CONVERSATION = [
@@ -18,30 +20,50 @@ class LiteLLMJudge(JudgevalJudge):
         self.kwargs = kwargs
         super().__init__(model_name=model)
 
-    def generate(self, input: Union[str, List[Mapping[str, str]]], schema: pydantic.BaseModel = None) -> str:
+    def generate(
+        self,
+        input: Union[str, List[Mapping[str, str]]],
+        schema: pydantic.BaseModel = None,
+    ) -> str:
         debug(f"Generating response for input type: {type(input)}")
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
-            return fetch_litellm_api_response(model=self.model, messages=convo, response_format=schema)
+            return fetch_litellm_api_response(
+                model=self.model, messages=convo, response_format=schema
+            )
         elif isinstance(input, list):
-            return fetch_litellm_api_response(model=self.model, messages=input, response_format=schema)
+            return fetch_litellm_api_response(
+                model=self.model, messages=input, response_format=schema
+            )
         else:
             error(f"Invalid input type received: {type(input)}")
-            raise TypeError(f"Input must be a string or a list of dictionaries. Input type of: {type(input)}")
+            raise TypeError(
+                f"Input must be a string or a list of dictionaries. Input type of: {type(input)}"
+            )
 
-    async def a_generate(self, input: Union[str, List[Mapping[str, str]]], schema: pydantic.BaseModel = None) -> str:
+    async def a_generate(
+        self,
+        input: Union[str, List[Mapping[str, str]]],
+        schema: pydantic.BaseModel = None,
+    ) -> str:
         debug(f"Async generating response for input type: {type(input)}")
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
-            response = await afetch_litellm_api_response(model=self.model, messages=convo, response_format=schema)
+            response = await afetch_litellm_api_response(
+                model=self.model, messages=convo, response_format=schema
+            )
             return response
         elif isinstance(input, list):
-            response = await afetch_litellm_api_response(model=self.model, messages=input, response_format=schema)
+            response = await afetch_litellm_api_response(
+                model=self.model, messages=input, response_format=schema
+            )
             return response
         else:
             error(f"Invalid input type received: {type(input)}")
-            raise TypeError(f"Input must be a string or a list of dictionaries. Input type of: {type(input)}")
-
+            raise TypeError(
+                f"Input must be a string or a list of dictionaries. Input type of: {type(input)}"
+            )
+
     def load_model(self):
         return self.model
 
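The LiteLLMJudge changes are formatting-only: the wildcard `from judgeval import *` import is dropped and long calls are wrapped. A usage sketch against this API; the `Verdict` schema is a hypothetical illustration of the optional pydantic `schema` argument:

import pydantic
from judgeval.judges import LiteLLMJudge


class Verdict(pydantic.BaseModel):  # hypothetical response schema
    score: float
    reason: str


judge = LiteLLMJudge(model="gpt-4.1")

# A plain string is wrapped into BASE_CONVERSATION as the user turn.
text = judge.generate("Is Paris the capital of France?", schema=Verdict)

# A list of {"role", "content"} dicts is passed through as the chat history.
convo = [
    {"role": "system", "content": "You are a strict grader."},
    {"role": "user", "content": "Grade: 'Paris is the capital of France.'"},
]
text = judge.generate(convo, schema=Verdict)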
judgeval/judges/mixture_of_judges.py CHANGED
@@ -3,23 +3,24 @@ Implementation for Mixture of Judges model through Judgeval
 
 Enables client to use multiple models to generate responses and then aggregate them into a single response.
 """
-from judgeval import *
+
 import pydantic
-from typing import List, Union, Mapping
+from typing import List, Union
 from judgeval.judges import JudgevalJudge
 from judgeval.common.utils import (
-    get_completion_multiple_models,
-    get_chat_completion,
-    aget_completion_multiple_models,
-    aget_chat_completion
+    get_completion_multiple_models,
+    get_chat_completion,
+    aget_completion_multiple_models,
+    aget_chat_completion,
 )
 from judgeval.common.logger import debug, error
 
+
 def build_dynamic_mixture_prompt(
-    judge_responses: List[str],
-    custom_system_prompt: str = None,
-    custom_conversation_history: List[Mapping] = None
-) -> List[Mapping]:
+    judge_responses: List[str],
+    custom_system_prompt: str | None = None,
+    custom_conversation_history: List[dict] | None = None,
+) -> List[dict]:
     """
     Dynamically builds a prompt to mix judge responses together for the Mixture of Judges model.
 
@@ -29,10 +30,15 @@ def build_dynamic_mixture_prompt(
     Args:
         judge_responses (List[str]): List of responses from individual judges to be synthesized
         custom_system_prompt (str, optional): Custom system prompt to override the default one. Defaults to None.
-        custom_conversation_history (List[Mapping], optional): Custom conversation history to override the default one. Defaults to None.
+        custom_conversation_history (List[dict], optional): Custom conversation history to override the default one. Defaults to None.
     """
-    formatted_responses = "\n".join([f"# Judge {i + 1}'s response: #\n{response}" for i, response in enumerate(judge_responses)])
-
+    formatted_responses = "\n".join(
+        [
+            f"# Judge {i + 1}'s response: #\n{response}"
+            for i, response in enumerate(judge_responses)
+        ]
+    )
+
     # This is the default prompt for the Mixture of Judges model
     """
     You are tasked with synthesizing responses from multiple expert judges. You will receive N individual answers on the same topic. Your job is to:
@@ -51,75 +57,99 @@ def build_dynamic_mixture_prompt(
 
     default_conversation = [ # inject the judge responses into the default prompt
         {
-            'role': 'system',
-            'content': 'You are tasked with synthesizing responses from multiple expert judges. You will receive N individual answers on the same topic. Your job is to:\n1. Analyze and compare the key points, patterns, and agreements between the answers.\n2. Identify the consensus by focusing on areas where most or all of the answers align. Consider common reasoning and frequently mentioned conclusions.\n3. Condense the responses into a single, coherent, and concise answer that represents the collective judgment of the group.\n4. When opinions differ or contradict, highlight the most supported viewpoint while briefly acknowledging the dissenting perspectives.\n5. Ensure the final answer is balanced and clear, providing a comprehensive summary that captures the wisdom of all judges while avoiding repetition.\n\n**IMPORTANT**: IF THE JUDGE RESPONSES ARE IN JSON FORMAT, YOU MUST RESPOND USING THE SAME JSON FORMAT THAT THE RESPONSES ARE IN. If the judge responses are in JSON, you MUST RESPOND IN VALID JSON FORMAT. '
-        },
+            "role": "system",
+            "content": "You are tasked with synthesizing responses from multiple expert judges. You will receive N individual answers on the same topic. Your job is to:\n1. Analyze and compare the key points, patterns, and agreements between the answers.\n2. Identify the consensus by focusing on areas where most or all of the answers align. Consider common reasoning and frequently mentioned conclusions.\n3. Condense the responses into a single, coherent, and concise answer that represents the collective judgment of the group.\n4. When opinions differ or contradict, highlight the most supported viewpoint while briefly acknowledging the dissenting perspectives.\n5. Ensure the final answer is balanced and clear, providing a comprehensive summary that captures the wisdom of all judges while avoiding repetition.\n\n**IMPORTANT**: IF THE JUDGE RESPONSES ARE IN JSON FORMAT, YOU MUST RESPOND USING THE SAME JSON FORMAT THAT THE RESPONSES ARE IN. If the judge responses are in JSON, you MUST RESPOND IN VALID JSON FORMAT. ",
+        },
         {
-            'role': 'user',
-            'content': '## Start of Judge Responses ## \n# Judge 1\'s response: #\n{\n"claims": [\n{\n"claim": "A 30-day full refund is offered.",\n"quote": "We offer a 30-day full refund at no extra cost."\n},\n{\n"claim": "The 30-day full refund comes at no extra cost.",\n"quote": "We offer a 30-day full refund at no extra cost."\n}\n]\n}\n\n# Judge 2\'s response: #\n{\n "claims": [\n {\n "claim": "A full refund is offered within 30 days.",\n "quote": "We offer a 30-day full refund at no extra cost."\n },\n {\n "claim": "The 30-day full refund is offered at no extra cost.",\n "quote": "We offer a 30-day full refund at no extra cost."\n }\n ]\n}\n# Judge 3\'s response: #\n {\n "claims": [\n {\n "claim": "A 30-day full refund is offered.",\n "quote": "We offer a 30-day full refund at no extra cost."\n },\n {\n "claim": "The 30-day full refund is offered at no extra cost.",\n "quote": "We offer a 30-day full refund at no extra cost."\n }\n ]\n}\n## End of Judge Responses ##\nSynthesized response:'
-        },
+            "role": "user",
+            "content": '## Start of Judge Responses ## \n# Judge 1\'s response: #\n{\n"claims": [\n{\n"claim": "A 30-day full refund is offered.",\n"quote": "We offer a 30-day full refund at no extra cost."\n},\n{\n"claim": "The 30-day full refund comes at no extra cost.",\n"quote": "We offer a 30-day full refund at no extra cost."\n}\n]\n}\n\n# Judge 2\'s response: #\n{\n "claims": [\n {\n "claim": "A full refund is offered within 30 days.",\n "quote": "We offer a 30-day full refund at no extra cost."\n },\n {\n "claim": "The 30-day full refund is offered at no extra cost.",\n "quote": "We offer a 30-day full refund at no extra cost."\n }\n ]\n}\n# Judge 3\'s response: #\n {\n "claims": [\n {\n "claim": "A 30-day full refund is offered.",\n "quote": "We offer a 30-day full refund at no extra cost."\n },\n {\n "claim": "The 30-day full refund is offered at no extra cost.",\n "quote": "We offer a 30-day full refund at no extra cost."\n }\n ]\n}\n## End of Judge Responses ##\nSynthesized response:',
+        },
         {
-            'role': 'assistant',
-            'content': 'The consensus among the judges is clear and unanimous. All three judges agree that a 30-day full refund is offered, and this refund is available at no extra cost. This conclusion is consistently supported by their statements, as each of their claims is directly quoted as: "We offer a 30-day full refund at no extra cost." There are no dissenting perspectives or opposing views provided in any of the responses, indicating complete alignment on this topic.\n\nJSON:\n{\n "claims": [\n {\n "claim": "A full refund is offered within 30 days.",\n "quote": "We offer a 30-day full refund at no extra cost."\n },\n {\n "claim": "The 30-day full refund is offered at no extra cost.",\n "quote": "We offer a 30-day full refund at no extra cost."\n }\n ]\n}'
+            "role": "assistant",
+            "content": 'The consensus among the judges is clear and unanimous. All three judges agree that a 30-day full refund is offered, and this refund is available at no extra cost. This conclusion is consistently supported by their statements, as each of their claims is directly quoted as: "We offer a 30-day full refund at no extra cost." There are no dissenting perspectives or opposing views provided in any of the responses, indicating complete alignment on this topic.\n\nJSON:\n{\n "claims": [\n {\n "claim": "A full refund is offered within 30 days.",\n "quote": "We offer a 30-day full refund at no extra cost."\n },\n {\n "claim": "The 30-day full refund is offered at no extra cost.",\n "quote": "We offer a 30-day full refund at no extra cost."\n }\n ]\n}',
         },
         {
-            'role': 'user',
-            'content': "## Start of Judge Responses ##\n# Judge 1's response: # \nThe capital of France is Paris.\n\n# Judge 2's response: #\nThe capital of France is Paris.\n\n# Judge 3's response: # \nThe capital of France is Paris. It's one of the most popular tourist destinations in the world, known for its art, culture, and history. It's also famous for its iconic landmarks such as the Eiffel Tower, Louvre Museum, and Notre-Dame Cathedral.\n\n## End of Judge Responses ##\nSynthesized response:"
+            "role": "user",
+            "content": "## Start of Judge Responses ##\n# Judge 1's response: # \nThe capital of France is Paris.\n\n# Judge 2's response: #\nThe capital of France is Paris.\n\n# Judge 3's response: # \nThe capital of France is Paris. It's one of the most popular tourist destinations in the world, known for its art, culture, and history. It's also famous for its iconic landmarks such as the Eiffel Tower, Louvre Museum, and Notre-Dame Cathedral.\n\n## End of Judge Responses ##\nSynthesized response:",
         },
         {
-            'role': 'assistant',
-            'content': "The capital of France is Paris. It is widely recognized as one of the world's most popular tourist destinations, celebrated for its rich art, culture, and history. Paris is renowned for its iconic landmarks, including the Eiffel Tower, Louvre Museum, and Notre-Dame Cathedral."
-        },
+            "role": "assistant",
+            "content": "The capital of France is Paris. It is widely recognized as one of the world's most popular tourist destinations, celebrated for its rich art, culture, and history. Paris is renowned for its iconic landmarks, including the Eiffel Tower, Louvre Museum, and Notre-Dame Cathedral.",
+        },
         {
-            'role': 'user',
-            'content': f'## Start of Judge Responses ##\n{formatted_responses}\n## End of Judge Responses ##\nSynthesized response:\n'
-        }
+            "role": "user",
+            "content": f"## Start of Judge Responses ##\n{formatted_responses}\n## End of Judge Responses ##\nSynthesized response:\n",
+        },
     ]
-
+
     # If a custom system prompt is provided, validate and use it
     if custom_system_prompt is not None:
         if not isinstance(custom_system_prompt, str):
-            error(f"TypeError: Custom system prompt must be a string. Received: {type(custom_system_prompt)}.")
-            raise TypeError(f"Custom system prompt must be a string. Received: {type(custom_system_prompt)}.")
+            error(
+                f"TypeError: Custom system prompt must be a string. Received: {type(custom_system_prompt)}."
+            )
+            raise TypeError(
+                f"Custom system prompt must be a string. Received: {type(custom_system_prompt)}."
+            )
         if not custom_system_prompt:
             error("ValueError: Custom system prompt cannot be empty")
             raise ValueError("Custom system prompt cannot be empty")
         # Override the default system prompt, but also add special instructions for handling JSON
-        default_conversation[0]['content'] = custom_system_prompt + "\n\n**IMPORTANT**: IF THE JUDGE RESPONSES ARE IN JSON FORMAT, YOU MUST RESPOND USING THE SAME JSON FORMAT THAT THE RESPONSES ARE IN. If the judge responses are in JSON, you MUST RESPOND IN VALID JSON FORMAT."
-
+        default_conversation[0]["content"] = (
+            custom_system_prompt
+            + "\n\n**IMPORTANT**: IF THE JUDGE RESPONSES ARE IN JSON FORMAT, YOU MUST RESPOND USING THE SAME JSON FORMAT THAT THE RESPONSES ARE IN. If the judge responses are in JSON, you MUST RESPOND IN VALID JSON FORMAT."
+        )
+
     # If a custom conversation history is provided, append the judge responses to it
     if custom_conversation_history is not None:
         # Validate custom conversation history format
         for message in custom_conversation_history:
             if not isinstance(message, dict):
-                error(f"TypeError: Custom conversation history must be a list of dictionaries. Received: {message}.")
-                raise TypeError(f"Custom conversation history must be a list of dictionaries. Received: {message}.")
-
-            if 'role' not in message or 'content' not in message:
+                error(
+                    f"TypeError: Custom conversation history must be a list of dictionaries. Received: {message}."
+                )
+                raise TypeError(
+                    f"Custom conversation history must be a list of dictionaries. Received: {message}."
+                )
+
+            if "role" not in message or "content" not in message:
                 error("ValueError: Each message must have 'role' and 'content' keys")
                 raise ValueError("Each message must have 'role' and 'content' keys")
-
-            if not isinstance(message['role'], str) or not isinstance(message['content'], str):
-                error(f"TypeError: Message role and content must be strings. Received: {type(message['role'])}, {type(message['content'])}.")
-                raise TypeError(f"Message role and content must be strings. Received: {type(message['role'])}, {type(message['content'])}.")
-
-            if message['role'] not in ['system', 'user', 'assistant']:
-                error(f"ValueError: Message role must be one of: 'system', 'user', 'assistant'. Received: {message['role']}.")
-                raise ValueError(f"Message role must be one of: 'system', 'user', 'assistant'. Received: {message['role']}.")
-
+
+            if not isinstance(message["role"], str) or not isinstance(
+                message["content"], str
+            ):
+                error(
+                    f"TypeError: Message role and content must be strings. Received: {type(message['role'])}, {type(message['content'])}."
+                )
+                raise TypeError(
+                    f"Message role and content must be strings. Received: {type(message['role'])}, {type(message['content'])}."
+                )
+
+            if message["role"] not in ["system", "user", "assistant"]:
+                error(
+                    f"ValueError: Message role must be one of: 'system', 'user', 'assistant'. Received: {message['role']}."
+                )
+                raise ValueError(
+                    f"Message role must be one of: 'system', 'user', 'assistant'. Received: {message['role']}."
+                )
+
         judge_responses_prompt = {
-            'role': 'user',
-            'content': f'## Start of Judge Responses ##\n{formatted_responses}\n## End of Judge Responses ##\nSynthesized response:\n'
+            "role": "user",
+            "content": f"## Start of Judge Responses ##\n{formatted_responses}\n## End of Judge Responses ##\nSynthesized response:\n",
         }
         return custom_conversation_history + [judge_responses_prompt]
-
+
     # Otherwise return the default conversation with system prompt and examples
     # No customization, return the default conversation with system prompt and examples
     return default_conversation
 
+
 BASE_CONVERSATION = [
     {"role": "system", "content": "You are a helpful assistant."},
 ] # for string inputs, we need to add the user query to a base conversation, since LiteLLM only accepts a list of dictionaries as a chat history
+
+
 class MixtureOfJudges(JudgevalJudge):
     """
     IMPORTANT: When supplying custom prompts and conversation histories for aggregation, supply them in the following format:
@@ -134,10 +164,17 @@ class MixtureOfJudges(JudgevalJudge):
     ]
     }
     """
-    def __init__(self,
-                 models: List[str] = ['QWEN', 'LLAMA3_70B_INSTRUCT_TURBO', 'MISTRAL_8x22B_INSTRUCT'],
-                 aggregator: str = 'gpt-4.1',
-                 **kwargs):
+
+    def __init__(
+        self,
+        models: List[str] = [
+            "QWEN",
+            "LLAMA3_70B_INSTRUCT_TURBO",
+            "MISTRAL_8x22B_INSTRUCT",
+        ],
+        aggregator: str = "gpt-4.1",
+        **kwargs,
+    ):
         """
         `models` are the individual judge models to be used for generating responses.
         `aggregator` is the model that will aggregate the responses from the individual judges.
@@ -150,11 +187,12 @@ class MixtureOfJudges(JudgevalJudge):
         super().__init__(model_name=models)
 
     def generate(
-            self,
-            input: Union[str, List[Mapping[str, str]]],
-            response_schema: pydantic.BaseModel = None,
-            aggregation_schema: pydantic.BaseModel = None,
-            **kwargs) -> str:
+        self,
+        input: Union[str, List[dict]],
+        response_schema: pydantic.BaseModel = None,
+        aggregation_schema: pydantic.BaseModel = None,
+        **kwargs,
+    ) -> str:
         """
         Args:
             input (Union[str, List[Mapping[str, str]]]): Input query or conversation history to the model.
@@ -163,7 +201,7 @@ class MixtureOfJudges(JudgevalJudge):
             kwargs: Additional keyword arguments.
         """
         debug(f"Generating response for input type: {type(input)}")
-
+
         # Convert input to conversation format if needed
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
@@ -171,20 +209,26 @@ class MixtureOfJudges(JudgevalJudge):
             convo = input
         else:
             error(f"Invalid input type received: {type(input)}")
-            raise TypeError(f"Input must be a string or a list of dictionaries. Input type of: {type(input)}")
+            raise TypeError(
+                f"Input must be a string or a list of dictionaries. Input type of: {type(input)}"
+            )
 
         try:
             responses = get_completion_multiple_models(
                 models=self.models,
                 messages=[convo] * len(self.models),
-                response_formats=[response_schema] * len(self.models)
+                response_formats=[response_schema] * len(self.models),
             )
         except Exception as e:
             error(f"Error getting completions from multiple models: {str(e)}")
             raise
 
-        compiled_mixture_prompt = build_dynamic_mixture_prompt(responses, self.kwargs.get('custom_prompt'), self.kwargs.get('custom_conversation'))
-
+        compiled_mixture_prompt = build_dynamic_mixture_prompt(
+            responses,
+            self.kwargs.get("custom_prompt"),
+            self.kwargs.get("custom_conversation"),
+        )
+
         try:
             mixed_response = get_chat_completion(
                 model_type=self.aggregator,
@@ -194,16 +238,16 @@ class MixtureOfJudges(JudgevalJudge):
         except Exception as e:
             error(f"Error getting chat completion from aggregator: {str(e)}")
             raise
-
+
         return mixed_response
 
     async def a_generate(
-            self,
-            input: Union[str, List[Mapping[str, str]]],
-            response_schema: pydantic.BaseModel = None,
-            aggregation_schema: pydantic.BaseModel = None,
-            **kwargs
-    ) -> str:
+        self,
+        input: Union[str, List[dict]],
+        response_schema: pydantic.BaseModel = None,
+        aggregation_schema: pydantic.BaseModel = None,
+        **kwargs,
+    ) -> str:
         """
         Args:
             input (Union[str, List[Mapping[str, str]]]): Input query or conversation history to the model.
@@ -212,7 +256,7 @@ class MixtureOfJudges(JudgevalJudge):
             kwargs: Additional keyword arguments.
         """
         debug(f"Generating response for input type: {type(input)}")
-
+
         # Convert input to conversation format if needed
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
@@ -220,20 +264,26 @@ class MixtureOfJudges(JudgevalJudge):
             convo = input
         else:
             error(f"Invalid input type received: {type(input)}")
-            raise TypeError(f"Input must be a string or a list of dictionaries. Input type of: {type(input)}")
+            raise TypeError(
+                f"Input must be a string or a list of dictionaries. Input type of: {type(input)}"
+            )
 
         try:
             responses = await aget_completion_multiple_models(
                 models=self.models,
                 messages=[convo] * len(self.models),
-                response_formats=[response_schema] * len(self.models)
+                response_formats=[response_schema] * len(self.models),
             )
         except Exception as e:
             error(f"Error getting async completions from multiple models: {str(e)}")
             raise
 
-        compiled_mixture_prompt = build_dynamic_mixture_prompt(responses, self.kwargs.get('custom_prompt'), self.kwargs.get('custom_conversation'))
-
+        compiled_mixture_prompt = build_dynamic_mixture_prompt(
+            responses,
+            self.kwargs.get("custom_prompt"),
+            self.kwargs.get("custom_conversation"),
+        )
+
         try:
             mixed_response = await aget_chat_completion(
                 model_type=self.aggregator,
@@ -243,9 +293,9 @@ class MixtureOfJudges(JudgevalJudge):
         except Exception as e:
             error(f"Error getting async chat completion from aggregator: {str(e)}")
             raise
-
+
         return mixed_response
-
+
     def load_model(self):
         return self.models
 
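Taken together, the flow in both `generate` and `a_generate` is: fan the same conversation out to every model in `self.models`, then feed the collected answers through `build_dynamic_mixture_prompt` to the aggregator. A sketch of both entry points as they read after this diff; the model names are just the defaults from the signature above, not a recommendation:

from judgeval.judges import MixtureOfJudges
from judgeval.judges.mixture_of_judges import build_dynamic_mixture_prompt

# The prompt builder alone: wraps judge answers between the
# "## Start/End of Judge Responses ##" markers seen above.
convo = build_dynamic_mixture_prompt(["Paris.", "The capital is Paris."])

# The full judge: custom_prompt / custom_conversation are read from
# self.kwargs.get(...) inside generate() and a_generate().
moj = MixtureOfJudges(
    models=["QWEN", "LLAMA3_70B_INSTRUCT_TURBO", "MISTRAL_8x22B_INSTRUCT"],
    aggregator="gpt-4.1",
)
answer = moj.generate("What is the capital of France?")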
judgeval/judges/together_judge.py CHANGED
@@ -3,16 +3,20 @@ Implementation of using TogetherAI inference for judges.
 """
 
 from pydantic import BaseModel
-from typing import List, Union, Mapping
+from typing import List, Union
 from judgeval.common.logger import debug, error
 
 from judgeval.judges import JudgevalJudge
-from judgeval.common.utils import fetch_together_api_response, afetch_together_api_response
+from judgeval.common.utils import (
+    fetch_together_api_response,
+    afetch_together_api_response,
+)
 
 BASE_CONVERSATION = [
     {"role": "system", "content": "You are a helpful assistant."},
 ]
 
+
 class TogetherJudge(JudgevalJudge):
     def __init__(self, model: str = "Qwen/Qwen2.5-72B-Instruct-Turbo", **kwargs):
         debug(f"Initializing TogetherJudge with model={model}")
@@ -21,27 +25,37 @@ class TogetherJudge(JudgevalJudge):
         super().__init__(model_name=model)
 
     # TODO: Fix cost for generate and a_generate
-    def generate(self, input: Union[str, List[Mapping[str, str]]], schema: BaseModel = None) -> str:
+    def generate(self, input: Union[str, List[dict]], schema: BaseModel = None) -> str:
         debug(f"Generating response for input type: {type(input)}")
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
-            return fetch_together_api_response(self.model, convo, response_format=schema)
+            return fetch_together_api_response(
+                self.model, convo, response_format=schema
+            )
         elif isinstance(input, list):
             convo = input
-            return fetch_together_api_response(self.model, convo, response_format=schema)
+            return fetch_together_api_response(
+                self.model, convo, response_format=schema
+            )
         else:
             error(f"Invalid input type received: {type(input)}")
             raise TypeError("Input must be a string or a list of dictionaries.")
 
-    async def a_generate(self, input: Union[str, List[dict]], schema: BaseModel = None) -> str:
+    async def a_generate(
+        self, input: Union[str, List[dict]], schema: BaseModel = None
+    ) -> str:
         debug(f"Async generating response for input type: {type(input)}")
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
-            res = await afetch_together_api_response(self.model, convo, response_format=schema)
+            res = await afetch_together_api_response(
+                self.model, convo, response_format=schema
+            )
             return res
         elif isinstance(input, list):
             convo = input
-            res = await afetch_together_api_response(self.model, convo, response_format=schema)
+            res = await afetch_together_api_response(
+                self.model, convo, response_format=schema
+            )
             return res
         else:
             error(f"Invalid input type received: {type(input)}")
@@ -52,4 +66,3 @@ class TogetherJudge(JudgevalJudge):
 
     def get_model_name(self) -> str:
         return self.model
-
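TogetherJudge gets the same treatment: the unused `Mapping` import goes away, annotations narrow to `dict`, and long calls are wrapped. A usage sketch, assuming Together AI credentials are configured for the underlying fetch helpers:

import asyncio

from judgeval.judges import TogetherJudge

judge = TogetherJudge()  # default model: "Qwen/Qwen2.5-72B-Instruct-Turbo"
print(judge.get_model_name())

# Sync and async paths mirror each other.
sync_answer = judge.generate("Summarize the refund policy in one sentence.")
async_answer = asyncio.run(
    judge.a_generate("Summarize the refund policy in one sentence.")
)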
judgeval/judges/utils.py CHANGED
@@ -1,23 +1,30 @@
 """
 This module contains utility functions for judge models.
 """
+
 import litellm
 from typing import Optional, Union, Tuple, List
 
 from judgeval.common.exceptions import InvalidJudgeModelError
 from judgeval.judges import JudgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges
-from judgeval.constants import TOGETHER_SUPPORTED_MODELS, JUDGMENT_SUPPORTED_MODELS, ACCEPTABLE_MODELS
+from judgeval.constants import (
+    TOGETHER_SUPPORTED_MODELS,
+    JUDGMENT_SUPPORTED_MODELS,
+    ACCEPTABLE_MODELS,
+)
 
 LITELLM_SUPPORTED_MODELS = set(litellm.model_list)
 
+
 def create_judge(
-    model: Optional[Union[str, List[str], JudgevalJudge]] = None) -> Tuple[JudgevalJudge, bool]:
+    model: Optional[Union[str, List[str], JudgevalJudge]] = None,
+) -> Tuple[JudgevalJudge, bool]:
     """
     Creates a judge model from string(s) or a judgeval judge object.
 
     If `model` is a single string, it is assumed to be a judge model name.
     If `model` is a list of strings, it is assumed to be a list of judge model names (for MixtureOfJudges).
-    If `model` is a judgeval judge object, it is returned as is. 
+    If `model` is a judgeval judge object, it is returned as is.
 
     Returns a tuple of (initialized judgevalBaseLLM, using_native_model boolean)
     If no model is provided, uses GPT4o as the default judge.
@@ -25,10 +32,12 @@ def create_judge(
     if model is None: # default option
         return LiteLLMJudge(model="gpt-4.1"), True
     if not isinstance(model, (str, list, JudgevalJudge)):
-        raise InvalidJudgeModelError(f"Model must be a string, list of strings, or a judgeval judge object. Got: {type(model)} instead.")
+        raise InvalidJudgeModelError(
+            f"Model must be a string, list of strings, or a judgeval judge object. Got: {type(model)} instead."
+        )
     # If model is already a valid judge type, return it and mark native
     if isinstance(model, (JudgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges)):
-        return model, True
+        return model, True
 
     # Either string or List[str]
     if isinstance(model, list):
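The hunk cuts off before the string/list dispatch, but the docstring above fixes the contract: `None` returns the default `LiteLLMJudge(model="gpt-4.1")`, an existing judge instance is returned as is, and a list of names is meant for `MixtureOfJudges`. A sketch of the call sites; the specific model names are illustrative, and single-name routing depends on the supported-model sets imported above:

from judgeval.judges.utils import create_judge

judge, native = create_judge()           # default judge: LiteLLMJudge("gpt-4.1"), native=True
judge, native = create_judge("gpt-4.1")  # single name, validated against the model sets
judge, native = create_judge(            # list of names: MixtureOfJudges path
    ["gpt-4.1", "Qwen/Qwen2.5-72B-Instruct-Turbo"]
)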