judgeval 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. judgeval/__init__.py +83 -0
  2. judgeval/clients.py +19 -0
  3. judgeval/common/__init__.py +8 -0
  4. judgeval/common/exceptions.py +28 -0
  5. judgeval/common/logger.py +189 -0
  6. judgeval/common/tracer.py +587 -0
  7. judgeval/common/utils.py +763 -0
  8. judgeval/constants.py +55 -0
  9. judgeval/data/__init__.py +14 -0
  10. judgeval/data/api_example.py +111 -0
  11. judgeval/data/datasets/__init__.py +4 -0
  12. judgeval/data/datasets/dataset.py +407 -0
  13. judgeval/data/datasets/ground_truth.py +54 -0
  14. judgeval/data/datasets/utils.py +74 -0
  15. judgeval/data/example.py +76 -0
  16. judgeval/data/result.py +83 -0
  17. judgeval/data/scorer_data.py +86 -0
  18. judgeval/evaluation_run.py +130 -0
  19. judgeval/judges/__init__.py +7 -0
  20. judgeval/judges/base_judge.py +44 -0
  21. judgeval/judges/litellm_judge.py +49 -0
  22. judgeval/judges/mixture_of_judges.py +248 -0
  23. judgeval/judges/together_judge.py +55 -0
  24. judgeval/judges/utils.py +45 -0
  25. judgeval/judgment_client.py +244 -0
  26. judgeval/run_evaluation.py +355 -0
  27. judgeval/scorers/__init__.py +30 -0
  28. judgeval/scorers/base_scorer.py +51 -0
  29. judgeval/scorers/custom_scorer.py +134 -0
  30. judgeval/scorers/judgeval_scorers/__init__.py +21 -0
  31. judgeval/scorers/judgeval_scorers/answer_relevancy.py +19 -0
  32. judgeval/scorers/judgeval_scorers/contextual_precision.py +19 -0
  33. judgeval/scorers/judgeval_scorers/contextual_recall.py +19 -0
  34. judgeval/scorers/judgeval_scorers/contextual_relevancy.py +22 -0
  35. judgeval/scorers/judgeval_scorers/faithfulness.py +19 -0
  36. judgeval/scorers/judgeval_scorers/hallucination.py +19 -0
  37. judgeval/scorers/judgeval_scorers/json_correctness.py +32 -0
  38. judgeval/scorers/judgeval_scorers/summarization.py +20 -0
  39. judgeval/scorers/judgeval_scorers/tool_correctness.py +19 -0
  40. judgeval/scorers/prompt_scorer.py +439 -0
  41. judgeval/scorers/score.py +427 -0
  42. judgeval/scorers/utils.py +175 -0
  43. judgeval-0.0.1.dist-info/METADATA +40 -0
  44. judgeval-0.0.1.dist-info/RECORD +46 -0
  45. judgeval-0.0.1.dist-info/WHEEL +4 -0
  46. judgeval-0.0.1.dist-info/licenses/LICENSE.md +202 -0
judgeval/judges/mixture_of_judges.py
@@ -0,0 +1,248 @@
+ """
+ Implementation for Mixture of Judges model through Judgeval
+
+ Enables client to use multiple models to generate responses and then aggregate them into a single response.
+ """
+ from judgeval import *
+ import pydantic
+ from typing import List, Union, Mapping, Dict
+ from judgeval.judges import judgevalJudge
+ from judgeval.common.utils import get_completion_multiple_models, get_chat_completion, aget_completion_multiple_models, aget_chat_completion
+ from judgeval.common.logger import debug, error
+
+ def build_dynamic_mixture_prompt(
+     judge_responses: List[str],
+     custom_system_prompt: str = None,
+     custom_conversation_history: List[Mapping] = None
+ ) -> List[Mapping]:
+     """
+     Dynamically builds a prompt to mix judge responses together for the Mixture of Judges model.
+
+     In this implementation, we simply concatenate the judge responses into a formatted string, then
+     pass it into a default prompt template. This template can be customized by providing a custom prompt.
+
+     Args:
+         judge_responses (List[str]): List of responses from individual judges to be synthesized
+         custom_system_prompt (str, optional): Custom system prompt to override the default one. Defaults to None.
+         custom_conversation_history (List[Mapping], optional): Custom conversation history to override the default one. Defaults to None.
+     """
+     formatted_responses = "\n".join([f"# Judge {i + 1}'s response: #\n{response}" for i, response in enumerate(judge_responses)])
+
+     # This is the default prompt for the Mixture of Judges model
+     """
+     You are tasked with synthesizing responses from multiple expert judges. You will receive N individual answers on the same topic. Your job is to:
+
+     1. Analyze and compare the key points, patterns, and agreements between the answers.
+     2. Identify the consensus by focusing on areas where most or all of the answers align. Consider common reasoning and frequently mentioned conclusions.
+     3. Condense the responses into a single, coherent, and concise answer that represents the collective judgment of the group.
+     4. When opinions differ or contradict, highlight the most supported viewpoint while briefly acknowledging the dissenting perspectives.
+     5. Ensure the final answer is balanced and clear, providing a comprehensive summary that captures the wisdom of all judges while avoiding repetition.
+
+     ## Start of Judge Responses ##
+     {{judge_responses}}
+     ## End of Judge Responses ##
+     Synthesized response:
+     """
+
+     default_conversation = [ # inject the judge responses into the default prompt
+         {
+             'role': 'system',
+             'content': 'You are tasked with synthesizing responses from multiple expert judges. You will receive N individual answers on the same topic. Your job is to:\n1. Analyze and compare the key points, patterns, and agreements between the answers.\n2. Identify the consensus by focusing on areas where most or all of the answers align. Consider common reasoning and frequently mentioned conclusions.\n3. Condense the responses into a single, coherent, and concise answer that represents the collective judgment of the group.\n4. When opinions differ or contradict, highlight the most supported viewpoint while briefly acknowledging the dissenting perspectives.\n5. Ensure the final answer is balanced and clear, providing a comprehensive summary that captures the wisdom of all judges while avoiding repetition.\n\n**IMPORTANT**: IF THE JUDGE RESPONSES ARE IN JSON FORMAT, YOU MUST RESPOND USING THE SAME JSON FORMAT THAT THE RESPONSES ARE IN. If the judge responses are in JSON, you MUST RESPOND IN VALID JSON FORMAT. '
+         },
+         {
+             'role': 'user',
+             'content': '## Start of Judge Responses ## \n# Judge 1\'s response: #\n{\n"claims": [\n{\n"claim": "A 30-day full refund is offered.",\n"quote": "We offer a 30-day full refund at no extra cost."\n},\n{\n"claim": "The 30-day full refund comes at no extra cost.",\n"quote": "We offer a 30-day full refund at no extra cost."\n}\n]\n}\n\n# Judge 2\'s response: #\n{\n "claims": [\n {\n "claim": "A full refund is offered within 30 days.",\n "quote": "We offer a 30-day full refund at no extra cost."\n },\n {\n "claim": "The 30-day full refund is offered at no extra cost.",\n "quote": "We offer a 30-day full refund at no extra cost."\n }\n ]\n}\n# Judge 3\'s response: #\n {\n "claims": [\n {\n "claim": "A 30-day full refund is offered.",\n "quote": "We offer a 30-day full refund at no extra cost."\n },\n {\n "claim": "The 30-day full refund is offered at no extra cost.",\n "quote": "We offer a 30-day full refund at no extra cost."\n }\n ]\n}\n## End of Judge Responses ##\nSynthesized response:'
+         },
+         {
+             'role': 'assistant',
+             'content': 'The consensus among the judges is clear and unanimous. All three judges agree that a 30-day full refund is offered, and this refund is available at no extra cost. This conclusion is consistently supported by their statements, as each of their claims is directly quoted as: "We offer a 30-day full refund at no extra cost." There are no dissenting perspectives or opposing views provided in any of the responses, indicating complete alignment on this topic.\n\nJSON:\n{\n "claims": [\n {\n "claim": "A full refund is offered within 30 days.",\n "quote": "We offer a 30-day full refund at no extra cost."\n },\n {\n "claim": "The 30-day full refund is offered at no extra cost.",\n "quote": "We offer a 30-day full refund at no extra cost."\n }\n ]\n}'
+         },
+         {
+             'role': 'user',
+             'content': "## Start of Judge Responses ##\n# Judge 1's response: # \nThe capital of France is Paris.\n\n# Judge 2's response: #\nThe capital of France is Paris.\n\n# Judge 3's response: # \nThe capital of France is Paris. It's one of the most popular tourist destinations in the world, known for its art, culture, and history. It's also famous for its iconic landmarks such as the Eiffel Tower, Louvre Museum, and Notre-Dame Cathedral.\n\n## End of Judge Responses ##\nSynthesized response:"
+         },
+         {
+             'role': 'assistant',
+             'content': "The capital of France is Paris. It is widely recognized as one of the world's most popular tourist destinations, celebrated for its rich art, culture, and history. Paris is renowned for its iconic landmarks, including the Eiffel Tower, Louvre Museum, and Notre-Dame Cathedral."
+         },
+         {
+             'role': 'user',
+             'content': f'## Start of Judge Responses ##\n{formatted_responses}\n## End of Judge Responses ##\nSynthesized response:\n'
+         }
+     ]
+
+     # If a custom system prompt is provided, validate and use it
+     if custom_system_prompt is not None:
+         if not isinstance(custom_system_prompt, str):
+             error(f"TypeError: Custom system prompt must be a string. Received: {type(custom_system_prompt)}.")
+             raise TypeError(f"Custom system prompt must be a string. Received: {type(custom_system_prompt)}.")
+         if not custom_system_prompt:
+             error("ValueError: Custom system prompt cannot be empty")
+             raise ValueError("Custom system prompt cannot be empty")
+         # Override the default system prompt, but also add special instructions for handling JSON
+         default_conversation[0]['content'] = custom_system_prompt + "\n\n**IMPORTANT**: IF THE JUDGE RESPONSES ARE IN JSON FORMAT, YOU MUST RESPOND USING THE SAME JSON FORMAT THAT THE RESPONSES ARE IN. If the judge responses are in JSON, you MUST RESPOND IN VALID JSON FORMAT."
+
+     # If a custom conversation history is provided, append the judge responses to it
+     if custom_conversation_history is not None:
+         # Validate custom conversation history format
+         for message in custom_conversation_history:
+             if not isinstance(message, dict):
+                 error(f"TypeError: Custom conversation history must be a list of dictionaries. Received: {message}.")
+                 raise TypeError(f"Custom conversation history must be a list of dictionaries. Received: {message}.")
+
+             if 'role' not in message or 'content' not in message:
+                 error("ValueError: Each message must have 'role' and 'content' keys")
+                 raise ValueError("Each message must have 'role' and 'content' keys")
+
+             if not isinstance(message['role'], str) or not isinstance(message['content'], str):
+                 error(f"TypeError: Message role and content must be strings. Received: {type(message['role'])}, {type(message['content'])}.")
+                 raise TypeError(f"Message role and content must be strings. Received: {type(message['role'])}, {type(message['content'])}.")
+
+             if message['role'] not in ['system', 'user', 'assistant']:
+                 error(f"ValueError: Message role must be one of: 'system', 'user', 'assistant'. Received: {message['role']}.")
+                 raise ValueError(f"Message role must be one of: 'system', 'user', 'assistant'. Received: {message['role']}.")
+
+         judge_responses_prompt = {
+             'role': 'user',
+             'content': f'## Start of Judge Responses ##\n{formatted_responses}\n## End of Judge Responses ##\nSynthesized response:\n'
+         }
+         return custom_conversation_history + [judge_responses_prompt]
+
+     # Otherwise return the default conversation with system prompt and examples
+     # No customization, return the default conversation with system prompt and examples
+     return default_conversation
+
+ BASE_CONVERSATION = [
+     {"role": "system", "content": "You are a helpful assistant."},
+ ] # for string inputs, we need to add the user query to a base conversation, since LiteLLM only accepts a list of dictionaries as a chat history
+ class MixtureOfJudges(judgevalJudge):
+     """
+     IMPORTANT: When supplying custom prompts and conversation histories for aggregation, supply them in the following format:
+     in kwargs:
+     {
+         "custom_prompt": "Your custom prompt here",
+         "custom_conversation": [
+             {"role": "system", "content": "System message 1"},
+             {"role": "user", "content": "User message 1"},
+             {"role": "assistant", "content": "Assistant message 1"},
+             ...
+         ]
+     }
+     """
+     def __init__(self,
+                  models: List[str] = ['QWEN', 'LLAMA3_70B_INSTRUCT_TURBO', 'MISTRAL_8x22B_INSTRUCT'],
+                  aggregator: str = 'gpt-4o',
+                  **kwargs):
+         """
+         `models` are the individual judge models to be used for generating responses.
+         `aggregator` is the model that will aggregate the responses from the individual judges.
+
+         kwargs include "custom_prompt" and "custom_conversation" for customizing the prompt for the Mixture of Judges model.
+         """
+         self.models = models
+         self.aggregator = aggregator
+         self.kwargs = kwargs
+         super().__init__(model_name=models)
+
+     def generate(
+         self,
+         input: Union[str, List[Mapping[str, str]]],
+         response_schema: pydantic.BaseModel = None,
+         aggregation_schema: pydantic.BaseModel = None,
+         **kwargs) -> str:
+         """
+         Args:
+             input (Union[str, List[Mapping[str, str]]]): Input query or conversation history to the model.
+             response_schema (pydantic.BaseModel): Response schema for individual judge models.
+             aggregation_schema (pydantic.BaseModel): Response schema for the aggregator model.
+             kwargs: Additional keyword arguments.
+         """
+         debug(f"Generating response for input type: {type(input)}")
+
+         # Convert input to conversation format if needed
+         if isinstance(input, str):
+             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
+         elif isinstance(input, list):
+             convo = input
+         else:
+             error(f"Invalid input type received: {type(input)}")
+             raise TypeError(f"Input must be a string or a list of dictionaries. Input type of: {type(input)}")
+
+         try:
+             responses = get_completion_multiple_models(
+                 models=self.models,
+                 messages=[convo] * len(self.models),
+                 response_formats=[response_schema] * len(self.models)
+             )
+         except Exception as e:
+             error(f"Error getting completions from multiple models: {str(e)}")
+             raise
+
+         compiled_mixture_prompt = build_dynamic_mixture_prompt(responses, self.kwargs.get('custom_prompt'), self.kwargs.get('custom_conversation'))
+
+         try:
+             mixed_response = get_chat_completion(
+                 model_type=self.aggregator,
+                 messages=compiled_mixture_prompt,
+                 response_format=aggregation_schema,
+             )
+         except Exception as e:
+             error(f"Error getting chat completion from aggregator: {str(e)}")
+             raise
+
+         return mixed_response
+
+     async def a_generate(
+         self,
+         input: Union[str, List[Mapping[str, str]]],
+         response_schema: pydantic.BaseModel = None,
+         aggregation_schema: pydantic.BaseModel = None,
+         **kwargs
+     ) -> str:
+         """
+         Args:
+             input (Union[str, List[Mapping[str, str]]]): Input query or conversation history to the model.
+             response_schema (pydantic.BaseModel): Response schema for individual judge models.
+             aggregation_schema (pydantic.BaseModel): Response schema for the aggregator model.
+             kwargs: Additional keyword arguments.
+         """
+         debug(f"Generating response for input type: {type(input)}")
+
+         # Convert input to conversation format if needed
+         if isinstance(input, str):
+             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
+         elif isinstance(input, list):
+             convo = input
+         else:
+             error(f"Invalid input type received: {type(input)}")
+             raise TypeError(f"Input must be a string or a list of dictionaries. Input type of: {type(input)}")
+
+         try:
+             responses = await aget_completion_multiple_models(
+                 models=self.models,
+                 messages=[convo] * len(self.models),
+                 response_formats=[response_schema] * len(self.models)
+             )
+         except Exception as e:
+             error(f"Error getting async completions from multiple models: {str(e)}")
+             raise
+
+         compiled_mixture_prompt = build_dynamic_mixture_prompt(responses, self.kwargs.get('custom_prompt'), self.kwargs.get('custom_conversation'))
+
+         try:
+             mixed_response = await aget_chat_completion(
+                 model_type=self.aggregator,
+                 messages=compiled_mixture_prompt,
+                 response_format=aggregation_schema,
+             )
+         except Exception as e:
+             error(f"Error getting async chat completion from aggregator: {str(e)}")
+             raise
+
+         return mixed_response
+
+     def load_model(self):
+         return self.models
+
+     def get_model_name(self) -> List[str]:
+         return self.models
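A minimal usage sketch for the MixtureOfJudges class above (not taken from the package itself): it assumes judgeval is installed, that the Together and OpenAI credentials expected by judgeval.common.utils are configured, and that the default model names remain valid. The custom_prompt keyword follows the kwargs format documented in the class docstring and replaces the default synthesis system prompt.

from judgeval.judges import MixtureOfJudges

# Sketch only: model names and provider credentials are assumptions.
judge = MixtureOfJudges(
    models=['QWEN', 'LLAMA3_70B_INSTRUCT_TURBO', 'MISTRAL_8x22B_INSTRUCT'],
    aggregator='gpt-4o',
    custom_prompt="Merge the judge responses into one concise, balanced verdict.",
)

# A plain string is wrapped with BASE_CONVERSATION; a list of
# {'role': ..., 'content': ...} dicts is passed through unchanged.
answer = judge.generate("Is a 30-day full refund offered at no extra cost?")
print(answer)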
judgeval/judges/together_judge.py
@@ -0,0 +1,55 @@
+ """
+ Implementation of using TogetherAI inference for judges.
+ """
+
+ from pydantic import BaseModel
+ from typing import List, Union, Mapping
+ from judgeval.common.logger import debug, error
+
+ from judgeval.judges import judgevalJudge
+ from judgeval.common.utils import fetch_together_api_response, afetch_together_api_response
+
+ BASE_CONVERSATION = [
+     {"role": "system", "content": "You are a helpful assistant."},
+ ]
+
+ class TogetherJudge(judgevalJudge):
+     def __init__(self, model: str = "QWEN", **kwargs):
+         debug(f"Initializing TogetherJudge with model={model}")
+         self.model = model
+         self.kwargs = kwargs
+         super().__init__(model_name=model)
+
+     # TODO: Fix cost for generate and a_generate
+     def generate(self, input: Union[str, List[Mapping[str, str]]], schema: BaseModel = None) -> str:
+         debug(f"Generating response for input type: {type(input)}")
+         if isinstance(input, str):
+             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
+             return fetch_together_api_response(self.model, convo, response_format=schema)
+         elif isinstance(input, list):
+             convo = input
+             return fetch_together_api_response(self.model, convo, response_format=schema)
+         else:
+             error(f"Invalid input type received: {type(input)}")
+             raise TypeError("Input must be a string or a list of dictionaries.")
+
+     async def a_generate(self, input: Union[str, List[dict]], schema: BaseModel = None) -> str:
+         debug(f"Async generating response for input type: {type(input)}")
+         if isinstance(input, str):
+             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
+             res = await afetch_together_api_response(self.model, convo, response_format=schema)
+             return res
+         elif isinstance(input, list):
+             convo = input
+             res = await afetch_together_api_response(self.model, convo, response_format=schema)
+             return res
+         else:
+             error(f"Invalid input type received: {type(input)}")
+             raise TypeError("Input must be a string or a list of dictionaries.")
+
+     def load_model(self) -> str:
+         return self.model
+
+     def get_model_name(self) -> str:
+         return self.model
+
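A hedged sketch of schema-constrained generation with TogetherJudge, not part of the package: it assumes a Together API key is configured and that "QWEN" resolves to a supported model name in judgeval.constants. Whether the structured output comes back as a JSON string or a parsed object depends on fetch_together_api_response.

from pydantic import BaseModel
from judgeval.judges import TogetherJudge

class Verdict(BaseModel):
    verdict: str
    reason: str

judge = TogetherJudge(model="QWEN")

# String input is wrapped with the "helpful assistant" BASE_CONVERSATION;
# the schema is forwarded as response_format to the Together call.
raw = judge.generate("Does the quoted policy support a 30-day refund claim?", schema=Verdict)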
judgeval/judges/utils.py
@@ -0,0 +1,45 @@
+ """
+ This module contains utility functions for judge models.
+ """
+ import litellm
+ from typing import Optional, Union, Tuple, List
+
+ from judgeval.common.exceptions import InvalidJudgeModelError
+ from judgeval.judges import judgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges
+ from judgeval.constants import TOGETHER_SUPPORTED_MODELS
+
+ LITELLM_SUPPORTED_MODELS = set(litellm.model_list)
+
+ def create_judge(
+     model: Optional[Union[str, List[str], judgevalJudge]] = None) -> Tuple[judgevalJudge, bool]:
+     """
+     Creates a judge model from string(s) or a judgeval judge object.
+
+     If `model` is a single string, it is assumed to be a judge model name.
+     If `model` is a list of strings, it is assumed to be a list of judge model names (for MixtureOfJudges).
+     If `model` is a judgeval judge object, it is returned as is.
+
+     Returns a tuple of (initialized judgevalBaseLLM, using_native_model boolean)
+     If no model is provided, uses GPT4o as the default judge.
+     """
+     if model is None: # default option
+         return LiteLLMJudge(model="gpt-4o"), True
+     if not isinstance(model, (str, list, judgevalJudge)):
+         raise InvalidJudgeModelError(f"Model must be a string, list of strings, or a judgeval judge object. Got: {type(model)} instead.")
+     # If model is already a valid judge type, return it and mark native
+     if isinstance(model, (judgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges)):
+         return model, True
+
+     # Either string or List[str]
+     if isinstance(model, list):
+         for m in model:
+             if m not in TOGETHER_SUPPORTED_MODELS and m not in LITELLM_SUPPORTED_MODELS:
+                 raise InvalidJudgeModelError(f"Invalid judge model chosen: {m}")
+         return MixtureOfJudges(models=model), True
+     # If model is a string, check that it corresponds to a valid model
+     if model in LITELLM_SUPPORTED_MODELS:
+         return LiteLLMJudge(model=model), True
+     if model in TOGETHER_SUPPORTED_MODELS:
+         return TogetherJudge(model=model), True
+     else:
+         raise InvalidJudgeModelError(f"Invalid judge model chosen: {model}")
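The dispatch behavior of create_judge can be summarized with a short sketch. The model names below are illustrative; whether a given name is accepted depends on litellm.model_list and TOGETHER_SUPPORTED_MODELS at runtime.

from judgeval.judges.utils import create_judge
from judgeval.judges import TogetherJudge

judge, native = create_judge()                    # no model -> LiteLLMJudge("gpt-4o"), True
judge, native = create_judge("gpt-4o")            # known LiteLLM name -> LiteLLMJudge
judge, native = create_judge(["QWEN", "gpt-4o"])  # list of names -> MixtureOfJudges
judge, native = create_judge(TogetherJudge())     # existing judge object -> returned as-is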
judgeval/judgment_client.py
@@ -0,0 +1,244 @@
+ """
+ Implements the JudgmentClient to interact with the Judgment API.
+ """
+ import os
+ from typing import Optional, List, Dict, Any, Union
+ import requests
+
+ from judgeval.constants import ROOT_API
+ from judgeval.data.datasets import EvalDataset
+ from judgeval.data import ScoringResult, Example
+ from judgeval.judges import judgevalJudge
+ from judgeval.scorers import JudgmentScorer, CustomScorer, ClassifierScorer
+ from judgeval.evaluation_run import EvaluationRun
+ from judgeval.run_evaluation import run_eval
+ from judgeval.constants import JUDGMENT_EVAL_FETCH_API_URL
+ from judgeval.common.exceptions import JudgmentAPIError
+ from pydantic import BaseModel
+
+ class EvalRunRequestBody(BaseModel):
+     eval_name: str
+     project_name: str
+     judgment_api_key: str
+
+
+ class JudgmentClient:
+     def __init__(self, judgment_api_key: str = os.getenv("JUDGMENT_API_KEY")):
+         self.judgment_api_key = judgment_api_key
+
+         # Verify API key is valid
+         result, response = self._validate_api_key()
+         if not result:
+             # May be bad to output their invalid API key...
+             raise JudgmentAPIError(f"Issue with passed in Judgment API key: {response}")
+         else:
+             print(f"Successfully initialized JudgmentClient, welcome back {response.get('detail', {}).get('user_name', 'user')}!")
+
+     def run_evaluation(
+         self,
+         examples: List[Example],
+         scorers: List[Union[JudgmentScorer, CustomScorer]],
+         model: Union[str, List[str], judgevalJudge],
+         aggregator: Optional[str] = None,
+         metadata: Optional[Dict[str, Any]] = None,
+         log_results: bool = False,
+         project_name: str = "",
+         eval_run_name: str = "",
+         override: bool = False,
+     ) -> List[ScoringResult]:
+         """
+         Executes an evaluation of `Example`s using one or more `Scorer`s
+         """
+         try:
+             eval = EvaluationRun(
+                 log_results=log_results,
+                 project_name=project_name,
+                 eval_name=eval_run_name,
+                 examples=examples,
+                 scorers=scorers,
+                 model=model,
+                 aggregator=aggregator,
+                 metadata=metadata,
+                 judgment_api_key=self.judgment_api_key
+             )
+             return run_eval(eval, override)
+         except ValueError as e:
+             raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
+
+     def evaluate_dataset(
+         self,
+         dataset: EvalDataset,
+         scorers: List[Union[JudgmentScorer, CustomScorer]],
+         model: Union[str, List[str]],
+         aggregator: Optional[str] = None,
+         metadata: Optional[Dict[str, Any]] = None,
+         project_name: str = "",
+         eval_run_name: str = "",
+         log_results: bool = False
+     ) -> List[ScoringResult]:
+         """
+         Executes an evaluation of a `EvalDataset` using one or more `Scorer`s
+         """
+         try:
+             evaluation_run = EvaluationRun(
+                 log_results=log_results,
+                 project_name=project_name,
+                 eval_name=eval_run_name,
+                 examples=dataset.examples,
+                 scorers=scorers,
+                 model=model,
+                 aggregator=aggregator,
+                 metadata=metadata,
+                 judgment_api_key=self.judgment_api_key
+             )
+             return run_eval(evaluation_run)
+         except ValueError as e:
+             raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}")
+
+     def create_dataset(self) -> EvalDataset:
+         return EvalDataset(judgment_api_key=self.judgment_api_key)
+
+     def push_dataset(self, alias: str, dataset: EvalDataset, overwrite: Optional[bool] = False) -> bool:
+         """
+         Uploads an `EvalDataset` to the Judgment platform for storage.
+
+         Args:
+             alias (str): The name to use for the dataset
+             dataset (EvalDataset): The dataset to upload to Judgment
+             overwrite (Optional[bool]): Whether to overwrite the dataset if it already exists
+
+         Returns:
+             bool: Whether the dataset was successfully uploaded
+         """
+         # Set judgment_api_key just in case it was not set
+         dataset.judgment_api_key = self.judgment_api_key
+         return dataset.push(alias, overwrite)
+
+     def pull_dataset(self, alias: str) -> EvalDataset:
+         """
+         Retrieves a saved `EvalDataset` from the Judgment platform.
+
+         Args:
+             alias (str): The name of the dataset to retrieve
+
+         Returns:
+             EvalDataset: The retrieved dataset
+         """
+         dataset = EvalDataset(judgment_api_key=self.judgment_api_key)
+         dataset.pull(alias)
+         return dataset
+
+     # Maybe add option where you can pass in the EvaluationRun object and it will pull the eval results from the backend
+     def pull_eval(self, project_name: str, eval_run_name: str) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
+         """Pull evaluation results from the server.
+
+         Args:
+             project_name (str): Name of the project
+             eval_run_name (str): Name of the evaluation run
+
+         Returns:
+             Dict[str, Union[str, List[ScoringResult]]]: Dictionary containing:
+                 - id (str): The evaluation run ID
+                 - results (List[ScoringResult]): List of scoring results
+         """
+         eval_run_request_body = EvalRunRequestBody(project_name=project_name,
+                                                    eval_name=eval_run_name,
+                                                    judgment_api_key=self.judgment_api_key)
+         eval_run = requests.post(JUDGMENT_EVAL_FETCH_API_URL,
+                                  json=eval_run_request_body.model_dump())
+         if eval_run.status_code != requests.codes.ok:
+             raise ValueError(f"Error fetching eval results: {eval_run.json()}")
+
+         eval_run_result = [{}]
+         for result in eval_run.json():
+             result_id = result.get("id", "")
+             result_data = result.get("result", dict())
+             filtered_result = {k: v for k, v in result_data.items() if k in ScoringResult.__annotations__}
+             eval_run_result[0]["id"] = result_id
+             eval_run_result[0]["results"] = [ScoringResult(**filtered_result)]
+         return eval_run_result
+
+     def _validate_api_key(self):
+         """
+         Validates that the user api key is valid
+         """
+         response = requests.post(
+             f"{ROOT_API}/validate_api_key/",
+             json={"api_key": self.judgment_api_key}
+         )
+         if response.status_code == 200:
+             return True, response.json()
+         else:
+             return False, response.json().get("detail", "Error validating API key")
+
+     def fetch_classifier_scorer(self, slug: str) -> ClassifierScorer:
+         """
+         Fetches a classifier scorer configuration from the Judgment API.
+
+         Args:
+             slug (str): Slug identifier of the custom scorer to fetch
+
+         Returns:
+             ClassifierScorer: The configured classifier scorer object
+
+         Raises:
+             JudgmentAPIError: If the scorer cannot be fetched or doesn't exist
+         """
+         request_body = {
+             "slug": slug,
+             "judgment_api_key": self.judgment_api_key
+         }
+
+         response = requests.post(
+             f"{ROOT_API}/fetch_scorer/",
+             json=request_body
+         )
+
+         if response.status_code == 500:
+             raise JudgmentAPIError(f"The server is temporarily unavailable. Please try your request again in a few moments. Error details: {response.json().get('detail', '')}")
+         elif response.status_code != 200:
+             raise JudgmentAPIError(f"Failed to fetch classifier scorer '{slug}': {response.json().get('detail', '')}")
+
+         scorer_config = response.json()
+
+         try:
+             return ClassifierScorer(**scorer_config)
+         except Exception as e:
+             raise JudgmentAPIError(f"Failed to create classifier scorer '{slug}' with config {scorer_config}: {str(e)}")
+
+     def push_classifier_scorer(self, scorer: ClassifierScorer, slug: str = None) -> str:
+         """
+         Pushes a classifier scorer configuration to the Judgment API.
+
+         Args:
+             slug (str): Slug identifier for the scorer. If it exists, the scorer will be updated.
+             scorer (ClassifierScorer): The classifier scorer to save
+
+         Returns:
+             str: The slug identifier of the saved scorer
+
+         Raises:
+             JudgmentAPIError: If there's an error saving the scorer
+         """
+         request_body = {
+             "name": scorer.name,
+             "conversation": scorer.conversation,
+             "options": scorer.options,
+             "judgment_api_key": self.judgment_api_key,
+             "slug": slug
+         }
+
+         response = requests.post(
+             f"{ROOT_API}/save_scorer/",
+             json=request_body
+         )
+
+         if response.status_code == 500:
+             raise JudgmentAPIError(f"The server is temporarily unavailable. \
+                 Please try your request again in a few moments. \
+                 Error details: {response.json().get('detail', '')}")
+         elif response.status_code != 200:
+             raise JudgmentAPIError(f"Failed to save classifier scorer: {response.json().get('detail', '')}")
+
+         return response.json()["slug"]
+
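An end-to-end sketch of the JudgmentClient workflow above, assuming JUDGMENT_API_KEY is set and valid. The Example fields (input, actual_output) and the scorer slug are illustrative assumptions and are not confirmed by the files shown in this diff.

from judgeval.judgment_client import JudgmentClient
from judgeval.data import Example

client = JudgmentClient()  # defaults to the JUDGMENT_API_KEY environment variable

# Hypothetical Example fields; see judgeval/data/example.py for the real schema.
example = Example(input="What is your refund policy?",
                  actual_output="We offer a 30-day full refund at no extra cost.")

# Reuse a classifier scorer previously saved under a (hypothetical) slug.
scorer = client.fetch_classifier_scorer("refund-policy-grader")

results = client.run_evaluation(
    examples=[example],
    scorers=[scorer],
    model="gpt-4o",
    project_name="demo_project",
    eval_run_name="demo_run",
    log_results=True,
)

# Fetch the stored results for the same run later.
stored = client.pull_eval(project_name="demo_project", eval_run_name="demo_run")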