judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (171)
  1. judgeval/__init__.py +177 -12
  2. judgeval/api/__init__.py +519 -0
  3. judgeval/api/api_types.py +407 -0
  4. judgeval/cli.py +79 -0
  5. judgeval/constants.py +76 -47
  6. judgeval/data/__init__.py +3 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +15 -56
  9. judgeval/data/judgment_types.py +450 -0
  10. judgeval/data/result.py +29 -73
  11. judgeval/data/scorer_data.py +29 -62
  12. judgeval/data/scripts/fix_default_factory.py +23 -0
  13. judgeval/data/scripts/openapi_transform.py +123 -0
  14. judgeval/data/trace.py +121 -0
  15. judgeval/dataset/__init__.py +264 -0
  16. judgeval/env.py +52 -0
  17. judgeval/evaluation/__init__.py +344 -0
  18. judgeval/exceptions.py +27 -0
  19. judgeval/integrations/langgraph/__init__.py +13 -0
  20. judgeval/integrations/openlit/__init__.py +50 -0
  21. judgeval/judges/__init__.py +2 -3
  22. judgeval/judges/base_judge.py +2 -3
  23. judgeval/judges/litellm_judge.py +100 -20
  24. judgeval/judges/together_judge.py +101 -20
  25. judgeval/judges/utils.py +20 -24
  26. judgeval/logger.py +62 -0
  27. judgeval/prompt/__init__.py +330 -0
  28. judgeval/scorers/__init__.py +18 -25
  29. judgeval/scorers/agent_scorer.py +17 -0
  30. judgeval/scorers/api_scorer.py +45 -41
  31. judgeval/scorers/base_scorer.py +83 -38
  32. judgeval/scorers/example_scorer.py +17 -0
  33. judgeval/scorers/exceptions.py +1 -0
  34. judgeval/scorers/judgeval_scorers/__init__.py +0 -148
  35. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
  36. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
  37. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
  38. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
  39. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
  40. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
  41. judgeval/scorers/score.py +77 -306
  42. judgeval/scorers/utils.py +4 -199
  43. judgeval/tracer/__init__.py +1122 -2
  44. judgeval/tracer/constants.py +1 -0
  45. judgeval/tracer/exporters/__init__.py +40 -0
  46. judgeval/tracer/exporters/s3.py +119 -0
  47. judgeval/tracer/exporters/store.py +59 -0
  48. judgeval/tracer/exporters/utils.py +32 -0
  49. judgeval/tracer/keys.py +63 -0
  50. judgeval/tracer/llm/__init__.py +7 -0
  51. judgeval/tracer/llm/config.py +78 -0
  52. judgeval/tracer/llm/constants.py +9 -0
  53. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  54. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  55. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  56. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  57. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  58. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  59. judgeval/tracer/llm/llm_google/config.py +6 -0
  60. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  61. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  62. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  63. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  64. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  65. judgeval/tracer/llm/llm_openai/config.py +6 -0
  66. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  67. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  68. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  69. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  70. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  71. judgeval/tracer/llm/llm_together/config.py +6 -0
  72. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  73. judgeval/tracer/llm/providers.py +19 -0
  74. judgeval/tracer/managers.py +167 -0
  75. judgeval/tracer/processors/__init__.py +220 -0
  76. judgeval/tracer/utils.py +19 -0
  77. judgeval/trainer/__init__.py +14 -0
  78. judgeval/trainer/base_trainer.py +122 -0
  79. judgeval/trainer/config.py +128 -0
  80. judgeval/trainer/console.py +144 -0
  81. judgeval/trainer/fireworks_trainer.py +396 -0
  82. judgeval/trainer/trainable_model.py +243 -0
  83. judgeval/trainer/trainer.py +70 -0
  84. judgeval/utils/async_utils.py +39 -0
  85. judgeval/utils/decorators/__init__.py +0 -0
  86. judgeval/utils/decorators/dont_throw.py +37 -0
  87. judgeval/utils/decorators/use_once.py +13 -0
  88. judgeval/utils/file_utils.py +97 -0
  89. judgeval/utils/guards.py +36 -0
  90. judgeval/utils/meta.py +27 -0
  91. judgeval/utils/project.py +15 -0
  92. judgeval/utils/serialize.py +253 -0
  93. judgeval/utils/testing.py +70 -0
  94. judgeval/utils/url.py +10 -0
  95. judgeval/utils/version_check.py +28 -0
  96. judgeval/utils/wrappers/README.md +3 -0
  97. judgeval/utils/wrappers/__init__.py +15 -0
  98. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  99. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  100. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  101. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  102. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  103. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  104. judgeval/utils/wrappers/py.typed +0 -0
  105. judgeval/utils/wrappers/utils.py +35 -0
  106. judgeval/version.py +5 -0
  107. judgeval/warnings.py +4 -0
  108. judgeval-0.22.2.dist-info/METADATA +265 -0
  109. judgeval-0.22.2.dist-info/RECORD +112 -0
  110. judgeval-0.22.2.dist-info/entry_points.txt +2 -0
  111. judgeval/clients.py +0 -39
  112. judgeval/common/__init__.py +0 -8
  113. judgeval/common/exceptions.py +0 -28
  114. judgeval/common/logger.py +0 -189
  115. judgeval/common/tracer.py +0 -798
  116. judgeval/common/utils.py +0 -763
  117. judgeval/data/api_example.py +0 -111
  118. judgeval/data/datasets/__init__.py +0 -5
  119. judgeval/data/datasets/dataset.py +0 -286
  120. judgeval/data/datasets/eval_dataset_client.py +0 -193
  121. judgeval/data/datasets/ground_truth.py +0 -54
  122. judgeval/data/datasets/utils.py +0 -74
  123. judgeval/evaluation_run.py +0 -132
  124. judgeval/judges/mixture_of_judges.py +0 -248
  125. judgeval/judgment_client.py +0 -354
  126. judgeval/run_evaluation.py +0 -439
  127. judgeval/scorers/judgeval_scorer.py +0 -140
  128. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
  129. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
  130. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
  131. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
  132. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
  133. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
  134. judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
  135. judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
  136. judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
  137. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
  138. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
  139. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
  140. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
  141. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
  142. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
  143. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
  144. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
  145. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
  146. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
  147. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
  148. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
  149. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
  150. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
  151. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
  152. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
  153. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
  154. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
  155. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
  156. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
  157. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
  158. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
  159. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
  160. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
  161. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
  162. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
  163. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
  164. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
  165. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
  166. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
  167. judgeval/scorers/prompt_scorer.py +0 -439
  168. judgeval-0.0.11.dist-info/METADATA +0 -36
  169. judgeval-0.0.11.dist-info/RECORD +0 -84
  170. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
  171. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
judgeval/data/datasets/utils.py (removed)
@@ -1,74 +0,0 @@
- from typing import List, Optional
-
- from judgeval.data.datasets.ground_truth import GroundTruthExample
- from judgeval.data import Example
-
-
- def examples_to_ground_truths(examples: List[Example]) -> List[GroundTruthExample]:
-     """
-     Convert a list of `Example` objects to a list of `GroundTruthExample` objects.
-
-     Args:
-         examples (List[Example]): A list of `Example` objects to convert.
-
-     Returns:
-         List[GroundTruthExample]: A list of `GroundTruthExample` objects.
-     """
-
-     if not isinstance(examples, list):
-         raise TypeError("Input should be a list of `Example` objects")
-
-     ground_truths = []
-     ground_truths = []
-     for e in examples:
-         g_truth = {
-             "input": e.input,
-             "actual_output": e.actual_output,
-             "expected_output": e.expected_output,
-             "context": e.context,
-             "retrieval_context": e.retrieval_context,
-             "tools_called": e.tools_called,
-             "expected_tools": e.expected_tools,
-         }
-         ground_truths.append(GroundTruthExample(**g_truth))
-     return ground_truths
-
-
- def ground_truths_to_examples(
-     ground_truths: List[GroundTruthExample],
-     _alias: Optional[str] = None,
-     _id: Optional[str] = None,
- ) -> List[Example]:
-     """
-     Converts a list of `GroundTruthExample` objects to a list of `Example` objects.
-
-     Args:
-         ground_truths (List[GroundTruthExample]): A list of `GroundTruthExample` objects to convert.
-         _alias (Optional[str]): The alias of the dataset.
-         _id (Optional[str]): The ID of the dataset.
-
-     Returns:
-         List[Example]: A list of `Example` objects.
-     """
-
-     if not isinstance(ground_truths, list):
-         raise TypeError("Input should be a list of `GroundTruthExample` objects")
-
-     examples = []
-     for index, ground_truth in enumerate(ground_truths):
-         e = Example(
-             input=ground_truth.input,
-             actual_output=ground_truth.actual_output,
-             expected_output=ground_truth.expected_output,
-             context=ground_truth.context,
-             retrieval_context=ground_truth.retrieval_context,
-             additional_metadata=ground_truth.additional_metadata,
-             tools_called=ground_truth.tools_called,
-             expected_tools=ground_truth.expected_tools,
-             comments=ground_truth.comments,
-             _dataset_alias=_alias,
-             _dataset_id=_id,
-             _dataset_rank=index,
-         )
-         examples.append(e)
-     return examples
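
For reference, the two removed helpers above were typically used as a round-trip pair when syncing datasets. A minimal usage sketch, assuming the pre-0.22 `judgeval.data.datasets` package is installed; the `Example` field values and the dataset alias are illustrative only:

# Illustrative only: converts Examples to GroundTruthExamples and back (pre-0.22 API).
from judgeval.data import Example
from judgeval.data.datasets.utils import examples_to_ground_truths, ground_truths_to_examples

examples = [
    Example(
        input="What is your refund policy?",
        actual_output="We offer a 30-day full refund at no extra cost.",
        expected_output="A 30-day full refund is offered.",
    )
]

ground_truths = examples_to_ground_truths(examples)        # Example -> GroundTruthExample
restored = ground_truths_to_examples(ground_truths, _alias="demo-dataset")  # back to Example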
judgeval/evaluation_run.py (removed)
@@ -1,132 +0,0 @@
- from typing import List, Optional, Dict, Any, Union
- from pydantic import BaseModel, field_validator
-
- from judgeval.data import Example
- from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
- from judgeval.constants import ACCEPTABLE_MODELS
- from judgeval.common.logger import debug, error
- from judgeval.judges import JudgevalJudge
-
- class EvaluationRun(BaseModel):
-     """
-     Stores example and evaluation scorers together for running an eval task
-
-     Args:
-         project_name (str): The name of the project the evaluation results belong to
-         eval_name (str): A name for this evaluation run
-         examples (List[Example]): The examples to evaluate
-         scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
-         model (str): The model used as a judge when using LLM as a Judge
-         aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
-         metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
-         judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
-     """
-
-     # The user will specify whether they want log_results when they call run_eval
-     log_results: bool = False  # NOTE: log_results has to be set first because it is used to validate project_name and eval_name
-     project_name: Optional[str] = None
-     eval_name: Optional[str] = None
-     examples: List[Example]
-     scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
-     model: Union[str, List[str], JudgevalJudge]
-     aggregator: Optional[str] = None
-     metadata: Optional[Dict[str, Any]] = None
-     # API Key will be "" until user calls client.run_eval(), then API Key will be set
-     judgment_api_key: Optional[str] = ""
-     override: Optional[bool] = False
-
-     def model_dump(self, **kwargs):
-         data = super().model_dump(**kwargs)
-
-         data["scorers"] = [
-             scorer.to_dict() if hasattr(scorer, "to_dict")
-             else scorer.model_dump() if hasattr(scorer, "model_dump")
-             else {"score_type": scorer.score_type, "threshold": scorer.threshold}
-             for scorer in self.scorers
-         ]
-         return data
-
-     @field_validator('log_results', mode='before')
-     def validate_log_results(cls, v):
-         if not isinstance(v, bool):
-             raise ValueError(f"log_results must be a boolean. Received {v} of type {type(v)}")
-         return v
-
-     @field_validator('project_name')
-     def validate_project_name(cls, v, values):
-         if values.data.get('log_results', False) and not v:
-             debug("No project name provided when log_results is True")
-             error("Validation failed: Project name required when logging results")
-             raise ValueError("Project name is required when log_results is True. Please include the project_name argument.")
-         return v
-
-     @field_validator('eval_name')
-     def validate_eval_name(cls, v, values):
-         if values.data.get('log_results', False) and not v:
-             debug("No eval name provided when log_results is True")
-             error("Validation failed: Eval name required when logging results")
-             raise ValueError("Eval name is required when log_results is True. Please include the eval_run_name argument.")
-         return v
-
-     @field_validator('examples')
-     def validate_examples(cls, v):
-         if not v:
-             raise ValueError("Examples cannot be empty.")
-         for ex in v:
-             if not isinstance(ex, Example):
-                 raise ValueError(f"Invalid type for Example: {type(ex)}")
-         return v
-
-     @field_validator('scorers')
-     def validate_scorers(cls, v):
-         if not v:
-             raise ValueError("Scorers cannot be empty.")
-         for s in v:
-             if not isinstance(s, APIJudgmentScorer) and not isinstance(s, JudgevalScorer):
-                 raise ValueError(f"Invalid type for Scorer: {type(s)}")
-         return v
-
-     @field_validator('model')
-     def validate_model(cls, v, values):
-         if not v:
-             raise ValueError("Model cannot be empty.")
-
-         # Check if model is a judgevalJudge
-         if isinstance(v, JudgevalJudge):
-             # Verify all scorers are JudgevalScorer when using judgevalJudge
-             scorers = values.data.get('scorers', [])
-             if not all(isinstance(s, JudgevalScorer) for s in scorers):
-                 raise ValueError("When using a judgevalJudge model, all scorers must be JudgevalScorer type")
-             return v
-
-         # Check if model is string or list of strings
-         if isinstance(v, str):
-             if v not in ACCEPTABLE_MODELS:
-                 raise ValueError(f"Model name {v} not recognized.")
-             return v
-
-         if isinstance(v, list):
-             if not all(isinstance(m, str) for m in v):
-                 raise ValueError("When providing a list of models, all elements must be strings")
-             for m in v:
-                 if m not in ACCEPTABLE_MODELS:
-                     raise ValueError(f"Model name {m} not recognized.")
-             return v
-         raise ValueError(f"Model must be one of: string, list of strings, or JudgevalJudge instance. Received type {type(v)}.")
-
-     @field_validator('aggregator', mode='before')
-     def validate_aggregator(cls, v, values):
-         model = values.data.get('model')
-         if isinstance(model, list) and v is None:
-             raise ValueError("Aggregator cannot be empty.")
-
-         if isinstance(model, list) and not isinstance(v, str):
-             raise ValueError("Aggregator must be a string if provided.")
-
-         if v is not None and v not in ACCEPTABLE_MODELS:
-             raise ValueError(f"Model name {v} not recognized.")
-
-         return v
-
-     class Config:
-         arbitrary_types_allowed = True
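
For reference, a minimal sketch of how the removed `EvaluationRun` model was constructed, based only on the fields and validators shown above. The scorer class and its `threshold` keyword, and the `"gpt-4o"` model name, are illustrative assumptions; in practice the scorer had to be an `APIJudgmentScorer`/`JudgevalScorer` and the model had to appear in `ACCEPTABLE_MODELS`:

# Illustrative only (pre-0.22 API); FaithfulnessScorer stands in for any APIJudgmentScorer.
from judgeval.data import Example
from judgeval.evaluation_run import EvaluationRun
from judgeval.scorers import FaithfulnessScorer

run = EvaluationRun(
    log_results=True,                      # required before project_name/eval_name validation
    project_name="demo-project",
    eval_name="refund-policy-eval",
    examples=[Example(input="What is your refund policy?",
                      actual_output="We offer a 30-day full refund at no extra cost.")],
    scorers=[FaithfulnessScorer(threshold=0.8)],
    model="gpt-4o",                        # a single judge model; a list would require an aggregator
)
payload = run.model_dump()                 # scorers serialized via to_dict()/model_dump() as above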
judgeval/judges/mixture_of_judges.py (removed)
@@ -1,248 +0,0 @@
- """
- Implementation for Mixture of Judges model through Judgeval
-
- Enables client to use multiple models to generate responses and then aggregate them into a single response.
- """
- from judgeval import *
- import pydantic
- from typing import List, Union, Mapping, Dict
- from judgeval.judges import JudgevalJudge
- from judgeval.common.utils import get_completion_multiple_models, get_chat_completion, aget_completion_multiple_models, aget_chat_completion
- from judgeval.common.logger import debug, error
-
- def build_dynamic_mixture_prompt(
-     judge_responses: List[str],
-     custom_system_prompt: str = None,
-     custom_conversation_history: List[Mapping] = None
- ) -> List[Mapping]:
-     """
-     Dynamically builds a prompt to mix judge responses together for the Mixture of Judges model.
-
-     In this implementation, we simply concatenate the judge responses into a formatted string, then
-     pass it into a default prompt template. This template can be customized by providing a custom prompt.
-
-     Args:
-         judge_responses (List[str]): List of responses from individual judges to be synthesized
-         custom_system_prompt (str, optional): Custom system prompt to override the default one. Defaults to None.
-         custom_conversation_history (List[Mapping], optional): Custom conversation history to override the default one. Defaults to None.
-     """
-     formatted_responses = "\n".join([f"# Judge {i + 1}'s response: #\n{response}" for i, response in enumerate(judge_responses)])
-
-     # This is the default prompt for the Mixture of Judges model
-     """
-     You are tasked with synthesizing responses from multiple expert judges. You will receive N individual answers on the same topic. Your job is to:
-
-     1. Analyze and compare the key points, patterns, and agreements between the answers.
-     2. Identify the consensus by focusing on areas where most or all of the answers align. Consider common reasoning and frequently mentioned conclusions.
-     3. Condense the responses into a single, coherent, and concise answer that represents the collective judgment of the group.
-     4. When opinions differ or contradict, highlight the most supported viewpoint while briefly acknowledging the dissenting perspectives.
-     5. Ensure the final answer is balanced and clear, providing a comprehensive summary that captures the wisdom of all judges while avoiding repetition.
-
-     ## Start of Judge Responses ##
-     {{judge_responses}}
-     ## End of Judge Responses ##
-     Synthesized response:
-     """
-
-     default_conversation = [  # inject the judge responses into the default prompt
-         {
-             'role': 'system',
-             'content': 'You are tasked with synthesizing responses from multiple expert judges. You will receive N individual answers on the same topic. Your job is to:\n1. Analyze and compare the key points, patterns, and agreements between the answers.\n2. Identify the consensus by focusing on areas where most or all of the answers align. Consider common reasoning and frequently mentioned conclusions.\n3. Condense the responses into a single, coherent, and concise answer that represents the collective judgment of the group.\n4. When opinions differ or contradict, highlight the most supported viewpoint while briefly acknowledging the dissenting perspectives.\n5. Ensure the final answer is balanced and clear, providing a comprehensive summary that captures the wisdom of all judges while avoiding repetition.\n\n**IMPORTANT**: IF THE JUDGE RESPONSES ARE IN JSON FORMAT, YOU MUST RESPOND USING THE SAME JSON FORMAT THAT THE RESPONSES ARE IN. If the judge responses are in JSON, you MUST RESPOND IN VALID JSON FORMAT. '
-         },
-         {
-             'role': 'user',
-             'content': '## Start of Judge Responses ## \n# Judge 1\'s response: #\n{\n"claims": [\n{\n"claim": "A 30-day full refund is offered.",\n"quote": "We offer a 30-day full refund at no extra cost."\n},\n{\n"claim": "The 30-day full refund comes at no extra cost.",\n"quote": "We offer a 30-day full refund at no extra cost."\n}\n]\n}\n\n# Judge 2\'s response: #\n{\n "claims": [\n {\n "claim": "A full refund is offered within 30 days.",\n "quote": "We offer a 30-day full refund at no extra cost."\n },\n {\n "claim": "The 30-day full refund is offered at no extra cost.",\n "quote": "We offer a 30-day full refund at no extra cost."\n }\n ]\n}\n# Judge 3\'s response: #\n {\n "claims": [\n {\n "claim": "A 30-day full refund is offered.",\n "quote": "We offer a 30-day full refund at no extra cost."\n },\n {\n "claim": "The 30-day full refund is offered at no extra cost.",\n "quote": "We offer a 30-day full refund at no extra cost."\n }\n ]\n}\n## End of Judge Responses ##\nSynthesized response:'
-         },
-         {
-             'role': 'assistant',
-             'content': 'The consensus among the judges is clear and unanimous. All three judges agree that a 30-day full refund is offered, and this refund is available at no extra cost. This conclusion is consistently supported by their statements, as each of their claims is directly quoted as: "We offer a 30-day full refund at no extra cost." There are no dissenting perspectives or opposing views provided in any of the responses, indicating complete alignment on this topic.\n\nJSON:\n{\n "claims": [\n {\n "claim": "A full refund is offered within 30 days.",\n "quote": "We offer a 30-day full refund at no extra cost."\n },\n {\n "claim": "The 30-day full refund is offered at no extra cost.",\n "quote": "We offer a 30-day full refund at no extra cost."\n }\n ]\n}'
-         },
-         {
-             'role': 'user',
-             'content': "## Start of Judge Responses ##\n# Judge 1's response: # \nThe capital of France is Paris.\n\n# Judge 2's response: #\nThe capital of France is Paris.\n\n# Judge 3's response: # \nThe capital of France is Paris. It's one of the most popular tourist destinations in the world, known for its art, culture, and history. It's also famous for its iconic landmarks such as the Eiffel Tower, Louvre Museum, and Notre-Dame Cathedral.\n\n## End of Judge Responses ##\nSynthesized response:"
-         },
-         {
-             'role': 'assistant',
-             'content': "The capital of France is Paris. It is widely recognized as one of the world's most popular tourist destinations, celebrated for its rich art, culture, and history. Paris is renowned for its iconic landmarks, including the Eiffel Tower, Louvre Museum, and Notre-Dame Cathedral."
-         },
-         {
-             'role': 'user',
-             'content': f'## Start of Judge Responses ##\n{formatted_responses}\n## End of Judge Responses ##\nSynthesized response:\n'
-         }
-     ]
-
-     # If a custom system prompt is provided, validate and use it
-     if custom_system_prompt is not None:
-         if not isinstance(custom_system_prompt, str):
-             error(f"TypeError: Custom system prompt must be a string. Received: {type(custom_system_prompt)}.")
-             raise TypeError(f"Custom system prompt must be a string. Received: {type(custom_system_prompt)}.")
-         if not custom_system_prompt:
-             error("ValueError: Custom system prompt cannot be empty")
-             raise ValueError("Custom system prompt cannot be empty")
-         # Override the default system prompt, but also add special instructions for handling JSON
-         default_conversation[0]['content'] = custom_system_prompt + "\n\n**IMPORTANT**: IF THE JUDGE RESPONSES ARE IN JSON FORMAT, YOU MUST RESPOND USING THE SAME JSON FORMAT THAT THE RESPONSES ARE IN. If the judge responses are in JSON, you MUST RESPOND IN VALID JSON FORMAT."
-
-     # If a custom conversation history is provided, append the judge responses to it
-     if custom_conversation_history is not None:
-         # Validate custom conversation history format
-         for message in custom_conversation_history:
-             if not isinstance(message, dict):
-                 error(f"TypeError: Custom conversation history must be a list of dictionaries. Received: {message}.")
-                 raise TypeError(f"Custom conversation history must be a list of dictionaries. Received: {message}.")
-
-             if 'role' not in message or 'content' not in message:
-                 error("ValueError: Each message must have 'role' and 'content' keys")
-                 raise ValueError("Each message must have 'role' and 'content' keys")
-
-             if not isinstance(message['role'], str) or not isinstance(message['content'], str):
-                 error(f"TypeError: Message role and content must be strings. Received: {type(message['role'])}, {type(message['content'])}.")
-                 raise TypeError(f"Message role and content must be strings. Received: {type(message['role'])}, {type(message['content'])}.")
-
-             if message['role'] not in ['system', 'user', 'assistant']:
-                 error(f"ValueError: Message role must be one of: 'system', 'user', 'assistant'. Received: {message['role']}.")
-                 raise ValueError(f"Message role must be one of: 'system', 'user', 'assistant'. Received: {message['role']}.")
-
-         judge_responses_prompt = {
-             'role': 'user',
-             'content': f'## Start of Judge Responses ##\n{formatted_responses}\n## End of Judge Responses ##\nSynthesized response:\n'
-         }
-         return custom_conversation_history + [judge_responses_prompt]
-
-     # Otherwise return the default conversation with system prompt and examples
-     # No customization, return the default conversation with system prompt and examples
-     return default_conversation
-
- BASE_CONVERSATION = [
-     {"role": "system", "content": "You are a helpful assistant."},
- ]  # for string inputs, we need to add the user query to a base conversation, since LiteLLM only accepts a list of dictionaries as a chat history
- class MixtureOfJudges(JudgevalJudge):
-     """
-     IMPORTANT: When supplying custom prompts and conversation histories for aggregation, supply them in the following format:
-     in kwargs:
-     {
-         "custom_prompt": "Your custom prompt here",
-         "custom_conversation": [
-             {"role": "system", "content": "System message 1"},
-             {"role": "user", "content": "User message 1"},
-             {"role": "assistant", "content": "Assistant message 1"},
-             ...
-         ]
-     }
-     """
-     def __init__(self,
-                  models: List[str] = ['QWEN', 'LLAMA3_70B_INSTRUCT_TURBO', 'MISTRAL_8x22B_INSTRUCT'],
-                  aggregator: str = 'gpt-4o',
-                  **kwargs):
-         """
-         `models` are the individual judge models to be used for generating responses.
-         `aggregator` is the model that will aggregate the responses from the individual judges.
-
-         kwargs include "custom_prompt" and "custom_conversation" for customizing the prompt for the Mixture of Judges model.
-         """
-         self.models = models
-         self.aggregator = aggregator
-         self.kwargs = kwargs
-         super().__init__(model_name=models)
-
-     def generate(
-         self,
-         input: Union[str, List[Mapping[str, str]]],
-         response_schema: pydantic.BaseModel = None,
-         aggregation_schema: pydantic.BaseModel = None,
-         **kwargs) -> str:
-         """
-         Args:
-             input (Union[str, List[Mapping[str, str]]]): Input query or conversation history to the model.
-             response_schema (pydantic.BaseModel): Response schema for individual judge models.
-             aggregation_schema (pydantic.BaseModel): Response schema for the aggregator model.
-             kwargs: Additional keyword arguments.
-         """
-         debug(f"Generating response for input type: {type(input)}")
-
-         # Convert input to conversation format if needed
-         if isinstance(input, str):
-             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
-         elif isinstance(input, list):
-             convo = input
-         else:
-             error(f"Invalid input type received: {type(input)}")
-             raise TypeError(f"Input must be a string or a list of dictionaries. Input type of: {type(input)}")
-
-         try:
-             responses = get_completion_multiple_models(
-                 models=self.models,
-                 messages=[convo] * len(self.models),
-                 response_formats=[response_schema] * len(self.models)
-             )
-         except Exception as e:
-             error(f"Error getting completions from multiple models: {str(e)}")
-             raise
-
-         compiled_mixture_prompt = build_dynamic_mixture_prompt(responses, self.kwargs.get('custom_prompt'), self.kwargs.get('custom_conversation'))
-
-         try:
-             mixed_response = get_chat_completion(
-                 model_type=self.aggregator,
-                 messages=compiled_mixture_prompt,
-                 response_format=aggregation_schema,
-             )
-         except Exception as e:
-             error(f"Error getting chat completion from aggregator: {str(e)}")
-             raise
-
-         return mixed_response
-
-     async def a_generate(
-         self,
-         input: Union[str, List[Mapping[str, str]]],
-         response_schema: pydantic.BaseModel = None,
-         aggregation_schema: pydantic.BaseModel = None,
-         **kwargs
-     ) -> str:
-         """
-         Args:
-             input (Union[str, List[Mapping[str, str]]]): Input query or conversation history to the model.
-             response_schema (pydantic.BaseModel): Response schema for individual judge models.
-             aggregation_schema (pydantic.BaseModel): Response schema for the aggregator model.
-             kwargs: Additional keyword arguments.
-         """
-         debug(f"Generating response for input type: {type(input)}")
-
-         # Convert input to conversation format if needed
-         if isinstance(input, str):
-             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
-         elif isinstance(input, list):
-             convo = input
-         else:
-             error(f"Invalid input type received: {type(input)}")
-             raise TypeError(f"Input must be a string or a list of dictionaries. Input type of: {type(input)}")
-
-         try:
-             responses = await aget_completion_multiple_models(
-                 models=self.models,
-                 messages=[convo] * len(self.models),
-                 response_formats=[response_schema] * len(self.models)
-             )
-         except Exception as e:
-             error(f"Error getting async completions from multiple models: {str(e)}")
-             raise
-
-         compiled_mixture_prompt = build_dynamic_mixture_prompt(responses, self.kwargs.get('custom_prompt'), self.kwargs.get('custom_conversation'))
-
-         try:
-             mixed_response = await aget_chat_completion(
-                 model_type=self.aggregator,
-                 messages=compiled_mixture_prompt,
-                 response_format=aggregation_schema,
-             )
-         except Exception as e:
-             error(f"Error getting async chat completion from aggregator: {str(e)}")
-             raise
-
-         return mixed_response
-
-     def load_model(self):
-         return self.models
-
-     def get_model_name(self) -> List[str]:
-         return self.models
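
For reference, a minimal sketch of how the removed `MixtureOfJudges` judge was driven, based only on the constructor and `generate` method shown above; the model identifiers and the custom prompt text are illustrative:

# Illustrative only (pre-0.22 API); model names must be resolvable by judgeval.common.utils.
from judgeval.judges.mixture_of_judges import MixtureOfJudges

judge = MixtureOfJudges(
    models=["QWEN", "LLAMA3_70B_INSTRUCT_TURBO", "MISTRAL_8x22B_INSTRUCT"],
    aggregator="gpt-4o",
    custom_prompt="Synthesize the judges' answers into one concise verdict.",  # optional override
)

# A plain string is wrapped into BASE_CONVERSATION and fanned out to each judge model,
# then the aggregator model merges the individual responses into one answer.
verdict = judge.generate("Is a 30-day refund offered at no extra cost?")
print(verdict)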