judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl

This diff compares publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (234)
  1. judgeval/__init__.py +173 -10
  2. judgeval/api/__init__.py +523 -0
  3. judgeval/api/api_types.py +413 -0
  4. judgeval/cli.py +112 -0
  5. judgeval/constants.py +7 -30
  6. judgeval/data/__init__.py +1 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +14 -40
  9. judgeval/data/judgment_types.py +396 -146
  10. judgeval/data/result.py +11 -18
  11. judgeval/data/scorer_data.py +3 -26
  12. judgeval/data/scripts/openapi_transform.py +5 -5
  13. judgeval/data/trace.py +115 -194
  14. judgeval/dataset/__init__.py +335 -0
  15. judgeval/env.py +55 -0
  16. judgeval/evaluation/__init__.py +346 -0
  17. judgeval/exceptions.py +28 -0
  18. judgeval/integrations/langgraph/__init__.py +13 -0
  19. judgeval/integrations/openlit/__init__.py +51 -0
  20. judgeval/judges/__init__.py +2 -2
  21. judgeval/judges/litellm_judge.py +77 -16
  22. judgeval/judges/together_judge.py +88 -17
  23. judgeval/judges/utils.py +7 -20
  24. judgeval/judgment_attribute_keys.py +55 -0
  25. judgeval/{common/logger.py → logger.py} +24 -8
  26. judgeval/prompt/__init__.py +330 -0
  27. judgeval/scorers/__init__.py +11 -11
  28. judgeval/scorers/agent_scorer.py +15 -19
  29. judgeval/scorers/api_scorer.py +21 -23
  30. judgeval/scorers/base_scorer.py +54 -36
  31. judgeval/scorers/example_scorer.py +1 -3
  32. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
  33. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
  34. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  35. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
  36. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
  37. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
  38. judgeval/scorers/score.py +64 -47
  39. judgeval/scorers/utils.py +2 -107
  40. judgeval/tracer/__init__.py +1111 -2
  41. judgeval/tracer/constants.py +1 -0
  42. judgeval/tracer/exporters/__init__.py +40 -0
  43. judgeval/tracer/exporters/s3.py +119 -0
  44. judgeval/tracer/exporters/store.py +59 -0
  45. judgeval/tracer/exporters/utils.py +32 -0
  46. judgeval/tracer/keys.py +63 -0
  47. judgeval/tracer/llm/__init__.py +7 -0
  48. judgeval/tracer/llm/config.py +78 -0
  49. judgeval/tracer/llm/constants.py +9 -0
  50. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  51. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  52. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  53. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  54. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  55. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  56. judgeval/tracer/llm/llm_google/config.py +6 -0
  57. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  58. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  59. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  60. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  61. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  62. judgeval/tracer/llm/llm_openai/config.py +6 -0
  63. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  64. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  65. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  66. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  67. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  68. judgeval/tracer/llm/llm_together/config.py +6 -0
  69. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  70. judgeval/tracer/llm/providers.py +19 -0
  71. judgeval/tracer/managers.py +167 -0
  72. judgeval/tracer/processors/__init__.py +220 -0
  73. judgeval/tracer/utils.py +19 -0
  74. judgeval/trainer/__init__.py +14 -0
  75. judgeval/trainer/base_trainer.py +122 -0
  76. judgeval/trainer/config.py +123 -0
  77. judgeval/trainer/console.py +144 -0
  78. judgeval/trainer/fireworks_trainer.py +392 -0
  79. judgeval/trainer/trainable_model.py +252 -0
  80. judgeval/trainer/trainer.py +70 -0
  81. judgeval/utils/async_utils.py +39 -0
  82. judgeval/utils/decorators/__init__.py +0 -0
  83. judgeval/utils/decorators/dont_throw.py +37 -0
  84. judgeval/utils/decorators/use_once.py +13 -0
  85. judgeval/utils/file_utils.py +74 -28
  86. judgeval/utils/guards.py +36 -0
  87. judgeval/utils/meta.py +27 -0
  88. judgeval/utils/project.py +15 -0
  89. judgeval/utils/serialize.py +253 -0
  90. judgeval/utils/testing.py +70 -0
  91. judgeval/utils/url.py +10 -0
  92. judgeval/{version_check.py → utils/version_check.py} +5 -3
  93. judgeval/utils/wrappers/README.md +3 -0
  94. judgeval/utils/wrappers/__init__.py +15 -0
  95. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  96. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  97. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  98. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  99. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  100. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  101. judgeval/utils/wrappers/py.typed +0 -0
  102. judgeval/utils/wrappers/utils.py +35 -0
  103. judgeval/v1/__init__.py +88 -0
  104. judgeval/v1/data/__init__.py +7 -0
  105. judgeval/v1/data/example.py +44 -0
  106. judgeval/v1/data/scorer_data.py +42 -0
  107. judgeval/v1/data/scoring_result.py +44 -0
  108. judgeval/v1/datasets/__init__.py +6 -0
  109. judgeval/v1/datasets/dataset.py +214 -0
  110. judgeval/v1/datasets/dataset_factory.py +94 -0
  111. judgeval/v1/evaluation/__init__.py +6 -0
  112. judgeval/v1/evaluation/evaluation.py +182 -0
  113. judgeval/v1/evaluation/evaluation_factory.py +17 -0
  114. judgeval/v1/instrumentation/__init__.py +6 -0
  115. judgeval/v1/instrumentation/llm/__init__.py +7 -0
  116. judgeval/v1/instrumentation/llm/config.py +78 -0
  117. judgeval/v1/instrumentation/llm/constants.py +11 -0
  118. judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
  119. judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
  120. judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
  121. judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
  122. judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
  123. judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
  124. judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
  125. judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
  126. judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
  127. judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
  128. judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
  129. judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
  130. judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
  131. judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
  132. judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
  133. judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
  134. judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
  135. judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
  136. judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
  137. judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
  138. judgeval/v1/instrumentation/llm/providers.py +19 -0
  139. judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
  140. judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
  141. judgeval/v1/integrations/langgraph/__init__.py +13 -0
  142. judgeval/v1/integrations/openlit/__init__.py +47 -0
  143. judgeval/v1/internal/api/__init__.py +525 -0
  144. judgeval/v1/internal/api/api_types.py +413 -0
  145. judgeval/v1/prompts/__init__.py +6 -0
  146. judgeval/v1/prompts/prompt.py +29 -0
  147. judgeval/v1/prompts/prompt_factory.py +189 -0
  148. judgeval/v1/py.typed +0 -0
  149. judgeval/v1/scorers/__init__.py +6 -0
  150. judgeval/v1/scorers/api_scorer.py +82 -0
  151. judgeval/v1/scorers/base_scorer.py +17 -0
  152. judgeval/v1/scorers/built_in/__init__.py +17 -0
  153. judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
  154. judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
  155. judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
  156. judgeval/v1/scorers/built_in/faithfulness.py +28 -0
  157. judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
  158. judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
  159. judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
  160. judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
  161. judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
  162. judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
  163. judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
  164. judgeval/v1/scorers/scorers_factory.py +49 -0
  165. judgeval/v1/tracer/__init__.py +7 -0
  166. judgeval/v1/tracer/base_tracer.py +520 -0
  167. judgeval/v1/tracer/exporters/__init__.py +14 -0
  168. judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
  169. judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
  170. judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
  171. judgeval/v1/tracer/exporters/span_store.py +50 -0
  172. judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
  173. judgeval/v1/tracer/processors/__init__.py +6 -0
  174. judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
  175. judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
  176. judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
  177. judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
  178. judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
  179. judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
  180. judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
  181. judgeval/v1/tracer/tracer.py +67 -0
  182. judgeval/v1/tracer/tracer_factory.py +38 -0
  183. judgeval/v1/trainers/__init__.py +5 -0
  184. judgeval/v1/trainers/base_trainer.py +62 -0
  185. judgeval/v1/trainers/config.py +123 -0
  186. judgeval/v1/trainers/console.py +144 -0
  187. judgeval/v1/trainers/fireworks_trainer.py +392 -0
  188. judgeval/v1/trainers/trainable_model.py +252 -0
  189. judgeval/v1/trainers/trainers_factory.py +37 -0
  190. judgeval/v1/utils.py +18 -0
  191. judgeval/version.py +5 -0
  192. judgeval/warnings.py +4 -0
  193. judgeval-0.23.0.dist-info/METADATA +266 -0
  194. judgeval-0.23.0.dist-info/RECORD +201 -0
  195. judgeval-0.23.0.dist-info/entry_points.txt +2 -0
  196. judgeval/clients.py +0 -34
  197. judgeval/common/__init__.py +0 -13
  198. judgeval/common/api/__init__.py +0 -3
  199. judgeval/common/api/api.py +0 -352
  200. judgeval/common/api/constants.py +0 -165
  201. judgeval/common/exceptions.py +0 -27
  202. judgeval/common/storage/__init__.py +0 -6
  203. judgeval/common/storage/s3_storage.py +0 -98
  204. judgeval/common/tracer/__init__.py +0 -31
  205. judgeval/common/tracer/constants.py +0 -22
  206. judgeval/common/tracer/core.py +0 -1916
  207. judgeval/common/tracer/otel_exporter.py +0 -108
  208. judgeval/common/tracer/otel_span_processor.py +0 -234
  209. judgeval/common/tracer/span_processor.py +0 -37
  210. judgeval/common/tracer/span_transformer.py +0 -211
  211. judgeval/common/tracer/trace_manager.py +0 -92
  212. judgeval/common/utils.py +0 -940
  213. judgeval/data/datasets/__init__.py +0 -4
  214. judgeval/data/datasets/dataset.py +0 -341
  215. judgeval/data/datasets/eval_dataset_client.py +0 -214
  216. judgeval/data/tool.py +0 -5
  217. judgeval/data/trace_run.py +0 -37
  218. judgeval/evaluation_run.py +0 -75
  219. judgeval/integrations/langgraph.py +0 -843
  220. judgeval/judges/mixture_of_judges.py +0 -286
  221. judgeval/judgment_client.py +0 -369
  222. judgeval/rules.py +0 -521
  223. judgeval/run_evaluation.py +0 -684
  224. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
  225. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
  226. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
  227. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
  228. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
  229. judgeval/utils/alerts.py +0 -93
  230. judgeval/utils/requests.py +0 -50
  231. judgeval-0.1.0.dist-info/METADATA +0 -202
  232. judgeval-0.1.0.dist-info/RECORD +0 -73
  233. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
  234. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/judges/mixture_of_judges.py (file removed in 0.23.0)
@@ -1,286 +0,0 @@
- """
- Implementation for Mixture of Judges model through Judgeval
-
- Enables client to use multiple models to generate responses and then aggregate them into a single response.
- """
-
- import pydantic
- from typing import List, Union
- from judgeval.judges import JudgevalJudge
- from judgeval.common.utils import (
-     get_completion_multiple_models,
-     get_chat_completion,
-     aget_completion_multiple_models,
-     aget_chat_completion,
- )
- from judgeval.common.logger import judgeval_logger
-
-
- def build_dynamic_mixture_prompt(
-     judge_responses: List[str],
-     custom_system_prompt: str | None = None,
-     custom_conversation_history: List[dict] | None = None,
- ) -> List[dict]:
-     """
-     Dynamically builds a prompt to mix judge responses together for the Mixture of Judges model.
-
-     In this implementation, we simply concatenate the judge responses into a formatted string, then
-     pass it into a default prompt template. This template can be customized by providing a custom prompt.
-
-     Args:
-         judge_responses (List[str]): List of responses from individual judges to be synthesized
-         custom_system_prompt (str, optional): Custom system prompt to override the default one. Defaults to None.
-         custom_conversation_history (List[dict], optional): Custom conversation history to override the default one. Defaults to None.
-     """
-     formatted_responses = "\n".join(
-         [
-             f"# Judge {i + 1}'s response: #\n{response}"
-             for i, response in enumerate(judge_responses)
-         ]
-     )
-
-     # This is the default prompt for the Mixture of Judges model
-     """
-     You are tasked with synthesizing responses from multiple expert judges. You will receive N individual answers on the same topic. Your job is to:
-
-     1. Analyze and compare the key points, patterns, and agreements between the answers.
-     2. Identify the consensus by focusing on areas where most or all of the answers align. Consider common reasoning and frequently mentioned conclusions.
-     3. Condense the responses into a single, coherent, and concise answer that represents the collective judgment of the group.
-     4. When opinions differ or contradict, highlight the most supported viewpoint while briefly acknowledging the dissenting perspectives.
-     5. Ensure the final answer is balanced and clear, providing a comprehensive summary that captures the wisdom of all judges while avoiding repetition.
-
-     ## Start of Judge Responses ##
-     {{judge_responses}}
-     ## End of Judge Responses ##
-     Synthesized response:
-     """
-
-     default_conversation = [ # inject the judge responses into the default prompt
-         {
-             "role": "system",
-             "content": "You are tasked with synthesizing responses from multiple expert judges. You will receive N individual answers on the same topic. Your job is to:\n1. Analyze and compare the key points, patterns, and agreements between the answers.\n2. Identify the consensus by focusing on areas where most or all of the answers align. Consider common reasoning and frequently mentioned conclusions.\n3. Condense the responses into a single, coherent, and concise answer that represents the collective judgment of the group.\n4. When opinions differ or contradict, highlight the most supported viewpoint while briefly acknowledging the dissenting perspectives.\n5. Ensure the final answer is balanced and clear, providing a comprehensive summary that captures the wisdom of all judges while avoiding repetition.\n\n**IMPORTANT**: IF THE JUDGE RESPONSES ARE IN JSON FORMAT, YOU MUST RESPOND USING THE SAME JSON FORMAT THAT THE RESPONSES ARE IN. If the judge responses are in JSON, you MUST RESPOND IN VALID JSON FORMAT. ",
-         },
-         {
-             "role": "user",
-             "content": '## Start of Judge Responses ## \n# Judge 1\'s response: #\n{\n"claims": [\n{\n"claim": "A 30-day full refund is offered.",\n"quote": "We offer a 30-day full refund at no extra cost."\n},\n{\n"claim": "The 30-day full refund comes at no extra cost.",\n"quote": "We offer a 30-day full refund at no extra cost."\n}\n]\n}\n\n# Judge 2\'s response: #\n{\n "claims": [\n {\n "claim": "A full refund is offered within 30 days.",\n "quote": "We offer a 30-day full refund at no extra cost."\n },\n {\n "claim": "The 30-day full refund is offered at no extra cost.",\n "quote": "We offer a 30-day full refund at no extra cost."\n }\n ]\n}\n# Judge 3\'s response: #\n {\n "claims": [\n {\n "claim": "A 30-day full refund is offered.",\n "quote": "We offer a 30-day full refund at no extra cost."\n },\n {\n "claim": "The 30-day full refund is offered at no extra cost.",\n "quote": "We offer a 30-day full refund at no extra cost."\n }\n ]\n}\n## End of Judge Responses ##\nSynthesized response:',
-         },
-         {
-             "role": "assistant",
-             "content": 'The consensus among the judges is clear and unanimous. All three judges agree that a 30-day full refund is offered, and this refund is available at no extra cost. This conclusion is consistently supported by their statements, as each of their claims is directly quoted as: "We offer a 30-day full refund at no extra cost." There are no dissenting perspectives or opposing views provided in any of the responses, indicating complete alignment on this topic.\n\nJSON:\n{\n "claims": [\n {\n "claim": "A full refund is offered within 30 days.",\n "quote": "We offer a 30-day full refund at no extra cost."\n },\n {\n "claim": "The 30-day full refund is offered at no extra cost.",\n "quote": "We offer a 30-day full refund at no extra cost."\n }\n ]\n}',
-         },
-         {
-             "role": "user",
-             "content": "## Start of Judge Responses ##\n# Judge 1's response: # \nThe capital of France is Paris.\n\n# Judge 2's response: #\nThe capital of France is Paris.\n\n# Judge 3's response: # \nThe capital of France is Paris. It's one of the most popular tourist destinations in the world, known for its art, culture, and history. It's also famous for its iconic landmarks such as the Eiffel Tower, Louvre Museum, and Notre-Dame Cathedral.\n\n## End of Judge Responses ##\nSynthesized response:",
-         },
-         {
-             "role": "assistant",
-             "content": "The capital of France is Paris. It is widely recognized as one of the world's most popular tourist destinations, celebrated for its rich art, culture, and history. Paris is renowned for its iconic landmarks, including the Eiffel Tower, Louvre Museum, and Notre-Dame Cathedral.",
-         },
-         {
-             "role": "user",
-             "content": f"## Start of Judge Responses ##\n{formatted_responses}\n## End of Judge Responses ##\nSynthesized response:\n",
-         },
-     ]
-
-     # If a custom system prompt is provided, validate and use it
-     if custom_system_prompt is not None:
-         if not isinstance(custom_system_prompt, str):
-             judgeval_logger.error(
-                 f"TypeError: Custom system prompt must be a string. Received: {type(custom_system_prompt)}."
-             )
-             raise TypeError(
-                 f"Custom system prompt must be a string. Received: {type(custom_system_prompt)}."
-             )
-         if not custom_system_prompt:
-             raise ValueError("Custom system prompt cannot be empty")
-         # Override the default system prompt, but also add special instructions for handling JSON
-         default_conversation[0]["content"] = (
-             custom_system_prompt
-             + "\n\n**IMPORTANT**: IF THE JUDGE RESPONSES ARE IN JSON FORMAT, YOU MUST RESPOND USING THE SAME JSON FORMAT THAT THE RESPONSES ARE IN. If the judge responses are in JSON, you MUST RESPOND IN VALID JSON FORMAT."
-         )
-
-     # If a custom conversation history is provided, append the judge responses to it
-     if custom_conversation_history is not None:
-         # Validate custom conversation history format
-         for message in custom_conversation_history:
-             if not isinstance(message, dict):
-                 raise TypeError(
-                     f"Custom conversation history must be a list of dictionaries. Received: {message}."
-                 )
-
-             if "role" not in message or "content" not in message:
-                 raise ValueError("Each message must have 'role' and 'content' keys")
-
-             if not isinstance(message["role"], str) or not isinstance(
-                 message["content"], str
-             ):
-                 raise TypeError(
-                     f"Message role and content must be strings. Received: {type(message['role'])}, {type(message['content'])}."
-                 )
-
-             if message["role"] not in ["system", "user", "assistant"]:
-                 raise ValueError(
-                     f"Message role must be one of: 'system', 'user', 'assistant'. Received: {message['role']}."
-                 )
-
-         judge_responses_prompt = {
-             "role": "user",
-             "content": f"## Start of Judge Responses ##\n{formatted_responses}\n## End of Judge Responses ##\nSynthesized response:\n",
-         }
-         return custom_conversation_history + [judge_responses_prompt]
-
-     # Otherwise return the default conversation with system prompt and examples
-     # No customization, return the default conversation with system prompt and examples
-     return default_conversation
-
-
- BASE_CONVERSATION = [
-     {"role": "system", "content": "You are a helpful assistant."},
- ] # for string inputs, we need to add the user query to a base conversation, since LiteLLM only accepts a list of dictionaries as a chat history
-
-
- class MixtureOfJudges(JudgevalJudge):
-     """
-     IMPORTANT: When supplying custom prompts and conversation histories for aggregation, supply them in the following format:
-     in kwargs:
-     {
-         "custom_prompt": "Your custom prompt here",
-         "custom_conversation": [
-             {"role": "system", "content": "System message 1"},
-             {"role": "user", "content": "User message 1"},
-             {"role": "assistant", "content": "Assistant message 1"},
-             ...
-         ]
-     }
-     """
-
-     def __init__(
-         self,
-         models: List[str] = [
-             "QWEN",
-             "LLAMA3_70B_INSTRUCT_TURBO",
-             "MISTRAL_8x22B_INSTRUCT",
-         ],
-         aggregator: str = "gpt-4.1",
-         **kwargs,
-     ):
-         """
-         `models` are the individual judge models to be used for generating responses.
-         `aggregator` is the model that will aggregate the responses from the individual judges.
-
-         kwargs include "custom_prompt" and "custom_conversation" for customizing the prompt for the Mixture of Judges model.
-         """
-         self.models = models
-         self.aggregator = aggregator
-         self.kwargs = kwargs
-         super().__init__(model_name=models)
-
-     def generate(
-         self,
-         input: Union[str, List[dict]],
-         response_schema: pydantic.BaseModel = None,
-         aggregation_schema: pydantic.BaseModel = None,
-         **kwargs,
-     ) -> str:
-         """
-         Args:
-             input (Union[str, List[Mapping[str, str]]]): Input query or conversation history to the model.
-             response_schema (pydantic.BaseModel): Response schema for individual judge models.
-             aggregation_schema (pydantic.BaseModel): Response schema for the aggregator model.
-             kwargs: Additional keyword arguments.
-         """
-
-         # Convert input to conversation format if needed
-         if isinstance(input, str):
-             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
-         elif isinstance(input, list):
-             convo = input
-         else:
-             judgeval_logger.error(f"Invalid input type received: {type(input)}")
-             raise TypeError(
-                 f"Input must be a string or a list of dictionaries. Input type of: {type(input)}"
-             )
-
-         try:
-             responses = get_completion_multiple_models(
-                 models=self.models,
-                 messages=[convo] * len(self.models),
-                 response_formats=[response_schema] * len(self.models),
-             )
-         except Exception:
-             raise
-
-         compiled_mixture_prompt = build_dynamic_mixture_prompt(
-             responses,
-             self.kwargs.get("custom_prompt"),
-             self.kwargs.get("custom_conversation"),
-         )
-
-         try:
-             mixed_response = get_chat_completion(
-                 model_type=self.aggregator,
-                 messages=compiled_mixture_prompt,
-                 response_format=aggregation_schema,
-             )
-         except Exception:
-             raise
-
-         return mixed_response
-
-     async def a_generate(
-         self,
-         input: Union[str, List[dict]],
-         response_schema: pydantic.BaseModel = None,
-         aggregation_schema: pydantic.BaseModel = None,
-         **kwargs,
-     ) -> str:
-         """
-         Args:
-             input (Union[str, List[Mapping[str, str]]]): Input query or conversation history to the model.
-             response_schema (pydantic.BaseModel): Response schema for individual judge models.
-             aggregation_schema (pydantic.BaseModel): Response schema for the aggregator model.
-             kwargs: Additional keyword arguments.
-         """
-
-         # Convert input to conversation format if needed
-         if isinstance(input, str):
-             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
-         elif isinstance(input, list):
-             convo = input
-         else:
-             judgeval_logger.error(f"Invalid input type received: {type(input)}")
-             raise TypeError(
-                 f"Input must be a string or a list of dictionaries. Input type of: {type(input)}"
-             )
-
-         try:
-             responses = await aget_completion_multiple_models(
-                 models=self.models,
-                 messages=[convo] * len(self.models),
-                 response_formats=[response_schema] * len(self.models),
-             )
-         except Exception:
-             raise
-
-         compiled_mixture_prompt = build_dynamic_mixture_prompt(
-             responses,
-             self.kwargs.get("custom_prompt"),
-             self.kwargs.get("custom_conversation"),
-         )
-
-         try:
-             mixed_response = await aget_chat_completion(
-                 model_type=self.aggregator,
-                 messages=compiled_mixture_prompt,
-                 response_format=aggregation_schema,
-             )
-         except Exception:
-             raise
-
-         return mixed_response
-
-     def load_model(self):
-         return self.models
-
-     def get_model_name(self) -> List[str]:
-         return self.models
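
The hunk above deletes the 0.1.0-era Mixture of Judges judge. For reference, a minimal usage sketch of that removed API, reconstructed only from the code shown above, follows; the prompt text is an illustrative placeholder, and the snippet applies only to a judgeval 0.1.0 install with the relevant provider credentials configured.

from judgeval.judges.mixture_of_judges import MixtureOfJudges

# Build the judge from several individual judge models plus an aggregator model.
# "custom_prompt" is forwarded through **kwargs and replaces the default system prompt.
judge = MixtureOfJudges(
    models=["QWEN", "LLAMA3_70B_INSTRUCT_TURBO", "MISTRAL_8x22B_INSTRUCT"],
    aggregator="gpt-4.1",
    custom_prompt="Synthesize the judges' answers into a single concise response.",
)

# generate() fans the conversation out to every judge model, then asks the
# aggregator to merge the individual answers via build_dynamic_mixture_prompt().
answer = judge.generate("Is a 30-day refund offered at no extra cost?")
print(answer)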
judgeval/judgment_client.py (file removed in 0.23.0)
@@ -1,369 +0,0 @@
- """
- Implements the JudgmentClient to interact with the Judgment API.
- """
-
- import os
- from uuid import uuid4
- from typing import Optional, List, Dict, Any, Union, Callable
-
- from judgeval.data.datasets import EvalDataset, EvalDatasetClient
- from judgeval.data import (
-     ScoringResult,
-     Example,
-     Trace,
- )
- from judgeval.scorers import (
-     APIScorerConfig,
-     BaseScorer,
- )
- from judgeval.evaluation_run import EvaluationRun
- from judgeval.run_evaluation import (
-     run_eval,
-     assert_test,
-     run_trace_eval,
- )
- from judgeval.data.trace_run import TraceRun
- from judgeval.common.api import JudgmentApiClient
- from judgeval.common.exceptions import JudgmentAPIError
- from langchain_core.callbacks import BaseCallbackHandler
- from judgeval.common.tracer import Tracer
- from judgeval.common.utils import validate_api_key
- from pydantic import BaseModel
- from judgeval.common.logger import judgeval_logger
-
-
- class EvalRunRequestBody(BaseModel):
-     eval_name: str
-     project_name: str
-
-
- class DeleteEvalRunRequestBody(BaseModel):
-     eval_names: List[str]
-     project_name: str
-
-
- class SingletonMeta(type):
-     _instances: Dict[type, "JudgmentClient"] = {}
-
-     def __call__(cls, *args, **kwargs):
-         if cls not in cls._instances:
-             instance = super().__call__(*args, **kwargs)
-             cls._instances[cls] = instance
-         return cls._instances[cls]
-
-
- class JudgmentClient(metaclass=SingletonMeta):
-     def __init__(
-         self,
-         api_key: Optional[str] = os.getenv("JUDGMENT_API_KEY"),
-         organization_id: Optional[str] = os.getenv("JUDGMENT_ORG_ID"),
-     ):
-         if not api_key:
-             raise ValueError(
-                 "api_key parameter must be provided. Please provide a valid API key value or set the JUDGMENT_API_KEY environment variable."
-             )
-
-         if not organization_id:
-             raise ValueError(
-                 "organization_id parameter must be provided. Please provide a valid organization ID value or set the JUDGMENT_ORG_ID environment variable."
-             )
-
-         self.judgment_api_key = api_key
-         self.organization_id = organization_id
-         self.api_client = JudgmentApiClient(api_key, organization_id)
-         self.eval_dataset_client = EvalDatasetClient(api_key, organization_id)
-
-         # Verify API key is valid
-         result, response = validate_api_key(api_key)
-         if not result:
-             # May be bad to output their invalid API key...
-             raise JudgmentAPIError(f"Issue with passed in Judgment API key: {response}")
-         else:
-             judgeval_logger.info("Successfully initialized JudgmentClient!")
-
-     def run_trace_evaluation(
-         self,
-         scorers: List[Union[APIScorerConfig, BaseScorer]],
-         examples: Optional[List[Example]] = None,
-         function: Optional[Callable] = None,
-         tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
-         traces: Optional[List[Trace]] = None,
-         tools: Optional[List[Dict[str, Any]]] = None,
-         project_name: str = "default_project",
-         eval_run_name: str = "default_eval_trace",
-         model: Optional[str] = "gpt-4.1",
-         append: bool = False,
-         override: bool = False,
-     ) -> List[ScoringResult]:
-         try:
-             if examples and not function:
-                 raise ValueError("Cannot pass in examples without a function")
-
-             if traces and function:
-                 raise ValueError("Cannot pass in traces and function")
-
-             if examples and traces:
-                 raise ValueError("Cannot pass in both examples and traces")
-
-             trace_run = TraceRun(
-                 project_name=project_name,
-                 eval_name=eval_run_name,
-                 traces=traces,
-                 scorers=scorers,
-                 model=model,
-                 append=append,
-                 organization_id=self.organization_id,
-                 tools=tools,
-             )
-             return run_trace_eval(
-                 trace_run, self.judgment_api_key, override, function, tracer, examples
-             )
-         except ValueError as e:
-             raise ValueError(
-                 f"Please check your TraceRun object, one or more fields are invalid: \n{str(e)}"
-             )
-         except Exception as e:
-             raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
-
-     def run_evaluation(
-         self,
-         examples: List[Example],
-         scorers: List[Union[APIScorerConfig, BaseScorer]],
-         model: Optional[str] = "gpt-4.1",
-         project_name: str = "default_project",
-         eval_run_name: str = "default_eval_run",
-         override: bool = False,
-         append: bool = False,
-     ) -> List[ScoringResult]:
-         """
-         Executes an evaluation of `Example`s using one or more `Scorer`s
-
-         Args:
-             examples (List[Example]): The examples to evaluate
-             scorers (List[Union[APIScorerConfig, BaseScorer]]): A list of scorers to use for evaluation
-             model (str): The model used as a judge when using LLM as a Judge
-             project_name (str): The name of the project the evaluation results belong to
-             eval_run_name (str): A name for this evaluation run
-             override (bool): Whether to override an existing evaluation run with the same name
-             append (bool): Whether to append to an existing evaluation run with the same name
-
-         Returns:
-             List[ScoringResult]: The results of the evaluation
-         """
-         if override and append:
-             raise ValueError(
-                 "Cannot set both override and append to True. Please choose one."
-             )
-
-         try:
-             eval = EvaluationRun(
-                 append=append,
-                 override=override,
-                 project_name=project_name,
-                 eval_name=eval_run_name,
-                 examples=examples,
-                 scorers=scorers,
-                 model=model,
-                 organization_id=self.organization_id,
-             )
-             return run_eval(
-                 eval,
-                 self.judgment_api_key,
-                 override,
-             )
-         except ValueError as e:
-             raise ValueError(
-                 f"Please check your EvaluationRun object, one or more fields are invalid: \n{str(e)}"
-             )
-         except Exception as e:
-             raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
-
-     def create_dataset(self) -> EvalDataset:
-         return self.eval_dataset_client.create_dataset()
-
-     def push_dataset(
-         self,
-         alias: str,
-         dataset: EvalDataset,
-         project_name: str,
-         overwrite: Optional[bool] = False,
-     ) -> bool:
-         """
-         Uploads an `EvalDataset` to the Judgment platform for storage.
-
-         Args:
-             alias (str): The name to use for the dataset
-             dataset (EvalDataset): The dataset to upload to Judgment
-             overwrite (Optional[bool]): Whether to overwrite the dataset if it already exists
-
-         Returns:
-             bool: Whether the dataset was successfully uploaded
-         """
-         # Set judgment_api_key just in case it was not set
-         dataset.judgment_api_key = self.judgment_api_key
-         return self.eval_dataset_client.push(dataset, alias, project_name, overwrite)
-
-     def append_dataset(
-         self, alias: str, examples: List[Example], project_name: str
-     ) -> bool:
-         """
-         Appends an `EvalDataset` to the Judgment platform for storage.
-         """
-         return self.eval_dataset_client.append_examples(alias, examples, project_name)
-
-     def pull_dataset(self, alias: str, project_name: str) -> EvalDataset:
-         """
-         Retrieves a saved `EvalDataset` from the Judgment platform.
-
-         Args:
-             alias (str): The name of the dataset to retrieve
-
-         Returns:
-             EvalDataset: The retrieved dataset
-         """
-         return self.eval_dataset_client.pull(alias, project_name)
-
-     def delete_dataset(self, alias: str, project_name: str) -> bool:
-         """
-         Deletes a saved `EvalDataset` from the Judgment platform.
-         """
-         return self.eval_dataset_client.delete(alias, project_name)
-
-     def pull_project_dataset_stats(self, project_name: str) -> dict:
-         """
-         Retrieves all dataset stats from the Judgment platform for the project.
-
-         Args:
-             project_name (str): The name of the project to retrieve
-
-         Returns:
-             dict: The retrieved dataset stats
-         """
-         return self.eval_dataset_client.pull_project_dataset_stats(project_name)
-
-     # Maybe add option where you can pass in the EvaluationRun object and it will pull the eval results from the backend
-     def pull_eval(
-         self, project_name: str, eval_run_name: str
-     ) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
-         """Pull evaluation results from the server.
-
-         Args:
-             project_name (str): Name of the project
-             eval_run_name (str): Name of the evaluation run
-
-         Returns:
-             Dict[str, Union[str, List[ScoringResult]]]: Dictionary containing:
-                 - id (str): The evaluation run ID
-                 - results (List[ScoringResult]): List of scoring results
-         """
-         return self.api_client.fetch_evaluation_results(project_name, eval_run_name)
-
-     def create_project(self, project_name: str) -> bool:
-         """
-         Creates a project on the server.
-         """
-         self.api_client.create_project(project_name)
-         return True
-
-     def delete_project(self, project_name: str) -> bool:
-         """
-         Deletes a project from the server. Which also deletes all evaluations and traces associated with the project.
-         """
-         self.api_client.delete_project(project_name)
-         return True
-
-     def assert_test(
-         self,
-         examples: List[Example],
-         scorers: List[Union[APIScorerConfig, BaseScorer]],
-         model: Optional[str] = "gpt-4.1",
-         project_name: str = "default_test",
-         eval_run_name: str = str(uuid4()),
-         override: bool = False,
-         append: bool = False,
-     ) -> None:
-         """
-         Asserts a test by running the evaluation and checking the results for success
-
-         Args:
-             examples (List[Example]): The examples to evaluate.
-             scorers (List[Union[APIScorerConfig, BaseScorer]]): A list of scorers to use for evaluation
-             model (str): The model used as a judge when using LLM as a Judge
-             project_name (str): The name of the project the evaluation results belong to
-             eval_run_name (str): A name for this evaluation run
-             override (bool): Whether to override an existing evaluation run with the same name
-             append (bool): Whether to append to an existing evaluation run with the same name
-             async_execution (bool): Whether to run the evaluation asynchronously
-         """
-
-         results: List[ScoringResult]
-
-         results = self.run_evaluation(
-             examples=examples,
-             scorers=scorers,
-             model=model,
-             project_name=project_name,
-             eval_run_name=eval_run_name,
-             override=override,
-             append=append,
-         )
-         assert_test(results)
-
-     def assert_trace_test(
-         self,
-         scorers: List[Union[APIScorerConfig, BaseScorer]],
-         examples: Optional[List[Example]] = None,
-         function: Optional[Callable] = None,
-         tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
-         traces: Optional[List[Trace]] = None,
-         tools: Optional[List[Dict[str, Any]]] = None,
-         model: Optional[str] = "gpt-4.1",
-         project_name: str = "default_test",
-         eval_run_name: str = str(uuid4()),
-         override: bool = False,
-         append: bool = False,
-         async_execution: bool = False,
-     ) -> None:
-         """
-         Asserts a test by running the evaluation and checking the results for success
-
-         Args:
-             examples (List[Example]): The examples to evaluate.
-             scorers (List[Union[APIScorerConfig, BaseScorer]]): A list of scorers to use for evaluation
-             model (str): The model used as a judge when using LLM as a Judge
-             project_name (str): The name of the project the evaluation results belong to
-             eval_run_name (str): A name for this evaluation run
-             override (bool): Whether to override an existing evaluation run with the same name
-             append (bool): Whether to append to an existing evaluation run with the same name
-             function (Optional[Callable]): A function to use for evaluation
-             tracer (Optional[Union[Tracer, BaseCallbackHandler]]): A tracer to use for evaluation
-             tools (Optional[List[Dict[str, Any]]]): A list of tools to use for evaluation
-             async_execution (bool): Whether to run the evaluation asynchronously
-         """
-
-         # Check for enable_param_checking and tools
-         for scorer in scorers:
-             if hasattr(scorer, "kwargs") and scorer.kwargs is not None:
-                 if scorer.kwargs.get("enable_param_checking") is True:
-                     if not tools:
-                         raise ValueError(
-                             f"You must provide the 'tools' argument to assert_test when using a scorer with enable_param_checking=True. If you do not want to do param checking, explicitly set enable_param_checking=False for the {scorer.__name__} scorer."
-                         )
-
-         results: List[ScoringResult]
-
-         results = self.run_trace_evaluation(
-             examples=examples,
-             traces=traces,
-             scorers=scorers,
-             model=model,
-             project_name=project_name,
-             eval_run_name=eval_run_name,
-             override=override,
-             append=append,
-             function=function,
-             tracer=tracer,
-             tools=tools,
-         )
-
-         assert_test(results)
- assert_test(results)