judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (234)
  1. judgeval/__init__.py +173 -10
  2. judgeval/api/__init__.py +523 -0
  3. judgeval/api/api_types.py +413 -0
  4. judgeval/cli.py +112 -0
  5. judgeval/constants.py +7 -30
  6. judgeval/data/__init__.py +1 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +14 -40
  9. judgeval/data/judgment_types.py +396 -146
  10. judgeval/data/result.py +11 -18
  11. judgeval/data/scorer_data.py +3 -26
  12. judgeval/data/scripts/openapi_transform.py +5 -5
  13. judgeval/data/trace.py +115 -194
  14. judgeval/dataset/__init__.py +335 -0
  15. judgeval/env.py +55 -0
  16. judgeval/evaluation/__init__.py +346 -0
  17. judgeval/exceptions.py +28 -0
  18. judgeval/integrations/langgraph/__init__.py +13 -0
  19. judgeval/integrations/openlit/__init__.py +51 -0
  20. judgeval/judges/__init__.py +2 -2
  21. judgeval/judges/litellm_judge.py +77 -16
  22. judgeval/judges/together_judge.py +88 -17
  23. judgeval/judges/utils.py +7 -20
  24. judgeval/judgment_attribute_keys.py +55 -0
  25. judgeval/{common/logger.py → logger.py} +24 -8
  26. judgeval/prompt/__init__.py +330 -0
  27. judgeval/scorers/__init__.py +11 -11
  28. judgeval/scorers/agent_scorer.py +15 -19
  29. judgeval/scorers/api_scorer.py +21 -23
  30. judgeval/scorers/base_scorer.py +54 -36
  31. judgeval/scorers/example_scorer.py +1 -3
  32. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
  33. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
  34. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  35. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
  36. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
  37. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
  38. judgeval/scorers/score.py +64 -47
  39. judgeval/scorers/utils.py +2 -107
  40. judgeval/tracer/__init__.py +1111 -2
  41. judgeval/tracer/constants.py +1 -0
  42. judgeval/tracer/exporters/__init__.py +40 -0
  43. judgeval/tracer/exporters/s3.py +119 -0
  44. judgeval/tracer/exporters/store.py +59 -0
  45. judgeval/tracer/exporters/utils.py +32 -0
  46. judgeval/tracer/keys.py +63 -0
  47. judgeval/tracer/llm/__init__.py +7 -0
  48. judgeval/tracer/llm/config.py +78 -0
  49. judgeval/tracer/llm/constants.py +9 -0
  50. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  51. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  52. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  53. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  54. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  55. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  56. judgeval/tracer/llm/llm_google/config.py +6 -0
  57. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  58. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  59. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  60. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  61. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  62. judgeval/tracer/llm/llm_openai/config.py +6 -0
  63. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  64. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  65. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  66. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  67. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  68. judgeval/tracer/llm/llm_together/config.py +6 -0
  69. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  70. judgeval/tracer/llm/providers.py +19 -0
  71. judgeval/tracer/managers.py +167 -0
  72. judgeval/tracer/processors/__init__.py +220 -0
  73. judgeval/tracer/utils.py +19 -0
  74. judgeval/trainer/__init__.py +14 -0
  75. judgeval/trainer/base_trainer.py +122 -0
  76. judgeval/trainer/config.py +123 -0
  77. judgeval/trainer/console.py +144 -0
  78. judgeval/trainer/fireworks_trainer.py +392 -0
  79. judgeval/trainer/trainable_model.py +252 -0
  80. judgeval/trainer/trainer.py +70 -0
  81. judgeval/utils/async_utils.py +39 -0
  82. judgeval/utils/decorators/__init__.py +0 -0
  83. judgeval/utils/decorators/dont_throw.py +37 -0
  84. judgeval/utils/decorators/use_once.py +13 -0
  85. judgeval/utils/file_utils.py +74 -28
  86. judgeval/utils/guards.py +36 -0
  87. judgeval/utils/meta.py +27 -0
  88. judgeval/utils/project.py +15 -0
  89. judgeval/utils/serialize.py +253 -0
  90. judgeval/utils/testing.py +70 -0
  91. judgeval/utils/url.py +10 -0
  92. judgeval/{version_check.py → utils/version_check.py} +5 -3
  93. judgeval/utils/wrappers/README.md +3 -0
  94. judgeval/utils/wrappers/__init__.py +15 -0
  95. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  96. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  97. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  98. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  99. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  100. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  101. judgeval/utils/wrappers/py.typed +0 -0
  102. judgeval/utils/wrappers/utils.py +35 -0
  103. judgeval/v1/__init__.py +88 -0
  104. judgeval/v1/data/__init__.py +7 -0
  105. judgeval/v1/data/example.py +44 -0
  106. judgeval/v1/data/scorer_data.py +42 -0
  107. judgeval/v1/data/scoring_result.py +44 -0
  108. judgeval/v1/datasets/__init__.py +6 -0
  109. judgeval/v1/datasets/dataset.py +214 -0
  110. judgeval/v1/datasets/dataset_factory.py +94 -0
  111. judgeval/v1/evaluation/__init__.py +6 -0
  112. judgeval/v1/evaluation/evaluation.py +182 -0
  113. judgeval/v1/evaluation/evaluation_factory.py +17 -0
  114. judgeval/v1/instrumentation/__init__.py +6 -0
  115. judgeval/v1/instrumentation/llm/__init__.py +7 -0
  116. judgeval/v1/instrumentation/llm/config.py +78 -0
  117. judgeval/v1/instrumentation/llm/constants.py +11 -0
  118. judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
  119. judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
  120. judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
  121. judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
  122. judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
  123. judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
  124. judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
  125. judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
  126. judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
  127. judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
  128. judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
  129. judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
  130. judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
  131. judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
  132. judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
  133. judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
  134. judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
  135. judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
  136. judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
  137. judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
  138. judgeval/v1/instrumentation/llm/providers.py +19 -0
  139. judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
  140. judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
  141. judgeval/v1/integrations/langgraph/__init__.py +13 -0
  142. judgeval/v1/integrations/openlit/__init__.py +47 -0
  143. judgeval/v1/internal/api/__init__.py +525 -0
  144. judgeval/v1/internal/api/api_types.py +413 -0
  145. judgeval/v1/prompts/__init__.py +6 -0
  146. judgeval/v1/prompts/prompt.py +29 -0
  147. judgeval/v1/prompts/prompt_factory.py +189 -0
  148. judgeval/v1/py.typed +0 -0
  149. judgeval/v1/scorers/__init__.py +6 -0
  150. judgeval/v1/scorers/api_scorer.py +82 -0
  151. judgeval/v1/scorers/base_scorer.py +17 -0
  152. judgeval/v1/scorers/built_in/__init__.py +17 -0
  153. judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
  154. judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
  155. judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
  156. judgeval/v1/scorers/built_in/faithfulness.py +28 -0
  157. judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
  158. judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
  159. judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
  160. judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
  161. judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
  162. judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
  163. judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
  164. judgeval/v1/scorers/scorers_factory.py +49 -0
  165. judgeval/v1/tracer/__init__.py +7 -0
  166. judgeval/v1/tracer/base_tracer.py +520 -0
  167. judgeval/v1/tracer/exporters/__init__.py +14 -0
  168. judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
  169. judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
  170. judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
  171. judgeval/v1/tracer/exporters/span_store.py +50 -0
  172. judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
  173. judgeval/v1/tracer/processors/__init__.py +6 -0
  174. judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
  175. judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
  176. judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
  177. judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
  178. judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
  179. judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
  180. judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
  181. judgeval/v1/tracer/tracer.py +67 -0
  182. judgeval/v1/tracer/tracer_factory.py +38 -0
  183. judgeval/v1/trainers/__init__.py +5 -0
  184. judgeval/v1/trainers/base_trainer.py +62 -0
  185. judgeval/v1/trainers/config.py +123 -0
  186. judgeval/v1/trainers/console.py +144 -0
  187. judgeval/v1/trainers/fireworks_trainer.py +392 -0
  188. judgeval/v1/trainers/trainable_model.py +252 -0
  189. judgeval/v1/trainers/trainers_factory.py +37 -0
  190. judgeval/v1/utils.py +18 -0
  191. judgeval/version.py +5 -0
  192. judgeval/warnings.py +4 -0
  193. judgeval-0.23.0.dist-info/METADATA +266 -0
  194. judgeval-0.23.0.dist-info/RECORD +201 -0
  195. judgeval-0.23.0.dist-info/entry_points.txt +2 -0
  196. judgeval/clients.py +0 -34
  197. judgeval/common/__init__.py +0 -13
  198. judgeval/common/api/__init__.py +0 -3
  199. judgeval/common/api/api.py +0 -352
  200. judgeval/common/api/constants.py +0 -165
  201. judgeval/common/exceptions.py +0 -27
  202. judgeval/common/storage/__init__.py +0 -6
  203. judgeval/common/storage/s3_storage.py +0 -98
  204. judgeval/common/tracer/__init__.py +0 -31
  205. judgeval/common/tracer/constants.py +0 -22
  206. judgeval/common/tracer/core.py +0 -1916
  207. judgeval/common/tracer/otel_exporter.py +0 -108
  208. judgeval/common/tracer/otel_span_processor.py +0 -234
  209. judgeval/common/tracer/span_processor.py +0 -37
  210. judgeval/common/tracer/span_transformer.py +0 -211
  211. judgeval/common/tracer/trace_manager.py +0 -92
  212. judgeval/common/utils.py +0 -940
  213. judgeval/data/datasets/__init__.py +0 -4
  214. judgeval/data/datasets/dataset.py +0 -341
  215. judgeval/data/datasets/eval_dataset_client.py +0 -214
  216. judgeval/data/tool.py +0 -5
  217. judgeval/data/trace_run.py +0 -37
  218. judgeval/evaluation_run.py +0 -75
  219. judgeval/integrations/langgraph.py +0 -843
  220. judgeval/judges/mixture_of_judges.py +0 -286
  221. judgeval/judgment_client.py +0 -369
  222. judgeval/rules.py +0 -521
  223. judgeval/run_evaluation.py +0 -684
  224. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
  225. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
  226. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
  227. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
  228. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
  229. judgeval/utils/alerts.py +0 -93
  230. judgeval/utils/requests.py +0 -50
  231. judgeval-0.1.0.dist-info/METADATA +0 -202
  232. judgeval-0.1.0.dist-info/RECORD +0 -73
  233. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
  234. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0

judgeval/evaluation/__init__.py ADDED
@@ -0,0 +1,346 @@
+ from __future__ import annotations
+
+ import asyncio
+ import concurrent.futures
+ import time
+ import threading
+ from typing import Any, List, Tuple, TYPE_CHECKING
+ from rich import print as rprint
+
+ from judgeval.data import ScorerData, ScoringResult
+ from judgeval.scorers.score import a_execute_scoring
+ from judgeval.api import JudgmentSyncClient
+ from judgeval.env import (
+     JUDGMENT_MAX_CONCURRENT_EVALUATIONS,
+ )
+ from judgeval.exceptions import JudgmentAPIError, JudgmentRuntimeError
+ from judgeval.logger import judgeval_logger
+
+ from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID
+
+ if TYPE_CHECKING:
+     from judgeval.data.evaluation_run import ExampleEvaluationRun
+
+
+ def safe_run_async(coro):
+     """
+     Safely run an async coroutine whether or not there's already an event loop running.
+
+     Args:
+         coro: The coroutine to run
+
+     Returns:
+         The result of the coroutine
+     """
+     try:
+         # Try to get the running loop
+         asyncio.get_running_loop()
+         # If we get here, there's already a loop running
+         # Run in a separate thread to avoid "asyncio.run() cannot be called from a running event loop"
+         with concurrent.futures.ThreadPoolExecutor() as executor:
+             future = executor.submit(asyncio.run, coro)
+             return future.result()
+     except RuntimeError:
+         # No event loop is running, safe to use asyncio.run()
+         return asyncio.run(coro)
+
+
+ def log_evaluation_results(
+     scoring_results: List[Any],
+     run: ExampleEvaluationRun,
+ ) -> str:
+     """
+     Logs evaluation results to the Judgment API database.
+
+     Args:
+         merged_results (List[ScoringResult]): The results to log
+         evaluation_run (EvaluationRun): The evaluation run containing project info and API key
+         judgment_api_key (str): The API key for the Judgment API
+
+     Raises:
+         JudgmentAPIError: If there's an API error during logging
+         ValueError: If there's a validation error with the results
+     """
+     try:
+         if not JUDGMENT_API_KEY or not JUDGMENT_ORG_ID:
+             raise ValueError("API key and organization ID are required")
+
+         api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
+         response = api_client.log_eval_results(
+             {
+                 "results": scoring_results, # type: ignore
+                 "run": run.model_dump(warnings=False), # type: ignore
+             }
+         )
+         url = response.get("ui_results_url")
+         assert url is not None
+         return url
+
+     except Exception as e:
+         judgeval_logger.error(f"Failed to save evaluation results to DB: {str(e)}")
+         raise JudgmentRuntimeError(
+             f"Request failed while saving evaluation results to DB: {str(e)}"
+         )
+
+
+ def _poll_evaluation_until_complete(
+     evaluation_run: ExampleEvaluationRun,
+     expected_examples_count: int,
+     poll_interval_seconds: float = 5,
+     max_failures: int = 5,
+     max_poll_count: int = 60, # This should be equivalent to 5 minutes
+ ) -> Tuple[List[ScoringResult], str]:
+     """
+     Polls until the evaluation is complete and returns the results.
+
+     Args:
+         eval_name (str): Name of the evaluation run
+         project_name (str): Name of the project
+         judgment_api_key (str): API key for authentication
+         organization_id (str): Organization ID for the evaluation
+         poll_interval_seconds (int, optional): Time between status checks in seconds. Defaults to 5.
+         original_examples (List[Example], optional): The original examples sent for evaluation.
+             If provided, will match results with original examples.
+
+     Returns:
+         List[ScoringResult]: The evaluation results
+     """
+     project_name = evaluation_run.project_name
+     experiment_run_id = evaluation_run.id
+
+     if not project_name or not experiment_run_id:
+         raise ValueError("Project name and experiment run ID are required")
+
+     poll_count = 0
+     exception_count = 0
+     if not JUDGMENT_API_KEY or not JUDGMENT_ORG_ID:
+         raise ValueError("Judgment API key and organization ID are required")
+     api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
+     while poll_count < max_poll_count:
+         poll_count += 1
+         try:
+             # Check status
+             results_response = api_client.fetch_experiment_run(
+                 {
+                     "experiment_run_id": experiment_run_id,
+                     "project_name": project_name,
+                 }
+             )
+
+             example_scorer_pairings = results_response.get("results", []) or []
+             if len(example_scorer_pairings) != expected_examples_count:
+                 time.sleep(poll_interval_seconds)
+                 continue
+
+             url = results_response.get("ui_results_url")
+
+             scoring_result_list = []
+             for res in example_scorer_pairings:
+                 example = res.get("data", {}).copy()
+                 example["example_id"] = res.get("example_id")
+                 scoring_result = ScoringResult(
+                     scorers_data=res.get("scorers", []),
+                     success=all(
+                         t.get("success", False) for t in res.get("scorers", [])
+                     ),
+                     data_object=example,
+                 )
+                 scoring_result_list.append(scoring_result)
+
+             assert url is not None
+             return scoring_result_list, url
+         except Exception as e:
+             exception_count += 1
+             if isinstance(e, JudgmentAPIError):
+                 raise
+
+             judgeval_logger.error(f"Error checking evaluation status: {str(e)}")
+             if exception_count > max_failures:
+                 raise JudgmentRuntimeError(
+                     f"Error checking evaluation status after {poll_count} attempts: {str(e)}"
+                 )
+
+             time.sleep(poll_interval_seconds)
+
+     raise JudgmentRuntimeError(
+         f"Error checking evaluation status after {poll_count} attempts"
+     )
+
+
+ def progress_logger(stop_event, msg="Working...", interval=5):
+     start = time.time()
+     while not stop_event.is_set():
+         elapsed = int(time.time() - start)
+         judgeval_logger.info(f"{msg} ({elapsed} sec)")
+         stop_event.wait(interval)
+
+
+ def run_eval(
+     evaluation_run: ExampleEvaluationRun,
+ ) -> List[ScoringResult]:
+     """
+     Executes an evaluation of `Example`s using one or more `Scorer`s
+
+     Args:
+         evaluation_run (ExampleEvaluationRun): Stores example and evaluation together for running
+
+     Returns:
+         List[ScoringResult]: A list of ScoringResult objects
+     """
+     # Check that every example has the same keys
+     keys = evaluation_run.examples[0].get_fields().keys()
+     for example in evaluation_run.examples:
+         current_keys = example.get_fields().keys()
+         if current_keys != keys:
+             raise ValueError(
+                 f"All examples must have the same keys: {current_keys} != {keys}"
+             )
+
+     results: List[ScoringResult] = []
+     url = ""
+
+     if (
+         len(evaluation_run.custom_scorers) > 0
+         and len(evaluation_run.judgment_scorers) > 0
+     ):
+         error_msg = "We currently do not support running both local and Judgment API scorers at the same time. Please run your evaluation with either local scorers or Judgment API scorers, but not both."
+         judgeval_logger.error(error_msg)
+         raise ValueError(error_msg)
+
+     e2b_scorers = [cs for cs in evaluation_run.custom_scorers if cs.server_hosted]
+
+     if evaluation_run.judgment_scorers or e2b_scorers:
+         if evaluation_run.judgment_scorers and e2b_scorers:
+             error_msg = "We currently do not support running both hosted custom scorers and Judgment API scorers at the same time. Please run your evaluation with one or the other, but not both."
+             judgeval_logger.error(error_msg)
+             raise ValueError(error_msg)
+
+         if len(e2b_scorers) > 1:
+             error_msg = "We currently do not support running multiple hosted custom scorers at the same time."
+             judgeval_logger.error(error_msg)
+             raise ValueError(error_msg)
+
+         stop_event = threading.Event()
+         t = threading.Thread(
+             target=progress_logger, args=(stop_event, "Running evaluation...")
+         )
+         t.start()
+         try:
+             if not JUDGMENT_API_KEY or not JUDGMENT_ORG_ID:
+                 raise ValueError("Judgment API key and organization ID are required")
+             api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
+             response = api_client.add_to_run_eval_queue_examples(
+                 evaluation_run.model_dump(warnings=False) # type: ignore
+             )
+
+             if not response.get("success", False):
+                 error_message = response.error
+                 judgeval_logger.error(
+                     f"Error adding evaluation to queue: {error_message}"
+                 )
+                 raise JudgmentRuntimeError(error_message)
+
+             results, url = _poll_evaluation_until_complete(
+                 evaluation_run=evaluation_run,
+                 expected_examples_count=len(evaluation_run.examples),
+             )
+         finally:
+             stop_event.set()
+             t.join()
+     else:
+         results = safe_run_async(
+             a_execute_scoring(
+                 evaluation_run.examples,
+                 evaluation_run.custom_scorers,
+                 model=evaluation_run.model,
+                 throttle_value=0,
+                 max_concurrent=JUDGMENT_MAX_CONCURRENT_EVALUATIONS,
+             )
+         )
+
+         send_results = [
+             scoring_result.model_dump(warnings=False) for scoring_result in results
+         ]
+         url = log_evaluation_results(send_results, evaluation_run)
+     rprint(
+         f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
+     )
+     return results
+
+
+ def assert_test(scoring_results: List[ScoringResult]) -> None:
+     """
+     Collects all failed scorers from the scoring results.
+
+     Args:
+         ScoringResults (List[ScoringResult]): List of scoring results to check
+
+     Returns:
+         None. Raises exceptions for any failed test cases.
+     """
+     failed_cases: List[List[ScorerData]] = []
+
+     for result in scoring_results:
+         if not result.success:
+             # Create a test case context with all relevant fields
+             test_case: List[ScorerData] = []
+             if result.scorers_data:
+                 # If the result was not successful, check each scorer_data
+                 for scorer_data in result.scorers_data:
+                     if not scorer_data.success:
+                         test_case.append(scorer_data)
+             failed_cases.append(test_case)
+
+     if failed_cases:
+         error_msg = "The following test cases failed: \n"
+         for fail_case in failed_cases:
+             for fail_scorer in fail_case:
+                 error_msg += (
+                     f"\nScorer Name: {fail_scorer.name}\n"
+                     f"Threshold: {fail_scorer.threshold}\n"
+                     f"Success: {fail_scorer.success}\n"
+                     f"Score: {fail_scorer.score}\n"
+                     f"Reason: {fail_scorer.reason}\n"
+                     f"Strict Mode: {fail_scorer.strict_mode}\n"
+                     f"Evaluation Model: {fail_scorer.evaluation_model}\n"
+                     f"Error: {fail_scorer.error}\n"
+                     f"Additional Metadata: {fail_scorer.additional_metadata}\n"
+                 )
+             error_msg += "-" * 100
+
+     total_tests = len(scoring_results)
+     failed_tests = len(failed_cases)
+     passed_tests = total_tests - failed_tests
+
+     # Print summary with colors
+     rprint("\n" + "=" * 80)
+     if failed_tests == 0:
+         rprint(
+             f"[bold green]🎉 ALL TESTS PASSED! {passed_tests}/{total_tests} tests successful[/bold green]"
+         )
+     else:
+         rprint(
+             f"[bold red]⚠️ TEST RESULTS: {passed_tests}/{total_tests} passed ({failed_tests} failed)[/bold red]"
+         )
+     rprint("=" * 80 + "\n")
+
+     # Print individual test cases
+     for i, result in enumerate(scoring_results):
+         test_num = i + 1
+         if result.success:
+             rprint(f"[green]✓ Test {test_num}: PASSED[/green]")
+         else:
+             rprint(f"[red]✗ Test {test_num}: FAILED[/red]")
+             if result.scorers_data:
+                 for scorer_data in result.scorers_data:
+                     if not scorer_data.success:
+                         rprint(f" [yellow]Scorer: {scorer_data.name}[/yellow]")
+                         rprint(f" [red] Score: {scorer_data.score}[/red]")
+                         rprint(f" [red] Reason: {scorer_data.reason}[/red]")
+                         if scorer_data.error:
+                             rprint(f" [red] Error: {scorer_data.error}[/red]")
+                         rprint(" " + "-" * 40)
+
+     rprint("\n" + "=" * 80)
+     if failed_tests > 0:
+         raise AssertionError(failed_cases)
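
As a quick orientation aid, here is a minimal usage sketch of the safe_run_async helper introduced in the hunk above, assuming it is importable from judgeval.evaluation (the module this +346-line hunk adds); the coroutine is a placeholder:

    import asyncio
    from judgeval.evaluation import safe_run_async  # assumed import path

    async def compute() -> int:
        await asyncio.sleep(0)
        return 42

    # Runs the coroutine with asyncio.run() when no loop is active, or in a
    # worker thread when called from inside an already running event loop.
    print(safe_run_async(compute()))  # -> 42
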
judgeval/exceptions.py ADDED
@@ -0,0 +1,28 @@
+ from __future__ import annotations
+
+ from httpx import HTTPError, Response
+ from typing import Optional
+
+
+ class JudgmentAPIError(HTTPError):
+     status_code: int
+     detail: str
+     response: Optional[Response]
+
+     def __init__(self, status_code: int, detail: str, response: Optional[Response]):
+         self.status_code = status_code
+         self.detail = detail
+         self.response = response
+         super().__init__(f"{status_code}: {detail}")
+
+
+ class JudgmentTestError(Exception): ...
+
+
+ class JudgmentRuntimeError(RuntimeError): ...
+
+
+ class InvalidJudgeModelError(Exception): ...
+
+
+ __all__ = ("JudgmentAPIError", "JudgmentRuntimeError", "InvalidJudgeModelError")
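
A minimal sketch of how these new exception types are meant to be consumed by callers; the field names come straight from JudgmentAPIError.__init__ above, while the API call itself is elided:

    from judgeval.exceptions import JudgmentAPIError, JudgmentRuntimeError

    try:
        ...  # some call into the Judgment API
    except JudgmentAPIError as err:
        # Structured fields populated in JudgmentAPIError.__init__
        print(err.status_code, err.detail)
    except JudgmentRuntimeError as err:
        print(f"runtime failure: {err}")
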

judgeval/integrations/langgraph/__init__.py ADDED
@@ -0,0 +1,13 @@
+ from __future__ import annotations
+
+ from abc import ABC
+ import os
+
+
+ class Langgraph(ABC):
+     @staticmethod
+     def initialize(otel_only: bool = True):
+         os.environ["LANGSMITH_OTEL_ENABLED"] = "true"
+         os.environ["LANGSMITH_TRACING"] = "true"
+         if otel_only:
+             os.environ["LANGSMITH_OTEL_ONLY"] = "true"
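
A short sketch of the expected call pattern, assuming the hunk above is the new judgeval/integrations/langgraph/__init__.py (file 18 in the list); initialize() only flips LangSmith's OpenTelemetry environment switches:

    import os
    from judgeval.integrations.langgraph import Langgraph  # assumed module path

    Langgraph.initialize()  # otel_only defaults to True
    assert os.environ["LANGSMITH_OTEL_ENABLED"] == "true"
    assert os.environ["LANGSMITH_TRACING"] == "true"
    assert os.environ["LANGSMITH_OTEL_ONLY"] == "true"
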

judgeval/integrations/openlit/__init__.py ADDED
@@ -0,0 +1,51 @@
+ from abc import ABC
+ from judgeval.tracer import Tracer
+ from judgeval.logger import judgeval_logger
+ from judgeval.utils.url import url_for
+ from judgeval.utils.project import _resolve_project_id
+
+
+ try:
+     import openlit # type: ignore
+ except ImportError:
+     raise ImportError(
+         "Openlit is not installed and required for the openlit integration. Please install it with `pip install openlit`."
+     )
+
+
+ class Openlit(ABC):
+     @staticmethod
+     def initialize(
+         **kwargs,
+     ):
+         tracer = Tracer.get_instance()
+         if not tracer or not tracer._initialized:
+             raise ValueError(
+                 "Openlit must be initialized after the tracer has been initialized. Please create the Tracer instance first before initializing Openlit."
+             )
+
+         api_key = tracer.api_key
+         organization_id = tracer.organization_id
+         project_name = tracer.project_name
+
+         project_id = _resolve_project_id(project_name, api_key, organization_id)
+         if not project_id:
+             judgeval_logger.warning(
+                 f"Project {project_name} not found. Please create it first at https://app.judgmentlabs.ai/org/{organization_id}/projects."
+             )
+             return
+
+         openlit.init(
+             service_name=project_name,
+             otlp_endpoint=url_for("/otel"),
+             otlp_headers={
+                 "Authorization": f"Bearer {api_key}",
+                 "X-Organization-Id": organization_id,
+                 "X-Project-Id": project_id,
+             },
+             tracer=tracer.get_tracer(),
+             **kwargs,
+         )
+
+
+ __all__ = ["Openlit"]

judgeval/judges/__init__.py CHANGED
@@ -1,6 +1,6 @@
  from judgeval.judges.base_judge import JudgevalJudge
  from judgeval.judges.litellm_judge import LiteLLMJudge
  from judgeval.judges.together_judge import TogetherJudge
- from judgeval.judges.mixture_of_judges import MixtureOfJudges

- __all__ = ["JudgevalJudge", "LiteLLMJudge", "TogetherJudge", "MixtureOfJudges"]
+
+ __all__ = ["JudgevalJudge", "LiteLLMJudge", "TogetherJudge"]

judgeval/judges/litellm_judge.py CHANGED
@@ -1,20 +1,77 @@
  import pydantic
- from typing import List, Union, Mapping
+ from typing import Dict, List, Union, Mapping, Any

+ from judgeval.constants import ACCEPTABLE_MODELS
  from judgeval.judges import JudgevalJudge
- from judgeval.common.utils import (
-     afetch_litellm_api_response,
-     fetch_litellm_api_response,
- )
- from judgeval.common.logger import judgeval_logger
+ from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
+
+ try:
+     import litellm
+ except ImportError:
+     raise ImportError(
+         "Litellm is not installed and required for the litellm judge. Please install it with `pip install litellm`."
+     )
+
+
+ def fetch_litellm_api_response(
+     model: str,
+     messages: List[Dict[str, str]],
+     response_format: Union[Dict[str, Any], None] = None,
+ ) -> str:
+     if response_format is not None:
+         response = litellm.completion(
+             model=model,
+             messages=messages,
+             response_format=response_format,
+         )
+     else:
+         response = litellm.completion(
+             model=model,
+             messages=messages,
+         )
+
+     content = response.choices[0].message.content # type: ignore[attr-defined]
+     if content is None:
+         raise ValueError("Received empty response from litellm")
+     return content
+
+
+ async def afetch_litellm_api_response(
+     model: str,
+     messages: List[Dict[str, str]],
+     response_format: Union[Dict[str, Any], None] = None,
+ ) -> str:
+     if not messages:
+         raise ValueError("Messages cannot be empty")
+
+     if model not in ACCEPTABLE_MODELS:
+         raise ValueError(
+             f"Model {model} is not in the list of supported models: {ACCEPTABLE_MODELS}."
+         )
+
+     if response_format is not None:
+         response = await litellm.acompletion(
+             model=model, messages=messages, response_format=response_format
+         )
+     else:
+         response = await litellm.acompletion(
+             model=model,
+             messages=messages,
+         )
+
+     content = response.choices[0].message.content # type: ignore[attr-defined]
+     if content is None:
+         raise ValueError("Received empty response from litellm")
+     return content
+

  BASE_CONVERSATION = [
      {"role": "system", "content": "You are a helpful assistant."},
- ] # for string inputs, we need to add the user query to a base conversation, since LiteLLM only accepts a list of dictionaries as a chat history
+ ]


  class LiteLLMJudge(JudgevalJudge):
-     def __init__(self, model: str = "gpt-4.1-mini", **kwargs):
+     def __init__(self, model: str = JUDGMENT_DEFAULT_GPT_MODEL, **kwargs):
          self.model = model
          self.kwargs = kwargs
          super().__init__(model_name=model)
@@ -22,19 +79,21 @@ class LiteLLMJudge(JudgevalJudge):
      def generate(
          self,
          input: Union[str, List[Mapping[str, str]]],
-         schema: pydantic.BaseModel = None,
+         schema: Union[pydantic.BaseModel, None] = None,
      ) -> str:
+         response_format = schema.model_json_schema() if schema else None
+
          if isinstance(input, str):
              convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
              return fetch_litellm_api_response(
-                 model=self.model, messages=convo, response_format=schema
+                 model=self.model, messages=convo, response_format=response_format
              )
          elif isinstance(input, list):
+             messages = [dict(msg) for msg in input]
              return fetch_litellm_api_response(
-                 model=self.model, messages=input, response_format=schema
+                 model=self.model, messages=messages, response_format=response_format
              )
          else:
-             judgeval_logger.error(f"Invalid input type received: {type(input)}")
              raise TypeError(
                  f"Input must be a string or a list of dictionaries. Input type of: {type(input)}"
              )
@@ -42,21 +101,23 @@ class LiteLLMJudge(JudgevalJudge):
      async def a_generate(
          self,
          input: Union[str, List[Mapping[str, str]]],
-         schema: pydantic.BaseModel = None,
+         schema: Union[pydantic.BaseModel, None] = None,
      ) -> str:
+         response_format = schema.model_json_schema() if schema else None
+
          if isinstance(input, str):
              convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
              response = await afetch_litellm_api_response(
-                 model=self.model, messages=convo, response_format=schema
+                 model=self.model, messages=convo, response_format=response_format
              )
              return response
          elif isinstance(input, list):
+             messages = [dict(msg) for msg in input]
              response = await afetch_litellm_api_response(
-                 model=self.model, messages=input, response_format=schema
+                 model=self.model, messages=messages, response_format=response_format
              )
              return response
          else:
-             judgeval_logger.error(f"Invalid input type received: {type(input)}")
              raise TypeError(
                  f"Input must be a string or a list of dictionaries. Input type of: {type(input)}"
              )
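
Finally, a brief usage sketch of the reworked LiteLLMJudge: a Pydantic schema, if given, is now converted to a JSON-schema response_format before the litellm call. The model name below is just an example of a litellm-supported model:

    from judgeval.judges import LiteLLMJudge

    judge = LiteLLMJudge(model="gpt-4.1-mini")
    answer = judge.generate("Reply with the single word OK.")
    print(answer)

    # The async variant mirrors the sync one:
    # answer = await judge.a_generate("Reply with the single word OK.")
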