judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (234)
  1. judgeval/__init__.py +173 -10
  2. judgeval/api/__init__.py +523 -0
  3. judgeval/api/api_types.py +413 -0
  4. judgeval/cli.py +112 -0
  5. judgeval/constants.py +7 -30
  6. judgeval/data/__init__.py +1 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +14 -40
  9. judgeval/data/judgment_types.py +396 -146
  10. judgeval/data/result.py +11 -18
  11. judgeval/data/scorer_data.py +3 -26
  12. judgeval/data/scripts/openapi_transform.py +5 -5
  13. judgeval/data/trace.py +115 -194
  14. judgeval/dataset/__init__.py +335 -0
  15. judgeval/env.py +55 -0
  16. judgeval/evaluation/__init__.py +346 -0
  17. judgeval/exceptions.py +28 -0
  18. judgeval/integrations/langgraph/__init__.py +13 -0
  19. judgeval/integrations/openlit/__init__.py +51 -0
  20. judgeval/judges/__init__.py +2 -2
  21. judgeval/judges/litellm_judge.py +77 -16
  22. judgeval/judges/together_judge.py +88 -17
  23. judgeval/judges/utils.py +7 -20
  24. judgeval/judgment_attribute_keys.py +55 -0
  25. judgeval/{common/logger.py → logger.py} +24 -8
  26. judgeval/prompt/__init__.py +330 -0
  27. judgeval/scorers/__init__.py +11 -11
  28. judgeval/scorers/agent_scorer.py +15 -19
  29. judgeval/scorers/api_scorer.py +21 -23
  30. judgeval/scorers/base_scorer.py +54 -36
  31. judgeval/scorers/example_scorer.py +1 -3
  32. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
  33. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
  34. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  35. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
  36. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
  37. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
  38. judgeval/scorers/score.py +64 -47
  39. judgeval/scorers/utils.py +2 -107
  40. judgeval/tracer/__init__.py +1111 -2
  41. judgeval/tracer/constants.py +1 -0
  42. judgeval/tracer/exporters/__init__.py +40 -0
  43. judgeval/tracer/exporters/s3.py +119 -0
  44. judgeval/tracer/exporters/store.py +59 -0
  45. judgeval/tracer/exporters/utils.py +32 -0
  46. judgeval/tracer/keys.py +63 -0
  47. judgeval/tracer/llm/__init__.py +7 -0
  48. judgeval/tracer/llm/config.py +78 -0
  49. judgeval/tracer/llm/constants.py +9 -0
  50. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  51. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  52. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  53. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  54. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  55. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  56. judgeval/tracer/llm/llm_google/config.py +6 -0
  57. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  58. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  59. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  60. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  61. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  62. judgeval/tracer/llm/llm_openai/config.py +6 -0
  63. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  64. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  65. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  66. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  67. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  68. judgeval/tracer/llm/llm_together/config.py +6 -0
  69. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  70. judgeval/tracer/llm/providers.py +19 -0
  71. judgeval/tracer/managers.py +167 -0
  72. judgeval/tracer/processors/__init__.py +220 -0
  73. judgeval/tracer/utils.py +19 -0
  74. judgeval/trainer/__init__.py +14 -0
  75. judgeval/trainer/base_trainer.py +122 -0
  76. judgeval/trainer/config.py +123 -0
  77. judgeval/trainer/console.py +144 -0
  78. judgeval/trainer/fireworks_trainer.py +392 -0
  79. judgeval/trainer/trainable_model.py +252 -0
  80. judgeval/trainer/trainer.py +70 -0
  81. judgeval/utils/async_utils.py +39 -0
  82. judgeval/utils/decorators/__init__.py +0 -0
  83. judgeval/utils/decorators/dont_throw.py +37 -0
  84. judgeval/utils/decorators/use_once.py +13 -0
  85. judgeval/utils/file_utils.py +74 -28
  86. judgeval/utils/guards.py +36 -0
  87. judgeval/utils/meta.py +27 -0
  88. judgeval/utils/project.py +15 -0
  89. judgeval/utils/serialize.py +253 -0
  90. judgeval/utils/testing.py +70 -0
  91. judgeval/utils/url.py +10 -0
  92. judgeval/{version_check.py → utils/version_check.py} +5 -3
  93. judgeval/utils/wrappers/README.md +3 -0
  94. judgeval/utils/wrappers/__init__.py +15 -0
  95. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  96. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  97. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  98. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  99. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  100. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  101. judgeval/utils/wrappers/py.typed +0 -0
  102. judgeval/utils/wrappers/utils.py +35 -0
  103. judgeval/v1/__init__.py +88 -0
  104. judgeval/v1/data/__init__.py +7 -0
  105. judgeval/v1/data/example.py +44 -0
  106. judgeval/v1/data/scorer_data.py +42 -0
  107. judgeval/v1/data/scoring_result.py +44 -0
  108. judgeval/v1/datasets/__init__.py +6 -0
  109. judgeval/v1/datasets/dataset.py +214 -0
  110. judgeval/v1/datasets/dataset_factory.py +94 -0
  111. judgeval/v1/evaluation/__init__.py +6 -0
  112. judgeval/v1/evaluation/evaluation.py +182 -0
  113. judgeval/v1/evaluation/evaluation_factory.py +17 -0
  114. judgeval/v1/instrumentation/__init__.py +6 -0
  115. judgeval/v1/instrumentation/llm/__init__.py +7 -0
  116. judgeval/v1/instrumentation/llm/config.py +78 -0
  117. judgeval/v1/instrumentation/llm/constants.py +11 -0
  118. judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
  119. judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
  120. judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
  121. judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
  122. judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
  123. judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
  124. judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
  125. judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
  126. judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
  127. judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
  128. judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
  129. judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
  130. judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
  131. judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
  132. judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
  133. judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
  134. judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
  135. judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
  136. judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
  137. judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
  138. judgeval/v1/instrumentation/llm/providers.py +19 -0
  139. judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
  140. judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
  141. judgeval/v1/integrations/langgraph/__init__.py +13 -0
  142. judgeval/v1/integrations/openlit/__init__.py +47 -0
  143. judgeval/v1/internal/api/__init__.py +525 -0
  144. judgeval/v1/internal/api/api_types.py +413 -0
  145. judgeval/v1/prompts/__init__.py +6 -0
  146. judgeval/v1/prompts/prompt.py +29 -0
  147. judgeval/v1/prompts/prompt_factory.py +189 -0
  148. judgeval/v1/py.typed +0 -0
  149. judgeval/v1/scorers/__init__.py +6 -0
  150. judgeval/v1/scorers/api_scorer.py +82 -0
  151. judgeval/v1/scorers/base_scorer.py +17 -0
  152. judgeval/v1/scorers/built_in/__init__.py +17 -0
  153. judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
  154. judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
  155. judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
  156. judgeval/v1/scorers/built_in/faithfulness.py +28 -0
  157. judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
  158. judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
  159. judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
  160. judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
  161. judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
  162. judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
  163. judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
  164. judgeval/v1/scorers/scorers_factory.py +49 -0
  165. judgeval/v1/tracer/__init__.py +7 -0
  166. judgeval/v1/tracer/base_tracer.py +520 -0
  167. judgeval/v1/tracer/exporters/__init__.py +14 -0
  168. judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
  169. judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
  170. judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
  171. judgeval/v1/tracer/exporters/span_store.py +50 -0
  172. judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
  173. judgeval/v1/tracer/processors/__init__.py +6 -0
  174. judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
  175. judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
  176. judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
  177. judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
  178. judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
  179. judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
  180. judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
  181. judgeval/v1/tracer/tracer.py +67 -0
  182. judgeval/v1/tracer/tracer_factory.py +38 -0
  183. judgeval/v1/trainers/__init__.py +5 -0
  184. judgeval/v1/trainers/base_trainer.py +62 -0
  185. judgeval/v1/trainers/config.py +123 -0
  186. judgeval/v1/trainers/console.py +144 -0
  187. judgeval/v1/trainers/fireworks_trainer.py +392 -0
  188. judgeval/v1/trainers/trainable_model.py +252 -0
  189. judgeval/v1/trainers/trainers_factory.py +37 -0
  190. judgeval/v1/utils.py +18 -0
  191. judgeval/version.py +5 -0
  192. judgeval/warnings.py +4 -0
  193. judgeval-0.23.0.dist-info/METADATA +266 -0
  194. judgeval-0.23.0.dist-info/RECORD +201 -0
  195. judgeval-0.23.0.dist-info/entry_points.txt +2 -0
  196. judgeval/clients.py +0 -34
  197. judgeval/common/__init__.py +0 -13
  198. judgeval/common/api/__init__.py +0 -3
  199. judgeval/common/api/api.py +0 -352
  200. judgeval/common/api/constants.py +0 -165
  201. judgeval/common/exceptions.py +0 -27
  202. judgeval/common/storage/__init__.py +0 -6
  203. judgeval/common/storage/s3_storage.py +0 -98
  204. judgeval/common/tracer/__init__.py +0 -31
  205. judgeval/common/tracer/constants.py +0 -22
  206. judgeval/common/tracer/core.py +0 -1916
  207. judgeval/common/tracer/otel_exporter.py +0 -108
  208. judgeval/common/tracer/otel_span_processor.py +0 -234
  209. judgeval/common/tracer/span_processor.py +0 -37
  210. judgeval/common/tracer/span_transformer.py +0 -211
  211. judgeval/common/tracer/trace_manager.py +0 -92
  212. judgeval/common/utils.py +0 -940
  213. judgeval/data/datasets/__init__.py +0 -4
  214. judgeval/data/datasets/dataset.py +0 -341
  215. judgeval/data/datasets/eval_dataset_client.py +0 -214
  216. judgeval/data/tool.py +0 -5
  217. judgeval/data/trace_run.py +0 -37
  218. judgeval/evaluation_run.py +0 -75
  219. judgeval/integrations/langgraph.py +0 -843
  220. judgeval/judges/mixture_of_judges.py +0 -286
  221. judgeval/judgment_client.py +0 -369
  222. judgeval/rules.py +0 -521
  223. judgeval/run_evaluation.py +0 -684
  224. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
  225. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
  226. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
  227. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
  228. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
  229. judgeval/utils/alerts.py +0 -93
  230. judgeval/utils/requests.py +0 -50
  231. judgeval-0.1.0.dist-info/METADATA +0 -202
  232. judgeval-0.1.0.dist-info/RECORD +0 -73
  233. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
  234. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/run_evaluation.py
@@ -1,684 +0,0 @@
- import asyncio
- import concurrent.futures
- import time
- import json
- import sys
- import threading
- from typing import List, Dict, Union, Optional, Callable, Tuple, Any
- from rich import print as rprint
-
- from judgeval.data import ScorerData, ScoringResult, Example, Trace
- from judgeval.scorers import BaseScorer, APIScorerConfig
- from judgeval.scorers.score import a_execute_scoring
- from judgeval.common.api import JudgmentApiClient
- from judgeval.constants import (
-     MAX_CONCURRENT_EVALUATIONS,
- )
- from judgeval.common.exceptions import JudgmentAPIError
- from judgeval.common.api.api import JudgmentAPIException
- from judgeval.common.logger import judgeval_logger
- from judgeval.evaluation_run import EvaluationRun
- from judgeval.data.trace_run import TraceRun
- from judgeval.common.tracer import Tracer
- from langchain_core.callbacks import BaseCallbackHandler
-
-
- def safe_run_async(coro):
-     """
-     Safely run an async coroutine whether or not there's already an event loop running.
-
-     Args:
-         coro: The coroutine to run
-
-     Returns:
-         The result of the coroutine
-     """
-     try:
-         # Try to get the running loop
-         asyncio.get_running_loop()
-         # If we get here, there's already a loop running
-         # Run in a separate thread to avoid "asyncio.run() cannot be called from a running event loop"
-         with concurrent.futures.ThreadPoolExecutor() as executor:
-             future = executor.submit(asyncio.run, coro)
-             return future.result()
-     except RuntimeError:
-         # No event loop is running, safe to use asyncio.run()
-         return asyncio.run(coro)
-
-
- def send_to_rabbitmq(evaluation_run: EvaluationRun) -> Dict[str, Any]:
-     """
-     Sends an evaluation run to the RabbitMQ evaluation queue.
-     """
-     if not evaluation_run.judgment_api_key or not evaluation_run.organization_id:
-         raise ValueError("API key and organization ID are required")
-     if not evaluation_run.eval_name or not evaluation_run.project_name:
-         raise ValueError("Eval name and project name are required")
-     api_client = JudgmentApiClient(
-         evaluation_run.judgment_api_key, evaluation_run.organization_id
-     )
-     return api_client.add_to_evaluation_queue(
-         evaluation_run.eval_name, evaluation_run.project_name
-     )
-
-
- def execute_api_eval(evaluation_run: EvaluationRun) -> Dict:
-     """
-     Executes an evaluation of a list of `Example`s using one or more `JudgmentScorer`s via the Judgment API.
-
-     Args:
-         evaluation_run (EvaluationRun): The evaluation run object containing the examples, scorers, and metadata
-
-     Returns:
-         List[Dict]: The results of the evaluation. Each result is a dictionary containing the fields of a `ScoringResult`
-         object.
-     """
-
-     try:
-         # submit API request to execute evals
-         if not evaluation_run.judgment_api_key or not evaluation_run.organization_id:
-             raise ValueError("API key and organization ID are required")
-         api_client = JudgmentApiClient(
-             evaluation_run.judgment_api_key, evaluation_run.organization_id
-         )
-         return api_client.run_evaluation(evaluation_run.model_dump())
-     except Exception as e:
-         judgeval_logger.error(f"Error: {e}")
-
-         details = "No details provided"
-         if isinstance(e, JudgmentAPIException):
-             details = e.response_json.get("detail", "No details provided")
-
-         raise JudgmentAPIError(
-             "An error occurred while executing the Judgment API request: " + details
-         )
-
-
- def execute_api_trace_eval(trace_run: TraceRun, judgment_api_key: str) -> Dict:
-     """
-     Executes an evaluation of a list of `Trace`s using one or more `JudgmentScorer`s via the Judgment API.
-     """
-
-     try:
-         # submit API request to execute evals
-         if not judgment_api_key or not trace_run.organization_id:
-             raise ValueError("API key and organization ID are required")
-         api_client = JudgmentApiClient(judgment_api_key, trace_run.organization_id)
-         return api_client.run_trace_evaluation(trace_run.model_dump(warnings=False))
-     except Exception as e:
-         judgeval_logger.error(f"Error: {e}")
-
-         details = "An unknown error occurred."
-         if isinstance(e, JudgmentAPIException):
-             details = e.response_json.get("detail", "An unknown error occurred.")
-
-         raise JudgmentAPIError(
-             "An error occurred while executing the Judgment API request: " + details
-         )
-
-
- def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResult]:
-     """
-     Checks if any `ScoringResult` objects are missing `scorers_data`.
-
-     If any are missing, logs an error and returns the results.
-     """
-     for i, result in enumerate(results):
-         if not result.scorers_data:
-             judgeval_logger.error(
-                 f"Scorer data is missing for example {i}. "
-                 "This is usually caused when the example does not contain "
-                 "the fields required by the scorer. "
-                 "Check that your example contains the fields required by the scorers. "
-                 "TODO add docs link here for reference."
-             )
-     return results
-
-
- def check_experiment_type(
-     eval_name: str,
-     project_name: str,
-     judgment_api_key: str,
-     organization_id: str,
-     is_trace: bool,
- ) -> None:
-     """
-     Checks if the current experiment, if one exists, has the same type (examples of traces)
-     """
-     api_client = JudgmentApiClient(judgment_api_key, organization_id)
-
-     try:
-         api_client.check_experiment_type(eval_name, project_name, is_trace)
-     except JudgmentAPIException as e:
-         if e.response.status_code == 422:
-             judgeval_logger.error(f"{e.response_json}")
-             raise ValueError(f"{e.response_json}")
-         else:
-             raise e
-     except Exception as e:
-         judgeval_logger.error(f"Failed to check if experiment type exists: {str(e)}")
-         raise JudgmentAPIError(f"Failed to check if experiment type exists: {str(e)}")
-
-
- def check_eval_run_name_exists(
-     eval_name: str, project_name: str, judgment_api_key: str, organization_id: str
- ) -> None:
-     """
-     Checks if an evaluation run name already exists for a given project.
-
-     Args:
-         eval_name (str): Name of the evaluation run
-         project_name (str): Name of the project
-         judgment_api_key (str): API key for authentication
-
-     Raises:
-         ValueError: If the evaluation run name already exists
-         JudgmentAPIError: If there's an API error during the check
-     """
-     api_client = JudgmentApiClient(judgment_api_key, organization_id)
-     try:
-         api_client.check_eval_run_name_exists(eval_name, project_name)
-     except JudgmentAPIException as e:
-         if e.response.status_code == 409:
-             error_str = f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true. See https://docs.judgmentlabs.ai/sdk-reference/judgment-client#override for more information."
-             judgeval_logger.error(error_str)
-             raise ValueError(error_str)
-         else:
-             raise e
-
-     except Exception as e:
-         judgeval_logger.error(f"Failed to check if eval run name exists: {str(e)}")
-         raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
-
-
- def log_evaluation_results(
-     scoring_results: List[ScoringResult],
-     run: Union[EvaluationRun, TraceRun],
-     judgment_api_key: str,
- ) -> str:
-     """
-     Logs evaluation results to the Judgment API database.
-
-     Args:
-         merged_results (List[ScoringResult]): The results to log
-         evaluation_run (EvaluationRun): The evaluation run containing project info and API key
-         judgment_api_key (str): The API key for the Judgment API
-
-     Raises:
-         JudgmentAPIError: If there's an API error during logging
-         ValueError: If there's a validation error with the results
-     """
-     try:
-         if not judgment_api_key or not run.organization_id:
-             raise ValueError("API key and organization ID are required")
-
-         api_client = JudgmentApiClient(judgment_api_key, run.organization_id)
-         response = api_client.log_evaluation_results(
-             scoring_results,
-             run.model_dump(warnings=False),
-         )
-         url = response.get("ui_results_url")
-         return url
-
-     except Exception as e:
-         judgeval_logger.error(f"Failed to save evaluation results to DB: {str(e)}")
-         raise JudgmentAPIError(
-             f"Request failed while saving evaluation results to DB: {str(e)}"
-         )
-
-
- def check_examples(
-     examples: List[Example], scorers: List[Union[APIScorerConfig, BaseScorer]]
- ) -> None:
-     """
-     Checks if the example contains the necessary parameters for the scorer.
-     """
-     prompt_user = False
-     for scorer in scorers:
-         for example in examples:
-             missing_params = []
-             for param in scorer.required_params:
-                 if getattr(example, param.value) is None:
-                     missing_params.append(f"{param.value}")
-             if missing_params:
-                 rprint(
-                     f"[yellow]āš ļø WARNING:[/yellow] Example is missing required parameters for scorer [bold]{scorer.score_type.value}[/bold]"
-                 )
-                 rprint(f"Missing parameters: {', '.join(missing_params)}")
-                 rprint(f"Example: {json.dumps(example.model_dump(), indent=2)}")
-                 rprint("-" * 40)
-                 prompt_user = True
-
-     if prompt_user:
-         user_input = input("Do you want to continue? (y/n)")
-         if user_input.lower() != "y":
-             sys.exit(0)
-         else:
-             rprint("[green]Continuing...[/green]")
-
-
- def run_trace_eval(
-     trace_run: TraceRun,
-     judgment_api_key: str,
-     override: bool = False,
-     function: Optional[Callable] = None,
-     tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
-     examples: Optional[List[Example]] = None,
- ) -> List[ScoringResult]:
-     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-     if not override and not trace_run.append:
-         check_eval_run_name_exists(
-             trace_run.eval_name,
-             trace_run.project_name,
-             judgment_api_key,
-             trace_run.organization_id,
-         )
-
-     if trace_run.append:
-         # Check that the current experiment, if one exists, has the same type (examples or traces)
-         check_experiment_type(
-             trace_run.eval_name,
-             trace_run.project_name,
-             judgment_api_key,
-             trace_run.organization_id,
-             True,
-         )
-     if function and tracer and examples is not None:
-         new_traces: List[Trace] = []
-
-         # Handle case where tracer is actually a callback handler
-         actual_tracer = tracer
-         if hasattr(tracer, "tracer") and hasattr(tracer.tracer, "traces"):
-             # This is a callback handler, get the underlying tracer
-             actual_tracer = tracer.tracer
-
-         actual_tracer.offline_mode = True
-         actual_tracer.traces = []
-         judgeval_logger.info("Running agent function: ")
-         for example in examples:
-             if example.input:
-                 if isinstance(example.input, str):
-                     function(example.input)
-                 elif isinstance(example.input, dict):
-                     function(**example.input)
-                 else:
-                     raise ValueError(
-                         f"Input must be string or dict, got {type(example.input)}"
-                     )
-             else:
-                 function()
-
-         for i, trace in enumerate(actual_tracer.traces):
-             # We set the root-level trace span with the expected tools of the Trace
-             trace = Trace(**trace)
-             trace.trace_spans[0].expected_tools = examples[i].expected_tools
-             new_traces.append(trace)
-         trace_run.traces = new_traces
-         actual_tracer.traces = []
-
-     # Execute evaluation using Judgment API
-     try: # execute an EvaluationRun with just JudgmentScorers
-         judgeval_logger.info("Executing Trace Evaluation... ")
-         response_data: Dict = execute_api_trace_eval(trace_run, judgment_api_key)
-         scoring_results = [
-             ScoringResult(**result) for result in response_data["results"]
-         ]
-     except JudgmentAPIError as e:
-         raise JudgmentAPIError(
-             f"An error occurred while executing the Judgment API request: {str(e)}"
-         )
-     except ValueError as e:
-         raise ValueError(
-             f"Please check your TraceRun object, one or more fields are invalid: {str(e)}"
-         )
-
-     # Convert the response data to `ScoringResult` objects
-     # TODO: allow for custom scorer on traces
-
-     url = log_evaluation_results(
-         response_data["agent_results"], trace_run, judgment_api_key
-     )
-     rprint(
-         f"\nšŸ” You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
-     )
-     return scoring_results
-
-
- async def get_evaluation_status(
-     eval_name: str, project_name: str, judgment_api_key: str, organization_id: str
- ) -> Dict:
-     """
-     Gets the status of an async evaluation run.
-
-     Args:
-         eval_name (str): Name of the evaluation run
-         project_name (str): Name of the project
-         judgment_api_key (str): API key for authentication
-         organization_id (str): Organization ID for the evaluation
-
-     Returns:
-         Dict: Status information including:
-             - status: 'pending', 'running', 'completed', or 'failed'
-             - results: List of ScoringResult objects if completed
-             - error: Error message if failed
-     """
-     api_client = JudgmentApiClient(judgment_api_key, organization_id)
-     try:
-         return api_client.get_evaluation_status(eval_name, project_name)
-     except Exception as e:
-         raise JudgmentAPIError(
-             f"An error occurred while checking evaluation status: {str(e)}"
-         )
-
-
- def retrieve_counts(result: Dict):
-     scorer_data_count = 0
-     for example in result.get("examples", []):
-         for scorer in example.get("scorer_data", []):
-             scorer_data_count += 1
-     return scorer_data_count
-
-
- def _poll_evaluation_until_complete(
-     eval_name: str,
-     project_name: str,
-     judgment_api_key: str,
-     organization_id: str,
-     expected_scorer_data_count: int,
-     poll_interval_seconds: float = 5,
-     max_failures: int = 5,
-     max_poll_count: int = 24, # This should be equivalent to 120 seconds
- ) -> Tuple[List[ScoringResult], str]:
-     """
-     Polls until the evaluation is complete and returns the results.
-
-     Args:
-         eval_name (str): Name of the evaluation run
-         project_name (str): Name of the project
-         judgment_api_key (str): API key for authentication
-         organization_id (str): Organization ID for the evaluation
-         poll_interval_seconds (int, optional): Time between status checks in seconds. Defaults to 5.
-         original_examples (List[Example], optional): The original examples sent for evaluation.
-             If provided, will match results with original examples.
-
-     Returns:
-         List[ScoringResult]: The evaluation results
-     """
-     poll_count = 0
-     exception_count = 0
-     api_client = JudgmentApiClient(judgment_api_key, organization_id)
-     while poll_count < max_poll_count:
-         poll_count += 1
-         try:
-             # Check status
-             status_response = api_client.get_evaluation_status(eval_name, project_name)
-
-             if status_response.get("status") != "completed":
-                 time.sleep(poll_interval_seconds)
-                 continue
-
-             results_response = api_client.fetch_evaluation_results(
-                 project_name, eval_name
-             )
-             url = results_response.get("ui_results_url")
-
-             if results_response.get("examples") is None:
-                 time.sleep(poll_interval_seconds)
-                 continue
-
-             examples_data = results_response.get("examples", [])
-             scoring_results = []
-             scorer_data_count = 0
-
-             for example_data in examples_data:
-                 scorer_data_list = []
-                 for raw_scorer_data in example_data.get("scorer_data", []):
-                     scorer_data = ScorerData(**raw_scorer_data)
-                     scorer_data_list.append(scorer_data)
-                     scorer_data_count += 1
-
-                 example = Example(**example_data)
-
-                 success = all(scorer_data.success for scorer_data in scorer_data_list)
-                 scoring_result = ScoringResult(
-                     success=success,
-                     scorers_data=scorer_data_list,
-                     data_object=example,
-                 )
-                 scoring_results.append(scoring_result)
-
-             if scorer_data_count != expected_scorer_data_count:
-                 time.sleep(poll_interval_seconds)
-                 continue
-
-             return scoring_results, url
-         except Exception as e:
-             exception_count += 1
-             if isinstance(e, JudgmentAPIError):
-                 raise
-
-             judgeval_logger.error(f"Error checking evaluation status: {str(e)}")
-             if exception_count > max_failures:
-                 raise JudgmentAPIError(
-                     f"Error checking evaluation status after {poll_count} attempts: {str(e)}"
-                 )
-
-             time.sleep(poll_interval_seconds)
-
-     raise JudgmentAPIError(
-         f"Error checking evaluation status after {poll_count} attempts"
-     )
-
-
- def progress_logger(stop_event, msg="Working...", interval=5):
-     start = time.time()
-     while not stop_event.is_set():
-         elapsed = int(time.time() - start)
-         judgeval_logger.info(f"{msg} ({elapsed} sec)")
-         stop_event.wait(interval)
-
-
- def run_eval(
-     evaluation_run: EvaluationRun,
-     judgment_api_key: str,
-     override: bool = False,
- ) -> List[ScoringResult]:
-     """
-     Executes an evaluation of `Example`s using one or more `Scorer`s
-
-     Args:
-         evaluation_run (EvaluationRun): Stores example and evaluation together for running
-         override (bool, optional): Whether to override existing evaluation run with same name. Defaults to False.
-
-     Returns:
-         List[ScoringResult]: A list of ScoringResult objects
-     """
-
-     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-     if not override and not evaluation_run.append:
-         check_eval_run_name_exists(
-             evaluation_run.eval_name,
-             evaluation_run.project_name,
-             judgment_api_key,
-             evaluation_run.organization_id,
-         )
-
-     if evaluation_run.append:
-         # Check that the current experiment, if one exists, has the same type (examples of traces)
-         check_experiment_type(
-             evaluation_run.eval_name,
-             evaluation_run.project_name,
-             judgment_api_key,
-             evaluation_run.organization_id,
-             False,
-         )
-
-     # Set example IDs if not already set
-     for idx, example in enumerate(evaluation_run.examples):
-         example.example_index = idx # Set numeric index
-
-     judgment_scorers: List[APIScorerConfig] = []
-     local_scorers: List[BaseScorer] = []
-     for scorer in evaluation_run.scorers:
-         if isinstance(scorer, APIScorerConfig):
-             judgment_scorers.append(scorer)
-         else:
-             local_scorers.append(scorer)
-
-     results: List[ScoringResult] = []
-     url = ""
-
-     if len(local_scorers) > 0 and len(judgment_scorers) > 0:
-         error_msg = "We currently do not support running both local and Judgment API scorers at the same time. Please run your evaluation with either local scorers or Judgment API scorers, but not both."
-         judgeval_logger.error(error_msg)
-         raise ValueError(error_msg)
-
-     if len(judgment_scorers) > 0:
-         check_examples(evaluation_run.examples, judgment_scorers)
-         stop_event = threading.Event()
-         t = threading.Thread(
-             target=progress_logger, args=(stop_event, "Running evaluation...")
-         )
-         t.start()
-         try:
-             api_client = JudgmentApiClient(
-                 judgment_api_key, evaluation_run.organization_id
-             )
-             response = api_client.add_to_evaluation_queue(
-                 evaluation_run.model_dump(warnings=False)
-             )
-
-             if not response.get("success", False):
-                 error_message = response.error
-                 judgeval_logger.error(
-                     f"Error adding evaluation to queue: {error_message}"
-                 )
-                 raise JudgmentAPIError(error_message)
-
-             old_scorer_data_count = 0
-             if evaluation_run.append:
-                 try:
-                     results_response = api_client.fetch_evaluation_results(
-                         evaluation_run.project_name, evaluation_run.eval_name
-                     )
-                     old_scorer_data_count = retrieve_counts(results_response)
-                 except Exception:
-                     # This usually means the user did append = True but the eval run name doesn't exist yet
-                     pass
-
-             results, url = _poll_evaluation_until_complete(
-                 eval_name=evaluation_run.eval_name,
-                 project_name=evaluation_run.project_name,
-                 judgment_api_key=judgment_api_key,
-                 organization_id=evaluation_run.organization_id,
-                 expected_scorer_data_count=(
-                     len(evaluation_run.scorers) * len(evaluation_run.examples)
-                 )
-                 + old_scorer_data_count,
-             )
-         finally:
-             stop_event.set()
-             t.join()
-
-     if len(local_scorers) > 0:
-         results = safe_run_async(
-             a_execute_scoring(
-                 evaluation_run.examples,
-                 local_scorers,
-                 model=evaluation_run.model,
-                 throttle_value=0,
-                 max_concurrent=MAX_CONCURRENT_EVALUATIONS,
-             )
-         )
-
-         send_results = [
-             scoring_result.model_dump(warnings=False) for scoring_result in results
-         ]
-
-         url = log_evaluation_results(send_results, evaluation_run, judgment_api_key)
-     rprint(
-         f"\nšŸ” You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
-     )
-     return results
-
-
- def assert_test(scoring_results: List[ScoringResult]) -> None:
-     """
-     Collects all failed scorers from the scoring results.
-
-     Args:
-         ScoringResults (List[ScoringResult]): List of scoring results to check
-
-     Returns:
-         None. Raises exceptions for any failed test cases.
-     """
-     failed_cases: List[ScorerData] = []
-
-     for result in scoring_results:
-         if not result.success:
-             # Create a test case context with all relevant fields
-             test_case: Dict = {"failed_scorers": []}
-             if result.scorers_data:
-                 # If the result was not successful, check each scorer_data
-                 for scorer_data in result.scorers_data:
-                     if not scorer_data.success:
-                         if scorer_data.name == "Tool Order":
-                             # Remove threshold, evaluation model for Tool Order scorer
-                             scorer_data.threshold = None
-                             scorer_data.evaluation_model = None
-                         test_case["failed_scorers"].append(scorer_data)
-             failed_cases.append(test_case)
-
-     if failed_cases:
-         error_msg = "The following test cases failed: \n"
-         for fail_case in failed_cases:
-             for fail_scorer in fail_case["failed_scorers"]:
-                 error_msg += (
-                     f"\nScorer Name: {fail_scorer.name}\n"
-                     f"Threshold: {fail_scorer.threshold}\n"
-                     f"Success: {fail_scorer.success}\n"
-                     f"Score: {fail_scorer.score}\n"
-                     f"Reason: {fail_scorer.reason}\n"
-                     f"Strict Mode: {fail_scorer.strict_mode}\n"
-                     f"Evaluation Model: {fail_scorer.evaluation_model}\n"
-                     f"Error: {fail_scorer.error}\n"
-                     f"Additional Metadata: {fail_scorer.additional_metadata}\n"
-                 )
-             error_msg += "-" * 100
-
-     total_tests = len(scoring_results)
-     failed_tests = len(failed_cases)
-     passed_tests = total_tests - failed_tests
-
-     # Print summary with colors
-     rprint("\n" + "=" * 80)
-     if failed_tests == 0:
-         rprint(
-             f"[bold green]šŸŽ‰ ALL TESTS PASSED! {passed_tests}/{total_tests} tests successful[/bold green]"
-         )
-     else:
-         rprint(
-             f"[bold red]āš ļø TEST RESULTS: {passed_tests}/{total_tests} passed ({failed_tests} failed)[/bold red]"
-         )
-     rprint("=" * 80 + "\n")
-
-     # Print individual test cases
-     for i, result in enumerate(scoring_results):
-         test_num = i + 1
-         if result.success:
-             rprint(f"[green]āœ“ Test {test_num}: PASSED[/green]")
-         else:
-             rprint(f"[red]āœ— Test {test_num}: FAILED[/red]")
-             if result.scorers_data:
-                 for scorer_data in result.scorers_data:
-                     if not scorer_data.success:
-                         rprint(f" [yellow]Scorer: {scorer_data.name}[/yellow]")
-                         rprint(f" [red] Score: {scorer_data.score}[/red]")
-                         rprint(f" [red] Reason: {scorer_data.reason}[/red]")
-                         if scorer_data.error:
-                             rprint(f" [red] Error: {scorer_data.error}[/red]")
-                         rprint(" " + "-" * 40)
-
-     rprint("\n" + "=" * 80)
-     if failed_tests > 0:
-         raise AssertionError(failed_cases)
judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py
@@ -1,14 +0,0 @@
- """
- `judgeval` answer relevancy scorer
-
- TODO add link to docs page for this scorer
-
- """
-
- # Internal imports
- from judgeval.scorers.api_scorer import APIScorerConfig
- from judgeval.constants import APIScorerType
-
-
- class DerailmentScorer(APIScorerConfig):
-     score_type: APIScorerType = APIScorerType.DERAILMENT
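
Note: the removed judgeval/run_evaluation.py ran local scorers through a small "run a coroutine from anywhere" helper, safe_run_async, which works whether or not an event loop is already running in the current thread. The following is a minimal, standalone sketch of that pattern, adapted from the deleted code shown above; it uses only the standard library, and the demo coroutine and __main__ usage are illustrative, not part of judgeval.

import asyncio
import concurrent.futures


def safe_run_async(coro):
    """Run `coro` to completion, even if an event loop is already running."""
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread, so asyncio.run() is safe.
        return asyncio.run(coro)
    # A loop is already running; asyncio.run() would raise, so execute the
    # coroutine on a fresh loop in a worker thread and block on its result.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        return executor.submit(asyncio.run, coro).result()


async def _demo():  # illustrative coroutine, not part of judgeval
    await asyncio.sleep(0.1)
    return "done"


if __name__ == "__main__":
    print(safe_run_async(_demo()))  # works from sync code or inside a running loop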