judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl

This diff shows the content of publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.

Potentially problematic release.


Files changed (171)
  1. judgeval/__init__.py +177 -12
  2. judgeval/api/__init__.py +519 -0
  3. judgeval/api/api_types.py +407 -0
  4. judgeval/cli.py +79 -0
  5. judgeval/constants.py +76 -47
  6. judgeval/data/__init__.py +3 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +15 -56
  9. judgeval/data/judgment_types.py +450 -0
  10. judgeval/data/result.py +29 -73
  11. judgeval/data/scorer_data.py +29 -62
  12. judgeval/data/scripts/fix_default_factory.py +23 -0
  13. judgeval/data/scripts/openapi_transform.py +123 -0
  14. judgeval/data/trace.py +121 -0
  15. judgeval/dataset/__init__.py +264 -0
  16. judgeval/env.py +52 -0
  17. judgeval/evaluation/__init__.py +344 -0
  18. judgeval/exceptions.py +27 -0
  19. judgeval/integrations/langgraph/__init__.py +13 -0
  20. judgeval/integrations/openlit/__init__.py +50 -0
  21. judgeval/judges/__init__.py +2 -3
  22. judgeval/judges/base_judge.py +2 -3
  23. judgeval/judges/litellm_judge.py +100 -20
  24. judgeval/judges/together_judge.py +101 -20
  25. judgeval/judges/utils.py +20 -24
  26. judgeval/logger.py +62 -0
  27. judgeval/prompt/__init__.py +330 -0
  28. judgeval/scorers/__init__.py +18 -25
  29. judgeval/scorers/agent_scorer.py +17 -0
  30. judgeval/scorers/api_scorer.py +45 -41
  31. judgeval/scorers/base_scorer.py +83 -38
  32. judgeval/scorers/example_scorer.py +17 -0
  33. judgeval/scorers/exceptions.py +1 -0
  34. judgeval/scorers/judgeval_scorers/__init__.py +0 -148
  35. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
  36. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
  37. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
  38. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
  39. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
  40. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
  41. judgeval/scorers/score.py +77 -306
  42. judgeval/scorers/utils.py +4 -199
  43. judgeval/tracer/__init__.py +1122 -2
  44. judgeval/tracer/constants.py +1 -0
  45. judgeval/tracer/exporters/__init__.py +40 -0
  46. judgeval/tracer/exporters/s3.py +119 -0
  47. judgeval/tracer/exporters/store.py +59 -0
  48. judgeval/tracer/exporters/utils.py +32 -0
  49. judgeval/tracer/keys.py +63 -0
  50. judgeval/tracer/llm/__init__.py +7 -0
  51. judgeval/tracer/llm/config.py +78 -0
  52. judgeval/tracer/llm/constants.py +9 -0
  53. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  54. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  55. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  56. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  57. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  58. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  59. judgeval/tracer/llm/llm_google/config.py +6 -0
  60. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  61. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  62. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  63. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  64. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  65. judgeval/tracer/llm/llm_openai/config.py +6 -0
  66. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  67. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  68. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  69. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  70. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  71. judgeval/tracer/llm/llm_together/config.py +6 -0
  72. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  73. judgeval/tracer/llm/providers.py +19 -0
  74. judgeval/tracer/managers.py +167 -0
  75. judgeval/tracer/processors/__init__.py +220 -0
  76. judgeval/tracer/utils.py +19 -0
  77. judgeval/trainer/__init__.py +14 -0
  78. judgeval/trainer/base_trainer.py +122 -0
  79. judgeval/trainer/config.py +128 -0
  80. judgeval/trainer/console.py +144 -0
  81. judgeval/trainer/fireworks_trainer.py +396 -0
  82. judgeval/trainer/trainable_model.py +243 -0
  83. judgeval/trainer/trainer.py +70 -0
  84. judgeval/utils/async_utils.py +39 -0
  85. judgeval/utils/decorators/__init__.py +0 -0
  86. judgeval/utils/decorators/dont_throw.py +37 -0
  87. judgeval/utils/decorators/use_once.py +13 -0
  88. judgeval/utils/file_utils.py +97 -0
  89. judgeval/utils/guards.py +36 -0
  90. judgeval/utils/meta.py +27 -0
  91. judgeval/utils/project.py +15 -0
  92. judgeval/utils/serialize.py +253 -0
  93. judgeval/utils/testing.py +70 -0
  94. judgeval/utils/url.py +10 -0
  95. judgeval/utils/version_check.py +28 -0
  96. judgeval/utils/wrappers/README.md +3 -0
  97. judgeval/utils/wrappers/__init__.py +15 -0
  98. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  99. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  100. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  101. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  102. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  103. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  104. judgeval/utils/wrappers/py.typed +0 -0
  105. judgeval/utils/wrappers/utils.py +35 -0
  106. judgeval/version.py +5 -0
  107. judgeval/warnings.py +4 -0
  108. judgeval-0.22.2.dist-info/METADATA +265 -0
  109. judgeval-0.22.2.dist-info/RECORD +112 -0
  110. judgeval-0.22.2.dist-info/entry_points.txt +2 -0
  111. judgeval/clients.py +0 -39
  112. judgeval/common/__init__.py +0 -8
  113. judgeval/common/exceptions.py +0 -28
  114. judgeval/common/logger.py +0 -189
  115. judgeval/common/tracer.py +0 -798
  116. judgeval/common/utils.py +0 -763
  117. judgeval/data/api_example.py +0 -111
  118. judgeval/data/datasets/__init__.py +0 -5
  119. judgeval/data/datasets/dataset.py +0 -286
  120. judgeval/data/datasets/eval_dataset_client.py +0 -193
  121. judgeval/data/datasets/ground_truth.py +0 -54
  122. judgeval/data/datasets/utils.py +0 -74
  123. judgeval/evaluation_run.py +0 -132
  124. judgeval/judges/mixture_of_judges.py +0 -248
  125. judgeval/judgment_client.py +0 -354
  126. judgeval/run_evaluation.py +0 -439
  127. judgeval/scorers/judgeval_scorer.py +0 -140
  128. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
  129. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
  130. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
  131. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
  132. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
  133. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
  134. judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
  135. judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
  136. judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
  137. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
  138. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
  139. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
  140. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
  141. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
  142. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
  143. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
  144. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
  145. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
  146. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
  147. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
  148. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
  149. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
  150. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
  151. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
  152. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
  153. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
  154. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
  155. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
  156. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
  157. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
  158. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
  159. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
  160. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
  161. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
  162. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
  163. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
  164. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
  165. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
  166. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
  167. judgeval/scorers/prompt_scorer.py +0 -439
  168. judgeval-0.0.11.dist-info/METADATA +0 -36
  169. judgeval-0.0.11.dist-info/RECORD +0 -84
  170. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
  171. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0

judgeval/run_evaluation.py
@@ -1,439 +0,0 @@
- import asyncio
- import requests
- from typing import List, Dict
- from datetime import datetime
- from rich import print as rprint
-
- from judgeval.data import (
-     Example,
-     ScorerData,
-     ScoringResult
- )
- from judgeval.scorers import (
-     JudgevalScorer,
-     APIJudgmentScorer,
-     ClassifierScorer
- )
- from judgeval.scorers.score import a_execute_scoring
-
- from judgeval.constants import (
-     ROOT_API,
-     JUDGMENT_EVAL_API_URL,
-     JUDGMENT_EVAL_LOG_API_URL,
- )
- from judgeval.common.exceptions import JudgmentAPIError
- from judgeval.evaluation_run import EvaluationRun
- from judgeval.common.logger import (
-     enable_logging,
-     debug,
-     info,
-     error,
-     example_logging_context
- )
-
-
- def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
-     """
-     Executes an evaluation of a list of `Example`s using one or more `JudgmentScorer`s via the Judgment API.
-
-     Args:
-         evaluation_run (EvaluationRun): The evaluation run object containing the examples, scorers, and metadata
-
-     Returns:
-         List[Dict]: The results of the evaluation. Each result is a dictionary containing the fields of a `ScoringResult`
-         object.
-     """
-
-     try:
-         # submit API request to execute evals
-         payload = evaluation_run.model_dump(warnings=False)
-         response = requests.post(JUDGMENT_EVAL_API_URL, json=payload)
-         response_data = response.json()
-     except Exception as e:
-         error(f"Error: {e}")
-         details = response.json().get("detail", "No details provided")
-         raise JudgmentAPIError("An error occurred while executing the Judgment API request: " + details)
-     # Check if the response status code is not 2XX
-     # Add check for the duplicate eval run name
-     if not response.ok:
-         error_message = response_data.get('detail', 'An unknown error occurred.')
-         error(f"Error: {error_message=}")
-         raise JudgmentAPIError(error_message)
-     return response_data
-
-
- def merge_results(api_results: List[ScoringResult], local_results: List[ScoringResult]) -> List[ScoringResult]:
-     """
-     When executing scorers that come from both the Judgment API and local scorers, we're left with
-     results for each type of scorer. This function merges the results from the API and local evaluations,
-     grouped by example. In particular, we merge the `scorers_data` field of each `ScoringResult` object.
-
-     Args:
-         api_results (List[ScoringResult]): The `ScoringResult`s from the API evaluation
-         local_results (List[ScoringResult]): The `ScoringResult`s from the local evaluation
-
-     Returns:
-         List[ScoringResult]: The merged `ScoringResult`s (updated `scorers_data` field)
-     """
-     # No merge required
-     if not local_results and api_results:
-         return api_results
-     if not api_results and local_results:
-         return local_results
-
-     if len(api_results) != len(local_results):
-         # Results should be of same length because each ScoringResult is a 1-1 mapping to an Example
-         raise ValueError(f"The number of API and local results do not match: {len(api_results)} vs {len(local_results)}")
-
-     # Each ScoringResult in api and local have all the same fields besides `scorers_data`
-     for api_result, local_result in zip(api_results, local_results):
-         if api_result.input != local_result.input:
-             raise ValueError("The API and local results are not aligned.")
-         if api_result.actual_output != local_result.actual_output:
-             raise ValueError("The API and local results are not aligned.")
-         if api_result.expected_output != local_result.expected_output:
-             raise ValueError("The API and local results are not aligned.")
-         if api_result.context != local_result.context:
-             raise ValueError("The API and local results are not aligned.")
-         if api_result.retrieval_context != local_result.retrieval_context:
-             raise ValueError("The API and local results are not aligned.")
-         if api_result.additional_metadata != local_result.additional_metadata:
-             raise ValueError("The API and local results are not aligned.")
-         if api_result.tools_called != local_result.tools_called:
-             raise ValueError("The API and local results are not aligned.")
-         if api_result.expected_tools != local_result.expected_tools:
-             raise ValueError("The API and local results are not aligned.")
-
-
-         # Merge ScorerData from the API and local scorers together
-         api_scorer_data = api_result.scorers_data
-         local_scorer_data = local_result.scorers_data
-         if api_scorer_data is None and local_scorer_data is not None:
-             api_result.scorers_data = local_scorer_data
-
-         if api_scorer_data is not None and local_scorer_data is not None:
-             api_result.scorers_data = api_scorer_data + local_scorer_data
-
-     return api_results
-
-
- def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResult]:
-     """
-     Checks if any `ScoringResult` objects are missing `scorers_data`.
-
-     If any are missing, logs an error and returns the results.
-     """
-     for i, result in enumerate(results):
-         if not result.scorers_data:
-             error(
-                 f"Scorer data is missing for example {i}. "
-                 "This is usually caused when the example does not contain "
-                 "the fields required by the scorer. "
-                 "Check that your example contains the fields required by the scorers. "
-                 "TODO add docs link here for reference."
-             )
-     return results
-
-
- def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_key: str) -> None:
-     """
-     Checks if an evaluation run name already exists for a given project.
-
-     Args:
-         eval_name (str): Name of the evaluation run
-         project_name (str): Name of the project
-         judgment_api_key (str): API key for authentication
-
-     Raises:
-         ValueError: If the evaluation run name already exists
-         JudgmentAPIError: If there's an API error during the check
-     """
-     try:
-         response = requests.post(
-             f"{ROOT_API}/eval-run-name-exists/",
-             json={
-                 "eval_name": eval_name,
-                 "project_name": project_name,
-                 "judgment_api_key": judgment_api_key,
-             }
-         )
-
-         if response.status_code == 409:
-             error(f"Evaluation run name '{eval_name}' already exists for this project")
-             raise ValueError(f"Evaluation run name '{eval_name}' already exists for this project")
-
-         if not response.ok:
-             response_data = response.json()
-             error_message = response_data.get('detail', 'An unknown error occurred.')
-             error(f"Error checking eval run name: {error_message}")
-             raise JudgmentAPIError(error_message)
-
-     except requests.exceptions.RequestException as e:
-         error(f"Failed to check if eval run name exists: {str(e)}")
-         raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
-
-
- def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run: EvaluationRun) -> None:
-     """
-     Logs evaluation results to the Judgment API database.
-
-     Args:
-         merged_results (List[ScoringResult]): The results to log
-         evaluation_run (EvaluationRun): The evaluation run containing project info and API key
-
-     Raises:
-         JudgmentAPIError: If there's an API error during logging
-         ValueError: If there's a validation error with the results
-     """
-     try:
-         res = requests.post(
-             JUDGMENT_EVAL_LOG_API_URL,
-             json={
-                 "results": [result.to_dict() for result in merged_results],
-                 "judgment_api_key": evaluation_run.judgment_api_key,
-                 "project_name": evaluation_run.project_name,
-                 "eval_name": evaluation_run.eval_name,
-             }
-         )
-
-         if not res.ok:
-             response_data = res.json()
-             error_message = response_data.get('detail', 'An unknown error occurred.')
-             error(f"Error {res.status_code}: {error_message}")
-             raise JudgmentAPIError(error_message)
-
-         if "ui_results_url" in res.json():
-             rprint(f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)]{res.json()['ui_results_url']}[/]\n")
-
-     except requests.exceptions.RequestException as e:
-         error(f"Request failed while saving evaluation results to DB: {str(e)}")
-         raise JudgmentAPIError(f"Request failed while saving evaluation results to DB: {str(e)}")
-     except Exception as e:
-         error(f"Failed to save evaluation results to DB: {str(e)}")
-         raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
-
-
- def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
-     """
-     Executes an evaluation of `Example`s using one or more `Scorer`s
-
-     Args:
-         evaluation_run (EvaluationRun): Stores example and evaluation together for running
-
-     Args:
-         project_name (str): The name of the project the evaluation results belong to
-         eval_name (str): The name of the evaluation run
-         examples (List[Example]): The examples to evaluate
-         scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
-         model (str): The model used as a judge when using LLM as a Judge
-         aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
-         metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
-         judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
-         log_results (bool): Whether to log the results to the Judgment API
-
-
-     Returns:
-         List[ScoringResult]: The results of the evaluation. Each result is a dictionary containing the fields of a `ScoringResult` object.
-     """
-
-     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-     if not override and evaluation_run.log_results:
-         check_eval_run_name_exists(
-             evaluation_run.eval_name,
-             evaluation_run.project_name,
-             evaluation_run.judgment_api_key
-         )
-
-     # Set example IDs if not already set
-     debug("Initializing examples with IDs and timestamps")
-     for idx, example in enumerate(evaluation_run.examples):
-         if example.example_id is None:
-             example.example_id = idx
-             debug(f"Set example ID {idx} for input: {example.input[:50]}...")
-         example.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-         with example_logging_context(example.timestamp, example.example_id):
-             debug(f"Initialized example {example.example_id}")
-             debug(f"Input: {example.input}")
-             debug(f"Actual output: {example.actual_output}")
-             if example.expected_output:
-                 debug(f"Expected output: {example.expected_output}")
-             if example.context:
-                 debug(f"Context: {example.context}")
-             if example.retrieval_context:
-                 debug(f"Retrieval context: {example.retrieval_context}")
-             if example.additional_metadata:
-                 debug(f"Additional metadata: {example.additional_metadata}")
-             if example.tools_called:
-                 debug(f"Tools called: {example.tools_called}")
-             if example.expected_tools:
-                 debug(f"Expected tools: {example.expected_tools}")
-
-     debug(f"Starting evaluation run with {len(evaluation_run.examples)} examples")
-
-     # Group APIJudgmentScorers and JudgevalScorers, then evaluate them in parallel
-     debug("Grouping scorers by type")
-     judgment_scorers: List[APIJudgmentScorer] = []
-     local_scorers: List[JudgevalScorer] = []
-     for scorer in evaluation_run.scorers:
-         if isinstance(scorer, (APIJudgmentScorer, ClassifierScorer)):
-             judgment_scorers.append(scorer)
-             debug(f"Added judgment scorer: {type(scorer).__name__}")
-         else:
-             local_scorers.append(scorer)
-             debug(f"Added local scorer: {type(scorer).__name__}")
-
-     debug(f"Found {len(judgment_scorers)} judgment scorers and {len(local_scorers)} local scorers")
-
-     api_results: List[ScoringResult] = []
-     local_results: List[ScoringResult] = []
-
-     # Execute evaluation using Judgment API
-     if judgment_scorers:
-         info("Starting API evaluation")
-         debug(f"Creating API evaluation run with {len(judgment_scorers)} scorers")
-         try: # execute an EvaluationRun with just JudgmentScorers
-             api_evaluation_run: EvaluationRun = EvaluationRun(
-                 eval_name=evaluation_run.eval_name,
-                 project_name=evaluation_run.project_name,
-                 examples=evaluation_run.examples,
-                 scorers=judgment_scorers,
-                 model=evaluation_run.model,
-                 aggregator=evaluation_run.aggregator,
-                 metadata=evaluation_run.metadata,
-                 judgment_api_key=evaluation_run.judgment_api_key,
-                 log_results=evaluation_run.log_results
-             )
-             debug("Sending request to Judgment API")
-             response_data: List[Dict] = execute_api_eval(api_evaluation_run) # Dicts are `ScoringResult` objs
-             info(f"Received {len(response_data['results'])} results from API")
-         except JudgmentAPIError as e:
-             error(f"An error occurred while executing the Judgment API request: {str(e)}")
-             raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
-         except ValueError as e:
-             raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: {str(e)}")
-
-         # Convert the response data to `ScoringResult` objects
-         debug("Processing API results")
-         for idx, result in enumerate(response_data["results"]):
-             with example_logging_context(evaluation_run.examples[idx].timestamp, evaluation_run.examples[idx].example_id):
-                 for scorer in judgment_scorers:
-                     debug(f"Processing API result for example {idx} and scorer {scorer.score_type}")
-             # filter for key-value pairs that are used to initialize ScoringResult
-             # there may be some stuff in here that doesn't belong in ScoringResult
-             # TODO: come back and refactor this to have ScoringResult take in **kwargs
-             filtered_result = {k: v for k, v in result.items() if k in ScoringResult.__annotations__}
-
-             # Convert scorers_data dicts to ScorerData objects
-             if "scorers_data" in filtered_result and filtered_result["scorers_data"]:
-                 filtered_result["scorers_data"] = [
-                     ScorerData(**scorer_dict)
-                     for scorer_dict in filtered_result["scorers_data"]
-                 ]
-
-             api_results.append(ScoringResult(**filtered_result))
-
-     # Run local evals
-     if local_scorers: # List[JudgevalScorer]
-         info("Starting local evaluation")
-         for example in evaluation_run.examples:
-             with example_logging_context(example.timestamp, example.example_id):
-                 debug(f"Processing example {example.example_id}: {example.input}")
-
-         results: List[ScoringResult] = asyncio.run(
-             a_execute_scoring(
-                 evaluation_run.examples,
-                 local_scorers,
-                 model=evaluation_run.model,
-                 ignore_errors=True,
-                 skip_on_missing_params=True,
-                 show_indicator=True,
-                 _use_bar_indicator=True,
-                 throttle_value=0,
-                 max_concurrent=100,
-             )
-         )
-         local_results = results
-         info(f"Local evaluation complete with {len(local_results)} results")
-
-     # Aggregate the ScorerData from the API and local evaluations
-     debug("Merging API and local results")
-     merged_results: List[ScoringResult] = merge_results(api_results, local_results)
-     merged_results = check_missing_scorer_data(merged_results)
-
-     info(f"Successfully merged {len(merged_results)} results")
-
-     if evaluation_run.log_results:
-         log_evaluation_results(merged_results, evaluation_run)
-
-     for i, result in enumerate(merged_results):
-         if not result.scorers_data: # none of the scorers could be executed on this example
-             info(f"None of the scorers could be executed on example {i}. This is usually because the Example is missing the fields needed by the scorers. Try checking that the Example has the necessary fields for your scorers.")
-     return merged_results
-
- def assert_test(scoring_results: List[ScoringResult]) -> None:
-     """
-     Collects all failed scorers from the scoring results.
-
-     Args:
-         ScoringResults (List[ScoringResult]): List of scoring results to check
-
-     Returns:
-         None. Raises exceptions for any failed test cases.
-     """
-     failed_cases: List[ScorerData] = []
-
-     for result in scoring_results:
-         if not result.success:
-
-             # Create a test case context with all relevant fields
-             test_case = {
-                 'input': result.input,
-                 'actual_output': result.actual_output,
-                 'expected_output': result.expected_output,
-                 'context': result.context,
-                 'retrieval_context': result.retrieval_context,
-                 'additional_metadata': result.additional_metadata,
-                 'tools_called': result.tools_called,
-                 'expected_tools': result.expected_tools,
-                 'eval_run_name': result.eval_run_name,
-                 'failed_scorers': []
-             }
-             if result.scorers_data:
-                 # If the result was not successful, check each scorer_data
-                 for scorer_data in result.scorers_data:
-                     if not scorer_data.success:
-                         test_case['failed_scorers'].append(scorer_data)
-             failed_cases.append(test_case)
-
-     if failed_cases:
-         error_msg = f"The following test cases failed: \n"
-         for fail_case in failed_cases:
-             error_msg += f"\nInput: {fail_case['input']}\n"
-             error_msg += f"Actual Output: {fail_case['actual_output']}\n"
-             error_msg += f"Expected Output: {fail_case['expected_output']}\n"
-             error_msg += f"Context: {fail_case['context']}\n"
-             error_msg += f"Retrieval Context: {fail_case['retrieval_context']}\n"
-             error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
-             error_msg += f"Tools Called: {fail_case['tools_called']}\n"
-             error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
-             error_msg += f"Eval Run Name: {fail_case['eval_run_name']}\n"
-
-             for fail_scorer in fail_case['failed_scorers']:
-
-                 error_msg += (
-                     f"\nScorer Name: {fail_scorer.name}\n"
-                     f"Threshold: {fail_scorer.threshold}\n"
-                     f"Success: {fail_scorer.success}\n"
-                     f"Score: {fail_scorer.score}\n"
-                     f"Reason: {fail_scorer.reason}\n"
-                     f"Strict Mode: {fail_scorer.strict_mode}\n"
-                     f"Evaluation Model: {fail_scorer.evaluation_model}\n"
-                     f"Error: {fail_scorer.error}\n"
-                     f"Evaluation Cost: {fail_scorer.evaluation_cost}\n"
-                     f"Verbose Logs: {fail_scorer.verbose_logs}\n"
-                     f"Additional Metadata: {fail_scorer.additional_metadata}\n"
-                 )
-             error_msg += "-"*100
-
-         raise AssertionError(error_msg)
-
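
For orientation, here is a minimal sketch of how the removed `run_eval` entry point above was driven in the 0.0.x line. It uses only names visible in the deleted module (`EvaluationRun`, `run_eval`, `assert_test`, and the `Example` fields it logs); the literal values, the judge model string, and the `FaithfulnessScorer` import are assumptions for illustration, not taken from this diff.

```python
# Hypothetical usage sketch for the removed judgeval/run_evaluation.py module (0.0.x).
from judgeval.data import Example
from judgeval.evaluation_run import EvaluationRun          # module removed in 0.22.x
from judgeval.run_evaluation import run_eval, assert_test  # module removed in 0.22.x
from judgeval.scorers import FaithfulnessScorer            # assumed export in 0.0.11

example = Example(
    input="What is the capital of France?",
    actual_output="Paris is the capital of France.",
    retrieval_context=["Paris is the capital and largest city of France."],
)

run = EvaluationRun(
    eval_name="demo-run",
    project_name="demo-project",
    examples=[example],
    scorers=[FaithfulnessScorer(threshold=0.7)],  # APIJudgmentScorer, executed via the Judgment API
    model="gpt-4o",                               # judge model name (placeholder)
    log_results=True,
    judgment_api_key="<JUDGMENT_API_KEY>",        # placeholder
)

results = run_eval(run, override=False)  # merges API and local scorer results per example
assert_test(results)                     # raises AssertionError listing any failed scorers
```

In 0.22.x this flow appears to move into `judgeval/evaluation/__init__.py` and `judgeval/data/evaluation_run.py` (see the file list above).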

judgeval/scorers/judgeval_scorer.py
@@ -1,140 +0,0 @@
- """
- Judgeval Scorer class
-
- Enables client to create custom scorers that do not fall under any of the ready-made Judgment scorers.
- To create a custom scorer, extend this class and implement the `score_example`, `a_score_example`, and `success_check` methods.
- """
-
- from typing import Optional, Dict, Union, List
- from abc import abstractmethod
-
- from judgeval.common.logger import debug, info, warning, error
- from judgeval.judges import JudgevalJudge
- from judgeval.judges.utils import create_judge
-
-
- class JudgevalScorer:
-     """
-     Base class for scorers in `judgeval`.
-
-     In practice, you should not implement this class unless you are creating a custom scorer.
-     Judgeval offers 10+ default scorers that you can use out of the box.
-
-     If you want to create a scorer that does not fall under any of the ready-made Judgment scorers,
-     you can create a custom scorer by extending this class.
-     """
-     score_type: str  # name of your new scorer
-     threshold: float  # The threshold to pass a test while using this scorer as a scorer
-     score: Optional[float] = None  # The float score of the scorer run on the test case
-     score_breakdown: Dict = None
-     reason: Optional[str] = None  # The reason for the score when evaluating the test case
-     success: Optional[bool] = None  # Whether the test case passed or failed
-     evaluation_model: Optional[str] = None  # The model used to evaluate the test case
-     strict_mode: bool = False  # Whether to run the scorer in strict mode
-     async_mode: bool = True  # Whether to run the scorer in async mode
-     verbose_mode: bool = True  # Whether to run the scorer in verbose mode
-     include_reason: bool = False  # Whether to include the reason in the output
-     error: Optional[str] = None  # The error message if the scorer failed
-     evaluation_cost: Optional[float] = None  # The cost of running the scorer
-     verbose_logs: Optional[str] = None  # The verbose logs of the scorer
-     additional_metadata: Optional[Dict] = None  # Additional metadata for the scorer
-
-     def __init__(
-         self,
-         score_type: str,
-         threshold: float,
-         score: Optional[float] = None,
-         score_breakdown: Optional[Dict] = None,
-         reason: Optional[str] = None,
-         success: Optional[bool] = None,
-         evaluation_model: Optional[str] = None,
-         strict_mode: bool = False,
-         async_mode: bool = True,
-         verbose_mode: bool = True,
-         include_reason: bool = False,
-         error: Optional[str] = None,
-         evaluation_cost: Optional[float] = None,
-         verbose_logs: Optional[str] = None,
-         additional_metadata: Optional[Dict] = None
-     ):
-         debug(f"Initializing JudgevalScorer with score_type={score_type}, threshold={threshold}")
-         if not 0 <= threshold <= 1:
-             raise ValueError("Threshold must be between 0 and 1")
-         if strict_mode:
-             warning("Strict mode enabled - scoring will be more rigorous")
-         info(f"JudgevalScorer initialized with evaluation_model: {evaluation_model}")
-         self.score_type = score_type
-         self.threshold = threshold
-         self.score = score
-         self.score_breakdown = score_breakdown
-         self.reason = reason
-         self.success = success
-         self.evaluation_model = evaluation_model
-         self.strict_mode = strict_mode
-         self.async_mode = async_mode
-         self.verbose_mode = verbose_mode
-         self.include_reason = include_reason
-         self.error = error
-         self.evaluation_cost = evaluation_cost
-         self.verbose_logs = verbose_logs
-         self.additional_metadata = additional_metadata
-
-     def _add_model(self, model: Optional[Union[str, List[str], JudgevalJudge]] = None):
-         """
-         Adds the evaluation model to the JudgevalScorer instance
-
-         This method is used at eval time
-         """
-         self.model, self.using_native_model = create_judge(model)
-         self.evaluation_model = self.model.get_model_name()
-
-     @abstractmethod
-     def score_example(self, example, *args, **kwargs) -> float:
-         """
-         Measures the score on a single example
-         """
-         warning("Attempting to call unimplemented score_example method")
-         error("score_example method not implemented")
-         raise NotImplementedError("You must implement the `score` method in your custom scorer")
-
-     @abstractmethod
-     async def a_score_example(self, example, *args, **kwargs) -> float:
-         """
-         Asynchronously measures the score on a single example
-         """
-         warning("Attempting to call unimplemented a_score_example method")
-         error("a_score_example method not implemented")
-         raise NotImplementedError("You must implement the `a_score` method in your custom scorer")
-
-     @abstractmethod
-     def _success_check(self) -> bool:
-         """
-         For unit testing, determines whether the test case passes or fails
-         """
-         warning("Attempting to call unimplemented success_check method")
-         error("success_check method not implemented")
-         raise NotImplementedError("You must implement the `passes` method in your custom scorer")
-
-     def __str__(self):
-         debug("Converting JudgevalScorer instance to string representation")
-         if self.error:
-             warning(f"JudgevalScorer contains error: {self.error}")
-         info(f"JudgevalScorer status - success: {self.success}, score: {self.score}")
-         attributes = {
-             "score_type": self.score_type,
-             "threshold": self.threshold,
-             "score": self.score,
-             "score_breakdown": self.score_breakdown,
-             "reason": self.reason,
-             "success": self.success,
-             "evaluation_model": self.evaluation_model,
-             "strict_mode": self.strict_mode,
-             "async_mode": self.async_mode,
-             "verbose_mode": self.verbose_mode,
-             "include_reason": self.include_reason,
-             "error": self.error,
-             "evaluation_cost": self.evaluation_cost,
-             "verbose_logs": self.verbose_logs,
-             "additional_metadata": self.additional_metadata,
-         }
-         return f"JudgevalScorer({attributes})"
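
For comparison, here is a minimal sketch of the kind of custom scorer this removed `JudgevalScorer` base class supported, implementing the three abstract hooks shown above. The `ExactMatchScorer` name and its exact-match logic are illustrative, not part of the package.

```python
# Illustrative subclass of the removed JudgevalScorer base class (0.0.x API).
from judgeval.scorers.judgeval_scorer import JudgevalScorer  # module removed in 0.22.x


class ExactMatchScorer(JudgevalScorer):
    def __init__(self, threshold: float = 1.0):
        super().__init__(score_type="Exact Match", threshold=threshold)

    def score_example(self, example, *args, **kwargs) -> float:
        # 1.0 when the actual output matches the expected output exactly, else 0.0.
        self.score = float(example.actual_output == example.expected_output)
        self.success = self._success_check()
        return self.score

    async def a_score_example(self, example, *args, **kwargs) -> float:
        # The async hook can simply defer to the synchronous implementation here.
        return self.score_example(example, *args, **kwargs)

    def _success_check(self) -> bool:
        return self.score is not None and self.score >= self.threshold
```

In 0.22.x the custom-scorer surface appears to be replaced by `judgeval/scorers/base_scorer.py`, `example_scorer.py`, and `agent_scorer.py` (see the file list above).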

judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py
@@ -1,19 +0,0 @@
- """
- `judgeval` contextual precision scorer
-
- TODO add link to docs page for this scorer
-
- """
-
- # Internal imports
- from judgeval.scorers.api_scorer import APIJudgmentScorer
- from judgeval.constants import APIScorer
-
-
- class ContextualPrecisionScorer(APIJudgmentScorer):
-     def __init__(self, threshold: float):
-         super().__init__(threshold=threshold, score_type=APIScorer.CONTEXTUAL_PRECISION)
-
-     @property
-     def __name__(self):
-         return "Contextual Precision"

judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py
@@ -1,19 +0,0 @@
- """
- `judgeval` contextual recall scorer
-
- TODO add link to docs page for this scorer
-
- """
-
- # Internal imports
- from judgeval.scorers.api_scorer import APIJudgmentScorer
- from judgeval.constants import APIScorer
-
-
- class ContextualRecallScorer(APIJudgmentScorer):
-     def __init__(self, threshold: float):
-         super().__init__(threshold=threshold, score_type=APIScorer.CONTEXTUAL_RECALL)
-
-     @property
-     def __name__(self):
-         return "Contextual Recall"

judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py
@@ -1,22 +0,0 @@
- """
- `judgeval` contextual relevancy scorer
-
- TODO add link to docs page for this scorer
-
- """
-
- # Internal imports
- from judgeval.scorers.api_scorer import APIJudgmentScorer
- from judgeval.constants import APIScorer
-
-
- class ContextualRelevancyScorer(APIJudgmentScorer):
-     """
-     Scorer that checks if the output of a model is relevant to the retrieval context
-     """
-     def __init__(self, threshold: float):
-         super().__init__(threshold=threshold, score_type=APIScorer.CONTEXTUAL_RELEVANCY)
-
-     @property
-     def __name__(self):
-         return "Contextual Relevancy"

judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py
@@ -1,19 +0,0 @@
- """
- `judgeval` hallucination scorer
-
- TODO add link to docs page for this scorer
-
- """
-
- # Internal imports
- from judgeval.scorers.api_scorer import APIJudgmentScorer
- from judgeval.constants import APIScorer
-
-
- class HallucinationScorer(APIJudgmentScorer):
-     def __init__(self, threshold: float):
-         super().__init__(threshold=threshold, score_type=APIScorer.HALLUCINATION)
-
-     @property
-     def __name__(self):
-         return "Hallucination"
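
The four removed modules above share one pattern: a thin `APIJudgmentScorer` subclass that carries only a `threshold` and a `score_type`, with scoring performed server-side by the Judgment API. A hedged sketch of how one was instantiated (the threshold value is arbitrary):

```python
# Sketch: instantiating one of the removed API scorer wrappers at its 0.0.x module path.
from judgeval.scorers.judgeval_scorers.api_scorers.hallucination import HallucinationScorer

scorer = HallucinationScorer(threshold=0.5)  # pass/fail cutoff for the scorer
print(scorer.__name__)  # -> "Hallucination", via the @property defined above
```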