judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of judgeval might be problematic.

Files changed (171)
  1. judgeval/__init__.py +177 -12
  2. judgeval/api/__init__.py +519 -0
  3. judgeval/api/api_types.py +407 -0
  4. judgeval/cli.py +79 -0
  5. judgeval/constants.py +76 -47
  6. judgeval/data/__init__.py +3 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +15 -56
  9. judgeval/data/judgment_types.py +450 -0
  10. judgeval/data/result.py +29 -73
  11. judgeval/data/scorer_data.py +29 -62
  12. judgeval/data/scripts/fix_default_factory.py +23 -0
  13. judgeval/data/scripts/openapi_transform.py +123 -0
  14. judgeval/data/trace.py +121 -0
  15. judgeval/dataset/__init__.py +264 -0
  16. judgeval/env.py +52 -0
  17. judgeval/evaluation/__init__.py +344 -0
  18. judgeval/exceptions.py +27 -0
  19. judgeval/integrations/langgraph/__init__.py +13 -0
  20. judgeval/integrations/openlit/__init__.py +50 -0
  21. judgeval/judges/__init__.py +2 -3
  22. judgeval/judges/base_judge.py +2 -3
  23. judgeval/judges/litellm_judge.py +100 -20
  24. judgeval/judges/together_judge.py +101 -20
  25. judgeval/judges/utils.py +20 -24
  26. judgeval/logger.py +62 -0
  27. judgeval/prompt/__init__.py +330 -0
  28. judgeval/scorers/__init__.py +18 -25
  29. judgeval/scorers/agent_scorer.py +17 -0
  30. judgeval/scorers/api_scorer.py +45 -41
  31. judgeval/scorers/base_scorer.py +83 -38
  32. judgeval/scorers/example_scorer.py +17 -0
  33. judgeval/scorers/exceptions.py +1 -0
  34. judgeval/scorers/judgeval_scorers/__init__.py +0 -148
  35. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
  36. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
  37. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
  38. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
  39. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
  40. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
  41. judgeval/scorers/score.py +77 -306
  42. judgeval/scorers/utils.py +4 -199
  43. judgeval/tracer/__init__.py +1122 -2
  44. judgeval/tracer/constants.py +1 -0
  45. judgeval/tracer/exporters/__init__.py +40 -0
  46. judgeval/tracer/exporters/s3.py +119 -0
  47. judgeval/tracer/exporters/store.py +59 -0
  48. judgeval/tracer/exporters/utils.py +32 -0
  49. judgeval/tracer/keys.py +63 -0
  50. judgeval/tracer/llm/__init__.py +7 -0
  51. judgeval/tracer/llm/config.py +78 -0
  52. judgeval/tracer/llm/constants.py +9 -0
  53. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  54. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  55. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  56. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  57. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  58. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  59. judgeval/tracer/llm/llm_google/config.py +6 -0
  60. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  61. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  62. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  63. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  64. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  65. judgeval/tracer/llm/llm_openai/config.py +6 -0
  66. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  67. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  68. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  69. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  70. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  71. judgeval/tracer/llm/llm_together/config.py +6 -0
  72. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  73. judgeval/tracer/llm/providers.py +19 -0
  74. judgeval/tracer/managers.py +167 -0
  75. judgeval/tracer/processors/__init__.py +220 -0
  76. judgeval/tracer/utils.py +19 -0
  77. judgeval/trainer/__init__.py +14 -0
  78. judgeval/trainer/base_trainer.py +122 -0
  79. judgeval/trainer/config.py +128 -0
  80. judgeval/trainer/console.py +144 -0
  81. judgeval/trainer/fireworks_trainer.py +396 -0
  82. judgeval/trainer/trainable_model.py +243 -0
  83. judgeval/trainer/trainer.py +70 -0
  84. judgeval/utils/async_utils.py +39 -0
  85. judgeval/utils/decorators/__init__.py +0 -0
  86. judgeval/utils/decorators/dont_throw.py +37 -0
  87. judgeval/utils/decorators/use_once.py +13 -0
  88. judgeval/utils/file_utils.py +97 -0
  89. judgeval/utils/guards.py +36 -0
  90. judgeval/utils/meta.py +27 -0
  91. judgeval/utils/project.py +15 -0
  92. judgeval/utils/serialize.py +253 -0
  93. judgeval/utils/testing.py +70 -0
  94. judgeval/utils/url.py +10 -0
  95. judgeval/utils/version_check.py +28 -0
  96. judgeval/utils/wrappers/README.md +3 -0
  97. judgeval/utils/wrappers/__init__.py +15 -0
  98. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  99. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  100. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  101. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  102. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  103. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  104. judgeval/utils/wrappers/py.typed +0 -0
  105. judgeval/utils/wrappers/utils.py +35 -0
  106. judgeval/version.py +5 -0
  107. judgeval/warnings.py +4 -0
  108. judgeval-0.22.2.dist-info/METADATA +265 -0
  109. judgeval-0.22.2.dist-info/RECORD +112 -0
  110. judgeval-0.22.2.dist-info/entry_points.txt +2 -0
  111. judgeval/clients.py +0 -39
  112. judgeval/common/__init__.py +0 -8
  113. judgeval/common/exceptions.py +0 -28
  114. judgeval/common/logger.py +0 -189
  115. judgeval/common/tracer.py +0 -798
  116. judgeval/common/utils.py +0 -763
  117. judgeval/data/api_example.py +0 -111
  118. judgeval/data/datasets/__init__.py +0 -5
  119. judgeval/data/datasets/dataset.py +0 -286
  120. judgeval/data/datasets/eval_dataset_client.py +0 -193
  121. judgeval/data/datasets/ground_truth.py +0 -54
  122. judgeval/data/datasets/utils.py +0 -74
  123. judgeval/evaluation_run.py +0 -132
  124. judgeval/judges/mixture_of_judges.py +0 -248
  125. judgeval/judgment_client.py +0 -354
  126. judgeval/run_evaluation.py +0 -439
  127. judgeval/scorers/judgeval_scorer.py +0 -140
  128. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
  129. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
  130. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
  131. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
  132. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
  133. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
  134. judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
  135. judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
  136. judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
  137. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
  138. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
  139. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
  140. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
  141. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
  142. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
  143. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
  144. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
  145. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
  146. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
  147. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
  148. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
  149. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
  150. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
  151. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
  152. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
  153. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
  154. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
  155. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
  156. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
  157. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
  158. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
  159. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
  160. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
  161. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
  162. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
  163. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
  164. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
  165. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
  166. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
  167. judgeval/scorers/prompt_scorer.py +0 -439
  168. judgeval-0.0.11.dist-info/METADATA +0 -36
  169. judgeval-0.0.11.dist-info/RECORD +0 -84
  170. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
  171. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/score.py CHANGED
@@ -1,301 +1,104 @@
 """
-Infrastructure for executing evaluations of `Example`s using one or more `JudgevalScorer`s.
+Infrastructure for executing evaluations of `Example`s using one or more `ExampleScorer`s.
 """
 
-
 import asyncio
-import time
+import time
 from tqdm.asyncio import tqdm_asyncio
 from typing import List, Union, Optional, Callable
-from rich.progress import Progress, SpinnerColumn, TextColumn
 
 from judgeval.data import (
-    Example,
+    Example,
     ScoringResult,
     generate_scoring_result,
-    create_process_example,
     create_scorer_data,
 )
-from judgeval.scorers import JudgevalScorer
-from judgeval.scorers.utils import clone_scorers, scorer_console_msg
-from judgeval.common.exceptions import MissingTestCaseParamsError
-from judgeval.common.logger import example_logging_context, debug, error, warning, info
+from judgeval.scorers.example_scorer import ExampleScorer
+from judgeval.scorers.utils import clone_scorers
+from judgeval.logger import judgeval_logger
 from judgeval.judges import JudgevalJudge
+from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
+
 
 async def safe_a_score_example(
-    scorer: JudgevalScorer,
+    scorer: ExampleScorer,
     example: Example,
-    ignore_errors: bool,
-    skip_on_missing_params: bool,
 ):
     """
     Scoring task function when not using a progress indicator!
-    "Safely" scores an `Example` using a `JudgevalScorer` by gracefully handling any exceptions that may occur.
+    "Safely" scores an `Example` using a `ExampleScorer` by gracefully handling any exceptions that may occur.
 
     Args:
-        scorer (JudgevalScorer): The `JudgevalScorer` to use for scoring the example.
+        scorer (ExampleScorer): The `ExampleScorer` to use for scoring the example.
         example (Example): The `Example` to be scored.
-
-        ignore_errors (bool): Whether to ignore errors during the evaluation.
-            If set to false, any error will be raised and stop the evaluation.
-            If set to true, the error will be stored in the `error` attribute of the `JudgevalScorer` and the `success` attribute will be set to False.
-
-        skip_on_missing_params (bool): Whether to skip the test case if required parameters are missing.
     """
-    debug(f"Starting safe_a_score_example for example {example.example_id}")
     try:
-        await scorer.a_score_example(example, _show_indicator=False)
-        info(f"Successfully scored example {example.example_id}")
-    except MissingTestCaseParamsError as e:
-        if skip_on_missing_params:  # Skip the example if the scorer requires parameters that are missing
-            with example_logging_context(example.timestamp, example.example_id):
-                warning(f"Skipping example {example.example_id} due to missing parameters")
-            scorer.skipped = True
-            return
+        score = await scorer.a_score_example(example)
+        if score is None:
+            raise Exception("a_score_example need to return a score")
+        elif score < 0:
+            judgeval_logger.warning("score cannot be less than 0 , setting to 0")
+            score = 0
+        elif score > 1:
+            judgeval_logger.warning("score cannot be greater than 1 , setting to 1")
+            score = 1
         else:
-            if ignore_errors:  # Gracefully handle the error, does not stop the evaluation
-                scorer.error = str(e)
-                scorer.success = False
-                with example_logging_context(example.timestamp, example.example_id):
-                    warning(f"Ignoring errors for example {example.example_id}: {str(e)} due to missing parameters")
-            else:  # Raise the error and stop the evaluation
-                with example_logging_context(example.timestamp, example.example_id):
-                    error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
-                raise
-    except TypeError:  # in case a_score_example does not accept _show_indicator
-        try:
-            await scorer.a_score_example(example)
-        except MissingTestCaseParamsError as e:
-            if skip_on_missing_params:
-                scorer.skipped = True
-                with example_logging_context(example.timestamp, example.example_id):
-                    warning(f"Skipping example {example.example_id} due to missing parameters")
-                return
-            else:
-                if ignore_errors:
-                    scorer.error = str(e)
-                    scorer.success = False
-                    with example_logging_context(example.timestamp, example.example_id):
-                        warning(f"Ignoring errors for example {example.example_id}: {str(e)} due to missing parameters")
-                else:
-                    with example_logging_context(example.timestamp, example.example_id):
-                        error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
-                    raise
+            scorer.score = score
+            scorer.success = scorer.success_check()
     except Exception as e:
-        if ignore_errors:
-            scorer.error = str(e)
-            scorer.success = False  # Assuming you want to set success to False
-            with example_logging_context(example.timestamp, example.example_id):
-                warning(f"Ignoring errors for example {example.example_id}: {str(e)}")
-        else:
-            with example_logging_context(example.timestamp, example.example_id):
-                error(f"Stopping example {example.example_id}: {str(e)}")
-            raise
-
-
-async def score_task(
-    task_id: int,
-    progress: Progress,
-    scorer: JudgevalScorer,
-    example: Example,
-    ignore_errors: bool = True,
-    skip_on_missing_params: bool = True,
-):
-    """
-    Task function for asynchronously measuring a given example using a JudgevalScorer.
-
-    Args:
-        task_id (int): The ID of the task being measured.
-        progress (Progress): An instance of the Progress class to track task progress.
-        scorer (JudgevalScorer): An instance of the JudgevalScorer class used to score the example.
-        example (Example): The example to be scored.
-        ignore_errors (bool, optional): Whether to ignore errors during scoring. Defaults to True.
-        skip_on_missing_params (bool, optional): Whether to skip scoring if there are missing parameters. Defaults to True.
-
-    Raises:
-        MissingTestCaseParamsError: If required test case parameters are missing and skip_on_missing_params is False.
-        Exception: If an unexpected error occurs and ignore_errors is False.
-
-    Returns:
-        None
-    """
-    while not progress.finished:
-        start_time = time.perf_counter()
-
-        try:
-            await scorer.a_score_example(example, _show_indicator=False)
-            finish_text = "Completed"
-        except MissingTestCaseParamsError as e:
-            if skip_on_missing_params:
-                scorer.skipped = True
-                with example_logging_context(example.timestamp, example.example_id):
-                    debug(f"Skipping example {example.example_id} due to missing parameters")
-                return
-            else:
-                if ignore_errors:
-                    scorer.error = str(e)
-                    scorer.success = False  # Override success
-                    finish_text = "Failed"
-                else:
-                    with example_logging_context(example.timestamp, example.example_id):
-                        error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
-                    raise
-        except TypeError:
-            try:
-                await scorer.a_score_example(example)
-                finish_text = "Completed"
-            except MissingTestCaseParamsError as e:
-                if skip_on_missing_params:
-                    scorer.skipped = True
-                    with example_logging_context(example.timestamp, example.example_id):
-                        debug(f"Skipping example {example.example_id} due to missing parameters")
-                    return
-                else:
-                    if ignore_errors:
-                        scorer.error = str(e)
-                        scorer.success = False  # Override success
-                        finish_text = "Failed"
-                    else:
-                        with example_logging_context(example.timestamp, example.example_id):
-                            error(f"Stopping example {example.example_id}: {str(e)} due to missing parameters")
-                        raise
-        except Exception as e:
-            if ignore_errors:
-                scorer.error = str(e)
-                scorer.success = False  # Override success
-                finish_text = "Failed"
-                with example_logging_context(example.timestamp, example.example_id):
-                    warning(f"Ignoring errors for example {example.example_id}: {str(e)}")
-            else:
-                with example_logging_context(example.timestamp, example.example_id):
-                    error(f"Stopping example {example.example_id}: {str(e)}")
-                raise
-
-        end_time = time.perf_counter()
-        time_taken = format(end_time - start_time, ".2f")
-        progress.update(task_id, advance=100)  # Mark task as complete
-        progress.update(
-            task_id,
-            description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]{finish_text}! ({time_taken}s)",
-        )
-        break
-
-
-async def score_with_indicator(
-    scorers: List[JudgevalScorer],
-    example: Example,
-    ignore_errors: bool,
-    skip_on_missing_params: bool,
-    show_indicator: bool,
-):
-    """
-    Scores an example using a list of JudgevalScorers, optionally displaying a progress indicator.
-
-    Args:
-        scorers (List[JudgevalScorer]): A list of JudgevalScorer objects to evaluate the example.
-        example (Example): The example to be scored.
-        ignore_errors (bool): If True, errors during scoring will be ignored.
-        skip_on_missing_params (bool): If True, scoring will be skipped if required parameters are missing.
-        show_indicator (bool): If True, a progress indicator will be displayed during scoring.
-
-    Returns:
-        None
-
-    Raises:
-        Any exceptions raised by the scoring functions, unless `ignore_errors` is True.
-    """
-    if show_indicator:
-        with Progress(
-            SpinnerColumn(style="rgb(106,0,255)"),
-            TextColumn("[progress.description]{task.description}"),
-            transient=True,
-        ) as progress:
-            tasks = []
-            for scorer in scorers:
-                task_id = progress.add_task(
-                    description=scorer_console_msg(
-                        scorer, async_mode=True
-                    ),
-                    total=100,
-                )  # Add task to progress bar
-                tasks.append(
-                    score_task(
-                        task_id,
-                        progress,
-                        scorer,
-                        example,
-                        ignore_errors,
-                        skip_on_missing_params,
-                    )  # Create and execute task to score the example with a single scorer
-                )
-            await asyncio.gather(*tasks)
-    else:
-        tasks = [
-            safe_a_score_example(
-                scorer, example, ignore_errors, skip_on_missing_params
-            )
-            for scorer in scorers
-        ]
-
-        await asyncio.gather(*tasks)
+        judgeval_logger.error(f"Error during scoring: {str(e)}")
+        scorer.error = str(e)
+        scorer.success = False
+        scorer.score = 0
+        return
 
 
 async def a_execute_scoring(
     examples: List[Example],
-    scorers: List[JudgevalScorer],
-    model: Optional[Union[str, List[str], JudgevalJudge]] = None,
-    ignore_errors: bool = True,
-    skip_on_missing_params: bool = True,
-    show_indicator: bool = True,
+    scorers: List[ExampleScorer],
+    model: Optional[Union[str, List[str], JudgevalJudge]] = JUDGMENT_DEFAULT_GPT_MODEL,
+    ignore_errors: bool = False,
     throttle_value: int = 0,
     max_concurrent: int = 100,
-    verbose_mode: Optional[bool] = None,
-    _use_bar_indicator: bool = True,
+    show_progress: bool = True,
 ) -> List[ScoringResult]:
     """
-    Executes evaluations of `Example`s asynchronously using one or more `JudgevalScorer`s.
-    Each `Example` will be evaluated by all of the `JudgevalScorer`s in the `scorers` list.
+    Executes evaluations of `Example`s asynchronously using one or more `ExampleScorer`s.
+    Each `Example` will be evaluated by all of the `ExampleScorer`s in the `scorers` list.
 
     Args:
         examples (List[Example]): A list of `Example` objects to be evaluated.
-        scorers (List[JudgevalScorer]): A list of `JudgevalScorer` objects to evaluate the examples.
+        scorers (List[ExampleScorer]): A list of `ExampleScorer` objects to evaluate the examples.
         model (Union[str, List[str], JudgevalJudge]): The model to use for evaluation.
         ignore_errors (bool): Whether to ignore errors during evaluation.
-        skip_on_missing_params (bool): Whether to skip evaluation if parameters are missing.
-        show_indicator (bool): Whether to show a progress indicator.
         throttle_value (int): The amount of time to wait between starting each task.
         max_concurrent (int): The maximum number of concurrent tasks.
-        verbose_mode (Optional[bool]): If set, enables verbose mode for scorers.
-        _use_bar_indicator (bool): Whether to use a progress bar indicator.
+        show_progress (bool): Whether to show the progress bar indicator.
 
     Returns:
         List[ScoringResult]: A list of `ScoringResult` objects containing the evaluation results.
     """
+
     semaphore = asyncio.Semaphore(max_concurrent)
 
     async def execute_with_semaphore(func: Callable, *args, **kwargs):
-        try:
-            async with semaphore:
+        async with semaphore:
+            try:
                 return await func(*args, **kwargs)
-        except Exception as e:
-            error(f"Error executing function: {e}")
-            if kwargs.get('ignore_errors', False):
-                # Return None when ignoring errors
-                return None
-            raise
-
-    if verbose_mode is not None:
-        for scorer in scorers:
-            scorer.verbose_mode = verbose_mode
+            except Exception as e:
+                judgeval_logger.error(f"Error executing function: {e}")
+                if kwargs.get("ignore_errors", False):
+                    return None
+                raise
 
-    # Add model to scorers
     for scorer in scorers:
-        scorer._add_model(model)
+        if not scorer.model and isinstance(model, str):
+            scorer._add_model(model)
 
-    scoring_results: List[ScoringResult] = [None for _ in examples]
+    scoring_results: List[Optional[ScoringResult]] = [None for _ in examples]
     tasks = []
 
-    if show_indicator and _use_bar_indicator:
+    if show_progress:
         with tqdm_asyncio(
             desc=f"Evaluating {len(examples)} example(s) in parallel",
             unit="Example",
@@ -303,24 +106,12 @@ async def a_execute_scoring(
             bar_format="{desc}: |{bar}|{percentage:3.0f}% ({n_fmt}/{total_fmt}) [Time Taken: {elapsed}, {rate_fmt}{postfix}]",
         ) as pbar:
             for i, ex in enumerate(examples):
-                with example_logging_context(ex.timestamp, ex.example_id):
-                    debug(f"Starting scoring for example {ex.example_id}")
-                    debug(f"Input: {ex.input}")
-                    debug(f"Using {len(scorers)} scorers")
-                    for scorer in scorers:
-                        debug(f"Using scorer: {type(scorer).__name__}")
-                        if hasattr(scorer, 'threshold'):
-                            debug(f"Scorer threshold: {scorer.threshold}")
-                        if hasattr(scorer, 'model'):
-                            debug(f"Scorer model: {type(scorer.model).__name__}")
                 if isinstance(ex, Example):
                     if len(scorers) == 0:
                         pbar.update(1)
                         continue
-
-                    cloned_scorers: List[JudgevalScorer] = clone_scorers(
-                        scorers
-                    )
+
+                    cloned_scorers = clone_scorers(scorers)  # type: ignore
                     task = execute_with_semaphore(
                         func=a_eval_examples_helper,
                         scorers=cloned_scorers,
@@ -328,9 +119,6 @@ async def a_execute_scoring(
                         scoring_results=scoring_results,
                         score_index=i,
                         ignore_errors=ignore_errors,
-                        skip_on_missing_params=skip_on_missing_params,
-                        show_indicator=show_indicator,
-                        _use_bar_indicator=_use_bar_indicator,
                         pbar=pbar,
                     )
                     tasks.append(asyncio.create_task(task))
@@ -343,9 +131,7 @@ async def a_execute_scoring(
                 if len(scorers) == 0:
                     continue
 
-                cloned_scorers: List[JudgevalScorer] = clone_scorers(
-                    scorers
-                )
+                cloned_scorers = clone_scorers(scorers)  # type: ignore
                 task = execute_with_semaphore(
                     func=a_eval_examples_helper,
                     scorers=cloned_scorers,
@@ -353,75 +139,60 @@ async def a_execute_scoring(
                     scoring_results=scoring_results,
                     score_index=i,
                     ignore_errors=ignore_errors,
-                    skip_on_missing_params=skip_on_missing_params,
-                    _use_bar_indicator=_use_bar_indicator,
-                    show_indicator=show_indicator,
+                    pbar=None,
                 )
-                tasks.append(asyncio.create_task((task)))
+                tasks.append(asyncio.create_task(task))
 
             await asyncio.sleep(throttle_value)
     await asyncio.gather(*tasks)
-    return scoring_results
+    return [result for result in scoring_results if result is not None]
 
 
 async def a_eval_examples_helper(
-    scorers: List[JudgevalScorer],
+    scorers: List[ExampleScorer],
     example: Example,
-    scoring_results: List[ScoringResult],
+    scoring_results: List[Optional[ScoringResult]],
     score_index: int,
     ignore_errors: bool,
-    skip_on_missing_params: bool,
-    show_indicator: bool,
-    _use_bar_indicator: bool,
    pbar: Optional[tqdm_asyncio] = None,
-) -> None:
+) -> None:
     """
     Evaluate a single example asynchronously using a list of scorers.
-
+
     Args:
-        scorers (List[JudgevalScorer]): List of JudgevalScorer objects to evaluate the example.
+        scorers (List[ExampleScorer]): List of ExampleScorer objects to evaluate the example.
         example (Example): The example to be evaluated.
         scoring_results (List[ScoringResult]): List to store the scoring results.
         score_index (int): Index at which the result should be stored in scoring_results.
         ignore_errors (bool): Flag to indicate whether to ignore errors during scoring.
-        skip_on_missing_params (bool): Flag to indicate whether to skip scoring if parameters are missing.
-        show_indicator (bool): Flag to indicate whether to show a progress indicator.
-        _use_bar_indicator (bool): Flag to indicate whether to use a bar indicator for progress.
         pbar (Optional[tqdm_asyncio]): Optional progress bar for tracking progress.
     Returns:
         None
     """
-    show_metrics_indicator = show_indicator and not _use_bar_indicator
-
-    for scorer in scorers:
-        scorer.skipped = False
-        scorer.error = None  # Reset scorer error
 
-    # scoring the Example
-    process_example = create_process_example(example)  # Creates process example to track progress
     scoring_start_time = time.perf_counter()
-    await score_with_indicator(
-        scorers=scorers,
-        example=example,
-        skip_on_missing_params=skip_on_missing_params,
-        ignore_errors=ignore_errors,
-        show_indicator=show_metrics_indicator,
-    )  # execute the scoring functions of each scorer on the example
 
-    # Now that all the scoring functions of each scorer have executed, we collect
-    # the results and update the process example with the scorer data
+    tasks = [safe_a_score_example(scorer, example) for scorer in scorers]
+
+    await asyncio.gather(*tasks)
+
+    success = True
+    scorer_data_list = []
     for scorer in scorers:
-        # At this point, the scorer has been executed and already contains data.
-        if getattr(scorer, 'skipped', False):
+        if getattr(scorer, "skipped", False):
            continue
-        scorer_data = create_scorer_data(scorer)  # Fetch scorer data from completed scorer evaluation
-        process_example.update_scorer_data(scorer_data)  # Update process example with the same scorer data
-
-    test_end_time = time.perf_counter()
-    run_duration = test_end_time - scoring_start_time
-
-    process_example.update_run_duration(run_duration)  # Update process example with execution time duration
-    scoring_results[score_index] = generate_scoring_result(process_example)  # Converts the outcomes of the executed test to a ScoringResult and saves it
+        scorer_data = create_scorer_data(scorer)
+        for s in scorer_data:
+            success = success and s.success
+        scorer_data_list.extend(scorer_data)
+
+    scoring_end_time = time.perf_counter()
+    run_duration = scoring_end_time - scoring_start_time
+
+    scoring_result = generate_scoring_result(
+        example, scorer_data_list, run_duration, success
+    )
+    scoring_results[score_index] = scoring_result
 
     if pbar is not None:
         pbar.update(1)
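
After this rewrite, custom scoring runs through `ExampleScorer.a_score_example` and `a_execute_scoring` instead of the removed rich-progress and `MissingTestCaseParamsError` machinery. The sketch below is based only on the signatures visible in this diff; the `Example` field names and any required `ExampleScorer` constructor fields (for example a scorer name) are assumptions, not confirmed by the package.

# Minimal usage sketch (not from the package): a custom ExampleScorer driven
# through a_execute_scoring, using only the signatures shown in this diff.
import asyncio

from judgeval.data import Example
from judgeval.scorers.example_scorer import ExampleScorer
from judgeval.scorers.score import a_execute_scoring


class ExactMatchScorer(ExampleScorer):
    # Hypothetical scorer: a_score_example must return a float; per this diff,
    # safe_a_score_example clamps it to [0, 1] and treats None as an error.
    async def a_score_example(self, example: Example) -> float:
        return 1.0 if example.actual_output == example.expected_output else 0.0


async def main():
    # Example field names here are illustrative assumptions.
    examples = [Example(input="2 + 2", actual_output="4", expected_output="4")]
    results = await a_execute_scoring(
        examples,
        scorers=[ExactMatchScorer()],
        ignore_errors=False,  # new default in 0.22.2 (was True in 0.0.11)
        max_concurrent=10,
        show_progress=True,
    )
    for result in results:  # each entry is a ScoringResult
        print(result)


if __name__ == "__main__":
    asyncio.run(main())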