judgeval 0.0.51__py3-none-any.whl → 0.0.53__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. judgeval/common/logger.py +46 -199
  2. judgeval/common/s3_storage.py +2 -6
  3. judgeval/common/tracer.py +182 -262
  4. judgeval/common/utils.py +16 -36
  5. judgeval/constants.py +14 -20
  6. judgeval/data/__init__.py +0 -2
  7. judgeval/data/datasets/dataset.py +6 -10
  8. judgeval/data/datasets/eval_dataset_client.py +25 -27
  9. judgeval/data/example.py +5 -138
  10. judgeval/data/judgment_types.py +214 -0
  11. judgeval/data/result.py +7 -25
  12. judgeval/data/scorer_data.py +28 -40
  13. judgeval/data/scripts/fix_default_factory.py +23 -0
  14. judgeval/data/scripts/openapi_transform.py +123 -0
  15. judgeval/data/tool.py +3 -54
  16. judgeval/data/trace.py +31 -50
  17. judgeval/data/trace_run.py +3 -3
  18. judgeval/evaluation_run.py +16 -23
  19. judgeval/integrations/langgraph.py +11 -12
  20. judgeval/judges/litellm_judge.py +3 -6
  21. judgeval/judges/mixture_of_judges.py +8 -25
  22. judgeval/judges/together_judge.py +3 -6
  23. judgeval/judgment_client.py +22 -24
  24. judgeval/rules.py +7 -19
  25. judgeval/run_evaluation.py +79 -242
  26. judgeval/scorers/__init__.py +4 -20
  27. judgeval/scorers/agent_scorer.py +21 -0
  28. judgeval/scorers/api_scorer.py +28 -38
  29. judgeval/scorers/base_scorer.py +98 -0
  30. judgeval/scorers/example_scorer.py +19 -0
  31. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -20
  32. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +10 -17
  33. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +9 -24
  34. judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +16 -68
  35. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +4 -12
  36. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +4 -4
  37. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +10 -17
  38. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +4 -4
  39. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +4 -4
  40. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +4 -4
  41. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +18 -14
  42. judgeval/scorers/score.py +45 -330
  43. judgeval/scorers/utils.py +6 -88
  44. judgeval/utils/file_utils.py +4 -6
  45. judgeval/version_check.py +3 -2
  46. {judgeval-0.0.51.dist-info → judgeval-0.0.53.dist-info}/METADATA +3 -2
  47. judgeval-0.0.53.dist-info/RECORD +65 -0
  48. judgeval/data/custom_example.py +0 -19
  49. judgeval/scorers/judgeval_scorer.py +0 -177
  50. judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +0 -45
  51. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -29
  52. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -29
  53. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -32
  54. judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +0 -28
  55. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -38
  56. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -27
  57. judgeval/scorers/prompt_scorer.py +0 -296
  58. judgeval-0.0.51.dist-info/RECORD +0 -69
  59. {judgeval-0.0.51.dist-info → judgeval-0.0.53.dist-info}/WHEEL +0 -0
  60. {judgeval-0.0.51.dist-info → judgeval-0.0.53.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/score.py CHANGED
@@ -1,303 +1,72 @@
  """
- Infrastructure for executing evaluations of `Example`s using one or more `JudgevalScorer`s.
+ Infrastructure for executing evaluations of `Example`s using one or more `BaseScorer`s.
  """

  import asyncio
  import time
  from tqdm.asyncio import tqdm_asyncio
  from typing import List, Union, Optional, Callable
- from rich.progress import Progress, SpinnerColumn, TextColumn

  from judgeval.data import (
  Example,
- CustomExample,
  ScoringResult,
  generate_scoring_result,
  create_scorer_data,
  )
- from judgeval.scorers import JudgevalScorer
- from judgeval.scorers.utils import clone_scorers, scorer_console_msg
- from judgeval.common.exceptions import MissingTestCaseParamsError
- from judgeval.common.logger import example_logging_context, debug, error, warning, info
+ from judgeval.scorers import BaseScorer
+ from judgeval.scorers.utils import clone_scorers
+ from judgeval.common.logger import judgeval_logger
  from judgeval.judges import JudgevalJudge


  async def safe_a_score_example(
- scorer: JudgevalScorer,
+ scorer: BaseScorer,
  example: Example,
- ignore_errors: bool,
- skip_on_missing_params: bool,
  ):
  """
  Scoring task function when not using a progress indicator!
- "Safely" scores an `Example` using a `JudgevalScorer` by gracefully handling any exceptions that may occur.
+ "Safely" scores an `Example` using a `BaseScorer` by gracefully handling any exceptions that may occur.

  Args:
- scorer (JudgevalScorer): The `JudgevalScorer` to use for scoring the example.
+ scorer (BaseScorer): The `BaseScorer` to use for scoring the example.
  example (Example): The `Example` to be scored.

  ignore_errors (bool): Whether to ignore errors during the evaluation.
  If set to false, any error will be raised and stop the evaluation.
- If set to true, the error will be stored in the `error` attribute of the `JudgevalScorer` and the `success` attribute will be set to False.
+ If set to true, the error will be stored in the `error` attribute of the `BaseScorer` and the `success` attribute will be set to False.

  skip_on_missing_params (bool): Whether to skip the test case if required parameters are missing.
  """
- debug(f"Starting safe_a_score_example for example {example.example_id}")
  try:
- await scorer.a_score_example(example, _show_indicator=False)
- info(f"Successfully scored example {example.example_id}")
- except MissingTestCaseParamsError as e:
- if (
- skip_on_missing_params
- ): # Skip the example if the scorer requires parameters that are missing
- with example_logging_context(example.created_at, example.example_id):
- warning(
- f"Skipping example {example.example_id} due to missing parameters"
- )
- scorer.skipped = True
- return
- else:
- if (
- ignore_errors
- ): # Gracefully handle the error, does not stop the evaluation
- scorer.error = str(e)
- scorer.success = False
- with example_logging_context(example.created_at, example.example_id):
- warning(
- f"Ignoring errors for example {example.example_id}: {str(e)} due to missing parameters"
- )
- else: # Raise the error and stop the evaluation
- with example_logging_context(example.created_at, example.example_id):
- error(
- f"Stopping example {example.example_id}: {str(e)} due to missing parameters"
- )
- raise
- except TypeError: # in case a_score_example does not accept _show_indicator
- try:
- await scorer.a_score_example(example)
- except MissingTestCaseParamsError as e:
- if skip_on_missing_params:
- scorer.skipped = True
- with example_logging_context(example.created_at, example.example_id):
- warning(
- f"Skipping example {example.example_id} due to missing parameters"
- )
- return
- else:
- if ignore_errors:
- scorer.error = str(e)
- scorer.success = False
- with example_logging_context(
- example.created_at, example.example_id
- ):
- warning(
- f"Ignoring errors for example {example.example_id}: {str(e)} due to missing parameters"
- )
- else:
- with example_logging_context(
- example.created_at, example.example_id
- ):
- error(
- f"Stopping example {example.example_id}: {str(e)} due to missing parameters"
- )
- raise
+ scorer.score = await scorer.a_score_example(example)
+ scorer.success = scorer.success_check()
  except Exception as e:
- if ignore_errors:
- scorer.error = str(e)
- scorer.success = False # Assuming you want to set success to False
- with example_logging_context(example.created_at, example.example_id):
- warning(f"Ignoring errors for example {example.example_id}: {str(e)}")
- else:
- with example_logging_context(example.created_at, example.example_id):
- error(f"Stopping example {example.example_id}: {str(e)}")
- raise
-
-
- async def score_task(
- task_id: int,
- progress: Progress,
- scorer: JudgevalScorer,
- example: Example,
- ignore_errors: bool = True,
- skip_on_missing_params: bool = True,
- ):
- """
- Task function for asynchronously measuring a given example using a JudgevalScorer.
-
- Args:
- task_id (int): The ID of the task being measured.
- progress (Progress): An instance of the Progress class to track task progress.
- scorer (JudgevalScorer): An instance of the JudgevalScorer class used to score the example.
- example (Example): The example to be scored.
- ignore_errors (bool, optional): Whether to ignore errors during scoring. Defaults to True.
- skip_on_missing_params (bool, optional): Whether to skip scoring if there are missing parameters. Defaults to True.
-
- Raises:
- MissingTestCaseParamsError: If required test case parameters are missing and skip_on_missing_params is False.
- Exception: If an unexpected error occurs and ignore_errors is False.
-
- Returns:
- None
- """
- while not progress.finished:
- start_time = time.perf_counter()
-
- try:
- await scorer.a_score_example(example, _show_indicator=False)
- finish_text = "Completed"
- except MissingTestCaseParamsError as e:
- if skip_on_missing_params:
- scorer.skipped = True
- with example_logging_context(example.created_at, example.example_id):
- debug(
- f"Skipping example {example.example_id} due to missing parameters"
- )
- return
- else:
- if ignore_errors:
- scorer.error = str(e)
- scorer.success = False # Override success
- finish_text = "Failed"
- else:
- with example_logging_context(
- example.created_at, example.example_id
- ):
- error(
- f"Stopping example {example.example_id}: {str(e)} due to missing parameters"
- )
- raise
- except TypeError:
- try:
- await scorer.a_score_example(example)
- finish_text = "Completed"
- except MissingTestCaseParamsError as e:
- if skip_on_missing_params:
- scorer.skipped = True
- with example_logging_context(
- example.created_at, example.example_id
- ):
- debug(
- f"Skipping example {example.example_id} due to missing parameters"
- )
- return
- else:
- if ignore_errors:
- scorer.error = str(e)
- scorer.success = False # Override success
- finish_text = "Failed"
- else:
- with example_logging_context(
- example.created_at, example.example_id
- ):
- error(
- f"Stopping example {example.example_id}: {str(e)} due to missing parameters"
- )
- raise
- except Exception as e:
- if ignore_errors:
- scorer.error = str(e)
- scorer.success = False # Override success
- finish_text = "Failed"
- with example_logging_context(example.created_at, example.example_id):
- warning(
- f"Ignoring errors for example {example.example_id}: {str(e)}"
- )
- else:
- with example_logging_context(example.created_at, example.example_id):
- error(f"Stopping example {example.example_id}: {str(e)}")
- raise
-
- end_time = time.perf_counter()
- time_taken = format(end_time - start_time, ".2f")
- progress.update(task_id, advance=100) # Mark task as complete
- progress.update(
- task_id,
- description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]{finish_text}! ({time_taken}s)",
- )
- break
-
-
- async def score_with_indicator(
- scorers: List[JudgevalScorer],
- example: Example,
- ignore_errors: bool,
- skip_on_missing_params: bool,
- show_indicator: bool,
- ):
- """
- Scores an example using a list of JudgevalScorers, optionally displaying a progress indicator.
-
- Args:
- scorers (List[JudgevalScorer]): A list of JudgevalScorer objects to evaluate the example.
- example (Example): The example to be scored.
- ignore_errors (bool): If True, errors during scoring will be ignored.
- skip_on_missing_params (bool): If True, scoring will be skipped if required parameters are missing.
- show_indicator (bool): If True, a progress indicator will be displayed during scoring.
-
- Returns:
- None
-
- Raises:
- Any exceptions raised by the scoring functions, unless `ignore_errors` is True.
- """
- if show_indicator:
- with Progress(
- SpinnerColumn(style="rgb(106,0,255)"),
- TextColumn("[progress.description]{task.description}"),
- transient=True,
- ) as progress:
- tasks = []
- for scorer in scorers:
- task_id = progress.add_task(
- description=scorer_console_msg(scorer, async_mode=True),
- total=100,
- ) # Add task to progress bar
- tasks.append(
- score_task(
- task_id,
- progress,
- scorer,
- example,
- ignore_errors,
- skip_on_missing_params,
- ) # Create and execute task to score the example with a single scorer
- )
- await asyncio.gather(*tasks)
- else:
- tasks = [
- safe_a_score_example(scorer, example, ignore_errors, skip_on_missing_params)
- for scorer in scorers
- ]
-
- await asyncio.gather(*tasks)
+ judgeval_logger.error(f"Error during scoring: {str(e)}")
+ scorer.error = str(e)
+ scorer.success = False
+ return


  async def a_execute_scoring(
- examples: Union[List[Example], List[CustomExample]],
- scorers: List[JudgevalScorer],
+ examples: List[Example],
+ scorers: List[BaseScorer],
  model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
- ignore_errors: bool = True,
- skip_on_missing_params: bool = True,
- show_indicator: bool = True,
+ ignore_errors: bool = False,
  throttle_value: int = 0,
  max_concurrent: int = 100,
- verbose_mode: Optional[bool] = None,
- _use_bar_indicator: bool = True,
  ) -> List[ScoringResult]:
  """
- Executes evaluations of `Example`s asynchronously using one or more `JudgevalScorer`s.
- Each `Example` will be evaluated by all of the `JudgevalScorer`s in the `scorers` list.
+ Executes evaluations of `Example`s asynchronously using one or more `BaseScorer`s.
+ Each `Example` will be evaluated by all of the `BaseScorer`s in the `scorers` list.

  Args:
- examples (Union[List[Example], List[CustomExample]]): A list of `Example` objects to be evaluated.
- scorers (List[JudgevalScorer]): A list of `JudgevalScorer` objects to evaluate the examples.
+ examples (List[Example]): A list of `Example` objects to be evaluated.
+ scorers (List[BaseScorer]): A list of `BaseScorer` objects to evaluate the examples.
  model (Union[str, List[str], JudgevalJudge]): The model to use for evaluation.
  ignore_errors (bool): Whether to ignore errors during evaluation.
- skip_on_missing_params (bool): Whether to skip evaluation if parameters are missing.
- show_indicator (bool): Whether to show a progress indicator.
  throttle_value (int): The amount of time to wait between starting each task.
  max_concurrent (int): The maximum number of concurrent tasks.
- verbose_mode (Optional[bool]): If set, enables verbose mode for scorers.
+
  _use_bar_indicator (bool): Whether to use a progress bar indicator.

  Returns:
@@ -311,69 +80,31 @@ async def a_execute_scoring(
  try:
  return await func(*args, **kwargs)
  except Exception as e:
- print(f"Error executing function: {e}")
+ judgeval_logger.error(f"Error executing function: {e}")
  if kwargs.get("ignore_errors", False):
  # Simply return None when ignoring errors, as expected by the test
  return None
  # If we're not ignoring errors, propagate the exception
  raise

- if verbose_mode is not None:
- for scorer in scorers:
- scorer.verbose_mode = verbose_mode
-
  # Add model to scorers
  for scorer in scorers:
  scorer._add_model(model)

  scoring_results: List[ScoringResult] = [None for _ in examples]
  tasks = []
- cloned_scorers: List[JudgevalScorer]
-
- if show_indicator and _use_bar_indicator:
- with tqdm_asyncio(
- desc=f"Evaluating {len(examples)} example(s) in parallel",
- unit="Example",
- total=len(examples),
- bar_format="{desc}: |{bar}|{percentage:3.0f}% ({n_fmt}/{total_fmt}) [Time Taken: {elapsed}, {rate_fmt}{postfix}]",
- ) as pbar:
- for i, ex in enumerate(examples):
- with example_logging_context(ex.created_at, ex.example_id):
- debug(f"Starting scoring for example {ex.example_id}")
- debug(f"Input: {ex.input}")
- debug(f"Using {len(scorers)} scorers")
- for scorer in scorers:
- debug(f"Using scorer: {type(scorer).__name__}")
- if hasattr(scorer, "threshold"):
- debug(f"Scorer threshold: {scorer.threshold}")
- if hasattr(scorer, "model"):
- debug(f"Scorer model: {type(scorer.model).__name__}")
- if isinstance(ex, Example) or isinstance(ex, CustomExample):
- if len(scorers) == 0:
- pbar.update(1)
- continue
-
- cloned_scorers = clone_scorers(scorers)
- task = execute_with_semaphore(
- func=a_eval_examples_helper,
- scorers=cloned_scorers,
- example=ex,
- scoring_results=scoring_results,
- score_index=i,
- ignore_errors=ignore_errors,
- skip_on_missing_params=skip_on_missing_params,
- show_indicator=show_indicator,
- _use_bar_indicator=_use_bar_indicator,
- pbar=pbar,
- )
- tasks.append(asyncio.create_task(task))
-
- await asyncio.sleep(throttle_value)
- await asyncio.gather(*tasks)
- else:
+ cloned_scorers: List[BaseScorer]
+
+ with tqdm_asyncio(
+ desc=f"Evaluating {len(examples)} example(s) in parallel",
+ unit="Example",
+ total=len(examples),
+ bar_format="{desc}: |{bar}|{percentage:3.0f}% ({n_fmt}/{total_fmt}) [Time Taken: {elapsed}, {rate_fmt}{postfix}]",
+ ) as pbar:
  for i, ex in enumerate(examples):
- if isinstance(ex, Example) or isinstance(ex, CustomExample):
+ if isinstance(ex, Example):
  if len(scorers) == 0:
+ pbar.update(1)
  continue

  cloned_scorers = clone_scorers(scorers)
@@ -384,11 +115,9 @@ async def a_execute_scoring(
  scoring_results=scoring_results,
  score_index=i,
  ignore_errors=ignore_errors,
- skip_on_missing_params=skip_on_missing_params,
- _use_bar_indicator=_use_bar_indicator,
- show_indicator=show_indicator,
+ pbar=pbar,
  )
- tasks.append(asyncio.create_task((task)))
+ tasks.append(asyncio.create_task(task))

  await asyncio.sleep(throttle_value)
  await asyncio.gather(*tasks)
@@ -396,48 +125,33 @@ async def a_execute_scoring(


  async def a_eval_examples_helper(
- scorers: List[JudgevalScorer],
- example: Union[Example, CustomExample],
+ scorers: List[BaseScorer],
+ example: Example,
  scoring_results: List[ScoringResult],
  score_index: int,
  ignore_errors: bool,
- skip_on_missing_params: bool,
- show_indicator: bool,
- _use_bar_indicator: bool,
  pbar: Optional[tqdm_asyncio] = None,
  ) -> None:
  """
  Evaluate a single example asynchronously using a list of scorers.

  Args:
- scorers (List[JudgevalScorer]): List of JudgevalScorer objects to evaluate the example.
+ scorers (List[BaseScorer]): List of BaseScorer objects to evaluate the example.
  example (Example): The example to be evaluated.
  scoring_results (List[ScoringResult]): List to store the scoring results.
  score_index (int): Index at which the result should be stored in scoring_results.
  ignore_errors (bool): Flag to indicate whether to ignore errors during scoring.
- skip_on_missing_params (bool): Flag to indicate whether to skip scoring if parameters are missing.
- show_indicator (bool): Flag to indicate whether to show a progress indicator.
- _use_bar_indicator (bool): Flag to indicate whether to use a bar indicator for progress.
  pbar (Optional[tqdm_asyncio]): Optional progress bar for tracking progress.
  Returns:
  None
  """

- show_metrics_indicator = show_indicator and not _use_bar_indicator
-
- for scorer in scorers:
- scorer.skipped = False
- scorer.error = None # Reset scorer error
-
  # scoring the Example
  scoring_start_time = time.perf_counter()
- await score_with_indicator(
- scorers=scorers,
- example=example,
- skip_on_missing_params=skip_on_missing_params,
- ignore_errors=ignore_errors,
- show_indicator=show_metrics_indicator,
- ) # execute the scoring functions of each scorer on the example
+
+ tasks = [safe_a_score_example(scorer, example) for scorer in scorers]
+
+ await asyncio.gather(*tasks)

  # Now that all the scoring functions of each scorer have executed, we collect
  # the results and update the ScoringResult with the scorer data
@@ -450,8 +164,9 @@ async def a_eval_examples_helper(
  scorer_data = create_scorer_data(
  scorer
  ) # Fetch scorer data from completed scorer evaluation
- success = success and scorer_data.success
- scorer_data_list.append(scorer_data)
+ for s in scorer_data:
+ success = success and s.success
+ scorer_data_list.extend(scorer_data)

  scoring_end_time = time.perf_counter()
  run_duration = scoring_end_time - scoring_start_time
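
Taken together, the changes to score.py collapse the old spinner/indicator and MissingTestCaseParamsError machinery into a single error-recording path: each BaseScorer is awaited once per example, its score and success are recorded, and any exception is stored on the scorer instead of aborting the run. Below is a minimal, self-contained sketch of that new flow, reconstructed from the added lines above; the stub Example and BaseScorer classes (and the threshold/success_check behavior) are illustrative stand-ins, not judgeval's real definitions.

import asyncio
from typing import List, Optional


class Example:
    # Stand-in for judgeval.data.Example, reduced to what this sketch needs.
    def __init__(self, input: str):
        self.input = input


class BaseScorer:
    # Stand-in for judgeval.scorers.BaseScorer with a hypothetical threshold rule.
    threshold: float = 0.5

    def __init__(self) -> None:
        self.score: Optional[float] = None
        self.error: Optional[str] = None
        self.success: bool = False

    async def a_score_example(self, example: Example) -> float:
        # A real scorer would call a judge model here.
        return 1.0

    def success_check(self) -> bool:
        return self.score is not None and self.score >= self.threshold


async def safe_a_score_example(scorer: BaseScorer, example: Example) -> None:
    # Mirrors the new error handling: failures are recorded on the scorer,
    # not retried or skipped via MissingTestCaseParamsError.
    try:
        scorer.score = await scorer.a_score_example(example)
        scorer.success = scorer.success_check()
    except Exception as e:
        scorer.error = str(e)
        scorer.success = False


async def score_one_example(scorers: List[BaseScorer], example: Example) -> None:
    # One task per scorer, gathered concurrently, which is what
    # a_eval_examples_helper now does for each example.
    await asyncio.gather(*(safe_a_score_example(s, example) for s in scorers))


if __name__ == "__main__":
    scorers = [BaseScorer(), BaseScorer()]
    asyncio.run(score_one_example(scorers, Example("What is 2 + 2?")))
    print([(s.score, s.success, s.error) for s in scorers])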
judgeval/scorers/utils.py CHANGED
@@ -6,19 +6,15 @@ import asyncio
  import nest_asyncio
  import inspect
  import json
- import sys
  import re
- from contextlib import contextmanager
- from rich.progress import Progress, SpinnerColumn, TextColumn
- from rich.console import Console
  from typing import List, Optional

- from judgeval.scorers import JudgevalScorer
+ from judgeval.scorers import BaseScorer
  from judgeval.data import Example, ExampleParams
  from judgeval.scorers.exceptions import MissingExampleParamsError


- def clone_scorers(scorers: List[JudgevalScorer]) -> List[JudgevalScorer]:
+ def clone_scorers(scorers: List[BaseScorer]) -> List[BaseScorer]:
  """
  Creates duplicates of the scorers passed as argument.
  """
@@ -32,60 +28,14 @@ def clone_scorers(scorers: List[JudgevalScorer]) -> List[JudgevalScorer]:
  valid_args = {key: args[key] for key in valid_params if key in args}

  cloned_scorer = scorer_class(**valid_args)
- # kinda hacky, but in case the class inheriting from JudgevalScorer doesn't have `model` in its __init__,
+ # kinda hacky, but in case the class inheriting from BaseScorer doesn't have `model` in its __init__,
  # we need to explicitly include it here so that we can add the judge model to the cloned scorer
  cloned_scorer._add_model(model=args.get("model"))
  cloned_scorers.append(cloned_scorer)
  return cloned_scorers


- def scorer_console_msg(
- scorer: JudgevalScorer,
- async_mode: Optional[bool] = None,
- ):
- """
- Renders a message to be displayed to console when a scorer is being executed.
- """
- if async_mode is None:
- run_async = scorer.async_mode
- else:
- run_async = async_mode
-
- return f"🔨 Executing Judgment's [rgb(106,0,255)]{scorer.__name__} Scorer[/rgb(106,0,255)]! \
- [rgb(55,65,81)](using {scorer.evaluation_model}, async_mode={run_async})...[/rgb(55,65,81)]"
-
-
- @contextmanager
- def scorer_progress_meter(
- scorer: JudgevalScorer,
- async_mode: Optional[bool] = None,
- display_meter: bool = True,
- total: int = 100,
- transient: bool = True,
- ):
- """
- Context manager to display a progress indicator (spinner) while a scorer is being run.
- """
- console = Console(file=sys.stderr)
- if display_meter:
- with Progress(
- SpinnerColumn(style="rgb(106,0,255)"),
- TextColumn("[progress.description]{task.description}"),
- console=console,
- transient=transient,
- ) as progress:
- progress.add_task(
- description=scorer_console_msg(scorer, async_mode),
- total=total,
- )
- yield
- else:
- yield
-
-
- def parse_response_json(
- llm_response: str, scorer: Optional[JudgevalScorer] = None
- ) -> dict:
+ def parse_response_json(llm_response: str, scorer: Optional[BaseScorer] = None) -> dict:
  """
  Extracts JSON output from an LLM response and returns it as a dictionary.

@@ -93,7 +43,7 @@ def parse_response_json(

  Args:
  llm_response (str): The response from an LLM.
- scorer (JudgevalScorer, optional): The scorer object to forward errors to (if any).
+ scorer (BaseScorer, optional): The scorer object to forward errors to (if any).
  """
  start = llm_response.find("{") # opening bracket
  end = llm_response.rfind("}") + 1 # closing bracket
@@ -120,38 +70,6 @@ def parse_response_json(
  raise Exception(f"An unexpected error occurred: {str(e)}")


- def print_verbose_logs(metric: str, logs: str):
- print("*" * 50)
- print(f"{metric} Verbose Logs")
- print("*" * 50)
- print("")
- print(logs)
- print("")
- print("=" * 70)
-
-
- def create_verbose_logs(metric: JudgevalScorer, steps: List[str]) -> str:
- """
- Creates verbose logs for a scorer object.
-
- Args:
- metric (JudgevalScorer): The scorer object.
- steps (List[str]): The steps to be included in the verbose logs.
-
- Returns:
- str: The verbose logs (Concatenated steps).
- """
-
- verbose_logs = ""
- for i in range(len(steps) - 1):
- verbose_logs += steps[i]
- if i < len(steps) - 2: # don't add new line for penultimate step
- verbose_logs += " \n \n"
- if metric.verbose_mode:
- print_verbose_logs(metric.__name__, verbose_logs + f"\n \n{steps[-1]}")
- return verbose_logs
-
-
  def get_or_create_event_loop() -> asyncio.AbstractEventLoop:
  """
  Get or create an asyncio event loop.
@@ -186,7 +104,7 @@ def get_or_create_event_loop() -> asyncio.AbstractEventLoop:
  def check_example_params(
  example: Example,
  example_params: List[ExampleParams],
- scorer: JudgevalScorer,
+ scorer: BaseScorer,
  ):
  if isinstance(example, Example) is False:
  error_str = f"in check_example_params(): Expected example to be of type 'Example', but got {type(example)}"
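
The surviving helper parse_response_json keeps its brace-slicing approach to pulling JSON out of an LLM reply (the find("{") / rfind("}") + 1 lines above are unchanged context). Here is a standalone sketch of the same idea, with the error forwarding to a scorer object omitted.

import json


def extract_json_block(llm_response: str) -> dict:
    # Slice from the first "{" to the last "}" so that prose surrounding the
    # JSON payload is ignored, then parse just that substring.
    start = llm_response.find("{")      # opening bracket
    end = llm_response.rfind("}") + 1   # closing bracket
    if start == -1 or end == 0:
        raise ValueError("No JSON object found in the LLM response.")
    return json.loads(llm_response[start:end])


if __name__ == "__main__":
    raw = 'Here is the verdict: {"score": 0.9, "reason": "grounded"}. Hope that helps!'
    print(extract_json_block(raw))  # {'score': 0.9, 'reason': 'grounded'}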
judgeval/utils/file_utils.py CHANGED
@@ -1,16 +1,15 @@
  import yaml
  from typing import List
- from judgeval.common.logger import debug, info, error
+ from judgeval.common.logger import judgeval_logger

  from judgeval.data import Example


  def get_examples_from_yaml(file_path: str) -> List[Example] | None:
- debug(f"Loading dataset from YAML file: {file_path}")
  """
  Adds examples from a YAML file.

- The format of the YAML file is expected to be a dictionary with one key: "examples".
+ The format of the YAML file is expected to be a dictionary with one key: "examples".
  The value of the key is a list of dictionaries, where each dictionary represents an example.

  The YAML file is expected to have the following format:
@@ -42,12 +41,11 @@ def get_examples_from_yaml(file_path: str) -> List[Example] | None:
  raise ValueError("The YAML file is empty.")
  examples = payload.get("examples", [])
  except FileNotFoundError:
- error(f"YAML file not found: {file_path}")
+ judgeval_logger.error(f"YAML file not found: {file_path}")
  raise FileNotFoundError(f"The file {file_path} was not found.")
  except yaml.YAMLError:
- error(f"Invalid YAML file: {file_path}")
+ judgeval_logger.error(f"Invalid YAML file: {file_path}")
  raise ValueError(f"The file {file_path} is not a valid YAML file.")

- info(f"Added {len(examples)} examples from YAML")
  new_examples = [Example(**e) for e in examples]
  return new_examples
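
Assuming this hunk belongs to judgeval/utils/file_utils.py (file 44 in the list above, +4 -6), the only behavioral change is the switch to judgeval_logger: the loader still expects a YAML document whose top-level "examples" key holds a list of example dictionaries, and it still raises FileNotFoundError or ValueError on bad input. A hedged usage sketch follows; the field names inside each example ("input", "actual_output") are illustrative guesses, not taken from this diff.

from pathlib import Path

from judgeval.utils.file_utils import get_examples_from_yaml

# Top-level "examples" key holding a list of dictionaries, as the docstring
# above describes; the individual field names here are hypothetical.
yaml_doc = """\
examples:
  - input: "What is the capital of France?"
    actual_output: "Paris"
"""

path = Path("examples.yaml")
path.write_text(yaml_doc)

try:
    examples = get_examples_from_yaml(str(path))
    print(f"Loaded {len(examples)} example(s)")
except FileNotFoundError:
    print("File does not exist (also logged via judgeval_logger).")
except ValueError:
    print("File was empty or not valid YAML.")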
judgeval/version_check.py CHANGED
@@ -1,6 +1,7 @@
  import importlib.metadata
  from judgeval.utils.requests import requests
  import threading
+ from judgeval.common.logger import judgeval_logger


  def check_latest_version(package_name: str = "judgeval"):
@@ -13,8 +14,8 @@ def check_latest_version(package_name: str = "judgeval"):
  latest_version = response.json()["info"]["version"]

  if current_version != latest_version:
- print(
- f"\033[93mUPDATE AVAILABLE:\033[0m You are using '{package_name}=={current_version}', "
+ judgeval_logger.warning(
+ f"UPDATE AVAILABLE: You are using '{package_name}=={current_version}', "
  f"but the latest version is '{latest_version}'. While this version is still supported, "
  f"we recommend upgrading to avoid potential issues or missing features: "
  f"`pip install --upgrade {package_name}`"