judgeval 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. judgeval/__init__.py +83 -0
  2. judgeval/clients.py +19 -0
  3. judgeval/common/__init__.py +8 -0
  4. judgeval/common/exceptions.py +28 -0
  5. judgeval/common/logger.py +189 -0
  6. judgeval/common/tracer.py +587 -0
  7. judgeval/common/utils.py +763 -0
  8. judgeval/constants.py +55 -0
  9. judgeval/data/__init__.py +14 -0
  10. judgeval/data/api_example.py +111 -0
  11. judgeval/data/datasets/__init__.py +4 -0
  12. judgeval/data/datasets/dataset.py +407 -0
  13. judgeval/data/datasets/ground_truth.py +54 -0
  14. judgeval/data/datasets/utils.py +74 -0
  15. judgeval/data/example.py +76 -0
  16. judgeval/data/result.py +83 -0
  17. judgeval/data/scorer_data.py +86 -0
  18. judgeval/evaluation_run.py +130 -0
  19. judgeval/judges/__init__.py +7 -0
  20. judgeval/judges/base_judge.py +44 -0
  21. judgeval/judges/litellm_judge.py +49 -0
  22. judgeval/judges/mixture_of_judges.py +248 -0
  23. judgeval/judges/together_judge.py +55 -0
  24. judgeval/judges/utils.py +45 -0
  25. judgeval/judgment_client.py +244 -0
  26. judgeval/run_evaluation.py +355 -0
  27. judgeval/scorers/__init__.py +30 -0
  28. judgeval/scorers/base_scorer.py +51 -0
  29. judgeval/scorers/custom_scorer.py +134 -0
  30. judgeval/scorers/judgeval_scorers/__init__.py +21 -0
  31. judgeval/scorers/judgeval_scorers/answer_relevancy.py +19 -0
  32. judgeval/scorers/judgeval_scorers/contextual_precision.py +19 -0
  33. judgeval/scorers/judgeval_scorers/contextual_recall.py +19 -0
  34. judgeval/scorers/judgeval_scorers/contextual_relevancy.py +22 -0
  35. judgeval/scorers/judgeval_scorers/faithfulness.py +19 -0
  36. judgeval/scorers/judgeval_scorers/hallucination.py +19 -0
  37. judgeval/scorers/judgeval_scorers/json_correctness.py +32 -0
  38. judgeval/scorers/judgeval_scorers/summarization.py +20 -0
  39. judgeval/scorers/judgeval_scorers/tool_correctness.py +19 -0
  40. judgeval/scorers/prompt_scorer.py +439 -0
  41. judgeval/scorers/score.py +427 -0
  42. judgeval/scorers/utils.py +175 -0
  43. judgeval-0.0.1.dist-info/METADATA +40 -0
  44. judgeval-0.0.1.dist-info/RECORD +46 -0
  45. judgeval-0.0.1.dist-info/WHEEL +4 -0
  46. judgeval-0.0.1.dist-info/licenses/LICENSE.md +202 -0
@@ -0,0 +1,427 @@
+ """
+ Infrastructure for executing evaluations of `Example`s using one or more `CustomScorer`s.
+ """
+
+
+ import asyncio
+ import time
+ from tqdm.asyncio import tqdm_asyncio
+ from typing import List, Union, Optional, Callable
+ from rich.progress import Progress, SpinnerColumn, TextColumn
+
+ from judgeval.data import (
+     Example,
+     ScoringResult,
+     generate_scoring_result,
+     create_process_example,
+     create_scorer_data,
+ )
+ from judgeval.scorers import CustomScorer
+ from judgeval.scorers.utils import clone_scorers, scorer_console_msg
+ from judgeval.common.exceptions import MissingTestCaseParamsError
+ from judgeval.common.logger import example_logging_context, debug, error, warning, info
+ from judgeval.judges import judgevalJudge
+
+ async def safe_a_score_example(
+     scorer: CustomScorer,
+     example: Example,
+     ignore_errors: bool,
+     skip_on_missing_params: bool,
+ ):
+     """
+     Scoring task function used when not displaying a progress indicator.
+     "Safely" scores an `Example` using a `CustomScorer` by gracefully handling any exceptions that may occur.
+
+     Args:
+         scorer (CustomScorer): The `CustomScorer` to use for scoring the example.
+         example (Example): The `Example` to be scored.
+         ignore_errors (bool): Whether to ignore errors during the evaluation.
+             If False, any error is raised and stops the evaluation.
+             If True, the error is stored in the scorer's `error` attribute and its `success` attribute is set to False.
+         skip_on_missing_params (bool): Whether to skip the example if required parameters are missing.
+     """
+     debug(f"Starting safe_a_score_example for example {example.example_id}")
+     try:
+         await scorer.a_score_example(example, _show_indicator=False)
+         info(f"Successfully scored example {example.example_id}")
+     except MissingTestCaseParamsError as e:
+         if skip_on_missing_params:  # Skip the example if the scorer requires parameters that are missing
+             with example_logging_context(example.timestamp, example.example_id):
+                 warning(f"Skipping example {example.example_id} due to missing parameters")
+             scorer.skipped = True
+             return
+         else:
+             if ignore_errors:  # Gracefully handle the error; does not stop the evaluation
+                 scorer.error = str(e)
+                 scorer.success = False
+                 with example_logging_context(example.timestamp, example.example_id):
+                     warning(f"Ignoring error for example {example.example_id} due to missing parameters: {str(e)}")
+             else:  # Raise the error and stop the evaluation
+                 with example_logging_context(example.timestamp, example.example_id):
+                     error(f"Stopping example {example.example_id} due to missing parameters: {str(e)}")
+                 raise
+     except TypeError:  # in case a_score_example does not accept _show_indicator
+         try:
+             await scorer.a_score_example(example)
+         except MissingTestCaseParamsError as e:
+             if skip_on_missing_params:
+                 scorer.skipped = True
+                 with example_logging_context(example.timestamp, example.example_id):
+                     warning(f"Skipping example {example.example_id} due to missing parameters")
+                 return
+             else:
+                 if ignore_errors:
+                     scorer.error = str(e)
+                     scorer.success = False
+                     with example_logging_context(example.timestamp, example.example_id):
+                         warning(f"Ignoring error for example {example.example_id} due to missing parameters: {str(e)}")
+                 else:
+                     with example_logging_context(example.timestamp, example.example_id):
+                         error(f"Stopping example {example.example_id} due to missing parameters: {str(e)}")
+                     raise
+     except Exception as e:
+         if ignore_errors:
+             scorer.error = str(e)
+             scorer.success = False  # Record the failure on the scorer
+             with example_logging_context(example.timestamp, example.example_id):
+                 warning(f"Ignoring error for example {example.example_id}: {str(e)}")
+         else:
+             with example_logging_context(example.timestamp, example.example_id):
+                 error(f"Stopping example {example.example_id}: {str(e)}")
+             raise
+
+
+ async def score_task(
+     task_id: int,
+     progress: Progress,
+     scorer: CustomScorer,
+     example: Example,
+     ignore_errors: bool = True,
+     skip_on_missing_params: bool = True,
+ ):
+     """
+     Task function for asynchronously scoring a given example using a custom scorer.
+
+     Args:
+         task_id (int): The ID of the progress task being tracked.
+         progress (Progress): An instance of the Progress class used to track task progress.
+         scorer (CustomScorer): An instance of the CustomScorer class used to score the example.
+         example (Example): The example to be scored.
+         ignore_errors (bool, optional): Whether to ignore errors during scoring. Defaults to True.
+         skip_on_missing_params (bool, optional): Whether to skip scoring if required parameters are missing. Defaults to True.
+
+     Raises:
+         MissingTestCaseParamsError: If required test case parameters are missing and skip_on_missing_params is False.
+         Exception: If an unexpected error occurs and ignore_errors is False.
+
+     Returns:
+         None
+     """
+     while not progress.finished:
+         start_time = time.perf_counter()
+
+         try:
+             await scorer.a_score_example(example, _show_indicator=False)
+             finish_text = "Completed"
+         except MissingTestCaseParamsError as e:
+             if skip_on_missing_params:
+                 scorer.skipped = True
+                 with example_logging_context(example.timestamp, example.example_id):
+                     debug(f"Skipping example {example.example_id} due to missing parameters")
+                 return
+             else:
+                 if ignore_errors:
+                     scorer.error = str(e)
+                     scorer.success = False  # Override success
+                     finish_text = "Failed"
+                 else:
+                     with example_logging_context(example.timestamp, example.example_id):
+                         error(f"Stopping example {example.example_id} due to missing parameters: {str(e)}")
+                     raise
+         except TypeError:  # in case a_score_example does not accept _show_indicator
+             try:
+                 await scorer.a_score_example(example)
+                 finish_text = "Completed"
+             except MissingTestCaseParamsError as e:
+                 if skip_on_missing_params:
+                     scorer.skipped = True
+                     with example_logging_context(example.timestamp, example.example_id):
+                         debug(f"Skipping example {example.example_id} due to missing parameters")
+                     return
+                 else:
+                     if ignore_errors:
+                         scorer.error = str(e)
+                         scorer.success = False  # Override success
+                         finish_text = "Failed"
+                     else:
+                         with example_logging_context(example.timestamp, example.example_id):
+                             error(f"Stopping example {example.example_id} due to missing parameters: {str(e)}")
+                         raise
+         except Exception as e:
+             if ignore_errors:
+                 scorer.error = str(e)
+                 scorer.success = False  # Override success
+                 finish_text = "Failed"
+                 with example_logging_context(example.timestamp, example.example_id):
+                     warning(f"Ignoring error for example {example.example_id}: {str(e)}")
+             else:
+                 with example_logging_context(example.timestamp, example.example_id):
+                     error(f"Stopping example {example.example_id}: {str(e)}")
+                 raise
+
+         end_time = time.perf_counter()
+         time_taken = format(end_time - start_time, ".2f")
+         progress.update(task_id, advance=100)  # Mark task as complete
+         progress.update(
+             task_id,
+             description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]{finish_text}! ({time_taken}s)",
+         )
+         break
+
+
+ async def score_with_indicator(
+     scorers: List[CustomScorer],
+     example: Example,
+     ignore_errors: bool,
+     skip_on_missing_params: bool,
+     show_indicator: bool,
+ ):
+     """
+     Scores an example using a list of custom scorers, optionally displaying a progress indicator.
+
+     Args:
+         scorers (List[CustomScorer]): A list of custom scorer objects to evaluate the example.
+         example (Example): The example to be scored.
+         ignore_errors (bool): If True, errors during scoring are ignored.
+         skip_on_missing_params (bool): If True, scoring is skipped when required parameters are missing.
+         show_indicator (bool): If True, a progress indicator is displayed during scoring.
+
+     Returns:
+         None
+
+     Raises:
+         Any exceptions raised by the scoring functions, unless `ignore_errors` is True.
+     """
+     if show_indicator:
+         with Progress(
+             SpinnerColumn(style="rgb(106,0,255)"),
+             TextColumn("[progress.description]{task.description}"),
+             transient=True,
+         ) as progress:
+             tasks = []
+             for scorer in scorers:
+                 task_id = progress.add_task(
+                     description=scorer_console_msg(
+                         scorer, async_mode=True
+                     ),
+                     total=100,
+                 )  # Add task to progress bar
+                 tasks.append(
+                     score_task(
+                         task_id,
+                         progress,
+                         scorer,
+                         example,
+                         ignore_errors,
+                         skip_on_missing_params,
+                     )  # Create a task to score the example with a single scorer
+                 )
+             await asyncio.gather(*tasks)
+     else:
+         tasks = [
+             safe_a_score_example(
+                 scorer, example, ignore_errors, skip_on_missing_params
+             )
+             for scorer in scorers
+         ]
+
+         await asyncio.gather(*tasks)
+
+
+ async def a_execute_scoring(
+     examples: List[Example],
+     scorers: List[CustomScorer],
+     model: Optional[Union[str, List[str], judgevalJudge]] = None,
+     ignore_errors: bool = True,
+     skip_on_missing_params: bool = True,
+     show_indicator: bool = True,
+     throttle_value: int = 0,
+     max_concurrent: int = 100,
+     verbose_mode: Optional[bool] = None,
+     _use_bar_indicator: bool = True,
+ ) -> List[ScoringResult]:
+     """
+     Executes evaluations of `Example`s asynchronously using one or more `CustomScorer`s.
+     Each `Example` is evaluated by every `CustomScorer` in the `scorers` list.
+
+     Args:
+         examples (List[Example]): A list of `Example` objects to be evaluated.
+         scorers (List[CustomScorer]): A list of `CustomScorer` objects to evaluate the examples.
+         model (Optional[Union[str, List[str], judgevalJudge]]): The judge model to use for evaluation.
+         ignore_errors (bool): Whether to ignore errors during evaluation.
+         skip_on_missing_params (bool): Whether to skip evaluation when required parameters are missing.
+         show_indicator (bool): Whether to show a progress indicator.
+         throttle_value (int): The number of seconds to wait between starting each task.
+         max_concurrent (int): The maximum number of concurrent tasks.
+         verbose_mode (Optional[bool]): If set, overrides verbose mode for all scorers.
+         _use_bar_indicator (bool): Whether to use a progress bar indicator.
+
+     Returns:
+         List[ScoringResult]: A list of `ScoringResult` objects containing the evaluation results.
+     """
+     semaphore = asyncio.Semaphore(max_concurrent)
+
+     async def execute_with_semaphore(func: Callable, *args, **kwargs):
+         try:
+             async with semaphore:
+                 return await func(*args, **kwargs)
+         except Exception as e:
+             error(f"Error executing function: {e}")
+             if kwargs.get('ignore_errors', False):
+                 return None  # Return None when ignoring errors
+             raise
+
+     if verbose_mode is not None:
+         for scorer in scorers:
+             scorer.verbose_mode = verbose_mode
+
+     # Add the judge model to each scorer
+     for scorer in scorers:
+         scorer._add_model(model)
+
+     scoring_results: List[ScoringResult] = [None for _ in examples]
+     tasks = []
+
+     if show_indicator and _use_bar_indicator:
+         with tqdm_asyncio(
+             desc=f"Evaluating {len(examples)} example(s) in parallel",
+             unit="Example",
+             total=len(examples),
+             bar_format="{desc}: |{bar}|{percentage:3.0f}% ({n_fmt}/{total_fmt}) [Time Taken: {elapsed}, {rate_fmt}{postfix}]",
+         ) as pbar:
+             for i, ex in enumerate(examples):
+                 with example_logging_context(ex.timestamp, ex.example_id):
+                     debug(f"Starting scoring for example {ex.example_id}")
+                     debug(f"Input: {ex.input}")
+                     debug(f"Using {len(scorers)} scorers")
+                     for scorer in scorers:
+                         debug(f"Using scorer: {type(scorer).__name__}")
+                         if hasattr(scorer, 'threshold'):
+                             debug(f"Scorer threshold: {scorer.threshold}")
+                         if hasattr(scorer, 'model'):
+                             debug(f"Scorer model: {type(scorer.model).__name__}")
+                 if isinstance(ex, Example):
+                     if len(scorers) == 0:
+                         pbar.update(1)
+                         continue
+
+                     cloned_scorers: List[CustomScorer] = clone_scorers(scorers)
+                     task = execute_with_semaphore(
+                         func=a_eval_examples_helper,
+                         scorers=cloned_scorers,
+                         example=ex,
+                         scoring_results=scoring_results,
+                         score_index=i,
+                         ignore_errors=ignore_errors,
+                         skip_on_missing_params=skip_on_missing_params,
+                         show_indicator=show_indicator,
+                         _use_bar_indicator=_use_bar_indicator,
+                         pbar=pbar,
+                     )
+                     tasks.append(asyncio.create_task(task))
+
+                 await asyncio.sleep(throttle_value)
+             await asyncio.gather(*tasks)
+     else:
+         for i, ex in enumerate(examples):
+             if isinstance(ex, Example):
+                 if len(scorers) == 0:
+                     continue
+
+                 cloned_scorers: List[CustomScorer] = clone_scorers(scorers)
+                 task = execute_with_semaphore(
+                     func=a_eval_examples_helper,
+                     scorers=cloned_scorers,
+                     example=ex,
+                     scoring_results=scoring_results,
+                     score_index=i,
+                     ignore_errors=ignore_errors,
+                     skip_on_missing_params=skip_on_missing_params,
+                     _use_bar_indicator=_use_bar_indicator,
+                     show_indicator=show_indicator,
+                 )
+                 tasks.append(asyncio.create_task(task))
+
+             await asyncio.sleep(throttle_value)
+         await asyncio.gather(*tasks)
+     return scoring_results
+
+
+ async def a_eval_examples_helper(
+     scorers: List[CustomScorer],
+     example: Example,
+     scoring_results: List[ScoringResult],
+     score_index: int,
+     ignore_errors: bool,
+     skip_on_missing_params: bool,
+     show_indicator: bool,
+     _use_bar_indicator: bool,
+     pbar: Optional[tqdm_asyncio] = None,
+ ) -> None:
+     """
+     Evaluate a single example asynchronously using a list of scorers.
+
+     Args:
+         scorers (List[CustomScorer]): List of CustomScorer objects to evaluate the example.
+         example (Example): The example to be evaluated.
+         scoring_results (List[ScoringResult]): List in which to store the scoring results.
+         score_index (int): Index at which the result should be stored in scoring_results.
+         ignore_errors (bool): Whether to ignore errors during scoring.
+         skip_on_missing_params (bool): Whether to skip scoring when required parameters are missing.
+         show_indicator (bool): Whether to show a progress indicator.
+         _use_bar_indicator (bool): Whether to use a bar indicator for progress.
+         pbar (Optional[tqdm_asyncio]): Optional progress bar for tracking progress.
+
+     Returns:
+         None
+     """
+     show_metrics_indicator = show_indicator and not _use_bar_indicator
+
+     for scorer in scorers:
+         scorer.skipped = False
+         scorer.error = None  # Reset scorer error
+
+     # Score the Example
+     process_example = create_process_example(example)  # Create a process example to track progress
+     scoring_start_time = time.perf_counter()
+     await score_with_indicator(
+         scorers=scorers,
+         example=example,
+         skip_on_missing_params=skip_on_missing_params,
+         ignore_errors=ignore_errors,
+         show_indicator=show_metrics_indicator,
+     )  # Execute the scoring function of each scorer on the example
+
+     # Now that all the scoring functions of each scorer have executed, collect
+     # the results and update the process example with the scorer data
+     for scorer in scorers:
+         # At this point, the scorer has been executed and already contains data.
+         if getattr(scorer, 'skipped', False):
+             continue
+
+         scorer_data = create_scorer_data(scorer)  # Fetch scorer data from the completed scorer evaluation
+         process_example.update_scorer_data(scorer_data)  # Update the process example with that scorer data
+
+     test_end_time = time.perf_counter()
+     run_duration = test_end_time - scoring_start_time
+
+     process_example.update_run_duration(run_duration)  # Record the execution duration on the process example
+     scoring_results[score_index] = generate_scoring_result(process_example)  # Convert the outcome of the executed test to a ScoringResult and save it
+
+     if pbar is not None:
+         pbar.update(1)
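
The file above (judgeval/scorers/score.py) exposes a_execute_scoring as the entry point that fans each Example out to every scorer under an asyncio.Semaphore. For orientation, here is a minimal sketch of driving it from synchronous code; ToyScorer, its scoring rule and attributes, and the Example(input=...) constructor call are illustrative assumptions, not APIs confirmed by this diff:

    import asyncio
    from judgeval.data import Example
    from judgeval.scorers import CustomScorer
    from judgeval.scorers.score import a_execute_scoring

    class ToyScorer(CustomScorer):  # hypothetical subclass, for illustration only
        async def a_score_example(self, example, *args, **kwargs):
            # Toy rule (assumed attributes): pass whenever the example has a non-empty input.
            self.score = 1.0 if example.input else 0.0
            self.success = self.score >= 1.0

    examples = [Example(input="What is the capital of France?")]  # assumed constructor
    results = asyncio.run(
        a_execute_scoring(examples, [ToyScorer()], show_indicator=False)
    )
    print(results)  # a List[ScoringResult], one entry per example
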
@@ -0,0 +1,175 @@
+ """
+ Util functions for Scorer objects
+ """
+
+ import asyncio
+ import nest_asyncio
+ import inspect
+ import json
+ import sys
+ import re
+ from contextlib import contextmanager
+ from rich.progress import Progress, SpinnerColumn, TextColumn
+ from rich.console import Console
+ from typing import List, Optional, Any
+
+ from judgeval.scorers import CustomScorer
+
+
+ def clone_scorers(scorers: List[CustomScorer]) -> List[CustomScorer]:
+     """
+     Creates duplicates of the scorers passed as arguments.
+     """
+     cloned_scorers = []
+     for s in scorers:
+         scorer_class = type(s)
+         args = vars(s)
+
+         signature = inspect.signature(scorer_class.__init__)
+         valid_params = signature.parameters.keys()
+         valid_args = {key: args[key] for key in valid_params if key in args}
+
+         cloned_scorer = scorer_class(**valid_args)
+         # Somewhat hacky: if the class inheriting from CustomScorer doesn't accept `model`
+         # in its __init__, we explicitly add the judge model to the cloned scorer here.
+         cloned_scorer._add_model(model=args.get("model"))
+         cloned_scorers.append(cloned_scorer)
+     return cloned_scorers
+
+
+ def scorer_console_msg(
+     scorer: CustomScorer,
+     async_mode: Optional[bool] = None,
+ ):
+     """
+     Renders the message displayed in the console while a scorer is executing.
+     """
+     if async_mode is None:
+         run_async = scorer.async_mode
+     else:
+         run_async = async_mode
+
+     return f"🔨 Executing Judgment's [rgb(106,0,255)]{scorer.__name__} Scorer[/rgb(106,0,255)]! \
+ [rgb(55,65,81)](using {scorer.evaluation_model}, async_mode={run_async})...[/rgb(55,65,81)]"
+
+
+ @contextmanager
+ def scorer_progress_meter(
+     scorer: CustomScorer,
+     async_mode: Optional[bool] = None,
+     display_meter: bool = True,
+     total: int = 100,
+     transient: bool = True,
+ ):
+     """
+     Context manager to display a progress indicator (spinner) while a scorer is being run.
+     """
+     console = Console(file=sys.stderr)
+     if display_meter:
+         with Progress(
+             SpinnerColumn(style="rgb(106,0,255)"),
+             TextColumn("[progress.description]{task.description}"),
+             console=console,
+             transient=transient,
+         ) as progress:
+             progress.add_task(
+                 description=scorer_console_msg(scorer, async_mode),
+                 total=total,
+             )
+             yield
+     else:
+         yield
+
+
+ def parse_response_json(llm_response: str, scorer: Optional[CustomScorer] = None) -> dict:
+     """
+     Extracts JSON output from an LLM response and returns it as a dictionary.
+
+     If the JSON is invalid, the error is forwarded to the `scorer`, if provided.
+
+     Args:
+         llm_response (str): The response from an LLM.
+         scorer (CustomScorer, optional): The scorer object to forward errors to (if any).
+     """
+     start = llm_response.find("{")  # index of the opening brace
+     end = llm_response.rfind("}") + 1  # index just past the closing brace
+
+     if end == 0 and start != -1:  # append the closing brace if it's missing
+         llm_response = llm_response + "}"
+         end = len(llm_response)
+
+     json_str = llm_response[start:end] if start != -1 and end != 0 else ""  # extract the JSON substring
+     json_str = re.sub(r",\s*([\]}])", r"\1", json_str)  # remove trailing commas before closing brackets/braces
+
+     try:
+         return json.loads(json_str)
+     except json.JSONDecodeError:
+         error_str = "Evaluation LLM outputted an invalid JSON. Please use a stronger evaluation model."
+         if scorer is not None:
+             scorer.error = error_str
+         raise ValueError(error_str)
+     except Exception as e:
+         raise Exception(f"An unexpected error occurred: {str(e)}")
+
+
+ def print_verbose_logs(metric: str, logs: str):
+     print("*" * 50)
+     print(f"{metric} Verbose Logs")
+     print("*" * 50)
+     print("")
+     print(logs)
+     print("")
+     print("=" * 70)
+
+
+ def create_verbose_logs(metric: CustomScorer, steps: List[str]) -> str:
+     """
+     Creates verbose logs for a scorer object.
+
+     Args:
+         metric (CustomScorer): The scorer object.
+         steps (List[str]): The steps to be included in the verbose logs.
+
+     Returns:
+         str: The verbose logs (concatenated steps).
+     """
+     verbose_logs = ""
+     for i in range(len(steps) - 1):
+         verbose_logs += steps[i]
+         if i < len(steps) - 2:  # don't add a separator after the penultimate step
+             verbose_logs += " \n \n"
+     if metric.verbose_mode:
+         print_verbose_logs(metric.__name__, verbose_logs + f"\n \n{steps[-1]}")
+     return verbose_logs
+
+
+ def get_or_create_event_loop() -> asyncio.AbstractEventLoop:
+     """
+     Get or create an asyncio event loop.
+
+     This function attempts to retrieve the current event loop using `asyncio.get_event_loop()`.
+     If the event loop is already running, it applies the `nest_asyncio` patch to allow nested
+     asynchronous execution. If the event loop is closed or not found, it creates a new event loop
+     and sets it as the current event loop.
+
+     Returns:
+         asyncio.AbstractEventLoop: The current or newly created event loop.
+     """
+     try:
+         loop = asyncio.get_event_loop()
+         if loop.is_running():
+             print(
+                 "Event loop is already running. Applying nest_asyncio patch to allow async execution..."
+             )
+             nest_asyncio.apply()
+
+         if loop.is_closed():
+             raise RuntimeError
+     except RuntimeError:
+         loop = asyncio.new_event_loop()
+         asyncio.set_event_loop(loop)
+     return loop
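
The JSON-recovery path in parse_response_json above tolerates prose around the payload, trailing commas, and a single missing closing brace. A quick sketch of that behavior (the reply string is invented for illustration):

    from judgeval.scorers.utils import parse_response_json

    # Prose around the JSON, a trailing comma, and a missing closing brace:
    reply = 'Sure! Here is the result: {"score": 0.9, "reasons": ["relevant",]'
    print(parse_response_json(reply))  # {'score': 0.9, 'reasons': ['relevant']}
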
@@ -0,0 +1,40 @@
+ Metadata-Version: 2.4
+ Name: judgeval
+ Version: 0.0.1
+ Summary: Judgeval Package
+ Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
+ Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
+ Author-email: Andrew Li <andrew@judgmentlabs.ai>, Alex Shan <alex@judgmentlabs.ai>, Joseph Camyre <joseph@judgmentlabs.ai>
+ License-Expression: Apache-2.0
+ License-File: LICENSE.md
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3
+ Requires-Python: >=3.11
+ Requires-Dist: anthropic>=0.43.1
+ Requires-Dist: deepeval>=2.1.8
+ Requires-Dist: fastapi>=0.115.6
+ Requires-Dist: langfuse==2.50.3
+ Requires-Dist: litellm>=1.48.10
+ Requires-Dist: openai>=1.47.1
+ Requires-Dist: pandas>=2.2.3
+ Requires-Dist: patronus>=0.0.17
+ Requires-Dist: python-dotenv==1.0.1
+ Requires-Dist: requests>=2.32.3
+ Requires-Dist: supabase>=2.11.0
+ Requires-Dist: together>=1.3.11
+ Requires-Dist: uvicorn>=0.34.0
+ Provides-Extra: dev
+ Requires-Dist: pytest-asyncio>=0.25.0; extra == 'dev'
+ Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
+ Requires-Dist: pytest>=8.3.4; extra == 'dev'
+ Description-Content-Type: text/markdown
+
+ # judgeval
+
+
+ TODOS
+
+ 1. public interface for Example and Measurement objects
+ 2. call to backend
+ 3. datasets and logging
+ 4. exporting to platform
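
Per the metadata above, the wheel targets Python 3.11 or newer; assuming the release is published to PyPI as this diff page indicates, it should install with pip install judgeval==0.0.1.
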
@@ -0,0 +1,46 @@
+ judgeval/__init__.py,sha256=ZLaxoUwq-b86Ugj3543Z1v4Q0D8ukHRpxV7LcZrcOZs,1875
+ judgeval/clients.py,sha256=boWW-nA7Yqt2zBflMxl2NpdUIFSSKebv3rJiG50K1s4,594
+ judgeval/constants.py,sha256=5OiK-r6DLD2utExYaFnvfpSYxrXbVYzRDxSO6lVNE6k,1919
+ judgeval/evaluation_run.py,sha256=Z1Y9_w7xWK6sjH4sXupPYNl6-BTlJu4kYF8KJcB3MF8,5763
+ judgeval/judgment_client.py,sha256=QBEgWkkNvrHpkmhPIS0YhsDYDWYxGenQSjopz5QSsas,9574
+ judgeval/run_evaluation.py,sha256=LzEoWhtsXE_HwMRgmzkkMKfbqw2h1sh7WjChy5HOUfQ,16252
+ judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
+ judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
+ judgeval/common/logger.py,sha256=QXN3UMymmKu2iMEMEgATLBnMDjGr_pE2iOSEFoICgg8,6092
+ judgeval/common/tracer.py,sha256=7t--uQMcxVLl2Hqemem_EUy90lAhbvrHss5_ujFlI8Y,22310
+ judgeval/common/utils.py,sha256=3WRyyX0tvnnj_VAVlEdtZrfzyWj6zfX04xdpCtE1m5Y,33736
+ judgeval/data/__init__.py,sha256=-Js66xgj0g7wtUk8Q2CIK9ynGL8pGlVRubVlfO7OgX4,438
+ judgeval/data/api_example.py,sha256=vwWFbI6eJr5VgURCRbuSiMtEXLUbTCih_BcaqEBy-pg,4108
+ judgeval/data/example.py,sha256=lymGZ3jG818-r2vyFunt6OLFrhESOyJnbhao_ljTjlA,2471
+ judgeval/data/result.py,sha256=CVp_mZrBbKjIH9rPB6rg7T2jY1jUy7JVyI7_kUbRC7w,3490
+ judgeval/data/scorer_data.py,sha256=H7s-yEEUdWlR4mRgy5JkLOQXNkn2YLu9wkZwcAlh6QQ,3276
+ judgeval/data/datasets/__init__.py,sha256=Xh6TSsCcEsJeYjjubfeGa3WU8YQfuwKXH3jR9EeDFgg,171
+ judgeval/data/datasets/dataset.py,sha256=9GGspdKDhMw2dJAS7ZvOZHSoNGwMzCtgnFYDe6y4yog,16484
+ judgeval/data/datasets/ground_truth.py,sha256=OTBs3VZe-Wp0vEXEsq14GPZHYtpWT16bhGQTycIvkKc,2057
+ judgeval/data/datasets/utils.py,sha256=lQxyl7mevct7JcDSyIrU_8QOzT-EYPWEvoUiAeOdeek,2502
+ judgeval/judges/__init__.py,sha256=4DuGgrJAec2GavvU3wgQ5alOgi9q0GzFzrtGBJxO2Cs,339
+ judgeval/judges/base_judge.py,sha256=PUj7tITRjWjk6Www5Qcrt6Z3YNr9ix69aqjFs-dMQjA,1000
+ judgeval/judges/litellm_judge.py,sha256=xW0Ld7dumercO7ej8aTNDRow1lsVpgaDENTFq1WkiU0,2424
+ judgeval/judges/mixture_of_judges.py,sha256=WELi58rKtJuamkQWbhcqMn7AfRj1qX1LcDyOE0B3FZc,15532
+ judgeval/judges/together_judge.py,sha256=eISgPcnAS71Di9FL9z0AfvBK9nb57a_muCUzboxv7gQ,2275
+ judgeval/judges/utils.py,sha256=Aj1XrK0WXeJStsDLB-Cx757fRuJCWmJRMVD3ngoa9Oo,2110
+ judgeval/scorers/__init__.py,sha256=U9uiW7y707g8ibjc2ZCBlm61ijziJMM2xQSHDV7FerQ,845
+ judgeval/scorers/base_scorer.py,sha256=_n-w7b_PD_-DoW1gr2c3CtrT5dvLehRjDYLQDF-81LM,1786
+ judgeval/scorers/custom_scorer.py,sha256=SrkrtIqs9yaqvUZG3ilXIGm6S78uX1YOygqWpl4xXfw,6039
+ judgeval/scorers/prompt_scorer.py,sha256=-YVG9k03q85YnyLVR2lsC-RmxUM4Q5ynHdUECi63iCk,17813
+ judgeval/scorers/score.py,sha256=zvS5xF3qlUYS716TeyNPikYZWXrUTKirAr1uqXO-P14,18589
+ judgeval/scorers/utils.py,sha256=RCpHc7EQF_LnujsZ0KV-kTYvmHpVS1psJqIgZOuqOgA,5613
+ judgeval/scorers/judgeval_scorers/__init__.py,sha256=YcpuD8qFuhyPEwFiKBgqxQpjqo43I3ODHH3yJnR75B4,1044
+ judgeval/scorers/judgeval_scorers/answer_relevancy.py,sha256=xpS-WclR8SV7HDmpUPp3P55Ybfcm6Gj1kfr2hJG_-cg,452
+ judgeval/scorers/judgeval_scorers/contextual_precision.py,sha256=3nFcrfNTGmHnmfM3UxlLLzAK0qz9VKkEMjmDa6j7thI,468
+ judgeval/scorers/judgeval_scorers/contextual_recall.py,sha256=O8bnQxJJGmIjSUMgR7_8VG2qlSP-7OJiHauCxHGkcgw,456
+ judgeval/scorers/judgeval_scorers/contextual_relevancy.py,sha256=rrUmyWZdKrONjm_seRvFltS5YM9QiiO6o5-DVlNTYPE,569
+ judgeval/scorers/judgeval_scorers/faithfulness.py,sha256=CHqB-_KMnRw24jg0am0SpHuYMx3u3Gy5YtQNAPNXZ_I,437
+ judgeval/scorers/judgeval_scorers/hallucination.py,sha256=7omZbrQWTgZ4Fnw8wQSgvsmuj-g9bm74AP1xZM_mDhs,441
+ judgeval/scorers/judgeval_scorers/json_correctness.py,sha256=guGR5lgTtwG8qqVI7dFt6hHGEYM98XzZhqPzh69Ogqg,866
+ judgeval/scorers/judgeval_scorers/summarization.py,sha256=OwF-sNtRg2HN6FoRCCYTow_SNfS9Rcxzb7ZDBTEld7k,445
+ judgeval/scorers/judgeval_scorers/tool_correctness.py,sha256=7nAXrDKfPkeMzRLiNTevbUV5pjRXPTM2dbkfGRgSJ-s,452
+ judgeval-0.0.1.dist-info/METADATA,sha256=4TZnY-nRVAgCKd9e-Ok_qr-B25sB39HYMvl-THUgHzg,1278
+ judgeval-0.0.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ judgeval-0.0.1.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+ judgeval-0.0.1.dist-info/RECORD,,