judgeval 0.0.25__py3-none-any.whl → 0.0.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -23,17 +23,35 @@ from judgeval.constants import (
     ROOT_API,
     JUDGMENT_EVAL_API_URL,
     JUDGMENT_EVAL_LOG_API_URL,
-    MAX_CONCURRENT_EVALUATIONS
+    MAX_CONCURRENT_EVALUATIONS,
+    JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL
 )
 from judgeval.common.exceptions import JudgmentAPIError
-from judgeval.evaluation_run import EvaluationRun
 from judgeval.common.logger import (
     debug,
     info,
     error,
     example_logging_context
 )
+from judgeval.evaluation_run import EvaluationRun
+
 
+def send_to_rabbitmq(evaluation_run: EvaluationRun) -> None:
+    """
+    Sends an evaluation run to the RabbitMQ evaluation queue.
+    """
+    payload = evaluation_run.model_dump(warnings=False)
+    response = requests.post(
+        JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
+        headers={
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
+            "X-Organization-Id": evaluation_run.organization_id
+        },
+        json=payload,
+        verify=True
+    )
+    return response.json()
 
 
 def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
     """
@@ -51,13 +69,15 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
         # submit API request to execute evals
         payload = evaluation_run.model_dump(warnings=False)
         response = requests.post(
-            JUDGMENT_EVAL_API_URL, headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
-                "X-Organization-Id": evaluation_run.organization_id
-            },
-            json=payload,
-            verify=True)
+            JUDGMENT_EVAL_API_URL,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
+                "X-Organization-Id": evaluation_run.organization_id
+            },
+            json=payload,
+            verify=True
+        )
         response_data = response.json()
     except Exception as e:
         error(f"Error: {e}")
@@ -97,21 +117,23 @@ def merge_results(api_results: List[ScoringResult], local_results: List[ScoringR
 
     # Each ScoringResult in api and local have all the same fields besides `scorers_data`
     for api_result, local_result in zip(api_results, local_results):
-        if api_result.input != local_result.input:
+        if not (api_result.data_object and local_result.data_object):
+            raise ValueError("Data object is None in one of the results.")
+        if api_result.data_object.input != local_result.data_object.input:
             raise ValueError("The API and local results are not aligned.")
-        if api_result.actual_output != local_result.actual_output:
+        if api_result.data_object.actual_output != local_result.data_object.actual_output:
             raise ValueError("The API and local results are not aligned.")
-        if api_result.expected_output != local_result.expected_output:
+        if api_result.data_object.expected_output != local_result.data_object.expected_output:
             raise ValueError("The API and local results are not aligned.")
-        if api_result.context != local_result.context:
+        if api_result.data_object.context != local_result.data_object.context:
             raise ValueError("The API and local results are not aligned.")
-        if api_result.retrieval_context != local_result.retrieval_context:
+        if api_result.data_object.retrieval_context != local_result.data_object.retrieval_context:
            raise ValueError("The API and local results are not aligned.")
-        if api_result.additional_metadata != local_result.additional_metadata:
+        if api_result.data_object.additional_metadata != local_result.data_object.additional_metadata:
            raise ValueError("The API and local results are not aligned.")
-        if api_result.tools_called != local_result.tools_called:
+        if api_result.data_object.tools_called != local_result.data_object.tools_called:
            raise ValueError("The API and local results are not aligned.")
-        if api_result.expected_tools != local_result.expected_tools:
+        if api_result.data_object.expected_tools != local_result.data_object.expected_tools:
            raise ValueError("The API and local results are not aligned.")
 
 
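
merge_results now reads every example field through ScoringResult.data_object and guards against a missing data object, so these fields are nested rather than flat in 0.0.27. An illustrative sketch of the access-pattern change for downstream code (attribute names come from the diff; the helper function itself is hypothetical):

# Illustrative only: 0.0.25 flat access vs. 0.0.27 nested access.
def example_input(result):
    if result.data_object is None:       # new guard, mirroring merge_results
        raise ValueError("Data object is None in one of the results.")
    return result.data_object.input      # previously: result.input
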
@@ -281,13 +303,14 @@ def check_examples(examples: List[Example], scorers: List[APIJudgmentScorer]) ->
                 # Example ID (usually random UUID) does not provide any helpful information for the user but printing the entire example is overdoing it
                 print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
 
-
-def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
+def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_errors: bool = True, async_execution: bool = False) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s
 
     Args:
         evaluation_run (EvaluationRun): Stores example and evaluation together for running
+        override (bool, optional): Whether to override existing evaluation run with same name. Defaults to False.
+        ignore_errors (bool, optional): Whether to ignore scorer errors during evaluation. Defaults to True.
 
     Args:
         project_name (str): The name of the project the evaluation results belong to
@@ -354,101 +377,101 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
 
     api_results: List[ScoringResult] = []
     local_results: List[ScoringResult] = []
-
-    # Execute evaluation using Judgment API
-    if judgment_scorers:
+
+    if async_execution:
         check_examples(evaluation_run.examples, evaluation_run.scorers)
-        info("Starting API evaluation")
-        debug(f"Creating API evaluation run with {len(judgment_scorers)} scorers")
-        try: # execute an EvaluationRun with just JudgmentScorers
-            api_evaluation_run: EvaluationRun = EvaluationRun(
-                eval_name=evaluation_run.eval_name,
-                project_name=evaluation_run.project_name,
-                examples=evaluation_run.examples,
-                scorers=judgment_scorers,
-                model=evaluation_run.model,
-                aggregator=evaluation_run.aggregator,
-                metadata=evaluation_run.metadata,
-                judgment_api_key=evaluation_run.judgment_api_key,
-                organization_id=evaluation_run.organization_id,
-                log_results=evaluation_run.log_results,
-                rules=evaluation_run.rules
-            )
-            debug("Sending request to Judgment API")
-            response_data: List[Dict] = run_with_spinner("Running Evaluation: ", execute_api_eval, api_evaluation_run)
-            info(f"Received {len(response_data['results'])} results from API")
-        except JudgmentAPIError as e:
-            error(f"An error occurred while executing the Judgment API request: {str(e)}")
-            raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
-        except ValueError as e:
-            raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: {str(e)}")
-
-        # Convert the response data to `ScoringResult` objects
-        debug("Processing API results")
-        for idx, result in enumerate(response_data["results"]):
-            with example_logging_context(evaluation_run.examples[idx].timestamp, evaluation_run.examples[idx].example_id):
-                for scorer in judgment_scorers:
-                    debug(f"Processing API result for example {idx} and scorer {scorer.score_type}")
-            # filter for key-value pairs that are used to initialize ScoringResult
-            # there may be some stuff in here that doesn't belong in ScoringResult
-            # TODO: come back and refactor this to have ScoringResult take in **kwargs
-            filtered_result = {k: v for k, v in result.items() if k in ScoringResult.__annotations__}
-
-            # Convert scorers_data dicts to ScorerData objects
-            if "scorers_data" in filtered_result and filtered_result["scorers_data"]:
-                filtered_result["scorers_data"] = [
-                    ScorerData(**scorer_dict)
-                    for scorer_dict in filtered_result["scorers_data"]
-                ]
-
-            api_results.append(ScoringResult(**filtered_result))
-    # Run local evals
-    if local_scorers: # List[JudgevalScorer]
-        # We should be removing local scorers soon
-        info("Starting local evaluation")
-        for example in evaluation_run.examples:
-            with example_logging_context(example.timestamp, example.example_id):
-                debug(f"Processing example {example.example_id}: {example.input}")
-
-        results: List[ScoringResult] = asyncio.run(
-            a_execute_scoring(
-                evaluation_run.examples,
-                local_scorers,
-                model=evaluation_run.model,
-                ignore_errors=True,
-                skip_on_missing_params=True,
-                show_indicator=True,
-                _use_bar_indicator=True,
-                throttle_value=0,
-                max_concurrent=MAX_CONCURRENT_EVALUATIONS,
-            )
+        info("Starting async evaluation")
+        payload = evaluation_run.model_dump(warnings=False)
+        requests.post(
+            JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
+                "X-Organization-Id": evaluation_run.organization_id
+            },
+            json=payload,
+            verify=True
         )
-        local_results = results
-        info(f"Local evaluation complete with {len(local_results)} results")
-    # Aggregate the ScorerData from the API and local evaluations
-    debug("Merging API and local results")
-    merged_results: List[ScoringResult] = merge_results(api_results, local_results)
-    merged_results = check_missing_scorer_data(merged_results)
-
-    info(f"Successfully merged {len(merged_results)} results")
-
-    # Evaluate rules against local scoring results if rules exist (this cant be done just yet)
-    # if evaluation_run.rules and merged_results:
-    # run_rules(
-    # local_results=merged_results,
-    # rules=evaluation_run.rules,
-    # judgment_api_key=evaluation_run.judgment_api_key,
-    # organization_id=evaluation_run.organization_id
-    # )
-
-    if evaluation_run.log_results:
-        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, merged_results, evaluation_run)
-        rprint(pretty_str)
-
-    for i, result in enumerate(merged_results):
-        if not result.scorers_data: # none of the scorers could be executed on this example
-            info(f"None of the scorers could be executed on example {i}. This is usually because the Example is missing the fields needed by the scorers. Try checking that the Example has the necessary fields for your scorers.")
-    return merged_results
+        print("Successfully added evaluation to queue")
+    else:
+        if judgment_scorers:
+            # Execute evaluation using Judgment API
+            check_examples(evaluation_run.examples, evaluation_run.scorers)
+            info("Starting API evaluation")
+            debug(f"Creating API evaluation run with {len(judgment_scorers)} scorers")
+            try: # execute an EvaluationRun with just JudgmentScorers
+                api_evaluation_run: EvaluationRun = EvaluationRun(
+                    eval_name=evaluation_run.eval_name,
+                    project_name=evaluation_run.project_name,
+                    examples=evaluation_run.examples,
+                    scorers=judgment_scorers,
+                    model=evaluation_run.model,
+                    aggregator=evaluation_run.aggregator,
+                    metadata=evaluation_run.metadata,
+                    judgment_api_key=evaluation_run.judgment_api_key,
+                    organization_id=evaluation_run.organization_id,
+                    log_results=evaluation_run.log_results,
+                    rules=evaluation_run.rules
+                )
+                debug("Sending request to Judgment API")
+                response_data: List[Dict] = run_with_spinner("Running Evaluation: ", execute_api_eval, api_evaluation_run)
+                info(f"Received {len(response_data['results'])} results from API")
+            except JudgmentAPIError as e:
+                error(f"An error occurred while executing the Judgment API request: {str(e)}")
+                raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
+            except ValueError as e:
+                raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: {str(e)}")
+
+            # Convert the response data to `ScoringResult` objects
+            debug("Processing API results")
+            api_results = [ScoringResult(**result) for result in response_data["results"]]
+        # Run local evals
+        if local_scorers: # List[JudgevalScorer]
+            # We should be removing local scorers soon
+            info("Starting local evaluation")
+            for example in evaluation_run.examples:
+                with example_logging_context(example.timestamp, example.example_id):
+                    debug(f"Processing example {example.example_id}: {example.input}")
+
+            results: List[ScoringResult] = asyncio.run(
+                a_execute_scoring(
+                    evaluation_run.examples,
+                    local_scorers,
+                    model=evaluation_run.model,
+                    ignore_errors=ignore_errors,
+                    skip_on_missing_params=True,
+                    show_indicator=True,
+                    _use_bar_indicator=True,
+                    throttle_value=0,
+                    max_concurrent=MAX_CONCURRENT_EVALUATIONS,
+                )
+            )
+            local_results = results
+            info(f"Local evaluation complete with {len(local_results)} results")
+        # Aggregate the ScorerData from the API and local evaluations
+        debug("Merging API and local results")
+        merged_results: List[ScoringResult] = merge_results(api_results, local_results)
+        merged_results = check_missing_scorer_data(merged_results)
+
+        info(f"Successfully merged {len(merged_results)} results")
+
+        # Evaluate rules against local scoring results if rules exist (this cant be done just yet)
+        # if evaluation_run.rules and merged_results:
+        # run_rules(
+        # local_results=merged_results,
+        # rules=evaluation_run.rules,
+        # judgment_api_key=evaluation_run.judgment_api_key,
+        # organization_id=evaluation_run.organization_id
+        # )
+        # print(merged_results)
+        if evaluation_run.log_results:
+            pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, merged_results, evaluation_run)
+            rprint(pretty_str)
+
+        for i, result in enumerate(merged_results):
+            if not result.scorers_data: # none of the scorers could be executed on this example
+                info(f"None of the scorers could be executed on example {i}. This is usually because the Example is missing the fields needed by the scorers. Try checking that the Example has the necessary fields for your scorers.")
+        return merged_results
 
 def assert_test(scoring_results: List[ScoringResult]) -> None:
     """
@@ -467,15 +490,14 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
 
             # Create a test case context with all relevant fields
             test_case = {
-                'input': result.input,
-                'actual_output': result.actual_output,
-                'expected_output': result.expected_output,
-                'context': result.context,
-                'retrieval_context': result.retrieval_context,
-                'additional_metadata': result.additional_metadata,
-                'tools_called': result.tools_called,
-                'expected_tools': result.expected_tools,
-                'eval_run_name': result.eval_run_name,
+                'input': result.data_object.input,
+                'actual_output': result.data_object.actual_output,
+                'expected_output': result.data_object.expected_output,
+                'context': result.data_object.context,
+                'retrieval_context': result.data_object.retrieval_context,
+                'additional_metadata': result.data_object.additional_metadata,
+                'tools_called': result.data_object.tools_called,
+                'expected_tools': result.data_object.expected_tools,
                 'failed_scorers': []
             }
             if result.scorers_data:
@@ -496,7 +518,6 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
             error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
             error_msg += f"Tools Called: {fail_case['tools_called']}\n"
             error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
-            error_msg += f"Eval Run Name: {fail_case['eval_run_name']}\n"
 
             for fail_scorer in fail_case['failed_scorers']:
 
judgeval/scorers/score.py CHANGED
@@ -13,7 +13,6 @@ from judgeval.data import (
     Example,
     ScoringResult,
     generate_scoring_result,
-    create_process_example,
     create_scorer_data,
 )
 from judgeval.scorers import JudgevalScorer
@@ -274,15 +273,16 @@ async def a_execute_scoring(
     semaphore = asyncio.Semaphore(max_concurrent)
 
     async def execute_with_semaphore(func: Callable, *args, **kwargs):
-        try:
-            async with semaphore:
+        async with semaphore:
+            try:
                 return await func(*args, **kwargs)
-        except Exception as e:
-            error(f"Error executing function: {e}")
-            if kwargs.get('ignore_errors', False):
-                # Return None when ignoring errors
-                return None
-            raise
+            except Exception as e:
+                print(f"Error executing function: {e}")
+                if kwargs.get('ignore_errors', False):
+                    # Simply return None when ignoring errors, as expected by the test
+                    return None
+                # If we're not ignoring errors, propagate the exception
+                raise
 
     if verbose_mode is not None:
         for scorer in scorers:
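
The exception handling now lives inside the semaphore context: the async with block still releases its slot through the context manager, while failures are printed and collapsed to None whenever ignore_errors is passed. A generic sketch of the same pattern, using only standard-library names rather than judgeval internals:

# Generic illustration (not judgeval code): catch inside the semaphore so a
# failing task still releases its slot and optionally degrades to None.
import asyncio

semaphore = asyncio.Semaphore(10)

async def guarded(func, *args, ignore_errors: bool = False, **kwargs):
    async with semaphore:
        try:
            return await func(*args, **kwargs)
        except Exception as exc:
            print(f"Error executing function: {exc}")
            if ignore_errors:
                return None   # swallow the failure, caller sees a missing result
            raise             # otherwise propagate
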
@@ -391,6 +391,7 @@ async def a_eval_examples_helper(
     Returns:
         None
     """
+
     show_metrics_indicator = show_indicator and not _use_bar_indicator
 
     for scorer in scorers:
@@ -398,7 +399,6 @@ async def a_eval_examples_helper(
         scorer.error = None # Reset scorer error
 
     # scoring the Example
-    process_example = create_process_example(example) # Creates process example to track progress
     scoring_start_time = time.perf_counter()
     await score_with_indicator(
         scorers=scorers,
@@ -409,19 +409,22 @@ async def a_eval_examples_helper(
     ) # execute the scoring functions of each scorer on the example
 
     # Now that all the scoring functions of each scorer have executed, we collect
-    # the results and update the process example with the scorer data
+    # the results and update the ScoringResult with the scorer data
+    success = True
+    scorer_data_list = []
     for scorer in scorers:
         # At this point, the scorer has been executed and already contains data.
         if getattr(scorer, 'skipped', False):
             continue
         scorer_data = create_scorer_data(scorer) # Fetch scorer data from completed scorer evaluation
-        process_example.update_scorer_data(scorer_data) # Update process example with the same scorer data
-
-    test_end_time = time.perf_counter()
-    run_duration = test_end_time - scoring_start_time
+        success = success and scorer_data.success
+        scorer_data_list.append(scorer_data)
+
+    scoring_end_time = time.perf_counter()
+    run_duration = scoring_end_time - scoring_start_time
+
+    scoring_result = generate_scoring_result(example, scorer_data_list, run_duration, success)
+    scoring_results[score_index] = scoring_result
 
-    process_example.update_run_duration(run_duration) # Update process example with execution time duration
-    scoring_results[score_index] = generate_scoring_result(process_example) # Converts the outcomes of the executed test to a ScoringResult and saves it
-
     if pbar is not None:
         pbar.update(1)
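
With ProcessExample removed, a_eval_examples_helper accumulates ScorerData directly, ANDs the per-scorer success flags, and builds the result with generate_scoring_result(example, scorer_data_list, run_duration, success). An equivalent aggregation shown in isolation (a sketch reusing the imports from the hunks above; the wrapper function is hypothetical):

# Sketch only: the same aggregation the new loop performs, per example.
from judgeval.data import create_scorer_data, generate_scoring_result

def aggregate(example, completed_scorers, run_duration, score_index, scoring_results):
    # Collect data from every scorer that actually ran (skipped ones are ignored).
    scorer_data_list = [
        create_scorer_data(s) for s in completed_scorers if not getattr(s, "skipped", False)
    ]
    # all([]) is True, matching the diff's `success = True` starting value.
    success = all(sd.success for sd in scorer_data_list)
    scoring_results[score_index] = generate_scoring_result(
        example, scorer_data_list, run_duration, success
    )
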
judgeval/utils/alerts.py CHANGED
@@ -40,4 +40,35 @@ class AlertResult(BaseModel):
     @property
     def conditions_results(self) -> List[Dict[str, Any]]:
         """Backwards compatibility property for the conditions_result field"""
-        return self.conditions_result
+        return self.conditions_result
+
+    def model_dump(self, **kwargs):
+        """
+        Convert the AlertResult to a dictionary for JSON serialization.
+
+        Args:
+            **kwargs: Additional arguments to pass to Pydantic's model_dump
+
+        Returns:
+            dict: Dictionary representation of the AlertResult
+        """
+        data = super().model_dump(**kwargs) if hasattr(super(), "model_dump") else super().dict(**kwargs)
+
+        # Handle the NotificationConfig object if it exists
+        if hasattr(self, "notification") and self.notification is not None:
+            if hasattr(self.notification, "model_dump"):
+                data["notification"] = self.notification.model_dump()
+            elif hasattr(self.notification, "dict"):
+                data["notification"] = self.notification.dict()
+            else:
+                # Manually convert the notification to a dictionary
+                notif = self.notification
+                data["notification"] = {
+                    "enabled": notif.enabled,
+                    "communication_methods": notif.communication_methods,
+                    "email_addresses": notif.email_addresses,
+                    "slack_channels": getattr(notif, "slack_channels", []),
+                    "send_at": notif.send_at
+                }
+
+        return data
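
AlertResult now overrides model_dump so a nested notification config is serialized whether it exposes Pydantic v2's model_dump, v1's dict, or neither. A hedged sketch of how a caller might rely on that when emitting JSON (the wrapper function is illustrative, not from this diff):

# Illustrative only: with the override in place, the nested notification
# config is reduced to plain dicts so the whole alert can be JSON-serialized.
import json

def alert_to_json(alert) -> str:
    payload = alert.model_dump()             # override handles the notification field
    return json.dumps(payload, default=str)  # default=str guards datetimes and similar
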
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.0.25
+Version: 0.0.27
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -1,24 +1,23 @@
 judgeval/__init__.py,sha256=dtXxsCmI4eEsZdGSUMy8P_pA0bc2-OSGAgb2C__yJoA,252
 judgeval/clients.py,sha256=6VQmEqmfCngUdS2MuPBIpHvtDFqOENm8-_BmMvjLyRQ,944
-judgeval/constants.py,sha256=VhJppAECTUDQwzC_FpzJw2wPlkYoogsadHxaJIY_J8U,5073
+judgeval/constants.py,sha256=ksAXhAXovzJKH0uHOdQtREs168uCJRG79PooHNmEbYQ,5313
 judgeval/evaluation_run.py,sha256=RgJD60lJsunNQzObjo7iXnAzXWgubCLOAAuuamAAuoI,6354
-judgeval/judgment_client.py,sha256=e-2e4KK-xy8-WLgzg8H0D6pZC8By9IWdu2iK-lHe39A,24076
-judgeval/rules.py,sha256=ebsiDEBVAnYTQxwVNvh_RpmKeWBnjQXgHs8KofTjcAs,15526
-judgeval/run_evaluation.py,sha256=YOzkyeWl-r3vaz0jB5nM-1VULi7ALmJ9_f58ENqexXk,23827
+judgeval/judgment_client.py,sha256=uf0V1-eu3qnFTwrQ_Ckcv8IiWRVv7dbvou4P4KjU6hM,26794
+judgeval/rules.py,sha256=B0ZL0pn72D4Jnlr0zMQ6CPHi7D8AQQRariXCVsiCMiI,20542
+judgeval/run_evaluation.py,sha256=N2ppmEE5WoSReChKjr_n0NcdAUlUR6Nua7M1C_3zHQ8,24949
 judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
 judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
 judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
-judgeval/common/tracer.py,sha256=cc_K1poBg3Vzl2Nf7yhHlklrOe6Fb_TEekvjAVAQFSc,39958
+judgeval/common/tracer.py,sha256=L6JkCHj6kxhtDzf9OPg5ZC-NUUH4VDvDcV4utPi_I38,57544
 judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
-judgeval/data/__init__.py,sha256=YferxwmUqoBi18hrdgro0BD0h4pt20LAqISeUzGMcVU,474
-judgeval/data/api_example.py,sha256=dzkrQ0xno08y6qNfqL2djXbapUyc2B2aQ5iANn0o4CY,3667
+judgeval/data/__init__.py,sha256=dG5ytBOeOWCTd5o0KP7IblqtW4G1EBaGreLWepM3jas,345
 judgeval/data/example.py,sha256=BhGBhamFWgH6wtvrRYM8dGtDfXh-cDxDhtNL5Gbdz_M,5892
-judgeval/data/result.py,sha256=4fgjKtUmT3br7K6fkRiNIxTGKUuwMeGyRLqzkpxwXKE,4436
+judgeval/data/result.py,sha256=YHD-dVYJN4JFpM-YCGgBtSdFcGAOyWYL41sf0TE9Hzg,3122
 judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
 judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
-judgeval/data/datasets/dataset.py,sha256=DjJNy-qvviXMGBl_JhiBzvgiJH1_3rYtAWeHP6Daw6E,11897
-judgeval/data/datasets/eval_dataset_client.py,sha256=B4bRy0Di2oFlaBbvp4_hRx2g_9e6Cs0y3ZUT9reMyhw,10926
-judgeval/integrations/langgraph.py,sha256=yBbZrePkY19dLLgleeIYFVzakEPaiko6YuccLbwSYcE,10957
+judgeval/data/datasets/dataset.py,sha256=AFYjksV_wXx5CqFYJsl3aN8yZ6hC50O1myRuOJ8s8_E,12867
+judgeval/data/datasets/eval_dataset_client.py,sha256=P9fEmcNrjPPaiYbbLiEiBziZrIexA39HN9qzClt6uPE,12691
+judgeval/integrations/langgraph.py,sha256=fGDZOTlVbxTO4ErC-m9OSg3h-RkOIIWXCfhjgkKRh4E,11187
 judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
 judgeval/judges/base_judge.py,sha256=ch_S7uBB7lyv44Lf1d7mIGFpveOO58zOkkpImKgd9_4,994
 judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6KTg,2424
@@ -31,7 +30,7 @@ judgeval/scorers/base_scorer.py,sha256=xdUlY3CnLdCQ1Z5iUeY22Bim5v-OQruZmaVF_4Y1m
 judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
 judgeval/scorers/judgeval_scorer.py,sha256=jq_rzfTG0XBTuLCaa6TlaK4YcT-LlgsO1LEm6hpOYdg,6601
 judgeval/scorers/prompt_scorer.py,sha256=PaAs2qRolw1P3_I061Xvk9qzvF4O-JR8g_39RqXnHcM,17728
-judgeval/scorers/score.py,sha256=GALVmeApP1Cyih2vY93zRaU6RShtW4jJDG47Pm6yfnw,18657
+judgeval/scorers/score.py,sha256=ObFAlMbNRcGrfBpH4WW_6OA3CjrneC539xSWhGH60GQ,18578
 judgeval/scorers/utils.py,sha256=iHQVTlIANbmCTXz9kTeSdOytgUZ_T74Re61ajqsk_WQ,6827
 judgeval/scorers/judgeval_scorers/__init__.py,sha256=xFRb62sp4JmBUSeuAB_pC_7kEGp-lGdqCRIu9--Bbdg,5992
 judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=mZ6b_5Dl04k3PaG24ICBajB_j43ody1II1OJhO1DkXo,1648
@@ -86,8 +85,8 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.p
 judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py,sha256=6GnRz2h-6Fwt4sl__0RgQOyo3n3iDO4MNuHWxdu-rrM,10242
 judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=Qk7lwHgRPYeGoxTOyclAh1VfGItfvHJ6l1t7Nk3SWFM,20927
 judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
-judgeval/utils/alerts.py,sha256=RgW5R9Dn3Jtim0OyAYDbNzjoX2s6SA4Mw16GyyaikjI,1424
-judgeval-0.0.25.dist-info/METADATA,sha256=09S16QU5qwYqwvrsdg36KVvv9-tnVcSKccgDldPqWpQ,5418
-judgeval-0.0.25.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-judgeval-0.0.25.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
-judgeval-0.0.25.dist-info/RECORD,,
+judgeval/utils/alerts.py,sha256=O19Xj7DA0YVjl8PWiuH4zfdZeu3yiLVvHfY8ah2wG0g,2759
+judgeval-0.0.27.dist-info/METADATA,sha256=yoUWIaLIDPksMYQSxDIbVFjtFVCxim6-5LSQ2P13a-U,5418
+judgeval-0.0.27.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.0.27.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.0.27.dist-info/RECORD,,
@@ -1,98 +0,0 @@
-from typing import List, Optional, Dict, Any, Union
-from pydantic import BaseModel, ConfigDict, model_validator
-
-from judgeval.data.example import Example
-from judgeval.data.scorer_data import ScorerData
-from judgeval.common.logger import debug, error
-
-class ProcessExample(BaseModel):
-    """
-    ProcessExample is an `Example` object that contains intermediate information
-    about an undergoing evaluation on the original `Example`. It is used purely for
-    internal operations and keeping track of the evaluation process.
-    """
-    name: str
-    input: Optional[str] = None
-    actual_output: Optional[Union[str, List[str]]] = None
-    expected_output: Optional[Union[str, List[str]]] = None
-    context: Optional[list] = None
-    retrieval_context: Optional[list] = None
-    tools_called: Optional[list] = None
-    expected_tools: Optional[list] = None
-
-    # make these optional, not all test cases in a conversation will be evaluated
-    success: Optional[bool] = None
-    scorers_data: Optional[List[ScorerData]] = None
-    run_duration: Optional[float] = None
-    evaluation_cost: Optional[float] = None
-
-    order: Optional[int] = None
-    # These should map 1 to 1 from golden
-    additional_metadata: Optional[Dict] = None
-    comments: Optional[str] = None
-    trace_id: Optional[str] = None
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-
-    def update_scorer_data(self, scorer_data: ScorerData):
-        """
-        Updates scorer data field of test case after the scorers have been
-        evaluated on this test case.
-        """
-        debug(f"Updating scorer data for example '{self.name}' with scorer: {scorer_data}")
-        # self.scorers_data is a list of ScorerData objects that contain the
-        # evaluation results of each scorer on this test case
-        if self.scorers_data is None:
-            self.scorers_data = [scorer_data]
-        else:
-            self.scorers_data.append(scorer_data)
-
-        if self.success is None:
-            # self.success will be None when it is a message
-            # in that case we will be setting success for the first time
-            self.success = scorer_data.success
-        else:
-            if scorer_data.success is False:
-                debug(f"Example '{self.name}' marked as failed due to scorer: {scorer_data}")
-                self.success = False
-
-    def update_run_duration(self, run_duration: float):
-        self.run_duration = run_duration
-
-
-def create_process_example(
-    example: Example,
-) -> ProcessExample:
-    """
-    When an LLM Test Case is executed, we track its progress using an ProcessExample.
-
-    This will track things like the success of the test case, as well as the metadata (such as verdicts and claims in Faithfulness).
-    """
-    success = True
-    if example.name is not None:
-        name = example.name
-    else:
-        name = "Test Case Placeholder"
-        debug(f"No name provided for example, using default name: {name}")
-    order = None
-    scorers_data = []
-
-    debug(f"Creating ProcessExample for: {name}")
-    process_ex = ProcessExample(
-        name=name,
-        input=example.input,
-        actual_output=example.actual_output,
-        expected_output=example.expected_output,
-        context=example.context,
-        retrieval_context=example.retrieval_context,
-        tools_called=example.tools_called,
-        expected_tools=example.expected_tools,
-        success=success,
-        scorers_data=scorers_data,
-        run_duration=None,
-        evaluation_cost=None,
-        order=order,
-        additional_metadata=example.additional_metadata,
-        trace_id=example.trace_id
-    )
-    return process_ex
-