judgeval 0.0.25__py3-none-any.whl → 0.0.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +528 -166
- judgeval/constants.py +7 -4
- judgeval/data/__init__.py +0 -3
- judgeval/data/datasets/dataset.py +42 -19
- judgeval/data/datasets/eval_dataset_client.py +59 -20
- judgeval/data/result.py +34 -56
- judgeval/integrations/langgraph.py +16 -12
- judgeval/judgment_client.py +85 -23
- judgeval/rules.py +177 -60
- judgeval/run_evaluation.py +143 -122
- judgeval/scorers/score.py +21 -18
- judgeval/utils/alerts.py +32 -1
- {judgeval-0.0.25.dist-info → judgeval-0.0.27.dist-info}/METADATA +1 -1
- {judgeval-0.0.25.dist-info → judgeval-0.0.27.dist-info}/RECORD +16 -17
- judgeval/data/api_example.py +0 -98
- {judgeval-0.0.25.dist-info → judgeval-0.0.27.dist-info}/WHEEL +0 -0
- {judgeval-0.0.25.dist-info → judgeval-0.0.27.dist-info}/licenses/LICENSE.md +0 -0
judgeval/run_evaluation.py
CHANGED
@@ -23,17 +23,35 @@ from judgeval.constants import (
     ROOT_API,
     JUDGMENT_EVAL_API_URL,
     JUDGMENT_EVAL_LOG_API_URL,
-    MAX_CONCURRENT_EVALUATIONS
+    MAX_CONCURRENT_EVALUATIONS,
+    JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL
 )
 from judgeval.common.exceptions import JudgmentAPIError
-from judgeval.evaluation_run import EvaluationRun
 from judgeval.common.logger import (
     debug,
     info,
     error,
     example_logging_context
 )
+from judgeval.evaluation_run import EvaluationRun
+

+def send_to_rabbitmq(evaluation_run: EvaluationRun) -> None:
+    """
+    Sends an evaluation run to the RabbitMQ evaluation queue.
+    """
+    payload = evaluation_run.model_dump(warnings=False)
+    response = requests.post(
+        JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
+        headers={
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
+            "X-Organization-Id": evaluation_run.organization_id
+        },
+        json=payload,
+        verify=True
+    )
+    return response.json()

 def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
     """

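The new `send_to_rabbitmq` helper posts the serialized `EvaluationRun` to the queue endpoint with bearer auth and an `X-Organization-Id` header. A minimal sketch of an equivalent call outside the library, assuming placeholder values for the endpoint URL, API key, and organization id:

```python
# Illustrative sketch only; the URL, key, and org id below are placeholders,
# not values shipped with judgeval.
import requests

QUEUE_URL = "<JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL>"  # resolved from judgeval.constants in the real code

def enqueue_eval(payload: dict, api_key: str, org_id: str) -> dict:
    # Same header contract the diff introduces: JSON body, bearer token, organization id.
    response = requests.post(
        QUEUE_URL,
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",
            "X-Organization-Id": org_id,
        },
        json=payload,
        verify=True,
    )
    response.raise_for_status()
    return response.json()
```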
@@ -51,13 +69,15 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
         # submit API request to execute evals
         payload = evaluation_run.model_dump(warnings=False)
         response = requests.post(
-            JUDGMENT_EVAL_API_URL,
-
-
-
-
-
-
+            JUDGMENT_EVAL_API_URL,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
+                "X-Organization-Id": evaluation_run.organization_id
+            },
+            json=payload,
+            verify=True
+        )
         response_data = response.json()
     except Exception as e:
         error(f"Error: {e}")

@@ -97,21 +117,23 @@ def merge_results(api_results: List[ScoringResult], local_results: List[ScoringR

     # Each ScoringResult in api and local have all the same fields besides `scorers_data`
     for api_result, local_result in zip(api_results, local_results):
-        if api_result.
+        if not (api_result.data_object and local_result.data_object):
+            raise ValueError("Data object is None in one of the results.")
+        if api_result.data_object.input != local_result.data_object.input:
             raise ValueError("The API and local results are not aligned.")
-        if api_result.actual_output != local_result.actual_output:
+        if api_result.data_object.actual_output != local_result.data_object.actual_output:
             raise ValueError("The API and local results are not aligned.")
-        if api_result.expected_output != local_result.expected_output:
+        if api_result.data_object.expected_output != local_result.data_object.expected_output:
             raise ValueError("The API and local results are not aligned.")
-        if api_result.context != local_result.context:
+        if api_result.data_object.context != local_result.data_object.context:
             raise ValueError("The API and local results are not aligned.")
-        if api_result.retrieval_context != local_result.retrieval_context:
+        if api_result.data_object.retrieval_context != local_result.data_object.retrieval_context:
             raise ValueError("The API and local results are not aligned.")
-        if api_result.additional_metadata != local_result.additional_metadata:
+        if api_result.data_object.additional_metadata != local_result.data_object.additional_metadata:
             raise ValueError("The API and local results are not aligned.")
-        if api_result.tools_called != local_result.tools_called:
+        if api_result.data_object.tools_called != local_result.data_object.tools_called:
             raise ValueError("The API and local results are not aligned.")
-        if api_result.expected_tools != local_result.expected_tools:
+        if api_result.data_object.expected_tools != local_result.data_object.expected_tools:
             raise ValueError("The API and local results are not aligned.")


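Every alignment check in `merge_results` now goes through `data_object` on both results. A hypothetical refactor (not the library's code) that expresses the same comparisons as a loop over field names:

```python
# Hypothetical helper, shown only to summarize the field-by-field comparison the diff performs.
ALIGNMENT_FIELDS = (
    "input", "actual_output", "expected_output", "context",
    "retrieval_context", "additional_metadata", "tools_called", "expected_tools",
)

def check_alignment(api_result, local_result) -> None:
    if not (api_result.data_object and local_result.data_object):
        raise ValueError("Data object is None in one of the results.")
    for field in ALIGNMENT_FIELDS:
        if getattr(api_result.data_object, field) != getattr(local_result.data_object, field):
            raise ValueError("The API and local results are not aligned.")
```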
@@ -281,13 +303,14 @@ def check_examples(examples: List[Example], scorers: List[APIJudgmentScorer]) ->
                 # Example ID (usually random UUID) does not provide any helpful information for the user but printing the entire example is overdoing it
                 print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")

-
-def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
+def run_eval(evaluation_run: EvaluationRun, override: bool = False, ignore_errors: bool = True, async_execution: bool = False) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s

     Args:
         evaluation_run (EvaluationRun): Stores example and evaluation together for running
+        override (bool, optional): Whether to override existing evaluation run with same name. Defaults to False.
+        ignore_errors (bool, optional): Whether to ignore scorer errors during evaluation. Defaults to True.

     Args:
         project_name (str): The name of the project the evaluation results belong to

@@ -354,101 +377,101 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor

     api_results: List[ScoringResult] = []
     local_results: List[ScoringResult] = []
-
-
-    if judgment_scorers:
+
+    if async_execution:
         check_examples(evaluation_run.examples, evaluation_run.scorers)
-        info("Starting
-
-
-
-
-
-
-
-
-
-
-                judgment_api_key=evaluation_run.judgment_api_key,
-                organization_id=evaluation_run.organization_id,
-                log_results=evaluation_run.log_results,
-                rules=evaluation_run.rules
-            )
-            debug("Sending request to Judgment API")
-            response_data: List[Dict] = run_with_spinner("Running Evaluation: ", execute_api_eval, api_evaluation_run)
-            info(f"Received {len(response_data['results'])} results from API")
-        except JudgmentAPIError as e:
-            error(f"An error occurred while executing the Judgment API request: {str(e)}")
-            raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
-        except ValueError as e:
-            raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: {str(e)}")
-
-        # Convert the response data to `ScoringResult` objects
-        debug("Processing API results")
-        for idx, result in enumerate(response_data["results"]):
-            with example_logging_context(evaluation_run.examples[idx].timestamp, evaluation_run.examples[idx].example_id):
-                for scorer in judgment_scorers:
-                    debug(f"Processing API result for example {idx} and scorer {scorer.score_type}")
-            # filter for key-value pairs that are used to initialize ScoringResult
-            # there may be some stuff in here that doesn't belong in ScoringResult
-            # TODO: come back and refactor this to have ScoringResult take in **kwargs
-            filtered_result = {k: v for k, v in result.items() if k in ScoringResult.__annotations__}
-
-            # Convert scorers_data dicts to ScorerData objects
-            if "scorers_data" in filtered_result and filtered_result["scorers_data"]:
-                filtered_result["scorers_data"] = [
-                    ScorerData(**scorer_dict)
-                    for scorer_dict in filtered_result["scorers_data"]
-                ]
-
-            api_results.append(ScoringResult(**filtered_result))
-    # Run local evals
-    if local_scorers: # List[JudgevalScorer]
-        # We should be removing local scorers soon
-        info("Starting local evaluation")
-        for example in evaluation_run.examples:
-            with example_logging_context(example.timestamp, example.example_id):
-                debug(f"Processing example {example.example_id}: {example.input}")
-
-        results: List[ScoringResult] = asyncio.run(
-            a_execute_scoring(
-                evaluation_run.examples,
-                local_scorers,
-                model=evaluation_run.model,
-                ignore_errors=True,
-                skip_on_missing_params=True,
-                show_indicator=True,
-                _use_bar_indicator=True,
-                throttle_value=0,
-                max_concurrent=MAX_CONCURRENT_EVALUATIONS,
-            )
+        info("Starting async evaluation")
+        payload = evaluation_run.model_dump(warnings=False)
+        requests.post(
+            JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {evaluation_run.judgment_api_key}",
+                "X-Organization-Id": evaluation_run.organization_id
+            },
+            json=payload,
+            verify=True
         )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        print("Successfully added evaluation to queue")
+    else:
+        if judgment_scorers:
+            # Execute evaluation using Judgment API
+            check_examples(evaluation_run.examples, evaluation_run.scorers)
+            info("Starting API evaluation")
+            debug(f"Creating API evaluation run with {len(judgment_scorers)} scorers")
+            try: # execute an EvaluationRun with just JudgmentScorers
+                api_evaluation_run: EvaluationRun = EvaluationRun(
+                    eval_name=evaluation_run.eval_name,
+                    project_name=evaluation_run.project_name,
+                    examples=evaluation_run.examples,
+                    scorers=judgment_scorers,
+                    model=evaluation_run.model,
+                    aggregator=evaluation_run.aggregator,
+                    metadata=evaluation_run.metadata,
+                    judgment_api_key=evaluation_run.judgment_api_key,
+                    organization_id=evaluation_run.organization_id,
+                    log_results=evaluation_run.log_results,
+                    rules=evaluation_run.rules
+                )
+                debug("Sending request to Judgment API")
+                response_data: List[Dict] = run_with_spinner("Running Evaluation: ", execute_api_eval, api_evaluation_run)
+                info(f"Received {len(response_data['results'])} results from API")
+            except JudgmentAPIError as e:
+                error(f"An error occurred while executing the Judgment API request: {str(e)}")
+                raise JudgmentAPIError(f"An error occurred while executing the Judgment API request: {str(e)}")
+            except ValueError as e:
+                raise ValueError(f"Please check your EvaluationRun object, one or more fields are invalid: {str(e)}")
+
+            # Convert the response data to `ScoringResult` objects
+            debug("Processing API results")
+            api_results = [ScoringResult(**result) for result in response_data["results"]]
+        # Run local evals
+        if local_scorers: # List[JudgevalScorer]
+            # We should be removing local scorers soon
+            info("Starting local evaluation")
+            for example in evaluation_run.examples:
+                with example_logging_context(example.timestamp, example.example_id):
+                    debug(f"Processing example {example.example_id}: {example.input}")
+
+            results: List[ScoringResult] = asyncio.run(
+                a_execute_scoring(
+                    evaluation_run.examples,
+                    local_scorers,
+                    model=evaluation_run.model,
+                    ignore_errors=ignore_errors,
+                    skip_on_missing_params=True,
+                    show_indicator=True,
+                    _use_bar_indicator=True,
+                    throttle_value=0,
+                    max_concurrent=MAX_CONCURRENT_EVALUATIONS,
+                )
+            )
+            local_results = results
+            info(f"Local evaluation complete with {len(local_results)} results")
+        # Aggregate the ScorerData from the API and local evaluations
+        debug("Merging API and local results")
+        merged_results: List[ScoringResult] = merge_results(api_results, local_results)
+        merged_results = check_missing_scorer_data(merged_results)
+
+        info(f"Successfully merged {len(merged_results)} results")
+
+        # Evaluate rules against local scoring results if rules exist (this cant be done just yet)
+        # if evaluation_run.rules and merged_results:
+        #     run_rules(
+        #         local_results=merged_results,
+        #         rules=evaluation_run.rules,
+        #         judgment_api_key=evaluation_run.judgment_api_key,
+        #         organization_id=evaluation_run.organization_id
+        #     )
+        # print(merged_results)
+        if evaluation_run.log_results:
+            pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, merged_results, evaluation_run)
+            rprint(pretty_str)
+
+        for i, result in enumerate(merged_results):
+            if not result.scorers_data: # none of the scorers could be executed on this example
+                info(f"None of the scorers could be executed on example {i}. This is usually because the Example is missing the fields needed by the scorers. Try checking that the Example has the necessary fields for your scorers.")
+        return merged_results

 def assert_test(scoring_results: List[ScoringResult]) -> None:
     """

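`run_eval` now accepts `ignore_errors` and `async_execution`; with `async_execution=True` the run is posted to the evaluation queue instead of being executed inline. A usage sketch, assuming an `EvaluationRun` has already been constructed elsewhere:

```python
# Sketch of the new flags; `evaluation_run` is assumed to be a fully populated EvaluationRun.
from judgeval.run_evaluation import run_eval

def submit(evaluation_run, wait_for_results: bool = True):
    if wait_for_results:
        # Blocking path: returns List[ScoringResult]; scorer errors are swallowed by default.
        return run_eval(evaluation_run, override=False, ignore_errors=True)
    # Queue path added in 0.0.27: enqueue the run and return without waiting for results.
    return run_eval(evaluation_run, async_execution=True)
```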
@@ -467,15 +490,14 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:

             # Create a test case context with all relevant fields
             test_case = {
-                'input': result.input,
-                'actual_output': result.actual_output,
-                'expected_output': result.expected_output,
-                'context': result.context,
-                'retrieval_context': result.retrieval_context,
-                'additional_metadata': result.additional_metadata,
-                'tools_called': result.tools_called,
-                'expected_tools': result.expected_tools,
-                'eval_run_name': result.eval_run_name,
+                'input': result.data_object.input,
+                'actual_output': result.data_object.actual_output,
+                'expected_output': result.data_object.expected_output,
+                'context': result.data_object.context,
+                'retrieval_context': result.data_object.retrieval_context,
+                'additional_metadata': result.data_object.additional_metadata,
+                'tools_called': result.data_object.tools_called,
+                'expected_tools': result.data_object.expected_tools,
                 'failed_scorers': []
             }
             if result.scorers_data:

@@ -496,7 +518,6 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
             error_msg += f"Additional Metadata: {fail_case['additional_metadata']}\n"
             error_msg += f"Tools Called: {fail_case['tools_called']}\n"
             error_msg += f"Expected Tools: {fail_case['expected_tools']}\n"
-            error_msg += f"Eval Run Name: {fail_case['eval_run_name']}\n"

             for fail_scorer in fail_case['failed_scorers']:

judgeval/scorers/score.py
CHANGED
@@ -13,7 +13,6 @@ from judgeval.data import (
     Example,
     ScoringResult,
     generate_scoring_result,
-    create_process_example,
     create_scorer_data,
 )
 from judgeval.scorers import JudgevalScorer

@@ -274,15 +273,16 @@ async def a_execute_scoring(
     semaphore = asyncio.Semaphore(max_concurrent)

     async def execute_with_semaphore(func: Callable, *args, **kwargs):
-
-
+        async with semaphore:
+            try:
                 return await func(*args, **kwargs)
-
-
-
-
-
-
+            except Exception as e:
+                print(f"Error executing function: {e}")
+                if kwargs.get('ignore_errors', False):
+                    # Simply return None when ignoring errors, as expected by the test
+                    return None
+                # If we're not ignoring errors, propagate the exception
+                raise

     if verbose_mode is not None:
         for scorer in scorers:

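`execute_with_semaphore` now holds the shared semaphore for the whole call and, when `ignore_errors` is passed, turns a failed scorer task into a `None` result instead of failing the batch. A self-contained sketch of the same pattern outside judgeval:

```python
# Standalone illustration of the concurrency pattern, not judgeval's implementation.
import asyncio

async def run_all(coroutine_factories, max_concurrent: int = 10, ignore_errors: bool = True):
    semaphore = asyncio.Semaphore(max_concurrent)

    async def run_one(factory):
        async with semaphore:          # at most `max_concurrent` coroutines run at once
            try:
                return await factory()
            except Exception as exc:   # broad catch, mirroring the diff
                if ignore_errors:
                    print(f"Error executing function: {exc}")
                    return None        # a failed task contributes a None result
                raise

    return await asyncio.gather(*(run_one(f) for f in coroutine_factories))

# Example: asyncio.run(run_all([lambda i=i: asyncio.sleep(0.01, result=i) for i in range(5)]))
```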
@@ -391,6 +391,7 @@ async def a_eval_examples_helper(
     Returns:
         None
     """
+
     show_metrics_indicator = show_indicator and not _use_bar_indicator

     for scorer in scorers:

@@ -398,7 +399,6 @@ async def a_eval_examples_helper(
         scorer.error = None # Reset scorer error

     # scoring the Example
-    process_example = create_process_example(example) # Creates process example to track progress
     scoring_start_time = time.perf_counter()
     await score_with_indicator(
         scorers=scorers,

@@ -409,19 +409,22 @@
     ) # execute the scoring functions of each scorer on the example

     # Now that all the scoring functions of each scorer have executed, we collect
-    # the results and update the
+    # the results and update the ScoringResult with the scorer data
+    success = True
+    scorer_data_list = []
     for scorer in scorers:
         # At this point, the scorer has been executed and already contains data.
         if getattr(scorer, 'skipped', False):
             continue
         scorer_data = create_scorer_data(scorer) # Fetch scorer data from completed scorer evaluation
-
-
-
-
+        success = success and scorer_data.success
+        scorer_data_list.append(scorer_data)
+
+    scoring_end_time = time.perf_counter()
+    run_duration = scoring_end_time - scoring_start_time
+
+    scoring_result = generate_scoring_result(example, scorer_data_list, run_duration, success)
+    scoring_results[score_index] = scoring_result

-    process_example.update_run_duration(run_duration) # Update process example with execution time duration
-    scoring_results[score_index] = generate_scoring_result(process_example) # Converts the outcomes of the executed test to a ScoringResult and saves it
-
     if pbar is not None:
         pbar.update(1)

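The scoring helper now folds each scorer's outcome into a running `success` flag and a `scorer_data_list`, then builds the `ScoringResult` directly from the `Example`, replacing the old `ProcessExample` intermediary. A toy sketch of the aggregation step with a stand-in data shape:

```python
# Stand-in type for illustration; the real ScorerData comes from judgeval.data.
from dataclasses import dataclass
from typing import List

@dataclass
class ScorerDataStub:
    name: str
    success: bool

def aggregate_success(scorer_data_list: List[ScorerDataStub]) -> bool:
    # Overall success is the AND of every scorer's success flag, as in the diff.
    success = True
    for scorer_data in scorer_data_list:
        success = success and scorer_data.success
    return success

print(aggregate_success([ScorerDataStub("faithfulness", True), ScorerDataStub("relevancy", False)]))  # False
```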
judgeval/utils/alerts.py
CHANGED
@@ -40,4 +40,35 @@ class AlertResult(BaseModel):
     @property
     def conditions_results(self) -> List[Dict[str, Any]]:
         """Backwards compatibility property for the conditions_result field"""
-        return self.conditions_result
+        return self.conditions_result
+
+    def model_dump(self, **kwargs):
+        """
+        Convert the AlertResult to a dictionary for JSON serialization.
+
+        Args:
+            **kwargs: Additional arguments to pass to Pydantic's model_dump
+
+        Returns:
+            dict: Dictionary representation of the AlertResult
+        """
+        data = super().model_dump(**kwargs) if hasattr(super(), "model_dump") else super().dict(**kwargs)
+
+        # Handle the NotificationConfig object if it exists
+        if hasattr(self, "notification") and self.notification is not None:
+            if hasattr(self.notification, "model_dump"):
+                data["notification"] = self.notification.model_dump()
+            elif hasattr(self.notification, "dict"):
+                data["notification"] = self.notification.dict()
+            else:
+                # Manually convert the notification to a dictionary
+                notif = self.notification
+                data["notification"] = {
+                    "enabled": notif.enabled,
+                    "communication_methods": notif.communication_methods,
+                    "email_addresses": notif.email_addresses,
+                    "slack_channels": getattr(notif, "slack_channels", []),
+                    "send_at": notif.send_at
+                }
+
+        return data

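`AlertResult` gains a `model_dump` override so a nested notification config always serializes to a plain dict, whether it is a Pydantic model or a bare object. A self-contained sketch of the same idea with illustrative class names:

```python
# Illustrative classes only; judgeval's AlertResult and NotificationConfig differ in detail.
from typing import Any, Dict, List, Optional
from pydantic import BaseModel

class NotificationConfigStub(BaseModel):
    enabled: bool = True
    communication_methods: List[str] = []
    email_addresses: List[str] = []

class AlertResultStub(BaseModel):
    name: str
    notification: Optional[NotificationConfigStub] = None

    def model_dump(self, **kwargs) -> Dict[str, Any]:
        data = super().model_dump(**kwargs)
        if self.notification is not None:
            # Pydantic already dumps nested models; doing it explicitly mirrors the
            # defensive conversion added in the diff.
            data["notification"] = self.notification.model_dump()
        return data

print(AlertResultStub(name="latency-alert").model_dump())
```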
{judgeval-0.0.25.dist-info → judgeval-0.0.27.dist-info}/RECORD
CHANGED
@@ -1,24 +1,23 @@
 judgeval/__init__.py,sha256=dtXxsCmI4eEsZdGSUMy8P_pA0bc2-OSGAgb2C__yJoA,252
 judgeval/clients.py,sha256=6VQmEqmfCngUdS2MuPBIpHvtDFqOENm8-_BmMvjLyRQ,944
-judgeval/constants.py,sha256=
+judgeval/constants.py,sha256=ksAXhAXovzJKH0uHOdQtREs168uCJRG79PooHNmEbYQ,5313
 judgeval/evaluation_run.py,sha256=RgJD60lJsunNQzObjo7iXnAzXWgubCLOAAuuamAAuoI,6354
-judgeval/judgment_client.py,sha256=
-judgeval/rules.py,sha256=
-judgeval/run_evaluation.py,sha256=
+judgeval/judgment_client.py,sha256=uf0V1-eu3qnFTwrQ_Ckcv8IiWRVv7dbvou4P4KjU6hM,26794
+judgeval/rules.py,sha256=B0ZL0pn72D4Jnlr0zMQ6CPHi7D8AQQRariXCVsiCMiI,20542
+judgeval/run_evaluation.py,sha256=N2ppmEE5WoSReChKjr_n0NcdAUlUR6Nua7M1C_3zHQ8,24949
 judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
 judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
 judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
-judgeval/common/tracer.py,sha256=
+judgeval/common/tracer.py,sha256=L6JkCHj6kxhtDzf9OPg5ZC-NUUH4VDvDcV4utPi_I38,57544
 judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
-judgeval/data/__init__.py,sha256=
-judgeval/data/api_example.py,sha256=dzkrQ0xno08y6qNfqL2djXbapUyc2B2aQ5iANn0o4CY,3667
+judgeval/data/__init__.py,sha256=dG5ytBOeOWCTd5o0KP7IblqtW4G1EBaGreLWepM3jas,345
 judgeval/data/example.py,sha256=BhGBhamFWgH6wtvrRYM8dGtDfXh-cDxDhtNL5Gbdz_M,5892
-judgeval/data/result.py,sha256=
+judgeval/data/result.py,sha256=YHD-dVYJN4JFpM-YCGgBtSdFcGAOyWYL41sf0TE9Hzg,3122
 judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
 judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
-judgeval/data/datasets/dataset.py,sha256=
-judgeval/data/datasets/eval_dataset_client.py,sha256=
-judgeval/integrations/langgraph.py,sha256=
+judgeval/data/datasets/dataset.py,sha256=AFYjksV_wXx5CqFYJsl3aN8yZ6hC50O1myRuOJ8s8_E,12867
+judgeval/data/datasets/eval_dataset_client.py,sha256=P9fEmcNrjPPaiYbbLiEiBziZrIexA39HN9qzClt6uPE,12691
+judgeval/integrations/langgraph.py,sha256=fGDZOTlVbxTO4ErC-m9OSg3h-RkOIIWXCfhjgkKRh4E,11187
 judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
 judgeval/judges/base_judge.py,sha256=ch_S7uBB7lyv44Lf1d7mIGFpveOO58zOkkpImKgd9_4,994
 judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6KTg,2424

@@ -31,7 +30,7 @@ judgeval/scorers/base_scorer.py,sha256=xdUlY3CnLdCQ1Z5iUeY22Bim5v-OQruZmaVF_4Y1m
 judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
 judgeval/scorers/judgeval_scorer.py,sha256=jq_rzfTG0XBTuLCaa6TlaK4YcT-LlgsO1LEm6hpOYdg,6601
 judgeval/scorers/prompt_scorer.py,sha256=PaAs2qRolw1P3_I061Xvk9qzvF4O-JR8g_39RqXnHcM,17728
-judgeval/scorers/score.py,sha256=
+judgeval/scorers/score.py,sha256=ObFAlMbNRcGrfBpH4WW_6OA3CjrneC539xSWhGH60GQ,18578
 judgeval/scorers/utils.py,sha256=iHQVTlIANbmCTXz9kTeSdOytgUZ_T74Re61ajqsk_WQ,6827
 judgeval/scorers/judgeval_scorers/__init__.py,sha256=xFRb62sp4JmBUSeuAB_pC_7kEGp-lGdqCRIu9--Bbdg,5992
 judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=mZ6b_5Dl04k3PaG24ICBajB_j43ody1II1OJhO1DkXo,1648

@@ -86,8 +85,8 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.p
 judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py,sha256=6GnRz2h-6Fwt4sl__0RgQOyo3n3iDO4MNuHWxdu-rrM,10242
 judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=Qk7lwHgRPYeGoxTOyclAh1VfGItfvHJ6l1t7Nk3SWFM,20927
 judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
-judgeval/utils/alerts.py,sha256=
-judgeval-0.0.
-judgeval-0.0.
-judgeval-0.0.
-judgeval-0.0.
+judgeval/utils/alerts.py,sha256=O19Xj7DA0YVjl8PWiuH4zfdZeu3yiLVvHfY8ah2wG0g,2759
+judgeval-0.0.27.dist-info/METADATA,sha256=yoUWIaLIDPksMYQSxDIbVFjtFVCxim6-5LSQ2P13a-U,5418
+judgeval-0.0.27.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.0.27.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.0.27.dist-info/RECORD,,

judgeval/data/api_example.py
DELETED
@@ -1,98 +0,0 @@
-from typing import List, Optional, Dict, Any, Union
-from pydantic import BaseModel, ConfigDict, model_validator
-
-from judgeval.data.example import Example
-from judgeval.data.scorer_data import ScorerData
-from judgeval.common.logger import debug, error
-
-class ProcessExample(BaseModel):
-    """
-    ProcessExample is an `Example` object that contains intermediate information
-    about an undergoing evaluation on the original `Example`. It is used purely for
-    internal operations and keeping track of the evaluation process.
-    """
-    name: str
-    input: Optional[str] = None
-    actual_output: Optional[Union[str, List[str]]] = None
-    expected_output: Optional[Union[str, List[str]]] = None
-    context: Optional[list] = None
-    retrieval_context: Optional[list] = None
-    tools_called: Optional[list] = None
-    expected_tools: Optional[list] = None
-
-    # make these optional, not all test cases in a conversation will be evaluated
-    success: Optional[bool] = None
-    scorers_data: Optional[List[ScorerData]] = None
-    run_duration: Optional[float] = None
-    evaluation_cost: Optional[float] = None
-
-    order: Optional[int] = None
-    # These should map 1 to 1 from golden
-    additional_metadata: Optional[Dict] = None
-    comments: Optional[str] = None
-    trace_id: Optional[str] = None
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-
-    def update_scorer_data(self, scorer_data: ScorerData):
-        """
-        Updates scorer data field of test case after the scorers have been
-        evaluated on this test case.
-        """
-        debug(f"Updating scorer data for example '{self.name}' with scorer: {scorer_data}")
-        # self.scorers_data is a list of ScorerData objects that contain the
-        # evaluation results of each scorer on this test case
-        if self.scorers_data is None:
-            self.scorers_data = [scorer_data]
-        else:
-            self.scorers_data.append(scorer_data)
-
-        if self.success is None:
-            # self.success will be None when it is a message
-            # in that case we will be setting success for the first time
-            self.success = scorer_data.success
-        else:
-            if scorer_data.success is False:
-                debug(f"Example '{self.name}' marked as failed due to scorer: {scorer_data}")
-                self.success = False
-
-    def update_run_duration(self, run_duration: float):
-        self.run_duration = run_duration
-
-
-def create_process_example(
-    example: Example,
-) -> ProcessExample:
-    """
-    When an LLM Test Case is executed, we track its progress using an ProcessExample.
-
-    This will track things like the success of the test case, as well as the metadata (such as verdicts and claims in Faithfulness).
-    """
-    success = True
-    if example.name is not None:
-        name = example.name
-    else:
-        name = "Test Case Placeholder"
-        debug(f"No name provided for example, using default name: {name}")
-    order = None
-    scorers_data = []
-
-    debug(f"Creating ProcessExample for: {name}")
-    process_ex = ProcessExample(
-        name=name,
-        input=example.input,
-        actual_output=example.actual_output,
-        expected_output=example.expected_output,
-        context=example.context,
-        retrieval_context=example.retrieval_context,
-        tools_called=example.tools_called,
-        expected_tools=example.expected_tools,
-        success=success,
-        scorers_data=scorers_data,
-        run_duration=None,
-        evaluation_cost=None,
-        order=order,
-        additional_metadata=example.additional_metadata,
-        trace_id=example.trace_id
-    )
-    return process_ex
-

{judgeval-0.0.25.dist-info → judgeval-0.0.27.dist-info}/WHEEL
File without changes

{judgeval-0.0.25.dist-info → judgeval-0.0.27.dist-info}/licenses/LICENSE.md
File without changes