deepeval 3.7.5__py3-none-any.whl → 3.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +106 -21
- deepeval/evaluate/evaluate.py +0 -3
- deepeval/evaluate/execute.py +10 -222
- deepeval/evaluate/utils.py +6 -30
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +0 -4
- deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
- deepeval/metrics/answer_relevancy/template.py +102 -179
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -41
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
- deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +14 -0
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +82 -136
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +3 -6
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +3 -3
- deepeval/metrics/turn_contextual_precision/template.py +1 -1
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +110 -68
- deepeval/metrics/turn_contextual_recall/schema.py +3 -3
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +104 -61
- deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +106 -65
- deepeval/metrics/turn_faithfulness/schema.py +1 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +104 -73
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +145 -90
- deepeval/models/base_model.py +44 -6
- deepeval/models/embedding_models/azure_embedding_model.py +34 -12
- deepeval/models/embedding_models/local_embedding_model.py +22 -7
- deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
- deepeval/models/embedding_models/openai_embedding_model.py +3 -2
- deepeval/models/llms/amazon_bedrock_model.py +226 -71
- deepeval/models/llms/anthropic_model.py +141 -47
- deepeval/models/llms/azure_model.py +167 -94
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +79 -29
- deepeval/models/llms/gemini_model.py +126 -67
- deepeval/models/llms/grok_model.py +125 -59
- deepeval/models/llms/kimi_model.py +126 -81
- deepeval/models/llms/litellm_model.py +92 -18
- deepeval/models/llms/local_model.py +114 -15
- deepeval/models/llms/ollama_model.py +97 -76
- deepeval/models/llms/openai_model.py +167 -310
- deepeval/models/llms/portkey_model.py +58 -16
- deepeval/models/llms/utils.py +5 -2
- deepeval/models/utils.py +60 -4
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/api.py +24 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +55 -6
- deepeval/test_case/llm_test_case.py +60 -6
- deepeval/test_run/api.py +3 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -1
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/RECORD +128 -132
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/evaluate/execute.py
CHANGED

@@ -56,15 +56,7 @@ from deepeval.telemetry import capture_evaluation_run
 from deepeval.metrics import (
     BaseMetric,
     BaseConversationalMetric,
-    BaseMultimodalMetric,
     TaskCompletionMetric,
-    # RAG metrics that support both single-turn and multimodal
-    ContextualPrecisionMetric,
-    ContextualRecallMetric,
-    ContextualRelevancyMetric,
-    AnswerRelevancyMetric,
-    FaithfulnessMetric,
-    ToolCorrectnessMetric,
 )
 from deepeval.metrics.indicator import (
     measure_metrics_with_indicator,
@@ -116,15 +108,6 @@ from deepeval.test_run.hyperparameters import (
 
 logger = logging.getLogger(__name__)
 
-MLLM_SUPPORTED_METRICS = [
-    ContextualPrecisionMetric,
-    ContextualRecallMetric,
-    ContextualRelevancyMetric,
-    AnswerRelevancyMetric,
-    FaithfulnessMetric,
-    ToolCorrectnessMetric,
-]
-
 
 def _skip_metrics_for_error(
     span: Optional[BaseSpan] = None,
@@ -282,7 +265,6 @@ def execute_test_cases(
     metrics: Union[
         List[BaseMetric],
         List[BaseConversationalMetric],
-        List[BaseMultimodalMetric],
     ],
     error_config: Optional[ErrorConfig] = ErrorConfig(),
     display_config: Optional[DisplayConfig] = DisplayConfig(),
@@ -315,17 +297,12 @@ def execute_test_cases(
 
     conversational_metrics: List[BaseConversationalMetric] = []
     llm_metrics: List[BaseMetric] = []
-    mllm_metrics: List[BaseMultimodalMetric] = []
     for metric in metrics:
         metric.async_mode = False
         if isinstance(metric, BaseMetric):
             llm_metrics.append(metric)
-            if type(metric) in MLLM_SUPPORTED_METRICS:
-                mllm_metrics.append(metric)
         elif isinstance(metric, BaseConversationalMetric):
             conversational_metrics.append(metric)
-        elif isinstance(metric, BaseMultimodalMetric):
-            mllm_metrics.append(metric)
 
     test_results: List[TestResult] = []
 
@@ -333,23 +310,17 @@ def execute_test_cases(
         progress: Optional[Progress] = None, pbar_id: Optional[int] = None
     ):
         llm_test_case_count = -1
-        mllm_test_case_count = -1
         conversational_test_case_count = -1
         show_metric_indicator = (
             display_config.show_indicator and not _use_bar_indicator
         )
         for i, test_case in enumerate(test_cases):
             # skip what we know we won't run
-            if isinstance(test_case, LLMTestCase) and not test_case.multimodal:
+            if isinstance(test_case, LLMTestCase):
                 if not llm_metrics:
                     update_pbar(progress, pbar_id)
                     continue
                 per_case_total = len(llm_metrics)
-            elif isinstance(test_case, LLMTestCase) and test_case.multimodal:
-                if not mllm_metrics:
-                    update_pbar(progress, pbar_id)
-                    continue
-                per_case_total = len(mllm_metrics)
             elif isinstance(test_case, ConversationalTestCase):
                 if not conversational_metrics:
                     update_pbar(progress, pbar_id)
@@ -364,35 +335,15 @@ def execute_test_cases(
 
             metrics_for_case = (
                 llm_metrics
-                if (
-                    isinstance(test_case, LLMTestCase)
-                    and not test_case.multimodal
-                )
-                else (
-                    mllm_metrics
-                    if (
-                        isinstance(test_case, LLMTestCase)
-                        and test_case.multimodal
-                    )
-                    else conversational_metrics
-                )
+                if (isinstance(test_case, LLMTestCase))
+                else conversational_metrics
             )
             api_test_case = create_api_test_case(
                 test_case=test_case,
                 index=(
                     llm_test_case_count + 1
-                    if (
-                        isinstance(test_case, LLMTestCase)
-                        and not test_case.multimodal
-                    )
-                    else (
-                        mllm_test_case_count + 1
-                        if (
-                            isinstance(test_case, LLMTestCase)
-                            and test_case.multimodal
-                        )
-                        else conversational_test_case_count + 1
-                    )
+                    if (isinstance(test_case, LLMTestCase))
+                    else (conversational_test_case_count + 1)
                 ),
             )
             emitted = [False] * len(metrics_for_case)
@@ -405,15 +356,12 @@ def execute_test_cases(
             try:
 
                 def _run_case():
-                    nonlocal new_cached_test_case, current_index, llm_test_case_count, mllm_test_case_count, conversational_test_case_count
+                    nonlocal new_cached_test_case, current_index, llm_test_case_count, conversational_test_case_count
                     with capture_evaluation_run("test case"):
                         for metric in metrics:
                             metric.error = None  # Reset metric error
 
-                        if (
-                            isinstance(test_case, LLMTestCase)
-                            and not test_case.multimodal
-                        ):
+                        if isinstance(test_case, LLMTestCase):
                             llm_test_case_count += 1
                             cached_test_case = None
                             if cache_config.use_cache:
@@ -465,29 +413,6 @@ def execute_test_cases(
                                 )
                                 update_pbar(progress, pbar_test_case_id)
 
-                        # No caching and not sending test cases to Confident AI for multimodal metrics yet
-                        elif (
-                            isinstance(test_case, LLMTestCase)
-                            and test_case.multimodal
-                        ):
-                            mllm_test_case_count += 1
-                            for metric in mllm_metrics:
-                                current_index = index_of[id(metric)]
-                                res = _execute_metric(
-                                    metric=metric,
-                                    test_case=test_case,
-                                    show_metric_indicator=show_metric_indicator,
-                                    in_component=False,
-                                    error_config=error_config,
-                                )
-                                if res == "skip":
-                                    continue
-
-                                metric_data = create_metric_data(metric)
-                                api_test_case.update_metric_data(metric_data)
-                                emitted[current_index] = True
-                                update_pbar(progress, pbar_test_case_id)
-
                         # No caching for conversational metrics yet
                         elif isinstance(test_case, ConversationalTestCase):
                             conversational_test_case_count += 1
@@ -597,7 +522,6 @@ async def a_execute_test_cases(
     metrics: Union[
         List[BaseMetric],
         List[BaseConversationalMetric],
-        List[BaseMultimodalMetric],
     ],
     error_config: Optional[ErrorConfig] = ErrorConfig(),
     display_config: Optional[DisplayConfig] = DisplayConfig(),
@@ -631,20 +555,14 @@ async def a_execute_test_cases(
         metric.verbose_mode = display_config.verbose_mode
 
     llm_metrics: List[BaseMetric] = []
-    mllm_metrics: List[BaseMultimodalMetric] = []
     conversational_metrics: List[BaseConversationalMetric] = []
     for metric in metrics:
         if isinstance(metric, BaseMetric):
             llm_metrics.append(metric)
-            if type(metric) in MLLM_SUPPORTED_METRICS:
-                mllm_metrics.append(metric)
-        elif isinstance(metric, BaseMultimodalMetric):
-            mllm_metrics.append(metric)
         elif isinstance(metric, BaseConversationalMetric):
             conversational_metrics.append(metric)
 
     llm_test_case_counter = -1
-    mllm_test_case_counter = -1
     conversational_test_case_counter = -1
     test_results: List[Union[TestResult, LLMTestCase]] = []
     tasks = []
@@ -665,10 +583,7 @@ async def a_execute_test_cases(
         with progress:
             for test_case in test_cases:
                 with capture_evaluation_run("test case"):
-                    if (
-                        isinstance(test_case, LLMTestCase)
-                        and not test_case.multimodal
-                    ):
+                    if isinstance(test_case, LLMTestCase):
                         if len(llm_metrics) == 0:
                             update_pbar(progress, pbar_id)
                             continue
@@ -696,31 +611,6 @@ async def a_execute_test_cases(
                         )
                         tasks.append(asyncio.create_task(task))
 
-                    elif (
-                        isinstance(test_case, LLMTestCase)
-                        and test_case.multimodal
-                    ):
-                        mllm_test_case_counter += 1
-                        copied_multimodal_metrics: List[
-                            BaseMultimodalMetric
-                        ] = copy_metrics(mllm_metrics)
-                        task = execute_with_semaphore(
-                            func=_a_execute_mllm_test_cases,
-                            metrics=copied_multimodal_metrics,
-                            test_case=test_case,
-                            test_run_manager=test_run_manager,
-                            test_results=test_results,
-                            count=mllm_test_case_counter,
-                            ignore_errors=error_config.ignore_errors,
-                            skip_on_missing_params=error_config.skip_on_missing_params,
-                            show_indicator=display_config.show_indicator,
-                            _use_bar_indicator=_use_bar_indicator,
-                            _is_assert_test=_is_assert_test,
-                            progress=progress,
-                            pbar_id=pbar_id,
-                        )
-                        tasks.append(asyncio.create_task(task))
-
                     elif isinstance(test_case, ConversationalTestCase):
                         conversational_test_case_counter += 1
 
@@ -763,10 +653,7 @@ async def a_execute_test_cases(
     else:
         for test_case in test_cases:
             with capture_evaluation_run("test case"):
-                if (
-                    isinstance(test_case, LLMTestCase)
-                    and not test_case.multimodal
-                ):
+                if isinstance(test_case, LLMTestCase):
                     if len(llm_metrics) == 0:
                         continue
                     llm_test_case_counter += 1
@@ -814,28 +701,6 @@ async def a_execute_test_cases(
                     )
                    tasks.append(asyncio.create_task((task)))
 
-                elif (
-                    isinstance(test_case, LLMTestCase) and test_case.multimodal
-                ):
-                    mllm_test_case_counter += 1
-                    copied_multimodal_metrics: List[BaseMultimodalMetric] = (
-                        copy_metrics(mllm_metrics)
-                    )
-                    task = execute_with_semaphore(
-                        func=_a_execute_mllm_test_cases,
-                        metrics=copied_multimodal_metrics,
-                        test_case=test_case,
-                        test_run_manager=test_run_manager,
-                        test_results=test_results,
-                        count=mllm_test_case_counter,
-                        ignore_errors=error_config.ignore_errors,
-                        skip_on_missing_params=error_config.skip_on_missing_params,
-                        _use_bar_indicator=_use_bar_indicator,
-                        _is_assert_test=_is_assert_test,
-                        show_indicator=display_config.show_indicator,
-                    )
-                    tasks.append(asyncio.create_task(task))
-
                 await asyncio.sleep(async_config.throttle_value)
 
     try:
@@ -976,85 +841,8 @@ async def _a_execute_llm_test_cases(
         update_pbar(progress, pbar_id)
 
 
-async def _a_execute_mllm_test_cases(
-    metrics: List[BaseMultimodalMetric],
-    test_case: LLMTestCase,
-    test_run_manager: TestRunManager,
-    test_results: List[Union[TestResult, LLMTestCase]],
-    count: int,
-    ignore_errors: bool,
-    skip_on_missing_params: bool,
-    show_indicator: bool,
-    _use_bar_indicator: bool,
-    _is_assert_test: bool,
-    progress: Optional[Progress] = None,
-    pbar_id: Optional[int] = None,
-):
-    show_metrics_indicator = show_indicator and not _use_bar_indicator
-    pbar_test_case_id = add_pbar(
-        progress,
-        f" 🎯 Evaluating test case #{count}",
-        total=len(metrics),
-    )
-
-    for metric in metrics:
-        metric.skipped = False
-        metric.error = None  # Reset metric error
-
-    api_test_case: LLMApiTestCase = create_api_test_case(
-        test_case=test_case, index=count if not _is_assert_test else None
-    )
-    test_start_time = time.perf_counter()
-    try:
-        await measure_metrics_with_indicator(
-            metrics=metrics,
-            test_case=test_case,
-            cached_test_case=None,
-            skip_on_missing_params=skip_on_missing_params,
-            ignore_errors=ignore_errors,
-            show_indicator=show_metrics_indicator,
-            pbar_eval_id=pbar_test_case_id,
-            progress=progress,
-        )
-    except asyncio.CancelledError:
-        msg = (
-            "Timed out/cancelled while evaluating metric. "
-            "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
-            "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
-        )
-        for m in metrics:
-            if getattr(m, "skipped", False):
-                continue
-            # If the task never finished and didn't set a terminal state, mark it now
-            if getattr(m, "success", None) is None and not getattr(
-                m, "error", None
-            ):
-                m.success = False
-                m.error = msg
-        if not ignore_errors:
-            raise
-    finally:
-        for metric in metrics:
-            if metric.skipped:
-                continue
-
-            metric_data = create_metric_data(metric)
-            api_test_case.update_metric_data(metric_data)
-
-        test_end_time = time.perf_counter()
-        run_duration = test_end_time - test_start_time
-        api_test_case.update_run_duration(run_duration)
-
-        ### Update Test Run ###
-        test_run_manager.update_test_run(api_test_case, test_case)
-        test_results.append(create_test_result(api_test_case))
-        update_pbar(progress, pbar_id)
-
-
 async def _a_execute_conversational_test_cases(
-    metrics: List[
-        Union[BaseMetric, BaseMultimodalMetric, BaseConversationalMetric]
-    ],
+    metrics: List[Union[BaseMetric, BaseConversationalMetric]],
     test_case: ConversationalTestCase,
     test_run_manager: TestRunManager,
     test_results: List[Union[TestResult, LLMTestCase]],
deepeval/evaluate/utils.py
CHANGED

@@ -11,7 +11,6 @@ from deepeval.metrics import (
     ArenaGEval,
     BaseMetric,
     BaseConversationalMetric,
-    BaseMultimodalMetric,
 )
 from deepeval.test_case import (
     LLMTestCase,
@@ -218,9 +217,9 @@ def validate_assert_test_inputs(
         )
 
     if test_case and metrics:
-        if (
-            isinstance(test_case, LLMTestCase) and not test_case.multimodal
-        ) and not all(isinstance(metric, BaseMetric) for metric in metrics):
+        if (isinstance(test_case, LLMTestCase)) and not all(
+            isinstance(metric, BaseMetric) for metric in metrics
+        ):
             raise ValueError(
                 "All 'metrics' for an 'LLMTestCase' must be instances of 'BaseMetric' only."
             )
@@ -230,18 +229,6 @@ def validate_assert_test_inputs(
             raise ValueError(
                 "All 'metrics' for an 'ConversationalTestCase' must be instances of 'BaseConversationalMetric' only."
             )
-        if (
-            isinstance(test_case, LLMTestCase) and test_case.multimodal
-        ) and not all(
-            (
-                isinstance(metric, BaseMultimodalMetric)
-                or isinstance(metric, BaseMetric)
-            )
-            for metric in metrics
-        ):
-            raise ValueError(
-                "All 'metrics' for multi-modal LLMTestCase must be instances of 'BaseMultimodalMetric' only."
-            )
 
     if not ((golden and observed_callback) or (test_case and metrics)):
         raise ValueError(
@@ -259,7 +246,6 @@ def validate_evaluate_inputs(
         Union[
             List[BaseMetric],
            List[BaseConversationalMetric],
-            List[BaseMultimodalMetric],
         ]
     ] = None,
     metric_collection: Optional[str] = None,
@@ -292,10 +278,9 @@ def validate_evaluate_inputs(
     if test_cases and metrics:
         for test_case in test_cases:
             for metric in metrics:
-                if (
-                    isinstance(test_case, LLMTestCase)
-                    and not test_case.multimodal
-                ) and not isinstance(metric, BaseMetric):
+                if (isinstance(test_case, LLMTestCase)) and not isinstance(
+                    metric, BaseMetric
+                ):
                     raise ValueError(
                         f"Metric {metric.__name__} is not a valid metric for LLMTestCase."
                     )
@@ -306,15 +291,6 @@ def validate_evaluate_inputs(
                    raise ValueError(
                        f"Metric {metric.__name__} is not a valid metric for ConversationalTestCase."
                    )
-                if (
-                    isinstance(test_case, LLMTestCase) and test_case.multimodal
-                ) and not (
-                    isinstance(metric, BaseMultimodalMetric)
-                    or isinstance(metric, BaseMetric)
-                ):
-                    raise ValueError(
-                        f"Metric {metric.__name__} is not a valid metric for multi-modal LLMTestCase."
-                    )
 
 
 def print_test_result(test_result: TestResult, display: TestRunResultDisplay):
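With the multimodal branch removed, input validation reduces to two pairings: an LLMTestCase may only carry BaseMetric instances, and a ConversationalTestCase only BaseConversationalMetric instances. A condensed illustration of the checks that remain (check_pairing is a hypothetical helper; the real logic lives in validate_assert_test_inputs and validate_evaluate_inputs):

# Condensed sketch of the surviving validation; the error messages are the ones
# shown in the diff above.
from deepeval.metrics import BaseMetric, BaseConversationalMetric
from deepeval.test_case import LLMTestCase, ConversationalTestCase

def check_pairing(test_case, metrics):
    if isinstance(test_case, LLMTestCase) and not all(
        isinstance(m, BaseMetric) for m in metrics
    ):
        raise ValueError(
            "All 'metrics' for an 'LLMTestCase' must be instances of 'BaseMetric' only."
        )
    if isinstance(test_case, ConversationalTestCase) and not all(
        isinstance(m, BaseConversationalMetric) for m in metrics
    ):
        raise ValueError(
            "All 'metrics' for an 'ConversationalTestCase' must be instances of 'BaseConversationalMetric' only."
        )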
deepeval/key_handler.py
CHANGED

@@ -99,7 +99,10 @@ class ModelKeyValues(Enum):
 class EmbeddingKeyValues(Enum):
     # Azure OpenAI
     USE_AZURE_OPENAI_EMBEDDING = "USE_AZURE_OPENAI_EMBEDDING"
+    # Azure OpenAI
+    AZURE_EMBEDDING_MODEL_NAME = "AZURE_EMBEDDING_MODEL_NAME"
     AZURE_EMBEDDING_DEPLOYMENT_NAME = "AZURE_EMBEDDING_DEPLOYMENT_NAME"
+
     # Local
     USE_LOCAL_EMBEDDINGS = "USE_LOCAL_EMBEDDINGS"
     LOCAL_EMBEDDING_MODEL_NAME = "LOCAL_EMBEDDING_MODEL_NAME"
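The only change here is a new EmbeddingKeyValues member for the Azure embedding model name, sitting alongside the existing deployment-name key. A small usage sketch, assuming nothing beyond the enum definition shown above:

# The enum members map to plain string keys; in 3.7.6 the Azure embedding model
# name is keyed separately from the deployment name.
from deepeval.key_handler import EmbeddingKeyValues

print(EmbeddingKeyValues.AZURE_EMBEDDING_MODEL_NAME.value)       # "AZURE_EMBEDDING_MODEL_NAME"
print(EmbeddingKeyValues.AZURE_EMBEDDING_DEPLOYMENT_NAME.value)  # "AZURE_EMBEDDING_DEPLOYMENT_NAME"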
deepeval/metrics/__init__.py
CHANGED

@@ -1,7 +1,6 @@
 from .base_metric import (
     BaseMetric,
     BaseConversationalMetric,
-    BaseMultimodalMetric,
     BaseArenaMetric,
 )
 
@@ -65,7 +64,6 @@ from .multimodal_metrics import (
     ImageCoherenceMetric,
     ImageHelpfulnessMetric,
     ImageReferenceMetric,
-    MultimodalGEval,
 )
 
 
@@ -73,7 +71,6 @@ __all__ = [
     # Base classes
     "BaseMetric",
     "BaseConversationalMetric",
-    "BaseMultimodalMetric",
     "BaseArenaMetric",
     # Non-LLM metrics
     "ExactMatchMetric",
@@ -133,5 +130,4 @@ __all__ = [
     "ImageCoherenceMetric",
     "ImageHelpfulnessMetric",
     "ImageReferenceMetric",
-    "MultimodalGEval",
 ]