deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +139 -2
- deepeval/evaluate/evaluate.py +16 -11
- deepeval/evaluate/execute.py +13 -181
- deepeval/evaluate/utils.py +6 -26
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +14 -16
- deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
- deepeval/metrics/answer_relevancy/template.py +22 -3
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -44
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +16 -2
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +138 -149
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +7 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -19
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +158 -122
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +49 -33
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +79 -33
- deepeval/models/embedding_models/local_embedding_model.py +39 -20
- deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
- deepeval/models/embedding_models/openai_embedding_model.py +42 -22
- deepeval/models/llms/amazon_bedrock_model.py +226 -72
- deepeval/models/llms/anthropic_model.py +178 -63
- deepeval/models/llms/azure_model.py +218 -60
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +95 -40
- deepeval/models/llms/gemini_model.py +209 -64
- deepeval/models/llms/grok_model.py +139 -68
- deepeval/models/llms/kimi_model.py +140 -90
- deepeval/models/llms/litellm_model.py +131 -37
- deepeval/models/llms/local_model.py +125 -21
- deepeval/models/llms/ollama_model.py +147 -24
- deepeval/models/llms/openai_model.py +222 -269
- deepeval/models/llms/portkey_model.py +81 -22
- deepeval/models/llms/utils.py +8 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +106 -5
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +26 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +68 -1
- deepeval/test_case/llm_test_case.py +206 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +18 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/evaluate/execute.py
CHANGED

@@ -56,7 +56,6 @@ from deepeval.telemetry import capture_evaluation_run
 from deepeval.metrics import (
     BaseMetric,
     BaseConversationalMetric,
-    BaseMultimodalMetric,
     TaskCompletionMetric,
 )
 from deepeval.metrics.indicator import (
@@ -70,7 +69,6 @@ from deepeval.models.retry_policy import (
 from deepeval.test_case import (
     LLMTestCase,
     ConversationalTestCase,
-    MLLMTestCase,
 )
 from deepeval.test_case.api import create_api_test_case
 from deepeval.test_run import (
@@ -263,13 +261,10 @@ async def _await_with_outer_deadline(obj, *args, timeout: float, **kwargs):


 def execute_test_cases(
-    test_cases: Union[
-        List[LLMTestCase], List[ConversationalTestCase], List[MLLMTestCase]
-    ],
+    test_cases: Union[List[LLMTestCase], List[ConversationalTestCase]],
     metrics: Union[
         List[BaseMetric],
         List[BaseConversationalMetric],
-        List[BaseMultimodalMetric],
     ],
     error_config: Optional[ErrorConfig] = ErrorConfig(),
     display_config: Optional[DisplayConfig] = DisplayConfig(),
@@ -302,15 +297,12 @@ def execute_test_cases(

     conversational_metrics: List[BaseConversationalMetric] = []
     llm_metrics: List[BaseMetric] = []
-    mllm_metrics: List[BaseMultimodalMetric] = []
     for metric in metrics:
         metric.async_mode = False
         if isinstance(metric, BaseMetric):
             llm_metrics.append(metric)
         elif isinstance(metric, BaseConversationalMetric):
             conversational_metrics.append(metric)
-        elif isinstance(metric, BaseMultimodalMetric):
-            mllm_metrics.append(metric)

     test_results: List[TestResult] = []

@@ -318,7 +310,6 @@
        progress: Optional[Progress] = None, pbar_id: Optional[int] = None
    ):
        llm_test_case_count = -1
-        mllm_test_case_count = -1
        conversational_test_case_count = -1
        show_metric_indicator = (
            display_config.show_indicator and not _use_bar_indicator
@@ -330,11 +321,6 @@
                    update_pbar(progress, pbar_id)
                    continue
                per_case_total = len(llm_metrics)
-            elif isinstance(test_case, MLLMTestCase):
-                if not mllm_metrics:
-                    update_pbar(progress, pbar_id)
-                    continue
-                per_case_total = len(mllm_metrics)
            elif isinstance(test_case, ConversationalTestCase):
                if not conversational_metrics:
                    update_pbar(progress, pbar_id)
@@ -349,23 +335,15 @@

            metrics_for_case = (
                llm_metrics
-                if isinstance(test_case, LLMTestCase)
-                else (
-                    mllm_metrics
-                    if isinstance(test_case, MLLMTestCase)
-                    else conversational_metrics
-                )
+                if (isinstance(test_case, LLMTestCase))
+                else conversational_metrics
            )
            api_test_case = create_api_test_case(
                test_case=test_case,
                index=(
                    llm_test_case_count + 1
-                    if isinstance(test_case, LLMTestCase)
-                    else (
-                        mllm_test_case_count + 1
-                        if isinstance(test_case, MLLMTestCase)
-                        else conversational_test_case_count + 1
-                    )
+                    if (isinstance(test_case, LLMTestCase))
+                    else (conversational_test_case_count + 1)
                ),
            )
            emitted = [False] * len(metrics_for_case)
@@ -378,7 +356,7 @@
            try:

                def _run_case():
-                    nonlocal new_cached_test_case, current_index, llm_test_case_count, mllm_test_case_count, conversational_test_case_count
+                    nonlocal new_cached_test_case, current_index, llm_test_case_count, conversational_test_case_count
                    with capture_evaluation_run("test case"):
                        for metric in metrics:
                            metric.error = None  # Reset metric error
@@ -435,26 +413,6 @@
                            )
                            update_pbar(progress, pbar_test_case_id)

-                    # No caching and not sending test cases to Confident AI for multimodal metrics yet
-                    elif isinstance(test_case, MLLMTestCase):
-                        mllm_test_case_count += 1
-                        for metric in mllm_metrics:
-                            current_index = index_of[id(metric)]
-                            res = _execute_metric(
-                                metric=metric,
-                                test_case=test_case,
-                                show_metric_indicator=show_metric_indicator,
-                                in_component=False,
-                                error_config=error_config,
-                            )
-                            if res == "skip":
-                                continue
-
-                            metric_data = create_metric_data(metric)
-                            api_test_case.update_metric_data(metric_data)
-                            emitted[current_index] = True
-                            update_pbar(progress, pbar_test_case_id)
-
                    # No caching for conversational metrics yet
                    elif isinstance(test_case, ConversationalTestCase):
                        conversational_test_case_count += 1
@@ -560,13 +518,10 @@


 async def a_execute_test_cases(
-    test_cases: Union[
-        List[LLMTestCase], List[ConversationalTestCase], List[MLLMTestCase]
-    ],
+    test_cases: Union[List[LLMTestCase], List[ConversationalTestCase]],
     metrics: Union[
         List[BaseMetric],
         List[BaseConversationalMetric],
-        List[BaseMultimodalMetric],
     ],
     error_config: Optional[ErrorConfig] = ErrorConfig(),
     display_config: Optional[DisplayConfig] = DisplayConfig(),
@@ -600,20 +555,16 @@ async def a_execute_test_cases(
         metric.verbose_mode = display_config.verbose_mode

     llm_metrics: List[BaseMetric] = []
-    mllm_metrics: List[BaseMultimodalMetric] = []
     conversational_metrics: List[BaseConversationalMetric] = []
     for metric in metrics:
         if isinstance(metric, BaseMetric):
             llm_metrics.append(metric)
-        elif isinstance(metric, BaseMultimodalMetric):
-            mllm_metrics.append(metric)
         elif isinstance(metric, BaseConversationalMetric):
             conversational_metrics.append(metric)

     llm_test_case_counter = -1
-    mllm_test_case_counter = -1
     conversational_test_case_counter = -1
-    test_results: List[Union[TestResult,
+    test_results: List[Union[TestResult, LLMTestCase]] = []
     tasks = []

     if display_config.show_indicator and _use_bar_indicator:
@@ -660,28 +611,6 @@
                )
                tasks.append(asyncio.create_task(task))

-            elif isinstance(test_case, MLLMTestCase):
-                mllm_test_case_counter += 1
-                copied_multimodal_metrics: List[
-                    BaseMultimodalMetric
-                ] = copy_metrics(mllm_metrics)
-                task = execute_with_semaphore(
-                    func=_a_execute_mllm_test_cases,
-                    metrics=copied_multimodal_metrics,
-                    test_case=test_case,
-                    test_run_manager=test_run_manager,
-                    test_results=test_results,
-                    count=mllm_test_case_counter,
-                    ignore_errors=error_config.ignore_errors,
-                    skip_on_missing_params=error_config.skip_on_missing_params,
-                    show_indicator=display_config.show_indicator,
-                    _use_bar_indicator=_use_bar_indicator,
-                    _is_assert_test=_is_assert_test,
-                    progress=progress,
-                    pbar_id=pbar_id,
-                )
-                tasks.append(asyncio.create_task(task))
-
            elif isinstance(test_case, ConversationalTestCase):
                conversational_test_case_counter += 1

@@ -772,26 +701,6 @@
                )
                tasks.append(asyncio.create_task((task)))

-            elif isinstance(test_case, MLLMTestCase):
-                mllm_test_case_counter += 1
-                copied_multimodal_metrics: List[BaseMultimodalMetric] = (
-                    copy_metrics(mllm_metrics)
-                )
-                task = execute_with_semaphore(
-                    func=_a_execute_mllm_test_cases,
-                    metrics=copied_multimodal_metrics,
-                    test_case=test_case,
-                    test_run_manager=test_run_manager,
-                    test_results=test_results,
-                    count=mllm_test_case_counter,
-                    ignore_errors=error_config.ignore_errors,
-                    skip_on_missing_params=error_config.skip_on_missing_params,
-                    _use_bar_indicator=_use_bar_indicator,
-                    _is_assert_test=_is_assert_test,
-                    show_indicator=display_config.show_indicator,
-                )
-                tasks.append(asyncio.create_task(task))
-
            await asyncio.sleep(async_config.throttle_value)

            try:
@@ -815,7 +724,7 @@ async def _a_execute_llm_test_cases(
     metrics: List[BaseMetric],
     test_case: LLMTestCase,
     test_run_manager: TestRunManager,
-    test_results: List[Union[TestResult,
+    test_results: List[Union[TestResult, LLMTestCase]],
     count: int,
     test_run: TestRun,
     ignore_errors: bool,
@@ -932,88 +841,11 @@ async def _a_execute_llm_test_cases(
         update_pbar(progress, pbar_id)


-async def _a_execute_mllm_test_cases(
-    metrics: List[BaseMultimodalMetric],
-    test_case: MLLMTestCase,
-    test_run_manager: TestRunManager,
-    test_results: List[Union[TestResult, MLLMTestCase]],
-    count: int,
-    ignore_errors: bool,
-    skip_on_missing_params: bool,
-    show_indicator: bool,
-    _use_bar_indicator: bool,
-    _is_assert_test: bool,
-    progress: Optional[Progress] = None,
-    pbar_id: Optional[int] = None,
-):
-    show_metrics_indicator = show_indicator and not _use_bar_indicator
-    pbar_test_case_id = add_pbar(
-        progress,
-        f" 🎯 Evaluating test case #{count}",
-        total=len(metrics),
-    )
-
-    for metric in metrics:
-        metric.skipped = False
-        metric.error = None  # Reset metric error
-
-    api_test_case: LLMApiTestCase = create_api_test_case(
-        test_case=test_case, index=count if not _is_assert_test else None
-    )
-    test_start_time = time.perf_counter()
-    try:
-        await measure_metrics_with_indicator(
-            metrics=metrics,
-            test_case=test_case,
-            cached_test_case=None,
-            skip_on_missing_params=skip_on_missing_params,
-            ignore_errors=ignore_errors,
-            show_indicator=show_metrics_indicator,
-            pbar_eval_id=pbar_test_case_id,
-            progress=progress,
-        )
-    except asyncio.CancelledError:
-        msg = (
-            "Timed out/cancelled while evaluating metric. "
-            "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
-            "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
-        )
-        for m in metrics:
-            if getattr(m, "skipped", False):
-                continue
-            # If the task never finished and didn't set a terminal state, mark it now
-            if getattr(m, "success", None) is None and not getattr(
-                m, "error", None
-            ):
-                m.success = False
-                m.error = msg
-        if not ignore_errors:
-            raise
-    finally:
-        for metric in metrics:
-            if metric.skipped:
-                continue
-
-            metric_data = create_metric_data(metric)
-            api_test_case.update_metric_data(metric_data)
-
-        test_end_time = time.perf_counter()
-        run_duration = test_end_time - test_start_time
-        api_test_case.update_run_duration(run_duration)
-
-        ### Update Test Run ###
-        test_run_manager.update_test_run(api_test_case, test_case)
-        test_results.append(create_test_result(api_test_case))
-        update_pbar(progress, pbar_id)
-
-
 async def _a_execute_conversational_test_cases(
-    metrics: List[
-        Union[BaseMetric, BaseMultimodalMetric, BaseConversationalMetric]
-    ],
+    metrics: List[Union[BaseMetric, BaseConversationalMetric]],
     test_case: ConversationalTestCase,
     test_run_manager: TestRunManager,
-    test_results: List[Union[TestResult,
+    test_results: List[Union[TestResult, LLMTestCase]],
     count: int,
     ignore_errors: bool,
     skip_on_missing_params: bool,
@@ -1776,7 +1608,7 @@ async def a_execute_agentic_test_cases(
 async def _a_execute_agentic_test_case(
     golden: Golden,
     test_run_manager: TestRunManager,
-    test_results: List[Union[TestResult,
+    test_results: List[Union[TestResult, LLMTestCase]],
     count: int,
     verbose_mode: Optional[bool],
     ignore_errors: bool,
@@ -3205,7 +3037,7 @@ async def _evaluate_test_case_pairs(

 def _execute_metric(
     metric: BaseMetric,
-    test_case: Union[LLMTestCase, ConversationalTestCase
+    test_case: Union[LLMTestCase, ConversationalTestCase],
     show_metric_indicator: bool,
     in_component: bool,
     error_config: ErrorConfig,
deepeval/evaluate/utils.py
CHANGED

@@ -11,12 +11,10 @@ from deepeval.metrics import (
     ArenaGEval,
     BaseMetric,
     BaseConversationalMetric,
-    BaseMultimodalMetric,
 )
 from deepeval.test_case import (
     LLMTestCase,
     ConversationalTestCase,
-    MLLMTestCase,
 )
 from deepeval.test_run import (
     LLMApiTestCase,
@@ -129,17 +127,14 @@ def create_test_result(
             turns=api_test_case.turns,
         )
     else:
-        multimodal = (
-            api_test_case.multimodal_input is not None
-            and api_test_case.multimodal_input_actual_output is not None
-        )
+        multimodal = api_test_case.images_mapping
         if multimodal:
             return TestResult(
                 name=name,
                 success=api_test_case.success,
                 metrics_data=api_test_case.metrics_data,
-                input=api_test_case.multimodal_input,
-                actual_output=api_test_case.multimodal_input_actual_output,
+                input=api_test_case.input,
+                actual_output=api_test_case.actual_output,
                 conversational=False,
                 multimodal=True,
                 additional_metadata=api_test_case.additional_metadata,
@@ -222,7 +217,7 @@ def validate_assert_test_inputs(
        )

    if test_case and metrics:
-        if isinstance(test_case, LLMTestCase) and not all(
+        if (isinstance(test_case, LLMTestCase)) and not all(
            isinstance(metric, BaseMetric) for metric in metrics
        ):
            raise ValueError(
@@ -234,12 +229,6 @@ def validate_assert_test_inputs(
            raise ValueError(
                "All 'metrics' for an 'ConversationalTestCase' must be instances of 'BaseConversationalMetric' only."
            )
-        if isinstance(test_case, MLLMTestCase) and not all(
-            isinstance(metric, BaseMultimodalMetric) for metric in metrics
-        ):
-            raise ValueError(
-                "All 'metrics' for an 'MLLMTestCase' must be instances of 'BaseMultimodalMetric' only."
-            )

    if not ((golden and observed_callback) or (test_case and metrics)):
        raise ValueError(
@@ -251,15 +240,12 @@ def validate_evaluate_inputs(
    goldens: Optional[List] = None,
    observed_callback: Optional[Callable] = None,
    test_cases: Optional[
-        Union[
-            List[LLMTestCase], List[ConversationalTestCase], List[MLLMTestCase]
-        ]
+        Union[List[LLMTestCase], List[ConversationalTestCase]]
    ] = None,
    metrics: Optional[
        Union[
            List[BaseMetric],
            List[BaseConversationalMetric],
-            List[BaseMultimodalMetric],
        ]
    ] = None,
    metric_collection: Optional[str] = None,
@@ -292,7 +278,7 @@ def validate_evaluate_inputs(
    if test_cases and metrics:
        for test_case in test_cases:
            for metric in metrics:
-                if isinstance(test_case, LLMTestCase) and not isinstance(
+                if (isinstance(test_case, LLMTestCase)) and not isinstance(
                    metric, BaseMetric
                ):
                    raise ValueError(
@@ -305,12 +291,6 @@ def validate_evaluate_inputs(
                    raise ValueError(
                        f"Metric {metric.__name__} is not a valid metric for ConversationalTestCase."
                    )
-                if isinstance(test_case, MLLMTestCase) and not isinstance(
-                    metric, BaseMultimodalMetric
-                ):
-                    raise ValueError(
-                        f"Metric {metric.__name__} is not a valid metric for MLLMTestCase."
-                    )


 def print_test_result(test_result: TestResult, display: TestRunResultDisplay):
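With the multimodal branches removed, the validation above leaves two pairings: LLMTestCase with BaseMetric subclasses, and ConversationalTestCase with BaseConversationalMetric subclasses. A rough sketch of what now passes and fails input validation (assumes a configured judge model and API key for the valid call; the literal strings and Turn fields are illustrative, not taken from this diff):

from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric, TurnRelevancyMetric
from deepeval.test_case import ConversationalTestCase, LLMTestCase, Turn

# Valid: a single-turn test case paired with a single-turn (BaseMetric) metric.
evaluate(
    test_cases=[
        LLMTestCase(input="What is DeepEval?", actual_output="An LLM eval framework.")
    ],
    metrics=[AnswerRelevancyMetric()],
)

# Invalid after 3.7.6: a ConversationalTestCase paired with a single-turn metric
# raises ValueError during input validation, and the former MLLMTestCase /
# BaseMultimodalMetric pairing no longer exists at all. Use a conversational
# metric such as TurnRelevancyMetric() instead.
evaluate(
    test_cases=[ConversationalTestCase(turns=[Turn(role="user", content="hi")])],
    metrics=[AnswerRelevancyMetric()],
)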
deepeval/integrations/pydantic_ai/agent.py
CHANGED

@@ -1,12 +1,29 @@
 import warnings
+from typing import TYPE_CHECKING, Any

 try:
-    from pydantic_ai.agent import Agent
+    from pydantic_ai.agent import Agent as _BaseAgent

     is_pydantic_ai_installed = True
-except:
+except ImportError:
     is_pydantic_ai_installed = False

+    class _BaseAgent:
+        """Dummy fallback so imports don't crash when pydantic-ai is missing."""
+
+        def __init__(self, *args: Any, **kwargs: Any) -> None:
+            # No-op: for compatibility
+            pass
+
+
+if TYPE_CHECKING:
+    # For type checkers: use the real Agent if available.
+    from pydantic_ai.agent import Agent  # type: ignore[unused-ignore]
+else:
+    # At runtime we always have some base: real Agent or our dummy.
+    # This is just to avoid blow-ups.
+    Agent = _BaseAgent
+

 class DeepEvalPydanticAIAgent(Agent):

deepeval/integrations/pydantic_ai/instrumentator.py
CHANGED

@@ -1,40 +1,58 @@
+from __future__ import annotations
+
 import json
 import logging
 import os
 from time import perf_counter
-from typing import
+from typing import Any, List, Optional, TYPE_CHECKING

 from deepeval.config.settings import get_settings
 from deepeval.confident.api import get_confident_api_key
 from deepeval.metrics.base_metric import BaseMetric
 from deepeval.prompt import Prompt
 from deepeval.tracing.context import current_trace_context
-from deepeval.tracing.types import Trace
-from deepeval.tracing.otel.utils import to_hex_string
-from deepeval.tracing.tracing import trace_manager
-from deepeval.tracing.otel.utils import normalize_pydantic_ai_messages
 from deepeval.tracing.otel.exporter import ConfidentSpanExporter
-
+from deepeval.tracing.otel.test_exporter import test_exporter
+from deepeval.tracing.otel.utils import (
+    normalize_pydantic_ai_messages,
+    to_hex_string,
+)
+from deepeval.tracing.perf_epoch_bridge import init_clock_bridge
+from deepeval.tracing.tracing import trace_manager
+from deepeval.tracing.types import (
+    AgentSpan,
+    Trace,
+    TraceSpanStatus,
+    ToolCall,
+)

 logger = logging.getLogger(__name__)

-
 try:
-
-    from opentelemetry.sdk.trace import
+    # Optional dependencies
+    from opentelemetry.sdk.trace import (
+        ReadableSpan as _ReadableSpan,
+        SpanProcessor as _SpanProcessor,
+        TracerProvider,
+    )
     from opentelemetry.sdk.trace.export import BatchSpanProcessor
     from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
         OTLPSpanExporter,
     )
-    from
+    from pydantic_ai.models.instrumented import (
+        InstrumentationSettings as _BaseInstrumentationSettings,
+    )

     dependency_installed = True
 except ImportError as e:
+    dependency_installed = False
+
+    # Preserve previous behavior: only log when verbose mode is enabled.
     if get_settings().DEEPEVAL_VERBOSE_MODE:
         if isinstance(e, ModuleNotFoundError):
             logger.warning(
                 "Optional tracing dependency not installed: %s",
-                e
+                getattr(e, "name", repr(e)),
                 stacklevel=2,
             )
         else:
@@ -43,26 +61,47 @@ except ImportError as e:
                 e,
                 stacklevel=2,
             )
-
+
+    # Dummy fallbacks so imports and class definitions don't crash when
+    # optional deps are missing. Actual use is still guarded by
+    # is_dependency_installed().
+    class _BaseInstrumentationSettings:
+        def __init__(self, *args: Any, **kwargs: Any) -> None:
+            pass
+
+    class _SpanProcessor:
+        def __init__(self, *args: Any, **kwargs: Any) -> None:
+            pass
+
+        def on_start(self, span: Any, parent_context: Any) -> None:
+            pass
+
+        def on_end(self, span: Any) -> None:
+            pass
+
+    class _ReadableSpan:
+        pass


-def is_dependency_installed():
+def is_dependency_installed() -> bool:
     if not dependency_installed:
         raise ImportError(
-            "Dependencies are not installed. Please install it with
+            "Dependencies are not installed. Please install it with "
+            "`pip install pydantic-ai opentelemetry-sdk "
+            "opentelemetry-exporter-otlp-proto-http`."
         )
     return True


-
-
-from
-from
-
-
-
-
-
+if TYPE_CHECKING:
+    # For type checkers, use real types
+    from opentelemetry.sdk.trace import ReadableSpan, SpanProcessor
+    from pydantic_ai.models.instrumented import InstrumentationSettings
+else:
+    # At runtime we always have something to subclass / annotate with
+    InstrumentationSettings = _BaseInstrumentationSettings
+    SpanProcessor = _SpanProcessor
+    ReadableSpan = _ReadableSpan

 # OTLP_ENDPOINT = "http://127.0.0.1:4318/v1/traces"
 OTLP_ENDPOINT = "https://otel.confident-ai.com/v1/traces"
deepeval/key_handler.py
CHANGED

@@ -99,7 +99,10 @@ class ModelKeyValues(Enum):
 class EmbeddingKeyValues(Enum):
     # Azure OpenAI
     USE_AZURE_OPENAI_EMBEDDING = "USE_AZURE_OPENAI_EMBEDDING"
+    # Azure OpenAI
+    AZURE_EMBEDDING_MODEL_NAME = "AZURE_EMBEDDING_MODEL_NAME"
     AZURE_EMBEDDING_DEPLOYMENT_NAME = "AZURE_EMBEDDING_DEPLOYMENT_NAME"
+
     # Local
     USE_LOCAL_EMBEDDINGS = "USE_LOCAL_EMBEDDINGS"
     LOCAL_EMBEDDING_MODEL_NAME = "LOCAL_EMBEDDING_MODEL_NAME"
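EmbeddingKeyValues gains AZURE_EMBEDDING_MODEL_NAME alongside the existing deployment-name key. The members are plain Enum values whose string .value presumably doubles as the lookup key for deepeval's key handling; how the key is resolved into a model name internally is not shown in this diff. A quick check of the new member:

from deepeval.key_handler import EmbeddingKeyValues

# New in 3.7.6: a dedicated key for the Azure embedding model name,
# separate from AZURE_EMBEDDING_DEPLOYMENT_NAME.
print(EmbeddingKeyValues.AZURE_EMBEDDING_MODEL_NAME.value)
# -> "AZURE_EMBEDDING_MODEL_NAME"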
deepeval/metrics/__init__.py
CHANGED

@@ -1,7 +1,6 @@
 from .base_metric import (
     BaseMetric,
     BaseConversationalMetric,
-    BaseMultimodalMetric,
     BaseArenaMetric,
 )

@@ -42,6 +41,16 @@ from .mcp_use_metric.mcp_use_metric import MCPUseMetric
 from .turn_relevancy.turn_relevancy import (
     TurnRelevancyMetric,
 )
+from .turn_faithfulness.turn_faithfulness import TurnFaithfulnessMetric
+from .turn_contextual_precision.turn_contextual_precision import (
+    TurnContextualPrecisionMetric,
+)
+from .turn_contextual_recall.turn_contextual_recall import (
+    TurnContextualRecallMetric,
+)
+from .turn_contextual_relevancy.turn_contextual_relevancy import (
+    TurnContextualRelevancyMetric,
+)
 from .conversation_completeness.conversation_completeness import (
     ConversationCompletenessMetric,
 )
@@ -55,13 +64,6 @@ from .multimodal_metrics import (
     ImageCoherenceMetric,
     ImageHelpfulnessMetric,
     ImageReferenceMetric,
-    MultimodalContextualRecallMetric,
-    MultimodalContextualRelevancyMetric,
-    MultimodalContextualPrecisionMetric,
-    MultimodalAnswerRelevancyMetric,
-    MultimodalFaithfulnessMetric,
-    MultimodalToolCorrectnessMetric,
-    MultimodalGEval,
 )


@@ -69,7 +71,6 @@ __all__ = [
     # Base classes
     "BaseMetric",
     "BaseConversationalMetric",
-    "BaseMultimodalMetric",
     "BaseArenaMetric",
     # Non-LLM metrics
     "ExactMatchMetric",
@@ -119,17 +120,14 @@ __all__ = [
     # Conversational metrics
     "TurnRelevancyMetric",
     "ConversationCompletenessMetric",
+    "TurnFaithfulnessMetric",
+    "TurnContextualPrecisionMetric",
+    "TurnContextualRecallMetric",
+    "TurnContextualRelevancyMetric",
     # Multimodal metrics
     "TextToImageMetric",
     "ImageEditingMetric",
     "ImageCoherenceMetric",
     "ImageHelpfulnessMetric",
     "ImageReferenceMetric",
-    "MultimodalContextualRecallMetric",
-    "MultimodalContextualRelevancyMetric",
-    "MultimodalContextualPrecisionMetric",
-    "MultimodalAnswerRelevancyMetric",
-    "MultimodalFaithfulnessMetric",
-    "MultimodalToolCorrectnessMetric",
-    "MultimodalGEval",
 ]
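The four new turn-level RAG metrics are exported as conversational metrics, so they pair with ConversationalTestCase rather than LLMTestCase. A rough usage sketch, assuming default constructor arguments and that any turn-level retrieval context the metrics require is attached elsewhere (the exact Turn fields and required parameters are not part of this diff):

from deepeval.metrics import (
    TurnContextualPrecisionMetric,
    TurnContextualRecallMetric,
    TurnContextualRelevancyMetric,
    TurnFaithfulnessMetric,
)
from deepeval.test_case import ConversationalTestCase, Turn

convo = ConversationalTestCase(
    turns=[
        Turn(role="user", content="When does my order arrive?"),
        Turn(role="assistant", content="It shipped yesterday and arrives Friday."),
    ]
)

metrics = [
    TurnFaithfulnessMetric(),
    TurnContextualPrecisionMetric(),
    TurnContextualRecallMetric(),
    TurnContextualRelevancyMetric(),
]

# Each of these subclasses BaseConversationalMetric, so the pairing rules in
# evaluate()/assert_test accept them alongside a ConversationalTestCase, e.g.
#   evaluate(test_cases=[convo], metrics=metrics)
# (running that requires a configured judge model and, for the contextual
# metrics, retrieval context on the relevant turns).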