deepeval-3.7.4-py3-none-any.whl → deepeval-3.7.6-py3-none-any.whl
This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +139 -2
- deepeval/evaluate/evaluate.py +16 -11
- deepeval/evaluate/execute.py +13 -181
- deepeval/evaluate/utils.py +6 -26
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +14 -16
- deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
- deepeval/metrics/answer_relevancy/template.py +22 -3
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -44
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +16 -2
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +138 -149
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +7 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -19
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +158 -122
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +49 -33
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +79 -33
- deepeval/models/embedding_models/local_embedding_model.py +39 -20
- deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
- deepeval/models/embedding_models/openai_embedding_model.py +42 -22
- deepeval/models/llms/amazon_bedrock_model.py +226 -72
- deepeval/models/llms/anthropic_model.py +178 -63
- deepeval/models/llms/azure_model.py +218 -60
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +95 -40
- deepeval/models/llms/gemini_model.py +209 -64
- deepeval/models/llms/grok_model.py +139 -68
- deepeval/models/llms/kimi_model.py +140 -90
- deepeval/models/llms/litellm_model.py +131 -37
- deepeval/models/llms/local_model.py +125 -21
- deepeval/models/llms/ollama_model.py +147 -24
- deepeval/models/llms/openai_model.py +222 -269
- deepeval/models/llms/portkey_model.py +81 -22
- deepeval/models/llms/utils.py +8 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +106 -5
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +26 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +68 -1
- deepeval/test_case/llm_test_case.py +206 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +18 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py (removed)

@@ -1,386 +0,0 @@
-"""LLM evaluated metric based on the GEval framework: https://arxiv.org/pdf/2303.16634.pdf"""
-
-from typing import Optional, List, Tuple, Type, Union
-from deepeval.models import DeepEvalBaseMLLM
-from deepeval.metrics import BaseMultimodalMetric
-from deepeval.test_case import (
-    MLLMTestCaseParams,
-    MLLMTestCase,
-)
-from deepeval.metrics.multimodal_metrics.multimodal_g_eval.template import (
-    MultimodalGEvalTemplate,
-)
-from deepeval.metrics.multimodal_metrics.multimodal_g_eval.schema import (
-    Steps,
-    ReasonScore,
-)
-from deepeval.utils import get_or_create_event_loop, prettify_list
-from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.metrics.utils import (
-    initialize_multimodal_model,
-    check_mllm_test_case_params,
-    construct_verbose_logs,
-    trimAndLoadJson,
-)
-from deepeval.metrics.multimodal_metrics.multimodal_g_eval.utils import (
-    construct_test_case_list,
-    no_multimodal_log_prob_support,
-    construct_g_eval_params_string,
-)
-from deepeval.metrics.g_eval.utils import (
-    Rubric,
-    format_rubrics,
-    calculate_weighted_summed_score,
-    validate_and_sort_rubrics,
-    validate_criteria_and_evaluation_steps,
-    number_evaluation_steps,
-    get_score_range,
-)
-
-
-class MultimodalGEval(BaseMultimodalMetric):
-    def __init__(
-        self,
-        name: str,
-        evaluation_params: List[MLLMTestCaseParams],
-        criteria: Optional[str] = None,
-        evaluation_steps: Optional[List[str]] = None,
-        rubric: Optional[List[Rubric]] = None,
-        model: Optional[Union[str, DeepEvalBaseMLLM]] = None,
-        threshold: float = 0.5,
-        top_logprobs: int = 20,
-        async_mode: bool = True,
-        strict_mode: bool = False,
-        verbose_mode: bool = False,
-        evaluation_template: Type[
-            MultimodalGEvalTemplate
-        ] = MultimodalGEvalTemplate,
-        _include_g_eval_suffix: bool = True,
-    ):
-        validate_criteria_and_evaluation_steps(criteria, evaluation_steps)
-        self.name = name
-        self.evaluation_params = evaluation_params
-        self.criteria = criteria
-        self.rubric = validate_and_sort_rubrics(rubric)
-        self.model, self.using_native_model = initialize_multimodal_model(model)
-        self.evaluation_model = self.model.get_model_name()
-        self.evaluation_steps = (
-            evaluation_steps
-            if evaluation_steps and len(evaluation_steps) > 0
-            else None
-        )
-        self.threshold = 1 if strict_mode else threshold
-        self.top_logprobs = top_logprobs
-        self.strict_mode = strict_mode
-        self.async_mode = async_mode
-        self.verbose_mode = verbose_mode
-        self._include_g_eval_suffix = _include_g_eval_suffix
-        self.evaluation_template = evaluation_template
-
-    def measure(
-        self,
-        test_case: MLLMTestCase,
-        _show_indicator: bool = True,
-        _in_component: bool = False,
-        _log_metric_to_confident: bool = True,
-        _additional_context: Optional[str] = None,
-    ) -> float:
-
-        check_mllm_test_case_params(
-            test_case, self.evaluation_params, None, None, self
-        )
-
-        self.evaluation_cost = 0 if self.using_native_model else None
-        with metric_progress_indicator(
-            self, _show_indicator=_show_indicator, _in_component=_in_component
-        ):
-            if self.async_mode:
-                loop = get_or_create_event_loop()
-                loop.run_until_complete(
-                    self.a_measure(
-                        test_case,
-                        _show_indicator=False,
-                        _in_component=_in_component,
-                        _log_metric_to_confident=_log_metric_to_confident,
-                        _additional_context=_additional_context,
-                    )
-                )
-            else:
-                self.evaluation_steps: List[str] = (
-                    self._generate_evaluation_steps()
-                )
-                g_score, reason = self._evaluate(
-                    test_case, _additional_context=_additional_context
-                )
-                self.reason = reason
-                self.score = float(g_score) / 10
-                self.score = (
-                    0
-                    if self.strict_mode and self.score < self.threshold
-                    else self.score
-                )
-                self.success = self.score >= self.threshold
-                self.verbose_logs = construct_verbose_logs(
-                    self,
-                    steps=[
-                        f"Criteria:\n{self.criteria}",
-                        f"Evaluation Steps:\n{prettify_list(self.evaluation_steps)}",
-                        f"Rubric:\n{format_rubrics(self.rubric)}",
-                        f"Score: {self.score}\nReason: {self.reason}",
-                    ],
-                )
-
-            return self.score
-
-    async def a_measure(
-        self,
-        test_case: MLLMTestCase,
-        _show_indicator: bool = True,
-        _in_component: bool = False,
-        _additional_context: Optional[str] = None,
-        _log_metric_to_confident: bool = True,
-    ) -> float:
-
-        check_mllm_test_case_params(
-            test_case, self.evaluation_params, None, None, self
-        )
-
-        self.evaluation_cost = 0 if self.using_native_model else None
-        with metric_progress_indicator(
-            self,
-            async_mode=True,
-            _show_indicator=_show_indicator,
-            _in_component=_in_component,
-        ):
-            self.evaluation_steps: List[str] = (
-                await self._a_generate_evaluation_steps()
-            )
-            g_score, reason = await self._a_evaluate(
-                test_case, _additional_context=_additional_context
-            )
-            self.reason = reason
-            self.score = (
-                float(g_score) / 10 if not self.strict_mode else int(g_score)
-            )
-            self.success = self.score >= self.threshold
-            self.verbose_logs = construct_verbose_logs(
-                self,
-                steps=[
-                    f"Criteria:\n{self.criteria}",
-                    f"Evaluation Steps:\n{prettify_list(self.evaluation_steps)}",
-                    f"Rubric:\n{format_rubrics(self.rubric)}",
-                    f"Score: {self.score}\nReason: {self.reason}",
-                ],
-            )
-            return self.score
-
-    async def _a_generate_evaluation_steps(self) -> List[str]:
-        if self.evaluation_steps:
-            return self.evaluation_steps
-
-        g_eval_params_str = construct_g_eval_params_string(
-            self.evaluation_params
-        )
-        prompt = self.evaluation_template.generate_evaluation_steps(
-            criteria=self.criteria, parameters=g_eval_params_str
-        )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate([prompt], schema=Steps)
-            self.evaluation_cost += cost
-            return res.steps
-        else:
-            try:
-                res: Steps = await self.model.a_generate([prompt], schema=Steps)
-                return res.steps
-            except TypeError:
-                res = await self.model.a_generate([prompt])
-                data = trimAndLoadJson(res, self)
-                return data["steps"]
-
-    def _generate_evaluation_steps(self) -> List[str]:
-        if self.evaluation_steps:
-            return self.evaluation_steps
-
-        g_eval_params_str = construct_g_eval_params_string(
-            self.evaluation_params
-        )
-        prompt = self.evaluation_template.generate_evaluation_steps(
-            criteria=self.criteria, parameters=g_eval_params_str
-        )
-        if self.using_native_model:
-            res, cost = self.model.generate([prompt], schema=Steps)
-            self.evaluation_cost += cost
-            return res.steps
-        else:
-            try:
-                res: Steps = self.model.generate([prompt], schema=Steps)
-                return res.steps
-            except TypeError:
-                res = self.model.generate([prompt])
-                data = trimAndLoadJson(res, self)
-                return data["steps"]
-
-    async def _a_evaluate(
-        self, test_case: MLLMTestCase, _additional_context: Optional[str] = None
-    ) -> Tuple[Union[int, float], str]:
-        test_case_list = construct_test_case_list(
-            self.evaluation_params, test_case
-        )
-        g_eval_params_str = construct_g_eval_params_string(
-            self.evaluation_params
-        )
-
-        if not self.strict_mode:
-            rubric_str = format_rubrics(self.rubric) if self.rubric else None
-            prompt = self.evaluation_template.generate_evaluation_results(
-                evaluation_steps=number_evaluation_steps(self.evaluation_steps),
-                test_case_list=test_case_list,
-                parameters=g_eval_params_str,
-                rubric=rubric_str,
-                score_range=get_score_range(self.rubric),
-                _additional_context=_additional_context,
-            )
-        else:
-            prompt = (
-                self.evaluation_template.generate_strict_evaluation_results(
-                    evaluation_steps=number_evaluation_steps(
-                        self.evaluation_steps
-                    ),
-                    test_case_list=test_case_list,
-                    parameters=g_eval_params_str,
-                    _additional_context=_additional_context,
-                )
-            )
-        try:
-            # don't use log probabilities for unsupported gpt models
-            if no_multimodal_log_prob_support(self.model):
-                raise AttributeError("log_probs unsupported.")
-
-            # Don't have to check for using native model
-            # since generate raw response only exist for deepeval's native model
-            res, cost = await self.model.a_generate_raw_response(
-                prompt, top_logprobs=self.top_logprobs
-            )
-            self.evaluation_cost += cost
-            data = trimAndLoadJson(res.choices[0].message.content, self)
-
-            reason = data["reason"]
-            score = data["score"]
-            if self.strict_mode:
-                return score, reason
-
-            try:
-                weighted_summed_score = calculate_weighted_summed_score(
-                    score, res
-                )
-                return weighted_summed_score, reason
-            except Exception:
-                return score, reason
-        except (
-            AttributeError
-        ):  # This catches the case where a_generate_raw_response doesn't exist.
-            if self.using_native_model:
-                res, cost = await self.model.a_generate(prompt)
-                self.evaluation_cost += cost
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reason"]
-            else:
-                try:
-                    res: ReasonScore = await self.model.a_generate(
-                        prompt, schema=ReasonScore
-                    )
-                    return res.score, res.reason
-                except TypeError:
-                    res = await self.model.a_generate(prompt)
-                    data = trimAndLoadJson(res, self)
-                    return data["score"], data["reason"]
-
-    def _evaluate(
-        self, test_case: MLLMTestCase, _additional_context: Optional[str] = None
-    ) -> Tuple[Union[int, float], str]:
-        test_case_list = construct_test_case_list(
-            self.evaluation_params, test_case
-        )
-        g_eval_params_str = construct_g_eval_params_string(
-            self.evaluation_params
-        )
-
-        if not self.strict_mode:
-            rubric_str = format_rubrics(self.rubric) if self.rubric else None
-            prompt = self.evaluation_template.generate_evaluation_results(
-                evaluation_steps=number_evaluation_steps(self.evaluation_steps),
-                test_case_list=test_case_list,
-                parameters=g_eval_params_str,
-                rubric=rubric_str,
-                score_range=get_score_range(self.rubric),
-                _additional_context=_additional_context,
-            )
-        else:
-            prompt = (
-                self.evaluation_template.generate_strict_evaluation_results(
-                    evaluation_steps=number_evaluation_steps(
-                        self.evaluation_steps
-                    ),
-                    test_case_list=test_case_list,
-                    parameters=g_eval_params_str,
-                    _additional_context=_additional_context,
-                )
-            )
-
-        try:
-            # don't use log probabilities for unsupported gpt models
-            if no_multimodal_log_prob_support(self.model):
-                raise AttributeError("log_probs unsupported.")
-
-            res, cost = self.model.generate_raw_response(
-                prompt, top_logprobs=self.top_logprobs
-            )
-            self.evaluation_cost += cost
-            data = trimAndLoadJson(res.choices[0].message.content, self)
-
-            reason = data["reason"]
-            score = data["score"]
-            if self.strict_mode:
-                return score, reason
-
-            try:
-                weighted_summed_score = calculate_weighted_summed_score(
-                    score, res
-                )
-                return weighted_summed_score, reason
-            except Exception:
-                return score, reason
-        except AttributeError:
-            # This catches the case where a_generate_raw_response doesn't exist.
-            if self.using_native_model:
-                res, cost = self.model.generate(prompt)
-                self.evaluation_cost += cost
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reason"]
-            else:
-                try:
-                    res: ReasonScore = self.model.generate(
-                        prompt, schema=ReasonScore
-                    )
-                    return res.score, res.reason
-                except TypeError:
-                    res = self.model.generate(prompt)
-                    data = trimAndLoadJson(res, self)
-                    return data["score"], data["reason"]
-
-    def is_successful(self) -> bool:
-        if self.error is not None:
-            self.success = False
-        else:
-            try:
-                self.success = self.score >= self.threshold
-            except Exception:
-                self.success = False
-        return self.success
-
-    @property
-    def __name__(self):
-        if self._include_g_eval_suffix:
-            return f"{self.name} [GEval]"
-        else:
-            return self.name
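For context, MultimodalGEval (removed above along with the rest of deepeval/metrics/multimodal_metrics in 3.7.6) was the multimodal counterpart of GEval: it scored an MLLMTestCase against user-defined criteria or evaluation steps using an MLLM judge. Below is a minimal usage sketch against 3.7.4, where these modules still ship; the metric name, criteria text, image URL, and outputs are illustrative placeholders, and the MLLMTestCase/MLLMImage constructors are assumed to match the 3.7.4 API.

from deepeval.test_case import MLLMTestCase, MLLMTestCaseParams
from deepeval.test_case.mllm_test_case import MLLMImage
from deepeval.metrics.multimodal_metrics.multimodal_g_eval.multimodal_g_eval import (
    MultimodalGEval,
)

# Criteria-only setup: the judge model generates the evaluation steps at measure time.
metric = MultimodalGEval(
    name="Image Caption Quality",  # illustrative name
    evaluation_params=[
        MLLMTestCaseParams.INPUT,
        MLLMTestCaseParams.ACTUAL_OUTPUT,
    ],
    criteria="Judge whether the actual output accurately describes the image in the input.",
)

test_case = MLLMTestCase(
    input=["Describe this image.", MLLMImage(url="https://example.com/cat.png")],  # placeholder image
    actual_output=["A grey cat sitting on a windowsill."],
)

metric.measure(test_case)  # populates metric.score (0-1 scale) and metric.reason
print(metric.score, metric.reason)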
deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py (removed)

@@ -1,148 +0,0 @@
-from typing import List, Optional, Tuple
-import textwrap
-
-
-class MultimodalGEvalTemplate:
-
-    @staticmethod
-    def generate_evaluation_steps(parameters: str, criteria: str):
-        return textwrap.dedent(
-            f"""Given an evaluation criteria which outlines how you should judge the {parameters}, generate 3-4 concise evaluation steps based on the criteria below. You MUST make it clear how to evaluate {parameters} in relation to one another.
-
-            Evaluation Criteria:
-            {criteria}
-
-            **
-            IMPORTANT: Please make sure to only return in JSON format, with the "steps" key as a list of strings. No words or explanation is needed.
-            Example JSON:
-            {{
-                "steps": <list_of_strings>
-            }}
-            **
-
-            JSON:
-            """
-        )
-
-    @staticmethod
-    def generate_evaluation_results(
-        evaluation_steps: str,
-        test_case_list: List,
-        parameters: str,
-        rubric: Optional[str] = None,
-        score_range: Tuple[int, int] = (0, 10),
-        _additional_context: Optional[str] = None,
-    ):
-        rubric_text = f"Rubric:\n{rubric}\n" if rubric else ""
-        dependencies = (
-            "evaluation steps and rubric" if rubric else "evaluation steps"
-        )
-        score_explanation = (
-            "based on the rubric provided"
-            if rubric
-            else f"with {score_range[1]} indicating strong alignment with the evaluation steps and {score_range[0]} indicating no alignment"
-        )
-        reasoning_expectation = (
-            "Be specific and grounded in the evaluation steps and rubric."
-            if rubric
-            else "Be specific and grounded in the evaluation steps."
-        )
-        additional_context = (
-            f"\n\nAdditional Context:\n{_additional_context}\n"
-            if _additional_context
-            else ""
-        )
-
-        return (
-            [
-                textwrap.dedent(
-                    f"""You are an evaluator. Given the following {dependencies}, assess the response below and return a JSON object with two fields:
-
-                    - `"score"`: an integer between {score_range[0]} and {score_range[1]}, {score_explanation}.
-                    - `"reason"`: a brief explanation for why the score was given. This must mention specific strengths or shortcomings, referencing relevant details from the input. Do **not** quote the score itself in the explanation.
-
-                    Your explanation should:
-                    - {reasoning_expectation}
-                    - Mention key details from the test case parameters.
-                    - Be concise, clear, and focused on the evaluation logic.
-
-                    Only return valid JSON. Do **not** include any extra commentary or text.
-
-                    ---
-
-                    Evaluation Steps:
-                    {evaluation_steps}
-
-                    {rubric_text}
-                    Test Case:
-                    ************************
-                    """
-                )
-            ]
-            + test_case_list
-            + [
-                textwrap.dedent(
-                    f"""
-                    ************************
-                    \n\n\n
-                    Parameters:
-                    {parameters}
-                    {additional_context}
-
-                    ---
-                    **Example JSON:**
-                    {{
-                        "reason": "your concise and informative reason here",
-                        "score": {score_range[0]}
-                    }}
-
-                    JSON:
-                    """
-                )
-            ]
-        )
-
-    @staticmethod
-    def generate_strict_evaluation_results(
-        evaluation_steps: str,
-        test_case_list: List,
-        parameters: str,
-        _additional_context: Optional[str] = None,
-    ):
-        additional_context = (
-            f"\n\nAdditional Context:\n{_additional_context}\n"
-            if _additional_context
-            else ""
-        )
-        return (
-            [
-                textwrap.dedent(
-                    f"""Given the evaluation steps, return a JSON with two keys: 1) a `score` key that is STRICTLY EITHER 1 (follows the criteria 100% outlined in the evaluation steps), OR 0 (does not follow the criteria), and 2) a `reason` key, a reason for the given score, but DO NOT QUOTE THE SCORE in your reason. Please mention specific information from {parameters} in your reason, but be very concise with it!
-
-                    Evaluation Steps:
-                    {evaluation_steps}
-                    ************************
-                    """
-                )
-            ]
-            + test_case_list
-            + [
-                textwrap.dedent(
-                    f"""
-                    ************************
-                    {additional_context}
-                    **
-                    IMPORTANT: Please make sure to only return in JSON format, with the "score" and "reason" key. No words or explanation is needed.
-
-                    Example JSON:
-                    {{
-                        "reason": "The text does not follow the evaluation steps provided.",
-                        "score": 0
-                    }}
-                    **
-
-                    JSON:
-                    """
-                )
-            ]
-        )
deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py (removed)

@@ -1,68 +0,0 @@
-from deepeval.test_case import MLLMTestCaseParams, MLLMTestCase, ToolCall
-from deepeval.test_case.mllm_test_case import MLLMImage
-from deepeval.models.mlllms.openai_model import (
-    unsupported_log_probs_multimodal_gpt_models,
-)
-from deepeval.models import (
-    DeepEvalBaseMLLM,
-    MultimodalOpenAIModel,
-)
-
-from typing import List, Union
-
-
-G_EVAL_PARAMS = {
-    MLLMTestCaseParams.INPUT: "Input",
-    MLLMTestCaseParams.ACTUAL_OUTPUT: "Actual Output",
-    MLLMTestCaseParams.EXPECTED_OUTPUT: "Expected Output",
-    MLLMTestCaseParams.CONTEXT: "Context",
-    MLLMTestCaseParams.RETRIEVAL_CONTEXT: "Retrieval Context",
-    MLLMTestCaseParams.EXPECTED_TOOLS: "Expected Tools",
-    MLLMTestCaseParams.TOOLS_CALLED: "Tools Called",
-}
-
-
-def construct_g_eval_params_string(
-    mllm_test_case_params: List[MLLMTestCaseParams],
-):
-    g_eval_params = [G_EVAL_PARAMS[param] for param in mllm_test_case_params]
-    if len(g_eval_params) == 1:
-        g_eval_params_str = g_eval_params[0]
-    elif len(g_eval_params) == 2:
-        g_eval_params_str = " and ".join(g_eval_params)
-    else:
-        g_eval_params_str = (
-            ", ".join(g_eval_params[:-1]) + ", and " + g_eval_params[-1]
-        )
-
-    return g_eval_params_str
-
-
-def construct_test_case_list(
-    evaluation_params: List[MLLMTestCaseParams], test_case: MLLMTestCase
-) -> List[Union[str, MLLMImage]]:
-    test_case_list = []
-    for param in evaluation_params:
-        test_case_param_list = [f"\n\n\n{G_EVAL_PARAMS[param]}:\n"]
-        value = getattr(test_case, param.value)
-        for v in value:
-            if isinstance(v, ToolCall):
-                test_case_param_list.append(repr(v))
-            else:
-                test_case_param_list.append(v)
-        test_case_list.extend(test_case_param_list)
-    return test_case_list
-
-
-def no_multimodal_log_prob_support(model: Union[str, DeepEvalBaseMLLM]):
-    if (
-        isinstance(model, str)
-        and model in unsupported_log_probs_multimodal_gpt_models
-    ):
-        return True
-    elif (
-        isinstance(model, MultimodalOpenAIModel)
-        and model.model_name in unsupported_log_probs_multimodal_gpt_models
-    ):
-        return True
-    return False
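These helpers are what turn an MLLMTestCase into the interleaved prompt that MultimodalGEvalTemplate wraps: construct_g_eval_params_string joins the chosen parameter names into a readable phrase, and construct_test_case_list flattens each selected field into a header string followed by its text fragments and MLLMImage objects in order. A small sketch of that behavior, again assuming deepeval 3.7.4; the test-case values and image URL are placeholders.

from deepeval.test_case import MLLMTestCase, MLLMTestCaseParams
from deepeval.test_case.mllm_test_case import MLLMImage
from deepeval.metrics.multimodal_metrics.multimodal_g_eval.utils import (
    construct_g_eval_params_string,
    construct_test_case_list,
)

params = [MLLMTestCaseParams.INPUT, MLLMTestCaseParams.ACTUAL_OUTPUT]
print(construct_g_eval_params_string(params))  # two params join as "Input and Actual Output"

tc = MLLMTestCase(
    input=["Describe this image.", MLLMImage(url="https://example.com/cat.png")],  # placeholder
    actual_output=["A grey cat sitting on a windowsill."],
)

# Section headers, text fragments, and MLLMImage objects, in the order the template
# sends them to the judge model (sandwiched between its instruction blocks).
for segment in construct_test_case_list(params, tc):
    print(repr(segment))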