deepeval-3.7.4-py3-none-any.whl → deepeval-3.7.6-py3-none-any.whl

This diff compares the contents of two package versions that were publicly released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
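Taken together, the list above reflects three structural changes: the `deepeval.optimization` package is replaced by `deepeval.optimizer` (with the COPRO, GEPA, SIMBA, and MIPROv2 loops regrouped under `optimizer/algorithms/`), the multimodal metric and `mlllms` model packages are removed, and new `turn_*` metric packages are added. The hunks below reproduce the full content of three of the removed files. As a minimal sketch of how to tell which optimizer layout an installed copy uses, relying only on the module paths listed above (no deepeval classes or functions are assumed):

from importlib.util import find_spec

def optimizer_layout() -> str:
    # Probe the package paths named in the file list; importing deepeval
    # itself may fail if it is not installed, hence the guard.
    try:
        if find_spec("deepeval.optimizer") is not None:
            return "new layout (deepeval.optimizer, 3.7.6)"
        if find_spec("deepeval.optimization") is not None:
            return "old layout (deepeval.optimization, 3.7.4)"
    except ModuleNotFoundError:
        pass
    return "deepeval not installed"

print(optimizer_layout())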
deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py
@@ -1,290 +0,0 @@
- from typing import List, Dict
-
- from deepeval.metrics.indicator import metric_progress_indicator
- from deepeval.metrics.utils import (
-     construct_verbose_logs,
-     check_mllm_test_case_params,
- )
- from deepeval.test_case import (
-     MLLMTestCase,
-     MLLMTestCaseParams,
-     ToolCallParams,
-     ToolCall,
- )
- from deepeval.metrics import BaseMultimodalMetric
-
-
- class MultimodalToolCorrectnessMetric(BaseMultimodalMetric):
-
-     _required_params: List[MLLMTestCaseParams] = [
-         MLLMTestCaseParams.INPUT,
-         MLLMTestCaseParams.ACTUAL_OUTPUT,
-         MLLMTestCaseParams.TOOLS_CALLED,
-         MLLMTestCaseParams.EXPECTED_TOOLS,
-     ]
-
-     def __init__(
-         self,
-         threshold: float = 0.5,
-         evaluation_params: List[ToolCallParams] = [],
-         include_reason: bool = True,
-         strict_mode: bool = False,
-         verbose_mode: bool = False,
-         should_exact_match: bool = False,
-         should_consider_ordering: bool = False,
-     ):
-         self.threshold = 1 if strict_mode else threshold
-         self.include_reason = include_reason
-         self.strict_mode = strict_mode
-         self.verbose_mode = verbose_mode
-         self.evaluation_params: List[ToolCallParams] = evaluation_params
-         self.should_exact_match = should_exact_match
-         self.should_consider_ordering = should_consider_ordering
-
-     def measure(
-         self,
-         test_case: MLLMTestCase,
-         _show_indicator: bool = True,
-         _in_component: bool = False,
-         _log_metric_to_confident: bool = True,
-     ) -> float:
-         check_mllm_test_case_params(
-             test_case, self._required_params, None, None, self
-         )
-         self.test_case = test_case
-         with metric_progress_indicator(
-             self, _show_indicator=_show_indicator, _in_component=_in_component
-         ):
-             self.tools_called: List[ToolCall] = test_case.tools_called
-             self.expected_tools: List[ToolCall] = test_case.expected_tools
-             self.score = self._calculate_score()
-             self.reason = self._generate_reason()
-             self.success = self.score >= self.threshold
-             expected_tools_formatted = (
-                 "Expected Tools:\n[\n"
-                 + ",\n".join(
-                     self.indent_multiline_string(
-                         repr(tool_call), indent_level=4
-                     )
-                     for tool_call in self.expected_tools
-                 )
-                 + "\n]"
-             )
-             tools_called_formatted = (
-                 "Tools Called:\n[\n"
-                 + ",\n".join(
-                     self.indent_multiline_string(
-                         repr(tool_call), indent_level=4
-                     )
-                     for tool_call in self.tools_called
-                 )
-                 + "\n]"
-             )
-             steps = [
-                 f"{expected_tools_formatted}",
-                 f"{tools_called_formatted}",
-             ]
-             steps.append(f"Score: {self.score}\nReason: {self.reason}")
-             self.verbose_logs = construct_verbose_logs(self, steps=steps)
-             return self.score
-
-     async def a_measure(
-         self,
-         test_case: MLLMTestCase,
-         _show_indicator: bool = True,
-         _in_component: bool = False,
-         _log_metric_to_confident: bool = True,
-     ) -> float:
-         return self.measure(
-             test_case,
-             _show_indicator=_show_indicator,
-             _in_component=_in_component,
-             _log_metric_to_confident=_log_metric_to_confident,
-         )
-
-     ##################################################
-     ### Tool Correctness (Tool) ######################
-     ##################################################
-
-     def _generate_reason(self):
-         tools_called_names = [
-             tool_called.name for tool_called in self.tools_called
-         ]
-         expected_tools_names = [
-             expected_tool.name for expected_tool in self.expected_tools
-         ]
-
-         if self.should_exact_match:
-             return f"{'Exact match' if self._calculate_exact_match_score() else 'Not an exact match'}: expected {expected_tools_names}, called {tools_called_names}. See details above."
-
-         elif self.should_consider_ordering:
-             lcs, weighted_length = self._compute_weighted_lcs()
-             score = weighted_length / len(expected_tools_names)
-             missing = set(expected_tools_names) - set(tools_called_names)
-             out_of_order = set(expected_tools_names) - set(
-                 [tool.name for tool in lcs]
-             )
-             if score == 1:
-                 return f"Correct ordering: all expected tools {expected_tools_names} were called in the correct order."
-             else:
-                 issues = []
-                 if missing:
-                     issues.append(f"missing tools {list(missing)}")
-                 if out_of_order:
-                     issues.append(f"out-of-order tools {list(out_of_order)}")
-                 return f"Incorrect tool usage: {' and '.join(issues)}; expected {expected_tools_names}, called {tools_called_names}. See more details above."
-         else:
-             used_expected = set(self.tools_called).intersection(
-                 set(self.expected_tools)
-             )
-             missing = set(self.expected_tools) - used_expected
-             if self._calculate_non_exact_match_score() == 1:
-                 return f"All expected tools {expected_tools_names} were called (order not considered)."
-             else:
-                 return f"Incomplete tool usage: missing tools {list(missing)}; expected {expected_tools_names}, called {tools_called_names}. See more details above."
-
-     ##################################################
-     ### Score Helper Functions #######################
-     ##################################################
-
-     # Calculate score
-     def _calculate_score(self):
-         if self.should_exact_match:
-             score = self._calculate_exact_match_score()
-         elif self.should_consider_ordering:
-             _, weighted_length = self._compute_weighted_lcs()
-             score = weighted_length / len(self.expected_tools)
-         else:
-             score = self._calculate_non_exact_match_score()
-         return 0 if self.strict_mode and score < self.threshold else score
-
-     # Exact matching score
-     def _calculate_exact_match_score(self):
-         if len(self.tools_called) != len(self.expected_tools):
-             return 0.0
-         for i in range(len(self.tools_called)):
-             if self.tools_called[i].name != self.expected_tools[i].name:
-                 return 0.0
-             if ToolCallParams.INPUT_PARAMETERS in self.evaluation_params:
-                 if (
-                     self.tools_called[i].input_parameters
-                     != self.expected_tools[i].input_parameters
-                 ):
-                     return 0.0
-             if ToolCallParams.OUTPUT in self.evaluation_params:
-                 if self.tools_called[i].output != self.expected_tools[i].output:
-                     return 0.0
-         return 1.0
-
-     # Non exact matching score
-     def _calculate_non_exact_match_score(self):
-         total_score = 0.0
-         matched_called_tools = set()
-         for expected_tool in self.expected_tools:
-             best_score = 0.0
-             for called_tool in self.tools_called:
-                 if called_tool in matched_called_tools:
-                     continue
-                 if expected_tool.name == called_tool.name:
-                     match_score = 1.0
-                     if (
-                         ToolCallParams.INPUT_PARAMETERS
-                         in self.evaluation_params
-                     ):
-                         match_score *= self._compare_dicts(
-                             expected_tool.input_parameters,
-                             called_tool.input_parameters,
-                         )
-                     if (
-                         ToolCallParams.OUTPUT in self.evaluation_params
-                         and expected_tool.output != called_tool.output
-                     ):
-                         match_score = 0.0
-                     if match_score > best_score:
-                         best_score = match_score
-                         best_called_tool = called_tool
-             if best_score > 0:
-                 total_score += best_score
-                 matched_called_tools.add(best_called_tool)
-         return (
-             total_score / len(self.expected_tools)
-             if self.expected_tools
-             else 0.0
-         )
-
-     # Consider ordering score
-     def _compute_weighted_lcs(self):
-         m, n = len(self.expected_tools), len(self.tools_called)
-         dp = [[0.0] * (n + 1) for _ in range(m + 1)]
-         for i in range(1, m + 1):
-             for j in range(1, n + 1):
-                 expected_tool, called_tool = (
-                     self.expected_tools[i - 1],
-                     self.tools_called[j - 1],
-                 )
-                 if expected_tool.name != called_tool.name:
-                     dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
-                     continue
-                 score = 1.0
-                 if ToolCallParams.INPUT_PARAMETERS in self.evaluation_params:
-                     score *= self._compare_dicts(
-                         expected_tool.input_parameters,
-                         called_tool.input_parameters,
-                     )
-                 if (
-                     ToolCallParams.OUTPUT in self.evaluation_params
-                     and expected_tool.output != called_tool.output
-                 ):
-                     score = 0.0
-                 dp[i][j] = max(
-                     dp[i - 1][j],
-                     dp[i][j - 1],
-                     dp[i - 1][j - 1] + score if score > 0 else 0,
-                 )
-         i, j, total_score = m, n, 0.0
-         lcs = []
-         while i > 0 and j > 0:
-             if dp[i][j] == dp[i - 1][j]:
-                 i -= 1
-             elif dp[i][j] == dp[i][j - 1]:
-                 j -= 1
-             else:
-                 lcs.append(self.expected_tools[i - 1])
-                 total_score += dp[i][j] - dp[i - 1][j - 1]
-                 i, j = i - 1, j - 1
-         return lcs[::-1], total_score
-
-     # For matching input parameters
-     def _compare_dicts(self, dict1: Dict, dict2: Dict):
-         if self.should_exact_match:
-             return 1.0 if dict1 == dict2 else 0.0
-         match_score = 0
-         matched_keys = set(dict1.keys()).intersection(set(dict2.keys()))
-         total_keys = set(dict1.keys()).union(set(dict2.keys()))
-         for key in matched_keys:
-             if dict1[key] == dict2[key]:
-                 match_score += 1 / len(total_keys)
-             elif isinstance(dict1[key], dict) and isinstance(dict2[key], dict):
-                 match_score += self._compare_dicts(
-                     dict1[key], dict2[key]
-                 ) / len(total_keys)
-         return match_score
-
-     ##################################################
-     ### Others #######################################
-     ##################################################
-
-     def is_successful(self) -> bool:
-         try:
-             self.success = self.score >= self.threshold
-         except:
-             self.success = False
-         return self.success
-
-     @property
-     def __name__(self):
-         return "Multi Modal Tool Correctness"
-
-     def indent_multiline_string(self, s, indent_level=4):
-         indent = " " * indent_level
-         return "\n".join(f"{indent}{line}" for line in s.splitlines())
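For reference, the scoring in the removed metric above comes down to two helpers: `_compute_weighted_lcs`, a weighted longest-common-subsequence over tool names, and `_compare_dicts`, a recursive key-overlap score for input parameters in which every key in the union of both dicts carries equal weight. A standalone restatement of the `_compare_dicts` logic, pulled out of the class purely for illustration (not a deepeval API):

from typing import Dict

def dict_match_score(dict1: Dict, dict2: Dict) -> float:
    # Fraction of the union of keys on which the two dicts agree; nested
    # dicts are scored recursively and scaled by 1 / |union of keys|.
    total_keys = set(dict1) | set(dict2)
    score = 0.0
    for key in set(dict1) & set(dict2):
        if dict1[key] == dict2[key]:
            score += 1 / len(total_keys)
        elif isinstance(dict1[key], dict) and isinstance(dict2[key], dict):
            score += dict_match_score(dict1[key], dict2[key]) / len(total_keys)
    return score

# Example: one of two parameters matches, so the score is 0.5.
print(dict_match_score({"city": "Paris", "units": "C"}, {"city": "Paris", "units": "F"}))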
deepeval/models/mlllms/__init__.py
@@ -1,4 +0,0 @@
- from .openai_model import MultimodalOpenAIModel
- from .ollama_model import MultimodalOllamaModel
- from .gemini_model import MultimodalGeminiModel
- from .azure_model import MultimodalAzureOpenAIMLLMModel
deepeval/models/mlllms/azure_model.py
@@ -1,343 +0,0 @@
- import base64
- from openai.types.chat.chat_completion import ChatCompletion
- from openai import AzureOpenAI, AsyncAzureOpenAI
- from typing import Optional, Tuple, Union, Dict, List
- from pydantic import BaseModel, SecretStr
- from io import BytesIO
-
- from deepeval.config.settings import get_settings
- from deepeval.models import DeepEvalBaseMLLM
- from deepeval.test_case import MLLMImage
- from deepeval.models.llms.openai_model import (
-     structured_outputs_models,
-     json_mode_models,
-     model_pricing,
- )
- from deepeval.models.retry_policy import (
-     create_retry_decorator,
-     sdk_retries_for,
- )
-
- from deepeval.models.llms.utils import trim_and_load_json
- from deepeval.models.utils import parse_model_name, require_secret_api_key
- from deepeval.constants import ProviderSlug as PS
-
-
- retry_azure = create_retry_decorator(PS.AZURE)
-
-
- class MultimodalAzureOpenAIMLLMModel(DeepEvalBaseMLLM):
-     def __init__(
-         self,
-         deployment_name: Optional[str] = None,
-         model_name: Optional[str] = None,
-         azure_openai_api_key: Optional[str] = None,
-         openai_api_version: Optional[str] = None,
-         azure_endpoint: Optional[str] = None,
-         temperature: float = 0,
-         generation_kwargs: Optional[Dict] = None,
-         **kwargs,
-     ):
-         settings = get_settings()
-         # fetch Azure deployment parameters
-         model_name = model_name or settings.AZURE_MODEL_NAME
-         self.deployment_name = deployment_name or settings.AZURE_DEPLOYMENT_NAME
-         if azure_openai_api_key is not None:
-             # keep it secret, keep it safe from serializings, logging and alike
-             self.azure_openai_api_key: SecretStr | None = SecretStr(
-                 azure_openai_api_key
-             )
-         else:
-             self.azure_openai_api_key = settings.AZURE_OPENAI_API_KEY
-
-         self.openai_api_version = (
-             openai_api_version or settings.OPENAI_API_VERSION
-         )
-         self.azure_endpoint = (
-             azure_endpoint
-             or settings.AZURE_OPENAI_ENDPOINT
-             and str(settings.AZURE_OPENAI_ENDPOINT)
-         )
-         if temperature < 0:
-             raise ValueError("Temperature must be >= 0.")
-         self.temperature = temperature
-
-         # args and kwargs will be passed to the underlying model, in load_model function
-         self.kwargs = kwargs
-         self.generation_kwargs = generation_kwargs or {}
-         super().__init__(parse_model_name(model_name))
-
-     ###############################################
-     # Generate functions
-     ###############################################
-
-     @retry_azure
-     def generate(
-         self,
-         multimodal_input: List[Union[str, MLLMImage]],
-         schema: Optional[BaseModel] = None,
-     ) -> Tuple[Union[str, BaseModel], float]:
-         client = self.load_model(async_mode=False)
-         prompt = self.generate_prompt(multimodal_input)
-
-         if schema:
-             if self.model_name in structured_outputs_models:
-                 messages = [{"role": "user", "content": prompt}]
-                 completion = client.beta.chat.completions.parse(
-                     model=self.deployment_name,
-                     messages=messages,
-                     response_format=schema,
-                     temperature=self.temperature,
-                 )
-                 structured_output: BaseModel = completion.choices[
-                     0
-                 ].message.parsed
-                 cost = self.calculate_cost(
-                     completion.usage.prompt_tokens,
-                     completion.usage.completion_tokens,
-                 )
-                 return structured_output, cost
-             if self.model_name in json_mode_models:
-                 messages = [{"role": "user", "content": prompt}]
-                 completion = client.beta.chat.completions.parse(
-                     model=self.deployment_name,
-                     messages=messages,
-                     response_format={"type": "json_object"},
-                     temperature=self.temperature,
-                 )
-                 json_output = trim_and_load_json(
-                     completion.choices[0].message.content
-                 )
-                 cost = self.calculate_cost(
-                     completion.usage.prompt_tokens,
-                     completion.usage.completion_tokens,
-                 )
-                 return schema.model_validate(json_output), cost
-         print("Loading model client:")
-         print(client.base_url)
-         completion = client.chat.completions.create(
-             model=self.deployment_name,
-             messages=[{"role": "user", "content": prompt}],
-             temperature=self.temperature,
-             **self.generation_kwargs,
-         )
-         output = completion.choices[0].message.content
-         cost = self.calculate_cost(
-             completion.usage.prompt_tokens, completion.usage.completion_tokens
-         )
-         if schema:
-             json_output = trim_and_load_json(output)
-             return schema.model_validate(json_output), cost
-         else:
-             return output, cost
-
-     @retry_azure
-     async def a_generate(
-         self,
-         multimodal_input: List[Union[str, MLLMImage]],
-         schema: Optional[BaseModel] = None,
-     ) -> Tuple[Union[str, BaseModel], float]:
-         client = self.load_model(async_mode=True)
-         prompt = self.generate_prompt(multimodal_input)
-
-         if schema:
-             if self.model_name in structured_outputs_models:
-                 messages = [{"role": "user", "content": prompt}]
-                 completion = await client.beta.chat.completions.parse(
-                     model=self.deployment_name,
-                     messages=messages,
-                     response_format=schema,
-                     temperature=self.temperature,
-                 )
-                 structured_output: BaseModel = completion.choices[
-                     0
-                 ].message.parsed
-                 cost = self.calculate_cost(
-                     completion.usage.prompt_tokens,
-                     completion.usage.completion_tokens,
-                 )
-                 return structured_output, cost
-             if self.model_name in json_mode_models:
-                 messages = [{"role": "user", "content": prompt}]
-                 completion = await client.beta.chat.completions.parse(
-                     model=self.deployment_name,
-                     messages=messages,
-                     response_format={"type": "json_object"},
-                     temperature=self.temperature,
-                     **self.generation_kwargs,
-                 )
-                 json_output = trim_and_load_json(
-                     completion.choices[0].message.content
-                 )
-                 cost = self.calculate_cost(
-                     completion.usage.prompt_tokens,
-                     completion.usage.completion_tokens,
-                 )
-                 return schema.model_validate(json_output), cost
-
-         completion = await client.chat.completions.create(
-             model=self.deployment_name,
-             messages=[{"role": "user", "content": prompt}],
-             temperature=self.temperature,
-             **self.generation_kwargs,
-         )
-         output = completion.choices[0].message.content
-         cost = self.calculate_cost(
-             completion.usage.prompt_tokens,
-             completion.usage.completion_tokens,
-         )
-         if schema:
-             json_output = trim_and_load_json(output)
-             return schema.model_validate(json_output), cost
-         else:
-             return output, cost
-
-     ###############################################
-     # Other generate functions
-     ###############################################
-
-     @retry_azure
-     def generate_raw_response(
-         self,
-         multimodal_input: List[Union[str, MLLMImage]],
-         top_logprobs: int = 5,
-     ) -> Tuple[ChatCompletion, float]:
-         client = self.load_model(async_mode=False)
-         prompt = self.generate_prompt(multimodal_input)
-         messages = [{"role": "user", "content": prompt}]
-
-         # Generate completion
-         completion = client.chat.completions.create(
-             model=self.deployment_name,
-             messages=messages,
-             temperature=self.temperature,
-             logprobs=True,
-             top_logprobs=top_logprobs,
-             **self.generation_kwargs,
-         )
-         # Cost calculation
-         input_tokens = completion.usage.prompt_tokens
-         output_tokens = completion.usage.completion_tokens
-         cost = self.calculate_cost(input_tokens, output_tokens)
-
-         return completion, cost
-
-     @retry_azure
-     async def a_generate_raw_response(
-         self,
-         multimodal_input: List[Union[str, MLLMImage]],
-         top_logprobs: int = 5,
-     ) -> Tuple[ChatCompletion, float]:
-         client = self.load_model(async_mode=True)
-         prompt = self.generate_prompt(multimodal_input)
-         messages = [{"role": "user", "content": prompt}]
-
-         # Generate completion
-         completion = await client.chat.completions.create(
-             model=self.deployment_name,
-             messages=messages,
-             temperature=self.temperature,
-             logprobs=True,
-             top_logprobs=top_logprobs,
-             **self.generation_kwargs,
-         )
-         # Cost calculation
-         input_tokens = completion.usage.prompt_tokens
-         output_tokens = completion.usage.completion_tokens
-         cost = self.calculate_cost(input_tokens, output_tokens)
-
-         return completion, cost
-
-     ###############################################
-     # Utilities
-     ###############################################
-
-     def generate_prompt(
-         self, multimodal_input: List[Union[str, MLLMImage]] = []
-     ):
-         """Convert multimodal input into the proper message format for Azure OpenAI."""
-         prompt = []
-         for ele in multimodal_input:
-             if isinstance(ele, str):
-                 prompt.append({"type": "text", "text": ele})
-             elif isinstance(ele, MLLMImage):
-                 if ele.local:
-                     import PIL.Image
-
-                     image = PIL.Image.open(ele.url)
-                     visual_dict = {
-                         "type": "image_url",
-                         "image_url": {
-                             "url": f"data:image/jpeg;base64,{self.encode_pil_image(image)}"
-                         },
-                     }
-                 else:
-                     visual_dict = {
-                         "type": "image_url",
-                         "image_url": {"url": ele.url},
-                     }
-                 prompt.append(visual_dict)
-         return prompt
-
-     def encode_pil_image(self, pil_image):
-         """Encode a PIL image to base64 string."""
-         image_buffer = BytesIO()
-         if pil_image.mode in ("RGBA", "LA", "P"):
-             pil_image = pil_image.convert("RGB")
-         pil_image.save(image_buffer, format="JPEG")
-         image_bytes = image_buffer.getvalue()
-         base64_encoded_image = base64.b64encode(image_bytes).decode("utf-8")
-         return base64_encoded_image
-
-     def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
-         pricing = model_pricing.get(self.model_name, model_pricing["gpt-4.1"])
-         input_cost = input_tokens * pricing["input"]
-         output_cost = output_tokens * pricing["output"]
-         return input_cost + output_cost
-
-     ###############################################
-     # Model
-     ###############################################
-
-     def get_model_name(self):
-         return f"Azure OpenAI ({self.model_name})"
-
-     def load_model(self, async_mode: bool = False):
-         if not async_mode:
-             return self._build_client(AzureOpenAI)
-         return self._build_client(AsyncAzureOpenAI)
-
-     def _client_kwargs(self) -> Dict:
-         """
-         If Tenacity is managing retries, force OpenAI SDK retries off to avoid double retries.
-         If the user opts into SDK retries for 'azure' via DEEPEVAL_SDK_RETRY_PROVIDERS,
-         leave their retry settings as is.
-         """
-         kwargs = dict(self.kwargs or {})
-         if not sdk_retries_for(PS.AZURE):
-             kwargs["max_retries"] = 0
-         return kwargs
-
-     def _build_client(self, cls):
-         api_key = require_secret_api_key(
-             self.azure_openai_api_key,
-             provider_label="AzureOpenAI",
-             env_var_name="AZURE_OPENAI_API_KEY",
-             param_hint="`azure_openai_api_key` to MultimodalAzureOpenAIMLLMModel(...)",
-         )
-
-         kw = dict(
-             api_key=api_key,
-             api_version=self.openai_api_version,
-             azure_endpoint=self.azure_endpoint,
-             azure_deployment=self.deployment_name,
-             **self._client_kwargs(),
-         )
-         try:
-             return cls(**kw)
-         except TypeError as e:
-             # older OpenAI SDKs may not accept max_retries, in that case remove and retry once
-             if "max_retries" in str(e):
-                 kw.pop("max_retries", None)
-                 return cls(**kw)
-             raise
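The only non-obvious plumbing in the removed Azure model above is how a local image is turned into an OpenAI-style `image_url` content part (`generate_prompt` plus `encode_pil_image`). A standalone restatement of that conversion, with deepeval's `MLLMImage` replaced by a plain file path for illustration; `photo.jpg` is a placeholder path, and Pillow is assumed to be installed:

import base64
from io import BytesIO

from PIL import Image

def image_part_from_path(path: str) -> dict:
    # Mirror encode_pil_image above: JPEG cannot carry alpha or palette data,
    # so convert those modes to RGB before base64-encoding the bytes.
    image = Image.open(path)
    if image.mode in ("RGBA", "LA", "P"):
        image = image.convert("RGB")
    buffer = BytesIO()
    image.save(buffer, format="JPEG")
    encoded = base64.b64encode(buffer.getvalue()).decode("utf-8")
    return {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
    }

# A user message mixing text and a local image, as generate_prompt produced it.
content = [
    {"type": "text", "text": "Describe this image."},
    image_part_from_path("photo.jpg"),
]
messages = [{"role": "user", "content": content}]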