deepeval-3.7.3-py3-none-any.whl → deepeval-3.7.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/test.py +1 -1
  3. deepeval/config/settings.py +102 -13
  4. deepeval/dataset/golden.py +54 -2
  5. deepeval/evaluate/configs.py +1 -1
  6. deepeval/evaluate/evaluate.py +16 -8
  7. deepeval/evaluate/execute.py +74 -27
  8. deepeval/evaluate/utils.py +26 -22
  9. deepeval/integrations/pydantic_ai/agent.py +19 -2
  10. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  11. deepeval/metrics/__init__.py +14 -12
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  13. deepeval/metrics/answer_relevancy/template.py +188 -92
  14. deepeval/metrics/argument_correctness/template.py +2 -2
  15. deepeval/metrics/base_metric.py +2 -5
  16. deepeval/metrics/bias/template.py +3 -3
  17. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  18. deepeval/metrics/contextual_precision/template.py +115 -66
  19. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  20. deepeval/metrics/contextual_recall/template.py +106 -55
  21. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  22. deepeval/metrics/contextual_relevancy/template.py +87 -58
  23. deepeval/metrics/conversation_completeness/template.py +2 -2
  24. deepeval/metrics/conversational_dag/templates.py +4 -4
  25. deepeval/metrics/conversational_g_eval/template.py +4 -3
  26. deepeval/metrics/dag/templates.py +5 -5
  27. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  28. deepeval/metrics/faithfulness/schema.py +1 -1
  29. deepeval/metrics/faithfulness/template.py +200 -115
  30. deepeval/metrics/g_eval/utils.py +2 -2
  31. deepeval/metrics/hallucination/template.py +4 -4
  32. deepeval/metrics/indicator.py +4 -4
  33. deepeval/metrics/misuse/template.py +2 -2
  34. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  35. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  36. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  37. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  38. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  39. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  40. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  41. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  42. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  43. deepeval/metrics/non_advice/template.py +2 -2
  44. deepeval/metrics/pii_leakage/template.py +2 -2
  45. deepeval/metrics/prompt_alignment/template.py +4 -4
  46. deepeval/metrics/ragas.py +3 -3
  47. deepeval/metrics/role_violation/template.py +2 -2
  48. deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
  49. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  50. deepeval/metrics/toxicity/template.py +4 -4
  51. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  52. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  53. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  54. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  55. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  56. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  57. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  58. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  59. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  60. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  61. deepeval/metrics/turn_faithfulness/template.py +218 -0
  62. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  63. deepeval/metrics/turn_relevancy/template.py +2 -2
  64. deepeval/metrics/utils.py +39 -58
  65. deepeval/models/__init__.py +0 -12
  66. deepeval/models/base_model.py +16 -38
  67. deepeval/models/embedding_models/__init__.py +7 -0
  68. deepeval/models/embedding_models/azure_embedding_model.py +69 -32
  69. deepeval/models/embedding_models/local_embedding_model.py +39 -22
  70. deepeval/models/embedding_models/ollama_embedding_model.py +42 -18
  71. deepeval/models/embedding_models/openai_embedding_model.py +50 -15
  72. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  73. deepeval/models/llms/anthropic_model.py +53 -20
  74. deepeval/models/llms/azure_model.py +140 -43
  75. deepeval/models/llms/deepseek_model.py +38 -23
  76. deepeval/models/llms/gemini_model.py +222 -103
  77. deepeval/models/llms/grok_model.py +39 -27
  78. deepeval/models/llms/kimi_model.py +39 -23
  79. deepeval/models/llms/litellm_model.py +103 -45
  80. deepeval/models/llms/local_model.py +35 -22
  81. deepeval/models/llms/ollama_model.py +129 -17
  82. deepeval/models/llms/openai_model.py +151 -50
  83. deepeval/models/llms/portkey_model.py +149 -0
  84. deepeval/models/llms/utils.py +5 -3
  85. deepeval/models/retry_policy.py +17 -14
  86. deepeval/models/utils.py +94 -4
  87. deepeval/optimizer/__init__.py +5 -0
  88. deepeval/optimizer/algorithms/__init__.py +6 -0
  89. deepeval/optimizer/algorithms/base.py +29 -0
  90. deepeval/optimizer/algorithms/configs.py +18 -0
  91. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  92. deepeval/optimizer/algorithms/copro/copro.py +836 -0
  93. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  94. deepeval/optimizer/algorithms/gepa/gepa.py +737 -0
  95. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  96. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  97. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  98. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  99. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  100. deepeval/optimizer/algorithms/simba/simba.py +999 -0
  101. deepeval/optimizer/algorithms/simba/types.py +15 -0
  102. deepeval/optimizer/configs.py +31 -0
  103. deepeval/optimizer/policies.py +227 -0
  104. deepeval/optimizer/prompt_optimizer.py +263 -0
  105. deepeval/optimizer/rewriter/__init__.py +5 -0
  106. deepeval/optimizer/rewriter/rewriter.py +124 -0
  107. deepeval/optimizer/rewriter/utils.py +214 -0
  108. deepeval/optimizer/scorer/__init__.py +5 -0
  109. deepeval/optimizer/scorer/base.py +86 -0
  110. deepeval/optimizer/scorer/scorer.py +316 -0
  111. deepeval/optimizer/scorer/utils.py +30 -0
  112. deepeval/optimizer/types.py +148 -0
  113. deepeval/optimizer/utils.py +480 -0
  114. deepeval/prompt/prompt.py +7 -6
  115. deepeval/test_case/__init__.py +1 -3
  116. deepeval/test_case/api.py +12 -10
  117. deepeval/test_case/conversational_test_case.py +19 -1
  118. deepeval/test_case/llm_test_case.py +152 -1
  119. deepeval/test_case/utils.py +4 -8
  120. deepeval/test_run/api.py +15 -14
  121. deepeval/test_run/cache.py +2 -0
  122. deepeval/test_run/test_run.py +9 -4
  123. deepeval/tracing/patchers.py +9 -4
  124. deepeval/tracing/tracing.py +2 -2
  125. deepeval/utils.py +89 -0
  126. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  127. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/RECORD +134 -118
  128. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  129. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  130. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  131. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  132. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  133. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  134. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  135. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  136. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  137. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  138. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  139. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  140. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  141. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  142. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  143. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  144. deepeval/models/mlllms/__init__.py +0 -4
  145. deepeval/models/mlllms/azure_model.py +0 -334
  146. deepeval/models/mlllms/gemini_model.py +0 -284
  147. deepeval/models/mlllms/ollama_model.py +0 -144
  148. deepeval/models/mlllms/openai_model.py +0 -258
  149. deepeval/test_case/mllm_test_case.py +0 -170
  150. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  152. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  153. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  154. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  155. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  156. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py (deleted)
@@ -1,290 +0,0 @@
-from typing import List, Dict
-
-from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.metrics.utils import (
-    construct_verbose_logs,
-    check_mllm_test_case_params,
-)
-from deepeval.test_case import (
-    MLLMTestCase,
-    MLLMTestCaseParams,
-    ToolCallParams,
-    ToolCall,
-)
-from deepeval.metrics import BaseMultimodalMetric
-
-
-class MultimodalToolCorrectnessMetric(BaseMultimodalMetric):
-
-    _required_params: List[MLLMTestCaseParams] = [
-        MLLMTestCaseParams.INPUT,
-        MLLMTestCaseParams.ACTUAL_OUTPUT,
-        MLLMTestCaseParams.TOOLS_CALLED,
-        MLLMTestCaseParams.EXPECTED_TOOLS,
-    ]
-
-    def __init__(
-        self,
-        threshold: float = 0.5,
-        evaluation_params: List[ToolCallParams] = [],
-        include_reason: bool = True,
-        strict_mode: bool = False,
-        verbose_mode: bool = False,
-        should_exact_match: bool = False,
-        should_consider_ordering: bool = False,
-    ):
-        self.threshold = 1 if strict_mode else threshold
-        self.include_reason = include_reason
-        self.strict_mode = strict_mode
-        self.verbose_mode = verbose_mode
-        self.evaluation_params: List[ToolCallParams] = evaluation_params
-        self.should_exact_match = should_exact_match
-        self.should_consider_ordering = should_consider_ordering
-
-    def measure(
-        self,
-        test_case: MLLMTestCase,
-        _show_indicator: bool = True,
-        _in_component: bool = False,
-        _log_metric_to_confident: bool = True,
-    ) -> float:
-        check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
-        )
-        self.test_case = test_case
-        with metric_progress_indicator(
-            self, _show_indicator=_show_indicator, _in_component=_in_component
-        ):
-            self.tools_called: List[ToolCall] = test_case.tools_called
-            self.expected_tools: List[ToolCall] = test_case.expected_tools
-            self.score = self._calculate_score()
-            self.reason = self._generate_reason()
-            self.success = self.score >= self.threshold
-            expected_tools_formatted = (
-                "Expected Tools:\n[\n"
-                + ",\n".join(
-                    self.indent_multiline_string(
-                        repr(tool_call), indent_level=4
-                    )
-                    for tool_call in self.expected_tools
-                )
-                + "\n]"
-            )
-            tools_called_formatted = (
-                "Tools Called:\n[\n"
-                + ",\n".join(
-                    self.indent_multiline_string(
-                        repr(tool_call), indent_level=4
-                    )
-                    for tool_call in self.tools_called
-                )
-                + "\n]"
-            )
-            steps = [
-                f"{expected_tools_formatted}",
-                f"{tools_called_formatted}",
-            ]
-            steps.append(f"Score: {self.score}\nReason: {self.reason}")
-            self.verbose_logs = construct_verbose_logs(self, steps=steps)
-            return self.score
-
-    async def a_measure(
-        self,
-        test_case: MLLMTestCase,
-        _show_indicator: bool = True,
-        _in_component: bool = False,
-        _log_metric_to_confident: bool = True,
-    ) -> float:
-        return self.measure(
-            test_case,
-            _show_indicator=_show_indicator,
-            _in_component=_in_component,
-            _log_metric_to_confident=_log_metric_to_confident,
-        )
-
-    ##################################################
-    ### Tool Correctness (Tool) ######################
-    ##################################################
-
-    def _generate_reason(self):
-        tools_called_names = [
-            tool_called.name for tool_called in self.tools_called
-        ]
-        expected_tools_names = [
-            expected_tool.name for expected_tool in self.expected_tools
-        ]
-
-        if self.should_exact_match:
-            return f"{'Exact match' if self._calculate_exact_match_score() else 'Not an exact match'}: expected {expected_tools_names}, called {tools_called_names}. See details above."
-
-        elif self.should_consider_ordering:
-            lcs, weighted_length = self._compute_weighted_lcs()
-            score = weighted_length / len(expected_tools_names)
-            missing = set(expected_tools_names) - set(tools_called_names)
-            out_of_order = set(expected_tools_names) - set(
-                [tool.name for tool in lcs]
-            )
-            if score == 1:
-                return f"Correct ordering: all expected tools {expected_tools_names} were called in the correct order."
-            else:
-                issues = []
-                if missing:
-                    issues.append(f"missing tools {list(missing)}")
-                if out_of_order:
-                    issues.append(f"out-of-order tools {list(out_of_order)}")
-                return f"Incorrect tool usage: {' and '.join(issues)}; expected {expected_tools_names}, called {tools_called_names}. See more details above."
-        else:
-            used_expected = set(self.tools_called).intersection(
-                set(self.expected_tools)
-            )
-            missing = set(self.expected_tools) - used_expected
-            if self._calculate_non_exact_match_score() == 1:
-                return f"All expected tools {expected_tools_names} were called (order not considered)."
-            else:
-                return f"Incomplete tool usage: missing tools {list(missing)}; expected {expected_tools_names}, called {tools_called_names}. See more details above."
-
-    ##################################################
-    ### Score Helper Functions #######################
-    ##################################################
-
-    # Calculate score
-    def _calculate_score(self):
-        if self.should_exact_match:
-            score = self._calculate_exact_match_score()
-        elif self.should_consider_ordering:
-            _, weighted_length = self._compute_weighted_lcs()
-            score = weighted_length / len(self.expected_tools)
-        else:
-            score = self._calculate_non_exact_match_score()
-        return 0 if self.strict_mode and score < self.threshold else score

-    # Exact matching score
-    def _calculate_exact_match_score(self):
-        if len(self.tools_called) != len(self.expected_tools):
-            return 0.0
-        for i in range(len(self.tools_called)):
-            if self.tools_called[i].name != self.expected_tools[i].name:
-                return 0.0
-            if ToolCallParams.INPUT_PARAMETERS in self.evaluation_params:
-                if (
-                    self.tools_called[i].input_parameters
-                    != self.expected_tools[i].input_parameters
-                ):
-                    return 0.0
-            if ToolCallParams.OUTPUT in self.evaluation_params:
-                if self.tools_called[i].output != self.expected_tools[i].output:
-                    return 0.0
-        return 1.0
-
-    # Non exact matching score
-    def _calculate_non_exact_match_score(self):
-        total_score = 0.0
-        matched_called_tools = set()
-        for expected_tool in self.expected_tools:
-            best_score = 0.0
-            for called_tool in self.tools_called:
-                if called_tool in matched_called_tools:
-                    continue
-                if expected_tool.name == called_tool.name:
-                    match_score = 1.0
-                    if (
-                        ToolCallParams.INPUT_PARAMETERS
-                        in self.evaluation_params
-                    ):
-                        match_score *= self._compare_dicts(
-                            expected_tool.input_parameters,
-                            called_tool.input_parameters,
-                        )
-                    if (
-                        ToolCallParams.OUTPUT in self.evaluation_params
-                        and expected_tool.output != called_tool.output
-                    ):
-                        match_score = 0.0
-                    if match_score > best_score:
-                        best_score = match_score
-                        best_called_tool = called_tool
-            if best_score > 0:
-                total_score += best_score
-                matched_called_tools.add(best_called_tool)
-        return (
-            total_score / len(self.expected_tools)
-            if self.expected_tools
-            else 0.0
-        )
-
-    # Consider ordering score
-    def _compute_weighted_lcs(self):
-        m, n = len(self.expected_tools), len(self.tools_called)
-        dp = [[0.0] * (n + 1) for _ in range(m + 1)]
-        for i in range(1, m + 1):
-            for j in range(1, n + 1):
-                expected_tool, called_tool = (
-                    self.expected_tools[i - 1],
-                    self.tools_called[j - 1],
-                )
-                if expected_tool.name != called_tool.name:
-                    dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
-                    continue
-                score = 1.0
-                if ToolCallParams.INPUT_PARAMETERS in self.evaluation_params:
-                    score *= self._compare_dicts(
-                        expected_tool.input_parameters,
-                        called_tool.input_parameters,
-                    )
-                if (
-                    ToolCallParams.OUTPUT in self.evaluation_params
-                    and expected_tool.output != called_tool.output
-                ):
-                    score = 0.0
-                dp[i][j] = max(
-                    dp[i - 1][j],
-                    dp[i][j - 1],
-                    dp[i - 1][j - 1] + score if score > 0 else 0,
-                )
-        i, j, total_score = m, n, 0.0
-        lcs = []
-        while i > 0 and j > 0:
-            if dp[i][j] == dp[i - 1][j]:
-                i -= 1
-            elif dp[i][j] == dp[i][j - 1]:
-                j -= 1
-            else:
-                lcs.append(self.expected_tools[i - 1])
-                total_score += dp[i][j] - dp[i - 1][j - 1]
-                i, j = i - 1, j - 1
-        return lcs[::-1], total_score
-
-    # For matching input parameters
-    def _compare_dicts(self, dict1: Dict, dict2: Dict):
-        if self.should_exact_match:
-            return 1.0 if dict1 == dict2 else 0.0
-        match_score = 0
-        matched_keys = set(dict1.keys()).intersection(set(dict2.keys()))
-        total_keys = set(dict1.keys()).union(set(dict2.keys()))
-        for key in matched_keys:
-            if dict1[key] == dict2[key]:
-                match_score += 1 / len(total_keys)
-            elif isinstance(dict1[key], dict) and isinstance(dict2[key], dict):
-                match_score += self._compare_dicts(
-                    dict1[key], dict2[key]
-                ) / len(total_keys)
-        return match_score
-
-    ##################################################
-    ### Others #######################################
-    ##################################################
-
-    def is_successful(self) -> bool:
-        try:
-            self.success = self.score >= self.threshold
-        except:
-            self.success = False
-        return self.success
-
-    @property
-    def __name__(self):
-        return "Multi Modal Tool Correctness"
-
-    def indent_multiline_string(self, s, indent_level=4):
-        indent = " " * indent_level
-        return "\n".join(f"{indent}{line}" for line in s.splitlines())
deepeval/models/mlllms/__init__.py (deleted)
@@ -1,4 +0,0 @@
-from .openai_model import MultimodalOpenAIModel
-from .ollama_model import MultimodalOllamaModel
-from .gemini_model import MultimodalGeminiModel
-from .azure_model import MultimodalAzureOpenAIMLLMModel
deepeval/models/mlllms/azure_model.py (deleted)
@@ -1,334 +0,0 @@
-from openai.types.chat.chat_completion import ChatCompletion
-from openai import AzureOpenAI, AsyncAzureOpenAI
-from typing import Optional, Tuple, Union, Dict, List
-from pydantic import BaseModel
-from io import BytesIO
-import base64
-
-from deepeval.models import DeepEvalBaseMLLM
-from deepeval.key_handler import ModelKeyValues, KEY_FILE_HANDLER
-from deepeval.test_case import MLLMImage
-from deepeval.models.llms.openai_model import (
-    structured_outputs_models,
-    json_mode_models,
-    model_pricing,
-)
-from deepeval.models.retry_policy import (
-    create_retry_decorator,
-    sdk_retries_for,
-)
-
-from deepeval.models.llms.utils import trim_and_load_json
-from deepeval.models.utils import parse_model_name
-from deepeval.constants import ProviderSlug as PS
-
-
-retry_azure = create_retry_decorator(PS.AZURE)
-
-
-class MultimodalAzureOpenAIMLLMModel(DeepEvalBaseMLLM):
-    def __init__(
-        self,
-        deployment_name: Optional[str] = None,
-        model_name: Optional[str] = None,
-        azure_openai_api_key: Optional[str] = None,
-        openai_api_version: Optional[str] = None,
-        azure_endpoint: Optional[str] = None,
-        temperature: float = 0,
-        generation_kwargs: Optional[Dict] = None,
-        **kwargs,
-    ):
-        # fetch Azure deployment parameters
-        model_name = model_name or KEY_FILE_HANDLER.fetch_data(
-            ModelKeyValues.AZURE_MODEL_NAME
-        )
-        self.deployment_name = deployment_name or KEY_FILE_HANDLER.fetch_data(
-            ModelKeyValues.AZURE_DEPLOYMENT_NAME
-        )
-        self.azure_openai_api_key = (
-            azure_openai_api_key
-            or KEY_FILE_HANDLER.fetch_data(ModelKeyValues.AZURE_OPENAI_API_KEY)
-        )
-        self.openai_api_version = (
-            openai_api_version
-            or KEY_FILE_HANDLER.fetch_data(ModelKeyValues.OPENAI_API_VERSION)
-        )
-        self.azure_endpoint = azure_endpoint or KEY_FILE_HANDLER.fetch_data(
-            ModelKeyValues.AZURE_OPENAI_ENDPOINT
-        )
-        if temperature < 0:
-            raise ValueError("Temperature must be >= 0.")
-        self.temperature = temperature
-
-        # args and kwargs will be passed to the underlying model, in load_model function
-        self.kwargs = kwargs
-        self.generation_kwargs = generation_kwargs or {}
-        super().__init__(parse_model_name(model_name))
-
-    ###############################################
-    # Generate functions
-    ###############################################
-
-    @retry_azure
-    def generate(
-        self,
-        multimodal_input: List[Union[str, MLLMImage]],
-        schema: Optional[BaseModel] = None,
-    ) -> Tuple[Union[str, BaseModel], float]:
-        client = self.load_model(async_mode=False)
-        prompt = self.generate_prompt(multimodal_input)
-
-        if schema:
-            if self.model_name in structured_outputs_models:
-                messages = [{"role": "user", "content": prompt}]
-                completion = client.beta.chat.completions.parse(
-                    model=self.deployment_name,
-                    messages=messages,
-                    response_format=schema,
-                    temperature=self.temperature,
-                )
-                structured_output: BaseModel = completion.choices[
-                    0
-                ].message.parsed
-                cost = self.calculate_cost(
-                    completion.usage.prompt_tokens,
-                    completion.usage.completion_tokens,
-                )
-                return structured_output, cost
-            if self.model_name in json_mode_models:
-                messages = [{"role": "user", "content": prompt}]
-                completion = client.beta.chat.completions.parse(
-                    model=self.deployment_name,
-                    messages=messages,
-                    response_format={"type": "json_object"},
-                    temperature=self.temperature,
-                )
-                json_output = trim_and_load_json(
-                    completion.choices[0].message.content
-                )
-                cost = self.calculate_cost(
-                    completion.usage.prompt_tokens,
-                    completion.usage.completion_tokens,
-                )
-                return schema.model_validate(json_output), cost
-        print("Loading model client:")
-        print(client.base_url)
-        completion = client.chat.completions.create(
-            model=self.deployment_name,
-            messages=[{"role": "user", "content": prompt}],
-            temperature=self.temperature,
-            **self.generation_kwargs,
-        )
-        output = completion.choices[0].message.content
-        cost = self.calculate_cost(
-            completion.usage.prompt_tokens, completion.usage.completion_tokens
-        )
-        if schema:
-            json_output = trim_and_load_json(output)
-            return schema.model_validate(json_output), cost
-        else:
-            return output, cost
-
-    @retry_azure
-    async def a_generate(
-        self,
-        multimodal_input: List[Union[str, MLLMImage]],
-        schema: Optional[BaseModel] = None,
-    ) -> Tuple[Union[str, BaseModel], float]:
-        client = self.load_model(async_mode=True)
-        prompt = self.generate_prompt(multimodal_input)
-
-        if schema:
-            if self.model_name in structured_outputs_models:
-                messages = [{"role": "user", "content": prompt}]
-                completion = await client.beta.chat.completions.parse(
-                    model=self.deployment_name,
-                    messages=messages,
-                    response_format=schema,
-                    temperature=self.temperature,
-                )
-                structured_output: BaseModel = completion.choices[
-                    0
-                ].message.parsed
-                cost = self.calculate_cost(
-                    completion.usage.prompt_tokens,
-                    completion.usage.completion_tokens,
-                )
-                return structured_output, cost
-            if self.model_name in json_mode_models:
-                messages = [{"role": "user", "content": prompt}]
-                completion = await client.beta.chat.completions.parse(
-                    model=self.deployment_name,
-                    messages=messages,
-                    response_format={"type": "json_object"},
-                    temperature=self.temperature,
-                    **self.generation_kwargs,
-                )
-                json_output = trim_and_load_json(
-                    completion.choices[0].message.content
-                )
-                cost = self.calculate_cost(
-                    completion.usage.prompt_tokens,
-                    completion.usage.completion_tokens,
-                )
-                return schema.model_validate(json_output), cost
-
-        completion = await client.chat.completions.create(
-            model=self.deployment_name,
-            messages=[{"role": "user", "content": prompt}],
-            temperature=self.temperature,
-            **self.generation_kwargs,
-        )
-        output = completion.choices[0].message.content
-        cost = self.calculate_cost(
-            completion.usage.prompt_tokens,
-            completion.usage.completion_tokens,
-        )
-        if schema:
-            json_output = trim_and_load_json(output)
-            return schema.model_validate(json_output), cost
-        else:
-            return output, cost
-
-    ###############################################
-    # Other generate functions
-    ###############################################
-
-    @retry_azure
-    def generate_raw_response(
-        self,
-        multimodal_input: List[Union[str, MLLMImage]],
-        top_logprobs: int = 5,
-    ) -> Tuple[ChatCompletion, float]:
-        client = self.load_model(async_mode=False)
-        prompt = self.generate_prompt(multimodal_input)
-        messages = [{"role": "user", "content": prompt}]
-
-        # Generate completion
-        completion = client.chat.completions.create(
-            model=self.deployment_name,
-            messages=messages,
-            temperature=self.temperature,
-            logprobs=True,
-            top_logprobs=top_logprobs,
-            **self.generation_kwargs,
-        )
-        # Cost calculation
-        input_tokens = completion.usage.prompt_tokens
-        output_tokens = completion.usage.completion_tokens
-        cost = self.calculate_cost(input_tokens, output_tokens)
-
-        return completion, cost
-
-    @retry_azure
-    async def a_generate_raw_response(
-        self,
-        multimodal_input: List[Union[str, MLLMImage]],
-        top_logprobs: int = 5,
-    ) -> Tuple[ChatCompletion, float]:
-        client = self.load_model(async_mode=True)
-        prompt = self.generate_prompt(multimodal_input)
-        messages = [{"role": "user", "content": prompt}]
-
-        # Generate completion
-        completion = await client.chat.completions.create(
-            model=self.deployment_name,
-            messages=messages,
-            temperature=self.temperature,
-            logprobs=True,
-            top_logprobs=top_logprobs,
-            **self.generation_kwargs,
-        )
-        # Cost calculation
-        input_tokens = completion.usage.prompt_tokens
-        output_tokens = completion.usage.completion_tokens
-        cost = self.calculate_cost(input_tokens, output_tokens)
-
-        return completion, cost
-
-    ###############################################
-    # Utilities
-    ###############################################
-
-    def generate_prompt(
-        self, multimodal_input: List[Union[str, MLLMImage]] = []
-    ):
-        """Convert multimodal input into the proper message format for Azure OpenAI."""
-        prompt = []
-        for ele in multimodal_input:
-            if isinstance(ele, str):
-                prompt.append({"type": "text", "text": ele})
-            elif isinstance(ele, MLLMImage):
-                if ele.local:
-                    import PIL.Image
-
-                    image = PIL.Image.open(ele.url)
-                    visual_dict = {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": f"data:image/jpeg;base64,{self.encode_pil_image(image)}"
-                        },
-                    }
-                else:
-                    visual_dict = {
-                        "type": "image_url",
-                        "image_url": {"url": ele.url},
-                    }
-                prompt.append(visual_dict)
-        return prompt
-
-    def encode_pil_image(self, pil_image):
-        """Encode a PIL image to base64 string."""
-        image_buffer = BytesIO()
-        if pil_image.mode in ("RGBA", "LA", "P"):
-            pil_image = pil_image.convert("RGB")
-        pil_image.save(image_buffer, format="JPEG")
-        image_bytes = image_buffer.getvalue()
-        base64_encoded_image = base64.b64encode(image_bytes).decode("utf-8")
-        return base64_encoded_image
-
-    def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
-        pricing = model_pricing.get(self.model_name, model_pricing["gpt-4.1"])
-        input_cost = input_tokens * pricing["input"]
-        output_cost = output_tokens * pricing["output"]
-        return input_cost + output_cost
-
-    ###############################################
-    # Model
-    ###############################################
-
-    def get_model_name(self):
-        return f"Azure OpenAI ({self.model_name})"
-
-    def load_model(self, async_mode: bool = False):
-        if not async_mode:
-            return self._build_client(AzureOpenAI)
-        return self._build_client(AsyncAzureOpenAI)
-
-    def _client_kwargs(self) -> Dict:
-        """
-        If Tenacity is managing retries, force OpenAI SDK retries off to avoid double retries.
-        If the user opts into SDK retries for 'azure' via DEEPEVAL_SDK_RETRY_PROVIDERS,
-        leave their retry settings as is.
-        """
-        kwargs = dict(self.kwargs or {})
-        if not sdk_retries_for(PS.AZURE):
-            kwargs["max_retries"] = 0
-        return kwargs
-
-    def _build_client(self, cls):
-        kw = dict(
-            api_key=self.azure_openai_api_key,
-            api_version=self.openai_api_version,
-            azure_endpoint=self.azure_endpoint,
-            azure_deployment=self.deployment_name,
-            **self._client_kwargs(),
-        )
-        try:
-            return cls(**kw)
-        except TypeError as e:
-            # older OpenAI SDKs may not accept max_retries, in that case remove and retry once
-            if "max_retries" in str(e):
-                kw.pop("max_retries", None)
-                return cls(**kw)
-            raise
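The removed Azure model's generate_prompt/encode_pil_image pair shows the general pattern for sending local images to an OpenAI-compatible chat endpoint: re-encode the file as JPEG and inline it as a base64 data URL, while remote images are passed through by URL. Below is a self-contained sketch of that pattern; the helper name is illustrative, not a deepeval API, and it assumes Pillow is installed, as the removed code did.

import base64
from io import BytesIO
from typing import Dict

from PIL import Image  # Pillow


def image_content_part(path_or_url: str, local: bool) -> Dict:
    # Remote images can be referenced directly by URL.
    if not local:
        return {"type": "image_url", "image_url": {"url": path_or_url}}
    # Local files are re-encoded to JPEG (dropping any alpha channel) and
    # inlined as a base64 data URL, as encode_pil_image did above.
    image = Image.open(path_or_url)
    if image.mode in ("RGBA", "LA", "P"):
        image = image.convert("RGB")
    buffer = BytesIO()
    image.save(buffer, format="JPEG")
    b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
    return {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{b64}"},
    }


# One text part plus one image part form a single multimodal user message:
# messages = [{"role": "user", "content": [
#     {"type": "text", "text": "Describe this image."},
#     image_content_part("chart.png", local=True),
# ]}]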