deepeval 3.7.1__py3-none-any.whl → 3.7.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deepeval/_version.py CHANGED
@@ -1 +1 @@
- __version__: str = "3.7.1"
+ __version__: str = "3.7.3"
@@ -92,7 +92,7 @@ class HumanEval(DeepEvalBaseBenchmark):
  self.predictions: Optional[pd.DataFrame] = None
  self.task_scores: Optional[pd.DataFrame] = None
  self.overall_score: Optional[float] = None
- self.verbose_mode: bool = (False,)
+ self.verbose_mode: bool = verbose_mode

  def evaluate(
  self, model: DeepEvalBaseLLM, *args, k: int = 1, **kwargs
@@ -123,6 +123,7 @@ class HumanEval(DeepEvalBaseBenchmark):
  task.value,
  golden.input,
  prediction,
+ task_correct,
  golden.expected_output,
  score,
  )
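The first HumanEval hunk makes the benchmark store the verbose_mode argument instead of a hard-coded (False,) tuple. A minimal sketch of how the flag would be exercised after this fix (import path and constructor keyword assumed from deepeval's public benchmark API; `model` stands for any DeepEvalBaseLLM implementation):

# Hypothetical usage sketch, not part of the diff.
from deepeval.benchmarks import HumanEval

benchmark = HumanEval(verbose_mode=True)   # previously stored as (False,) regardless of the argument
benchmark.evaluate(model=model, k=1)       # k matches the evaluate() signature shown above
print(benchmark.overall_score)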
@@ -189,17 +189,35 @@ class EvaluationDataset:
  test_case._dataset_alias = self._alias
  test_case._dataset_id = self._id
  if isinstance(test_case, LLMTestCase):
+ if self._conversational_goldens or self._conversational_test_cases:
+ raise TypeError(
+ "You cannot add 'LLMTestCase' to a multi-turn dataset."
+ )
  test_case._dataset_rank = len(self._llm_test_cases)
  self._llm_test_cases.append(test_case)
  elif isinstance(test_case, ConversationalTestCase):
+ if self._goldens or self._llm_test_cases:
+ raise TypeError(
+ "You cannot add 'ConversationalTestCase' to a single-turn dataset."
+ )
+ self._multi_turn = True
  test_case._dataset_rank = len(self._conversational_test_cases)
  self._conversational_test_cases.append(test_case)

  def add_golden(self, golden: Union[Golden, ConversationalGolden]):
- if self._multi_turn:
- self._add_conversational_golden(golden)
- else:
+ if isinstance(golden, Golden):
+ if self._conversational_goldens or self._conversational_test_cases:
+ raise TypeError(
+ "You cannot add 'Golden' to a multi-turn dataset."
+ )
  self._add_golden(golden)
+ else:
+ if self._goldens or self._llm_test_cases:
+ raise TypeError(
+ "You cannot add 'ConversationalGolden' to a single-turn dataset."
+ )
+ self._multi_turn = True
+ self._add_conversational_golden(golden)

  def _add_golden(self, golden: Union[Golden, ConversationalGolden]):
  if isinstance(golden, Golden):
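With this hunk, add_test_case and add_golden enforce that a dataset stays either single-turn or multi-turn rather than silently mixing the two. A minimal sketch of the new behaviour (import paths assumed from deepeval's public API; the golden fields are illustrative):

# Hypothetical sketch of the new single-turn / multi-turn guard.
from deepeval.dataset import EvaluationDataset, Golden, ConversationalGolden

dataset = EvaluationDataset()
dataset.add_golden(Golden(input="What is DeepEval?"))  # dataset is now single-turn

try:
    dataset.add_golden(ConversationalGolden(scenario="User asks for a refund."))
except TypeError as e:
    print(e)  # "You cannot add 'ConversationalGolden' to a single-turn dataset."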
@@ -224,16 +242,16 @@ class EvaluationDataset:
  file_path: str,
  input_col_name: str,
  actual_output_col_name: str,
- expected_output_col_name: Optional[str] = None,
- context_col_name: Optional[str] = None,
+ expected_output_col_name: Optional[str] = "expected_output",
+ context_col_name: Optional[str] = "context",
  context_col_delimiter: str = ";",
- retrieval_context_col_name: Optional[str] = None,
+ retrieval_context_col_name: Optional[str] = "retrieval_context",
  retrieval_context_col_delimiter: str = ";",
- tools_called_col_name: Optional[str] = None,
+ tools_called_col_name: Optional[str] = "tools_called",
  tools_called_col_delimiter: str = ";",
- expected_tools_col_name: Optional[str] = None,
+ expected_tools_col_name: Optional[str] = "expected_tools",
  expected_tools_col_delimiter: str = ";",
- additional_metadata_col_name: Optional[str] = None,
+ additional_metadata_col_name: Optional[str] = "additional_metadata",
  ):
  """
  Load test cases from a CSV file.
@@ -379,6 +397,7 @@ class EvaluationDataset:
  retrieval_context_key_name: Optional[str] = None,
  tools_called_key_name: Optional[str] = None,
  expected_tools_key_name: Optional[str] = None,
+ addtional_metadata_key_name: Optional[str] = None,
  encoding_type: str = "utf-8",
  ):
  """
@@ -431,6 +450,7 @@ class EvaluationDataset:
  tools_called = [ToolCall(**tool) for tool in tools_called_data]
  expected_tools_data = json_obj.get(expected_tools_key_name, [])
  expected_tools = [ToolCall(**tool) for tool in expected_tools_data]
+ # additional_metadata = json_obj.get(addtional_metadata_key_name)

  self.add_test_case(
  LLMTestCase(
@@ -441,6 +461,7 @@ class EvaluationDataset:
  retrieval_context=retrieval_context,
  tools_called=tools_called,
  expected_tools=expected_tools,
+ # additional_metadata=additional_metadata,
  )
  )

@@ -460,8 +481,8 @@ class EvaluationDataset:
  expected_tools_col_delimiter: str = ";",
  comments_key_name: str = "comments",
  name_key_name: str = "name",
- source_file_col_name: Optional[str] = None,
- additional_metadata_col_name: Optional[str] = None,
+ source_file_col_name: Optional[str] = "source_file",
+ additional_metadata_col_name: Optional[str] = "additional_metadata",
  scenario_col_name: Optional[str] = "scenario",
  turns_col_name: Optional[str] = "turns",
  expected_outcome_col_name: Optional[str] = "expected_outcome",
@@ -587,6 +608,7 @@ class EvaluationDataset:
  context=context,
  comments=comments,
  name=name,
+ additional_metadata=additional_metadata,
  )
  )
  else:
@@ -645,6 +667,7 @@ class EvaluationDataset:
  comments = json_obj.get(comments_key_name)
  name = json_obj.get(name_key_name)
  parsed_turns = parse_turns(turns) if turns else []
+ additional_metadata = json_obj.get(additional_metadata_key_name)

  self._multi_turn = True
  self.goldens.append(
@@ -656,6 +679,7 @@ class EvaluationDataset:
  context=context,
  comments=comments,
  name=name,
+ additional_metadata=additional_metadata,
  )
  )
  else:
deepeval/dataset/utils.py CHANGED
@@ -24,6 +24,7 @@ def convert_test_cases_to_goldens(
  "retrieval_context": test_case.retrieval_context,
  "tools_called": test_case.tools_called,
  "expected_tools": test_case.expected_tools,
+ "additional_metadata": test_case.additional_metadata,
  }
  goldens.append(Golden(**golden))
  return goldens
@@ -70,6 +71,7 @@ def convert_convo_test_cases_to_convo_goldens(
  "expected_outcome": test_case.expected_outcome,
  "user_description": test_case.user_description,
  "context": test_case.context,
+ "additional_metadata": test_case.additional_metadata,
  }
  goldens.append(ConversationalGolden(**golden))
  return goldens
@@ -502,10 +502,14 @@ def wrap_up_experiment(

  try:
  api = Api()
- experiment_request = PostExperimentRequest(testRuns=test_runs, name=name)
+ experiment_request = PostExperimentRequest(
+ testRuns=test_runs, name=name
+ )

  try:
- body = experiment_request.model_dump(by_alias=True, exclude_none=True)
+ body = experiment_request.model_dump(
+ by_alias=True, exclude_none=True
+ )
  except AttributeError:
  body = experiment_request.dict(by_alias=True, exclude_none=True)
  json_str = json.dumps(body, cls=TestRunEncoder)
deepeval/metrics/utils.py CHANGED
@@ -25,6 +25,7 @@ from deepeval.models import (
  MultimodalOpenAIModel,
  MultimodalGeminiModel,
  MultimodalOllamaModel,
+ MultimodalAzureOpenAIMLLMModel,
  AmazonBedrockModel,
  LiteLLMModel,
  KimiModel,
@@ -514,6 +515,8 @@ def initialize_multimodal_model(
  return MultimodalGeminiModel(), True
  if should_use_ollama_model():
  return MultimodalOllamaModel(), True
+ elif should_use_azure_openai():
+ return MultimodalAzureOpenAIMLLMModel(model_name=model), True
  elif isinstance(model, str) or model is None:
  return MultimodalOpenAIModel(model=model), True
  raise TypeError(
@@ -21,6 +21,7 @@ from deepeval.models.mlllms import (
  MultimodalOpenAIModel,
  MultimodalOllamaModel,
  MultimodalGeminiModel,
+ MultimodalAzureOpenAIMLLMModel,
  )
  from deepeval.models.embedding_models import (
  OpenAIEmbeddingModel,
@@ -48,6 +49,7 @@ __all__ = [
  "MultimodalOpenAIModel",
  "MultimodalOllamaModel",
  "MultimodalGeminiModel",
+ "MultimodalAzureOpenAIMLLMModel",
  "OpenAIEmbeddingModel",
  "AzureOpenAIEmbeddingModel",
  "LocalEmbeddingModel",
@@ -1,3 +1,4 @@
  from .openai_model import MultimodalOpenAIModel
  from .ollama_model import MultimodalOllamaModel
  from .gemini_model import MultimodalGeminiModel
+ from .azure_model import MultimodalAzureOpenAIMLLMModel
@@ -0,0 +1,334 @@
+ from openai.types.chat.chat_completion import ChatCompletion
+ from openai import AzureOpenAI, AsyncAzureOpenAI
+ from typing import Optional, Tuple, Union, Dict, List
+ from pydantic import BaseModel
+ from io import BytesIO
+ import base64
+
+ from deepeval.models import DeepEvalBaseMLLM
+ from deepeval.key_handler import ModelKeyValues, KEY_FILE_HANDLER
+ from deepeval.test_case import MLLMImage
+ from deepeval.models.llms.openai_model import (
+ structured_outputs_models,
+ json_mode_models,
+ model_pricing,
+ )
+ from deepeval.models.retry_policy import (
+ create_retry_decorator,
+ sdk_retries_for,
+ )
+
+ from deepeval.models.llms.utils import trim_and_load_json
+ from deepeval.models.utils import parse_model_name
+ from deepeval.constants import ProviderSlug as PS
+
+
+ retry_azure = create_retry_decorator(PS.AZURE)
+
+
+ class MultimodalAzureOpenAIMLLMModel(DeepEvalBaseMLLM):
+ def __init__(
+ self,
+ deployment_name: Optional[str] = None,
+ model_name: Optional[str] = None,
+ azure_openai_api_key: Optional[str] = None,
+ openai_api_version: Optional[str] = None,
+ azure_endpoint: Optional[str] = None,
+ temperature: float = 0,
+ generation_kwargs: Optional[Dict] = None,
+ **kwargs,
+ ):
+ # fetch Azure deployment parameters
+ model_name = model_name or KEY_FILE_HANDLER.fetch_data(
+ ModelKeyValues.AZURE_MODEL_NAME
+ )
+ self.deployment_name = deployment_name or KEY_FILE_HANDLER.fetch_data(
+ ModelKeyValues.AZURE_DEPLOYMENT_NAME
+ )
+ self.azure_openai_api_key = (
+ azure_openai_api_key
+ or KEY_FILE_HANDLER.fetch_data(ModelKeyValues.AZURE_OPENAI_API_KEY)
+ )
+ self.openai_api_version = (
+ openai_api_version
+ or KEY_FILE_HANDLER.fetch_data(ModelKeyValues.OPENAI_API_VERSION)
+ )
+ self.azure_endpoint = azure_endpoint or KEY_FILE_HANDLER.fetch_data(
+ ModelKeyValues.AZURE_OPENAI_ENDPOINT
+ )
+ if temperature < 0:
+ raise ValueError("Temperature must be >= 0.")
+ self.temperature = temperature
+
+ # args and kwargs will be passed to the underlying model, in load_model function
+ self.kwargs = kwargs
+ self.generation_kwargs = generation_kwargs or {}
+ super().__init__(parse_model_name(model_name))
+
+ ###############################################
+ # Generate functions
+ ###############################################
+
+ @retry_azure
+ def generate(
+ self,
+ multimodal_input: List[Union[str, MLLMImage]],
+ schema: Optional[BaseModel] = None,
+ ) -> Tuple[Union[str, BaseModel], float]:
+ client = self.load_model(async_mode=False)
+ prompt = self.generate_prompt(multimodal_input)
+
+ if schema:
+ if self.model_name in structured_outputs_models:
+ messages = [{"role": "user", "content": prompt}]
+ completion = client.beta.chat.completions.parse(
+ model=self.deployment_name,
+ messages=messages,
+ response_format=schema,
+ temperature=self.temperature,
+ )
+ structured_output: BaseModel = completion.choices[
+ 0
+ ].message.parsed
+ cost = self.calculate_cost(
+ completion.usage.prompt_tokens,
+ completion.usage.completion_tokens,
+ )
+ return structured_output, cost
+ if self.model_name in json_mode_models:
+ messages = [{"role": "user", "content": prompt}]
+ completion = client.beta.chat.completions.parse(
+ model=self.deployment_name,
+ messages=messages,
+ response_format={"type": "json_object"},
+ temperature=self.temperature,
+ )
+ json_output = trim_and_load_json(
+ completion.choices[0].message.content
+ )
+ cost = self.calculate_cost(
+ completion.usage.prompt_tokens,
+ completion.usage.completion_tokens,
+ )
+ return schema.model_validate(json_output), cost
+ print("Loading model client:")
+ print(client.base_url)
+ completion = client.chat.completions.create(
+ model=self.deployment_name,
+ messages=[{"role": "user", "content": prompt}],
+ temperature=self.temperature,
+ **self.generation_kwargs,
+ )
+ output = completion.choices[0].message.content
+ cost = self.calculate_cost(
+ completion.usage.prompt_tokens, completion.usage.completion_tokens
+ )
+ if schema:
+ json_output = trim_and_load_json(output)
+ return schema.model_validate(json_output), cost
+ else:
+ return output, cost
+
+ @retry_azure
+ async def a_generate(
+ self,
+ multimodal_input: List[Union[str, MLLMImage]],
+ schema: Optional[BaseModel] = None,
+ ) -> Tuple[Union[str, BaseModel], float]:
+ client = self.load_model(async_mode=True)
+ prompt = self.generate_prompt(multimodal_input)
+
+ if schema:
+ if self.model_name in structured_outputs_models:
+ messages = [{"role": "user", "content": prompt}]
+ completion = await client.beta.chat.completions.parse(
+ model=self.deployment_name,
+ messages=messages,
+ response_format=schema,
+ temperature=self.temperature,
+ )
+ structured_output: BaseModel = completion.choices[
+ 0
+ ].message.parsed
+ cost = self.calculate_cost(
+ completion.usage.prompt_tokens,
+ completion.usage.completion_tokens,
+ )
+ return structured_output, cost
+ if self.model_name in json_mode_models:
+ messages = [{"role": "user", "content": prompt}]
+ completion = await client.beta.chat.completions.parse(
+ model=self.deployment_name,
+ messages=messages,
+ response_format={"type": "json_object"},
+ temperature=self.temperature,
+ **self.generation_kwargs,
+ )
+ json_output = trim_and_load_json(
+ completion.choices[0].message.content
+ )
+ cost = self.calculate_cost(
+ completion.usage.prompt_tokens,
+ completion.usage.completion_tokens,
+ )
+ return schema.model_validate(json_output), cost
+
+ completion = await client.chat.completions.create(
+ model=self.deployment_name,
+ messages=[{"role": "user", "content": prompt}],
+ temperature=self.temperature,
+ **self.generation_kwargs,
+ )
+ output = completion.choices[0].message.content
+ cost = self.calculate_cost(
+ completion.usage.prompt_tokens,
+ completion.usage.completion_tokens,
+ )
+ if schema:
+ json_output = trim_and_load_json(output)
+ return schema.model_validate(json_output), cost
+ else:
+ return output, cost
+
+ ###############################################
+ # Other generate functions
+ ###############################################
+
+ @retry_azure
+ def generate_raw_response(
+ self,
+ multimodal_input: List[Union[str, MLLMImage]],
+ top_logprobs: int = 5,
+ ) -> Tuple[ChatCompletion, float]:
+ client = self.load_model(async_mode=False)
+ prompt = self.generate_prompt(multimodal_input)
+ messages = [{"role": "user", "content": prompt}]
+
+ # Generate completion
+ completion = client.chat.completions.create(
+ model=self.deployment_name,
+ messages=messages,
+ temperature=self.temperature,
+ logprobs=True,
+ top_logprobs=top_logprobs,
+ **self.generation_kwargs,
+ )
+ # Cost calculation
+ input_tokens = completion.usage.prompt_tokens
+ output_tokens = completion.usage.completion_tokens
+ cost = self.calculate_cost(input_tokens, output_tokens)
+
+ return completion, cost
+
+ @retry_azure
+ async def a_generate_raw_response(
+ self,
+ multimodal_input: List[Union[str, MLLMImage]],
+ top_logprobs: int = 5,
+ ) -> Tuple[ChatCompletion, float]:
+ client = self.load_model(async_mode=True)
+ prompt = self.generate_prompt(multimodal_input)
+ messages = [{"role": "user", "content": prompt}]
+
+ # Generate completion
+ completion = await client.chat.completions.create(
+ model=self.deployment_name,
+ messages=messages,
+ temperature=self.temperature,
+ logprobs=True,
+ top_logprobs=top_logprobs,
+ **self.generation_kwargs,
+ )
+ # Cost calculation
+ input_tokens = completion.usage.prompt_tokens
+ output_tokens = completion.usage.completion_tokens
+ cost = self.calculate_cost(input_tokens, output_tokens)
+
+ return completion, cost
+
+ ###############################################
+ # Utilities
+ ###############################################
+
+ def generate_prompt(
+ self, multimodal_input: List[Union[str, MLLMImage]] = []
+ ):
+ """Convert multimodal input into the proper message format for Azure OpenAI."""
+ prompt = []
+ for ele in multimodal_input:
+ if isinstance(ele, str):
+ prompt.append({"type": "text", "text": ele})
+ elif isinstance(ele, MLLMImage):
+ if ele.local:
+ import PIL.Image
+
+ image = PIL.Image.open(ele.url)
+ visual_dict = {
+ "type": "image_url",
+ "image_url": {
+ "url": f"data:image/jpeg;base64,{self.encode_pil_image(image)}"
+ },
+ }
+ else:
+ visual_dict = {
+ "type": "image_url",
+ "image_url": {"url": ele.url},
+ }
+ prompt.append(visual_dict)
+ return prompt
+
+ def encode_pil_image(self, pil_image):
+ """Encode a PIL image to base64 string."""
+ image_buffer = BytesIO()
+ if pil_image.mode in ("RGBA", "LA", "P"):
+ pil_image = pil_image.convert("RGB")
+ pil_image.save(image_buffer, format="JPEG")
+ image_bytes = image_buffer.getvalue()
+ base64_encoded_image = base64.b64encode(image_bytes).decode("utf-8")
+ return base64_encoded_image
+
+ def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
+ pricing = model_pricing.get(self.model_name, model_pricing["gpt-4.1"])
+ input_cost = input_tokens * pricing["input"]
+ output_cost = output_tokens * pricing["output"]
+ return input_cost + output_cost
+
+ ###############################################
+ # Model
+ ###############################################
+
+ def get_model_name(self):
+ return f"Azure OpenAI ({self.model_name})"
+
+ def load_model(self, async_mode: bool = False):
+ if not async_mode:
+ return self._build_client(AzureOpenAI)
+ return self._build_client(AsyncAzureOpenAI)
+
+ def _client_kwargs(self) -> Dict:
+ """
+ If Tenacity is managing retries, force OpenAI SDK retries off to avoid double retries.
+ If the user opts into SDK retries for 'azure' via DEEPEVAL_SDK_RETRY_PROVIDERS,
+ leave their retry settings as is.
+ """
+ kwargs = dict(self.kwargs or {})
+ if not sdk_retries_for(PS.AZURE):
+ kwargs["max_retries"] = 0
+ return kwargs
+
+ def _build_client(self, cls):
+ kw = dict(
+ api_key=self.azure_openai_api_key,
+ api_version=self.openai_api_version,
+ azure_endpoint=self.azure_endpoint,
+ azure_deployment=self.deployment_name,
+ **self._client_kwargs(),
+ )
+ try:
+ return cls(**kw)
+ except TypeError as e:
+ # older OpenAI SDKs may not accept max_retries, in that case remove and retry once
+ if "max_retries" in str(e):
+ kw.pop("max_retries", None)
+ return cls(**kw)
+ raise
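This new file adds an Azure OpenAI backend for multimodal (MLLM) evaluation. A minimal usage sketch based on the constructor and generate() signature above; the endpoint, deployment, key, and image values are placeholders, and any argument left out is read from the deepeval key file:

# Hypothetical sketch, not part of the diff; values below are placeholders.
from deepeval.models import MultimodalAzureOpenAIMLLMModel
from deepeval.test_case import MLLMImage

model = MultimodalAzureOpenAIMLLMModel(
    deployment_name="my-gpt-4o-deployment",
    model_name="gpt-4o",
    azure_openai_api_key="<AZURE_OPENAI_API_KEY>",
    openai_api_version="2024-06-01",
    azure_endpoint="https://my-resource.openai.azure.com/",
)

output, cost = model.generate(
    ["Describe this image.", MLLMImage(url="https://example.com/cat.png")]
)
print(output, cost)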
@@ -41,6 +41,15 @@ class StylingConfig:
  expected_output_format: Optional[str] = None


+ @dataclass
+ class ConversationalStylingConfig:
+ scenario_context: Optional[str] = None
+ conversational_task: Optional[str] = None
+ participant_roles: Optional[str] = None
+ scenario_format: Optional[str] = None
+ expected_outcome_format: Optional[str] = None
+
+
  @dataclass
  class ContextConstructionConfig:
  embedder: Optional[Union[str, DeepEvalBaseEmbeddingModel]] = None
@@ -58,3 +58,26 @@ class PromptStyling(BaseModel):
  scenario: str
  task: str
  input_format: str
+
+
+ class ConversationalScenario(BaseModel):
+ scenario: str
+
+
+ class ConversationalScenarioList(BaseModel):
+ data: List[ConversationalScenario]
+
+
+ class RewrittenScenario(BaseModel):
+ rewritten_scenario: str
+
+
+ class ScenarioFeedback(BaseModel):
+ score: float
+ feedback: str
+
+
+ class ConversationalPromptStyling(BaseModel):
+ scenario_context: str
+ conversational_task: str
+ participant_roles: str