deepeval 3.7.4__py3-none-any.whl → 3.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155)
  1. deepeval/_version.py +1 -1
  2. deepeval/dataset/golden.py +54 -2
  3. deepeval/evaluate/evaluate.py +16 -8
  4. deepeval/evaluate/execute.py +70 -26
  5. deepeval/evaluate/utils.py +26 -22
  6. deepeval/integrations/pydantic_ai/agent.py +19 -2
  7. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  8. deepeval/metrics/__init__.py +14 -12
  9. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  10. deepeval/metrics/answer_relevancy/template.py +188 -92
  11. deepeval/metrics/base_metric.py +2 -5
  12. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  13. deepeval/metrics/contextual_precision/template.py +115 -66
  14. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  15. deepeval/metrics/contextual_recall/template.py +106 -55
  16. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  17. deepeval/metrics/contextual_relevancy/template.py +87 -58
  18. deepeval/metrics/dag/templates.py +2 -2
  19. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  20. deepeval/metrics/faithfulness/schema.py +1 -1
  21. deepeval/metrics/faithfulness/template.py +200 -115
  22. deepeval/metrics/g_eval/utils.py +2 -2
  23. deepeval/metrics/indicator.py +4 -4
  24. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  25. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  26. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  27. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  28. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  29. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  30. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  31. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  32. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  33. deepeval/metrics/ragas.py +3 -3
  34. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  35. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  36. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  37. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  38. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  39. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  40. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  41. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  42. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  43. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  44. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  45. deepeval/metrics/turn_faithfulness/template.py +218 -0
  46. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  47. deepeval/metrics/utils.py +39 -58
  48. deepeval/models/__init__.py +0 -12
  49. deepeval/models/base_model.py +16 -38
  50. deepeval/models/embedding_models/__init__.py +7 -0
  51. deepeval/models/embedding_models/azure_embedding_model.py +52 -28
  52. deepeval/models/embedding_models/local_embedding_model.py +18 -14
  53. deepeval/models/embedding_models/ollama_embedding_model.py +38 -16
  54. deepeval/models/embedding_models/openai_embedding_model.py +40 -21
  55. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  56. deepeval/models/llms/anthropic_model.py +44 -23
  57. deepeval/models/llms/azure_model.py +121 -36
  58. deepeval/models/llms/deepseek_model.py +18 -13
  59. deepeval/models/llms/gemini_model.py +129 -43
  60. deepeval/models/llms/grok_model.py +18 -13
  61. deepeval/models/llms/kimi_model.py +18 -13
  62. deepeval/models/llms/litellm_model.py +42 -22
  63. deepeval/models/llms/local_model.py +12 -7
  64. deepeval/models/llms/ollama_model.py +114 -12
  65. deepeval/models/llms/openai_model.py +137 -41
  66. deepeval/models/llms/portkey_model.py +24 -7
  67. deepeval/models/llms/utils.py +5 -3
  68. deepeval/models/retry_policy.py +17 -14
  69. deepeval/models/utils.py +46 -1
  70. deepeval/optimizer/__init__.py +5 -0
  71. deepeval/optimizer/algorithms/__init__.py +6 -0
  72. deepeval/optimizer/algorithms/base.py +29 -0
  73. deepeval/optimizer/algorithms/configs.py +18 -0
  74. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  75. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  76. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  77. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  78. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  79. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  80. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  81. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  82. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  83. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  84. deepeval/{optimization → optimizer}/configs.py +5 -8
  85. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  86. deepeval/optimizer/prompt_optimizer.py +263 -0
  87. deepeval/optimizer/rewriter/__init__.py +5 -0
  88. deepeval/optimizer/rewriter/rewriter.py +124 -0
  89. deepeval/optimizer/rewriter/utils.py +214 -0
  90. deepeval/optimizer/scorer/__init__.py +5 -0
  91. deepeval/optimizer/scorer/base.py +86 -0
  92. deepeval/optimizer/scorer/scorer.py +316 -0
  93. deepeval/optimizer/scorer/utils.py +30 -0
  94. deepeval/optimizer/types.py +148 -0
  95. deepeval/{optimization → optimizer}/utils.py +47 -165
  96. deepeval/prompt/prompt.py +5 -9
  97. deepeval/test_case/__init__.py +1 -3
  98. deepeval/test_case/api.py +12 -10
  99. deepeval/test_case/conversational_test_case.py +19 -1
  100. deepeval/test_case/llm_test_case.py +152 -1
  101. deepeval/test_case/utils.py +4 -8
  102. deepeval/test_run/api.py +15 -14
  103. deepeval/test_run/test_run.py +3 -3
  104. deepeval/tracing/patchers.py +9 -4
  105. deepeval/tracing/tracing.py +2 -2
  106. deepeval/utils.py +65 -0
  107. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  108. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/RECORD +116 -125
  109. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  110. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  111. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  112. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  113. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  114. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  115. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  116. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  117. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  118. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  119. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  120. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  121. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  122. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  123. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  124. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  125. deepeval/models/mlllms/__init__.py +0 -4
  126. deepeval/models/mlllms/azure_model.py +0 -343
  127. deepeval/models/mlllms/gemini_model.py +0 -313
  128. deepeval/models/mlllms/ollama_model.py +0 -175
  129. deepeval/models/mlllms/openai_model.py +0 -309
  130. deepeval/optimization/__init__.py +0 -13
  131. deepeval/optimization/adapters/__init__.py +0 -2
  132. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  133. deepeval/optimization/aggregates.py +0 -14
  134. deepeval/optimization/copro/configs.py +0 -31
  135. deepeval/optimization/gepa/__init__.py +0 -7
  136. deepeval/optimization/gepa/configs.py +0 -115
  137. deepeval/optimization/miprov2/configs.py +0 -134
  138. deepeval/optimization/miprov2/loop.py +0 -785
  139. deepeval/optimization/mutations/__init__.py +0 -0
  140. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  141. deepeval/optimization/policies/__init__.py +0 -16
  142. deepeval/optimization/policies/tie_breaker.py +0 -67
  143. deepeval/optimization/prompt_optimizer.py +0 -462
  144. deepeval/optimization/simba/__init__.py +0 -0
  145. deepeval/optimization/simba/configs.py +0 -33
  146. deepeval/optimization/types.py +0 -361
  147. deepeval/test_case/mllm_test_case.py +0 -170
  148. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  149. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  150. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  152. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  153. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  154. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  155. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/models/llms/openai_model.py CHANGED
@@ -1,17 +1,23 @@
+ import base64
  from openai.types.chat.chat_completion import ChatCompletion
- from typing import Optional, Tuple, Union, Dict
+ from typing import Optional, Tuple, Union, Dict, List
+ from deepeval.test_case import MLLMImage
  from pydantic import BaseModel, SecretStr
-
+ from io import BytesIO
  from openai import (
      OpenAI,
      AsyncOpenAI,
  )
-
+ from deepeval.utils import check_if_multimodal, convert_to_multi_modal_array
  from deepeval.config.settings import get_settings
  from deepeval.constants import ProviderSlug as PS
  from deepeval.models import DeepEvalBaseLLM
  from deepeval.models.llms.utils import trim_and_load_json
- from deepeval.models.utils import parse_model_name, require_secret_api_key
+ from deepeval.models.utils import (
+     parse_model_name,
+     require_secret_api_key,
+     normalize_kwargs_and_extract_aliases,
+ )
  from deepeval.models.retry_policy import (
      create_retry_decorator,
      sdk_retries_for,
@@ -20,6 +26,7 @@ from deepeval.models.retry_policy import (

  retry_openai = create_retry_decorator(PS.OPENAI)

+
  valid_gpt_models = [
      "gpt-3.5-turbo",
      "gpt-3.5-turbo-0125",
@@ -82,6 +89,15 @@ unsupported_log_probs_gpt_models = [
      "gpt-5-chat-latest",
  ]

+ unsupported_log_probs_multimodal_gpt_models = [
+     "o1",
+     "o1-preview",
+     "o1-2024-12-17",
+     "o1-preview-2024-09-12",
+     "gpt-4.5-preview-2025-02-27",
+     "o4-mini",
+ ]
+
  structured_outputs_models = [
      "gpt-4o",
      "gpt-4o-2024-05-13",
@@ -214,20 +230,42 @@ def _request_timeout_seconds() -> float:
      return timeout if timeout > 0 else 30.0


+ _ALIAS_MAP = {
+     "api_key": ["_openai_api_key"],
+ }
+
+
  class GPTModel(DeepEvalBaseLLM):
+     valid_multimodal_models = [
+         "gpt-4o",
+         "gpt-4o-mini",
+         "gpt-4.1",
+         "gpt-4.1-mini",
+         "gpt-5",
+     ]
+
      def __init__(
          self,
          model: Optional[str] = None,
-         _openai_api_key: Optional[str] = None,
+         api_key: Optional[str] = None,
          base_url: Optional[str] = None,
+         temperature: float = 0,
          cost_per_input_token: Optional[float] = None,
          cost_per_output_token: Optional[float] = None,
-         temperature: float = 0,
          generation_kwargs: Optional[Dict] = None,
          **kwargs,
      ):
+         normalized_kwargs, alias_values = normalize_kwargs_and_extract_aliases(
+             "GPTModel",
+             kwargs,
+             _ALIAS_MAP,
+         )
+
+         # re-map depricated keywords to re-named positional args
+         if api_key is None and "api_key" in alias_values:
+             api_key = alias_values["api_key"]
+
          settings = get_settings()
-         model_name = None
          model = model or settings.OPENAI_MODEL_NAME
          cost_per_input_token = (
              cost_per_input_token
@@ -240,51 +278,50 @@ class GPTModel(DeepEvalBaseLLM):
              else settings.OPENAI_COST_PER_OUTPUT_TOKEN
          )

+         if model is None:
+             model = default_gpt_model
+
          if isinstance(model, str):
-             model_name = parse_model_name(model)
-             if model_name not in valid_gpt_models:
+             model = parse_model_name(model)
+             if model not in valid_gpt_models:
                  raise ValueError(
                      f"Invalid model. Available GPT models: {', '.join(model for model in valid_gpt_models)}"
                  )
-         elif model is None:
-             model_name = default_gpt_model

-         if model_name not in model_pricing:
+         if model not in model_pricing:
              if cost_per_input_token is None or cost_per_output_token is None:
                  raise ValueError(
-                     f"No pricing available for `{model_name}`. "
+                     f"No pricing available for `{model}`. "
                      "Please provide both `cost_per_input_token` and `cost_per_output_token` when initializing `GPTModel`, "
                      "or set them via the CLI:\n"
                      " deepeval set-openai --model=[...] --cost_per_input_token=[...] --cost_per_output_token=[...]"
                  )
              else:
-                 model_pricing[model_name] = {
+                 model_pricing[model] = {
                      "input": float(cost_per_input_token),
                      "output": float(cost_per_output_token),
                  }

-         elif model is None:
-             model_name = default_gpt_model
-
-         if _openai_api_key is not None:
+         if api_key is not None:
              # keep it secret, keep it safe from serializings, logging and alike
-             self._openai_api_key: SecretStr | None = SecretStr(_openai_api_key)
+             self.api_key: SecretStr | None = SecretStr(api_key)
          else:
-             self._openai_api_key = get_settings().OPENAI_API_KEY
+             self.api_key = get_settings().OPENAI_API_KEY

          self.base_url = base_url
          # args and kwargs will be passed to the underlying model, in load_model function

          # Auto-adjust temperature for models that require it
-         if model_name in models_requiring_temperature_1:
+         if model in models_requiring_temperature_1:
              temperature = 1

          if temperature < 0:
              raise ValueError("Temperature must be >= 0.")
          self.temperature = temperature
-         self.kwargs = kwargs
+         # Keep sanitized kwargs for client call to strip legacy keys
+         self.kwargs = normalized_kwargs
          self.generation_kwargs = generation_kwargs or {}
-         super().__init__(model_name)
+         super().__init__(model)

      ###############################################
      # Generate functions
@@ -295,10 +332,15 @@
          self, prompt: str, schema: Optional[BaseModel] = None
      ) -> Tuple[Union[str, Dict], float]:
          client = self.load_model(async_mode=False)
+
+         if check_if_multimodal(prompt):
+             prompt = convert_to_multi_modal_array(input=prompt)
+             prompt = self.generate_prompt(prompt)
+
          if schema:
-             if self.model_name in structured_outputs_models:
+             if self.name in structured_outputs_models:
                  completion = client.beta.chat.completions.parse(
-                     model=self.model_name,
+                     model=self.name,
                      messages=[
                          {"role": "user", "content": prompt},
                      ],
@@ -314,9 +356,9 @@
                      completion.usage.completion_tokens,
                  )
                  return structured_output, cost
-             if self.model_name in json_mode_models:
+             if self.name in json_mode_models:
                  completion = client.beta.chat.completions.parse(
-                     model=self.model_name,
+                     model=self.name,
                      messages=[
                          {"role": "user", "content": prompt},
                      ],
@@ -334,7 +376,7 @@
              return schema.model_validate(json_output), cost

          completion = client.chat.completions.create(
-             model=self.model_name,
+             model=self.name,
              messages=[{"role": "user", "content": prompt}],
              temperature=self.temperature,
              **self.generation_kwargs,
@@ -354,10 +396,15 @@
          self, prompt: str, schema: Optional[BaseModel] = None
      ) -> Tuple[Union[str, BaseModel], float]:
          client = self.load_model(async_mode=True)
+
+         if check_if_multimodal(prompt):
+             prompt = convert_to_multi_modal_array(input=prompt)
+             prompt = self.generate_prompt(prompt)
+
          if schema:
-             if self.model_name in structured_outputs_models:
+             if self.name in structured_outputs_models:
                  completion = await client.beta.chat.completions.parse(
-                     model=self.model_name,
+                     model=self.name,
                      messages=[
                          {"role": "user", "content": prompt},
                      ],
@@ -373,9 +420,9 @@
                      completion.usage.completion_tokens,
                  )
                  return structured_output, cost
-             if self.model_name in json_mode_models:
+             if self.name in json_mode_models:
                  completion = await client.beta.chat.completions.parse(
-                     model=self.model_name,
+                     model=self.name,
                      messages=[
                          {"role": "user", "content": prompt},
                      ],
@@ -393,7 +440,7 @@
              return schema.model_validate(json_output), cost

          completion = await client.chat.completions.create(
-             model=self.model_name,
+             model=self.name,
              messages=[{"role": "user", "content": prompt}],
              temperature=self.temperature,
              **self.generation_kwargs,
@@ -420,8 +467,11 @@
      ) -> Tuple[ChatCompletion, float]:
          # Generate completion
          client = self.load_model(async_mode=False)
+         if check_if_multimodal(prompt):
+             prompt = convert_to_multi_modal_array(input=prompt)
+             prompt = self.generate_prompt(prompt)
          completion = client.chat.completions.create(
-             model=self.model_name,
+             model=self.name,
              messages=[{"role": "user", "content": prompt}],
              temperature=self.temperature,
              logprobs=True,
@@ -443,8 +493,11 @@
      ) -> Tuple[ChatCompletion, float]:
          # Generate completion
          client = self.load_model(async_mode=True)
+         if check_if_multimodal(prompt):
+             prompt = convert_to_multi_modal_array(input=prompt)
+             prompt = self.generate_prompt(prompt)
          completion = await client.chat.completions.create(
-             model=self.model_name,
+             model=self.name,
              messages=[{"role": "user", "content": prompt}],
              temperature=self.temperature,
              logprobs=True,
@@ -463,8 +516,11 @@
          self, prompt: str, n: int, temperature: float
      ) -> Tuple[list[str], float]:
          client = self.load_model(async_mode=False)
+         if check_if_multimodal(prompt):
+             prompt = convert_to_multi_modal_array(input=prompt)
+             prompt = self.generate_prompt(prompt)
          response = client.chat.completions.create(
-             model=self.model_name,
+             model=self.name,
              messages=[{"role": "user", "content": prompt}],
              n=n,
              temperature=temperature,
@@ -479,7 +535,7 @@

      def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
          # TODO: consider loggin a warning instead of defaulting to whole model pricing
-         pricing = model_pricing.get(self.model_name, model_pricing)
+         pricing = model_pricing.get(self.name, model_pricing)
          input_cost = input_tokens * pricing["input"]
          output_cost = output_tokens * pricing["output"]
          return input_cost + output_cost
@@ -488,8 +544,40 @@
      # Model #
      #########

-     def get_model_name(self):
-         return self.model_name
+     def generate_prompt(
+         self, multimodal_input: List[Union[str, MLLMImage]] = []
+     ):
+         prompt = []
+         for ele in multimodal_input:
+             if isinstance(ele, str):
+                 prompt.append({"type": "text", "text": ele})
+             elif isinstance(ele, MLLMImage):
+                 if ele.local:
+                     import PIL.Image
+
+                     image = PIL.Image.open(ele.url)
+                     visual_dict = {
+                         "type": "image_url",
+                         "image_url": {
+                             "url": f"data:image/jpeg;base64,{self.encode_pil_image(image)}"
+                         },
+                     }
+                 else:
+                     visual_dict = {
+                         "type": "image_url",
+                         "image_url": {"url": ele.url},
+                     }
+                 prompt.append(visual_dict)
+         return prompt
+
+     def encode_pil_image(self, pil_image):
+         image_buffer = BytesIO()
+         if pil_image.mode in ("RGBA", "LA", "P"):
+             pil_image = pil_image.convert("RGB")
+         pil_image.save(image_buffer, format="JPEG")
+         image_bytes = image_buffer.getvalue()
+         base64_encoded_image = base64.b64encode(image_bytes).decode("utf-8")
+         return base64_encoded_image

      def load_model(self, async_mode: bool = False):
          if not async_mode:
@@ -512,10 +600,10 @@

      def _build_client(self, cls):
          api_key = require_secret_api_key(
-             self._openai_api_key,
+             self.api_key,
              provider_label="OpenAI",
              env_var_name="OPENAI_API_KEY",
-             param_hint="`_openai_api_key` to GPTModel(...)",
+             param_hint="`api_key` to GPTModel(...)",
          )

          kw = dict(
@@ -531,3 +619,11 @@
              kw.pop("max_retries", None)
              return cls(**kw)
          raise
+
+     def supports_multimodal(self):
+         if self.name in GPTModel.valid_multimodal_models:
+             return True
+         return False
+
+     def get_model_name(self):
+         return f"{self.name}"
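
Note: the multimodal hooks above route any prompt that check_if_multimodal flags through generate_prompt, which flattens a mixed list of strings and MLLMImage objects into OpenAI-style content parts. A minimal sketch of the resulting shape, assuming a hypothetical image URL and that MLLMImage accepts url and local keyword arguments:

    # Hedged sketch of what GPTModel.generate_prompt produces; the URL is a
    # placeholder and the MLLMImage constructor signature is assumed.
    from deepeval.test_case import MLLMImage

    multimodal_input = [
        "Describe the chart in this image.",
        MLLMImage(url="https://example.com/chart.png", local=False),
    ]

    # generate_prompt(multimodal_input) returns OpenAI chat content parts:
    # [
    #     {"type": "text", "text": "Describe the chart in this image."},
    #     {"type": "image_url", "image_url": {"url": "https://example.com/chart.png"}},
    # ]
    # For local images (local=True), the file is opened with PIL, converted to
    # JPEG, and inlined as a base64 data URL via encode_pil_image.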
deepeval/models/llms/portkey_model.py CHANGED
@@ -4,11 +4,18 @@ from typing import Any, Dict, List, Optional, Union
  from pydantic import AnyUrl, SecretStr

  from deepeval.config.settings import get_settings
- from deepeval.models.utils import require_secret_api_key
+ from deepeval.models.utils import (
+     require_secret_api_key,
+ )
  from deepeval.models import DeepEvalBaseLLM
  from deepeval.utils import require_param


+ def _request_timeout_seconds() -> float:
+     timeout = float(get_settings().DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0)
+     return timeout if timeout > 0 else 30.0
+
+
  class PortkeyModel(DeepEvalBaseLLM):
      def __init__(
          self,
@@ -16,11 +23,13 @@ class PortkeyModel(DeepEvalBaseLLM):
          api_key: Optional[str] = None,
          base_url: Optional[AnyUrl] = None,
          provider: Optional[str] = None,
+         generation_kwargs: Optional[Dict] = None,
+         **kwargs,
      ):
          settings = get_settings()
          model = model or settings.PORTKEY_MODEL_NAME

-         self.model = require_param(
+         self.name = require_param(
              model,
              provider_label="Portkey",
              env_var_name="PORTKEY_MODEL_NAME",
@@ -52,6 +61,9 @@
              env_var_name="PORTKEY_PROVIDER_NAME",
              param_hint="provider",
          )
+         # Keep sanitized kwargs for client call to strip legacy keys
+         self.kwargs = kwargs
+         self.generation_kwargs = generation_kwargs or {}

      def _headers(self) -> Dict[str, str]:
          api_key = require_secret_api_key(
@@ -70,10 +82,13 @@
          return headers

      def _payload(self, prompt: str) -> Dict[str, Any]:
-         return {
-             "model": self.model,
+         payload = {
+             "model": self.name,
              "messages": [{"role": "user", "content": prompt}],
          }
+         if self.generation_kwargs:
+             payload.update(self.generation_kwargs)
+         return payload

      def _extract_content(self, data: Dict[str, Any]) -> str:
          choices: Union[List[Dict[str, Any]], None] = data.get("choices")
@@ -88,6 +103,7 @@
          return ""

      def generate(self, prompt: str) -> str:
+
          try:
              response = requests.post(
                  f"{self.base_url}/chat/completions",
@@ -110,6 +126,7 @@
          return self._extract_content(response.json())

      async def a_generate(self, prompt: str) -> str:
+
          async with aiohttp.ClientSession() as session:
              async with session.post(
                  f"{self.base_url}/chat/completions",
@@ -125,8 +142,8 @@
              data = await response.json()
              return self._extract_content(data)

-     def get_model_name(self) -> str:
-         return f"Portkey ({self.model})"
-
      def load_model(self):
          return None
+
+     def get_model_name(self):
+         return f"{self.name} (Portkey)"
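
Note: PortkeyModel now threads generation_kwargs into every /chat/completions request via _payload. A minimal usage sketch, with placeholder model, provider, and key values:

    # Hedged sketch: all values below are placeholders, not real credentials.
    from deepeval.models.llms.portkey_model import PortkeyModel

    model = PortkeyModel(
        model="gpt-4o",
        api_key="<PORTKEY_API_KEY>",
        provider="openai",
        generation_kwargs={"temperature": 0.2, "max_tokens": 512},
    )
    # model._payload("Hello") merges generation_kwargs into the request body:
    # {"model": "gpt-4o", "messages": [{"role": "user", "content": "Hello"}],
    #  "temperature": 0.2, "max_tokens": 512}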
deepeval/models/llms/utils.py CHANGED
@@ -1,8 +1,10 @@
- from typing import Dict
+ from typing import Dict, List, Optional
  import re
  import json
  import asyncio

+ MULTIMODAL_MODELS = ["GPTModel", "AzureModel", "GeminiModel", "OllamaModel"]
+

  def trim_and_load_json(
      input_string: str,
@@ -38,7 +40,7 @@ def safe_asyncio_run(coro):
                  return loop.run_until_complete(future)
              else:
                  return loop.run_until_complete(coro)
-         except Exception as inner_e:
+         except Exception:
              raise
-     except Exception as e:
+     except Exception:
          raise

deepeval/models/retry_policy.py CHANGED
@@ -55,6 +55,7 @@ from tenacity.stop import stop_base
  from tenacity.wait import wait_base
  from contextvars import ContextVar, copy_context

+ from deepeval.utils import require_dependency
  from deepeval.constants import (
      ProviderSlug as PS,
      slugify,
@@ -829,25 +830,23 @@ try:
  except Exception:  # botocore not present (aiobotocore optional)
      BEDROCK_ERROR_POLICY = None

-
  ####################
  # Anthropic Policy #
  ####################

  try:
-     from anthropic import (
-         AuthenticationError,
-         RateLimitError,
-         APIConnectionError,
-         APITimeoutError,
-         APIStatusError,
+
+     module = require_dependency(
+         "anthropic",
+         provider_label="retry_policy",
+         install_hint="Install it with `pip install anthropic`.",
      )

      ANTHROPIC_ERROR_POLICY = ErrorPolicy(
-         auth_excs=(AuthenticationError,),
-         rate_limit_excs=(RateLimitError,),
-         network_excs=(APIConnectionError, APITimeoutError),
-         http_excs=(APIStatusError,),
+         auth_excs=(module.AuthenticationError,),
+         rate_limit_excs=(module.RateLimitError,),
+         network_excs=(module.APIConnectionError, module.APITimeoutError),
+         http_excs=(module.APIStatusError,),
          non_retryable_codes=frozenset(),  # update if we learn of hard quota codes
          message_markers={},
      )
@@ -868,7 +867,11 @@ except Exception: # Anthropic optional
  # and gate retries using message markers (code sniffing).
  # See: https://github.com/googleapis/python-genai?tab=readme-ov-file#error-handling
  try:
-     from google.genai import errors as gerrors
+     module = require_dependency(
+         "google.genai",
+         provider_label="retry_policy",
+         install_hint="Install it with `pip install google-genai`.",
+     )

      _HTTPX_NET_EXCS = _httpx_net_excs()
      _REQUESTS_EXCS = _requests_net_excs()
@@ -887,9 +890,9 @@ try:
      GOOGLE_ERROR_POLICY = ErrorPolicy(
          auth_excs=(),  # we will classify 401/403 via markers below (see non-retryable codes)
          rate_limit_excs=(
-             gerrors.ClientError,
+             module.gerrors.ClientError,
          ),  # includes 429; markers decide retry vs not
-         network_excs=(gerrors.ServerError,)
+         network_excs=(module.gerrors.ServerError,)
          + _HTTPX_NET_EXCS
          + _REQUESTS_EXCS,  # treat 5xx as transient
          http_excs=(),  # no reliable .status_code on exceptions; handled above
deepeval/models/utils.py CHANGED
@@ -1,9 +1,13 @@
- from typing import Optional
+ import logging
+ from typing import Any, Dict, Optional, Tuple
  from pydantic import SecretStr

  from deepeval.errors import DeepEvalError


+ logger = logging.getLogger(__name__)
+
+
  def parse_model_name(model_name: Optional[str] = None) -> str:
      """Extract base model name from provider-prefixed format.

@@ -74,3 +78,44 @@ def require_secret_api_key(
          )

      return api_key
+
+
+ def normalize_kwargs_and_extract_aliases(
+     provider_label: str,
+     kwargs: Dict[str, Any],
+     alias_map: Dict[str, list],
+ ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+     """
+     Normalize legacy keyword argument names according to alias_map.
+
+     alias_map is of the form: {new_name: [old_name1, old_name2, ...]}
+
+     - Returns (normalized_kwargs, extracted_values)
+       where:
+         - normalized_kwargs has all legacy keys removed (to prevent forwarding
+           to downstream SDK clients).
+         - extracted_values maps new_name -> value for any alias that was used.
+
+     - Logs a warning for each legacy keyword used, so callers know they should
+       migrate to the new name.
+     """
+     normalized = dict(kwargs)
+     extracted: Dict[str, Any] = {}
+
+     for new_name, old_names in alias_map.items():
+         for old_name in old_names:
+             if old_name in normalized:
+                 value = normalized.pop(old_name)
+
+                 logger.warning(
+                     "%s keyword '%s' is deprecated; please use '%s' instead.",
+                     provider_label,
+                     old_name,
+                     new_name,
+                 )
+
+                 # Only preserve the first alias value we see for a given new_name
+                 if new_name not in extracted:
+                     extracted[new_name] = value
+
+     return normalized, extracted
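
Note: this helper backs the _openai_api_key → api_key rename in GPTModel above. A short sketch of its behavior, reusing the _ALIAS_MAP shape from openai_model.py (the key value is a placeholder):

    from deepeval.models.utils import normalize_kwargs_and_extract_aliases

    alias_map = {"api_key": ["_openai_api_key"]}  # same shape as _ALIAS_MAP above
    kwargs = {"_openai_api_key": "<key>", "max_retries": 2}

    normalized, extracted = normalize_kwargs_and_extract_aliases(
        "GPTModel", kwargs, alias_map
    )
    # normalized == {"max_retries": 2}   # legacy key stripped before reaching the SDK
    # extracted == {"api_key": "<key>"}  # value remapped onto the new parameter name
    # A deprecation warning is logged for each legacy keyword encountered.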
deepeval/optimizer/__init__.py ADDED
@@ -0,0 +1,5 @@
+ from deepeval.optimizer.prompt_optimizer import PromptOptimizer
+
+ __all__ = [
+     "PromptOptimizer",
+ ]

deepeval/optimizer/algorithms/__init__.py ADDED
@@ -0,0 +1,6 @@
+ from .gepa import GEPA
+ from .miprov2 import MIPROV2
+ from .copro import COPRO
+ from .simba import SIMBA
+
+ __all__ = ["GEPA", "MIPROV2", "COPRO", "SIMBA"]

deepeval/optimizer/algorithms/base.py ADDED
@@ -0,0 +1,29 @@
+ from abc import ABC, abstractmethod
+ from typing import Union, List, Dict, Tuple
+
+ from deepeval.models.base_model import DeepEvalBaseLLM
+ from deepeval.optimizer.scorer.base import BaseScorer
+ from deepeval.prompt.prompt import Prompt
+ from deepeval.dataset.golden import Golden, ConversationalGolden
+
+
+ class BaseAlgorithm(ABC):
+     name: str
+     optimizer_model: DeepEvalBaseLLM
+     scorer: BaseScorer
+
+     @abstractmethod
+     def execute(
+         self,
+         prompt: Prompt,
+         goldens: Union[List[Golden], List[ConversationalGolden]],
+     ) -> Tuple[Prompt, Dict]:
+         raise NotImplementedError
+
+     @abstractmethod
+     async def a_execute(
+         self,
+         prompt: Prompt,
+         goldens: Union[List[Golden], List[ConversationalGolden]],
+     ) -> Tuple[Prompt, Dict]:
+         raise NotImplementedError
1
+ # Internal GEPA constants - not exposed to users
2
+ GEPA_MIN_DELTA: float = 0.0
3
+ GEPA_TIE_TOLERANCE: float = 1e-9
4
+ GEPA_REWRITE_INSTRUCTION_MAX_CHARS: int = 4096
5
+
6
+ # Internal MIPROV2 constants - not exposed to users
7
+ MIPROV2_MIN_DELTA: float = 0.0
8
+ MIPROV2_REWRITE_INSTRUCTION_MAX_CHARS: int = 4096
9
+ MIPROV2_DEFAULT_NUM_CANDIDATES: int = 10
10
+ MIPROV2_DEFAULT_NUM_TRIALS: int = 20
11
+ MIPROV2_DEFAULT_MINIBATCH_SIZE: int = 25
12
+ MIPROV2_DEFAULT_MINIBATCH_FULL_EVAL_STEPS: int = 10
13
+ MIPROV2_DEFAULT_MAX_BOOTSTRAPPED_DEMOS: int = 4
14
+ MIPROV2_DEFAULT_MAX_LABELED_DEMOS: int = 4
15
+ MIPROV2_DEFAULT_NUM_DEMO_SETS: int = 5
16
+
17
+ # Internal SIMBA constants - not exposed to users
18
+ SIMBA_DEMO_INPUT_MAX_CHARS: int = 256
@@ -0,0 +1,5 @@
1
+ from .copro import COPRO
2
+
3
+ __all__ = [
4
+ "COPRO",
5
+ ]