deepeval 3.7.4__py3-none-any.whl → 3.7.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/dataset/golden.py +54 -2
- deepeval/evaluate/evaluate.py +16 -8
- deepeval/evaluate/execute.py +70 -26
- deepeval/evaluate/utils.py +26 -22
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/metrics/__init__.py +14 -12
- deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
- deepeval/metrics/answer_relevancy/template.py +188 -92
- deepeval/metrics/base_metric.py +2 -5
- deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/faithfulness/faithfulness.py +70 -27
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/utils.py +2 -2
- deepeval/metrics/indicator.py +4 -4
- deepeval/metrics/multimodal_metrics/__init__.py +0 -18
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
- deepeval/metrics/utils.py +39 -58
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +16 -38
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +52 -28
- deepeval/models/embedding_models/local_embedding_model.py +18 -14
- deepeval/models/embedding_models/ollama_embedding_model.py +38 -16
- deepeval/models/embedding_models/openai_embedding_model.py +40 -21
- deepeval/models/llms/amazon_bedrock_model.py +1 -2
- deepeval/models/llms/anthropic_model.py +44 -23
- deepeval/models/llms/azure_model.py +121 -36
- deepeval/models/llms/deepseek_model.py +18 -13
- deepeval/models/llms/gemini_model.py +129 -43
- deepeval/models/llms/grok_model.py +18 -13
- deepeval/models/llms/kimi_model.py +18 -13
- deepeval/models/llms/litellm_model.py +42 -22
- deepeval/models/llms/local_model.py +12 -7
- deepeval/models/llms/ollama_model.py +114 -12
- deepeval/models/llms/openai_model.py +137 -41
- deepeval/models/llms/portkey_model.py +24 -7
- deepeval/models/llms/utils.py +5 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +46 -1
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +12 -10
- deepeval/test_case/conversational_test_case.py +19 -1
- deepeval/test_case/llm_test_case.py +152 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +15 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/RECORD +116 -125
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/models/llms/openai_model.py
CHANGED

@@ -1,17 +1,23 @@
+import base64
 from openai.types.chat.chat_completion import ChatCompletion
-from typing import Optional, Tuple, Union, Dict
+from typing import Optional, Tuple, Union, Dict, List
+from deepeval.test_case import MLLMImage
 from pydantic import BaseModel, SecretStr
-
+from io import BytesIO
 from openai import (
     OpenAI,
     AsyncOpenAI,
 )
-
+from deepeval.utils import check_if_multimodal, convert_to_multi_modal_array
 from deepeval.config.settings import get_settings
 from deepeval.constants import ProviderSlug as PS
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.models.llms.utils import trim_and_load_json
-from deepeval.models.utils import …
+from deepeval.models.utils import (
+    parse_model_name,
+    require_secret_api_key,
+    normalize_kwargs_and_extract_aliases,
+)
 from deepeval.models.retry_policy import (
     create_retry_decorator,
     sdk_retries_for,
@@ -20,6 +26,7 @@ from deepeval.models.retry_policy import (
 
 retry_openai = create_retry_decorator(PS.OPENAI)
 
+
 valid_gpt_models = [
     "gpt-3.5-turbo",
     "gpt-3.5-turbo-0125",
@@ -82,6 +89,15 @@ unsupported_log_probs_gpt_models = [
     "gpt-5-chat-latest",
 ]
 
+unsupported_log_probs_multimodal_gpt_models = [
+    "o1",
+    "o1-preview",
+    "o1-2024-12-17",
+    "o1-preview-2024-09-12",
+    "gpt-4.5-preview-2025-02-27",
+    "o4-mini",
+]
+
 structured_outputs_models = [
     "gpt-4o",
     "gpt-4o-2024-05-13",
@@ -214,20 +230,42 @@ def _request_timeout_seconds() -> float:
     return timeout if timeout > 0 else 30.0
 
 
+_ALIAS_MAP = {
+    "api_key": ["_openai_api_key"],
+}
+
+
 class GPTModel(DeepEvalBaseLLM):
+    valid_multimodal_models = [
+        "gpt-4o",
+        "gpt-4o-mini",
+        "gpt-4.1",
+        "gpt-4.1-mini",
+        "gpt-5",
+    ]
+
     def __init__(
         self,
         model: Optional[str] = None,
-        _openai_api_key: Optional[str] = None,
+        api_key: Optional[str] = None,
         base_url: Optional[str] = None,
+        temperature: float = 0,
         cost_per_input_token: Optional[float] = None,
         cost_per_output_token: Optional[float] = None,
-        temperature: float = 0,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
     ):
+        normalized_kwargs, alias_values = normalize_kwargs_and_extract_aliases(
+            "GPTModel",
+            kwargs,
+            _ALIAS_MAP,
+        )
+
+        # re-map depricated keywords to re-named positional args
+        if api_key is None and "api_key" in alias_values:
+            api_key = alias_values["api_key"]
+
         settings = get_settings()
-        model_name = None
         model = model or settings.OPENAI_MODEL_NAME
         cost_per_input_token = (
             cost_per_input_token
@@ -240,51 +278,50 @@ class GPTModel(DeepEvalBaseLLM):
             else settings.OPENAI_COST_PER_OUTPUT_TOKEN
         )
 
+        if model is None:
+            model = default_gpt_model
+
         if isinstance(model, str):
-
-            if …
+            model = parse_model_name(model)
+            if model not in valid_gpt_models:
                 raise ValueError(
                     f"Invalid model. Available GPT models: {', '.join(model for model in valid_gpt_models)}"
                 )
-        elif model is None:
-            model_name = default_gpt_model
 
-        if …
+        if model not in model_pricing:
             if cost_per_input_token is None or cost_per_output_token is None:
                 raise ValueError(
-                    f"No pricing available for `{…
+                    f"No pricing available for `{model}`. "
                     "Please provide both `cost_per_input_token` and `cost_per_output_token` when initializing `GPTModel`, "
                     "or set them via the CLI:\n"
                     "    deepeval set-openai --model=[...] --cost_per_input_token=[...] --cost_per_output_token=[...]"
                 )
             else:
-                model_pricing[…
+                model_pricing[model] = {
                     "input": float(cost_per_input_token),
                     "output": float(cost_per_output_token),
                 }
 
-
-            model_name = default_gpt_model
-
-        if _openai_api_key is not None:
+        if api_key is not None:
             # keep it secret, keep it safe from serializings, logging and alike
-            self.…
+            self.api_key: SecretStr | None = SecretStr(api_key)
         else:
-            self.…
+            self.api_key = get_settings().OPENAI_API_KEY
 
         self.base_url = base_url
         # args and kwargs will be passed to the underlying model, in load_model function
 
         # Auto-adjust temperature for models that require it
-        if …
+        if model in models_requiring_temperature_1:
             temperature = 1
 
         if temperature < 0:
             raise ValueError("Temperature must be >= 0.")
         self.temperature = temperature
-
+        # Keep sanitized kwargs for client call to strip legacy keys
+        self.kwargs = normalized_kwargs
         self.generation_kwargs = generation_kwargs or {}
-        super().__init__(…
+        super().__init__(model)
 
     ###############################################
     # Generate functions
@@ -295,10 +332,15 @@ class GPTModel(DeepEvalBaseLLM):
         self, prompt: str, schema: Optional[BaseModel] = None
     ) -> Tuple[Union[str, Dict], float]:
         client = self.load_model(async_mode=False)
+
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            prompt = self.generate_prompt(prompt)
+
         if schema:
-            if self.…
+            if self.name in structured_outputs_models:
                 completion = client.beta.chat.completions.parse(
-                    model=self.…
+                    model=self.name,
                     messages=[
                         {"role": "user", "content": prompt},
                     ],
@@ -314,9 +356,9 @@ class GPTModel(DeepEvalBaseLLM):
                 completion.usage.completion_tokens,
             )
             return structured_output, cost
-        if self.…
+        if self.name in json_mode_models:
             completion = client.beta.chat.completions.parse(
-                model=self.…
+                model=self.name,
                 messages=[
                     {"role": "user", "content": prompt},
                 ],
@@ -334,7 +376,7 @@ class GPTModel(DeepEvalBaseLLM):
             return schema.model_validate(json_output), cost
 
         completion = client.chat.completions.create(
-            model=self.…
+            model=self.name,
            messages=[{"role": "user", "content": prompt}],
             temperature=self.temperature,
             **self.generation_kwargs,
@@ -354,10 +396,15 @@ class GPTModel(DeepEvalBaseLLM):
         self, prompt: str, schema: Optional[BaseModel] = None
     ) -> Tuple[Union[str, BaseModel], float]:
         client = self.load_model(async_mode=True)
+
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            prompt = self.generate_prompt(prompt)
+
         if schema:
-            if self.…
+            if self.name in structured_outputs_models:
                 completion = await client.beta.chat.completions.parse(
-                    model=self.…
+                    model=self.name,
                     messages=[
                         {"role": "user", "content": prompt},
                     ],
@@ -373,9 +420,9 @@ class GPTModel(DeepEvalBaseLLM):
                 completion.usage.completion_tokens,
             )
             return structured_output, cost
-        if self.…
+        if self.name in json_mode_models:
             completion = await client.beta.chat.completions.parse(
-                model=self.…
+                model=self.name,
                 messages=[
                     {"role": "user", "content": prompt},
                 ],
@@ -393,7 +440,7 @@ class GPTModel(DeepEvalBaseLLM):
             return schema.model_validate(json_output), cost
 
         completion = await client.chat.completions.create(
-            model=self.…
+            model=self.name,
             messages=[{"role": "user", "content": prompt}],
             temperature=self.temperature,
             **self.generation_kwargs,
@@ -420,8 +467,11 @@ class GPTModel(DeepEvalBaseLLM):
     ) -> Tuple[ChatCompletion, float]:
         # Generate completion
         client = self.load_model(async_mode=False)
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            prompt = self.generate_prompt(prompt)
         completion = client.chat.completions.create(
-            model=self.…
+            model=self.name,
             messages=[{"role": "user", "content": prompt}],
             temperature=self.temperature,
             logprobs=True,
@@ -443,8 +493,11 @@ class GPTModel(DeepEvalBaseLLM):
     ) -> Tuple[ChatCompletion, float]:
         # Generate completion
         client = self.load_model(async_mode=True)
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            prompt = self.generate_prompt(prompt)
         completion = await client.chat.completions.create(
-            model=self.…
+            model=self.name,
             messages=[{"role": "user", "content": prompt}],
             temperature=self.temperature,
             logprobs=True,
@@ -463,8 +516,11 @@ class GPTModel(DeepEvalBaseLLM):
         self, prompt: str, n: int, temperature: float
     ) -> Tuple[list[str], float]:
         client = self.load_model(async_mode=False)
+        if check_if_multimodal(prompt):
+            prompt = convert_to_multi_modal_array(input=prompt)
+            prompt = self.generate_prompt(prompt)
         response = client.chat.completions.create(
-            model=self.…
+            model=self.name,
             messages=[{"role": "user", "content": prompt}],
             n=n,
             temperature=temperature,
@@ -479,7 +535,7 @@ class GPTModel(DeepEvalBaseLLM):
 
     def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
         # TODO: consider loggin a warning instead of defaulting to whole model pricing
-        pricing = model_pricing.get(self.…
+        pricing = model_pricing.get(self.name, model_pricing)
         input_cost = input_tokens * pricing["input"]
         output_cost = output_tokens * pricing["output"]
         return input_cost + output_cost
@@ -488,8 +544,40 @@ class GPTModel(DeepEvalBaseLLM):
     # Model #
     #########
 
-    def …
-        …
+    def generate_prompt(
+        self, multimodal_input: List[Union[str, MLLMImage]] = []
+    ):
+        prompt = []
+        for ele in multimodal_input:
+            if isinstance(ele, str):
+                prompt.append({"type": "text", "text": ele})
+            elif isinstance(ele, MLLMImage):
+                if ele.local:
+                    import PIL.Image
+
+                    image = PIL.Image.open(ele.url)
+                    visual_dict = {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/jpeg;base64,{self.encode_pil_image(image)}"
+                        },
+                    }
+                else:
+                    visual_dict = {
+                        "type": "image_url",
+                        "image_url": {"url": ele.url},
+                    }
+                prompt.append(visual_dict)
+        return prompt
+
+    def encode_pil_image(self, pil_image):
+        image_buffer = BytesIO()
+        if pil_image.mode in ("RGBA", "LA", "P"):
+            pil_image = pil_image.convert("RGB")
+        pil_image.save(image_buffer, format="JPEG")
+        image_bytes = image_buffer.getvalue()
+        base64_encoded_image = base64.b64encode(image_bytes).decode("utf-8")
+        return base64_encoded_image
 
     def load_model(self, async_mode: bool = False):
         if not async_mode:
@@ -512,10 +600,10 @@ class GPTModel(DeepEvalBaseLLM):
 
     def _build_client(self, cls):
         api_key = require_secret_api_key(
-            self.…
+            self.api_key,
             provider_label="OpenAI",
             env_var_name="OPENAI_API_KEY",
-            param_hint="`…
+            param_hint="`api_key` to GPTModel(...)",
         )
 
         kw = dict(
@@ -531,3 +619,11 @@ class GPTModel(DeepEvalBaseLLM):
                 kw.pop("max_retries", None)
                 return cls(**kw)
             raise
+
+    def supports_multimodal(self):
+        if self.name in GPTModel.valid_multimodal_models:
+            return True
+        return False
+
+    def get_model_name(self):
+        return f"{self.name}"
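For reference, a minimal usage sketch of the renamed constructor keyword and the new multimodal gate (the key value is a placeholder; behavior as described by the diff above):

from deepeval.models.llms.openai_model import GPTModel

# New in 3.7.5: `api_key` replaces the old `_openai_api_key` keyword.
model = GPTModel(model="gpt-4o", api_key="sk-...")

# The legacy keyword is still accepted via _ALIAS_MAP, but it is stripped
# from the kwargs forwarded to the OpenAI client and logs a deprecation warning.
legacy = GPTModel(model="gpt-4o", _openai_api_key="sk-...")

# Multimodal support is now gated per model name.
assert model.supports_multimodal()  # "gpt-4o" is in valid_multimodal_models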
deepeval/models/llms/portkey_model.py
CHANGED

@@ -4,11 +4,18 @@ from typing import Any, Dict, List, Optional, Union
 from pydantic import AnyUrl, SecretStr
 
 from deepeval.config.settings import get_settings
-from deepeval.models.utils import …
+from deepeval.models.utils import (
+    require_secret_api_key,
+)
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.utils import require_param
 
 
+def _request_timeout_seconds() -> float:
+    timeout = float(get_settings().DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0)
+    return timeout if timeout > 0 else 30.0
+
+
 class PortkeyModel(DeepEvalBaseLLM):
     def __init__(
         self,
@@ -16,11 +23,13 @@ class PortkeyModel(DeepEvalBaseLLM):
         api_key: Optional[str] = None,
         base_url: Optional[AnyUrl] = None,
         provider: Optional[str] = None,
+        generation_kwargs: Optional[Dict] = None,
+        **kwargs,
     ):
         settings = get_settings()
         model = model or settings.PORTKEY_MODEL_NAME
 
-        self.model = require_param(
+        self.name = require_param(
             model,
             provider_label="Portkey",
             env_var_name="PORTKEY_MODEL_NAME",
@@ -52,6 +61,9 @@ class PortkeyModel(DeepEvalBaseLLM):
             env_var_name="PORTKEY_PROVIDER_NAME",
             param_hint="provider",
         )
+        # Keep sanitized kwargs for client call to strip legacy keys
+        self.kwargs = kwargs
+        self.generation_kwargs = generation_kwargs or {}
 
     def _headers(self) -> Dict[str, str]:
         api_key = require_secret_api_key(
@@ -70,10 +82,13 @@ class PortkeyModel(DeepEvalBaseLLM):
         return headers
 
     def _payload(self, prompt: str) -> Dict[str, Any]:
-        return {
-            "model": self.model,
+        payload = {
+            "model": self.name,
             "messages": [{"role": "user", "content": prompt}],
         }
+        if self.generation_kwargs:
+            payload.update(self.generation_kwargs)
+        return payload
 
     def _extract_content(self, data: Dict[str, Any]) -> str:
         choices: Union[List[Dict[str, Any]], None] = data.get("choices")
@@ -88,6 +103,7 @@ class PortkeyModel(DeepEvalBaseLLM):
         return ""
 
     def generate(self, prompt: str) -> str:
+
         try:
             response = requests.post(
                 f"{self.base_url}/chat/completions",
@@ -110,6 +126,7 @@ class PortkeyModel(DeepEvalBaseLLM):
         return self._extract_content(response.json())
 
     async def a_generate(self, prompt: str) -> str:
+
         async with aiohttp.ClientSession() as session:
             async with session.post(
                 f"{self.base_url}/chat/completions",
@@ -125,8 +142,8 @@ class PortkeyModel(DeepEvalBaseLLM):
         data = await response.json()
         return self._extract_content(data)
 
-    def get_model_name(self) -> str:
-        return f"Portkey ({self.model})"
-
     def load_model(self):
         return None
+
+    def get_model_name(self):
+        return f"{self.name} (Portkey)"
deepeval/models/llms/utils.py
CHANGED

@@ -1,8 +1,10 @@
-from typing import Dict
+from typing import Dict, List, Optional
 import re
 import json
 import asyncio
 
+MULTIMODAL_MODELS = ["GPTModel", "AzureModel", "GeminiModel", "OllamaModel"]
+
 
 def trim_and_load_json(
     input_string: str,
@@ -38,7 +40,7 @@ def safe_asyncio_run(coro):
             return loop.run_until_complete(future)
         else:
             return loop.run_until_complete(coro)
-    except Exception…
+    except Exception:
         raise
-    except Exception…
+    except Exception:
         raise
deepeval/models/retry_policy.py
CHANGED

@@ -55,6 +55,7 @@ from tenacity.stop import stop_base
 from tenacity.wait import wait_base
 from contextvars import ContextVar, copy_context
 
+from deepeval.utils import require_dependency
 from deepeval.constants import (
     ProviderSlug as PS,
     slugify,
@@ -829,25 +830,23 @@ try:
 except Exception:  # botocore not present (aiobotocore optional)
     BEDROCK_ERROR_POLICY = None
 
-
 ####################
 # Anthropic Policy #
 ####################
 
 try:
-    from anthropic import (
-        AuthenticationError,
-        RateLimitError,
-        APIConnectionError,
-        APITimeoutError,
-        APIStatusError,
+
+    module = require_dependency(
+        "anthropic",
+        provider_label="retry_policy",
+        install_hint="Install it with `pip install anthropic`.",
     )
 
     ANTHROPIC_ERROR_POLICY = ErrorPolicy(
-        auth_excs=(AuthenticationError,),
-        rate_limit_excs=(RateLimitError,),
-        network_excs=(APIConnectionError, APITimeoutError),
-        http_excs=(APIStatusError,),
+        auth_excs=(module.AuthenticationError,),
+        rate_limit_excs=(module.RateLimitError,),
+        network_excs=(module.APIConnectionError, module.APITimeoutError),
+        http_excs=(module.APIStatusError,),
         non_retryable_codes=frozenset(),  # update if we learn of hard quota codes
         message_markers={},
     )
@@ -868,7 +867,11 @@ except Exception:  # Anthropic optional
     # and gate retries using message markers (code sniffing).
     # See: https://github.com/googleapis/python-genai?tab=readme-ov-file#error-handling
     try:
-        from google.genai import errors as gerrors
+        module = require_dependency(
+            "google.genai",
+            provider_label="retry_policy",
+            install_hint="Install it with `pip install google-genai`.",
+        )
 
         _HTTPX_NET_EXCS = _httpx_net_excs()
         _REQUESTS_EXCS = _requests_net_excs()
@@ -887,9 +890,9 @@ try:
     GOOGLE_ERROR_POLICY = ErrorPolicy(
         auth_excs=(),  # we will classify 401/403 via markers below (see non-retryable codes)
         rate_limit_excs=(
-            gerrors.ClientError,
+            module.gerrors.ClientError,
         ),  # includes 429; markers decide retry vs not
-        network_excs=(gerrors.ServerError,)
+        network_excs=(module.gerrors.ServerError,)
         + _HTTPX_NET_EXCS
         + _REQUESTS_EXCS,  # treat 5xx as transient
         http_excs=(),  # no reliable .status_code on exceptions; handled above
deepeval/models/utils.py
CHANGED

@@ -1,9 +1,13 @@
-…
+import logging
+from typing import Any, Dict, Optional, Tuple
 from pydantic import SecretStr
 
 from deepeval.errors import DeepEvalError
 
 
+logger = logging.getLogger(__name__)
+
+
 def parse_model_name(model_name: Optional[str] = None) -> str:
     """Extract base model name from provider-prefixed format.
 
@@ -74,3 +78,44 @@ def require_secret_api_key(
     )
 
     return api_key
+
+
+def normalize_kwargs_and_extract_aliases(
+    provider_label: str,
+    kwargs: Dict[str, Any],
+    alias_map: Dict[str, list],
+) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+    """
+    Normalize legacy keyword argument names according to alias_map.
+
+    alias_map is of the form: {new_name: [old_name1, old_name2, ...]}
+
+    - Returns (normalized_kwargs, extracted_values)
+      where:
+        - normalized_kwargs has all legacy keys removed (to prevent forwarding
+          to downstream SDK clients).
+        - extracted_values maps new_name -> value for any alias that was used.
+
+    - Logs a warning for each legacy keyword used, so callers know they should
+      migrate to the new name.
+    """
+    normalized = dict(kwargs)
+    extracted: Dict[str, Any] = {}
+
+    for new_name, old_names in alias_map.items():
+        for old_name in old_names:
+            if old_name in normalized:
+                value = normalized.pop(old_name)
+
+                logger.warning(
+                    "%s keyword '%s' is deprecated; please use '%s' instead.",
+                    provider_label,
+                    old_name,
+                    new_name,
+                )
+
+                # Only preserve the first alias value we see for a given new_name
+                if new_name not in extracted:
+                    extracted[new_name] = value
+
+    return normalized, extracted
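A worked example of the helper above, using the alias map that openai_model.py registers:

from deepeval.models.utils import normalize_kwargs_and_extract_aliases

normalized, extracted = normalize_kwargs_and_extract_aliases(
    "GPTModel",                               # provider label used in the warning
    {"_openai_api_key": "sk-...", "timeout": 30},
    {"api_key": ["_openai_api_key"]},         # _ALIAS_MAP from openai_model.py
)
# normalized == {"timeout": 30}       -> legacy key stripped before reaching the SDK
# extracted  == {"api_key": "sk-..."} -> caller re-maps it onto the new parameter,
# and one logger.warning notes that '_openai_api_key' is deprecated.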
deepeval/optimizer/algorithms/base.py
ADDED

@@ -0,0 +1,29 @@
+from abc import ABC, abstractmethod
+from typing import Union, List, Dict, Tuple
+
+from deepeval.models.base_model import DeepEvalBaseLLM
+from deepeval.optimizer.scorer.base import BaseScorer
+from deepeval.prompt.prompt import Prompt
+from deepeval.dataset.golden import Golden, ConversationalGolden
+
+
+class BaseAlgorithm(ABC):
+    name: str
+    optimizer_model: DeepEvalBaseLLM
+    scorer: BaseScorer
+
+    @abstractmethod
+    def execute(
+        self,
+        prompt: Prompt,
+        goldens: Union[List[Golden], List[ConversationalGolden]],
+    ) -> Tuple[Prompt, Dict]:
+        raise NotImplementedError
+
+    @abstractmethod
+    async def a_execute(
+        self,
+        prompt: Prompt,
+        goldens: Union[List[Golden], List[ConversationalGolden]],
+    ) -> Tuple[Prompt, Dict]:
+        raise NotImplementedError
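A minimal sketch of a conforming subclass of the new interface (the class body is illustrative, not part of the release):

from typing import Dict, List, Tuple, Union

from deepeval.dataset.golden import Golden, ConversationalGolden
from deepeval.optimizer.algorithms.base import BaseAlgorithm
from deepeval.prompt.prompt import Prompt


class NoOpAlgorithm(BaseAlgorithm):
    name = "noop"

    def execute(
        self,
        prompt: Prompt,
        goldens: Union[List[Golden], List[ConversationalGolden]],
    ) -> Tuple[Prompt, Dict]:
        # Return the prompt unchanged with an empty report.
        return prompt, {}

    async def a_execute(
        self,
        prompt: Prompt,
        goldens: Union[List[Golden], List[ConversationalGolden]],
    ) -> Tuple[Prompt, Dict]:
        return prompt, {}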
deepeval/optimizer/algorithms/configs.py
ADDED

@@ -0,0 +1,18 @@
+# Internal GEPA constants - not exposed to users
+GEPA_MIN_DELTA: float = 0.0
+GEPA_TIE_TOLERANCE: float = 1e-9
+GEPA_REWRITE_INSTRUCTION_MAX_CHARS: int = 4096
+
+# Internal MIPROV2 constants - not exposed to users
+MIPROV2_MIN_DELTA: float = 0.0
+MIPROV2_REWRITE_INSTRUCTION_MAX_CHARS: int = 4096
+MIPROV2_DEFAULT_NUM_CANDIDATES: int = 10
+MIPROV2_DEFAULT_NUM_TRIALS: int = 20
+MIPROV2_DEFAULT_MINIBATCH_SIZE: int = 25
+MIPROV2_DEFAULT_MINIBATCH_FULL_EVAL_STEPS: int = 10
+MIPROV2_DEFAULT_MAX_BOOTSTRAPPED_DEMOS: int = 4
+MIPROV2_DEFAULT_MAX_LABELED_DEMOS: int = 4
+MIPROV2_DEFAULT_NUM_DEMO_SETS: int = 5
+
+# Internal SIMBA constants - not exposed to users
+SIMBA_DEMO_INPUT_MAX_CHARS: int = 256