deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +139 -2
- deepeval/evaluate/evaluate.py +16 -11
- deepeval/evaluate/execute.py +13 -181
- deepeval/evaluate/utils.py +6 -26
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +14 -16
- deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
- deepeval/metrics/answer_relevancy/template.py +22 -3
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -44
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +16 -2
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +138 -149
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +7 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -19
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +158 -122
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +49 -33
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +79 -33
- deepeval/models/embedding_models/local_embedding_model.py +39 -20
- deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
- deepeval/models/embedding_models/openai_embedding_model.py +42 -22
- deepeval/models/llms/amazon_bedrock_model.py +226 -72
- deepeval/models/llms/anthropic_model.py +178 -63
- deepeval/models/llms/azure_model.py +218 -60
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +95 -40
- deepeval/models/llms/gemini_model.py +209 -64
- deepeval/models/llms/grok_model.py +139 -68
- deepeval/models/llms/kimi_model.py +140 -90
- deepeval/models/llms/litellm_model.py +131 -37
- deepeval/models/llms/local_model.py +125 -21
- deepeval/models/llms/ollama_model.py +147 -24
- deepeval/models/llms/openai_model.py +222 -269
- deepeval/models/llms/portkey_model.py +81 -22
- deepeval/models/llms/utils.py +8 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +106 -5
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +26 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +68 -1
- deepeval/test_case/llm_test_case.py +206 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +18 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
|
@@ -3,12 +3,22 @@ import requests
|
|
|
3
3
|
from typing import Any, Dict, List, Optional, Union
|
|
4
4
|
from pydantic import AnyUrl, SecretStr
|
|
5
5
|
|
|
6
|
+
from deepeval.errors import DeepEvalError
|
|
6
7
|
from deepeval.config.settings import get_settings
|
|
7
|
-
from deepeval.models.utils import
|
|
8
|
+
from deepeval.models.utils import (
|
|
9
|
+
require_secret_api_key,
|
|
10
|
+
)
|
|
11
|
+
from deepeval.test_case import MLLMImage
|
|
12
|
+
from deepeval.utils import check_if_multimodal, convert_to_multi_modal_array
|
|
8
13
|
from deepeval.models import DeepEvalBaseLLM
|
|
9
14
|
from deepeval.utils import require_param
|
|
10
15
|
|
|
11
16
|
|
|
17
|
+
def _request_timeout_seconds() -> float:
|
|
18
|
+
timeout = float(get_settings().DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0)
|
|
19
|
+
return timeout if timeout > 0 else 30.0
|
|
20
|
+
|
|
21
|
+
|
|
12
22
|
class PortkeyModel(DeepEvalBaseLLM):
|
|
13
23
|
def __init__(
|
|
14
24
|
self,
|
|
@@ -16,20 +26,15 @@ class PortkeyModel(DeepEvalBaseLLM):
|
|
|
16
26
|
api_key: Optional[str] = None,
|
|
17
27
|
base_url: Optional[AnyUrl] = None,
|
|
18
28
|
provider: Optional[str] = None,
|
|
29
|
+
generation_kwargs: Optional[Dict] = None,
|
|
30
|
+
**kwargs,
|
|
19
31
|
):
|
|
20
32
|
settings = get_settings()
|
|
21
33
|
model = model or settings.PORTKEY_MODEL_NAME
|
|
22
34
|
|
|
23
|
-
self.model = require_param(
|
|
24
|
-
model,
|
|
25
|
-
provider_label="Portkey",
|
|
26
|
-
env_var_name="PORTKEY_MODEL_NAME",
|
|
27
|
-
param_hint="model",
|
|
28
|
-
)
|
|
29
|
-
|
|
30
35
|
if api_key is not None:
|
|
31
36
|
# keep it secret, keep it safe from serializings, logging and alike
|
|
32
|
-
self.api_key: SecretStr
|
|
37
|
+
self.api_key: Optional[SecretStr] = SecretStr(api_key)
|
|
33
38
|
else:
|
|
34
39
|
self.api_key = settings.PORTKEY_API_KEY
|
|
35
40
|
|
|
@@ -38,6 +43,16 @@ class PortkeyModel(DeepEvalBaseLLM):
|
|
|
38
43
|
elif settings.PORTKEY_BASE_URL is not None:
|
|
39
44
|
base_url = str(settings.PORTKEY_BASE_URL).rstrip("/")
|
|
40
45
|
|
|
46
|
+
provider = provider or settings.PORTKEY_PROVIDER_NAME
|
|
47
|
+
|
|
48
|
+
# validation
|
|
49
|
+
model = require_param(
|
|
50
|
+
model,
|
|
51
|
+
provider_label="Portkey",
|
|
52
|
+
env_var_name="PORTKEY_MODEL_NAME",
|
|
53
|
+
param_hint="model",
|
|
54
|
+
)
|
|
55
|
+
|
|
41
56
|
self.base_url = require_param(
|
|
42
57
|
base_url,
|
|
43
58
|
provider_label="Portkey",
|
|
@@ -45,13 +60,16 @@ class PortkeyModel(DeepEvalBaseLLM):
|
|
|
45
60
|
param_hint="base_url",
|
|
46
61
|
)
|
|
47
62
|
|
|
48
|
-
provider = provider or settings.PORTKEY_PROVIDER_NAME
|
|
49
63
|
self.provider = require_param(
|
|
50
64
|
provider,
|
|
51
65
|
provider_label="Portkey",
|
|
52
66
|
env_var_name="PORTKEY_PROVIDER_NAME",
|
|
53
67
|
param_hint="provider",
|
|
54
68
|
)
|
|
69
|
+
# Keep sanitized kwargs for client call to strip legacy keys
|
|
70
|
+
self.kwargs = kwargs
|
|
71
|
+
self.generation_kwargs = generation_kwargs or {}
|
|
72
|
+
super().__init__(model)
|
|
55
73
|
|
|
56
74
|
def _headers(self) -> Dict[str, str]:
|
|
57
75
|
api_key = require_secret_api_key(
|
|
@@ -70,15 +88,51 @@ class PortkeyModel(DeepEvalBaseLLM):
|
|
|
70
88
|
return headers
|
|
71
89
|
|
|
72
90
|
def _payload(self, prompt: str) -> Dict[str, Any]:
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
91
|
+
if check_if_multimodal(prompt):
|
|
92
|
+
prompt = convert_to_multi_modal_array(input=prompt)
|
|
93
|
+
content = self.generate_content(prompt)
|
|
94
|
+
else:
|
|
95
|
+
content = [{"type": "text", "text": prompt}]
|
|
96
|
+
payload = {
|
|
97
|
+
"model": self.name,
|
|
98
|
+
"messages": [{"role": "user", "content": content}],
|
|
76
99
|
}
|
|
100
|
+
if self.generation_kwargs:
|
|
101
|
+
payload.update(self.generation_kwargs)
|
|
102
|
+
return payload
|
|
103
|
+
|
|
104
|
+
def generate_content(
|
|
105
|
+
self, multimodal_input: List[Union[str, MLLMImage]] = []
|
|
106
|
+
):
|
|
107
|
+
content = []
|
|
108
|
+
for element in multimodal_input:
|
|
109
|
+
if isinstance(element, str):
|
|
110
|
+
content.append({"type": "text", "text": element})
|
|
111
|
+
elif isinstance(element, MLLMImage):
|
|
112
|
+
if element.url and not element.local:
|
|
113
|
+
content.append(
|
|
114
|
+
{
|
|
115
|
+
"type": "image_url",
|
|
116
|
+
"image_url": {"url": element.url},
|
|
117
|
+
}
|
|
118
|
+
)
|
|
119
|
+
else:
|
|
120
|
+
element.ensure_images_loaded()
|
|
121
|
+
data_uri = (
|
|
122
|
+
f"data:{element.mimeType};base64,{element.dataBase64}"
|
|
123
|
+
)
|
|
124
|
+
content.append(
|
|
125
|
+
{
|
|
126
|
+
"type": "image_url",
|
|
127
|
+
"image_url": {"url": data_uri},
|
|
128
|
+
}
|
|
129
|
+
)
|
|
130
|
+
return content
|
|
77
131
|
|
|
78
132
|
def _extract_content(self, data: Dict[str, Any]) -> str:
|
|
79
133
|
choices: Union[List[Dict[str, Any]], None] = data.get("choices")
|
|
80
134
|
if not choices:
|
|
81
|
-
raise
|
|
135
|
+
raise DeepEvalError("Portkey response did not include any choices.")
|
|
82
136
|
message = choices[0].get("message", {})
|
|
83
137
|
content: Union[str, List[Dict[str, Any]], None] = message.get("content")
|
|
84
138
|
if isinstance(content, str):
|
|
@@ -88,12 +142,13 @@ class PortkeyModel(DeepEvalBaseLLM):
|
|
|
88
142
|
return ""
|
|
89
143
|
|
|
90
144
|
def generate(self, prompt: str) -> str:
|
|
145
|
+
|
|
91
146
|
try:
|
|
92
147
|
response = requests.post(
|
|
93
148
|
f"{self.base_url}/chat/completions",
|
|
94
149
|
json=self._payload(prompt),
|
|
95
150
|
headers=self._headers(),
|
|
96
|
-
timeout=
|
|
151
|
+
timeout=_request_timeout_seconds(),
|
|
97
152
|
)
|
|
98
153
|
response.raise_for_status()
|
|
99
154
|
except requests.HTTPError as error:
|
|
@@ -102,31 +157,35 @@ class PortkeyModel(DeepEvalBaseLLM):
|
|
|
102
157
|
body = response.json()
|
|
103
158
|
except Exception:
|
|
104
159
|
body = response.text
|
|
105
|
-
raise
|
|
160
|
+
raise DeepEvalError(
|
|
106
161
|
f"Portkey request failed with status {response.status_code}: {body}"
|
|
107
162
|
) from error
|
|
108
163
|
except requests.RequestException as error:
|
|
109
|
-
raise
|
|
164
|
+
raise DeepEvalError(f"Portkey request failed: {error}") from error
|
|
110
165
|
return self._extract_content(response.json())
|
|
111
166
|
|
|
112
167
|
async def a_generate(self, prompt: str) -> str:
|
|
168
|
+
|
|
113
169
|
async with aiohttp.ClientSession() as session:
|
|
114
170
|
async with session.post(
|
|
115
171
|
f"{self.base_url}/chat/completions",
|
|
116
172
|
json=self._payload(prompt),
|
|
117
173
|
headers=self._headers(),
|
|
118
|
-
timeout=
|
|
174
|
+
timeout=_request_timeout_seconds(),
|
|
119
175
|
) as response:
|
|
120
176
|
if response.status >= 400:
|
|
121
177
|
body = await response.text()
|
|
122
|
-
raise
|
|
178
|
+
raise DeepEvalError(
|
|
123
179
|
f"Portkey request failed with status {response.status}: {body}"
|
|
124
180
|
)
|
|
125
181
|
data = await response.json()
|
|
126
182
|
return self._extract_content(data)
|
|
127
183
|
|
|
128
|
-
def get_model_name(self) -> str:
|
|
129
|
-
return f"Portkey ({self.model})"
|
|
130
|
-
|
|
131
184
|
def load_model(self):
|
|
132
185
|
return None
|
|
186
|
+
|
|
187
|
+
def get_model_name(self):
|
|
188
|
+
return f"{self.name} (Portkey)"
|
|
189
|
+
|
|
190
|
+
def supports_multimodal(self):
|
|
191
|
+
return True
|
deepeval/models/llms/utils.py
CHANGED
|
@@ -3,6 +3,11 @@ import re
|
|
|
3
3
|
import json
|
|
4
4
|
import asyncio
|
|
5
5
|
|
|
6
|
+
from deepeval.errors import DeepEvalError
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
MULTIMODAL_MODELS = ["GPTModel", "AzureModel", "GeminiModel", "OllamaModel"]
|
|
10
|
+
|
|
6
11
|
|
|
7
12
|
def trim_and_load_json(
|
|
8
13
|
input_string: str,
|
|
@@ -18,7 +23,7 @@ def trim_and_load_json(
|
|
|
18
23
|
return json.loads(jsonStr)
|
|
19
24
|
except json.JSONDecodeError:
|
|
20
25
|
error_str = "Evaluation LLM outputted an invalid JSON. Please use a better evaluation model."
|
|
21
|
-
raise
|
|
26
|
+
raise DeepEvalError(error_str)
|
|
22
27
|
except Exception as e:
|
|
23
28
|
raise Exception(f"An unexpected error occurred: {str(e)}")
|
|
24
29
|
|
|
@@ -38,7 +43,7 @@ def safe_asyncio_run(coro):
|
|
|
38
43
|
return loop.run_until_complete(future)
|
|
39
44
|
else:
|
|
40
45
|
return loop.run_until_complete(coro)
|
|
41
|
-
except Exception
|
|
46
|
+
except Exception:
|
|
42
47
|
raise
|
|
43
|
-
except Exception
|
|
48
|
+
except Exception:
|
|
44
49
|
raise
|
deepeval/models/retry_policy.py
CHANGED
|
@@ -55,6 +55,7 @@ from tenacity.stop import stop_base
|
|
|
55
55
|
from tenacity.wait import wait_base
|
|
56
56
|
from contextvars import ContextVar, copy_context
|
|
57
57
|
|
|
58
|
+
from deepeval.utils import require_dependency
|
|
58
59
|
from deepeval.constants import (
|
|
59
60
|
ProviderSlug as PS,
|
|
60
61
|
slugify,
|
|
@@ -829,25 +830,23 @@ try:
|
|
|
829
830
|
except Exception: # botocore not present (aiobotocore optional)
|
|
830
831
|
BEDROCK_ERROR_POLICY = None
|
|
831
832
|
|
|
832
|
-
|
|
833
833
|
####################
|
|
834
834
|
# Anthropic Policy #
|
|
835
835
|
####################
|
|
836
836
|
|
|
837
837
|
try:
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
APIStatusError,
|
|
838
|
+
|
|
839
|
+
module = require_dependency(
|
|
840
|
+
"anthropic",
|
|
841
|
+
provider_label="retry_policy",
|
|
842
|
+
install_hint="Install it with `pip install anthropic`.",
|
|
844
843
|
)
|
|
845
844
|
|
|
846
845
|
ANTHROPIC_ERROR_POLICY = ErrorPolicy(
|
|
847
|
-
auth_excs=(AuthenticationError,),
|
|
848
|
-
rate_limit_excs=(RateLimitError,),
|
|
849
|
-
network_excs=(APIConnectionError, APITimeoutError),
|
|
850
|
-
http_excs=(APIStatusError,),
|
|
846
|
+
auth_excs=(module.AuthenticationError,),
|
|
847
|
+
rate_limit_excs=(module.RateLimitError,),
|
|
848
|
+
network_excs=(module.APIConnectionError, module.APITimeoutError),
|
|
849
|
+
http_excs=(module.APIStatusError,),
|
|
851
850
|
non_retryable_codes=frozenset(), # update if we learn of hard quota codes
|
|
852
851
|
message_markers={},
|
|
853
852
|
)
|
|
@@ -868,7 +867,11 @@ except Exception: # Anthropic optional
|
|
|
868
867
|
# and gate retries using message markers (code sniffing).
|
|
869
868
|
# See: https://github.com/googleapis/python-genai?tab=readme-ov-file#error-handling
|
|
870
869
|
try:
|
|
871
|
-
|
|
870
|
+
module = require_dependency(
|
|
871
|
+
"google.genai",
|
|
872
|
+
provider_label="retry_policy",
|
|
873
|
+
install_hint="Install it with `pip install google-genai`.",
|
|
874
|
+
)
|
|
872
875
|
|
|
873
876
|
_HTTPX_NET_EXCS = _httpx_net_excs()
|
|
874
877
|
_REQUESTS_EXCS = _requests_net_excs()
|
|
@@ -887,9 +890,9 @@ try:
|
|
|
887
890
|
GOOGLE_ERROR_POLICY = ErrorPolicy(
|
|
888
891
|
auth_excs=(), # we will classify 401/403 via markers below (see non-retryable codes)
|
|
889
892
|
rate_limit_excs=(
|
|
890
|
-
gerrors.ClientError,
|
|
893
|
+
module.gerrors.ClientError,
|
|
891
894
|
), # includes 429; markers decide retry vs not
|
|
892
|
-
network_excs=(gerrors.ServerError,)
|
|
895
|
+
network_excs=(module.gerrors.ServerError,)
|
|
893
896
|
+ _HTTPX_NET_EXCS
|
|
894
897
|
+ _REQUESTS_EXCS, # treat 5xx as transient
|
|
895
898
|
http_excs=(), # no reliable .status_code on exceptions; handled above
|
deepeval/models/utils.py
CHANGED
|
@@ -1,10 +1,14 @@
|
|
|
1
|
-
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Any, Dict, Optional, Tuple
|
|
2
3
|
from pydantic import SecretStr
|
|
3
4
|
|
|
4
5
|
from deepeval.errors import DeepEvalError
|
|
5
6
|
|
|
6
7
|
|
|
7
|
-
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def parse_model_name(model_name: Optional[str] = None) -> Optional[str]:
|
|
8
12
|
"""Extract base model name from provider-prefixed format.
|
|
9
13
|
|
|
10
14
|
This function is useful for extracting the actual model name from a
|
|
@@ -28,9 +32,9 @@ def parse_model_name(model_name: Optional[str] = None) -> str:
|
|
|
28
32
|
if model_name is None:
|
|
29
33
|
return None
|
|
30
34
|
|
|
31
|
-
if "/" in model_name:
|
|
32
|
-
|
|
33
|
-
|
|
35
|
+
# if "/" in model_name:
|
|
36
|
+
# _, parsed_model_name = model_name.split("/", 1)
|
|
37
|
+
# return parsed_model_name
|
|
34
38
|
return model_name
|
|
35
39
|
|
|
36
40
|
|
|
@@ -74,3 +78,100 @@ def require_secret_api_key(
|
|
|
74
78
|
)
|
|
75
79
|
|
|
76
80
|
return api_key
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def require_costs(
|
|
84
|
+
model_data,
|
|
85
|
+
model_name: str,
|
|
86
|
+
input_token_envvar: str,
|
|
87
|
+
output_token_envvar: str,
|
|
88
|
+
cost_per_input_token: Optional[float] = None,
|
|
89
|
+
cost_per_output_token: Optional[float] = None,
|
|
90
|
+
) -> Tuple[Optional[float], Optional[float]]:
|
|
91
|
+
"""
|
|
92
|
+
Validates and returns the cost parameters (input and output tokens) for a model.
|
|
93
|
+
|
|
94
|
+
Arguments:
|
|
95
|
+
- model_data: The model's data object, which should contain `input_price` and `output_price`.
|
|
96
|
+
- model_name: The model name used for error messaging.
|
|
97
|
+
- cost_per_input_token: The input token cost provided during model initialization (optional).
|
|
98
|
+
- cost_per_output_token: The output token cost provided during model initialization (optional).
|
|
99
|
+
- input_token_envvar: The environment variable name for input cost.
|
|
100
|
+
- output_token_envvar: The environment variable name for output cost.
|
|
101
|
+
|
|
102
|
+
Returns:
|
|
103
|
+
- A tuple of validated values (input_cost, output_cost). If the values are provided, they are returned.
|
|
104
|
+
If not provided, they are fetched from settings or environment variables.
|
|
105
|
+
"""
|
|
106
|
+
|
|
107
|
+
def validate_cost(
|
|
108
|
+
value: Optional[float], envvar_name: str
|
|
109
|
+
) -> Optional[float]:
|
|
110
|
+
"""Helper function to validate the cost values."""
|
|
111
|
+
if value is not None and value < 0:
|
|
112
|
+
raise DeepEvalError(f"{envvar_name} must be >= 0.")
|
|
113
|
+
return value
|
|
114
|
+
|
|
115
|
+
# Validate provided token costs
|
|
116
|
+
cost_per_input_token = validate_cost(
|
|
117
|
+
cost_per_input_token, input_token_envvar
|
|
118
|
+
)
|
|
119
|
+
cost_per_output_token = validate_cost(
|
|
120
|
+
cost_per_output_token, output_token_envvar
|
|
121
|
+
)
|
|
122
|
+
|
|
123
|
+
# If model data doesn't have pricing, use provided values or environment variables
|
|
124
|
+
if model_data.input_price is None or model_data.output_price is None:
|
|
125
|
+
if cost_per_input_token is None or cost_per_output_token is None:
|
|
126
|
+
raise DeepEvalError(
|
|
127
|
+
f"No pricing available for `{model_name}`. "
|
|
128
|
+
f"Please provide both `cost_per_input_token` and `cost_per_output_token` when initializing `{model_name}`, "
|
|
129
|
+
f"or set {input_token_envvar} and {output_token_envvar} environment variables."
|
|
130
|
+
)
|
|
131
|
+
|
|
132
|
+
# Return the validated cost values as a tuple
|
|
133
|
+
return cost_per_input_token, cost_per_output_token
|
|
134
|
+
|
|
135
|
+
# If no custom cost values are provided, return model's default cost values
|
|
136
|
+
return model_data.input_price, model_data.output_price
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def normalize_kwargs_and_extract_aliases(
|
|
140
|
+
provider_label: str,
|
|
141
|
+
kwargs: Dict[str, Any],
|
|
142
|
+
alias_map: Dict[str, list],
|
|
143
|
+
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
|
|
144
|
+
"""
|
|
145
|
+
Normalize legacy keyword argument names according to alias_map.
|
|
146
|
+
|
|
147
|
+
alias_map is of the form: {new_name: [old_name1, old_name2, ...]}
|
|
148
|
+
|
|
149
|
+
- Returns (normalized_kwargs, extracted_values)
|
|
150
|
+
where:
|
|
151
|
+
- normalized_kwargs has all legacy keys removed (to prevent forwarding
|
|
152
|
+
to downstream SDK clients).
|
|
153
|
+
- extracted_values maps new_name -> value for any alias that was used.
|
|
154
|
+
|
|
155
|
+
- Logs a warning for each legacy keyword used, so callers know they should
|
|
156
|
+
migrate to the new name.
|
|
157
|
+
"""
|
|
158
|
+
normalized = dict(kwargs)
|
|
159
|
+
extracted: Dict[str, Any] = {}
|
|
160
|
+
|
|
161
|
+
for new_name, old_names in alias_map.items():
|
|
162
|
+
for old_name in old_names:
|
|
163
|
+
if old_name in normalized:
|
|
164
|
+
value = normalized.pop(old_name)
|
|
165
|
+
|
|
166
|
+
logger.warning(
|
|
167
|
+
"%s keyword '%s' is deprecated; please use '%s' instead.",
|
|
168
|
+
provider_label,
|
|
169
|
+
old_name,
|
|
170
|
+
new_name,
|
|
171
|
+
)
|
|
172
|
+
|
|
173
|
+
# Only preserve the first alias value we see for a given new_name
|
|
174
|
+
if new_name not in extracted:
|
|
175
|
+
extracted[new_name] = value
|
|
176
|
+
|
|
177
|
+
return normalized, extracted
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import Union, List, Dict, Tuple
|
|
3
|
+
|
|
4
|
+
from deepeval.models.base_model import DeepEvalBaseLLM
|
|
5
|
+
from deepeval.optimizer.scorer.base import BaseScorer
|
|
6
|
+
from deepeval.prompt.prompt import Prompt
|
|
7
|
+
from deepeval.dataset.golden import Golden, ConversationalGolden
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class BaseAlgorithm(ABC):
|
|
11
|
+
name: str
|
|
12
|
+
optimizer_model: DeepEvalBaseLLM
|
|
13
|
+
scorer: BaseScorer
|
|
14
|
+
|
|
15
|
+
@abstractmethod
|
|
16
|
+
def execute(
|
|
17
|
+
self,
|
|
18
|
+
prompt: Prompt,
|
|
19
|
+
goldens: Union[List[Golden], List[ConversationalGolden]],
|
|
20
|
+
) -> Tuple[Prompt, Dict]:
|
|
21
|
+
raise NotImplementedError
|
|
22
|
+
|
|
23
|
+
@abstractmethod
|
|
24
|
+
async def a_execute(
|
|
25
|
+
self,
|
|
26
|
+
prompt: Prompt,
|
|
27
|
+
goldens: Union[List[Golden], List[ConversationalGolden]],
|
|
28
|
+
) -> Tuple[Prompt, Dict]:
|
|
29
|
+
raise NotImplementedError
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# Internal GEPA constants - not exposed to users
|
|
2
|
+
GEPA_MIN_DELTA: float = 0.0
|
|
3
|
+
GEPA_TIE_TOLERANCE: float = 1e-9
|
|
4
|
+
GEPA_REWRITE_INSTRUCTION_MAX_CHARS: int = 4096
|
|
5
|
+
|
|
6
|
+
# Internal MIPROV2 constants - not exposed to users
|
|
7
|
+
MIPROV2_MIN_DELTA: float = 0.0
|
|
8
|
+
MIPROV2_REWRITE_INSTRUCTION_MAX_CHARS: int = 4096
|
|
9
|
+
MIPROV2_DEFAULT_NUM_CANDIDATES: int = 10
|
|
10
|
+
MIPROV2_DEFAULT_NUM_TRIALS: int = 20
|
|
11
|
+
MIPROV2_DEFAULT_MINIBATCH_SIZE: int = 25
|
|
12
|
+
MIPROV2_DEFAULT_MINIBATCH_FULL_EVAL_STEPS: int = 10
|
|
13
|
+
MIPROV2_DEFAULT_MAX_BOOTSTRAPPED_DEMOS: int = 4
|
|
14
|
+
MIPROV2_DEFAULT_MAX_LABELED_DEMOS: int = 4
|
|
15
|
+
MIPROV2_DEFAULT_NUM_DEMO_SETS: int = 5
|
|
16
|
+
|
|
17
|
+
# Internal SIMBA constants - not exposed to users
|
|
18
|
+
SIMBA_DEMO_INPUT_MAX_CHARS: int = 256
|