deepeval 3.7.9__py3-none-any.whl → 3.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/annotation/annotation.py +2 -2
- deepeval/cli/main.py +168 -0
- deepeval/confident/api.py +2 -0
- deepeval/config/settings.py +13 -0
- deepeval/constants.py +1 -0
- deepeval/dataset/dataset.py +6 -4
- deepeval/integrations/langchain/callback.py +330 -158
- deepeval/integrations/langchain/utils.py +31 -8
- deepeval/key_handler.py +8 -1
- deepeval/metrics/contextual_recall/contextual_recall.py +25 -6
- deepeval/metrics/contextual_recall/schema.py +6 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +35 -0
- deepeval/metrics/g_eval/g_eval.py +35 -1
- deepeval/metrics/g_eval/utils.py +65 -0
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +10 -1
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +10 -1
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +10 -1
- deepeval/metrics/utils.py +1 -1
- deepeval/models/__init__.py +2 -0
- deepeval/models/llms/__init__.py +2 -0
- deepeval/models/llms/amazon_bedrock_model.py +51 -6
- deepeval/models/llms/azure_model.py +33 -7
- deepeval/models/llms/constants.py +23 -0
- deepeval/models/llms/gemini_model.py +6 -1
- deepeval/models/llms/openai_model.py +5 -4
- deepeval/models/llms/openrouter_model.py +398 -0
- deepeval/models/retry_policy.py +3 -0
- deepeval/prompt/api.py +1 -0
- deepeval/prompt/prompt.py +7 -5
- deepeval/test_case/llm_test_case.py +1 -0
- deepeval/tracing/tracing.py +6 -1
- deepeval/tracing/types.py +1 -1
- {deepeval-3.7.9.dist-info → deepeval-3.8.1.dist-info}/METADATA +3 -3
- {deepeval-3.7.9.dist-info → deepeval-3.8.1.dist-info}/RECORD +38 -37
- {deepeval-3.7.9.dist-info → deepeval-3.8.1.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.9.dist-info → deepeval-3.8.1.dist-info}/WHEEL +0 -0
- {deepeval-3.7.9.dist-info → deepeval-3.8.1.dist-info}/entry_points.txt +0 -0
@@ -1,6 +1,6 @@
 from openai.types.chat.chat_completion import ChatCompletion
 from openai import AzureOpenAI, AsyncAzureOpenAI
-from typing import Optional, Tuple, Union, Dict, List
+from typing import Optional, Tuple, Union, Dict, List, Callable, Awaitable
 from pydantic import BaseModel, SecretStr

 from deepeval.errors import DeepEvalError
@@ -42,6 +42,10 @@ class AzureOpenAIModel(DeepEvalBaseLLM):
         model: Optional[str] = None,
         api_key: Optional[str] = None,
         base_url: Optional[str] = None,
+        azure_ad_token_provider: Optional[
+            Callable[[], "str | Awaitable[str]"]
+        ] = None,
+        azure_ad_token: Optional[str] = None,
         temperature: Optional[float] = None,
         cost_per_input_token: Optional[float] = None,
         cost_per_output_token: Optional[float] = None,
@@ -67,12 +71,19 @@ class AzureOpenAIModel(DeepEvalBaseLLM):
         model = model or settings.AZURE_MODEL_NAME
         deployment_name = deployment_name or settings.AZURE_DEPLOYMENT_NAME

+        self.azure_ad_token_provider = azure_ad_token_provider
+
         if api_key is not None:
             # keep it secret, keep it safe from serializings, logging and alike
             self.api_key: Optional[SecretStr] = SecretStr(api_key)
         else:
             self.api_key = settings.AZURE_OPENAI_API_KEY

+        if azure_ad_token is not None:
+            self.azure_ad_token = azure_ad_token
+        else:
+            self.azure_ad_token = settings.AZURE_OPENAI_AD_TOKEN
+
         api_version = api_version or settings.OPENAI_API_VERSION
         if base_url is not None:
             base_url = str(base_url).rstrip("/")
@@ -431,18 +442,33 @@ class AzureOpenAIModel(DeepEvalBaseLLM):
         return kwargs

     def _build_client(self, cls):
-        api_key = require_secret_api_key(
-            self.api_key,
-            provider_label="AzureOpenAI",
-            env_var_name="AZURE_OPENAI_API_KEY",
-            param_hint="`api_key` to AzureOpenAIModel(...)",
-        )
+        # Only require the API key / Azure ad token if no token provider is supplied
+        azure_ad_token = None
+        api_key = None
+
+        if self.azure_ad_token_provider is None:
+            if self.azure_ad_token is not None:
+                azure_ad_token = require_secret_api_key(
+                    self.azure_ad_token,
+                    provider_label="AzureOpenAI",
+                    env_var_name="AZURE_OPENAI_AD_TOKEN",
+                    param_hint="`azure_ad_token` to AzureOpenAIModel(...)",
+                )
+            else:
+                api_key = require_secret_api_key(
+                    self.api_key,
+                    provider_label="AzureOpenAI",
+                    env_var_name="AZURE_OPENAI_API_KEY",
+                    param_hint="`api_key` to AzureOpenAIModel(...)",
+                )

         kw = dict(
             api_key=api_key,
             api_version=self.api_version,
             azure_endpoint=self.base_url,
             azure_deployment=self.deployment_name,
+            azure_ad_token_provider=self.azure_ad_token_provider,
+            azure_ad_token=azure_ad_token,
             **self._client_kwargs(),
         )
         try:
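
A minimal sketch of how the new Entra ID parameters could be used, assuming the azure-identity package is installed; the endpoint, deployment, and API version values are placeholders, and only `azure_ad_token_provider`, `azure_ad_token`, and the existing constructor parameters come from this diff:

from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from deepeval.models import AzureOpenAIModel

# Callable that returns a fresh Entra ID token for Azure OpenAI
token_provider = get_bearer_token_provider(
    DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default"
)

model = AzureOpenAIModel(
    model="gpt-4.1",
    deployment_name="my-deployment",                    # placeholder
    base_url="https://my-resource.openai.azure.com",    # placeholder
    api_version="2024-06-01",                           # placeholder
    azure_ad_token_provider=token_provider,             # api_key no longer required
)

When a provider is supplied, `_build_client` skips `require_secret_api_key` entirely and forwards `azure_ad_token_provider` to the AzureOpenAI client.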
deepeval/models/llms/constants.py
CHANGED
@@ -3,6 +3,11 @@ from typing import Any, Callable, Union
 from deepeval.models.base_model import DeepEvalModelData


+DEFAULT_GPT_MODEL = "gpt-4.1"
+# OpenRouter uses provider/model format (e.g., "openai/gpt-4", "anthropic/claude-3-opus")
+# DeepEval does not validate OpenRouter model strings.
+DEFAULT_OPENROUTER_MODEL = f"openai/{DEFAULT_GPT_MODEL}"
+
 ModelDataFactory = Callable[[], DeepEvalModelData]
 ModelDataValue = Union[DeepEvalModelData, ModelDataFactory]

@@ -366,6 +371,24 @@ OPENAI_MODELS_DATA = ModelDataRegistry(
             input_price=1.25 / 1e6,
             output_price=10.00 / 1e6,
         ),
+        "gpt-5.1": make_model_data(
+            supports_log_probs=False,
+            supports_multimodal=True,
+            supports_structured_outputs=True,
+            supports_json=False,
+            supports_temperature=False,
+            input_price=1.25 / 1e6,
+            output_price=10.00 / 1e6,
+        ),
+        "gpt-5.2": make_model_data(
+            supports_log_probs=False,
+            supports_multimodal=True,
+            supports_structured_outputs=True,
+            supports_json=False,
+            supports_temperature=False,
+            input_price=1.75 / 1e6,
+            output_price=14.00 / 1e6,
+        ),
     }
 )

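
With the new registry entries, here is a sketch of pointing the default OpenAI judge at one of the added models; the `model` keyword appears in the openai_model.py hunk later in this diff, and the per-token pricing comes from OPENAI_MODELS_DATA:

from deepeval.models import GPTModel

judge = GPTModel(model="gpt-5.1")  # priced at $1.25 / 1M input and $10.00 / 1M output tokens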
deepeval/models/llms/gemini_model.py
CHANGED
@@ -65,6 +65,7 @@ class GeminiModel(DeepEvalBaseLLM):
         project: Optional[str] = None,
         location: Optional[str] = None,
         service_account_key: Optional[Union[str, Dict[str, str]]] = None,
+        use_vertexai: Optional[bool] = None,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
     ):
@@ -93,7 +94,11 @@ class GeminiModel(DeepEvalBaseLLM):
             location if location is not None else settings.GOOGLE_CLOUD_LOCATION
         )
         self.location = str(location).strip() if location is not None else None
-        self.use_vertexai = settings.GOOGLE_GENAI_USE_VERTEXAI
+        self.use_vertexai = (
+            use_vertexai
+            if use_vertexai is not None
+            else settings.GOOGLE_GENAI_USE_VERTEXAI
+        )

         self.service_account_key: Optional[SecretStr] = None
         if service_account_key is None:
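
A small sketch of the new `use_vertexai` override, assuming a Vertex AI project is available; the project and location values are placeholders, and the keyword arguments come from the constructor shown above:

from deepeval.models import GeminiModel

model = GeminiModel(
    use_vertexai=True,           # takes precedence over GOOGLE_GENAI_USE_VERTEXAI
    project="my-gcp-project",    # placeholder
    location="us-central1",      # placeholder
)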
deepeval/models/llms/openai_model.py
CHANGED
@@ -24,14 +24,13 @@ from deepeval.models.retry_policy import (
     sdk_retries_for,
 )
 from deepeval.models.llms.constants import (
+    DEFAULT_GPT_MODEL,
     OPENAI_MODELS_DATA,
 )


 retry_openai = create_retry_decorator(PS.OPENAI)

-default_gpt_model = "gpt-4.1"
-

 def _request_timeout_seconds() -> float:
     timeout = float(get_settings().DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0)
@@ -70,7 +69,7 @@ class GPTModel(DeepEvalBaseLLM):

         model = model or settings.OPENAI_MODEL_NAME
         if model is None:
-            model = default_gpt_model
+            model = DEFAULT_GPT_MODEL

         cost_per_input_token = (
             cost_per_input_token
@@ -377,7 +376,9 @@ class GPTModel(DeepEvalBaseLLM):
     # Utilities #
     #############

-    def calculate_cost(
+    def calculate_cost(
+        self, input_tokens: int, output_tokens: int
+    ) -> Optional[float]:
         if self.model_data.input_price and self.model_data.output_price:
             input_cost = input_tokens * self.model_data.input_price
             output_cost = output_tokens * self.model_data.output_price
deepeval/models/llms/openrouter_model.py
ADDED
@@ -0,0 +1,398 @@
+import warnings
+import inspect
+
+from typing import Optional, Tuple, Union, Dict, Type
+from pydantic import BaseModel, SecretStr
+from openai.types.chat.chat_completion import ChatCompletion
+from openai import (
+    OpenAI,
+    AsyncOpenAI,
+)
+
+from deepeval.config.settings import get_settings
+from deepeval.constants import ProviderSlug as PS
+from deepeval.errors import DeepEvalError
+from deepeval.models import DeepEvalBaseLLM
+from deepeval.models.llms.constants import DEFAULT_OPENROUTER_MODEL
+from deepeval.models.llms.utils import trim_and_load_json
+from deepeval.models.utils import require_secret_api_key
+from deepeval.models.retry_policy import (
+    create_retry_decorator,
+    sdk_retries_for,
+)
+
+
+retry_openrouter = create_retry_decorator(PS.OPENROUTER)
+
+
+def _request_timeout_seconds() -> float:
+    timeout = float(get_settings().DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0)
+    return timeout if timeout > 0 else 30.0
+
+
+def _convert_schema_to_openrouter_format(
+    schema: Union[Type[BaseModel], BaseModel],
+) -> Dict:
+    """
+    Convert Pydantic BaseModel to OpenRouter's JSON Schema format.
+
+    OpenRouter expects:
+    {
+        "type": "json_schema",
+        "json_schema": {
+            "name": "schema_name",
+            "strict": true,
+            "schema": { ... JSON Schema ... }
+        }
+    }
+    """
+    json_schema = schema.model_json_schema()
+    schema_name = (
+        schema.__name__
+        if inspect.isclass(schema)
+        else schema.__class__.__name__
+    )
+
+    # OpenRouter requires additionalProperties: false when strict: true
+    # Ensure it's set at the root level of the schema
+    if "additionalProperties" not in json_schema:
+        json_schema["additionalProperties"] = False
+
+    return {
+        "type": "json_schema",
+        "json_schema": {
+            "name": schema_name,
+            "strict": True,
+            "schema": json_schema,
+        },
+    }
+
+
+class OpenRouterModel(DeepEvalBaseLLM):
+    def __init__(
+        self,
+        model: Optional[str] = None,
+        api_key: Optional[str] = None,
+        base_url: Optional[str] = None,
+        temperature: Optional[float] = None,
+        cost_per_input_token: Optional[float] = None,
+        cost_per_output_token: Optional[float] = None,
+        generation_kwargs: Optional[Dict] = None,
+        **kwargs,
+    ):
+        settings = get_settings()
+        model = model or settings.OPENROUTER_MODEL_NAME
+        if model is None:
+            model = DEFAULT_OPENROUTER_MODEL
+
+        if api_key is not None:
+            # keep it secret, keep it safe from serializings, logging and alike
+            self.api_key: Optional[SecretStr] = SecretStr(api_key)
+        else:
+            self.api_key = settings.OPENROUTER_API_KEY
+
+        if base_url is not None:
+            base_url = str(base_url).rstrip("/")
+        elif settings.OPENROUTER_BASE_URL is not None:
+            base_url = str(settings.OPENROUTER_BASE_URL).rstrip("/")
+        else:
+            base_url = "https://openrouter.ai/api/v1"
+
+        cost_per_input_token = (
+            cost_per_input_token
+            if cost_per_input_token is not None
+            else settings.OPENROUTER_COST_PER_INPUT_TOKEN
+        )
+        cost_per_output_token = (
+            cost_per_output_token
+            if cost_per_output_token is not None
+            else settings.OPENROUTER_COST_PER_OUTPUT_TOKEN
+        )
+
+        if temperature is not None:
+            temperature = float(temperature)
+        elif settings.TEMPERATURE is not None:
+            temperature = settings.TEMPERATURE
+        else:
+            temperature = 0.0
+
+        # validation
+        if temperature < 0:
+            raise DeepEvalError("Temperature must be >= 0.")
+
+        self.base_url = base_url
+        self.cost_per_input_token = cost_per_input_token
+        self.cost_per_output_token = cost_per_output_token
+        self.temperature = temperature
+
+        self.kwargs = dict(kwargs)
+        self.kwargs.pop("temperature", None)
+
+        self.generation_kwargs = dict(generation_kwargs or {})
+        self.generation_kwargs.pop("temperature", None)
+
+        super().__init__(model)
+
+    ###############################################
+    # Generate functions
+    ###############################################
+
+    async def _generate_with_client(
+        self,
+        client: AsyncOpenAI,
+        prompt: str,
+        schema: Optional[BaseModel] = None,
+    ) -> Tuple[Union[str, Dict], float]:
+        """
+        Core generation logic shared between generate() and a_generate().
+
+        Args:
+            client: AsyncOpenAI client
+            prompt: The prompt to send
+            schema: Optional Pydantic schema for structured outputs
+
+        Returns:
+            Tuple of (output, cost)
+        """
+        if schema:
+            # Try OpenRouter's native JSON Schema format
+            try:
+                openrouter_response_format = (
+                    _convert_schema_to_openrouter_format(schema)
+                )
+                completion = await client.chat.completions.create(
+                    model=self.name,
+                    messages=[{"role": "user", "content": prompt}],
+                    response_format=openrouter_response_format,
+                    temperature=self.temperature,
+                    **self.generation_kwargs,
+                )
+
+                # Parse the JSON response and validate against schema
+                json_output = trim_and_load_json(
+                    completion.choices[0].message.content
+                )
+                cost = self.calculate_cost(
+                    completion.usage.prompt_tokens,
+                    completion.usage.completion_tokens,
+                    response=completion,
+                )
+                return schema.model_validate(json_output), cost
+            except Exception as e:
+                # Warn if structured outputs fail
+                warnings.warn(
+                    f"Structured outputs not supported for model '{self.name}'. "
+                    f"Falling back to regular generation with JSON parsing. "
+                    f"Error: {str(e)}",
+                    UserWarning,
+                    stacklevel=3,
+                )
+                # Fall back to regular generation and parse JSON manually (like Bedrock)
+                # This works with any model that can generate JSON in text
+                pass
+
+        # Regular generation (or fallback if structured outputs failed)
+        completion = await client.chat.completions.create(
+            model=self.name,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=self.temperature,
+            **self.generation_kwargs,
+        )
+
+        output = completion.choices[0].message.content
+        cost = self.calculate_cost(
+            completion.usage.prompt_tokens,
+            completion.usage.completion_tokens,
+            response=completion,
+        )
+        if schema:
+            # Parse JSON from text and validate against schema (like Bedrock)
+            json_output = trim_and_load_json(output)
+            return schema.model_validate(json_output), cost
+        else:
+            return output, cost
+
+    @retry_openrouter
+    def generate(
+        self, prompt: str, schema: Optional[BaseModel] = None
+    ) -> Tuple[Union[str, Dict], float]:
+        from deepeval.models.llms.utils import safe_asyncio_run
+
+        client = self.load_model(async_mode=True)
+        return safe_asyncio_run(
+            self._generate_with_client(client, prompt, schema)
+        )
+
+    @retry_openrouter
+    async def a_generate(
+        self, prompt: str, schema: Optional[BaseModel] = None
+    ) -> Tuple[Union[str, BaseModel], float]:
+        client = self.load_model(async_mode=True)
+        return await self._generate_with_client(client, prompt, schema)
+
+    ###############################################
+    # Other generate functions
+    ###############################################
+
+    @retry_openrouter
+    def generate_raw_response(
+        self,
+        prompt: str,
+        top_logprobs: int = 5,
+    ) -> Tuple[ChatCompletion, float]:
+        # Generate completion
+        client = self.load_model(async_mode=False)
+        completion = client.chat.completions.create(
+            model=self.name,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=self.temperature,
+            logprobs=True,
+            top_logprobs=top_logprobs,
+            **self.generation_kwargs,
+        )
+        # Cost calculation
+        input_tokens = completion.usage.prompt_tokens
+        output_tokens = completion.usage.completion_tokens
+        cost = self.calculate_cost(
+            input_tokens, output_tokens, response=completion
+        )
+
+        return completion, cost
+
+    @retry_openrouter
+    async def a_generate_raw_response(
+        self,
+        prompt: str,
+        top_logprobs: int = 5,
+    ) -> Tuple[ChatCompletion, float]:
+        # Generate completion
+        client = self.load_model(async_mode=True)
+        completion = await client.chat.completions.create(
+            model=self.name,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=self.temperature,
+            logprobs=True,
+            top_logprobs=top_logprobs,
+            **self.generation_kwargs,
+        )
+        # Cost calculation
+        input_tokens = completion.usage.prompt_tokens
+        output_tokens = completion.usage.completion_tokens
+        cost = self.calculate_cost(
+            input_tokens, output_tokens, response=completion
+        )
+
+        return completion, cost
+
+    @retry_openrouter
+    def generate_samples(
+        self, prompt: str, n: int, temperature: float
+    ) -> Tuple[list[str], float]:
+        client = self.load_model(async_mode=False)
+        response = client.chat.completions.create(
+            model=self.name,
+            messages=[{"role": "user", "content": prompt}],
+            n=n,
+            temperature=temperature,
+            **self.generation_kwargs,
+        )
+        completions = [choice.message.content for choice in response.choices]
+        cost = self.calculate_cost(
+            response.usage.prompt_tokens,
+            response.usage.completion_tokens,
+            response=response,
+        )
+        return completions, cost
+
+    ###############################################
+    # Utilities
+    ###############################################
+
+    def calculate_cost(
+        self, input_tokens: int, output_tokens: int, response=None
+    ) -> Optional[float]:
+        """
+        Calculate cost with priority:
+        1. User-provided pricing (highest priority)
+        2. Try to extract from API response (if OpenRouter includes pricing)
+        3. Return None if cost cannot be determined
+        """
+        # Priority 1: User-provided pricing
+        if (
+            self.cost_per_input_token is not None
+            and self.cost_per_output_token is not None
+        ):
+            return (
+                input_tokens * self.cost_per_input_token
+                + output_tokens * self.cost_per_output_token
+            )
+
+        # Priority 2: Try to extract from API response (if OpenRouter includes pricing)
+        # Note: OpenRouter may include pricing in response metadata
+        if response is not None:
+            # Check if response has cost information
+            usage_cost = getattr(getattr(response, "usage", None), "cost", None)
+            if usage_cost is not None:
+                try:
+                    return float(usage_cost)
+                except (ValueError, TypeError):
+                    pass
+            # Some responses might have cost at the top level
+            response_cost = getattr(response, "cost", None)
+            if response_cost is not None:
+                try:
+                    return float(response_cost)
+                except (ValueError, TypeError):
+                    pass
+
+        # Priority 3: Return None since cost is unknown
+        return None
+
+    ###############################################
+    # Model
+    ###############################################
+
+    def get_model_name(self):
+        return f"{self.name} (OpenRouter)"
+
+    def load_model(self, async_mode: bool = False):
+        if not async_mode:
+            return self._build_client(OpenAI)
+        return self._build_client(AsyncOpenAI)
+
+    def _client_kwargs(self) -> Dict:
+        """
+        If Tenacity is managing retries, force OpenAI SDK retries off to avoid double retries.
+        If the user opts into SDK retries for 'openrouter' via DEEPEVAL_SDK_RETRY_PROVIDERS,
+        leave their retry settings as is.
+        """
+        kwargs = dict(self.kwargs or {})
+        if not sdk_retries_for(PS.OPENROUTER):
+            kwargs["max_retries"] = 0
+
+        if not kwargs.get("timeout"):
+            kwargs["timeout"] = _request_timeout_seconds()
+
+        return kwargs
+
+    def _build_client(self, cls):
+        api_key = require_secret_api_key(
+            self.api_key,
+            provider_label="OpenRouter",
+            env_var_name="OPENROUTER_API_KEY",
+            param_hint="`api_key` to OpenRouterModel(...)",
+        )
+
+        kw = dict(
+            api_key=api_key,
+            base_url=self.base_url,
+            **self._client_kwargs(),
+        )
+        try:
+            return cls(**kw)
+        except TypeError as e:
+            # older OpenAI SDKs may not accept max_retries, in that case remove and retry once
+            if "max_retries" in str(e):
+                kw.pop("max_retries", None)
+                return cls(**kw)
+            raise
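
A usage sketch for the new OpenRouterModel, assuming it is imported from its module path; the model string, API key, and schema below are illustrative only:

from pydantic import BaseModel
from deepeval.models.llms.openrouter_model import OpenRouterModel

class Verdict(BaseModel):
    verdict: str
    reason: str

model = OpenRouterModel(
    model="anthropic/claude-3-opus",  # OpenRouter provider/model format
    api_key="sk-or-...",              # or set OPENROUTER_API_KEY
)

# Plain generation returns a (text, cost) tuple; cost may be None if pricing is unknown
text, cost = model.generate("Summarize retrieval-augmented generation in one sentence.")

# With a schema, the model first tries OpenRouter's json_schema response_format,
# then falls back to parsing JSON out of plain text
verdict, cost = model.generate(
    "Is the sky blue? Reply as JSON with `verdict` and `reason`.",
    schema=Verdict,
)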
deepeval/models/retry_policy.py
CHANGED
@@ -772,6 +772,7 @@ AZURE_OPENAI_ERROR_POLICY = OPENAI_ERROR_POLICY
 DEEPSEEK_ERROR_POLICY = OPENAI_ERROR_POLICY
 KIMI_ERROR_POLICY = OPENAI_ERROR_POLICY
 LOCAL_ERROR_POLICY = OPENAI_ERROR_POLICY
+OPENROUTER_ERROR_POLICY = OPENAI_ERROR_POLICY

 ######################
 # AWS Bedrock Policy #
@@ -998,6 +999,7 @@ _POLICY_BY_SLUG: dict[str, Optional[ErrorPolicy]] = {
     PS.LITELLM.value: LITELLM_ERROR_POLICY,
     PS.LOCAL.value: LOCAL_ERROR_POLICY,
     PS.OLLAMA.value: OLLAMA_ERROR_POLICY,
+    PS.OPENROUTER.value: OPENROUTER_ERROR_POLICY,
 }


@@ -1019,6 +1021,7 @@ _STATIC_PRED_BY_SLUG: dict[str, Optional[Callable[[Exception], bool]]] = {
     PS.LITELLM.value: _opt_pred(LITELLM_ERROR_POLICY),
     PS.LOCAL.value: _opt_pred(LOCAL_ERROR_POLICY),
     PS.OLLAMA.value: _opt_pred(OLLAMA_ERROR_POLICY),
+    PS.OPENROUTER.value: _opt_pred(OPENROUTER_ERROR_POLICY),
 }

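
A hedged sketch of opting OpenRouter into SDK-managed retries via the DEEPEVAL_SDK_RETRY_PROVIDERS setting referenced in the new _client_kwargs docstring; the exact value format is an assumption:

import os

# Assumed format: comma-separated provider slugs
os.environ["DEEPEVAL_SDK_RETRY_PROVIDERS"] = "openrouter"

# With this set, OpenRouterModel._client_kwargs() leaves the OpenAI SDK's
# max_retries untouched; otherwise it forces max_retries=0 so the Tenacity
# policy registered above (OPENROUTER_ERROR_POLICY) owns retries.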
deepeval/prompt/api.py
CHANGED
deepeval/prompt/prompt.py
CHANGED
@@ -114,6 +114,7 @@ class Prompt:
         output_type: Optional[OutputType] = None,
         output_schema: Optional[Type[BaseModel]] = None,
         interpolation_type: Optional[PromptInterpolationType] = None,
+        confident_api_key: Optional[str] = None,
     ):
         if text_template and messages_template:
             raise TypeError(
@@ -129,6 +130,7 @@ class Prompt:
         self.interpolation_type: PromptInterpolationType = (
             interpolation_type or PromptInterpolationType.FSTRING
         )
+        self.confident_api_key = confident_api_key

         self._version = None
         self._prompt_version_id: Optional[str] = None
@@ -244,7 +246,7 @@ class Prompt:
             raise ValueError(
                 "Prompt alias is not set. Please set an alias to continue."
             )
-        api = Api()
+        api = Api(api_key=self.confident_api_key)
         data, _ = api.send_request(
             method=HttpMethods.GET,
             endpoint=Endpoints.PROMPTS_VERSIONS_ENDPOINT,
@@ -496,7 +498,7 @@ class Prompt:
         except Exception:
             pass

-        api = Api()
+        api = Api(api_key=self.confident_api_key)
         with Progress(
             SpinnerColumn(style="rgb(106,0,255)"),
             BarColumn(bar_width=60),
@@ -635,7 +637,7 @@ class Prompt:
             # Pydantic version below 2.0
             body = body.dict(by_alias=True, exclude_none=True)

-        api = Api()
+        api = Api(api_key=self.confident_api_key)
         _, link = api.send_request(
             method=HttpMethods.POST,
             endpoint=Endpoints.PROMPTS_ENDPOINT,
@@ -692,7 +694,7 @@ class Prompt:
             )
         except AttributeError:
             body = body.dict(by_alias=True, exclude_none=True)
-        api = Api()
+        api = Api(api_key=self.confident_api_key)
         data, _ = api.send_request(
             method=HttpMethods.PUT,
             endpoint=Endpoints.PROMPTS_VERSION_ID_ENDPOINT,
@@ -765,7 +767,7 @@ class Prompt:
         while True:
             await asyncio.sleep(self._refresh_map[CACHE_KEY][cache_value])

-            api = Api()
+            api = Api(api_key=self.confident_api_key)
             try:
                 if label:
                     data, _ = api.send_request(
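
A sketch of pinning a Confident AI key to a single Prompt instead of relying on the global login; the alias and key values are placeholders, and `pull()` is the existing fetch method assumed here:

from deepeval.prompt import Prompt

prompt = Prompt(alias="my-prompt", confident_api_key="confident_us_...")  # placeholders
prompt.pull()  # every internal Api() call now receives api_key=self.confident_api_key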
deepeval/test_case/llm_test_case.py
CHANGED
@@ -386,6 +386,7 @@ class LLMTestCase(BaseModel):
            [
                re.search(pattern, self.input or "") is not None,
                re.search(pattern, self.actual_output or "") is not None,
+               re.search(pattern, self.expected_output or "") is not None,
            ]
        )
        if isinstance(self.input, str)
deepeval/tracing/tracing.py
CHANGED
@@ -847,7 +847,12 @@ class Observer:
             self.trace_uuid = parent_span.trace_uuid
         else:
             current_trace = current_trace_context.get()
-            if current_trace:
+            # IMPORTANT: Verify trace is still active, not just in context
+            # (a previous failed async operation might leave a dead trace in context)
+            if (
+                current_trace
+                and current_trace.uuid in trace_manager.active_traces
+            ):
                 self.trace_uuid = current_trace.uuid
             else:
                 trace = trace_manager.start_new_trace(