deepeval 3.7.8-py3-none-any.whl → 3.8.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/benchmarks/drop/drop.py +5 -2
- deepeval/benchmarks/mmlu/mmlu.py +6 -4
- deepeval/cli/main.py +168 -0
- deepeval/cli/utils.py +2 -2
- deepeval/confident/api.py +2 -0
- deepeval/config/settings.py +10 -0
- deepeval/constants.py +1 -0
- deepeval/integrations/langchain/callback.py +330 -158
- deepeval/integrations/langchain/utils.py +31 -8
- deepeval/key_handler.py +8 -1
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +35 -0
- deepeval/metrics/g_eval/g_eval.py +35 -1
- deepeval/metrics/g_eval/utils.py +65 -0
- deepeval/models/__init__.py +2 -0
- deepeval/models/llms/__init__.py +2 -0
- deepeval/models/llms/constants.py +23 -0
- deepeval/models/llms/gemini_model.py +27 -29
- deepeval/models/llms/openai_model.py +5 -4
- deepeval/models/llms/openrouter_model.py +398 -0
- deepeval/models/retry_policy.py +3 -0
- deepeval/prompt/api.py +1 -0
- deepeval/synthesizer/synthesizer.py +190 -82
- deepeval/tracing/tracing.py +6 -1
- deepeval/tracing/types.py +1 -1
- deepeval/utils.py +21 -6
- {deepeval-3.7.8.dist-info → deepeval-3.8.0.dist-info}/METADATA +7 -7
- {deepeval-3.7.8.dist-info → deepeval-3.8.0.dist-info}/RECORD +31 -30
- {deepeval-3.7.8.dist-info → deepeval-3.8.0.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.8.dist-info → deepeval-3.8.0.dist-info}/WHEEL +0 -0
- {deepeval-3.7.8.dist-info → deepeval-3.8.0.dist-info}/entry_points.txt +0 -0
deepeval/models/llms/openrouter_model.py
ADDED
@@ -0,0 +1,398 @@
+import warnings
+import inspect
+
+from typing import Optional, Tuple, Union, Dict, Type
+from pydantic import BaseModel, SecretStr
+from openai.types.chat.chat_completion import ChatCompletion
+from openai import (
+    OpenAI,
+    AsyncOpenAI,
+)
+
+from deepeval.config.settings import get_settings
+from deepeval.constants import ProviderSlug as PS
+from deepeval.errors import DeepEvalError
+from deepeval.models import DeepEvalBaseLLM
+from deepeval.models.llms.constants import DEFAULT_OPENROUTER_MODEL
+from deepeval.models.llms.utils import trim_and_load_json
+from deepeval.models.utils import require_secret_api_key
+from deepeval.models.retry_policy import (
+    create_retry_decorator,
+    sdk_retries_for,
+)
+
+
+retry_openrouter = create_retry_decorator(PS.OPENROUTER)
+
+
+def _request_timeout_seconds() -> float:
+    timeout = float(get_settings().DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0)
+    return timeout if timeout > 0 else 30.0
+
+
+def _convert_schema_to_openrouter_format(
+    schema: Union[Type[BaseModel], BaseModel],
+) -> Dict:
+    """
+    Convert Pydantic BaseModel to OpenRouter's JSON Schema format.
+
+    OpenRouter expects:
+    {
+        "type": "json_schema",
+        "json_schema": {
+            "name": "schema_name",
+            "strict": true,
+            "schema": { ... JSON Schema ... }
+        }
+    }
+    """
+    json_schema = schema.model_json_schema()
+    schema_name = (
+        schema.__name__
+        if inspect.isclass(schema)
+        else schema.__class__.__name__
+    )
+
+    # OpenRouter requires additionalProperties: false when strict: true
+    # Ensure it's set at the root level of the schema
+    if "additionalProperties" not in json_schema:
+        json_schema["additionalProperties"] = False
+
+    return {
+        "type": "json_schema",
+        "json_schema": {
+            "name": schema_name,
+            "strict": True,
+            "schema": json_schema,
+        },
+    }
+
+
+class OpenRouterModel(DeepEvalBaseLLM):
+    def __init__(
+        self,
+        model: Optional[str] = None,
+        api_key: Optional[str] = None,
+        base_url: Optional[str] = None,
+        temperature: Optional[float] = None,
+        cost_per_input_token: Optional[float] = None,
+        cost_per_output_token: Optional[float] = None,
+        generation_kwargs: Optional[Dict] = None,
+        **kwargs,
+    ):
+        settings = get_settings()
+        model = model or settings.OPENROUTER_MODEL_NAME
+        if model is None:
+            model = DEFAULT_OPENROUTER_MODEL
+
+        if api_key is not None:
+            # keep it secret, keep it safe from serializings, logging and alike
+            self.api_key: Optional[SecretStr] = SecretStr(api_key)
+        else:
+            self.api_key = settings.OPENROUTER_API_KEY
+
+        if base_url is not None:
+            base_url = str(base_url).rstrip("/")
+        elif settings.OPENROUTER_BASE_URL is not None:
+            base_url = str(settings.OPENROUTER_BASE_URL).rstrip("/")
+        else:
+            base_url = "https://openrouter.ai/api/v1"
+
+        cost_per_input_token = (
+            cost_per_input_token
+            if cost_per_input_token is not None
+            else settings.OPENROUTER_COST_PER_INPUT_TOKEN
+        )
+        cost_per_output_token = (
+            cost_per_output_token
+            if cost_per_output_token is not None
+            else settings.OPENROUTER_COST_PER_OUTPUT_TOKEN
+        )
+
+        if temperature is not None:
+            temperature = float(temperature)
+        elif settings.TEMPERATURE is not None:
+            temperature = settings.TEMPERATURE
+        else:
+            temperature = 0.0
+
+        # validation
+        if temperature < 0:
+            raise DeepEvalError("Temperature must be >= 0.")
+
+        self.base_url = base_url
+        self.cost_per_input_token = cost_per_input_token
+        self.cost_per_output_token = cost_per_output_token
+        self.temperature = temperature
+
+        self.kwargs = dict(kwargs)
+        self.kwargs.pop("temperature", None)
+
+        self.generation_kwargs = dict(generation_kwargs or {})
+        self.generation_kwargs.pop("temperature", None)
+
+        super().__init__(model)
+
+    ###############################################
+    # Generate functions
+    ###############################################
+
+    async def _generate_with_client(
+        self,
+        client: AsyncOpenAI,
+        prompt: str,
+        schema: Optional[BaseModel] = None,
+    ) -> Tuple[Union[str, Dict], float]:
+        """
+        Core generation logic shared between generate() and a_generate().
+
+        Args:
+            client: AsyncOpenAI client
+            prompt: The prompt to send
+            schema: Optional Pydantic schema for structured outputs
+
+        Returns:
+            Tuple of (output, cost)
+        """
+        if schema:
+            # Try OpenRouter's native JSON Schema format
+            try:
+                openrouter_response_format = (
+                    _convert_schema_to_openrouter_format(schema)
+                )
+                completion = await client.chat.completions.create(
+                    model=self.name,
+                    messages=[{"role": "user", "content": prompt}],
+                    response_format=openrouter_response_format,
+                    temperature=self.temperature,
+                    **self.generation_kwargs,
+                )
+
+                # Parse the JSON response and validate against schema
+                json_output = trim_and_load_json(
+                    completion.choices[0].message.content
+                )
+                cost = self.calculate_cost(
+                    completion.usage.prompt_tokens,
+                    completion.usage.completion_tokens,
+                    response=completion,
+                )
+                return schema.model_validate(json_output), cost
+            except Exception as e:
+                # Warn if structured outputs fail
+                warnings.warn(
+                    f"Structured outputs not supported for model '{self.name}'. "
+                    f"Falling back to regular generation with JSON parsing. "
+                    f"Error: {str(e)}",
+                    UserWarning,
+                    stacklevel=3,
+                )
+                # Fall back to regular generation and parse JSON manually (like Bedrock)
+                # This works with any model that can generate JSON in text
+                pass
+
+        # Regular generation (or fallback if structured outputs failed)
+        completion = await client.chat.completions.create(
+            model=self.name,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=self.temperature,
+            **self.generation_kwargs,
+        )
+
+        output = completion.choices[0].message.content
+        cost = self.calculate_cost(
+            completion.usage.prompt_tokens,
+            completion.usage.completion_tokens,
+            response=completion,
+        )
+        if schema:
+            # Parse JSON from text and validate against schema (like Bedrock)
+            json_output = trim_and_load_json(output)
+            return schema.model_validate(json_output), cost
+        else:
+            return output, cost
+
+    @retry_openrouter
+    def generate(
+        self, prompt: str, schema: Optional[BaseModel] = None
+    ) -> Tuple[Union[str, Dict], float]:
+        from deepeval.models.llms.utils import safe_asyncio_run
+
+        client = self.load_model(async_mode=True)
+        return safe_asyncio_run(
+            self._generate_with_client(client, prompt, schema)
+        )
+
+    @retry_openrouter
+    async def a_generate(
+        self, prompt: str, schema: Optional[BaseModel] = None
+    ) -> Tuple[Union[str, BaseModel], float]:
+        client = self.load_model(async_mode=True)
+        return await self._generate_with_client(client, prompt, schema)
+
+    ###############################################
+    # Other generate functions
+    ###############################################
+
+    @retry_openrouter
+    def generate_raw_response(
+        self,
+        prompt: str,
+        top_logprobs: int = 5,
+    ) -> Tuple[ChatCompletion, float]:
+        # Generate completion
+        client = self.load_model(async_mode=False)
+        completion = client.chat.completions.create(
+            model=self.name,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=self.temperature,
+            logprobs=True,
+            top_logprobs=top_logprobs,
+            **self.generation_kwargs,
+        )
+        # Cost calculation
+        input_tokens = completion.usage.prompt_tokens
+        output_tokens = completion.usage.completion_tokens
+        cost = self.calculate_cost(
+            input_tokens, output_tokens, response=completion
+        )
+
+        return completion, cost
+
+    @retry_openrouter
+    async def a_generate_raw_response(
+        self,
+        prompt: str,
+        top_logprobs: int = 5,
+    ) -> Tuple[ChatCompletion, float]:
+        # Generate completion
+        client = self.load_model(async_mode=True)
+        completion = await client.chat.completions.create(
+            model=self.name,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=self.temperature,
+            logprobs=True,
+            top_logprobs=top_logprobs,
+            **self.generation_kwargs,
+        )
+        # Cost calculation
+        input_tokens = completion.usage.prompt_tokens
+        output_tokens = completion.usage.completion_tokens
+        cost = self.calculate_cost(
+            input_tokens, output_tokens, response=completion
+        )
+
+        return completion, cost
+
+    @retry_openrouter
+    def generate_samples(
+        self, prompt: str, n: int, temperature: float
+    ) -> Tuple[list[str], float]:
+        client = self.load_model(async_mode=False)
+        response = client.chat.completions.create(
+            model=self.name,
+            messages=[{"role": "user", "content": prompt}],
+            n=n,
+            temperature=temperature,
+            **self.generation_kwargs,
+        )
+        completions = [choice.message.content for choice in response.choices]
+        cost = self.calculate_cost(
+            response.usage.prompt_tokens,
+            response.usage.completion_tokens,
+            response=response,
+        )
+        return completions, cost
+
+    ###############################################
+    # Utilities
+    ###############################################
+
+    def calculate_cost(
+        self, input_tokens: int, output_tokens: int, response=None
+    ) -> Optional[float]:
+        """
+        Calculate cost with priority:
+        1. User-provided pricing (highest priority)
+        2. Try to extract from API response (if OpenRouter includes pricing)
+        3. Return None if cost cannot be determined
+        """
+        # Priority 1: User-provided pricing
+        if (
+            self.cost_per_input_token is not None
+            and self.cost_per_output_token is not None
+        ):
+            return (
+                input_tokens * self.cost_per_input_token
+                + output_tokens * self.cost_per_output_token
+            )
+
+        # Priority 2: Try to extract from API response (if OpenRouter includes pricing)
+        # Note: OpenRouter may include pricing in response metadata
+        if response is not None:
+            # Check if response has cost information
+            usage_cost = getattr(getattr(response, "usage", None), "cost", None)
+            if usage_cost is not None:
+                try:
+                    return float(usage_cost)
+                except (ValueError, TypeError):
+                    pass
+            # Some responses might have cost at the top level
+            response_cost = getattr(response, "cost", None)
+            if response_cost is not None:
+                try:
+                    return float(response_cost)
+                except (ValueError, TypeError):
+                    pass
+
+        # Priority 3: Return None since cost is unknown
+        return None
+
+    ###############################################
+    # Model
+    ###############################################
+
+    def get_model_name(self):
+        return f"{self.name} (OpenRouter)"
+
+    def load_model(self, async_mode: bool = False):
+        if not async_mode:
+            return self._build_client(OpenAI)
+        return self._build_client(AsyncOpenAI)
+
+    def _client_kwargs(self) -> Dict:
+        """
+        If Tenacity is managing retries, force OpenAI SDK retries off to avoid double retries.
+        If the user opts into SDK retries for 'openrouter' via DEEPEVAL_SDK_RETRY_PROVIDERS,
+        leave their retry settings as is.
+        """
+        kwargs = dict(self.kwargs or {})
+        if not sdk_retries_for(PS.OPENROUTER):
+            kwargs["max_retries"] = 0
+
+        if not kwargs.get("timeout"):
+            kwargs["timeout"] = _request_timeout_seconds()
+
+        return kwargs
+
+    def _build_client(self, cls):
+        api_key = require_secret_api_key(
+            self.api_key,
+            provider_label="OpenRouter",
+            env_var_name="OPENROUTER_API_KEY",
+            param_hint="`api_key` to OpenRouterModel(...)",
+        )
+
+        kw = dict(
+            api_key=api_key,
+            base_url=self.base_url,
+            **self._client_kwargs(),
+        )
+        try:
+            return cls(**kw)
+        except TypeError as e:
+            # older OpenAI SDKs may not accept max_retries, in that case remove and retry once
+            if "max_retries" in str(e):
+                kw.pop("max_retries", None)
+                return cls(**kw)
+            raise
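For orientation, a minimal usage sketch of the new OpenRouter wrapper (not part of the diff). The constructor arguments and the generate()/a_generate() signatures are taken from the added file above; the model slug, prompt, pricing values, and the Verdict schema are made up, and OPENROUTER_API_KEY is assumed to be set in the environment.

from pydantic import BaseModel

from deepeval.models.llms.openrouter_model import OpenRouterModel


class Verdict(BaseModel):  # hypothetical schema for structured output
    verdict: str
    reason: str


model = OpenRouterModel(
    model="openai/gpt-4o-mini",      # hypothetical OpenRouter model slug
    temperature=0,
    cost_per_input_token=1.5e-07,    # optional; enables calculate_cost() priority 1
    cost_per_output_token=6.0e-07,
)

# Plain generation returns a (text, cost) tuple; cost may be None if unknown.
text, cost = model.generate("Summarize what an evaluation metric is.")

# With a schema, OpenRouter's json_schema response_format is tried first and
# the call falls back to plain generation plus JSON parsing if that fails.
verdict, cost = model.generate("Is water wet? Respond as JSON.", schema=Verdict)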
deepeval/models/retry_policy.py
CHANGED
@@ -772,6 -772,7 @@ AZURE_OPENAI_ERROR_POLICY = OPENAI_ERROR_POLICY
 DEEPSEEK_ERROR_POLICY = OPENAI_ERROR_POLICY
 KIMI_ERROR_POLICY = OPENAI_ERROR_POLICY
 LOCAL_ERROR_POLICY = OPENAI_ERROR_POLICY
+OPENROUTER_ERROR_POLICY = OPENAI_ERROR_POLICY
 
 ######################
 # AWS Bedrock Policy #
@@ -998,6 +999,7 @@ _POLICY_BY_SLUG: dict[str, Optional[ErrorPolicy]] = {
     PS.LITELLM.value: LITELLM_ERROR_POLICY,
     PS.LOCAL.value: LOCAL_ERROR_POLICY,
     PS.OLLAMA.value: OLLAMA_ERROR_POLICY,
+    PS.OPENROUTER.value: OPENROUTER_ERROR_POLICY,
 }
 
 
@@ -1019,6 +1021,7 @@ _STATIC_PRED_BY_SLUG: dict[str, Optional[Callable[[Exception], bool]]] = {
     PS.LITELLM.value: _opt_pred(LITELLM_ERROR_POLICY),
     PS.LOCAL.value: _opt_pred(LOCAL_ERROR_POLICY),
    PS.OLLAMA.value: _opt_pred(OLLAMA_ERROR_POLICY),
+    PS.OPENROUTER.value: _opt_pred(OPENROUTER_ERROR_POLICY),
 }
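These slug registrations are what let create_retry_decorator(PS.OPENROUTER) and sdk_retries_for(PS.OPENROUTER) in the added model file resolve OpenRouter to the existing OpenAI error policy. A hedged configuration sketch follows; the environment-variable names mirror the settings fields and provider slug referenced in the diff and are assumptions, not documented API.

import os

# Assumed to map 1:1 onto the get_settings() fields read by OpenRouterModel.
os.environ["OPENROUTER_API_KEY"] = "sk-or-..."              # required by _build_client()
os.environ["OPENROUTER_MODEL_NAME"] = "openai/gpt-4o-mini"  # hypothetical default model slug
os.environ["DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS"] = "60"   # per-request timeout; falls back to 30s

# Opt OpenRouter into OpenAI-SDK-managed retries. When this is unset,
# _client_kwargs() forces max_retries=0 so Tenacity (retry_openrouter,
# backed by OPENROUTER_ERROR_POLICY) owns the retry behavior.
os.environ["DEEPEVAL_SDK_RETRY_PROVIDERS"] = "openrouter"   # slug value is an assumption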