deepeval-3.7.6-py3-none-any.whl → deepeval-3.7.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/main.py +2022 -759
- deepeval/cli/utils.py +208 -36
- deepeval/config/dotenv_handler.py +19 -0
- deepeval/config/settings.py +658 -262
- deepeval/config/utils.py +9 -1
- deepeval/evaluate/execute.py +153 -94
- deepeval/key_handler.py +121 -51
- deepeval/metrics/base_metric.py +9 -3
- deepeval/metrics/g_eval/g_eval.py +6 -1
- deepeval/metrics/indicator.py +8 -4
- deepeval/metrics/mcp/mcp_task_completion.py +15 -16
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +15 -15
- deepeval/metrics/mcp/schema.py +4 -0
- deepeval/metrics/mcp/template.py +8 -1
- deepeval/metrics/prompt_alignment/prompt_alignment.py +6 -3
- deepeval/metrics/tool_use/schema.py +4 -0
- deepeval/metrics/tool_use/template.py +16 -2
- deepeval/metrics/tool_use/tool_use.py +30 -28
- deepeval/metrics/topic_adherence/schema.py +4 -0
- deepeval/metrics/topic_adherence/template.py +8 -1
- deepeval/metrics/topic_adherence/topic_adherence.py +15 -14
- deepeval/metrics/turn_contextual_precision/template.py +8 -1
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +44 -86
- deepeval/metrics/turn_contextual_recall/template.py +8 -1
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +44 -82
- deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +48 -92
- deepeval/metrics/turn_faithfulness/template.py +8 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +76 -130
- deepeval/metrics/utils.py +16 -1
- deepeval/models/__init__.py +2 -0
- deepeval/models/llms/__init__.py +2 -0
- deepeval/models/llms/amazon_bedrock_model.py +5 -4
- deepeval/models/llms/anthropic_model.py +4 -3
- deepeval/models/llms/azure_model.py +4 -3
- deepeval/models/llms/deepseek_model.py +5 -8
- deepeval/models/llms/grok_model.py +5 -8
- deepeval/models/llms/kimi_model.py +5 -8
- deepeval/models/llms/litellm_model.py +2 -0
- deepeval/models/llms/local_model.py +1 -1
- deepeval/models/llms/openai_model.py +4 -3
- deepeval/models/retry_policy.py +10 -5
- deepeval/models/utils.py +1 -5
- deepeval/simulator/conversation_simulator.py +6 -2
- deepeval/simulator/template.py +3 -1
- deepeval/synthesizer/synthesizer.py +19 -17
- deepeval/test_run/test_run.py +6 -1
- deepeval/utils.py +26 -0
- {deepeval-3.7.6.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
- {deepeval-3.7.6.dist-info → deepeval-3.7.7.dist-info}/RECORD +54 -53
- {deepeval-3.7.6.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.6.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
- {deepeval-3.7.6.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
deepeval/metrics/turn_faithfulness/template.py
CHANGED

```diff
@@ -187,6 +187,13 @@ class TurnFaithfulnessTemplate:
             Context:
             This metric evaluates conversational faithfulness by extracting truths from retrieval context, extracting claims from the assistant's output, and generating verdicts that compare each claim against the truths. Each interaction yields a reason indicating why a verdict failed or succeeded. You are given all those reasons.

+            **
+            IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+            Example JSON:
+            {{
+                "reason": "The score is <turn_faithfulness_score> because <your_reason>."
+            }}
+
             Inputs:
             - final_score: the averaged score across all interactions.
             - success: whether the metric passed or failed
@@ -213,6 +220,6 @@ class TurnFaithfulnessTemplate:

             Now give me a final reason that explains why the metric passed or failed. Output ONLY the reason and nothing else.

-
+            JSON:
             """
         )
```
deepeval/metrics/turn_faithfulness/turn_faithfulness.py
CHANGED

```diff
@@ -14,6 +14,8 @@ from deepeval.metrics.utils import (
     get_unit_interactions,
     get_turns_in_sliding_window,
     initialize_model,
+    generate_with_schema_and_extract,
+    a_generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.turn_faithfulness.template import (
@@ -273,18 +275,14 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             extraction_limit=self.truths_extraction_limit,
             multimodal=multimodal,
         )
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["truths"]
+
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Truths,
+            extract_schema=lambda s: s.truths,
+            extract_json=lambda data: data["truths"],
+        )

     def _generate_truths(
         self, retrieval_context: str, multimodal: bool
@@ -294,18 +292,14 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             extraction_limit=self.truths_extraction_limit,
             multimodal=multimodal,
         )
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["truths"]
+
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Truths,
+            extract_schema=lambda s: s.truths,
+            extract_json=lambda data: data["truths"],
+        )

     async def _a_generate_claims(
         self, user_content: str, assistant_content: str, multimodal: bool
@@ -315,18 +309,14 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             assistant_output=assistant_content,
             multimodal=multimodal,
         )
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["claims"]
+
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Claims,
+            extract_schema=lambda s: s.claims,
+            extract_json=lambda data: data["claims"],
+        )

     def _generate_claims(
         self, user_content: str, assistant_content: str, multimodal: bool
@@ -336,18 +326,14 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             assistant_output=assistant_content,
             multimodal=multimodal,
         )
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["claims"]
+
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Claims,
+            extract_schema=lambda s: s.claims,
+            extract_json=lambda data: data["claims"],
+        )

     async def _a_generate_verdicts(
         self, claims: Claims, truths: Truths, multimodal: bool
@@ -363,25 +349,13 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )

-            res: Verdicts = await self.model.a_generate(
-                prompt, schema=Verdicts
-            )
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            verdicts = [
-                FaithfulnessVerdict(**item) for item in data["verdicts"]
-            ]
-            return verdicts
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: s.verdicts,
+            extract_json=lambda data: data["verdicts"],
+        )

     def _generate_verdicts(
         self, claims: Claims, truths: Truths, multimodal: bool
@@ -397,23 +371,13 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )

-            res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            verdicts = [
-                FaithfulnessVerdict(**item) for item in data["verdicts"]
-            ]
-            return verdicts
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: s.verdicts,
+            extract_json=lambda data: data["verdicts"],
+        )

     def _get_interaction_score_and_reason(
         self, verdicts, multimodal: bool
@@ -486,22 +450,13 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )

-        try:
-            res: FaithfulnessScoreReason = await self.model.a_generate(
-                prompt, schema=FaithfulnessScoreReason
-            )
-            return res.reason
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=FaithfulnessScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     def _get_interaction_reason(self, score, verdicts, multimodal: bool) -> str:
         if self.include_reason is False:
@@ -518,22 +473,13 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )

-        try:
-            res: FaithfulnessScoreReason = self.model.generate(
-                prompt, schema=FaithfulnessScoreReason
-            )
-            return res.reason
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=FaithfulnessScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     def _get_verbose_steps(
         self, interaction_scores: List[InteractionFaithfulnessScore]
@@ -568,13 +514,13 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )

+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=FaithfulnessScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     async def _a_generate_reason(
         self, scores: List[InteractionFaithfulnessScore]
@@ -593,13 +539,13 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )

+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=FaithfulnessScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     def _calculate_score(
         self, scores: List[InteractionFaithfulnessScore]
```
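The repeated schema-then-JSON fallback blocks removed above are consolidated into two shared helpers, `generate_with_schema_and_extract` and `a_generate_with_schema_and_extract`, defined in `deepeval/metrics/utils.py` (the next file's hunks reference them). Below is a minimal sketch of what the synchronous helper presumably does, inferred from the call sites and the removed code; the real helper's exact signature, native-model cost accounting, and error handling may differ.

```python
# Hedged sketch only: reproduces just the fallback pattern visible in the
# removed code above, ignoring native-model cost tracking.
from deepeval.metrics.utils import trimAndLoadJson  # existing deepeval JSON helper


def generate_with_schema_and_extract(
    metric, prompt, schema_cls, extract_schema, extract_json
):
    try:
        # Ask the evaluation model for a schema-constrained response.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Custom models that do not accept `schema` fall back to raw JSON parsing.
        res = metric.model.generate(prompt)
        data = trimAndLoadJson(res, metric)
        return extract_json(data)
```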
deepeval/metrics/utils.py
CHANGED
```diff
@@ -32,6 +32,7 @@ from deepeval.models import (
     GeminiModel,
     AmazonBedrockModel,
     LiteLLMModel,
+    PortkeyModel,
     KimiModel,
     GrokModel,
     DeepSeekModel,
@@ -458,6 +459,11 @@ async def a_generate_with_schema_and_extract(
 ###############################################


+def should_use_anthropic_model():
+    value = KEY_FILE_HANDLER.fetch_data(ModelKeyValues.USE_ANTHROPIC_MODEL)
+    return value.lower() == "yes" if value is not None else False
+
+
 def should_use_azure_openai():
     value = KEY_FILE_HANDLER.fetch_data(ModelKeyValues.USE_AZURE_OPENAI)
     return value.lower() == "yes" if value is not None else False
@@ -488,6 +494,11 @@ def should_use_litellm():
     return value.lower() == "yes" if value is not None else False


+def should_use_portkey():
+    value = KEY_FILE_HANDLER.fetch_data(ModelKeyValues.USE_PORTKEY_MODEL)
+    return value.lower() == "yes" if value is not None else False
+
+
 def should_use_deepseek_model():
     value = KEY_FILE_HANDLER.fetch_data(ModelKeyValues.USE_DEEPSEEK_MODEL)
     return value.lower() == "yes" if value is not None else False
@@ -526,6 +537,8 @@ def initialize_model(
         return GeminiModel(), True
     if should_use_litellm():
         return LiteLLMModel(), True
+    if should_use_portkey():
+        return PortkeyModel(), True
     if should_use_ollama_model():
         return OllamaModel(), True
     elif should_use_local_model():
@@ -535,9 +548,11 @@ def initialize_model(
     elif should_use_moonshot_model():
         return KimiModel(model=model), True
     elif should_use_grok_model():
-        return GrokModel(
+        return GrokModel(), True
     elif should_use_deepseek_model():
         return DeepSeekModel(model=model), True
+    elif should_use_anthropic_model():
+        return AnthropicModel(), True
     elif isinstance(model, str) or model is None:
         return GPTModel(model=model), True

```
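The new `should_use_portkey` and `should_use_anthropic_model` helpers follow the same opt-in pattern as the existing `should_use_*` checks: the key-file value enables a provider only when it is literally "yes". A small standalone illustration of that check (not deepeval code):

```python
# Standalone illustration of the flag check used by should_use_portkey()
# and should_use_anthropic_model(): only the string "yes" (any casing)
# enables the provider; a missing key or any other value leaves it disabled.
from typing import Optional


def is_enabled(value: Optional[str]) -> bool:
    return value.lower() == "yes" if value is not None else False


assert is_enabled("yes") is True
assert is_enabled("YES") is True   # .lower() makes the check case-insensitive
assert is_enabled("no") is False
assert is_enabled(None) is False
```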
deepeval/models/__init__.py
CHANGED
```diff
@@ -15,6 +15,7 @@ from deepeval.models.llms import (
     KimiModel,
     GrokModel,
     DeepSeekModel,
+    PortkeyModel,
 )
 from deepeval.models.embedding_models import (
     OpenAIEmbeddingModel,
@@ -42,4 +43,5 @@ __all__ = [
     "AzureOpenAIEmbeddingModel",
     "LocalEmbeddingModel",
     "OllamaEmbeddingModel",
+    "PortkeyModel",
 ]
```
deepeval/models/llms/__init__.py
CHANGED
```diff
@@ -9,6 +9,7 @@ from .litellm_model import LiteLLMModel
 from .kimi_model import KimiModel
 from .grok_model import GrokModel
 from .deepseek_model import DeepSeekModel
+from .portkey_model import PortkeyModel

 __all__ = [
     "AzureOpenAIModel",
@@ -22,4 +23,5 @@ __all__ = [
     "KimiModel",
     "GrokModel",
     "DeepSeekModel",
+    "PortkeyModel",
 ]
```
deepeval/models/llms/amazon_bedrock_model.py
CHANGED

```diff
@@ -29,6 +29,7 @@ retry_bedrock = create_retry_decorator(PS.BEDROCK)

 _ALIAS_MAP = {
     "model": ["model_id"],
+    "region": ["region_name"],
     "cost_per_input_token": ["input_token_cost"],
     "cost_per_output_token": ["output_token_cost"],
 }
@@ -303,10 +304,10 @@ class AmazonBedrockModel(DeepEvalBaseLLM):
     }

     def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
+        if self.model_data.input_price and self.model_data.output_price:
+            input_cost = input_tokens * self.model_data.input_price
+            output_cost = output_tokens * self.model_data.output_price
+            return input_cost + output_cost

     def load_model(self):
         pass
```
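The same `calculate_cost` guard is applied to the Anthropic, Azure, DeepSeek, Grok, Kimi, and OpenAI wrappers below. A standalone sketch of the behavioural change (the `ModelData` shape here is an assumption standing in for the `model_data` object in the diff): when pricing metadata is missing, the method now falls through and reports no cost instead of multiplying token counts by `None`.

```python
# Standalone sketch, not deepeval code.
from dataclasses import dataclass
from typing import Optional


@dataclass
class ModelData:
    input_price: Optional[float] = None
    output_price: Optional[float] = None


def calculate_cost(
    model_data: ModelData, input_tokens: int, output_tokens: int
) -> Optional[float]:
    if model_data.input_price and model_data.output_price:
        input_cost = input_tokens * model_data.input_price
        output_cost = output_tokens * model_data.output_price
        return input_cost + output_cost
    return None  # pricing unknown: no cost reported, no TypeError raised


print(calculate_cost(ModelData(), 1000, 500))            # None
print(calculate_cost(ModelData(1e-6, 2e-6), 1000, 500))  # 0.002
```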
deepeval/models/llms/anthropic_model.py
CHANGED

```diff
@@ -227,9 +227,10 @@ class AnthropicModel(DeepEvalBaseLLM):
     ###############################################

     def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
+        if self.model_data.input_price and self.model_data.output_price:
+            input_cost = input_tokens * self.model_data.input_price
+            output_cost = output_tokens * self.model_data.output_price
+            return input_cost + output_cost

     #########################
     # Capabilities #
```
deepeval/models/llms/azure_model.py
CHANGED

```diff
@@ -386,9 +386,10 @@ class AzureOpenAIModel(DeepEvalBaseLLM):
     ###############################################

     def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
+        if self.model_data.input_price and self.model_data.output_price:
+            input_cost = input_tokens * self.model_data.input_price
+            output_cost = output_tokens * self.model_data.output_price
+            return input_cost + output_cost

     ###############################################
     # Capabilities
```
deepeval/models/llms/deepseek_model.py
CHANGED

```diff
@@ -176,14 +176,11 @@ class DeepSeekModel(DeepEvalBaseLLM):
     # Utilities
     ###############################################

-    def calculate_cost(
-        input_cost = input_tokens * self.model_data.input_price
-        output_cost = output_tokens * self.model_data.output_price
-        return input_cost + output_cost
+    def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
+        if self.model_data.input_price and self.model_data.output_price:
+            input_cost = input_tokens * self.model_data.input_price
+            output_cost = output_tokens * self.model_data.output_price
+            return input_cost + output_cost

     ###############################################
     # Capabilities
```
deepeval/models/llms/grok_model.py
CHANGED

```diff
@@ -224,14 +224,11 @@ class GrokModel(DeepEvalBaseLLM):
     # Utilities
     ###############################################

-    def calculate_cost(
-        input_cost = input_tokens * self.model_data.input_price
-        output_cost = output_tokens * self.model_data.output_price
-        return input_cost + output_cost
+    def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
+        if self.model_data.input_price and self.model_data.output_price:
+            input_cost = input_tokens * self.model_data.input_price
+            output_cost = output_tokens * self.model_data.output_price
+            return input_cost + output_cost

     ###############################################
     # Capabilities
```
deepeval/models/llms/kimi_model.py
CHANGED

```diff
@@ -223,14 +223,11 @@ class KimiModel(DeepEvalBaseLLM):
     # Utilities
     ###############################################

-    def calculate_cost(
-        input_cost = input_tokens * self.model_data.input_price
-        output_cost = output_tokens * self.model_data.output_price
-        return input_cost + output_cost
+    def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
+        if self.model_data.input_price and self.model_data.output_price:
+            input_cost = input_tokens * self.model_data.input_price
+            output_cost = output_tokens * self.model_data.output_price
+            return input_cost + output_cost

     ###############################################
     # Capabilities
```
deepeval/models/llms/litellm_model.py
CHANGED

```diff
@@ -289,6 +289,7 @@ class LiteLLMModel(DeepEvalBaseLLM):
             "top_logprobs": top_logprobs,
         }
         completion_params.update(self.kwargs)
+        completion_params.update(self.generation_kwargs)

         response = completion(**completion_params)
         cost = self.calculate_cost(response)
@@ -335,6 +336,7 @@ class LiteLLMModel(DeepEvalBaseLLM):
             "top_logprobs": top_logprobs,
         }
         completion_params.update(self.kwargs)
+        completion_params.update(self.generation_kwargs)

         response = await acompletion(**completion_params)
         cost = self.calculate_cost(response)
```
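Based on this diff, `LiteLLMModel` now merges a `generation_kwargs` mapping into every completion call in addition to `self.kwargs`. A hedged usage sketch; the constructor parameter name and accepted keys are assumptions inferred from the attribute used above, not confirmed by the diff:

```python
# Assumption: `generation_kwargs` is accepted by the constructor and forwarded
# to litellm.completion()/acompletion(), as the
# `completion_params.update(self.generation_kwargs)` lines above suggest.
from deepeval.models import LiteLLMModel

model = LiteLLMModel(
    model="openai/gpt-4o-mini",              # any litellm-routable model id
    generation_kwargs={"max_tokens": 512},   # forwarded to each completion call
)
```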
deepeval/models/llms/local_model.py
CHANGED

```diff
@@ -52,7 +52,7 @@ class LocalModel(DeepEvalBaseLLM):
         self.base_url = (
             str(base_url).rstrip("/") if base_url is not None else None
         )
-        self.format = format or settings.LOCAL_MODEL_FORMAT
+        self.format = format or settings.LOCAL_MODEL_FORMAT or "json"

         if temperature is not None:
             temperature = float(temperature)
```
deepeval/models/llms/openai_model.py
CHANGED

```diff
@@ -378,9 +378,10 @@ class GPTModel(DeepEvalBaseLLM):
     #############

     def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
+        if self.model_data.input_price and self.model_data.output_price:
+            input_cost = input_tokens * self.model_data.input_price
+            output_cost = output_tokens * self.model_data.output_price
+            return input_cost + output_cost

     #########################
     # Capabilities #
```
deepeval/models/retry_policy.py
CHANGED
```diff
@@ -87,6 +87,8 @@ def set_outer_deadline(seconds: float | None):
     call, which must be passed to `reset_outer_deadline` to restore the
     previous value.
     """
+    if get_settings().DEEPEVAL_DISABLE_TIMEOUTS:
+        return _OUTER_DEADLINE.set(None)
     if seconds and seconds > 0:
         return _OUTER_DEADLINE.set(time.monotonic() + seconds)
     return _OUTER_DEADLINE.set(None)
@@ -131,11 +133,10 @@ def resolve_effective_attempt_timeout():
         float: Seconds to use for the inner per-attempt timeout. `0` means
         disable inner timeout and rely on the outer budget instead.
     """
+    settings = get_settings()
+    per_attempt = float(settings.DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0)
     # 0 or None disable inner wait_for. That means rely on outer task cap for timeouts instead.
-    if per_attempt <= 0:
+    if settings.DEEPEVAL_DISABLE_TIMEOUTS or per_attempt <= 0:
         return 0
     # If we do have a positive per-attempt, use up to remaining outer budget.
     rem = _remaining_budget()
@@ -557,7 +558,11 @@ def run_sync_with_timeout(func, timeout_seconds, *args, **kwargs):
         BaseException: If `func` raises, the same exception is re-raised with its
         original traceback.
     """
+    if (
+        get_settings().DEEPEVAL_DISABLE_TIMEOUTS
+        or not timeout_seconds
+        or timeout_seconds <= 0
+    ):
         return func(*args, **kwargs)

     # try to respect the global cap on concurrent timeout workers
```
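All three retry-policy changes key off a `DEEPEVAL_DISABLE_TIMEOUTS` setting. A minimal sketch of switching it on, assuming the setting is environment-backed like deepeval's other `DEEPEVAL_*` settings:

```python
# Assumption: DEEPEVAL_DISABLE_TIMEOUTS is read through get_settings() from the
# environment. With it set, set_outer_deadline() clears the outer deadline,
# resolve_effective_attempt_timeout() returns 0, and run_sync_with_timeout()
# calls the function directly, so no wall-clock caps are applied.
import os

os.environ["DEEPEVAL_DISABLE_TIMEOUTS"] = "1"  # set before deepeval reads its settings

# ... run evaluations as usual; retries are bounded only by the retry policy itself.
```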
deepeval/models/utils.py
CHANGED
```diff
@@ -123,11 +123,7 @@ def require_costs(
     # If model data doesn't have pricing, use provided values or environment variables
     if model_data.input_price is None or model_data.output_price is None:
         if cost_per_input_token is None or cost_per_output_token is None:
-                f"No pricing available for `{model_name}`. "
-                f"Please provide both `cost_per_input_token` and `cost_per_output_token` when initializing `{model_name}`, "
-                f"or set {input_token_envvar} and {output_token_envvar} environment variables."
-            )
+            return None, None

     # Return the validated cost values as a tuple
     return cost_per_input_token, cost_per_output_token
```
deepeval/simulator/conversation_simulator.py
CHANGED

```diff
@@ -514,7 +514,9 @@ class ConversationSimulator:
         ):
             if not self.run_remote:
                 conversation_history = json.dumps(
-                    [t.model_dump() for t in turns],
+                    [t.model_dump() for t in turns],
+                    indent=4,
+                    ensure_ascii=False,
                 )
                 prompt = self.template.stop_simulation(
                     conversation_history, golden.expected_outcome
@@ -559,7 +561,9 @@ class ConversationSimulator:
         ):
             if not self.run_remote:
                 conversation_history = json.dumps(
-                    [t.model_dump() for t in turns],
+                    [t.model_dump() for t in turns],
+                    indent=4,
+                    ensure_ascii=False,
                 )
                 prompt = self.template.stop_simulation(
                     conversation_history, golden.expected_outcome
```
CHANGED
|
@@ -57,7 +57,9 @@ class ConversationSimulatorTemplate:
|
|
|
57
57
|
language: str,
|
|
58
58
|
) -> str:
|
|
59
59
|
previous_conversation = json.dumps(
|
|
60
|
-
[t.model_dump() for t in turns],
|
|
60
|
+
[t.model_dump() for t in turns],
|
|
61
|
+
indent=4,
|
|
62
|
+
ensure_ascii=False,
|
|
61
63
|
)
|
|
62
64
|
prompt = textwrap.dedent(
|
|
63
65
|
f"""
|