deepeval 3.7.6__py3-none-any.whl → 3.7.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/main.py +2022 -759
  3. deepeval/cli/utils.py +208 -36
  4. deepeval/config/dotenv_handler.py +19 -0
  5. deepeval/config/settings.py +658 -262
  6. deepeval/config/utils.py +9 -1
  7. deepeval/dataset/test_run_tracer.py +4 -6
  8. deepeval/evaluate/execute.py +153 -94
  9. deepeval/integrations/pydantic_ai/instrumentator.py +4 -2
  10. deepeval/integrations/pydantic_ai/otel.py +5 -1
  11. deepeval/key_handler.py +121 -51
  12. deepeval/metrics/base_metric.py +9 -3
  13. deepeval/metrics/g_eval/g_eval.py +6 -1
  14. deepeval/metrics/indicator.py +8 -4
  15. deepeval/metrics/mcp/mcp_task_completion.py +15 -16
  16. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +15 -15
  17. deepeval/metrics/mcp/schema.py +4 -0
  18. deepeval/metrics/mcp/template.py +8 -1
  19. deepeval/metrics/prompt_alignment/prompt_alignment.py +6 -3
  20. deepeval/metrics/tool_use/schema.py +4 -0
  21. deepeval/metrics/tool_use/template.py +16 -2
  22. deepeval/metrics/tool_use/tool_use.py +30 -28
  23. deepeval/metrics/topic_adherence/schema.py +4 -0
  24. deepeval/metrics/topic_adherence/template.py +8 -1
  25. deepeval/metrics/topic_adherence/topic_adherence.py +15 -14
  26. deepeval/metrics/turn_contextual_precision/template.py +8 -1
  27. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +44 -86
  28. deepeval/metrics/turn_contextual_recall/template.py +8 -1
  29. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +44 -82
  30. deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
  31. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +48 -92
  32. deepeval/metrics/turn_faithfulness/template.py +8 -1
  33. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +76 -130
  34. deepeval/metrics/utils.py +16 -1
  35. deepeval/models/__init__.py +2 -0
  36. deepeval/models/llms/__init__.py +2 -0
  37. deepeval/models/llms/amazon_bedrock_model.py +5 -4
  38. deepeval/models/llms/anthropic_model.py +4 -3
  39. deepeval/models/llms/azure_model.py +4 -3
  40. deepeval/models/llms/deepseek_model.py +5 -8
  41. deepeval/models/llms/grok_model.py +5 -8
  42. deepeval/models/llms/kimi_model.py +5 -8
  43. deepeval/models/llms/litellm_model.py +2 -0
  44. deepeval/models/llms/local_model.py +1 -1
  45. deepeval/models/llms/openai_model.py +4 -3
  46. deepeval/models/retry_policy.py +10 -5
  47. deepeval/models/utils.py +1 -5
  48. deepeval/simulator/conversation_simulator.py +6 -2
  49. deepeval/simulator/template.py +3 -1
  50. deepeval/synthesizer/synthesizer.py +19 -17
  51. deepeval/test_run/test_run.py +6 -1
  52. deepeval/utils.py +26 -0
  53. {deepeval-3.7.6.dist-info → deepeval-3.7.8.dist-info}/METADATA +3 -3
  54. {deepeval-3.7.6.dist-info → deepeval-3.7.8.dist-info}/RECORD +57 -56
  55. {deepeval-3.7.6.dist-info → deepeval-3.7.8.dist-info}/LICENSE.md +0 -0
  56. {deepeval-3.7.6.dist-info → deepeval-3.7.8.dist-info}/WHEEL +0 -0
  57. {deepeval-3.7.6.dist-info → deepeval-3.7.8.dist-info}/entry_points.txt +0 -0
deepeval/metrics/turn_faithfulness/template.py CHANGED
@@ -187,6 +187,13 @@ class TurnFaithfulnessTemplate:
     Context:
     This metric evaluates conversational faithfulness by extracting truths from retrieval context, extracting claims from the assistant's output, and generating verdicts that compare each claim against the truths. Each interaction yields a reason indicating why a verdict failed or succeeded. You are given all those reasons.
 
+    **
+    IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+    Example JSON:
+    {{
+    "reason": "The score is <turn_faithfulness_score> because <your_reason>."
+    }}
+
     Inputs:
     - final_score: the averaged score across all interactions.
     - success: whether the metric passed or failed
@@ -213,6 +220,6 @@ class TurnFaithfulnessTemplate:
 
     Now give me a final reason that explains why the metric passed or failed. Output ONLY the reason and nothing else.
 
-    The final reason:
+    JSON:
 
     """
     )
deepeval/metrics/turn_faithfulness/turn_faithfulness.py CHANGED
@@ -14,6 +14,8 @@ from deepeval.metrics.utils import (
     get_unit_interactions,
     get_turns_in_sliding_window,
     initialize_model,
+    generate_with_schema_and_extract,
+    a_generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.turn_faithfulness.template import (
@@ -273,18 +275,14 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             extraction_limit=self.truths_extraction_limit,
             multimodal=multimodal,
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Truths)
-            self.evaluation_cost += cost
-            return res.truths
-        else:
-            try:
-                res: Truths = await self.model.a_generate(prompt, schema=Truths)
-                return res.truths
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["truths"]
+
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Truths,
+            extract_schema=lambda s: s.truths,
+            extract_json=lambda data: data["truths"],
+        )
 
     def _generate_truths(
         self, retrieval_context: str, multimodal: bool
@@ -294,18 +292,14 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             extraction_limit=self.truths_extraction_limit,
             multimodal=multimodal,
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Truths)
-            self.evaluation_cost += cost
-            return res.truths
-        else:
-            try:
-                res: Truths = self.model.generate(prompt, schema=Truths)
-                return res.truths
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["truths"]
+
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Truths,
+            extract_schema=lambda s: s.truths,
+            extract_json=lambda data: data["truths"],
+        )
 
     async def _a_generate_claims(
         self, user_content: str, assistant_content: str, multimodal: bool
@@ -315,18 +309,14 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             assistant_output=assistant_content,
             multimodal=multimodal,
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Claims)
-            self.evaluation_cost += cost
-            return res.claims
-        else:
-            try:
-                res: Claims = await self.model.a_generate(prompt, schema=Claims)
-                return res.claims
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["claims"]
+
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Claims,
+            extract_schema=lambda s: s.claims,
+            extract_json=lambda data: data["claims"],
+        )
 
     def _generate_claims(
         self, user_content: str, assistant_content: str, multimodal: bool
@@ -336,18 +326,14 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             assistant_output=assistant_content,
             multimodal=multimodal,
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Claims)
-            self.evaluation_cost += cost
-            return res.claims
-        else:
-            try:
-                res: Claims = self.model.generate(prompt, schema=Claims)
-                return res.claims
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["claims"]
+
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Claims,
+            extract_schema=lambda s: s.claims,
+            extract_json=lambda data: data["claims"],
+        )
 
     async def _a_generate_verdicts(
         self, claims: Claims, truths: Truths, multimodal: bool
@@ -363,25 +349,13 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = await self.model.a_generate(
-                    prompt, schema=Verdicts
-                )
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    FaithfulnessVerdict(**item) for item in data["verdicts"]
-                ]
-                return verdicts
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: s.verdicts,
+            extract_json=lambda data: data["verdicts"],
+        )
 
     def _generate_verdicts(
         self, claims: Claims, truths: Truths, multimodal: bool
@@ -397,23 +371,13 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    FaithfulnessVerdict(**item) for item in data["verdicts"]
-                ]
-                return verdicts
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: s.verdicts,
+            extract_json=lambda data: data["verdicts"],
+        )
 
     def _get_interaction_score_and_reason(
         self, verdicts, multimodal: bool
@@ -486,22 +450,13 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=FaithfulnessScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: FaithfulnessScoreReason = await self.model.a_generate(
-                    prompt, schema=FaithfulnessScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=FaithfulnessScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _get_interaction_reason(self, score, verdicts, multimodal: bool) -> str:
         if self.include_reason is False:
@@ -518,22 +473,13 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             multimodal=multimodal,
        )
 
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=FaithfulnessScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: FaithfulnessScoreReason = self.model.generate(
-                    prompt, schema=FaithfulnessScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=FaithfulnessScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _get_verbose_steps(
         self, interaction_scores: List[InteractionFaithfulnessScore]
@@ -568,13 +514,13 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )
 
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt)
-            self.evaluation_cost += cost
-            return res
-        else:
-            res = self.model.generate(prompt)
-            return res
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=FaithfulnessScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_reason(
         self, scores: List[InteractionFaithfulnessScore]
@@ -593,13 +539,13 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             self.score, self.success, reasons
        )
 
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt)
-            self.evaluation_cost += cost
-            return res
-        else:
-            res = await self.model.a_generate(prompt)
-            return res
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=FaithfulnessScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _calculate_score(
         self, scores: List[InteractionFaithfulnessScore]
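
All of the removed branches above followed one pattern: native models return a (result, cost) tuple whose cost is accumulated on the metric, while custom models get a schema-aware call with a TypeError fallback to raw generation plus trimAndLoadJson. The new generate_with_schema_and_extract / a_generate_with_schema_and_extract helpers imported from deepeval/metrics/utils.py evidently centralize that logic. The following is a minimal sketch of the synchronous helper, reconstructed from the removed code rather than taken from the 3.7.8 source, so the real signature and internals may differ:

from typing import Any, Callable, Type

from deepeval.metrics.utils import trimAndLoadJson  # defined in metrics/utils.py in earlier releases


def generate_with_schema_and_extract(
    metric: Any,
    prompt: str,
    schema_cls: Type,
    extract_schema: Callable[[Any], Any],
    extract_json: Callable[[dict], Any],
) -> Any:
    # Reconstructed sketch only; mirrors the branches deleted in this diff.
    if metric.using_native_model:
        # Native models return (parsed_result, cost); accumulate the cost.
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost
        return extract_schema(res)
    try:
        # Custom models that accept a schema return a parsed object directly.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Models without schema support: generate free-form text, then parse the JSON.
        res = metric.model.generate(prompt)
        data = trimAndLoadJson(res, metric)
        return extract_json(data)

The async variant presumably does the same around await metric.model.a_generate(...).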
deepeval/metrics/utils.py CHANGED
@@ -32,6 +32,7 @@ from deepeval.models import (
     GeminiModel,
     AmazonBedrockModel,
     LiteLLMModel,
+    PortkeyModel,
     KimiModel,
     GrokModel,
     DeepSeekModel,
@@ -458,6 +459,11 @@ async def a_generate_with_schema_and_extract(
 ###############################################
 
 
+def should_use_anthropic_model():
+    value = KEY_FILE_HANDLER.fetch_data(ModelKeyValues.USE_ANTHROPIC_MODEL)
+    return value.lower() == "yes" if value is not None else False
+
+
 def should_use_azure_openai():
     value = KEY_FILE_HANDLER.fetch_data(ModelKeyValues.USE_AZURE_OPENAI)
     return value.lower() == "yes" if value is not None else False
@@ -488,6 +494,11 @@ def should_use_litellm():
     return value.lower() == "yes" if value is not None else False
 
 
+def should_use_portkey():
+    value = KEY_FILE_HANDLER.fetch_data(ModelKeyValues.USE_PORTKEY_MODEL)
+    return value.lower() == "yes" if value is not None else False
+
+
 def should_use_deepseek_model():
     value = KEY_FILE_HANDLER.fetch_data(ModelKeyValues.USE_DEEPSEEK_MODEL)
     return value.lower() == "yes" if value is not None else False
@@ -526,6 +537,8 @@ def initialize_model(
         return GeminiModel(), True
     if should_use_litellm():
         return LiteLLMModel(), True
+    if should_use_portkey():
+        return PortkeyModel(), True
     if should_use_ollama_model():
         return OllamaModel(), True
     elif should_use_local_model():
@@ -535,9 +548,11 @@
     elif should_use_moonshot_model():
         return KimiModel(model=model), True
     elif should_use_grok_model():
-        return GrokModel(model=model), True
+        return GrokModel(), True
     elif should_use_deepseek_model():
         return DeepSeekModel(model=model), True
+    elif should_use_anthropic_model():
+        return AnthropicModel(), True
     elif isinstance(model, str) or model is None:
         return GPTModel(model=model), True
 
deepeval/models/__init__.py CHANGED
@@ -15,6 +15,7 @@ from deepeval.models.llms import (
     KimiModel,
     GrokModel,
     DeepSeekModel,
+    PortkeyModel,
 )
 from deepeval.models.embedding_models import (
     OpenAIEmbeddingModel,
@@ -42,4 +43,5 @@ __all__ = [
     "AzureOpenAIEmbeddingModel",
     "LocalEmbeddingModel",
     "OllamaEmbeddingModel",
+    "PortkeyModel",
 ]
deepeval/models/llms/__init__.py CHANGED
@@ -9,6 +9,7 @@ from .litellm_model import LiteLLMModel
 from .kimi_model import KimiModel
 from .grok_model import GrokModel
 from .deepseek_model import DeepSeekModel
+from .portkey_model import PortkeyModel
 
 __all__ = [
     "AzureOpenAIModel",
@@ -22,4 +23,5 @@ __all__ = [
     "KimiModel",
     "GrokModel",
     "DeepSeekModel",
+    "PortkeyModel",
 ]
deepeval/models/llms/amazon_bedrock_model.py CHANGED
@@ -29,6 +29,7 @@ retry_bedrock = create_retry_decorator(PS.BEDROCK)
 
 _ALIAS_MAP = {
     "model": ["model_id"],
+    "region": ["region_name"],
     "cost_per_input_token": ["input_token_cost"],
     "cost_per_output_token": ["output_token_cost"],
 }
@@ -303,10 +304,10 @@ class AmazonBedrockModel(DeepEvalBaseLLM):
         }
 
     def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
-        return (
-            input_tokens * self.cost_per_input_token
-            + output_tokens * self.cost_per_output_token
-        )
+        if self.model_data.input_price and self.model_data.output_price:
+            input_cost = input_tokens * self.model_data.input_price
+            output_cost = output_tokens * self.model_data.output_price
+            return input_cost + output_cost
 
     def load_model(self):
         pass
deepeval/models/llms/anthropic_model.py CHANGED
@@ -227,9 +227,10 @@ class AnthropicModel(DeepEvalBaseLLM):
     ###############################################
 
     def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
-        input_cost = input_tokens * self.model_data.input_price
-        output_cost = output_tokens * self.model_data.output_price
-        return input_cost + output_cost
+        if self.model_data.input_price and self.model_data.output_price:
+            input_cost = input_tokens * self.model_data.input_price
+            output_cost = output_tokens * self.model_data.output_price
+            return input_cost + output_cost
 
     #########################
     # Capabilities #
deepeval/models/llms/azure_model.py CHANGED
@@ -386,9 +386,10 @@ class AzureOpenAIModel(DeepEvalBaseLLM):
     ###############################################
 
     def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
-        input_cost = input_tokens * self.model_data.input_price
-        output_cost = output_tokens * self.model_data.output_price
-        return input_cost + output_cost
+        if self.model_data.input_price and self.model_data.output_price:
+            input_cost = input_tokens * self.model_data.input_price
+            output_cost = output_tokens * self.model_data.output_price
+            return input_cost + output_cost
 
     ###############################################
     # Capabilities
deepeval/models/llms/deepseek_model.py CHANGED
@@ -176,14 +176,11 @@ class DeepSeekModel(DeepEvalBaseLLM):
     # Utilities
     ###############################################
 
-    def calculate_cost(
-        self,
-        input_tokens: int,
-        output_tokens: int,
-    ) -> float:
-        input_cost = input_tokens * self.model_data.input_price
-        output_cost = output_tokens * self.model_data.output_price
-        return input_cost + output_cost
+    def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
+        if self.model_data.input_price and self.model_data.output_price:
+            input_cost = input_tokens * self.model_data.input_price
+            output_cost = output_tokens * self.model_data.output_price
+            return input_cost + output_cost
 
     ###############################################
     # Capabilities
deepeval/models/llms/grok_model.py CHANGED
@@ -224,14 +224,11 @@ class GrokModel(DeepEvalBaseLLM):
     # Utilities
     ###############################################
 
-    def calculate_cost(
-        self,
-        input_tokens: int,
-        output_tokens: int,
-    ) -> float:
-        input_cost = input_tokens * self.model_data.input_price
-        output_cost = output_tokens * self.model_data.output_price
-        return input_cost + output_cost
+    def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
+        if self.model_data.input_price and self.model_data.output_price:
+            input_cost = input_tokens * self.model_data.input_price
+            output_cost = output_tokens * self.model_data.output_price
+            return input_cost + output_cost
 
     ###############################################
     # Capabilities
deepeval/models/llms/kimi_model.py CHANGED
@@ -223,14 +223,11 @@ class KimiModel(DeepEvalBaseLLM):
     # Utilities
     ###############################################
 
-    def calculate_cost(
-        self,
-        input_tokens: int,
-        output_tokens: int,
-    ) -> float:
-        input_cost = input_tokens * self.model_data.input_price
-        output_cost = output_tokens * self.model_data.output_price
-        return input_cost + output_cost
+    def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
+        if self.model_data.input_price and self.model_data.output_price:
+            input_cost = input_tokens * self.model_data.input_price
+            output_cost = output_tokens * self.model_data.output_price
+            return input_cost + output_cost
 
     ###############################################
     # Capabilities
deepeval/models/llms/litellm_model.py CHANGED
@@ -289,6 +289,7 @@ class LiteLLMModel(DeepEvalBaseLLM):
             "top_logprobs": top_logprobs,
         }
         completion_params.update(self.kwargs)
+        completion_params.update(self.generation_kwargs)
 
         response = completion(**completion_params)
         cost = self.calculate_cost(response)
@@ -335,6 +336,7 @@
             "top_logprobs": top_logprobs,
         }
         completion_params.update(self.kwargs)
+        completion_params.update(self.generation_kwargs)
 
         response = await acompletion(**completion_params)
         cost = self.calculate_cost(response)
deepeval/models/llms/local_model.py CHANGED
@@ -52,7 +52,7 @@ class LocalModel(DeepEvalBaseLLM):
         self.base_url = (
             str(base_url).rstrip("/") if base_url is not None else None
         )
-        self.format = format or settings.LOCAL_MODEL_FORMAT
+        self.format = format or settings.LOCAL_MODEL_FORMAT or "json"
 
         if temperature is not None:
             temperature = float(temperature)
deepeval/models/llms/openai_model.py CHANGED
@@ -378,9 +378,10 @@ class GPTModel(DeepEvalBaseLLM):
     #############
 
     def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
-        input_cost = input_tokens * self.model_data.input_price
-        output_cost = output_tokens * self.model_data.output_price
-        return input_cost + output_cost
+        if self.model_data.input_price and self.model_data.output_price:
+            input_cost = input_tokens * self.model_data.input_price
+            output_cost = output_tokens * self.model_data.output_price
+            return input_cost + output_cost
 
     #########################
     # Capabilities #
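
The calculate_cost changes above (Bedrock, Anthropic, Azure, DeepSeek, Grok, Kimi, OpenAI) all apply the same guard: the cost is only computed when both per-token prices are present on model_data; otherwise the method falls through and returns None rather than raising, which matches the relaxed require_costs behavior shown further down. A hypothetical standalone illustration of that behavior (the prices and the SimpleNamespace stand-in are invented for the example):

from types import SimpleNamespace


def calculate_cost(model_data, input_tokens: int, output_tokens: int):
    # Same guard as in the 3.7.8 diff: only price the call when both rates exist.
    if model_data.input_price and model_data.output_price:
        return (
            input_tokens * model_data.input_price
            + output_tokens * model_data.output_price
        )
    # Implicitly returns None when pricing is unknown.


priced = SimpleNamespace(input_price=1e-06, output_price=2e-06)
unpriced = SimpleNamespace(input_price=None, output_price=None)

print(calculate_cost(priced, 1_000, 500))    # ~0.002
print(calculate_cost(unpriced, 1_000, 500))  # None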
deepeval/models/retry_policy.py CHANGED
@@ -87,6 +87,8 @@ def set_outer_deadline(seconds: float | None):
     call, which must be passed to `reset_outer_deadline` to restore the
     previous value.
     """
+    if get_settings().DEEPEVAL_DISABLE_TIMEOUTS:
+        return _OUTER_DEADLINE.set(None)
     if seconds and seconds > 0:
         return _OUTER_DEADLINE.set(time.monotonic() + seconds)
     return _OUTER_DEADLINE.set(None)
@@ -131,11 +133,10 @@ def resolve_effective_attempt_timeout():
     float: Seconds to use for the inner per-attempt timeout. `0` means
         disable inner timeout and rely on the outer budget instead.
     """
-    per_attempt = float(
-        get_settings().DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0
-    )
+    settings = get_settings()
+    per_attempt = float(settings.DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0)
     # 0 or None disable inner wait_for. That means rely on outer task cap for timeouts instead.
-    if per_attempt <= 0:
+    if settings.DEEPEVAL_DISABLE_TIMEOUTS or per_attempt <= 0:
         return 0
     # If we do have a positive per-attempt, use up to remaining outer budget.
     rem = _remaining_budget()
@@ -557,7 +558,11 @@ def run_sync_with_timeout(func, timeout_seconds, *args, **kwargs):
     BaseException: If `func` raises, the same exception is re-raised with its
         original traceback.
     """
-    if not timeout_seconds or timeout_seconds <= 0:
+    if (
+        get_settings().DEEPEVAL_DISABLE_TIMEOUTS
+        or not timeout_seconds
+        or timeout_seconds <= 0
+    ):
         return func(*args, **kwargs)
 
     # try to respect the global cap on concurrent timeout workers
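
These retry_policy hunks all key off a new DEEPEVAL_DISABLE_TIMEOUTS setting: when it is truthy, the outer deadline is cleared, the effective per-attempt timeout resolves to 0, and run_sync_with_timeout calls the function directly. Assuming the setting is environment-backed like other DEEPEVAL_* values read through get_settings() (its declaration in config/settings.py is not shown in this diff), disabling timeouts for a debugging run might look roughly like this:

import os

# Assumption: the env var name matches the settings attribute seen in the diff,
# and must be set before deepeval builds its settings object.
os.environ["DEEPEVAL_DISABLE_TIMEOUTS"] = "1"

from deepeval.models.retry_policy import (
    resolve_effective_attempt_timeout,
    set_outer_deadline,
)

set_outer_deadline(30)                      # per the diff, cleared to None when disabled
print(resolve_effective_attempt_timeout())  # expected to print 0 (inner timeout off)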
deepeval/models/utils.py CHANGED
@@ -123,11 +123,7 @@ def require_costs(
     # If model data doesn't have pricing, use provided values or environment variables
     if model_data.input_price is None or model_data.output_price is None:
         if cost_per_input_token is None or cost_per_output_token is None:
-            raise DeepEvalError(
-                f"No pricing available for `{model_name}`. "
-                f"Please provide both `cost_per_input_token` and `cost_per_output_token` when initializing `{model_name}`, "
-                f"or set {input_token_envvar} and {output_token_envvar} environment variables."
-            )
+            return None, None
 
     # Return the validated cost values as a tuple
     return cost_per_input_token, cost_per_output_token
deepeval/simulator/conversation_simulator.py CHANGED
@@ -514,7 +514,9 @@
     ):
         if not self.run_remote:
             conversation_history = json.dumps(
-                [t.model_dump() for t in turns], indent=4
+                [t.model_dump() for t in turns],
+                indent=4,
+                ensure_ascii=False,
             )
             prompt = self.template.stop_simulation(
                 conversation_history, golden.expected_outcome
@@ -559,7 +561,9 @@
     ):
         if not self.run_remote:
             conversation_history = json.dumps(
-                [t.model_dump() for t in turns], indent=4
+                [t.model_dump() for t in turns],
+                indent=4,
+                ensure_ascii=False,
             )
             prompt = self.template.stop_simulation(
                 conversation_history, golden.expected_outcome
deepeval/simulator/template.py CHANGED
@@ -57,7 +57,9 @@ class ConversationSimulatorTemplate:
         language: str,
     ) -> str:
         previous_conversation = json.dumps(
-            [t.model_dump() for t in turns], indent=4
+            [t.model_dump() for t in turns],
+            indent=4,
+            ensure_ascii=False,
         )
         prompt = textwrap.dedent(
             f"""