deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/main.py +2022 -759
  3. deepeval/cli/utils.py +208 -36
  4. deepeval/config/dotenv_handler.py +19 -0
  5. deepeval/config/settings.py +675 -245
  6. deepeval/config/utils.py +9 -1
  7. deepeval/dataset/api.py +23 -1
  8. deepeval/dataset/golden.py +106 -21
  9. deepeval/evaluate/evaluate.py +0 -3
  10. deepeval/evaluate/execute.py +162 -315
  11. deepeval/evaluate/utils.py +6 -30
  12. deepeval/key_handler.py +124 -51
  13. deepeval/metrics/__init__.py +0 -4
  14. deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
  15. deepeval/metrics/answer_relevancy/template.py +102 -179
  16. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  17. deepeval/metrics/arena_g_eval/template.py +17 -1
  18. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  19. deepeval/metrics/argument_correctness/template.py +19 -2
  20. deepeval/metrics/base_metric.py +19 -41
  21. deepeval/metrics/bias/bias.py +102 -108
  22. deepeval/metrics/bias/template.py +14 -2
  23. deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
  24. deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
  26. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  27. deepeval/metrics/conversation_completeness/template.py +23 -3
  28. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  29. deepeval/metrics/conversational_dag/nodes.py +66 -123
  30. deepeval/metrics/conversational_dag/templates.py +16 -0
  31. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  32. deepeval/metrics/dag/dag.py +10 -0
  33. deepeval/metrics/dag/nodes.py +63 -126
  34. deepeval/metrics/dag/templates.py +14 -0
  35. deepeval/metrics/exact_match/exact_match.py +9 -1
  36. deepeval/metrics/faithfulness/faithfulness.py +82 -136
  37. deepeval/metrics/g_eval/g_eval.py +93 -79
  38. deepeval/metrics/g_eval/template.py +18 -1
  39. deepeval/metrics/g_eval/utils.py +7 -6
  40. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  41. deepeval/metrics/goal_accuracy/template.py +21 -3
  42. deepeval/metrics/hallucination/hallucination.py +60 -75
  43. deepeval/metrics/hallucination/template.py +13 -0
  44. deepeval/metrics/indicator.py +11 -10
  45. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  46. deepeval/metrics/json_correctness/template.py +10 -0
  47. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  48. deepeval/metrics/knowledge_retention/schema.py +9 -3
  49. deepeval/metrics/knowledge_retention/template.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +72 -43
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
  52. deepeval/metrics/mcp/schema.py +4 -0
  53. deepeval/metrics/mcp/template.py +59 -0
  54. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  55. deepeval/metrics/mcp_use_metric/template.py +12 -0
  56. deepeval/metrics/misuse/misuse.py +77 -97
  57. deepeval/metrics/misuse/template.py +15 -0
  58. deepeval/metrics/multimodal_metrics/__init__.py +0 -1
  59. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
  60. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
  61. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
  62. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
  63. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
  64. deepeval/metrics/non_advice/non_advice.py +79 -105
  65. deepeval/metrics/non_advice/template.py +12 -0
  66. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  67. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  68. deepeval/metrics/pii_leakage/template.py +14 -0
  69. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  70. deepeval/metrics/plan_adherence/template.py +11 -0
  71. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  72. deepeval/metrics/plan_quality/template.py +9 -0
  73. deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
  74. deepeval/metrics/prompt_alignment/template.py +12 -0
  75. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  76. deepeval/metrics/role_adherence/template.py +14 -0
  77. deepeval/metrics/role_violation/role_violation.py +75 -108
  78. deepeval/metrics/role_violation/template.py +12 -0
  79. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  80. deepeval/metrics/step_efficiency/template.py +11 -0
  81. deepeval/metrics/summarization/summarization.py +115 -183
  82. deepeval/metrics/summarization/template.py +19 -0
  83. deepeval/metrics/task_completion/task_completion.py +67 -73
  84. deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
  85. deepeval/metrics/tool_use/schema.py +4 -0
  86. deepeval/metrics/tool_use/template.py +16 -2
  87. deepeval/metrics/tool_use/tool_use.py +72 -94
  88. deepeval/metrics/topic_adherence/schema.py +4 -0
  89. deepeval/metrics/topic_adherence/template.py +21 -1
  90. deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +3 -3
  94. deepeval/metrics/turn_contextual_precision/template.py +9 -2
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
  96. deepeval/metrics/turn_contextual_recall/schema.py +3 -3
  97. deepeval/metrics/turn_contextual_recall/template.py +8 -1
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
  99. deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
  100. deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
  102. deepeval/metrics/turn_faithfulness/schema.py +1 -1
  103. deepeval/metrics/turn_faithfulness/template.py +8 -1
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +161 -91
  108. deepeval/models/__init__.py +2 -0
  109. deepeval/models/base_model.py +44 -6
  110. deepeval/models/embedding_models/azure_embedding_model.py +34 -12
  111. deepeval/models/embedding_models/local_embedding_model.py +22 -7
  112. deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
  113. deepeval/models/embedding_models/openai_embedding_model.py +3 -2
  114. deepeval/models/llms/__init__.py +2 -0
  115. deepeval/models/llms/amazon_bedrock_model.py +229 -73
  116. deepeval/models/llms/anthropic_model.py +143 -48
  117. deepeval/models/llms/azure_model.py +169 -95
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +82 -35
  120. deepeval/models/llms/gemini_model.py +126 -67
  121. deepeval/models/llms/grok_model.py +128 -65
  122. deepeval/models/llms/kimi_model.py +129 -87
  123. deepeval/models/llms/litellm_model.py +94 -18
  124. deepeval/models/llms/local_model.py +115 -16
  125. deepeval/models/llms/ollama_model.py +97 -76
  126. deepeval/models/llms/openai_model.py +169 -311
  127. deepeval/models/llms/portkey_model.py +58 -16
  128. deepeval/models/llms/utils.py +5 -2
  129. deepeval/models/retry_policy.py +10 -5
  130. deepeval/models/utils.py +56 -4
  131. deepeval/simulator/conversation_simulator.py +49 -2
  132. deepeval/simulator/template.py +16 -1
  133. deepeval/synthesizer/synthesizer.py +19 -17
  134. deepeval/test_case/api.py +24 -45
  135. deepeval/test_case/arena_test_case.py +7 -2
  136. deepeval/test_case/conversational_test_case.py +55 -6
  137. deepeval/test_case/llm_test_case.py +60 -6
  138. deepeval/test_run/api.py +3 -0
  139. deepeval/test_run/test_run.py +6 -1
  140. deepeval/utils.py +26 -0
  141. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
  142. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
  143. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  144. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  145. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  146. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
  147. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  148. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
  149. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
  150. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
deepeval/test_case/arena_test_case.py CHANGED
@@ -1,7 +1,7 @@
 from typing import List, Dict, Optional, Union
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from pydantic import BaseModel
-
+import re
 from deepeval.test_case import (
     LLMTestCase,
 )
@@ -19,6 +19,7 @@ class Contestant(BaseModel):
 @dataclass
 class ArenaTestCase:
     contestants: List[Contestant]
+    multimodal: bool = field(default=False)
 
     def __post_init__(self):
         contestant_names = [contestant.name for contestant in self.contestants]
@@ -38,6 +39,10 @@ class ArenaTestCase:
                 "All contestants must have the same 'expected_output'."
             )
 
+        for contestant in self.contestants:
+            if contestant.test_case.multimodal:
+                self.multimodal = True
+
 
 class Arena:
     test_cases: List[ArenaTestCase]
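
Taken together, `ArenaTestCase` now derives a `multimodal` flag in `__post_init__` from its contestants. A minimal sketch of the intended behavior, assuming `Contestant` exposes `name` and `test_case` fields (as the validation code above uses them) and that both classes are importable from `deepeval.test_case.arena_test_case`:

from deepeval.test_case import LLMTestCase
from deepeval.test_case.arena_test_case import ArenaTestCase, Contestant

# Both contestants answer the same prompt; a "[DEEPEVAL:IMAGE:<id>]"
# placeholder in the input flips LLMTestCase.multimodal to True.
shared_input = "Describe [DEEPEVAL:IMAGE:img-123]."  # hypothetical image id

case = ArenaTestCase(
    contestants=[
        Contestant(
            name="model-a",
            test_case=LLMTestCase(input=shared_input, actual_output="A cat."),
        ),
        Contestant(
            name="model-b",
            test_case=LLMTestCase(input=shared_input, actual_output="A dog."),
        ),
    ]
)
# __post_init__ saw at least one multimodal contestant.
assert case.multimodal is True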
deepeval/test_case/conversational_test_case.py CHANGED
@@ -1,3 +1,4 @@
+import re
 from pydantic import (
     BaseModel,
     Field,
@@ -17,6 +18,7 @@ from deepeval.test_case.mcp import (
     MCPToolCall,
     validate_mcp_servers,
 )
+from deepeval.test_case.llm_test_case import _MLLM_IMAGE_REGISTRY
 
 
 class TurnParams(Enum):
@@ -170,12 +172,28 @@ class ConversationalTestCase(BaseModel):
             return self
 
         pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
-        self.multimodal = any(
-            [
-                re.search(pattern, turn.content) is not None
-                for turn in self.turns
-            ]
-        )
+        if self.scenario:
+            if re.search(pattern, self.scenario) is not None:
+                self.multimodal = True
+                return self
+        if self.expected_outcome:
+            if re.search(pattern, self.expected_outcome) is not None:
+                self.multimodal = True
+                return self
+        if self.user_description:
+            if re.search(pattern, self.user_description) is not None:
+                self.multimodal = True
+                return self
+        if self.turns:
+            for turn in self.turns:
+                if re.search(pattern, turn.content) is not None:
+                    self.multimodal = True
+                    return self
+                if turn.retrieval_context is not None:
+                    self.multimodal = any(
+                        re.search(pattern, context) is not None
+                        for context in turn.retrieval_context
+                    )
 
         return self
 
@@ -215,3 +233,34 @@ class ConversationalTestCase(BaseModel):
         data["turns"] = copied_turns
 
         return data
+
+    def _get_images_mapping(self) -> Dict[str, MLLMImage]:
+        pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
+        image_ids = set()
+
+        def extract_ids_from_string(s: Optional[str]) -> None:
+            """Helper to extract image IDs from a string."""
+            if s is not None and isinstance(s, str):
+                matches = re.findall(pattern, s)
+                image_ids.update(matches)
+
+        def extract_ids_from_list(lst: Optional[List[str]]) -> None:
+            """Helper to extract image IDs from a list of strings."""
+            if lst is not None:
+                for item in lst:
+                    extract_ids_from_string(item)
+
+        extract_ids_from_string(self.scenario)
+        extract_ids_from_string(self.expected_outcome)
+        extract_ids_from_list(self.context)
+        extract_ids_from_string(self.user_description)
+        for turn in self.turns:
+            extract_ids_from_string(turn.content)
+            extract_ids_from_list(turn.retrieval_context)
+
+        images_mapping = {}
+        for img_id in image_ids:
+            if img_id in _MLLM_IMAGE_REGISTRY:
+                images_mapping[img_id] = _MLLM_IMAGE_REGISTRY[img_id]
+
+        return images_mapping if len(images_mapping) > 0 else None
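
Note that the rewritten validator short-circuits: the first placeholder match in `scenario`, `expected_outcome`, `user_description`, or a turn's `content` sets `multimodal` and returns immediately, and only `retrieval_context` falls through to the trailing `any(...)` scan (so a later turn without matches can overwrite it). A minimal sketch of the detection, assuming `Turn` takes `role` and `content` as in deepeval's public API:

from deepeval.test_case import ConversationalTestCase, Turn

convo = ConversationalTestCase(
    scenario="User asks about [DEEPEVAL:IMAGE:img-123]",  # hypothetical id
    turns=[
        Turn(role="user", content="What is in the picture?"),
        Turn(role="assistant", content="A chart of Q3 revenue."),
    ],
)
# The scenario matched first, so the turns were never scanned.
assert convo.multimodal is True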
deepeval/test_case/llm_test_case.py CHANGED
@@ -60,19 +60,34 @@ class MLLMImage:
         if self.local:
             path = self.process_url(self.url)
             self.filename = os.path.basename(path)
-            self.mimeType = (
-                mimetypes.guess_type(path)[0] or "application/octet-stream"
-            )
-            with open(path, "rb") as f:
-                raw = f.read()
-            self.dataBase64 = base64.b64encode(raw).decode("ascii")
+            self.mimeType = mimetypes.guess_type(path)[0] or "image/jpeg"
+
+            if not os.path.exists(path):
+                raise FileNotFoundError(f"Image file not found: {path}")
+
+            self._load_base64(path)
         else:
+            if not self.url.startswith(("http://", "https://")):
+                raise ValueError(
+                    f"Invalid remote URL format: {self.url}. URL must start with http:// or https://"
+                )
             self.filename = None
             self.mimeType = None
             self.dataBase64 = None
 
         _MLLM_IMAGE_REGISTRY[self._id] = self
 
+    def _load_base64(self, path: str):
+        with open(path, "rb") as f:
+            raw = f.read()
+        self.dataBase64 = base64.b64encode(raw).decode("ascii")
+
+    def ensure_images_loaded(self):
+        if self.local and self.dataBase64 is None:
+            path = self.process_url(self.url)
+            self._load_base64(path)
+        return self
+
     def _placeholder(self) -> str:
         return f"[DEEPEVAL:IMAGE:{self._id}]"
 
@@ -376,6 +391,16 @@ class LLMTestCase(BaseModel):
             if isinstance(self.input, str)
             else self.multimodal
         )
+        if self.retrieval_context is not None:
+            auto_detect = auto_detect or any(
+                re.search(pattern, context) is not None
+                for context in self.retrieval_context
+            )
+        if self.context is not None:
+            auto_detect = auto_detect or any(
+                re.search(pattern, context) is not None
+                for context in self.context
+            )
 
         self.multimodal = auto_detect
         return self
@@ -486,3 +511,32 @@ class LLMTestCase(BaseModel):
         )
 
         return data
+
+    def _get_images_mapping(self) -> Dict[str, MLLMImage]:
+        pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
+        image_ids = set()
+
+        def extract_ids_from_string(s: Optional[str]) -> None:
+            """Helper to extract image IDs from a string."""
+            if s is not None and isinstance(s, str):
+                matches = re.findall(pattern, s)
+                image_ids.update(matches)
+
+        def extract_ids_from_list(lst: Optional[List[str]]) -> None:
+            """Helper to extract image IDs from a list of strings."""
+            if lst is not None:
+                for item in lst:
+                    extract_ids_from_string(item)
+
+        extract_ids_from_string(self.input)
+        extract_ids_from_string(self.actual_output)
+        extract_ids_from_string(self.expected_output)
+        extract_ids_from_list(self.context)
+        extract_ids_from_list(self.retrieval_context)
+
+        images_mapping = {}
+        for img_id in image_ids:
+            if img_id in _MLLM_IMAGE_REGISTRY:
+                images_mapping[img_id] = _MLLM_IMAGE_REGISTRY[img_id]
+
+        return images_mapping if len(images_mapping) > 0 else None
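
`MLLMImage` now defers its base64 payload to `_load_base64`, and `ensure_images_loaded` re-reads a local file only when `dataBase64` has been dropped (e.g. stripped during serialization). A hedged sketch of that round trip, assuming the constructor accepts `url` and `local` and that the hypothetical file exists:

from deepeval.test_case import MLLMImage

img = MLLMImage(url="./charts/q3.png", local=True)  # hypothetical local file
print(img._placeholder())  # "[DEEPEVAL:IMAGE:<generated id>]"

# Simulate a payload stripped in transit, then restored on demand.
img.dataBase64 = None
img.ensure_images_loaded()  # re-reads the file because local is True
assert img.dataBase64 is not None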
deepeval/test_run/api.py CHANGED
@@ -126,6 +126,9 @@ class ConversationalApiTestCase(BaseModel):
     additional_metadata: Optional[Dict] = Field(
         None, alias="additionalMetadata"
    )
+    images_mapping: Optional[Dict[str, MLLMImage]] = Field(
+        None, alias="imagesMapping"
+    )
     tags: Optional[List[str]] = Field(None)
 
     def update_metric_data(self, metrics_data: MetricData):
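
The new field follows the camelCase alias convention of the surrounding API models, so it serializes as `imagesMapping`. A self-contained pydantic sketch of that behavior (a stand-in model, not the real `ConversationalApiTestCase`):

from typing import Dict, Optional

from pydantic import BaseModel, Field


class ApiCaseSketch(BaseModel):
    # Stand-in for ConversationalApiTestCase's new field.
    images_mapping: Optional[Dict[str, str]] = Field(
        None, alias="imagesMapping"
    )

    model_config = {"populate_by_name": True}


case = ApiCaseSketch(images_mapping={"img-123": "<image payload>"})
print(case.model_dump(by_alias=True))  # {'imagesMapping': {'img-123': ...}}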
deepeval/test_run/test_run.py CHANGED
@@ -1028,8 +1028,13 @@
             LATEST_TEST_RUN_FILE_PATH,
             save_under_key=LATEST_TEST_RUN_DATA_KEY,
         )
+        token_cost = (
+            f"{test_run.evaluation_cost} USD"
+            if test_run.evaluation_cost
+            else "None"
+        )
         console.print(
-            f"\n\n[rgb(5,245,141)]✓[/rgb(5,245,141)] Evaluation completed 🎉! (time taken: {round(runDuration, 2)}s | token cost: {test_run.evaluation_cost} USD)\n"
+            f"\n\n[rgb(5,245,141)]✓[/rgb(5,245,141)] Evaluation completed 🎉! (time taken: {round(runDuration, 2)}s | token cost: {token_cost})\n"
             f"» Test Results ({test_run.test_passed + test_run.test_failed} total tests):\n",
             f"    » Pass Rate: {round((test_run.test_passed / (test_run.test_passed + test_run.test_failed)) * 100, 2)}% | Passed: [bold green]{test_run.test_passed}[/bold green] | Failed: [bold red]{test_run.test_failed}[/bold red]\n\n",
             "=" * 80,
deepeval/utils.py CHANGED
@@ -268,6 +268,32 @@ def set_should_use_cache(yes: bool):
     s.ENABLE_DEEPEVAL_CACHE = yes
 
 
+###################
+# Timeout Helpers #
+###################
+def are_timeouts_disabled() -> bool:
+    return bool(get_settings().DEEPEVAL_DISABLE_TIMEOUTS)
+
+
+def get_per_task_timeout_seconds() -> float:
+    return get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
+
+
+def get_per_task_timeout() -> Optional[float]:
+    return None if are_timeouts_disabled() else get_per_task_timeout_seconds()
+
+
+def get_gather_timeout_seconds() -> float:
+    return (
+        get_per_task_timeout_seconds()
+        + get_settings().DEEPEVAL_TASK_GATHER_BUFFER_SECONDS
+    )
+
+
+def get_gather_timeout() -> Optional[float]:
+    return None if are_timeouts_disabled() else get_gather_timeout_seconds()
+
+
 def login(api_key: str):
     if not api_key or not isinstance(api_key, str):
         raise ValueError("Oh no! Please provide an api key string to login.")
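
These helpers centralize the timeout settings (`DEEPEVAL_DISABLE_TIMEOUTS`, `DEEPEVAL_PER_TASK_TIMEOUT_SECONDS`, `DEEPEVAL_TASK_GATHER_BUFFER_SECONDS`) and return `None` when timeouts are disabled, which `asyncio.wait_for` treats as "no timeout". A hedged sketch of how a caller might consume them; the `score_task` coroutine is illustrative, not from the package:

import asyncio

from deepeval.utils import get_gather_timeout, get_per_task_timeout


async def score_task() -> float:
    # Illustrative stand-in for a single metric evaluation.
    await asyncio.sleep(0.1)
    return 1.0


async def run_all() -> list:
    # Each task gets its own bound; None disables the bound entirely.
    tasks = [
        asyncio.wait_for(score_task(), timeout=get_per_task_timeout())
        for _ in range(3)
    ]
    # The surrounding gather gets a larger budget: per-task + buffer.
    return await asyncio.wait_for(
        asyncio.gather(*tasks), timeout=get_gather_timeout()
    )


print(asyncio.run(run_all()))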
{deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deepeval
-Version: 3.7.5
+Version: 3.7.7
 Summary: The LLM Evaluation Framework
 Home-page: https://github.com/confident-ai/deepeval
 License: Apache-2.0
@@ -100,9 +100,9 @@ Description-Content-Type: text/markdown
   <a href="https://www.readme-i18n.com/confident-ai/deepeval?lang=zh">中文</a>
 </p>
 
-**DeepEval** is a simple-to-use, open-source LLM evaluation framework, for evaluating and testing large-language model systems. It is similar to Pytest but specialized for unit testing LLM outputs. DeepEval incorporates the latest research to evaluate LLM outputs based on metrics such as G-Eval, hallucination, answer relevancy, RAGAS, etc., which uses LLMs and various other NLP models that runs **locally on your machine** for evaluation.
+**DeepEval** is a simple-to-use, open-source LLM evaluation framework, for evaluating and testing large-language model systems. It is similar to Pytest but specialized for unit testing LLM outputs. DeepEval incorporates the latest research to evaluate LLM outputs based on metrics such as G-Eval, task completion, answer relevancy, hallucination, etc., which uses LLM-as-a-judge and other NLP models that runs **locally on your machine** for evaluation.
 
-Whether your LLM applications are RAG pipelines, chatbots, AI agents, implemented via LangChain or LlamaIndex, DeepEval has you covered. With it, you can easily determine the optimal models, prompts, and architecture to improve your RAG pipeline, agentic workflows, prevent prompt drifting, or even transition from OpenAI to hosting your own Deepseek R1 with confidence.
+Whether your LLM applications are AI agents, RAG pipelines, or chatbots, implemented via LangChain or OpenAI, DeepEval has you covered. With it, you can easily determine the optimal models, prompts, and architecture to improve your RAG pipeline, agentic workflows, prevent prompt drifting, or even transition from OpenAI to hosting your own Deepseek R1 with confidence.
 
 > [!IMPORTANT]
 > Need a place for your DeepEval testing data to live 🏡❤️? [Sign up to the DeepEval platform](https://confident-ai.com?utm_source=GitHub) to compare iterations of your LLM app, generate & share testing reports, and more.