deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/main.py +2022 -759
- deepeval/cli/utils.py +208 -36
- deepeval/config/dotenv_handler.py +19 -0
- deepeval/config/settings.py +675 -245
- deepeval/config/utils.py +9 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +106 -21
- deepeval/evaluate/evaluate.py +0 -3
- deepeval/evaluate/execute.py +162 -315
- deepeval/evaluate/utils.py +6 -30
- deepeval/key_handler.py +124 -51
- deepeval/metrics/__init__.py +0 -4
- deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
- deepeval/metrics/answer_relevancy/template.py +102 -179
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +19 -41
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
- deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +14 -0
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +82 -136
- deepeval/metrics/g_eval/g_eval.py +93 -79
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +11 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +72 -43
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
- deepeval/metrics/mcp/schema.py +4 -0
- deepeval/metrics/mcp/template.py +59 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
- deepeval/metrics/tool_use/schema.py +4 -0
- deepeval/metrics/tool_use/template.py +16 -2
- deepeval/metrics/tool_use/tool_use.py +72 -94
- deepeval/metrics/topic_adherence/schema.py +4 -0
- deepeval/metrics/topic_adherence/template.py +21 -1
- deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +3 -3
- deepeval/metrics/turn_contextual_precision/template.py +9 -2
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
- deepeval/metrics/turn_contextual_recall/schema.py +3 -3
- deepeval/metrics/turn_contextual_recall/template.py +8 -1
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
- deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
- deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
- deepeval/metrics/turn_faithfulness/schema.py +1 -1
- deepeval/metrics/turn_faithfulness/template.py +8 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +161 -91
- deepeval/models/__init__.py +2 -0
- deepeval/models/base_model.py +44 -6
- deepeval/models/embedding_models/azure_embedding_model.py +34 -12
- deepeval/models/embedding_models/local_embedding_model.py +22 -7
- deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
- deepeval/models/embedding_models/openai_embedding_model.py +3 -2
- deepeval/models/llms/__init__.py +2 -0
- deepeval/models/llms/amazon_bedrock_model.py +229 -73
- deepeval/models/llms/anthropic_model.py +143 -48
- deepeval/models/llms/azure_model.py +169 -95
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +82 -35
- deepeval/models/llms/gemini_model.py +126 -67
- deepeval/models/llms/grok_model.py +128 -65
- deepeval/models/llms/kimi_model.py +129 -87
- deepeval/models/llms/litellm_model.py +94 -18
- deepeval/models/llms/local_model.py +115 -16
- deepeval/models/llms/ollama_model.py +97 -76
- deepeval/models/llms/openai_model.py +169 -311
- deepeval/models/llms/portkey_model.py +58 -16
- deepeval/models/llms/utils.py +5 -2
- deepeval/models/retry_policy.py +10 -5
- deepeval/models/utils.py +56 -4
- deepeval/simulator/conversation_simulator.py +49 -2
- deepeval/simulator/template.py +16 -1
- deepeval/synthesizer/synthesizer.py +19 -17
- deepeval/test_case/api.py +24 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +55 -6
- deepeval/test_case/llm_test_case.py +60 -6
- deepeval/test_run/api.py +3 -0
- deepeval/test_run/test_run.py +6 -1
- deepeval/utils.py +26 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
deepeval/test_case/arena_test_case.py
CHANGED

@@ -1,7 +1,7 @@
 from typing import List, Dict, Optional, Union
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from pydantic import BaseModel
-
+import re
 from deepeval.test_case import (
     LLMTestCase,
 )
@@ -19,6 +19,7 @@ class Contestant(BaseModel):
 @dataclass
 class ArenaTestCase:
     contestants: List[Contestant]
+    multimodal: bool = field(default=False)

     def __post_init__(self):
         contestant_names = [contestant.name for contestant in self.contestants]
@@ -38,6 +39,10 @@ class ArenaTestCase:
                "All contestants must have the same 'expected_output'."
            )

+        for contestant in self.contestants:
+            if contestant.test_case.multimodal:
+                self.multimodal = True
+

 class Arena:
     test_cases: List[ArenaTestCase]
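For orientation, a hedged usage sketch of the new arena-level flag. Only contestant.name, contestant.test_case, and the propagation loop appear in the diff; the constructor keywords, import paths, and placeholder id below are assumptions:

    from deepeval.test_case import LLMTestCase
    from deepeval.test_case.arena_test_case import ArenaTestCase, Contestant

    # Both contestants get the same input; the diff also enforces matching 'expected_output'.
    shared_input = "Describe [DEEPEVAL:IMAGE:img-1]"  # hypothetical placeholder id
    arena_case = ArenaTestCase(
        contestants=[
            Contestant(name="model_a", test_case=LLMTestCase(input=shared_input, actual_output="A plot.")),
            Contestant(name="model_b", test_case=LLMTestCase(input=shared_input, actual_output="A chart.")),
        ]
    )
    # __post_init__ now flips the arena flag if any contestant's test case is multimodal.
    print(arena_case.multimodal)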
deepeval/test_case/conversational_test_case.py
CHANGED

@@ -1,3 +1,4 @@
+import re
 from pydantic import (
     BaseModel,
     Field,
@@ -17,6 +18,7 @@ from deepeval.test_case.mcp import (
     MCPToolCall,
     validate_mcp_servers,
 )
+from deepeval.test_case.llm_test_case import _MLLM_IMAGE_REGISTRY


 class TurnParams(Enum):
@@ -170,12 +172,28 @@ class ConversationalTestCase(BaseModel):
             return self

         pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
-        self.
-
-
-
-
-
+        if self.scenario:
+            if re.search(pattern, self.scenario) is not None:
+                self.multimodal = True
+                return self
+        if self.expected_outcome:
+            if re.search(pattern, self.expected_outcome) is not None:
+                self.multimodal = True
+                return self
+        if self.user_description:
+            if re.search(pattern, self.user_description) is not None:
+                self.multimodal = True
+                return self
+        if self.turns:
+            for turn in self.turns:
+                if re.search(pattern, turn.content) is not None:
+                    self.multimodal = True
+                    return self
+                if turn.retrieval_context is not None:
+                    self.multimodal = any(
+                        re.search(pattern, context) is not None
+                        for context in turn.retrieval_context
+                    )

         return self
@@ -215,3 +233,34 @@ class ConversationalTestCase(BaseModel):
             data["turns"] = copied_turns

         return data
+
+    def _get_images_mapping(self) -> Dict[str, MLLMImage]:
+        pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
+        image_ids = set()
+
+        def extract_ids_from_string(s: Optional[str]) -> None:
+            """Helper to extract image IDs from a string."""
+            if s is not None and isinstance(s, str):
+                matches = re.findall(pattern, s)
+                image_ids.update(matches)
+
+        def extract_ids_from_list(lst: Optional[List[str]]) -> None:
+            """Helper to extract image IDs from a list of strings."""
+            if lst is not None:
+                for item in lst:
+                    extract_ids_from_string(item)
+
+        extract_ids_from_string(self.scenario)
+        extract_ids_from_string(self.expected_outcome)
+        extract_ids_from_list(self.context)
+        extract_ids_from_string(self.user_description)
+        for turn in self.turns:
+            extract_ids_from_string(turn.content)
+            extract_ids_from_list(turn.retrieval_context)
+
+        images_mapping = {}
+        for img_id in image_ids:
+            if img_id in _MLLM_IMAGE_REGISTRY:
+                images_mapping[img_id] = _MLLM_IMAGE_REGISTRY[img_id]
+
+        return images_mapping if len(images_mapping) > 0 else None
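To make the detection concrete, a standalone sketch of the placeholder scan these validators perform (the turn text and image id are made up):

    import re

    pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
    turn_content = "Here is the chart [DEEPEVAL:IMAGE:img-123] we discussed."

    print(re.search(pattern, turn_content) is not None)  # True -> test case is multimodal
    print(re.findall(pattern, turn_content))             # ['img-123'] -> ids for the registry lookup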
deepeval/test_case/llm_test_case.py
CHANGED

@@ -60,19 +60,34 @@ class MLLMImage:
         if self.local:
             path = self.process_url(self.url)
             self.filename = os.path.basename(path)
-            self.mimeType = (
-
-            )
-
-
-            self.
+            self.mimeType = mimetypes.guess_type(path)[0] or "image/jpeg"
+
+            if not os.path.exists(path):
+                raise FileNotFoundError(f"Image file not found: {path}")
+
+            self._load_base64(path)
         else:
+            if not self.url.startswith(("http://", "https://")):
+                raise ValueError(
+                    f"Invalid remote URL format: {self.url}. URL must start with http:// or https://"
+                )
             self.filename = None
             self.mimeType = None
             self.dataBase64 = None

         _MLLM_IMAGE_REGISTRY[self._id] = self

+    def _load_base64(self, path: str):
+        with open(path, "rb") as f:
+            raw = f.read()
+        self.dataBase64 = base64.b64encode(raw).decode("ascii")
+
+    def ensure_images_loaded(self):
+        if self.local and self.dataBase64 is None:
+            path = self.process_url(self.url)
+            self._load_base64(path)
+        return self
+
     def _placeholder(self) -> str:
         return f"[DEEPEVAL:IMAGE:{self._id}]"
@@ -376,6 +391,16 @@ class LLMTestCase(BaseModel):
            if isinstance(self.input, str)
            else self.multimodal
        )
+        if self.retrieval_context is not None:
+            auto_detect = auto_detect or any(
+                re.search(pattern, context) is not None
+                for context in self.retrieval_context
+            )
+        if self.context is not None:
+            auto_detect = auto_detect or any(
+                re.search(pattern, context) is not None
+                for context in self.context
+            )

         self.multimodal = auto_detect
         return self
@@ -486,3 +511,32 @@ class LLMTestCase(BaseModel):
         )

         return data
+
+    def _get_images_mapping(self) -> Dict[str, MLLMImage]:
+        pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
+        image_ids = set()
+
+        def extract_ids_from_string(s: Optional[str]) -> None:
+            """Helper to extract image IDs from a string."""
+            if s is not None and isinstance(s, str):
+                matches = re.findall(pattern, s)
+                image_ids.update(matches)
+
+        def extract_ids_from_list(lst: Optional[List[str]]) -> None:
+            """Helper to extract image IDs from a list of strings."""
+            if lst is not None:
+                for item in lst:
+                    extract_ids_from_string(item)
+
+        extract_ids_from_string(self.input)
+        extract_ids_from_string(self.actual_output)
+        extract_ids_from_string(self.expected_output)
+        extract_ids_from_list(self.context)
+        extract_ids_from_list(self.retrieval_context)
+
+        images_mapping = {}
+        for img_id in image_ids:
+            if img_id in _MLLM_IMAGE_REGISTRY:
+                images_mapping[img_id] = _MLLM_IMAGE_REGISTRY[img_id]
+
+        return images_mapping if len(images_mapping) > 0 else None
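As a rough illustration of what _load_base64 does for a local image, mirrored outside the class (the path is hypothetical, and like the diff this raises if the file is missing):

    import base64
    import mimetypes
    import os

    path = "/tmp/chart.png"  # hypothetical local image
    mime_type = mimetypes.guess_type(path)[0] or "image/jpeg"  # same fallback as the diff
    if not os.path.exists(path):
        raise FileNotFoundError(f"Image file not found: {path}")
    with open(path, "rb") as f:
        data_base64 = base64.b64encode(f.read()).decode("ascii")

ensure_images_loaded repeats this work only when dataBase64 is still None, so images that were serialized without their payload can be hydrated lazily.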
deepeval/test_run/api.py
CHANGED

@@ -126,6 +126,9 @@ class ConversationalApiTestCase(BaseModel):
     additional_metadata: Optional[Dict] = Field(
         None, alias="additionalMetadata"
     )
+    images_mapping: Optional[Dict[str, MLLMImage]] = Field(
+        None, alias="imagesMapping"
+    )
     tags: Optional[List[str]] = Field(None)

     def update_metric_data(self, metrics_data: MetricData):
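A minimal sketch of how such an aliased pydantic field is populated (str stands in for MLLMImage; the class name here is made up):

    from typing import Dict, Optional
    from pydantic import BaseModel, Field

    class ApiCase(BaseModel):
        images_mapping: Optional[Dict[str, str]] = Field(None, alias="imagesMapping")

    case = ApiCase(imagesMapping={"img-123": "<base64 or url>"})  # validated via the camelCase alias
    print(case.images_mapping)  # {'img-123': '<base64 or url>'}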
deepeval/test_run/test_run.py
CHANGED

@@ -1028,8 +1028,13 @@ class TestRunManager:
             LATEST_TEST_RUN_FILE_PATH,
             save_under_key=LATEST_TEST_RUN_DATA_KEY,
         )
+        token_cost = (
+            f"{test_run.evaluation_cost} USD"
+            if test_run.evaluation_cost
+            else "None"
+        )
         console.print(
-            f"\n\n[rgb(5,245,141)]✓[/rgb(5,245,141)] Evaluation completed 🎉! (time taken: {round(runDuration, 2)}s | token cost: {
+            f"\n\n[rgb(5,245,141)]✓[/rgb(5,245,141)] Evaluation completed 🎉! (time taken: {round(runDuration, 2)}s | token cost: {token_cost})\n"
             f"» Test Results ({test_run.test_passed + test_run.test_failed} total tests):\n",
             f"  » Pass Rate: {round((test_run.test_passed / (test_run.test_passed + test_run.test_failed)) * 100, 2)}% | Passed: [bold green]{test_run.test_passed}[/bold green] | Failed: [bold red]{test_run.test_failed}[/bold red]\n\n",
             "=" * 80,
deepeval/utils.py
CHANGED

@@ -268,6 +268,32 @@ def set_should_use_cache(yes: bool):
     s.ENABLE_DEEPEVAL_CACHE = yes


+###################
+# Timeout Helpers #
+###################
+def are_timeouts_disabled() -> bool:
+    return bool(get_settings().DEEPEVAL_DISABLE_TIMEOUTS)
+
+
+def get_per_task_timeout_seconds() -> float:
+    return get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
+
+
+def get_per_task_timeout() -> Optional[float]:
+    return None if are_timeouts_disabled() else get_per_task_timeout_seconds()
+
+
+def get_gather_timeout_seconds() -> float:
+    return (
+        get_per_task_timeout_seconds()
+        + get_settings().DEEPEVAL_TASK_GATHER_BUFFER_SECONDS
+    )
+
+
+def get_gather_timeout() -> Optional[float]:
+    return None if are_timeouts_disabled() else get_gather_timeout_seconds()
+
+
 def login(api_key: str):
     if not api_key or not isinstance(api_key, str):
         raise ValueError("Oh no! Please provide an api key string to login.")
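One plausible way these helpers are consumed, sketched with asyncio (the task body and settings values are hypothetical; get_gather_timeout() exceeds get_per_task_timeout() by the gather buffer, so per-task timeouts fire first):

    import asyncio
    from deepeval.utils import get_gather_timeout, get_per_task_timeout

    async def evaluate_one(i: int) -> int:
        await asyncio.sleep(0.1)  # stand-in for a metric's LLM call
        return i

    async def evaluate_all() -> list:
        tasks = [
            asyncio.wait_for(evaluate_one(i), timeout=get_per_task_timeout())
            for i in range(5)
        ]
        # A timeout of None (timeouts disabled) means wait indefinitely.
        return await asyncio.wait_for(
            asyncio.gather(*tasks), timeout=get_gather_timeout()
        )

    print(asyncio.run(evaluate_all()))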
{deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deepeval
-Version: 3.7.5
+Version: 3.7.7
 Summary: The LLM Evaluation Framework
 Home-page: https://github.com/confident-ai/deepeval
 License: Apache-2.0
@@ -100,9 +100,9 @@ Description-Content-Type: text/markdown
     <a href="https://www.readme-i18n.com/confident-ai/deepeval?lang=zh">中文</a>
 </p>

-**DeepEval** is a simple-to-use, open-source LLM evaluation framework, for evaluating and testing large-language model systems. It is similar to Pytest but specialized for unit testing LLM outputs. DeepEval incorporates the latest research to evaluate LLM outputs based on metrics such as G-Eval,
+**DeepEval** is a simple-to-use, open-source LLM evaluation framework, for evaluating and testing large-language model systems. It is similar to Pytest but specialized for unit testing LLM outputs. DeepEval incorporates the latest research to evaluate LLM outputs based on metrics such as G-Eval, task completion, answer relevancy, hallucination, etc., which uses LLM-as-a-judge and other NLP models that runs **locally on your machine** for evaluation.

-Whether your LLM applications are RAG pipelines, chatbots,
+Whether your LLM applications are AI agents, RAG pipelines, or chatbots, implemented via LangChain or OpenAI, DeepEval has you covered. With it, you can easily determine the optimal models, prompts, and architecture to improve your RAG pipeline, agentic workflows, prevent prompt drifting, or even transition from OpenAI to hosting your own Deepseek R1 with confidence.

 > [!IMPORTANT]
 > Need a place for your DeepEval testing data to live 🏡❤️? [Sign up to the DeepEval platform](https://confident-ai.com?utm_source=GitHub) to compare iterations of your LLM app, generate & share testing reports, and more.