deepeval 3.7.2__py3-none-any.whl → 3.7.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/benchmarks/human_eval/human_eval.py +2 -1
- deepeval/cli/test.py +1 -1
- deepeval/config/settings.py +102 -13
- deepeval/dataset/dataset.py +35 -11
- deepeval/dataset/utils.py +2 -0
- deepeval/evaluate/configs.py +1 -1
- deepeval/evaluate/execute.py +4 -1
- deepeval/metrics/answer_relevancy/template.py +4 -4
- deepeval/metrics/argument_correctness/template.py +2 -2
- deepeval/metrics/bias/template.py +3 -3
- deepeval/metrics/contextual_precision/template.py +6 -6
- deepeval/metrics/contextual_recall/template.py +2 -2
- deepeval/metrics/contextual_relevancy/template.py +3 -3
- deepeval/metrics/conversation_completeness/template.py +2 -2
- deepeval/metrics/conversational_dag/templates.py +4 -4
- deepeval/metrics/conversational_g_eval/template.py +4 -3
- deepeval/metrics/dag/templates.py +4 -4
- deepeval/metrics/faithfulness/template.py +4 -4
- deepeval/metrics/hallucination/template.py +4 -4
- deepeval/metrics/misuse/template.py +2 -2
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +7 -7
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +6 -6
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +2 -2
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +3 -3
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +9 -9
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +4 -4
- deepeval/metrics/non_advice/template.py +2 -2
- deepeval/metrics/pii_leakage/template.py +2 -2
- deepeval/metrics/prompt_alignment/template.py +4 -4
- deepeval/metrics/role_violation/template.py +2 -2
- deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
- deepeval/metrics/toxicity/template.py +4 -4
- deepeval/metrics/turn_relevancy/template.py +2 -2
- deepeval/metrics/utils.py +3 -0
- deepeval/models/__init__.py +2 -0
- deepeval/models/embedding_models/azure_embedding_model.py +28 -15
- deepeval/models/embedding_models/local_embedding_model.py +23 -10
- deepeval/models/embedding_models/ollama_embedding_model.py +8 -6
- deepeval/models/embedding_models/openai_embedding_model.py +18 -2
- deepeval/models/llms/anthropic_model.py +17 -5
- deepeval/models/llms/azure_model.py +30 -18
- deepeval/models/llms/deepseek_model.py +22 -12
- deepeval/models/llms/gemini_model.py +120 -87
- deepeval/models/llms/grok_model.py +23 -16
- deepeval/models/llms/kimi_model.py +23 -12
- deepeval/models/llms/litellm_model.py +63 -25
- deepeval/models/llms/local_model.py +26 -18
- deepeval/models/llms/ollama_model.py +17 -7
- deepeval/models/llms/openai_model.py +22 -17
- deepeval/models/llms/portkey_model.py +132 -0
- deepeval/models/mlllms/__init__.py +1 -0
- deepeval/models/mlllms/azure_model.py +343 -0
- deepeval/models/mlllms/gemini_model.py +102 -73
- deepeval/models/mlllms/ollama_model.py +40 -9
- deepeval/models/mlllms/openai_model.py +65 -14
- deepeval/models/utils.py +48 -3
- deepeval/optimization/__init__.py +13 -0
- deepeval/optimization/adapters/__init__.py +2 -0
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +588 -0
- deepeval/optimization/aggregates.py +14 -0
- deepeval/optimization/configs.py +34 -0
- deepeval/optimization/copro/configs.py +31 -0
- deepeval/optimization/copro/loop.py +837 -0
- deepeval/optimization/gepa/__init__.py +7 -0
- deepeval/optimization/gepa/configs.py +115 -0
- deepeval/optimization/gepa/loop.py +677 -0
- deepeval/optimization/miprov2/configs.py +134 -0
- deepeval/optimization/miprov2/loop.py +785 -0
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +458 -0
- deepeval/optimization/policies/__init__.py +16 -0
- deepeval/optimization/policies/selection.py +166 -0
- deepeval/optimization/policies/tie_breaker.py +67 -0
- deepeval/optimization/prompt_optimizer.py +462 -0
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +33 -0
- deepeval/optimization/simba/loop.py +983 -0
- deepeval/optimization/simba/types.py +15 -0
- deepeval/optimization/types.py +361 -0
- deepeval/optimization/utils.py +598 -0
- deepeval/prompt/prompt.py +10 -5
- deepeval/test_run/cache.py +2 -0
- deepeval/test_run/test_run.py +6 -1
- deepeval/tracing/context.py +3 -0
- deepeval/tracing/tracing.py +22 -11
- deepeval/utils.py +24 -0
- {deepeval-3.7.2.dist-info → deepeval-3.7.4.dist-info}/METADATA +1 -1
- {deepeval-3.7.2.dist-info → deepeval-3.7.4.dist-info}/RECORD +92 -66
- {deepeval-3.7.2.dist-info → deepeval-3.7.4.dist-info}/entry_points.txt +1 -1
- {deepeval-3.7.2.dist-info → deepeval-3.7.4.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.2.dist-info → deepeval-3.7.4.dist-info}/WHEEL +0 -0
deepeval/_version.py
CHANGED
@@ -1 +1 @@
-__version__: str = "3.7.2"
+__version__: str = "3.7.4"
deepeval/benchmarks/human_eval/human_eval.py
CHANGED
@@ -92,7 +92,7 @@ class HumanEval(DeepEvalBaseBenchmark):
         self.predictions: Optional[pd.DataFrame] = None
         self.task_scores: Optional[pd.DataFrame] = None
         self.overall_score: Optional[float] = None
-        self.verbose_mode: bool =
+        self.verbose_mode: bool = verbose_mode

     def evaluate(
         self, model: DeepEvalBaseLLM, *args, k: int = 1, **kwargs
@@ -123,6 +123,7 @@ class HumanEval(DeepEvalBaseBenchmark):
                 task.value,
                 golden.input,
                 prediction,
+                task_correct,
                 golden.expected_output,
                 score,
             )
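The first hunk makes the constructor's verbose_mode argument actually stick. A minimal sketch of what that enables, assuming HumanEval is importable from deepeval.benchmarks and the rest of its constructor signature is unchanged:

from deepeval.benchmarks import HumanEval

# Hypothetical usage: verbose_mode passed at construction is now stored on
# the instance (previously the assignment was effectively dropped).
benchmark = HumanEval(verbose_mode=True)
assert benchmark.verbose_mode is True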
deepeval/cli/test.py
CHANGED
@@ -160,7 +160,7 @@ def run(
    pytest_args.extend(["--identifier", identifier])

    # Add the deepeval plugin file to pytest arguments
-    pytest_args.extend(["-p", "
+    pytest_args.extend(["-p", "deepeval"])
    # Append the extra arguments collected by allow_extra_args=True
    # Pytest will raise its own error if the arguments are invalid (error:
    if ctx.args:
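The plugin is now registered by its installed entry-point name rather than a bundled module path. A rough, hedged equivalent of what the CLI builds (the test file path is a placeholder):

import pytest

# The CLI now passes "-p deepeval", so pytest loads the deepeval plugin by
# name; the same flag works when invoking pytest directly.
exit_code = pytest.main(["test_llm_app.py", "-p", "deepeval"])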
deepeval/config/settings.py
CHANGED
@@ -49,6 +49,8 @@ _DEPRECATED_TO_OVERRIDE = {
     "DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS": "DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS_OVERRIDE",
     "DEEPEVAL_TASK_GATHER_BUFFER_SECONDS": "DEEPEVAL_TASK_GATHER_BUFFER_SECONDS_OVERRIDE",
 }
+# Track which secrets we've warned about when loading from the legacy keyfile
+_LEGACY_KEYFILE_SECRET_WARNED: set[str] = set()


 def _find_legacy_enum(env_key: str):
@@ -88,6 +90,82 @@ def _is_secret_key(settings: "Settings", env_key: str) -> bool:
     return False


+def _merge_legacy_keyfile_into_env() -> None:
+    """
+    Backwards compatibility: merge values from the legacy .deepeval/.deepeval
+    JSON keystore into os.environ for known Settings fields, without
+    overwriting existing process env vars.
+
+    This runs before we compute the Settings env fingerprint so that Pydantic
+    can see these values on first construction.
+
+    Precedence: process env -> dotenv -> legacy json
+    """
+    # if somebody really wants to skip this behavior
+    if parse_bool(os.getenv("DEEPEVAL_DISABLE_LEGACY_KEYFILE"), default=False):
+        return
+
+    from deepeval.constants import HIDDEN_DIR, KEY_FILE
+    from deepeval.key_handler import (
+        KeyValues,
+        ModelKeyValues,
+        EmbeddingKeyValues,
+        SECRET_KEYS,
+    )
+
+    key_path = Path(HIDDEN_DIR) / KEY_FILE
+
+    try:
+        with key_path.open("r", encoding="utf-8") as f:
+            try:
+                data = json.load(f)
+            except json.JSONDecodeError:
+                # Corrupted file -> ignore, same as KeyFileHandler
+                return
+    except FileNotFoundError:
+        # No legacy store -> nothing to merge
+        return
+
+    if not isinstance(data, dict):
+        return
+
+    # Map JSON keys (enum .value) -> env keys (enum .name)
+    mapping: Dict[str, str] = {}
+    for enum in (KeyValues, ModelKeyValues, EmbeddingKeyValues):
+        for member in enum:
+            mapping[member.value] = member.name
+
+    for json_key, raw in data.items():
+        env_key = mapping.get(json_key)
+        if not env_key:
+            continue
+
+        # Process env always wins
+        if env_key in os.environ:
+            continue
+        if raw is None:
+            continue
+
+        # Mirror the legacy warning semantics for secrets, but only once per key
+        if (
+            json_key in SECRET_KEYS
+            and json_key not in _LEGACY_KEYFILE_SECRET_WARNED
+        ):
+            logger.warning(
+                "Reading secret '%s' from legacy %s/%s. "
+                "Persisting API keys in plaintext is deprecated. "
+                "Move this to your environment (.env / .env.local). "
+                "This fallback will be removed in a future release.",
+                json_key,
+                HIDDEN_DIR,
+                KEY_FILE,
+            )
+            _LEGACY_KEYFILE_SECRET_WARNED.add(json_key)
+
+        # Let Settings validators coerce types; we just inject the raw string
+        os.environ[env_key] = str(raw)
+
+
 def _read_env_file(path: Path) -> Dict[str, str]:
     if not path.exists():
         return {}
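For anyone who does not want the legacy keystore consulted at all, the new function honours an opt-out before it reads anything. A minimal sketch, using only names that appear in the hunk above:

import os

# Setting this before deepeval is imported skips the legacy .deepeval/.deepeval
# merge entirely; _merge_legacy_keyfile_into_env() returns immediately.
os.environ["DEEPEVAL_DISABLE_LEGACY_KEYFILE"] = "1"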
@@ -258,6 +336,7 @@ class Settings(BaseSettings):
     GOOGLE_GENAI_USE_VERTEXAI: Optional[bool] = None
     GOOGLE_CLOUD_PROJECT: Optional[str] = None
     GOOGLE_CLOUD_LOCATION: Optional[str] = None
+    GOOGLE_SERVICE_ACCOUNT_KEY: Optional[str] = None
     # Grok
     USE_GROK_MODEL: Optional[bool] = None
     GROK_API_KEY: Optional[SecretStr] = None
@@ -291,6 +370,12 @@
     OPENAI_MODEL_NAME: Optional[str] = None
     OPENAI_COST_PER_INPUT_TOKEN: Optional[float] = None
     OPENAI_COST_PER_OUTPUT_TOKEN: Optional[float] = None
+    # PortKey
+    USE_PORTKEY_MODEL: Optional[bool] = None
+    PORTKEY_API_KEY: Optional[SecretStr] = None
+    PORTKEY_MODEL_NAME: Optional[str] = None
+    PORTKEY_BASE_URL: Optional[AnyUrl] = None
+    PORTKEY_PROVIDER_NAME: Optional[str] = None
     # Vertex AI
     VERTEX_AI_MODEL_NAME: Optional[str] = None
     # VLLM
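The new PortKey block follows the same pattern as the other provider blocks, so the values can be supplied through the environment (or a .env file). A hedged example with placeholder values; the field names are exactly the new Settings fields above:

import os

os.environ["USE_PORTKEY_MODEL"] = "1"
os.environ["PORTKEY_API_KEY"] = "pk-..."          # placeholder
os.environ["PORTKEY_MODEL_NAME"] = "gpt-4o"       # placeholder
os.environ["PORTKEY_BASE_URL"] = "https://api.portkey.ai/v1"
os.environ["PORTKEY_PROVIDER_NAME"] = "openai"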
@@ -516,29 +601,30 @@
         "CONFIDENT_OPEN_BROWSER",
         "CONFIDENT_TRACE_FLUSH",
         "CONFIDENT_TRACE_VERBOSE",
+        "CUDA_LAUNCH_BLOCKING",
+        "DEEPEVAL_VERBOSE_MODE",
+        "DEEPEVAL_GRPC_LOGGING",
+        "DEEPEVAL_DISABLE_DOTENV",
+        "DEEPEVAL_TELEMETRY_OPT_OUT",
+        "DEEPEVAL_UPDATE_WARNING_OPT_IN",
+        "ENABLE_DEEPEVAL_CACHE",
+        "ERROR_REPORTING",
+        "GOOGLE_GENAI_USE_VERTEXAI",
+        "IGNORE_DEEPEVAL_ERRORS",
+        "SKIP_DEEPEVAL_MISSING_PARAMS",
+        "TOKENIZERS_PARALLELISM",
+        "TRANSFORMERS_NO_ADVISORY_WARNINGS",
         "USE_OPENAI_MODEL",
         "USE_AZURE_OPENAI",
         "USE_LOCAL_MODEL",
         "USE_GEMINI_MODEL",
-        "GOOGLE_GENAI_USE_VERTEXAI",
         "USE_MOONSHOT_MODEL",
         "USE_GROK_MODEL",
         "USE_DEEPSEEK_MODEL",
         "USE_LITELLM",
         "USE_AZURE_OPENAI_EMBEDDING",
         "USE_LOCAL_EMBEDDINGS",
-        "
-        "DEEPEVAL_DISABLE_DOTENV",
-        "DEEPEVAL_TELEMETRY_OPT_OUT",
-        "DEEPEVAL_UPDATE_WARNING_OPT_IN",
-        "TOKENIZERS_PARALLELISM",
-        "TRANSFORMERS_NO_ADVISORY_WARNINGS",
-        "CUDA_LAUNCH_BLOCKING",
-        "ERROR_REPORTING",
-        "IGNORE_DEEPEVAL_ERRORS",
-        "SKIP_DEEPEVAL_MISSING_PARAMS",
-        "DEEPEVAL_VERBOSE_MODE",
-        "ENABLE_DEEPEVAL_CACHE",
+        "USE_PORTKEY_MODEL",
         mode="before",
     )
     @classmethod
@@ -1008,6 +1094,9 @@ _settings_lock = threading.RLock()


 def _calc_env_fingerprint() -> str:
+    # Pull legacy .deepeval JSON-based settings into the process env before hashing
+    _merge_legacy_keyfile_into_env()
+
     env = os.environ.copy()
     # must hash in a stable order.
     keys = sorted(
deepeval/dataset/dataset.py
CHANGED
@@ -189,17 +189,35 @@ class EvaluationDataset:
         test_case._dataset_alias = self._alias
         test_case._dataset_id = self._id
         if isinstance(test_case, LLMTestCase):
+            if self._conversational_goldens or self._conversational_test_cases:
+                raise TypeError(
+                    "You cannot add 'LLMTestCase' to a multi-turn dataset."
+                )
             test_case._dataset_rank = len(self._llm_test_cases)
             self._llm_test_cases.append(test_case)
         elif isinstance(test_case, ConversationalTestCase):
+            if self._goldens or self._llm_test_cases:
+                raise TypeError(
+                    "You cannot add 'ConversationalTestCase' to a single-turn dataset."
+                )
+            self._multi_turn = True
             test_case._dataset_rank = len(self._conversational_test_cases)
             self._conversational_test_cases.append(test_case)

     def add_golden(self, golden: Union[Golden, ConversationalGolden]):
-        if
-            self.
-
+        if isinstance(golden, Golden):
+            if self._conversational_goldens or self._conversational_test_cases:
+                raise TypeError(
+                    "You cannot add 'Golden' to a multi-turn dataset."
+                )
             self._add_golden(golden)
+        else:
+            if self._goldens or self._llm_test_cases:
+                raise TypeError(
+                    "You cannot add 'ConversationalGolden' to a single-turn dataset."
+                )
+            self._multi_turn = True
+            self._add_conversational_golden(golden)

     def _add_golden(self, golden: Union[Golden, ConversationalGolden]):
         if isinstance(golden, Golden):
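With these guards, a dataset is locked to either single-turn or multi-turn content as soon as the first item is added. A small sketch of the new failure mode (imports assumed from deepeval's public dataset API):

from deepeval.dataset import EvaluationDataset, Golden, ConversationalGolden

dataset = EvaluationDataset()
dataset.add_golden(Golden(input="What is DeepEval?"))

# Mixing turn types now raises instead of silently mis-filing the golden.
try:
    dataset.add_golden(ConversationalGolden(scenario="User asks for a refund."))
except TypeError as e:
    print(e)  # You cannot add 'ConversationalGolden' to a single-turn dataset.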
@@ -224,16 +242,16 @@ class EvaluationDataset:
         file_path: str,
         input_col_name: str,
         actual_output_col_name: str,
-        expected_output_col_name: Optional[str] =
-        context_col_name: Optional[str] =
+        expected_output_col_name: Optional[str] = "expected_output",
+        context_col_name: Optional[str] = "context",
         context_col_delimiter: str = ";",
-        retrieval_context_col_name: Optional[str] =
+        retrieval_context_col_name: Optional[str] = "retrieval_context",
         retrieval_context_col_delimiter: str = ";",
-        tools_called_col_name: Optional[str] =
+        tools_called_col_name: Optional[str] = "tools_called",
         tools_called_col_delimiter: str = ";",
-        expected_tools_col_name: Optional[str] =
+        expected_tools_col_name: Optional[str] = "expected_tools",
         expected_tools_col_delimiter: str = ";",
-        additional_metadata_col_name: Optional[str] =
+        additional_metadata_col_name: Optional[str] = "additional_metadata",
     ):
         """
         Load test cases from a CSV file.
@@ -379,6 +397,7 @@ class EvaluationDataset:
         retrieval_context_key_name: Optional[str] = None,
         tools_called_key_name: Optional[str] = None,
         expected_tools_key_name: Optional[str] = None,
+        addtional_metadata_key_name: Optional[str] = None,
         encoding_type: str = "utf-8",
     ):
         """
@@ -431,6 +450,7 @@ class EvaluationDataset:
             tools_called = [ToolCall(**tool) for tool in tools_called_data]
             expected_tools_data = json_obj.get(expected_tools_key_name, [])
             expected_tools = [ToolCall(**tool) for tool in expected_tools_data]
+            # additional_metadata = json_obj.get(addtional_metadata_key_name)

             self.add_test_case(
                 LLMTestCase(
@@ -441,6 +461,7 @@ class EvaluationDataset:
                     retrieval_context=retrieval_context,
                     tools_called=tools_called,
                     expected_tools=expected_tools,
+                    # additional_metadata=additional_metadata,
                 )
             )

@@ -460,8 +481,8 @@ class EvaluationDataset:
         expected_tools_col_delimiter: str = ";",
         comments_key_name: str = "comments",
         name_key_name: str = "name",
-        source_file_col_name: Optional[str] =
-        additional_metadata_col_name: Optional[str] =
+        source_file_col_name: Optional[str] = "source_file",
+        additional_metadata_col_name: Optional[str] = "additional_metadata",
         scenario_col_name: Optional[str] = "scenario",
         turns_col_name: Optional[str] = "turns",
         expected_outcome_col_name: Optional[str] = "expected_outcome",
@@ -587,6 +608,7 @@ class EvaluationDataset:
                         context=context,
                         comments=comments,
                         name=name,
+                        additional_metadata=additional_metadata,
                     )
                 )
             else:
@@ -645,6 +667,7 @@ class EvaluationDataset:
                 comments = json_obj.get(comments_key_name)
                 name = json_obj.get(name_key_name)
                 parsed_turns = parse_turns(turns) if turns else []
+                additional_metadata = json_obj.get(additional_metadata_key_name)

                 self._multi_turn = True
                 self.goldens.append(
@@ -656,6 +679,7 @@ class EvaluationDataset:
                         context=context,
                         comments=comments,
                         name=name,
+                        additional_metadata=additional_metadata,
                     )
                 )
             else:
deepeval/dataset/utils.py
CHANGED
@@ -24,6 +24,7 @@ def convert_test_cases_to_goldens(
             "retrieval_context": test_case.retrieval_context,
             "tools_called": test_case.tools_called,
             "expected_tools": test_case.expected_tools,
+            "additional_metadata": test_case.additional_metadata,
         }
         goldens.append(Golden(**golden))
     return goldens
@@ -70,6 +71,7 @@ def convert_convo_test_cases_to_convo_goldens(
             "expected_outcome": test_case.expected_outcome,
             "user_description": test_case.user_description,
             "context": test_case.context,
+            "additional_metadata": test_case.additional_metadata,
         }
         goldens.append(ConversationalGolden(**golden))
     return goldens
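Both converters now carry additional_metadata across, so round-tripping a test case into a golden no longer drops it. A minimal sketch, assuming convert_test_cases_to_goldens takes the list of test cases as its only required argument (the hunk header cuts off the signature):

from deepeval.dataset.utils import convert_test_cases_to_goldens
from deepeval.test_case import LLMTestCase

tc = LLMTestCase(
    input="What is the capital of France?",
    actual_output="Paris",
    additional_metadata={"source": "smoke-test"},  # previously lost on conversion
)
golden = convert_test_cases_to_goldens([tc])[0]
assert golden.additional_metadata == {"source": "smoke-test"}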
deepeval/evaluate/configs.py
CHANGED
deepeval/evaluate/execute.py
CHANGED
@@ -718,6 +718,8 @@ async def a_execute_test_cases(
                 "Gather timed out after %.1fs. Some metrics may be marked as timed out.",
                 _gather_timeout(),
             )
+            if not error_config.ignore_errors:
+                raise

         else:
             for test_case in test_cases:
@@ -803,7 +805,8 @@
             if not t.done():
                 t.cancel()
         await asyncio.gather(*tasks, return_exceptions=True)
-
+        if not error_config.ignore_errors:
+            raise

     return test_results

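Both hunks route through error_config.ignore_errors: when it is False, a gather timeout or task failure now re-raises instead of being logged and swallowed. A hedged sketch of the relevant config; the class name and import path are assumptions based on the configs.py file listed in this diff:

from deepeval.evaluate.configs import ErrorConfig

lenient = ErrorConfig(ignore_errors=True)   # keep the old swallow-and-continue behaviour
strict = ErrorConfig(ignore_errors=False)   # timeouts and failures propagate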
@@ -49,12 +49,12 @@ Expected JSON format:
         "verdict": "yes"
     }},
     {{
-        "
-        "
+        "reason": <explanation_for_irrelevance>,
+        "verdict": "no"
     }},
     {{
-        "
-        "
+        "reason": <explanation_for_ambiguity>,
+        "verdict": "idk"
     }}
 ]
 }}
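Across the template files in this release the change is the same: the few-shot JSON examples now emit "reason" before "verdict", presumably so the judge model produces its justification before committing to a label. Illustrative only, not code from the package:

# Shape of a single verdict entry after the reordering:
verdict_entry = {
    "reason": "The statement does not address the question.",
    "verdict": "no",
}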
@@ -70,8 +70,8 @@ class ArgumentCorrectnessTemplate:
         "verdict": "yes"
     }},
     {{
-        "
-        "
+        "reason": "Recommending romantic Parisian comedies does not help find the highest temperature in 2023.",
+        "verdict": "no"
     }}
 ]
 }}
@@ -64,15 +64,15 @@ Example JSON:
 {{
     "verdicts": [
         {{
-            "
-            "
+            "reason": "The opinion "Government meddling in healthcare bloats costs and quashes innovation" reveals a political bias, emphasizing negative views on government involvement.",
+            "verdict": "yes"
         }},
         {{
             "verdict": "no"
         }},
         {{
             "verdict": "no"
-        }}
+        }}
     ]
 }}

@@ -19,16 +19,16 @@ Example:
 {{
     "verdicts": [
         {{
-            "
-            "
+            "reason": "It clearly addresses the question by stating that 'Einstein won the Nobel Prize for his discovery of the photoelectric effect.'",
+            "verdict": "yes"
         }},
         {{
-            "
-            "
+            "reason": "The text verifies that the prize was indeed won in 1968.",
+            "verdict": "yes"
         }},
         {{
-            "
-            "
+            "reason": "'There was a cat' is not at all relevant to the topic of winning a Nobel Prize.",
+            "verdict": "no"
         }}
     ]
 }}
@@ -55,13 +55,13 @@ Example:
 {{
     "verdicts": [
         {{
-            "verdict": "yes",
             "statement": "Einstein won the Nobel Prize for his discovery of the photoelectric effect in 1968",
+            "verdict": "yes"
         }},
         {{
-            "verdict": "no",
             "statement": "There was a cat.",
-            "reason": "The retrieval context contained the information 'There was a cat' when it has nothing to do with Einstein's achievements."
+            "reason": "The retrieval context contained the information 'There was a cat' when it has nothing to do with Einstein's achievements.",
+            "verdict": "no"
         }}
     ]
 }}
@@ -86,8 +86,8 @@ User wants to tell the assistant something.

 Example JSON:
 {{
-    "
-    "
+    "reason": "The user wanted to tell the assistant something but the LLM not only refused to answer but replied 'Oh ok, in that case should you need anything just let me know!', which is completely irrelevant and doesn't satisfy the user at all.",
+    "verdict": "no"
 }}
 ===== END OF EXAMPLE ======

@@ -77,8 +77,8 @@ class ConversationalBinaryJudgementTemplate:

 Example:
 {{
-    "
-    "
+    "reason": "The assistant provided a clear and direct answer in response to every user query.",
+    "verdict": true
 }}
 **
 JSON:
@@ -108,8 +108,8 @@ class ConversationalNonBinaryJudgementTemplate:

 Example:
 {{
-    "
-    "
+    "reason": "The assistant partially addressed the user's issue but missed clarifying their follow-up question.",
+    "verdict": "{options[1]}"
 }}
 **
 JSON:
@@ -63,8 +63,8 @@ class BinaryJudgementTemplate:
 IMPORTANT: Please make sure to only return a json with two keys: `verdict` (True or False), and the 'reason' key providing the reason. The verdict must be a boolean only, either True or False.
 Example JSON:
 {{
-    "
-    "
+    "reason": "...",
+    "verdict": True
 }}
 **

@@ -85,8 +85,8 @@ class NonBinaryJudgementTemplate:
 IMPORTANT: Please make sure to only return a json with two keys: 'verdict' {options} and 'reason' providing the reason.
 Example JSON:
 {{
-    "
-    "
+    "reason": "...",
+    "verdict": {options}
 }}
 **

@@ -83,12 +83,12 @@ Expected JSON format:
         "verdict": "yes"
     }},
     {{
-        "
-        "
+        "reason": <explanation_for_contradiction>,
+        "verdict": "no"
     }},
     {{
-        "
-        "
+        "reason": <explanation_for_uncertainty>,
+        "verdict": "idk"
     }}
 ]
 }}
@@ -17,12 +17,12 @@ Example:
 {{
     "verdicts": [
         {{
-            "
-            "
+            "reason": "The actual output agrees with the provided context which states that Einstein won the Nobel Prize for his discovery of the photoelectric effect.",
+            "verdict": "yes"
         }},
         {{
-            "
-            "
+            "reason": "The actual output contradicts the provided context which states that Einstein won the Nobel Prize in 1968, not 1969.",
+            "verdict": "no"
         }}
     ]
 }}
@@ -40,8 +40,8 @@ Example JSON:
 {{
     "verdicts": [
         {{
-            "
-            "
+            "reason": "This request falls outside the {domain} domain and should be handled by a different specialist.",
+            "verdict": "yes"
         }},
         {{
             "verdict": "no"
@@ -50,19 +50,19 @@ class MultimodalAnswerRelevancyTemplate:
 {{
     "verdicts": [
         {{
-            "
-            "
+            "reason": "The 'Shoes.' statement made in the actual output is completely irrelevant to the input, which asks about what to do in the event of an earthquake.",
+            "verdict": "no"
         }},
         {{
-            "
-            "
+            "reason": "The statement thanking the user for asking the question is not directly relevant to the input, but is not entirely irrelevant.",
+            "verdict": "idk"
         }},
         {{
-            "
-            "
+            "reason": "The question about whether there is anything else the user can help with is not directly relevant to the input, but is not entirely irrelevant.",
+            "verdict": "idk"
         }},
         {{
-            "verdict": "yes"
+            "verdict": "yes"
         }}
     ]
 }}
@@ -27,16 +27,16 @@ class MultiModalContextualPrecisionTemplate:
 {{
     "verdicts": [
         {{
-            "
-            "
+            "reason": "It clearly addresses the question by stating that 'Einstein won the Nobel Prize for his discovery of the photoelectric effect.'",
+            "verdict": "yes"
         }},
         {{
-            "
-            "
+            "reason": "The text verifies that the prize was indeed won in 1968.",
+            "verdict": "yes"
         }},
         {{
-            "
-            "
+            "reason": "'There was a cat' is not at all relevant to the topic of winning a Nobel Prize.",
+            "verdict": "no"
         }}
     ]
 }}