deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/main.py +2022 -759
- deepeval/cli/utils.py +208 -36
- deepeval/config/dotenv_handler.py +19 -0
- deepeval/config/settings.py +675 -245
- deepeval/config/utils.py +9 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +106 -21
- deepeval/evaluate/evaluate.py +0 -3
- deepeval/evaluate/execute.py +162 -315
- deepeval/evaluate/utils.py +6 -30
- deepeval/key_handler.py +124 -51
- deepeval/metrics/__init__.py +0 -4
- deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
- deepeval/metrics/answer_relevancy/template.py +102 -179
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +19 -41
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
- deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +14 -0
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +82 -136
- deepeval/metrics/g_eval/g_eval.py +93 -79
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +11 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +72 -43
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
- deepeval/metrics/mcp/schema.py +4 -0
- deepeval/metrics/mcp/template.py +59 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
- deepeval/metrics/tool_use/schema.py +4 -0
- deepeval/metrics/tool_use/template.py +16 -2
- deepeval/metrics/tool_use/tool_use.py +72 -94
- deepeval/metrics/topic_adherence/schema.py +4 -0
- deepeval/metrics/topic_adherence/template.py +21 -1
- deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +3 -3
- deepeval/metrics/turn_contextual_precision/template.py +9 -2
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
- deepeval/metrics/turn_contextual_recall/schema.py +3 -3
- deepeval/metrics/turn_contextual_recall/template.py +8 -1
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
- deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
- deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
- deepeval/metrics/turn_faithfulness/schema.py +1 -1
- deepeval/metrics/turn_faithfulness/template.py +8 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +161 -91
- deepeval/models/__init__.py +2 -0
- deepeval/models/base_model.py +44 -6
- deepeval/models/embedding_models/azure_embedding_model.py +34 -12
- deepeval/models/embedding_models/local_embedding_model.py +22 -7
- deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
- deepeval/models/embedding_models/openai_embedding_model.py +3 -2
- deepeval/models/llms/__init__.py +2 -0
- deepeval/models/llms/amazon_bedrock_model.py +229 -73
- deepeval/models/llms/anthropic_model.py +143 -48
- deepeval/models/llms/azure_model.py +169 -95
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +82 -35
- deepeval/models/llms/gemini_model.py +126 -67
- deepeval/models/llms/grok_model.py +128 -65
- deepeval/models/llms/kimi_model.py +129 -87
- deepeval/models/llms/litellm_model.py +94 -18
- deepeval/models/llms/local_model.py +115 -16
- deepeval/models/llms/ollama_model.py +97 -76
- deepeval/models/llms/openai_model.py +169 -311
- deepeval/models/llms/portkey_model.py +58 -16
- deepeval/models/llms/utils.py +5 -2
- deepeval/models/retry_policy.py +10 -5
- deepeval/models/utils.py +56 -4
- deepeval/simulator/conversation_simulator.py +49 -2
- deepeval/simulator/template.py +16 -1
- deepeval/synthesizer/synthesizer.py +19 -17
- deepeval/test_case/api.py +24 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +55 -6
- deepeval/test_case/llm_test_case.py +60 -6
- deepeval/test_run/api.py +3 -0
- deepeval/test_run/test_run.py +6 -1
- deepeval/utils.py +26 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
deepeval/metrics/g_eval/g_eval.py
CHANGED

@@ -15,6 +15,8 @@ from deepeval.metrics.utils import (
     trimAndLoadJson,
     initialize_model,
     check_llm_test_case_params,
+    generate_with_schema_and_extract,
+    a_generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator

@@ -82,7 +84,19 @@ class GEval(BaseMetric):
         _log_metric_to_confident: bool = True,
         _additional_context: Optional[str] = None,
     ) -> float:
-
+
+        multimodal = test_case.multimodal
+
+        check_llm_test_case_params(
+            test_case,
+            self.evaluation_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )
+
         self.evaluation_cost = 0 if self.using_native_model else None

         with metric_progress_indicator(

@@ -96,18 +110,25 @@ class GEval(BaseMetric):
                 _in_component=_in_component,
                 _additional_context=_additional_context,
             )
+            settings = get_settings()
             loop.run_until_complete(
                 asyncio.wait_for(
                     coro,
-                    timeout=
+                    timeout=(
+                        None
+                        if settings.DEEPEVAL_DISABLE_TIMEOUTS
+                        else settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
+                    ),
                 )
             )
         else:
             self.evaluation_steps: List[str] = (
-                self._generate_evaluation_steps()
+                self._generate_evaluation_steps(multimodal)
             )
             g_score, reason = self._evaluate(
-                test_case,
+                test_case,
+                _additional_context=_additional_context,
+                multimodal=multimodal,
             )
             self.score = (
                 (float(g_score) - self.score_range[0])

@@ -143,7 +164,18 @@ class GEval(BaseMetric):
         _log_metric_to_confident: bool = True,
         _additional_context: Optional[str] = None,
     ) -> float:
-
+
+        multimodal = test_case.multimodal
+
+        check_llm_test_case_params(
+            test_case,
+            self.evaluation_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(

@@ -153,10 +185,12 @@ class GEval(BaseMetric):
             _in_component=_in_component,
         ):
             self.evaluation_steps: List[str] = (
-                await self._a_generate_evaluation_steps()
+                await self._a_generate_evaluation_steps(multimodal)
            )
             g_score, reason = await self._a_evaluate(
-                test_case,
+                test_case,
+                _additional_context=_additional_context,
+                multimodal=multimodal,
             )
             self.score = (
                 (float(g_score) - self.score_range[0]) / self.score_range_span

@@ -182,7 +216,7 @@ class GEval(BaseMetric):
             )
             return self.score

-    async def _a_generate_evaluation_steps(self) -> List[str]:
+    async def _a_generate_evaluation_steps(self, multimodal: bool) -> List[str]:
         if self.evaluation_steps:
             return self.evaluation_steps

@@ -190,25 +224,19 @@ class GEval(BaseMetric):
             self.evaluation_params
         )
         prompt = self.evaluation_template.generate_evaluation_steps(
-            criteria=self.criteria,
+            criteria=self.criteria,
+            parameters=g_eval_params_str,
+            multimodal=multimodal,
+        )
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=gschema.Steps,
+            extract_schema=lambda s: s.steps,
+            extract_json=lambda d: d["steps"],
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt)
-            self.evaluation_cost += cost
-            data = trimAndLoadJson(res, self)
-            return data["steps"]
-        else:
-            try:
-                res: gschema.Steps = await self.model.a_generate(
-                    prompt, schema=gschema.Steps
-                )
-                return res.steps
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["steps"]

-    def _generate_evaluation_steps(self) -> List[str]:
+    def _generate_evaluation_steps(self, multimodal: bool) -> List[str]:
         if self.evaluation_steps:
             return self.evaluation_steps

@@ -216,26 +244,23 @@ class GEval(BaseMetric):
             self.evaluation_params
         )
         prompt = self.evaluation_template.generate_evaluation_steps(
-            criteria=self.criteria,
+            criteria=self.criteria,
+            parameters=g_eval_params_str,
+            multimodal=multimodal,
+        )
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=gschema.Steps,
+            extract_schema=lambda s: s.steps,
+            extract_json=lambda d: d["steps"],
        )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt)
-            self.evaluation_cost += cost
-            data = trimAndLoadJson(res, self)
-            return data["steps"]
-        else:
-            try:
-                res: gschema.Steps = self.model.generate(
-                    prompt, schema=gschema.Steps
-                )
-                return res.steps
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["steps"]

     async def _a_evaluate(
-        self,
+        self,
+        test_case: LLMTestCase,
+        multimodal: bool,
+        _additional_context: Optional[str] = None,
     ) -> Tuple[Union[int, float], str]:
         test_case_content = construct_test_case_string(
             self.evaluation_params, test_case

@@ -252,6 +277,7 @@ class GEval(BaseMetric):
                 rubric=rubric_str,
                 score_range=self.score_range,
                 _additional_context=_additional_context,
+                multimodal=multimodal,
             )
         else:
             prompt = (

@@ -262,6 +288,7 @@ class GEval(BaseMetric):
                     test_case_content=test_case_content,
                     parameters=g_eval_params_str,
                     _additional_context=_additional_context,
+                    multimodal=multimodal,
                 )
             )
         try:

@@ -275,8 +302,7 @@ class GEval(BaseMetric):
                 prompt, top_logprobs=self.top_logprobs
             )

-
-            self.evaluation_cost += cost
+            self._accrue_cost(cost)

             data = trimAndLoadJson(res.choices[0].message.content, self)


@@ -292,27 +318,21 @@ class GEval(BaseMetric):
             return weighted_summed_score, reason
         except (KeyError, AttributeError, TypeError, ValueError):
             return score, reason
-        except
-
-
-
-
-
-
-
-
-            try:
-                res: gschema.ReasonScore = await self.model.a_generate(
-                    prompt, schema=gschema.ReasonScore
-                )
-                return res.score, res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reason"]
+        except AttributeError:
+            # This catches the case where a_generate_raw_response doesn't exist.
+            return await a_generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=gschema.ReasonScore,
+                extract_schema=lambda s: (s.score, s.reason),
+                extract_json=lambda d: (d["score"], d["reason"]),
+            )

     def _evaluate(
-        self,
+        self,
+        test_case: LLMTestCase,
+        multimodal: bool,
+        _additional_context: Optional[str] = None,
     ) -> Tuple[Union[int, float], str]:
         test_case_content = construct_test_case_string(
             self.evaluation_params, test_case

@@ -330,6 +350,7 @@ class GEval(BaseMetric):
                 rubric=rubric_str,
                 score_range=self.score_range,
                 _additional_context=_additional_context,
+                multimodal=multimodal,
             )
         else:
             prompt = (

@@ -340,6 +361,7 @@ class GEval(BaseMetric):
                     test_case_content=test_case_content,
                     parameters=g_eval_params_str,
                     _additional_context=_additional_context,
+                    multimodal=multimodal,
                 )
             )


@@ -351,7 +373,7 @@ class GEval(BaseMetric):
             res, cost = self.model.generate_raw_response(
                 prompt, top_logprobs=self.top_logprobs
             )
-            self.
+            self._accrue_cost(cost)
             data = trimAndLoadJson(res.choices[0].message.content, self)

             reason = data["reason"]

@@ -368,21 +390,13 @@ class GEval(BaseMetric):
             return score, reason
         except AttributeError:
             # This catches the case where a_generate_raw_response doesn't exist.
-
-
-
-
-
-
-
-                res: gschema.ReasonScore = self.model.generate(
-                    prompt, schema=gschema.ReasonScore
-                )
-                return res.score, res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reason"]
+            return generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=gschema.ReasonScore,
+                extract_schema=lambda s: (s.score, s.reason),
+                extract_json=lambda d: (d["score"], d["reason"]),
+            )

     def is_successful(self) -> bool:
         if self.error is not None:
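Every block removed above follows the same shape: ask the judge model for a Pydantic schema, and fall back to raw generation plus trimAndLoadJson when the model object does not accept a schema keyword. The new generate_with_schema_and_extract / a_generate_with_schema_and_extract helpers imported from deepeval.metrics.utils consolidate that shape. Below is a minimal sketch inferred from the call sites and from the removed branches, not the helper's actual implementation; details such as whether the native-model path requests a schema vary across the removed call sites.

# Illustrative sketch only - not the code shipped in deepeval/metrics/utils.py.
from typing import Any, Callable, Type

from pydantic import BaseModel

from deepeval.metrics.utils import trimAndLoadJson


def generate_with_schema_and_extract(
    metric: Any,
    prompt: str,
    schema_cls: Type[BaseModel],
    extract_schema: Callable[[BaseModel], Any],
    extract_json: Callable[[dict], Any],
) -> Any:
    if metric.using_native_model:
        # Native models return (result, cost); the cost is accrued on the metric.
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost
        return extract_schema(res)
    try:
        # Custom models that accept a `schema` kwarg return a parsed object.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Models without schema support: plain generation plus JSON parsing.
        res = metric.model.generate(prompt)
        data = trimAndLoadJson(res, metric)
        return extract_json(data)

The async variant presumably mirrors this with await metric.model.a_generate(...).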
deepeval/metrics/g_eval/template.py
CHANGED

@@ -3,11 +3,23 @@ import textwrap


 class GEvalTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
+
     @staticmethod
-    def generate_evaluation_steps(
+    def generate_evaluation_steps(
+        parameters: str, criteria: str, multimodal: bool = False
+    ):
         return textwrap.dedent(
             f"""Given an evaluation criteria which outlines how you should judge the {parameters}, generate 3-4 concise evaluation steps based on the criteria below. You MUST make it clear how to evaluate {parameters} in relation to one another.

+            {GEvalTemplate.multimodal_rules if multimodal else ""}
+
             Evaluation Criteria:
             {criteria}

@@ -31,6 +43,7 @@ class GEvalTemplate:
         rubric: Optional[str] = None,
         score_range: Tuple[int, int] = (0, 10),
         _additional_context: Optional[str] = None,
+        multimodal: bool = False,
     ):
         rubric_text = f"Rubric:\n{rubric}\n" if rubric else ""
         dependencies = (

@@ -62,6 +75,7 @@ class GEvalTemplate:
             - {reasoning_expectation}
             - Mention key details from the test case parameters.
             - Be concise, clear, and focused on the evaluation logic.
+            {GEvalTemplate.multimodal_rules if multimodal else ""}

             Only return valid JSON. Do **not** include any extra commentary or text.

@@ -95,6 +109,7 @@ class GEvalTemplate:
         test_case_content: str,
         parameters: str,
         _additional_context: Optional[str] = None,
+        multimodal: bool = False,
     ):
         additional_context = (
             f"\n\nAdditional Context:\n{_additional_context}\n"

@@ -104,6 +119,8 @@ class GEvalTemplate:
         return textwrap.dedent(
             f"""Given the evaluation steps, return a JSON with two keys: 1) a `score` key that is STRICTLY EITHER 1 (follows the criteria 100% outlined in the evaluation steps), OR 0 (does not follow the criteria), and 2) a `reason` key, a reason for the given score, but DO NOT QUOTE THE SCORE in your reason. Please mention specific information from {parameters} in your reason, but be very concise with it!

+            {GEvalTemplate.multimodal_rules if multimodal else ""}
+
             Evaluation Steps:
             {evaluation_steps}

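Because generate_evaluation_steps is a staticmethod and multimodal defaults to False, existing single-modality callers keep their behaviour; the rules block is only spliced into the prompt when a caller opts in. A quick illustration, with made-up parameters and criteria strings:

from deepeval.metrics.g_eval.template import GEvalTemplate

# Argument values are illustrative only.
prompt = GEvalTemplate.generate_evaluation_steps(
    parameters="input and actual output",
    criteria="Check that the actual output describes only what is visible in the attached image.",
    multimodal=True,  # splices the MULTIMODAL INPUT RULES block into the prompt
)
print(prompt)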
deepeval/metrics/g_eval/utils.py
CHANGED

@@ -9,8 +9,8 @@ from deepeval.test_case import (
     LLMTestCase,
     ToolCall,
 )
-from deepeval.models.llms.openai_model import unsupported_log_probs_gpt_models
 from pydantic import BaseModel, field_validator
+from deepeval.models.llms.constants import OPENAI_MODELS_DATA

 from deepeval.test_case.conversational_test_case import ConversationalTestCase


@@ -114,16 +114,17 @@ def format_rubrics(rubrics: Optional[List[Rubric]]) -> Optional[str]:

 def no_log_prob_support(model: Union[str, DeepEvalBaseLLM]):

-    if isinstance(model, str)
-
+    if isinstance(model, str):
+        model_data = OPENAI_MODELS_DATA.get(model)
+        if not model_data.supports_log_probs:
+            return True
     elif (
-        isinstance(model, GPTModel)
-        and model.get_model_name() in unsupported_log_probs_gpt_models
+        isinstance(model, GPTModel) and not model.model_data.supports_log_probs
     ):
         return True
     elif (
         isinstance(model, AzureOpenAIModel)
-        and model.
+        and not model.model_data.supports_log_probs
     ):
         return True

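no_log_prob_support now keys off a per-model capability table (OPENAI_MODELS_DATA) instead of a hard-coded list of model names. Note that OPENAI_MODELS_DATA.get(model) returns None for names missing from the table, which is why the sketch below guards the lookup; the model name used here is illustrative.

from deepeval.models.llms.constants import OPENAI_MODELS_DATA
from deepeval.metrics.g_eval.utils import no_log_prob_support

MODEL_NAME = "gpt-4o"  # illustrative key; the real keys live in constants.py

model_data = OPENAI_MODELS_DATA.get(MODEL_NAME)
if model_data is not None:
    print(model_data.supports_log_probs)    # capability flag from the table
    print(no_log_prob_support(MODEL_NAME))  # True when log probs are unsupported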
deepeval/metrics/goal_accuracy/goal_accuracy.py
CHANGED

@@ -3,11 +3,12 @@ import asyncio
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     get_unit_interactions,
     print_tools_called,
     check_conversational_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
 from deepeval.metrics import BaseConversationalMetric

@@ -55,8 +56,14 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ):
+        multimodal = test_case.multimodal
         check_conversational_test_case_params(
-            test_case,
+            test_case,
+            self._required_test_case_params,
+            self,
+            None,
+            self.model,
+            multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None

@@ -80,17 +87,21 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         )
         goal_scores = [
             self._get_goal_accuracy_score(
-                task.user_goal, task.steps_taken
+                task.user_goal, task.steps_taken, multimodal
             )
             for task in goal_and_steps_taken
         ]
         plan_scores = [
-            self._get_plan_scores(
+            self._get_plan_scores(
+                task.user_goal, task.steps_taken, multimodal
+            )
             for task in goal_and_steps_taken
         ]
         self.score = self._calculate_score(goal_scores, plan_scores)
         self.success = self.score >= self.threshold
-        self.reason = self._generate_reason(
+        self.reason = self._generate_reason(
+            goal_scores, plan_scores, multimodal
+        )

         self.verbose_logs = construct_verbose_logs(
             self,

@@ -117,8 +128,14 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ):
+        multimodal = test_case.multimodal
         check_conversational_test_case_params(
-            test_case,
+            test_case,
+            self._required_test_case_params,
+            self,
+            None,
+            self.model,
+            multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None

@@ -134,21 +151,23 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         goal_scores = await asyncio.gather(
             *[
                 self._a_get_goal_accuracy_score(
-                    task.user_goal, task.steps_taken
+                    task.user_goal, task.steps_taken, multimodal
                 )
                 for task in goal_and_steps_taken
             ]
         )
         plan_scores = await asyncio.gather(
             *[
-                self._a_get_plan_scores(
+                self._a_get_plan_scores(
+                    task.user_goal, task.steps_taken, multimodal
+                )
                 for task in goal_and_steps_taken
             ]
         )
         self.score = self._calculate_score(goal_scores, plan_scores)
         self.success = self.score >= self.threshold
         self.reason = await self._a_generate_reason(
-            goal_scores, plan_scores
+            goal_scores, plan_scores, multimodal
         )

         self.verbose_logs = construct_verbose_logs(

@@ -191,41 +210,31 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         goal_and_steps_taken.append(new_goal_steps)
         return goal_and_steps_taken

-    def _get_plan_scores(self, user_goal, steps_taken):
+    def _get_plan_scores(self, user_goal, steps_taken, multimodal: bool):
         prompt = GoalAccuracyTemplate.get_plan_evaluation_score(
-            user_goal, "\n".join(steps_taken)
+            user_goal, "\n".join(steps_taken), multimodal
+        )
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=PlanScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: PlanScore(**data),
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=PlanScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: PlanScore = self.model.generate(prompt, schema=PlanScore)
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return PlanScore(**data)

-    async def _a_get_plan_scores(
+    async def _a_get_plan_scores(
+        self, user_goal, steps_taken, multimodal: bool
+    ):
         prompt = GoalAccuracyTemplate.get_plan_evaluation_score(
-            user_goal, "\n".join(steps_taken)
+            user_goal, "\n".join(steps_taken), multimodal
+        )
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=PlanScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: PlanScore(**data),
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=PlanScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: PlanScore = await self.model.a_generate(
-                    prompt, schema=PlanScore
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return PlanScore(**data)

     def _calculate_score(
         self, goal_scores: List[GoalScore], plan_scores: List[PlanScore]

@@ -240,7 +249,10 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         return 0 if self.strict_mode and score < self.threshold else score

     def _generate_reason(
-        self,
+        self,
+        goal_scores: List[GoalScore],
+        plan_scores: List[PlanScore],
+        multimodal: bool,
     ):
         goal_evaluations = ""
         for goal_score in goal_scores:

@@ -254,18 +266,25 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         )

         prompt = GoalAccuracyTemplate.get_final_reason(
-            self.score,
+            self.score,
+            self.threshold,
+            goal_evaluations,
+            plan_evalautions,
+            multimodal,
         )
         if self.using_native_model:
             res, cost = self.model.generate(prompt)
-            self.
+            self._accrue_cost(cost)
             return res
         else:
             res = self.model.generate(prompt)
             return res

     async def _a_generate_reason(
-        self,
+        self,
+        goal_scores: List[GoalScore],
+        plan_scores: List[PlanScore],
+        multimodal: bool,
     ):
         goal_evaluations = ""
         for goal_score in goal_scores:

@@ -279,51 +298,47 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         )

         prompt = GoalAccuracyTemplate.get_final_reason(
-            self.score,
+            self.score,
+            self.threshold,
+            goal_evaluations,
+            plan_evalautions,
+            multimodal,
         )
         if self.using_native_model:
             res, cost = await self.model.a_generate(prompt)
-            self.
+            self._accrue_cost(cost)
             return res
         else:
             res = await self.model.a_generate(prompt)
             return res

-    def _get_goal_accuracy_score(
+    def _get_goal_accuracy_score(
+        self, user_goal, steps_taken, multimodal: bool
+    ):
         prompt = GoalAccuracyTemplate.get_accuracy_score(
-            user_goal, "\n".join(steps_taken)
+            user_goal, "\n".join(steps_taken), multimodal
+        )
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=GoalScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: GoalScore(**data),
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=GoalScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: GoalScore = self.model.generate(prompt, schema=GoalScore)
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return GoalScore(**data)

-    async def _a_get_goal_accuracy_score(
+    async def _a_get_goal_accuracy_score(
+        self, user_goal, steps_taken, multimodal: bool
+    ):
         prompt = GoalAccuracyTemplate.get_accuracy_score(
-            user_goal, "\n".join(steps_taken)
+            user_goal, "\n".join(steps_taken), multimodal
+        )
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=GoalScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: GoalScore(**data),
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=GoalScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: GoalScore = await self.model.a_generate(
-                    prompt, schema=GoalScore
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return GoalScore(**data)

     def print_goals_and_steps_taken(self, goals_and_steps):
         final_goals_and_steps = ""

@@ -340,7 +355,7 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
             return self.success

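As in GEval, the multimodal flag is read once from the test case and threaded through every prompt and helper call. A minimal usage sketch follows, assuming default constructor arguments and a toy two-turn conversation; the exact turns and fields a real GoalAccuracyMetric test case needs are not shown in this diff.

from deepeval.test_case import ConversationalTestCase, Turn
from deepeval.metrics.goal_accuracy.goal_accuracy import GoalAccuracyMetric

# Toy conversation for illustration only.
test_case = ConversationalTestCase(
    turns=[
        Turn(role="user", content="Book me a table for two tomorrow at 7pm."),
        Turn(role="assistant", content="Done, a table for two is booked for tomorrow at 7pm."),
    ],
)

metric = GoalAccuracyMetric()  # default threshold and evaluation model assumed
metric.measure(test_case)      # multimodal behaviour is driven by test_case.multimodal
print(metric.score, metric.reason)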