deepeval 3.7.5__py3-none-any.whl → 3.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +106 -21
- deepeval/evaluate/evaluate.py +0 -3
- deepeval/evaluate/execute.py +10 -222
- deepeval/evaluate/utils.py +6 -30
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +0 -4
- deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
- deepeval/metrics/answer_relevancy/template.py +102 -179
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -41
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
- deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +14 -0
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +82 -136
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +3 -6
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +3 -3
- deepeval/metrics/turn_contextual_precision/template.py +1 -1
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +110 -68
- deepeval/metrics/turn_contextual_recall/schema.py +3 -3
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +104 -61
- deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +106 -65
- deepeval/metrics/turn_faithfulness/schema.py +1 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +104 -73
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +145 -90
- deepeval/models/base_model.py +44 -6
- deepeval/models/embedding_models/azure_embedding_model.py +34 -12
- deepeval/models/embedding_models/local_embedding_model.py +22 -7
- deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
- deepeval/models/embedding_models/openai_embedding_model.py +3 -2
- deepeval/models/llms/amazon_bedrock_model.py +226 -71
- deepeval/models/llms/anthropic_model.py +141 -47
- deepeval/models/llms/azure_model.py +167 -94
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +79 -29
- deepeval/models/llms/gemini_model.py +126 -67
- deepeval/models/llms/grok_model.py +125 -59
- deepeval/models/llms/kimi_model.py +126 -81
- deepeval/models/llms/litellm_model.py +92 -18
- deepeval/models/llms/local_model.py +114 -15
- deepeval/models/llms/ollama_model.py +97 -76
- deepeval/models/llms/openai_model.py +167 -310
- deepeval/models/llms/portkey_model.py +58 -16
- deepeval/models/llms/utils.py +5 -2
- deepeval/models/utils.py +60 -4
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/api.py +24 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +55 -6
- deepeval/test_case/llm_test_case.py +60 -6
- deepeval/test_run/api.py +3 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -1
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/RECORD +128 -132
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/metrics/g_eval/g_eval.py
CHANGED

@@ -15,6 +15,8 @@ from deepeval.metrics.utils import (
     trimAndLoadJson,
     initialize_model,
     check_llm_test_case_params,
+    generate_with_schema_and_extract,
+    a_generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
@@ -82,7 +84,19 @@ class GEval(BaseMetric):
         _log_metric_to_confident: bool = True,
         _additional_context: Optional[str] = None,
     ) -> float:
-
+
+        multimodal = test_case.multimodal
+
+        check_llm_test_case_params(
+            test_case,
+            self.evaluation_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )
+
         self.evaluation_cost = 0 if self.using_native_model else None

         with metric_progress_indicator(
@@ -104,10 +118,12 @@ class GEval(BaseMetric):
                 )
             else:
                 self.evaluation_steps: List[str] = (
-                    self._generate_evaluation_steps()
+                    self._generate_evaluation_steps(multimodal)
                 )
                 g_score, reason = self._evaluate(
-                    test_case,
+                    test_case,
+                    _additional_context=_additional_context,
+                    multimodal=multimodal,
                 )
                 self.score = (
                     (float(g_score) - self.score_range[0])
@@ -143,7 +159,18 @@ class GEval(BaseMetric):
         _log_metric_to_confident: bool = True,
         _additional_context: Optional[str] = None,
     ) -> float:
-
+
+        multimodal = test_case.multimodal
+
+        check_llm_test_case_params(
+            test_case,
+            self.evaluation_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -153,10 +180,12 @@ class GEval(BaseMetric):
             _in_component=_in_component,
         ):
             self.evaluation_steps: List[str] = (
-                await self._a_generate_evaluation_steps()
+                await self._a_generate_evaluation_steps(multimodal)
             )
             g_score, reason = await self._a_evaluate(
-                test_case,
+                test_case,
+                _additional_context=_additional_context,
+                multimodal=multimodal,
             )
             self.score = (
                 (float(g_score) - self.score_range[0]) / self.score_range_span
@@ -182,7 +211,7 @@ class GEval(BaseMetric):
             )
         return self.score

-    async def _a_generate_evaluation_steps(self) -> List[str]:
+    async def _a_generate_evaluation_steps(self, multimodal: bool) -> List[str]:
         if self.evaluation_steps:
             return self.evaluation_steps

@@ -190,25 +219,19 @@ class GEval(BaseMetric):
             self.evaluation_params
         )
         prompt = self.evaluation_template.generate_evaluation_steps(
-            criteria=self.criteria,
+            criteria=self.criteria,
+            parameters=g_eval_params_str,
+            multimodal=multimodal,
+        )
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=gschema.Steps,
+            extract_schema=lambda s: s.steps,
+            extract_json=lambda d: d["steps"],
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt)
-            self.evaluation_cost += cost
-            data = trimAndLoadJson(res, self)
-            return data["steps"]
-        else:
-            try:
-                res: gschema.Steps = await self.model.a_generate(
-                    prompt, schema=gschema.Steps
-                )
-                return res.steps
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["steps"]

-    def _generate_evaluation_steps(self) -> List[str]:
+    def _generate_evaluation_steps(self, multimodal: bool) -> List[str]:
         if self.evaluation_steps:
             return self.evaluation_steps

@@ -216,26 +239,23 @@ class GEval(BaseMetric):
             self.evaluation_params
         )
         prompt = self.evaluation_template.generate_evaluation_steps(
-            criteria=self.criteria,
+            criteria=self.criteria,
+            parameters=g_eval_params_str,
+            multimodal=multimodal,
+        )
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=gschema.Steps,
+            extract_schema=lambda s: s.steps,
+            extract_json=lambda d: d["steps"],
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt)
-            self.evaluation_cost += cost
-            data = trimAndLoadJson(res, self)
-            return data["steps"]
-        else:
-            try:
-                res: gschema.Steps = self.model.generate(
-                    prompt, schema=gschema.Steps
-                )
-                return res.steps
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["steps"]

     async def _a_evaluate(
-        self,
+        self,
+        test_case: LLMTestCase,
+        multimodal: bool,
+        _additional_context: Optional[str] = None,
     ) -> Tuple[Union[int, float], str]:
         test_case_content = construct_test_case_string(
             self.evaluation_params, test_case
@@ -252,6 +272,7 @@ class GEval(BaseMetric):
                 rubric=rubric_str,
                 score_range=self.score_range,
                 _additional_context=_additional_context,
+                multimodal=multimodal,
             )
         else:
             prompt = (
@@ -262,6 +283,7 @@ class GEval(BaseMetric):
                     test_case_content=test_case_content,
                     parameters=g_eval_params_str,
                     _additional_context=_additional_context,
+                    multimodal=multimodal,
                 )
             )
         try:
@@ -275,8 +297,7 @@ class GEval(BaseMetric):
                 prompt, top_logprobs=self.top_logprobs
             )

-
-            self.evaluation_cost += cost
+            self._accrue_cost(cost)

             data = trimAndLoadJson(res.choices[0].message.content, self)

@@ -292,27 +313,21 @@ class GEval(BaseMetric):
                 return weighted_summed_score, reason
             except (KeyError, AttributeError, TypeError, ValueError):
                 return score, reason
-        except
-
-
-
-
-
-
-
-
-            try:
-                res: gschema.ReasonScore = await self.model.a_generate(
-                    prompt, schema=gschema.ReasonScore
-                )
-                return res.score, res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reason"]
+        except AttributeError:
+            # This catches the case where a_generate_raw_response doesn't exist.
+            return await a_generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=gschema.ReasonScore,
+                extract_schema=lambda s: (s.score, s.reason),
+                extract_json=lambda d: (d["score"], d["reason"]),
+            )

     def _evaluate(
-        self,
+        self,
+        test_case: LLMTestCase,
+        multimodal: bool,
+        _additional_context: Optional[str] = None,
     ) -> Tuple[Union[int, float], str]:
         test_case_content = construct_test_case_string(
             self.evaluation_params, test_case
@@ -330,6 +345,7 @@ class GEval(BaseMetric):
                 rubric=rubric_str,
                 score_range=self.score_range,
                 _additional_context=_additional_context,
+                multimodal=multimodal,
             )
         else:
             prompt = (
@@ -340,6 +356,7 @@ class GEval(BaseMetric):
                     test_case_content=test_case_content,
                     parameters=g_eval_params_str,
                     _additional_context=_additional_context,
+                    multimodal=multimodal,
                 )
             )

@@ -351,7 +368,7 @@ class GEval(BaseMetric):
             res, cost = self.model.generate_raw_response(
                 prompt, top_logprobs=self.top_logprobs
             )
-            self.
+            self._accrue_cost(cost)
             data = trimAndLoadJson(res.choices[0].message.content, self)

             reason = data["reason"]
@@ -368,21 +385,13 @@ class GEval(BaseMetric):
             return score, reason
         except AttributeError:
             # This catches the case where a_generate_raw_response doesn't exist.
-
-
-
-
-
-
-
-                res: gschema.ReasonScore = self.model.generate(
-                    prompt, schema=gschema.ReasonScore
-                )
-                return res.score, res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reason"]
+            return generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=gschema.ReasonScore,
+                extract_schema=lambda s: (s.score, s.reason),
+                extract_json=lambda d: (d["score"], d["reason"]),
+            )

     def is_successful(self) -> bool:
         if self.error is not None:
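Most of the churn in the metric modules above follows one pattern: the repeated native-model / schema / trimAndLoadJson fallback blocks are replaced by two shared helpers, generate_with_schema_and_extract and a_generate_with_schema_and_extract, imported from deepeval.metrics.utils. The helper bodies are not part of this diff; the snippet below is only a minimal sketch of what the removed inline logic suggests the synchronous variant does (the _sketch suffix marks it as an illustration, not the shipped implementation, and the exact native-model behaviour may differ per metric).

# Hypothetical sketch only: the real helper lives in deepeval/metrics/utils.py
# and is not shown in this diff. Its call signature (metric, prompt, schema_cls,
# extract_schema, extract_json) is taken from the call sites above; the body
# restates the inline fallback logic that the 3.7.6 refactor removes.
from deepeval.metrics.utils import trimAndLoadJson


def generate_with_schema_and_extract_sketch(
    metric, prompt, schema_cls, extract_schema, extract_json
):
    if metric.using_native_model:
        # Native judge models return (response, cost); the cost is accrued on
        # the metric (3.7.6 also introduces an _accrue_cost helper for this).
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost
        return extract_schema(res)
    try:
        # Custom models that accept a schema return a parsed pydantic object.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Models without schema support: fall back to raw generation plus
        # lenient JSON trimming/loading.
        res = metric.model.generate(prompt)
        data = trimAndLoadJson(res, metric)
        return extract_json(data)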
deepeval/metrics/g_eval/template.py
CHANGED

@@ -3,11 +3,23 @@ import textwrap


 class GEvalTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
+
     @staticmethod
-    def generate_evaluation_steps(
+    def generate_evaluation_steps(
+        parameters: str, criteria: str, multimodal: bool = False
+    ):
         return textwrap.dedent(
             f"""Given an evaluation criteria which outlines how you should judge the {parameters}, generate 3-4 concise evaluation steps based on the criteria below. You MUST make it clear how to evaluate {parameters} in relation to one another.

+            {GEvalTemplate.multimodal_rules if multimodal else ""}
+
             Evaluation Criteria:
             {criteria}

@@ -31,6 +43,7 @@ class GEvalTemplate:
         rubric: Optional[str] = None,
         score_range: Tuple[int, int] = (0, 10),
         _additional_context: Optional[str] = None,
+        multimodal: bool = False,
     ):
         rubric_text = f"Rubric:\n{rubric}\n" if rubric else ""
         dependencies = (
@@ -62,6 +75,7 @@ class GEvalTemplate:
            - {reasoning_expectation}
            - Mention key details from the test case parameters.
            - Be concise, clear, and focused on the evaluation logic.
+            {GEvalTemplate.multimodal_rules if multimodal else ""}

            Only return valid JSON. Do **not** include any extra commentary or text.

@@ -95,6 +109,7 @@ class GEvalTemplate:
         test_case_content: str,
         parameters: str,
         _additional_context: Optional[str] = None,
+        multimodal: bool = False,
     ):
         additional_context = (
             f"\n\nAdditional Context:\n{_additional_context}\n"
@@ -104,6 +119,8 @@ class GEvalTemplate:
         return textwrap.dedent(
             f"""Given the evaluation steps, return a JSON with two keys: 1) a `score` key that is STRICTLY EITHER 1 (follows the criteria 100% outlined in the evaluation steps), OR 0 (does not follow the criteria), and 2) a `reason` key, a reason for the given score, but DO NOT QUOTE THE SCORE in your reason. Please mention specific information from {parameters} in your reason, but be very concise with it!

+            {GEvalTemplate.multimodal_rules if multimodal else ""}
+
            Evaluation Steps:
            {evaluation_steps}

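The other recurring change is a multimodal flag threaded from the test case down into each prompt template. For GEvalTemplate, passing multimodal=True splices the new multimodal_rules block into the generated prompt; with the default multimodal=False the prompt text is unchanged. A small illustrative sketch (the parameters and criteria strings here are made up for the example):

from deepeval.metrics.g_eval.template import GEvalTemplate

# Default behaviour: no multimodal rules are injected.
text_prompt = GEvalTemplate.generate_evaluation_steps(
    parameters="input and actual output",
    criteria="Is the actual output factually correct given the input?",
)

# With multimodal=True the "--- MULTIMODAL INPUT RULES ---" block is spliced
# in between the task description and the evaluation criteria.
mm_prompt = GEvalTemplate.generate_evaluation_steps(
    parameters="input and actual output",
    criteria="Is the actual output factually correct given the input?",
    multimodal=True,
)
assert "--- MULTIMODAL INPUT RULES ---" in mm_prompt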
deepeval/metrics/g_eval/utils.py
CHANGED

@@ -9,8 +9,8 @@ from deepeval.test_case import (
     LLMTestCase,
     ToolCall,
 )
-from deepeval.models.llms.openai_model import unsupported_log_probs_gpt_models
 from pydantic import BaseModel, field_validator
+from deepeval.models.llms.constants import OPENAI_MODELS_DATA

 from deepeval.test_case.conversational_test_case import ConversationalTestCase

@@ -114,16 +114,17 @@ def format_rubrics(rubrics: Optional[List[Rubric]]) -> Optional[str]:

 def no_log_prob_support(model: Union[str, DeepEvalBaseLLM]):

-    if isinstance(model, str)
-
+    if isinstance(model, str):
+        model_data = OPENAI_MODELS_DATA.get(model)
+        if not model_data.supports_log_probs:
+            return True
     elif (
-        isinstance(model, GPTModel)
-        and model.get_model_name() in unsupported_log_probs_gpt_models
+        isinstance(model, GPTModel) and not model.model_data.supports_log_probs
     ):
         return True
     elif (
         isinstance(model, AzureOpenAIModel)
-        and model.
+        and not model.model_data.supports_log_probs
     ):
         return True

deepeval/metrics/goal_accuracy/goal_accuracy.py
CHANGED

@@ -3,11 +3,12 @@ import asyncio
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     get_unit_interactions,
     print_tools_called,
     check_conversational_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
 from deepeval.metrics import BaseConversationalMetric
@@ -55,8 +56,14 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ):
+        multimodal = test_case.multimodal
         check_conversational_test_case_params(
-            test_case,
+            test_case,
+            self._required_test_case_params,
+            self,
+            None,
+            self.model,
+            multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -80,17 +87,21 @@ class GoalAccuracyMetric(BaseConversationalMetric):
                 )
                 goal_scores = [
                     self._get_goal_accuracy_score(
-                        task.user_goal, task.steps_taken
+                        task.user_goal, task.steps_taken, multimodal
                     )
                     for task in goal_and_steps_taken
                 ]
                 plan_scores = [
-                    self._get_plan_scores(
+                    self._get_plan_scores(
+                        task.user_goal, task.steps_taken, multimodal
+                    )
                     for task in goal_and_steps_taken
                 ]
                 self.score = self._calculate_score(goal_scores, plan_scores)
                 self.success = self.score >= self.threshold
-                self.reason = self._generate_reason(
+                self.reason = self._generate_reason(
+                    goal_scores, plan_scores, multimodal
+                )

                 self.verbose_logs = construct_verbose_logs(
                     self,
@@ -117,8 +128,14 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ):
+        multimodal = test_case.multimodal
         check_conversational_test_case_params(
-            test_case,
+            test_case,
+            self._required_test_case_params,
+            self,
+            None,
+            self.model,
+            multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -134,21 +151,23 @@ class GoalAccuracyMetric(BaseConversationalMetric):
             goal_scores = await asyncio.gather(
                 *[
                     self._a_get_goal_accuracy_score(
-                        task.user_goal, task.steps_taken
+                        task.user_goal, task.steps_taken, multimodal
                     )
                     for task in goal_and_steps_taken
                 ]
             )
             plan_scores = await asyncio.gather(
                 *[
-                    self._a_get_plan_scores(
+                    self._a_get_plan_scores(
+                        task.user_goal, task.steps_taken, multimodal
+                    )
                     for task in goal_and_steps_taken
                 ]
             )
             self.score = self._calculate_score(goal_scores, plan_scores)
             self.success = self.score >= self.threshold
             self.reason = await self._a_generate_reason(
-                goal_scores, plan_scores
+                goal_scores, plan_scores, multimodal
             )

             self.verbose_logs = construct_verbose_logs(
@@ -191,41 +210,31 @@ class GoalAccuracyMetric(BaseConversationalMetric):
             goal_and_steps_taken.append(new_goal_steps)
         return goal_and_steps_taken

-    def _get_plan_scores(self, user_goal, steps_taken):
+    def _get_plan_scores(self, user_goal, steps_taken, multimodal: bool):
         prompt = GoalAccuracyTemplate.get_plan_evaluation_score(
-            user_goal, "\n".join(steps_taken)
+            user_goal, "\n".join(steps_taken), multimodal
+        )
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=PlanScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: PlanScore(**data),
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=PlanScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: PlanScore = self.model.generate(prompt, schema=PlanScore)
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return PlanScore(**data)

-    async def _a_get_plan_scores(
+    async def _a_get_plan_scores(
+        self, user_goal, steps_taken, multimodal: bool
+    ):
         prompt = GoalAccuracyTemplate.get_plan_evaluation_score(
-            user_goal, "\n".join(steps_taken)
+            user_goal, "\n".join(steps_taken), multimodal
+        )
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=PlanScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: PlanScore(**data),
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=PlanScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: PlanScore = await self.model.a_generate(
-                    prompt, schema=PlanScore
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return PlanScore(**data)

     def _calculate_score(
         self, goal_scores: List[GoalScore], plan_scores: List[PlanScore]
@@ -240,7 +249,10 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         return 0 if self.strict_mode and score < self.threshold else score

     def _generate_reason(
-        self,
+        self,
+        goal_scores: List[GoalScore],
+        plan_scores: List[PlanScore],
+        multimodal: bool,
     ):
         goal_evaluations = ""
         for goal_score in goal_scores:
@@ -254,18 +266,25 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         )

         prompt = GoalAccuracyTemplate.get_final_reason(
-            self.score,
+            self.score,
+            self.threshold,
+            goal_evaluations,
+            plan_evalautions,
+            multimodal,
         )
         if self.using_native_model:
             res, cost = self.model.generate(prompt)
-            self.
+            self._accrue_cost(cost)
             return res
         else:
             res = self.model.generate(prompt)
             return res

     async def _a_generate_reason(
-        self,
+        self,
+        goal_scores: List[GoalScore],
+        plan_scores: List[PlanScore],
+        multimodal: bool,
     ):
         goal_evaluations = ""
         for goal_score in goal_scores:
@@ -279,51 +298,47 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         )

         prompt = GoalAccuracyTemplate.get_final_reason(
-            self.score,
+            self.score,
+            self.threshold,
+            goal_evaluations,
+            plan_evalautions,
+            multimodal,
         )
         if self.using_native_model:
             res, cost = await self.model.a_generate(prompt)
-            self.
+            self._accrue_cost(cost)
             return res
         else:
             res = await self.model.a_generate(prompt)
             return res

-    def _get_goal_accuracy_score(
+    def _get_goal_accuracy_score(
+        self, user_goal, steps_taken, multimodal: bool
+    ):
         prompt = GoalAccuracyTemplate.get_accuracy_score(
-            user_goal, "\n".join(steps_taken)
+            user_goal, "\n".join(steps_taken), multimodal
+        )
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=GoalScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: GoalScore(**data),
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=GoalScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: GoalScore = self.model.generate(prompt, schema=GoalScore)
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return GoalScore(**data)

-    async def _a_get_goal_accuracy_score(
+    async def _a_get_goal_accuracy_score(
+        self, user_goal, steps_taken, multimodal: bool
+    ):
         prompt = GoalAccuracyTemplate.get_accuracy_score(
-            user_goal, "\n".join(steps_taken)
+            user_goal, "\n".join(steps_taken), multimodal
+        )
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=GoalScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: GoalScore(**data),
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=GoalScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: GoalScore = await self.model.a_generate(
-                    prompt, schema=GoalScore
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return GoalScore(**data)

     def print_goals_and_steps_taken(self, goals_and_steps):
         final_goals_and_steps = ""
@@ -340,7 +355,7 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
