deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/main.py +2022 -759
- deepeval/cli/utils.py +208 -36
- deepeval/config/dotenv_handler.py +19 -0
- deepeval/config/settings.py +675 -245
- deepeval/config/utils.py +9 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +106 -21
- deepeval/evaluate/evaluate.py +0 -3
- deepeval/evaluate/execute.py +162 -315
- deepeval/evaluate/utils.py +6 -30
- deepeval/key_handler.py +124 -51
- deepeval/metrics/__init__.py +0 -4
- deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
- deepeval/metrics/answer_relevancy/template.py +102 -179
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +19 -41
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
- deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +14 -0
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +82 -136
- deepeval/metrics/g_eval/g_eval.py +93 -79
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +11 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +72 -43
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
- deepeval/metrics/mcp/schema.py +4 -0
- deepeval/metrics/mcp/template.py +59 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
- deepeval/metrics/tool_use/schema.py +4 -0
- deepeval/metrics/tool_use/template.py +16 -2
- deepeval/metrics/tool_use/tool_use.py +72 -94
- deepeval/metrics/topic_adherence/schema.py +4 -0
- deepeval/metrics/topic_adherence/template.py +21 -1
- deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +3 -3
- deepeval/metrics/turn_contextual_precision/template.py +9 -2
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
- deepeval/metrics/turn_contextual_recall/schema.py +3 -3
- deepeval/metrics/turn_contextual_recall/template.py +8 -1
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
- deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
- deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
- deepeval/metrics/turn_faithfulness/schema.py +1 -1
- deepeval/metrics/turn_faithfulness/template.py +8 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +161 -91
- deepeval/models/__init__.py +2 -0
- deepeval/models/base_model.py +44 -6
- deepeval/models/embedding_models/azure_embedding_model.py +34 -12
- deepeval/models/embedding_models/local_embedding_model.py +22 -7
- deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
- deepeval/models/embedding_models/openai_embedding_model.py +3 -2
- deepeval/models/llms/__init__.py +2 -0
- deepeval/models/llms/amazon_bedrock_model.py +229 -73
- deepeval/models/llms/anthropic_model.py +143 -48
- deepeval/models/llms/azure_model.py +169 -95
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +82 -35
- deepeval/models/llms/gemini_model.py +126 -67
- deepeval/models/llms/grok_model.py +128 -65
- deepeval/models/llms/kimi_model.py +129 -87
- deepeval/models/llms/litellm_model.py +94 -18
- deepeval/models/llms/local_model.py +115 -16
- deepeval/models/llms/ollama_model.py +97 -76
- deepeval/models/llms/openai_model.py +169 -311
- deepeval/models/llms/portkey_model.py +58 -16
- deepeval/models/llms/utils.py +5 -2
- deepeval/models/retry_policy.py +10 -5
- deepeval/models/utils.py +56 -4
- deepeval/simulator/conversation_simulator.py +49 -2
- deepeval/simulator/template.py +16 -1
- deepeval/synthesizer/synthesizer.py +19 -17
- deepeval/test_case/api.py +24 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +55 -6
- deepeval/test_case/llm_test_case.py +60 -6
- deepeval/test_run/api.py +3 -0
- deepeval/test_run/test_run.py +6 -1
- deepeval/utils.py +26 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
deepeval/metrics/dag/nodes.py
CHANGED
@@ -18,7 +18,11 @@ from deepeval.metrics.dag.templates import (
 from deepeval.metrics.base_metric import BaseMetric
 from deepeval.metrics.g_eval.g_eval import GEval
 from deepeval.metrics.g_eval.utils import G_EVAL_PARAMS
-from deepeval.metrics.utils import
+from deepeval.metrics.utils import (
+    copy_metrics,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
+)
 from deepeval.test_case import LLMTestCase, LLMTestCaseParams, ToolCall
 from deepeval.utils import prettify_list
 

@@ -222,20 +226,13 @@ class VerdictNode(BaseNode):
             score=metric.score,
             name=metric.__name__,
         )
-        if metric.using_native_model:
-            res, cost = metric.model.generate(prompt, schema=MetricScoreReason)
-            metric.evaluation_cost += cost
-        else:
-            try:
-                res: MetricScoreReason = metric.model.generate(
-                    prompt, schema=MetricScoreReason
-                )
-            except TypeError:
-                res = metric.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                res = MetricScoreReason(**data)
-
-        return res.reason
+        return generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=MetricScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_reason(self, metric: BaseMetric):
         prompt = VerdictNodeTemplate.generate_reason(

@@ -243,22 +240,13 @@ class VerdictNode(BaseNode):
             score=metric.score,
             name=metric.__name__,
         )
-        if metric.using_native_model:
-            res, cost = await metric.model.a_generate(
-                prompt, schema=MetricScoreReason
-            )
-            metric.evaluation_cost += cost
-        else:
-            try:
-                res: MetricScoreReason = await metric.model.a_generate(
-                    prompt, schema=MetricScoreReason
-                )
-            except TypeError:
-                res = await metric.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                res = MetricScoreReason(**data)
-
-        return res.reason
+        return await a_generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=MetricScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
 
 @dataclass

@@ -317,20 +305,13 @@ class TaskNode(BaseNode):
             instructions=self.instructions,
             text=text,
         )
-        if metric.using_native_model:
-            res, cost = metric.model.generate(prompt, schema=TaskNodeOutput)
-            metric.evaluation_cost += cost
-            self._output = res.output
-        else:
-            try:
-                res: TaskNodeOutput = metric.model.generate(
-                    prompt, schema=TaskNodeOutput
-                )
-                self._output = res.output
-            except TypeError:
-                res = metric.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                self._output = TaskNodeOutput(**data).output
+        self._output = generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=TaskNodeOutput,
+            extract_schema=lambda s: s.output,
+            extract_json=lambda data: data["output"],
+        )
 
         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)

@@ -371,22 +352,13 @@
             text=text,
         )
 
-        if metric.using_native_model:
-            res, cost = await metric.model.a_generate(
-                prompt, schema=TaskNodeOutput
-            )
-            metric.evaluation_cost += cost
-            self._output = res.output
-        else:
-            try:
-                res: TaskNodeOutput = await metric.model.a_generate(
-                    prompt, schema=TaskNodeOutput
-                )
-                self._output = res.output
-            except TypeError:
-                res = await metric.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                self._output = TaskNodeOutput(**data).output
+        self._output = await a_generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=TaskNodeOutput,
+            extract_schema=lambda s: s.output,
+            extract_json=lambda data: data["output"],
+        )
 
         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)

@@ -470,23 +442,13 @@ class BinaryJudgementNode(BaseNode):
             criteria=self.criteria,
             text=text,
         )
-        if metric.using_native_model:
-            res, cost = metric.model.generate(
-                prompt, schema=BinaryJudgementVerdict
-            )
-            metric.evaluation_cost += cost
-            self._verdict = res
-        else:
-            try:
-                res: BinaryJudgementVerdict = metric.model.generate(
-                    prompt, schema=BinaryJudgementVerdict
-                )
-                self._verdict = res
-            except TypeError:
-                res = metric.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                self._verdict = BinaryJudgementVerdict(**data)
-
+        self._verdict = generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=BinaryJudgementVerdict,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: BinaryJudgementVerdict(**data),
+        )
         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)
         )

@@ -520,22 +482,13 @@ class BinaryJudgementNode(BaseNode):
             criteria=self.criteria,
             text=text,
         )
-        if metric.using_native_model:
-            res, cost = await metric.model.a_generate(
-                prompt, schema=BinaryJudgementVerdict
-            )
-            metric.evaluation_cost += cost
-            self._verdict = res
-        else:
-            try:
-                res: BinaryJudgementVerdict = await metric.model.a_generate(
-                    prompt, schema=BinaryJudgementVerdict
-                )
-                self._verdict = res
-            except TypeError:
-                res = await metric.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                self._verdict = BinaryJudgementVerdict(**data)
+        self._verdict = await a_generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=BinaryJudgementVerdict,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: BinaryJudgementVerdict(**data),
+        )
 
         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)

@@ -629,22 +582,14 @@ class NonBinaryJudgementNode(BaseNode):
         prompt = NonBinaryJudgementTemplate.generate_non_binary_verdict(
             criteria=self.criteria, text=text, options=self._verdict_options
         )
-        if metric.using_native_model:
-            res, cost = metric.model.generate(
-                prompt, schema=self._verdict_schema
-            )
-            metric.evaluation_cost += cost
-            self._verdict = res
-        else:
-            try:
-                res: self._verdict_schema = metric.model.generate(
-                    prompt, schema=self._verdict_schema
-                )
-                self._verdict = res
-            except TypeError:
-                res = metric.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                self._verdict = self._verdict_schema(**data)
+
+        self._verdict = generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=self._verdict_schema,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: self._verdict_schema(**data),
+        )
 
         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)

@@ -678,22 +623,14 @@
         prompt = NonBinaryJudgementTemplate.generate_non_binary_verdict(
             criteria=self.criteria, text=text, options=self._verdict_options
         )
-        if metric.using_native_model:
-            res, cost = await metric.model.a_generate(
-                prompt, schema=self._verdict_schema
-            )
-            metric.evaluation_cost += cost
-            self._verdict = res
-        else:
-            try:
-                res: self._verdict_schema = await metric.model.a_generate(
-                    prompt, schema=self._verdict_schema
-                )
-                self._verdict = res
-            except TypeError:
-                res = await metric.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                self._verdict = self._verdict_schema(**data)
+
+        self._verdict = await a_generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=self._verdict_schema,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: self._verdict_schema(**data),
+        )
 
         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)
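
Note on the refactor above: every DAG node previously inlined the same pattern — native model with structured output and cost tracking, custom model with a schema kwarg, and a TypeError fallback that parses raw JSON. Version 3.7.7 moves that into the shared generate_with_schema_and_extract / a_generate_with_schema_and_extract helpers in deepeval/metrics/utils.py. Their implementation is not reproduced in this diff; a minimal sketch consistent with the call sites above (structure inferred from the removed code, not authoritative) would be:

    from typing import Any, Callable, Type

    from pydantic import BaseModel


    def generate_with_schema_and_extract(
        metric,
        prompt: str,
        schema_cls: Type[BaseModel],
        extract_schema: Callable[[Any], Any],
        extract_json: Callable[[dict], Any],
    ) -> Any:
        # Native models support structured output and return (result, cost).
        if metric.using_native_model:
            res, cost = metric.model.generate(prompt, schema=schema_cls)
            metric.evaluation_cost += cost
            return extract_schema(res)
        try:
            # Custom models may accept a schema kwarg and return a parsed object...
            res = metric.model.generate(prompt, schema=schema_cls)
            return extract_schema(res)
        except TypeError:
            # ...or only raw text, in which case the JSON is trimmed and parsed
            # manually (trimAndLoadJson already lives in deepeval.metrics.utils).
            res = metric.model.generate(prompt)
            data = trimAndLoadJson(res, metric)
            return extract_json(data)

a_generate_with_schema_and_extract would mirror this with `await metric.model.a_generate(...)`. The net effect on nodes.py is behavioral parity with far less duplication, and cost tracking handled in one place.
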
deepeval/metrics/dag/templates.py
CHANGED

@@ -1,5 +1,13 @@
 from typing import List
 
+multimodal_rules = """
+--- MULTIMODAL INPUT RULES ---
+- Treat image content as factual evidence.
+- Only reference visual details that are explicitly and clearly visible.
+- Do not infer or guess objects, text, or details not visibly present.
+- If an image is unclear or ambiguous, mark uncertainty explicitly.
+"""
+
 
 class VerdictNodeTemplate:
     @staticmethod

@@ -34,6 +42,8 @@ class TaskNodeTemplate:
     def generate_task_output(instructions: str, text: str):
         return f"""Given the following instructions, generate an output.
 
+{multimodal_rules}
+
 {instructions}
 
 {text}

@@ -57,6 +67,8 @@ class BinaryJudgementTemplate:
     def generate_binary_verdict(criteria: str, text: str):
         return f"""{criteria}
 
+{multimodal_rules}
+
 {text}
 
 **

@@ -79,6 +91,8 @@ class NonBinaryJudgementTemplate:
     ):
         return f"""{criteria}
 
+{multimodal_rules}
+
 {text}
 
 **
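
The templates change is purely additive: multimodal_rules is a module-level constant interpolated into the task, binary, and non-binary judgement prompts, so all DAG judgements share the same grounding rules for image evidence. Because the templates are plain f-strings, the block lands verbatim in every rendered prompt, including text-only runs — nothing in the templates branches on whether the test case is actually multimodal. A quick self-contained illustration (mirroring generate_task_output above; the sample arguments are made up):

    multimodal_rules = """
    --- MULTIMODAL INPUT RULES ---
    - Treat image content as factual evidence.
    - Only reference visual details that are explicitly and clearly visible.
    - Do not infer or guess objects, text, or details not visibly present.
    - If an image is unclear or ambiguous, mark uncertainty explicitly.
    """


    def generate_task_output(instructions: str, text: str) -> str:
        # The rules block is inserted between the task preamble and the inputs.
        return f"""Given the following instructions, generate an output.

    {multimodal_rules}

    {instructions}

    {text}"""


    print(generate_task_output("Summarize the scene.", "<text and image input>"))
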
deepeval/metrics/exact_match/exact_match.py
CHANGED

@@ -32,7 +32,15 @@ class ExactMatchMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_llm_test_case_params(
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            None,
+            test_case.multimodal,
+        )
 
         with metric_progress_indicator(
             self, _show_indicator=_show_indicator, _in_component=_in_component
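
check_llm_test_case_params now takes its validation context positionally, ending with the test case's multimodal flag — this is how 3.7.7 folds the old check_mllm_test_case_params path into a single validator (the separate mllm checker also disappears from the faithfulness imports below). The new signature is not itself part of this diff; judging from the call sites, it plausibly looks like the following sketch, where every parameter name is hypothetical:

    from typing import List, Optional


    def check_llm_test_case_params(
        test_case,                          # LLMTestCase under evaluation
        test_case_params: List,             # required LLMTestCaseParams
        input_image_count: Optional[int],   # image-count checks (None = skip)
        actual_output_image_count: Optional[int],
        metric,                             # metric raising the validation error
        model=None,                         # judge model, e.g. checked for vision support
        multimodal: bool = False,           # run multimodal validation when True
    ) -> None:
        ...  # real implementation lives in deepeval/metrics/utils.py

ExactMatchMetric passes None for the model argument because it never calls an LLM judge, while FaithfulnessMetric (below) passes self.model so the judge itself can be validated against multimodal inputs.
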
deepeval/metrics/faithfulness/faithfulness.py
CHANGED

@@ -1,7 +1,7 @@
 from typing import List, Optional, Union, Type
 import asyncio
 
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams
 from deepeval.metrics import BaseMetric
 from deepeval.utils import (
     get_or_create_event_loop,

@@ -9,10 +9,10 @@ from deepeval.utils import (
 )
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     check_llm_test_case_params,
-    check_mllm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.faithfulness.template import FaithfulnessTemplate

@@ -69,12 +69,15 @@ class FaithfulnessMetric(BaseMetric):
     ) -> float:
 
         multimodal = test_case.multimodal
-        if multimodal:
-            check_mllm_test_case_params(
-                test_case, self._required_params, None, None, self
-            )
-        else:
-            check_llm_test_case_params(test_case, self._required_params, self)
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(

@@ -127,12 +130,15 @@ class FaithfulnessMetric(BaseMetric):
     ) -> float:
 
         multimodal = test_case.multimodal
-        if multimodal:
-            check_mllm_test_case_params(
-                test_case, self._required_params, None, None, self
-            )
-        else:
-            check_llm_test_case_params(test_case, self._required_params, self)
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(

@@ -182,22 +188,13 @@ class FaithfulnessMetric(BaseMetric):
             multimodal=multimodal,
         )
 
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=FaithfulnessScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: FaithfulnessScoreReason = await self.model.a_generate(
-                    prompt, schema=FaithfulnessScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=FaithfulnessScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _generate_reason(self, multimodal: bool) -> str:
         if self.include_reason is False:

@@ -214,22 +211,13 @@ class FaithfulnessMetric(BaseMetric):
             multimodal=multimodal,
         )
 
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=FaithfulnessScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: FaithfulnessScoreReason = self.model.generate(
-                    prompt, schema=FaithfulnessScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=FaithfulnessScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_verdicts(
         self, multimodal: bool

@@ -237,63 +225,41 @@
         if len(self.claims) == 0:
             return []
 
-        verdicts: List[FaithfulnessVerdict] = []
-
         prompt = self.evaluation_template.generate_verdicts(
             claims=self.claims,
             retrieval_context="\n\n".join(self.truths),
             multimodal=multimodal,
         )
 
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = await self.model.a_generate(
-                    prompt, schema=Verdicts
-                )
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    FaithfulnessVerdict(**item) for item in data["verdicts"]
-                ]
-                return verdicts
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                FaithfulnessVerdict(**item) for item in data["verdicts"]
+            ],
+        )
 
     def _generate_verdicts(self, multimodal: bool) -> List[FaithfulnessVerdict]:
         if len(self.claims) == 0:
             return []
 
-        verdicts: List[FaithfulnessVerdict] = []
-
         prompt = self.evaluation_template.generate_verdicts(
             claims=self.claims,
             retrieval_context="\n\n".join(self.truths),
             multimodal=multimodal,
         )
 
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    FaithfulnessVerdict(**item) for item in data["verdicts"]
-                ]
-                return verdicts
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                FaithfulnessVerdict(**item) for item in data["verdicts"]
+            ],
+        )
 
     async def _a_generate_truths(
         self, retrieval_context: str, multimodal: bool

@@ -303,18 +269,13 @@ class FaithfulnessMetric(BaseMetric):
             extraction_limit=self.truths_extraction_limit,
             multimodal=multimodal,
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Truths)
-            self.evaluation_cost += cost
-            return res.truths
-        else:
-            try:
-                res: Truths = await self.model.a_generate(prompt, schema=Truths)
-                return res.truths
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["truths"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Truths,
+            extract_schema=lambda s: s.truths,
+            extract_json=lambda data: data["truths"],
+        )
 
     def _generate_truths(
         self, retrieval_context: str, multimodal: bool

@@ -324,18 +285,13 @@ class FaithfulnessMetric(BaseMetric):
             extraction_limit=self.truths_extraction_limit,
             multimodal=multimodal,
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Truths)
-            self.evaluation_cost += cost
-            return res.truths
-        else:
-            try:
-                res: Truths = self.model.generate(prompt, schema=Truths)
-                return res.truths
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["truths"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Truths,
+            extract_schema=lambda s: s.truths,
+            extract_json=lambda data: data["truths"],
+        )
 
     async def _a_generate_claims(
         self, actual_output: str, multimodal: bool

@@ -343,18 +299,13 @@ class FaithfulnessMetric(BaseMetric):
         prompt = self.evaluation_template.generate_claims(
             actual_output=actual_output, multimodal=multimodal
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Claims)
-            self.evaluation_cost += cost
-            return res.claims
-        else:
-            try:
-                res: Claims = await self.model.a_generate(prompt, schema=Claims)
-                return res.claims
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["claims"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Claims,
+            extract_schema=lambda s: s.claims,
+            extract_json=lambda data: data["claims"],
+        )
 
     def _generate_claims(
         self, actual_output: str, multimodal: bool

@@ -362,18 +313,13 @@ class FaithfulnessMetric(BaseMetric):
         prompt = self.evaluation_template.generate_claims(
             actual_output=actual_output, multimodal=multimodal
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Claims)
-            self.evaluation_cost += cost
-            return res.claims
-        else:
-            try:
-                res: Claims = self.model.generate(prompt, schema=Claims)
-                return res.claims
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["claims"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Claims,
+            extract_schema=lambda s: s.claims,
+            extract_json=lambda data: data["claims"],
+        )
 
     def _calculate_score(self) -> float:
         number_of_verdicts = len(self.verdicts)

@@ -400,7 +346,7 @@
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
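
Across FaithfulnessMetric, the same consolidation applies to truths, claims, verdicts, and reason generation: each private method now builds its prompt and hands a schema plus two extractors to the shared helper. The two extractors exist because the two model paths yield different shapes — a parsed pydantic object versus a raw JSON dict — and both must normalize to the same return type. A self-contained check of that equivalence for the verdicts call (stand-in schema classes; the real ones live in deepeval/metrics/faithfulness/schema.py and their inner fields are assumed here):

    from typing import List, Optional

    from pydantic import BaseModel


    class FaithfulnessVerdict(BaseModel):
        verdict: str
        reason: Optional[str] = None


    class Verdicts(BaseModel):
        verdicts: List[FaithfulnessVerdict]


    def extract_schema(s: Verdicts) -> List[FaithfulnessVerdict]:
        # Structured-output path: the model returned a parsed Verdicts object.
        return list(s.verdicts)


    def extract_json(data: dict) -> List[FaithfulnessVerdict]:
        # Fallback path: the model returned raw text parsed into a dict.
        return [FaithfulnessVerdict(**item) for item in data["verdicts"]]


    parsed = Verdicts(verdicts=[{"verdict": "yes", "reason": "supported"}])
    raw = {"verdicts": [{"verdict": "yes", "reason": "supported"}]}
    assert extract_schema(parsed) == extract_json(raw)

The narrowed `except TypeError:` in the final hunk (replacing a bare `except:`) is the other notable cleanup: it still guards the `self.score >= self.threshold` comparison when score is None, without silently swallowing unrelated errors.
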