deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/main.py +2022 -759
- deepeval/cli/utils.py +208 -36
- deepeval/config/dotenv_handler.py +19 -0
- deepeval/config/settings.py +675 -245
- deepeval/config/utils.py +9 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +106 -21
- deepeval/evaluate/evaluate.py +0 -3
- deepeval/evaluate/execute.py +162 -315
- deepeval/evaluate/utils.py +6 -30
- deepeval/key_handler.py +124 -51
- deepeval/metrics/__init__.py +0 -4
- deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
- deepeval/metrics/answer_relevancy/template.py +102 -179
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +19 -41
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
- deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +14 -0
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +82 -136
- deepeval/metrics/g_eval/g_eval.py +93 -79
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +11 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +72 -43
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
- deepeval/metrics/mcp/schema.py +4 -0
- deepeval/metrics/mcp/template.py +59 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
- deepeval/metrics/tool_use/schema.py +4 -0
- deepeval/metrics/tool_use/template.py +16 -2
- deepeval/metrics/tool_use/tool_use.py +72 -94
- deepeval/metrics/topic_adherence/schema.py +4 -0
- deepeval/metrics/topic_adherence/template.py +21 -1
- deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +3 -3
- deepeval/metrics/turn_contextual_precision/template.py +9 -2
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
- deepeval/metrics/turn_contextual_recall/schema.py +3 -3
- deepeval/metrics/turn_contextual_recall/template.py +8 -1
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
- deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
- deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
- deepeval/metrics/turn_faithfulness/schema.py +1 -1
- deepeval/metrics/turn_faithfulness/template.py +8 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +161 -91
- deepeval/models/__init__.py +2 -0
- deepeval/models/base_model.py +44 -6
- deepeval/models/embedding_models/azure_embedding_model.py +34 -12
- deepeval/models/embedding_models/local_embedding_model.py +22 -7
- deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
- deepeval/models/embedding_models/openai_embedding_model.py +3 -2
- deepeval/models/llms/__init__.py +2 -0
- deepeval/models/llms/amazon_bedrock_model.py +229 -73
- deepeval/models/llms/anthropic_model.py +143 -48
- deepeval/models/llms/azure_model.py +169 -95
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +82 -35
- deepeval/models/llms/gemini_model.py +126 -67
- deepeval/models/llms/grok_model.py +128 -65
- deepeval/models/llms/kimi_model.py +129 -87
- deepeval/models/llms/litellm_model.py +94 -18
- deepeval/models/llms/local_model.py +115 -16
- deepeval/models/llms/ollama_model.py +97 -76
- deepeval/models/llms/openai_model.py +169 -311
- deepeval/models/llms/portkey_model.py +58 -16
- deepeval/models/llms/utils.py +5 -2
- deepeval/models/retry_policy.py +10 -5
- deepeval/models/utils.py +56 -4
- deepeval/simulator/conversation_simulator.py +49 -2
- deepeval/simulator/template.py +16 -1
- deepeval/synthesizer/synthesizer.py +19 -17
- deepeval/test_case/api.py +24 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +55 -6
- deepeval/test_case/llm_test_case.py +60 -6
- deepeval/test_run/api.py +3 -0
- deepeval/test_run/test_run.py +6 -1
- deepeval/utils.py +26 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
@@ -8,7 +8,11 @@ from deepeval.metrics.conversational_g_eval.conversational_g_eval import (
     ConversationalGEval,
 )
 from deepeval.metrics.g_eval.utils import CONVERSATIONAL_G_EVAL_PARAMS
-from deepeval.metrics.utils import
+from deepeval.metrics.utils import (
+    copy_metrics,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
+)
 from deepeval.test_case import (
     ConversationalTestCase,
     TurnParams,
@@ -263,20 +267,14 @@ class ConversationalVerdictNode(ConversationalBaseNode):
             score=metric.score,
             name=metric.__name__,
         )
-        if metric.using_native_model:
-            res, cost = metric.model.generate(prompt, schema=MetricScoreReason)
-            metric.evaluation_cost += cost
-        else:
-            try:
-                res: MetricScoreReason = metric.model.generate(
-                    prompt, schema=MetricScoreReason
-                )
-            except TypeError:
-                res = metric.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                res = MetricScoreReason(**data)

-        return
+        return generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=MetricScoreReason,
+            extract_schema=lambda score_reason: score_reason.reason,
+            extract_json=lambda data: data["reason"],
+        )

     async def _a_generate_reason(self, metric: BaseConversationalMetric):
         prompt = ConversationalVerdictNodeTemplate.generate_reason(
@@ -284,22 +282,14 @@ class ConversationalVerdictNode(ConversationalBaseNode):
             score=metric.score,
             name=metric.__name__,
         )
-        if metric.using_native_model:
-            res, cost = await metric.model.a_generate(
-                prompt, schema=MetricScoreReason
-            )
-            metric.evaluation_cost += cost
-        else:
-            try:
-                res: MetricScoreReason = await metric.model.a_generate(
-                    prompt, schema=MetricScoreReason
-                )
-            except TypeError:
-                res = await metric.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                res = MetricScoreReason(**data)

-        return
+        return await a_generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=MetricScoreReason,
+            extract_schema=lambda score_reason: score_reason.reason,
+            extract_json=lambda data: data["reason"],
+        )


 @dataclass
@@ -372,20 +362,14 @@ class ConversationalTaskNode(ConversationalBaseNode):
             instructions=self.instructions,
             text=text,
         )
-        if metric.using_native_model:
-            res, cost = metric.model.generate(prompt, schema=TaskNodeOutput)
-            metric.evaluation_cost += cost
-            self._output = res.output
-        else:
-            try:
-                res: TaskNodeOutput = metric.model.generate(
-                    prompt, schema=TaskNodeOutput
-                )
-                self._output = res.output
-            except TypeError:
-                res = metric.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                self._output = TaskNodeOutput(**data).output
+
+        self._output = generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=TaskNodeOutput,
+            extract_schema=lambda s: s.output,
+            extract_json=lambda data: data["output"],
+        )

         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)
@@ -439,23 +423,14 @@ class ConversationalTaskNode(ConversationalBaseNode):
             instructions=self.instructions,
             text=text,
         )
-        if metric.using_native_model:
-            res, cost = await metric.model.a_generate(
-                prompt, schema=TaskNodeOutput
-            )
-            metric.evaluation_cost += cost
-            self._output = res.output
-        else:
-            try:
-                res: TaskNodeOutput = await metric.model.a_generate(
-                    prompt, schema=TaskNodeOutput
-                )
-                self._output = res.output
-            except TypeError:
-                res = await metric.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                self._output = TaskNodeOutput(**data).output

+        self._output = await a_generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=TaskNodeOutput,
+            extract_schema=lambda s: s.output,
+            extract_json=lambda data: data["output"],
+        )
         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)
         )
@@ -559,22 +534,14 @@ class ConversationalBinaryJudgementNode(ConversationalBaseNode):
             criteria=self.criteria,
             text=text,
         )
-        if metric.using_native_model:
-            res, cost = metric.model.generate(
-                prompt, schema=BinaryJudgementVerdict
-            )
-            metric.evaluation_cost += cost
-            self._verdict = res
-        else:
-            try:
-                res: BinaryJudgementVerdict = metric.model.generate(
-                    prompt, schema=BinaryJudgementVerdict
-                )
-                self._verdict = res
-            except TypeError:
-                res = metric.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                self._verdict = BinaryJudgementVerdict(**data)
+
+        self._verdict = generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=BinaryJudgementVerdict,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: BinaryJudgementVerdict(**data),
+        )

         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)
@@ -623,22 +590,14 @@ class ConversationalBinaryJudgementNode(ConversationalBaseNode):
             criteria=self.criteria,
             text=text,
         )
-        if metric.using_native_model:
-            res, cost = await metric.model.a_generate(
-                prompt, schema=BinaryJudgementVerdict
-            )
-            metric.evaluation_cost += cost
-            self._verdict = res
-        else:
-            try:
-                res: BinaryJudgementVerdict = await metric.model.a_generate(
-                    prompt, schema=BinaryJudgementVerdict
-                )
-                self._verdict = res
-            except TypeError:
-                res = await metric.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                self._verdict = BinaryJudgementVerdict(**data)
+
+        self._verdict = await a_generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=BinaryJudgementVerdict,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: BinaryJudgementVerdict(**data),
+        )

         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)
@@ -753,22 +712,14 @@ class ConversationalNonBinaryJudgementNode(ConversationalBaseNode):
         prompt = ConversationalNonBinaryJudgementTemplate.generate_non_binary_verdict(
             criteria=self.criteria, text=text, options=self._verdict_options
         )
-        if metric.using_native_model:
-            res, cost = metric.model.generate(
-                prompt, schema=self._verdict_schema
-            )
-            metric.evaluation_cost += cost
-            self._verdict = res
-        else:
-            try:
-                res: self._verdict_schema = metric.model.generate(
-                    prompt, schema=self._verdict_schema
-                )
-                self._verdict = res
-            except TypeError:
-                res = metric.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                self._verdict = self._verdict_schema(**data)
+
+        self._verdict = generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=self._verdict_schema,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: self._verdict_schema(**data),
+        )

         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)
@@ -816,22 +767,14 @@ class ConversationalNonBinaryJudgementNode(ConversationalBaseNode):
         prompt = ConversationalNonBinaryJudgementTemplate.generate_non_binary_verdict(
             criteria=self.criteria, text=text, options=self._verdict_options
         )
-        if metric.using_native_model:
-            res, cost = await metric.model.a_generate(
-                prompt, schema=self._verdict_schema
-            )
-            metric.evaluation_cost += cost
-            self._verdict = res
-        else:
-            try:
-                res: self._verdict_schema = await metric.model.a_generate(
-                    prompt, schema=self._verdict_schema
-                )
-                self._verdict = res
-            except TypeError:
-                res = await metric.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                self._verdict = self._verdict_schema(**data)
+
+        self._verdict = await a_generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=self._verdict_schema,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: self._verdict_schema(**data),
+        )

         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)
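Every conversational DAG node in the hunks above receives the same rewrite: the hand-rolled branch on metric.using_native_model, the manual metric.evaluation_cost += cost bookkeeping, and the TypeError/trimAndLoadJson fallback for custom models are collapsed into one call to generate_with_schema_and_extract (or its async twin a_generate_with_schema_and_extract) from deepeval.metrics.utils. The helper's body is not shown in this diff; the sketch below is only a plausible reconstruction from the call sites and the deleted inline code, and the shipped implementation may differ.

# Hypothetical sketch only: reconstructed from the call sites in this diff and
# the deleted inline code, not copied from deepeval.metrics.utils.
from typing import Any, Callable, Type

from deepeval.metrics.utils import trimAndLoadJson


def generate_with_schema_and_extract(
    metric: Any,
    prompt: str,
    schema_cls: Type,
    extract_schema: Callable[[Any], Any],
    extract_json: Callable[[dict], Any],
) -> Any:
    if metric.using_native_model:
        # Native models return (parsed schema instance, cost); accrue cost on the metric.
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost
        return extract_schema(res)
    try:
        # Custom models that accept a `schema` kwarg return the parsed object directly.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Fallback: raw text generation plus JSON parsing, as the old inline code did.
        res = metric.model.generate(prompt)
        data = trimAndLoadJson(res, metric)  # assumption: the metric is passed as the caller
        return extract_json(data)

The a_generate_with_schema_and_extract counterpart would presumably mirror this with await metric.model.a_generate(...) in place of the synchronous calls.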
@@ -2,7 +2,17 @@ from typing import List
 from textwrap import dedent


+multimodal_rules = """
+--- MULTIMODAL INPUT RULES ---
+- Treat image content as factual evidence.
+- Only reference visual details that are explicitly and clearly visible.
+- Do not infer or guess objects, text, or details not visibly present.
+- If an image is unclear or ambiguous, mark uncertainty explicitly.
+"""
+
+
 class ConversationalVerdictNodeTemplate:
+
     @staticmethod
     def generate_reason(verbose_steps: List[str], score: float, name: str):
         return dedent(
@@ -40,6 +50,8 @@ class ConversationalTaskNodeTemplate:
         return dedent(
             f"""You are given a set of task instructions and a full conversation between a user and an assistant.

+{multimodal_rules}
+
 Instructions:
 {instructions}

@@ -67,6 +79,8 @@ class ConversationalBinaryJudgementTemplate:

 Below is the full conversation you should evaluate. Consider dialogue context, speaker roles, and how responses were handled.

+{multimodal_rules}
+
 Full Conversation:
 {text}

@@ -96,6 +110,8 @@ class ConversationalNonBinaryJudgementTemplate:

 You are evaluating the following conversation. Choose one of the options that best reflects the assistant's behavior.

+{multimodal_rules}
+
 Options: {options}

 Full Conversation:
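The three template hunks above splice the new module-level multimodal_rules block into each prompt through plain f-string interpolation inside dedent. A minimal, self-contained illustration of that pattern follows (placeholder function and arguments, not the actual deepeval templates):

# Illustration only: shows the interpolation pattern, not deepeval's real template code.
from textwrap import dedent

multimodal_rules = """
--- MULTIMODAL INPUT RULES ---
- Treat image content as factual evidence.
- Only reference visual details that are explicitly and clearly visible.
- Do not infer or guess objects, text, or details not visibly present.
- If an image is unclear or ambiguous, mark uncertainty explicitly.
"""


def build_task_prompt(instructions: str, text: str) -> str:
    # The rules block is injected verbatim between the task preamble and the payload.
    return dedent(
        f"""You are given a set of task instructions and a full conversation between a user and an assistant.

{multimodal_rules}

Instructions:
{instructions}

Full Conversation:
{text}
"""
    )


print(build_task_prompt("Summarize the user's goal.", "user: hi\nassistant: hello"))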
@@ -26,6 +26,8 @@ from deepeval.metrics.utils import (
     trimAndLoadJson,
     initialize_model,
     convert_turn_to_dict,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
@@ -89,8 +91,14 @@ class ConversationalGEval(BaseConversationalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
+        multimodal = test_case.multimodal
         check_conversational_test_case_params(
-            test_case,
+            test_case,
+            self.evaluation_params,
+            self,
+            False,
+            self.model,
+            multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -143,8 +151,14 @@ class ConversationalGEval(BaseConversationalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
+        multimodal = test_case.multimodal
         check_conversational_test_case_params(
-            test_case,
+            test_case,
+            self.evaluation_params,
+            self,
+            False,
+            self.model,
+            multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -192,22 +206,13 @@ class ConversationalGEval(BaseConversationalMetric):
         prompt = self.evaluation_template.generate_evaluation_steps(
             criteria=self.criteria, parameters=g_eval_params_str
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=cgschema.Steps
-            )
-            self.evaluation_cost += cost
-            return res.steps
-        else:
-            try:
-                res: cgschema.Steps = await self.model.a_generate(
-                    prompt, schema=cgschema.Steps
-                )
-                return res.steps
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["steps"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=cgschema.Steps,
+            extract_schema=lambda s: s.steps,
+            extract_json=lambda data: data["steps"],
+        )

     def _generate_evaluation_steps(self) -> List[str]:
         if self.evaluation_steps:
@@ -219,20 +224,13 @@ class ConversationalGEval(BaseConversationalMetric):
         prompt = self.evaluation_template.generate_evaluation_steps(
             criteria=self.criteria, parameters=g_eval_params_str
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=cgschema.Steps)
-            self.evaluation_cost += cost
-            return res.steps
-        else:
-            try:
-                res: cgschema.Steps = self.model.generate(
-                    prompt, schema=cgschema.Steps
-                )
-                return res.steps
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["steps"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=cgschema.Steps,
+            extract_schema=lambda s: s.steps,
+            extract_json=lambda data: data["steps"],
+        )

     async def _a_evaluate(
         self, test_case: ConversationalTestCase
@@ -269,7 +267,8 @@ class ConversationalGEval(BaseConversationalMetric):
                 res, cost = await self.model.a_generate_raw_response(
                     prompt, top_logprobs=20
                 )
-                self.evaluation_cost += cost
+
+                self._accrue_cost(cost)
                 data = trimAndLoadJson(res.choices[0].message.content, self)

                 reason = data["reason"]
@@ -287,22 +286,13 @@ class ConversationalGEval(BaseConversationalMetric):
             except (
                 AttributeError
             ):  # This catches the case where a_generate_raw_response doesn't exist.
-                if self.using_native_model:
-                    res, cost = await self.model.a_generate(
-                        prompt, schema=cgschema.ReasonScore
-                    )
-                    self.evaluation_cost += cost
-                    return res.score, res.reason
-                else:
-                    try:
-                        res: cgschema.ReasonScore = await self.model.a_generate(
-                            prompt, schema=cgschema.ReasonScore
-                        )
-                        return res.score, res.reason
-                    except TypeError:
-                        res = await self.model.a_generate(prompt)
-                        data = trimAndLoadJson(res, self)
-                        return data["score"], data["reason"]
+                return await a_generate_with_schema_and_extract(
+                    metric=self,
+                    prompt=prompt,
+                    schema_cls=cgschema.ReasonScore,
+                    extract_schema=lambda r: (r.score, r.reason),
+                    extract_json=lambda data: (data["score"], data["reason"]),
+                )

     def evaluate(
         self, test_case: ConversationalTestCase
@@ -339,7 +329,7 @@ class ConversationalGEval(BaseConversationalMetric):
                 res, cost = self.model.generate_raw_response(
                     prompt, top_logprobs=20
                 )
-                self.evaluation_cost += cost
+                self._accrue_cost(cost)
                 data = trimAndLoadJson(res.choices[0].message.content, self)

                 reason = data["reason"]
@@ -356,22 +346,13 @@ class ConversationalGEval(BaseConversationalMetric):
                 return score, reason
             except AttributeError:
                 # This catches the case where a_generate_raw_response doesn't exist.
-                if self.using_native_model:
-                    res, cost = self.model.generate(
-                        prompt, schema=cgschema.ReasonScore
-                    )
-                    self.evaluation_cost += cost
-                    return res.score, res.reason
-                else:
-                    try:
-                        res: cgschema.ReasonScore = self.model.generate(
-                            prompt, schema=cgschema.ReasonScore
-                        )
-                        return res.score, res.reason
-                    except TypeError:
-                        res = self.model.generate(prompt)
-                        data = trimAndLoadJson(res, self)
-                        return data["score"], data["reason"]
+                return generate_with_schema_and_extract(
+                    metric=self,
+                    prompt=prompt,
+                    schema_cls=cgschema.ReasonScore,
+                    extract_schema=lambda r: (r.score, r.reason),
+                    extract_json=lambda data: (data["score"], data["reason"]),
+                )

     def generate_weighted_summed_score(
         self, raw_score: int, raw_response: ChatCompletion
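ConversationalGEval also swaps direct evaluation-cost updates for a self._accrue_cost(cost) call. The method itself is not part of this diff; given that evaluation_cost is initialized as `0 if self.using_native_model else None`, a None-safe accumulator along these lines would fit, though the real implementation may differ:

# Hypothetical sketch: _accrue_cost is called in the hunks above but its body is
# not shown in this diff. Class and attribute layout here are assumptions.
from typing import Optional


class _CostTrackingMixin:
    evaluation_cost: Optional[float] = None

    def _accrue_cost(self, cost: Optional[float]) -> None:
        # Ignore calls when cost tracking is disabled or the model returned no cost.
        if cost is None:
            return
        if self.evaluation_cost is None:
            self.evaluation_cost = 0.0
        self.evaluation_cost += cost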
deepeval/metrics/dag/dag.py CHANGED

@@ -62,10 +62,15 @@ class DAGMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
+        multimodal = test_case.multimodal
         check_llm_test_case_params(
             test_case,
             extract_required_params(self.dag.root_nodes, self.dag.multiturn),
+            None,
+            None,
             self,
+            self.model,
+            multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -105,10 +110,15 @@ class DAGMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
+        multimodal = test_case.multimodal
         check_llm_test_case_params(
             test_case,
             extract_required_params(self.dag.root_nodes, self.dag.multiturn),
+            None,
+            None,
             self,
+            self.model,
+            multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None