deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +139 -2
- deepeval/evaluate/evaluate.py +16 -11
- deepeval/evaluate/execute.py +13 -181
- deepeval/evaluate/utils.py +6 -26
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +14 -16
- deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
- deepeval/metrics/answer_relevancy/template.py +22 -3
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -44
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +16 -2
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +138 -149
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +7 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -19
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +158 -122
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +49 -33
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +79 -33
- deepeval/models/embedding_models/local_embedding_model.py +39 -20
- deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
- deepeval/models/embedding_models/openai_embedding_model.py +42 -22
- deepeval/models/llms/amazon_bedrock_model.py +226 -72
- deepeval/models/llms/anthropic_model.py +178 -63
- deepeval/models/llms/azure_model.py +218 -60
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +95 -40
- deepeval/models/llms/gemini_model.py +209 -64
- deepeval/models/llms/grok_model.py +139 -68
- deepeval/models/llms/kimi_model.py +140 -90
- deepeval/models/llms/litellm_model.py +131 -37
- deepeval/models/llms/local_model.py +125 -21
- deepeval/models/llms/ollama_model.py +147 -24
- deepeval/models/llms/openai_model.py +222 -269
- deepeval/models/llms/portkey_model.py +81 -22
- deepeval/models/llms/utils.py +8 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +106 -5
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +26 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +68 -1
- deepeval/test_case/llm_test_case.py +206 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +18 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/metrics/dag/nodes.py
CHANGED

```diff
@@ -18,7 +18,11 @@ from deepeval.metrics.dag.templates import (
 from deepeval.metrics.base_metric import BaseMetric
 from deepeval.metrics.g_eval.g_eval import GEval
 from deepeval.metrics.g_eval.utils import G_EVAL_PARAMS
-from deepeval.metrics.utils import
+from deepeval.metrics.utils import (
+    copy_metrics,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
+)
 from deepeval.test_case import LLMTestCase, LLMTestCaseParams, ToolCall
 from deepeval.utils import prettify_list
 
@@ -222,20 +226,13 @@ class VerdictNode(BaseNode):
             score=metric.score,
             name=metric.__name__,
         )
-
-
-
-
-
-
-
-            )
-        except TypeError:
-            res = metric.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            res = MetricScoreReason(**data)
-
-        return res.reason
+        return generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=MetricScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_reason(self, metric: BaseMetric):
         prompt = VerdictNodeTemplate.generate_reason(
@@ -243,22 +240,13 @@ class VerdictNode(BaseNode):
             score=metric.score,
             name=metric.__name__,
         )
-
-
-
-
-
-
-
-            res: MetricScoreReason = await metric.model.a_generate(
-                prompt, schema=MetricScoreReason
-            )
-        except TypeError:
-            res = await metric.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            res = MetricScoreReason(**data)
-
-        return res.reason
+        return await a_generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=MetricScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
 
 @dataclass
@@ -317,20 +305,13 @@ class TaskNode(BaseNode):
             instructions=self.instructions,
             text=text,
         )
-
-
-
-
-
-
-
-                prompt, schema=TaskNodeOutput
-            )
-            self._output = res.output
-        except TypeError:
-            res = metric.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            self._output = TaskNodeOutput(**data).output
+        self._output = generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=TaskNodeOutput,
+            extract_schema=lambda s: s.output,
+            extract_json=lambda data: data["output"],
+        )
 
         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)
@@ -371,22 +352,13 @@ class TaskNode(BaseNode):
             text=text,
         )
 
-
-
-
-
-
-
-        try:
-            res: TaskNodeOutput = await metric.model.a_generate(
-                prompt, schema=TaskNodeOutput
-            )
-            self._output = res.output
-        except TypeError:
-            res = await metric.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            self._output = TaskNodeOutput(**data).output
+        self._output = await a_generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=TaskNodeOutput,
+            extract_schema=lambda s: s.output,
+            extract_json=lambda data: data["output"],
+        )
 
         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)
@@ -470,23 +442,13 @@ class BinaryJudgementNode(BaseNode):
             criteria=self.criteria,
             text=text,
         )
-
-
-
-
-
-
-
-        try:
-            res: BinaryJudgementVerdict = metric.model.generate(
-                prompt, schema=BinaryJudgementVerdict
-            )
-            self._verdict = res
-        except TypeError:
-            res = metric.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            self._verdict = BinaryJudgementVerdict(**data)
-
+        self._verdict = generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=BinaryJudgementVerdict,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: BinaryJudgementVerdict(**data),
+        )
         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)
         )
@@ -520,22 +482,13 @@ class BinaryJudgementNode(BaseNode):
             criteria=self.criteria,
             text=text,
         )
-
-
-
-
-
-
-
-        try:
-            res: BinaryJudgementVerdict = await metric.model.a_generate(
-                prompt, schema=BinaryJudgementVerdict
-            )
-            self._verdict = res
-        except TypeError:
-            res = await metric.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            self._verdict = BinaryJudgementVerdict(**data)
+        self._verdict = await a_generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=BinaryJudgementVerdict,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: BinaryJudgementVerdict(**data),
+        )
 
         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)
@@ -629,22 +582,14 @@ class NonBinaryJudgementNode(BaseNode):
         prompt = NonBinaryJudgementTemplate.generate_non_binary_verdict(
             criteria=self.criteria, text=text, options=self._verdict_options
         )
-
-
-
-
-
-
-
-
-            res: self._verdict_schema = metric.model.generate(
-                prompt, schema=self._verdict_schema
-            )
-            self._verdict = res
-        except TypeError:
-            res = metric.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            self._verdict = self._verdict_schema(**data)
+
+        self._verdict = generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=self._verdict_schema,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: self._verdict_schema(**data),
+        )
 
         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)
@@ -678,22 +623,14 @@ class NonBinaryJudgementNode(BaseNode):
         prompt = NonBinaryJudgementTemplate.generate_non_binary_verdict(
             criteria=self.criteria, text=text, options=self._verdict_options
         )
-
-
-
-
-
-
-
-
-            res: self._verdict_schema = await metric.model.a_generate(
-                prompt, schema=self._verdict_schema
-            )
-            self._verdict = res
-        except TypeError:
-            res = await metric.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            self._verdict = self._verdict_schema(**data)
+
+        self._verdict = await a_generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=self._verdict_schema,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: self._verdict_schema(**data),
+        )
 
         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)
```
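Every node above follows the same refactor: the old inline `try`/`except TypeError` dance, which attempted `metric.model.generate(prompt, schema=...)` and fell back to `trimAndLoadJson` for models without structured-output support, is hoisted into two shared helpers imported from `deepeval.metrics.utils`. Their real implementation lives in that module (changed +158 -122 in this release) and is not shown in this diff; the sketch below only reconstructs the plausible behavior from the deleted call sites, so everything beyond the call signature is an assumption.

```python
# A minimal sketch, reconstructed from the deleted call sites above -- not
# taken from deepeval/metrics/utils.py, whose implementation may differ.
from typing import Any, Callable, Dict, Type

from pydantic import BaseModel

from deepeval.metrics.utils import trimAndLoadJson  # existing JSON-repair helper


def generate_with_schema_and_extract(
    metric,
    prompt: str,
    schema_cls: Type[BaseModel],
    extract_schema: Callable[[BaseModel], Any],
    extract_json: Callable[[Dict[str, Any]], Any],
) -> Any:
    try:
        # Models with native structured-output support accept a `schema=`
        # kwarg and return a validated pydantic object.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Models without schema support raise TypeError on the extra kwarg;
        # fall back to raw text plus JSON repair, as the deleted code did
        # (the deleted code passed the node itself, not the metric, here).
        raw = metric.model.generate(prompt)
        data = trimAndLoadJson(raw, metric)
        return extract_json(data)
```

`a_generate_with_schema_and_extract` would be the `async` twin, awaiting `metric.model.a_generate(...)` in both branches. Hoisting the fallback removes eight near-identical blocks across `VerdictNode`, `TaskNode`, `BinaryJudgementNode`, and `NonBinaryJudgementNode`, and lets each call site declare two explicit extractors: one for the validated pydantic object, one for the repaired-JSON dict.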
deepeval/metrics/dag/templates.py
CHANGED

```diff
@@ -1,5 +1,13 @@
 from typing import List
 
+multimodal_rules = """
+--- MULTIMODAL INPUT RULES ---
+- Treat image content as factual evidence.
+- Only reference visual details that are explicitly and clearly visible.
+- Do not infer or guess objects, text, or details not visibly present.
+- If an image is unclear or ambiguous, mark uncertainty explicitly.
+"""
+
 
 class VerdictNodeTemplate:
     @staticmethod
@@ -34,6 +42,8 @@ class TaskNodeTemplate:
     def generate_task_output(instructions: str, text: str):
         return f"""Given the following instructions, generate an output.
 
+{multimodal_rules}
+
 {instructions}
 
 {text}
@@ -57,14 +67,16 @@ class BinaryJudgementTemplate:
     def generate_binary_verdict(criteria: str, text: str):
         return f"""{criteria}
 
+{multimodal_rules}
+
 {text}
 
 **
-IMPORTANT: Please make sure to only return a json with two keys: `verdict` (
+IMPORTANT: Please make sure to only return a json with two keys: `verdict` (true or false), and the 'reason' key providing the reason. The verdict must be a boolean only, either true or false.
 Example JSON:
 {{
     "reason": "...",
-    "verdict":
+    "verdict": true
 }}
 **
 
@@ -79,6 +91,8 @@ class NonBinaryJudgementTemplate:
     ):
         return f"""{criteria}
 
+{multimodal_rules}
+
 {text}
 
 **
```
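All four template classes now interpolate the shared `multimodal_rules` block between the criteria (or task preamble) and the evaluated text, and `BinaryJudgementTemplate` also pins the verdict format to a strict boolean with an explicit `"verdict": true` example. A quick illustrative check of the rendered prompt, using the `generate_task_output` signature shown above (the argument values are made up):

```python
from deepeval.metrics.dag.templates import TaskNodeTemplate

# Hypothetical inputs, just to show where the new block lands in the prompt.
prompt = TaskNodeTemplate.generate_task_output(
    instructions="Extract the chief complaint from the patient transcript.",
    text="<formatted test case text>",
)

# The rendered prompt now carries the multimodal rules between the preamble
# and the instructions, telling judge models to treat images as factual
# evidence and to flag unclear visuals instead of guessing.
assert "--- MULTIMODAL INPUT RULES ---" in prompt
```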
deepeval/metrics/exact_match/exact_match.py
CHANGED

```diff
@@ -32,7 +32,15 @@ class ExactMatchMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_llm_test_case_params(
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            None,
+            test_case.multimodal,
+        )
 
         with metric_progress_indicator(
             self, _show_indicator=_show_indicator, _in_component=_in_component
```
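Finally, `ExactMatchMetric` (like most metrics in this release, per the file list above) now routes through a widened `check_llm_test_case_params` that receives the metric instance and the new `test_case.multimodal` flag; the `LLMTestCase` changes (`llm_test_case.py` +206 -1) absorb the removed `MLLMTestCase` and the deleted multimodal metric variants. The diff shows only positional values, so the annotations below are guesses at each slot's meaning, not the actual parameter names from `deepeval/metrics/utils.py`:

```python
# Annotated copy of the call added in the diff; slot meanings are guesses.
# check_llm_test_case_params(
#     test_case,              # the LLMTestCase under evaluation
#     self._required_params,  # LLMTestCaseParams this metric requires
#     None,                   # unknown optional slot (diff passes None)
#     None,                   # unknown optional slot (diff passes None)
#     self,                   # the metric performing the check
#     None,                   # unknown optional slot (diff passes None)
#     test_case.multimodal,   # new in 3.7.6: True when the case carries images
# )
```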