deepeval 3.7.5__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +106 -21
  5. deepeval/evaluate/evaluate.py +0 -3
  6. deepeval/evaluate/execute.py +10 -222
  7. deepeval/evaluate/utils.py +6 -30
  8. deepeval/key_handler.py +3 -0
  9. deepeval/metrics/__init__.py +0 -4
  10. deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
  11. deepeval/metrics/answer_relevancy/template.py +102 -179
  12. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  13. deepeval/metrics/arena_g_eval/template.py +17 -1
  14. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  15. deepeval/metrics/argument_correctness/template.py +19 -2
  16. deepeval/metrics/base_metric.py +13 -41
  17. deepeval/metrics/bias/bias.py +102 -108
  18. deepeval/metrics/bias/template.py +14 -2
  19. deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
  20. deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
  21. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
  22. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  23. deepeval/metrics/conversation_completeness/template.py +23 -3
  24. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  25. deepeval/metrics/conversational_dag/nodes.py +66 -123
  26. deepeval/metrics/conversational_dag/templates.py +16 -0
  27. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  28. deepeval/metrics/dag/dag.py +10 -0
  29. deepeval/metrics/dag/nodes.py +63 -126
  30. deepeval/metrics/dag/templates.py +14 -0
  31. deepeval/metrics/exact_match/exact_match.py +9 -1
  32. deepeval/metrics/faithfulness/faithfulness.py +82 -136
  33. deepeval/metrics/g_eval/g_eval.py +87 -78
  34. deepeval/metrics/g_eval/template.py +18 -1
  35. deepeval/metrics/g_eval/utils.py +7 -6
  36. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  37. deepeval/metrics/goal_accuracy/template.py +21 -3
  38. deepeval/metrics/hallucination/hallucination.py +60 -75
  39. deepeval/metrics/hallucination/template.py +13 -0
  40. deepeval/metrics/indicator.py +3 -6
  41. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  42. deepeval/metrics/json_correctness/template.py +10 -0
  43. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  44. deepeval/metrics/knowledge_retention/schema.py +9 -3
  45. deepeval/metrics/knowledge_retention/template.py +12 -0
  46. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  47. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  48. deepeval/metrics/mcp/template.py +52 -0
  49. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  50. deepeval/metrics/mcp_use_metric/template.py +12 -0
  51. deepeval/metrics/misuse/misuse.py +77 -97
  52. deepeval/metrics/misuse/template.py +15 -0
  53. deepeval/metrics/multimodal_metrics/__init__.py +0 -1
  54. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
  55. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
  56. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
  57. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
  58. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
  59. deepeval/metrics/non_advice/non_advice.py +79 -105
  60. deepeval/metrics/non_advice/template.py +12 -0
  61. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  62. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  63. deepeval/metrics/pii_leakage/template.py +14 -0
  64. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  65. deepeval/metrics/plan_adherence/template.py +11 -0
  66. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  67. deepeval/metrics/plan_quality/template.py +9 -0
  68. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  69. deepeval/metrics/prompt_alignment/template.py +12 -0
  70. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  71. deepeval/metrics/role_adherence/template.py +14 -0
  72. deepeval/metrics/role_violation/role_violation.py +75 -108
  73. deepeval/metrics/role_violation/template.py +12 -0
  74. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  75. deepeval/metrics/step_efficiency/template.py +11 -0
  76. deepeval/metrics/summarization/summarization.py +115 -183
  77. deepeval/metrics/summarization/template.py +19 -0
  78. deepeval/metrics/task_completion/task_completion.py +67 -73
  79. deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
  80. deepeval/metrics/tool_use/tool_use.py +42 -66
  81. deepeval/metrics/topic_adherence/template.py +13 -0
  82. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  83. deepeval/metrics/toxicity/template.py +13 -0
  84. deepeval/metrics/toxicity/toxicity.py +80 -99
  85. deepeval/metrics/turn_contextual_precision/schema.py +3 -3
  86. deepeval/metrics/turn_contextual_precision/template.py +1 -1
  87. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +110 -68
  88. deepeval/metrics/turn_contextual_recall/schema.py +3 -3
  89. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +104 -61
  90. deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
  91. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +106 -65
  92. deepeval/metrics/turn_faithfulness/schema.py +1 -1
  93. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +104 -73
  94. deepeval/metrics/turn_relevancy/template.py +14 -0
  95. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  96. deepeval/metrics/utils.py +145 -90
  97. deepeval/models/base_model.py +44 -6
  98. deepeval/models/embedding_models/azure_embedding_model.py +34 -12
  99. deepeval/models/embedding_models/local_embedding_model.py +22 -7
  100. deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
  101. deepeval/models/embedding_models/openai_embedding_model.py +3 -2
  102. deepeval/models/llms/amazon_bedrock_model.py +226 -71
  103. deepeval/models/llms/anthropic_model.py +141 -47
  104. deepeval/models/llms/azure_model.py +167 -94
  105. deepeval/models/llms/constants.py +2032 -0
  106. deepeval/models/llms/deepseek_model.py +79 -29
  107. deepeval/models/llms/gemini_model.py +126 -67
  108. deepeval/models/llms/grok_model.py +125 -59
  109. deepeval/models/llms/kimi_model.py +126 -81
  110. deepeval/models/llms/litellm_model.py +92 -18
  111. deepeval/models/llms/local_model.py +114 -15
  112. deepeval/models/llms/ollama_model.py +97 -76
  113. deepeval/models/llms/openai_model.py +167 -310
  114. deepeval/models/llms/portkey_model.py +58 -16
  115. deepeval/models/llms/utils.py +5 -2
  116. deepeval/models/utils.py +60 -4
  117. deepeval/simulator/conversation_simulator.py +43 -0
  118. deepeval/simulator/template.py +13 -0
  119. deepeval/test_case/api.py +24 -45
  120. deepeval/test_case/arena_test_case.py +7 -2
  121. deepeval/test_case/conversational_test_case.py +55 -6
  122. deepeval/test_case/llm_test_case.py +60 -6
  123. deepeval/test_run/api.py +3 -0
  124. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -1
  125. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/RECORD +128 -132
  126. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  127. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  128. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  129. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
  130. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  131. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  132. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  133. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py

@@ -1,16 +1,17 @@
 import asyncio
 from typing import Optional, List, Tuple, Union

-from deepeval.metrics import BaseMultimodalMetric
+from deepeval.metrics import BaseMetric
 from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
 from deepeval.metrics.multimodal_metrics.image_helpfulness.template import (
     ImageHelpfulnessTemplate,
 )
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
-    check_mllm_test_case_params,
+    check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.multimodal_metrics.image_helpfulness.schema import (
@@ -23,7 +24,7 @@ from deepeval.utils import (
 )


-class ImageHelpfulnessMetric(BaseMultimodalMetric):
+class ImageHelpfulnessMetric(BaseMetric):

     _required_params: List[LLMTestCaseParams] = [
         LLMTestCaseParams.INPUT,
@@ -54,8 +55,14 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self, self.model
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -156,8 +163,14 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self, self.model
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -262,20 +275,13 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
             context_above, context_below
         )
         prompt = f"{instructions} \nImages: {image}"
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=ReasonScore)
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = self.model.generate(
-                    prompt, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )

     async def a_evaluate_image_helpfulness(
         self,
@@ -287,20 +293,13 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
             context_above, context_below
         )
         prompt = f"{instructions} \nImages: {image}"
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=ReasonScore)
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = await self.model.a_generate(
-                    prompt, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )

     def get_image_context(
         self, image_index: int, actual_output: List[Union[str, MLLMImage]]
@@ -335,7 +334,7 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
             if isinstance(element, MLLMImage)
         ]

-    def calculate_score(self, scores: List[float]):
+    def calculate_score(self, scores: List[float]) -> float:
         return sum(scores) / len(scores)

     def is_successful(self) -> bool:
@@ -344,7 +343,7 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
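The hunks above (and the two files that follow) collapse the repeated native-model / schema-fallback generation code into two shared helpers imported from deepeval/metrics/utils.py. Their implementation is not shown in this diff; the following is only a sketch of the synchronous helper, reconstructed from the inline blocks it replaces, so the actual code in utils.py may differ:

    def generate_with_schema_and_extract(
        metric, prompt, schema_cls, extract_schema, extract_json
    ):
        if metric.using_native_model:
            # Native models return (parsed schema, cost); the cost is tallied on the metric.
            res, cost = metric.model.generate(prompt, schema=schema_cls)
            metric.evaluation_cost += cost
            return extract_schema(res)
        try:
            # Custom models that accept a schema kwarg return the parsed object directly.
            res = metric.model.generate(prompt, schema=schema_cls)
            return extract_schema(res)
        except TypeError:
            # Models without schema support: generate raw text and parse the JSON.
            res = metric.model.generate(prompt)
            data = trimAndLoadJson(res, metric)  # deepeval's existing JSON-trimming parser
            return extract_json(data)

a_generate_with_schema_and_extract presumably mirrors this with await metric.model.a_generate(...).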
 
deepeval/metrics/multimodal_metrics/image_reference/image_reference.py

@@ -1,16 +1,17 @@
 import asyncio
 from typing import Optional, List, Tuple, Union

-from deepeval.metrics import BaseMultimodalMetric
+from deepeval.metrics import BaseMetric
 from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
 from deepeval.metrics.multimodal_metrics.image_reference.template import (
     ImageReferenceTemplate,
 )
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
-    check_mllm_test_case_params,
+    check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.multimodal_metrics.image_reference.schema import (
@@ -23,7 +24,7 @@ from deepeval.utils import (
 )


-class ImageReferenceMetric(BaseMultimodalMetric):
+class ImageReferenceMetric(BaseMetric):

     _required_params: List[LLMTestCaseParams] = [
         LLMTestCaseParams.INPUT,
@@ -54,8 +55,14 @@ class ImageReferenceMetric(BaseMultimodalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self, self.model
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -156,8 +163,14 @@ class ImageReferenceMetric(BaseMultimodalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self, self.model
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -262,20 +275,13 @@ class ImageReferenceMetric(BaseMultimodalMetric):
             context_above, context_below
         )
         prompt = f"{instructions} \nImages: {image}"
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=ReasonScore)
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = self.model.generate(
-                    prompt, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )

     async def a_evaluate_image_reference(
         self,
@@ -287,20 +293,13 @@ class ImageReferenceMetric(BaseMultimodalMetric):
             context_above, context_below
         )
         prompt = f"{instructions} \nImages: {image}"
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=ReasonScore)
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = await self.model.a_generate(
-                    prompt, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )

     def get_image_context(
         self, image_index: int, actual_output: List[Union[str, MLLMImage]]
@@ -335,7 +334,7 @@ class ImageReferenceMetric(BaseMultimodalMetric):
             if isinstance(element, MLLMImage)
         ]

-    def calculate_score(self, scores: List[float]):
+    def calculate_score(self, scores: List[float]) -> float:
         return sum(scores) / len(scores)

     def is_successful(self) -> bool:
@@ -344,7 +343,7 @@ class ImageReferenceMetric(BaseMultimodalMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
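Both image metrics above now subclass BaseMetric and validate a plain LLMTestCase through check_llm_test_case_params, passing test_case.multimodal instead of routing through the removed MLLM-specific check. A hedged usage sketch follows; the LLMTestCase constructor arguments and the mixed text/image actual_output are assumptions inferred from this diff (see get_image_context above), not a confirmed 3.7.6 API:

    from deepeval.test_case import LLMTestCase, MLLMImage
    from deepeval.metrics.multimodal_metrics.image_reference.image_reference import (
        ImageReferenceMetric,
    )

    # Hypothetical test case mixing text and an image in actual_output.
    test_case = LLMTestCase(
        input="Explain the attached revenue chart.",
        actual_output=[
            "Revenue grew steadily through Q3, as shown below.",
            MLLMImage(url="https://example.com/revenue.png"),
        ],
    )

    metric = ImageReferenceMetric()  # now a BaseMetric subclass
    metric.measure(test_case)        # validated via check_llm_test_case_params(..., test_case.multimodal)
    print(metric.score, metric.reason)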
 
deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py

@@ -3,7 +3,7 @@ from typing import Optional, List, Tuple, Union
 import math
 import textwrap

-from deepeval.metrics import BaseMultimodalMetric
+from deepeval.metrics import BaseMetric
 from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
 from deepeval.metrics.multimodal_metrics.text_to_image.template import (
     TextToImageTemplate,
@@ -14,9 +14,10 @@ from deepeval.utils import (
 )
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
-    check_mllm_test_case_params,
+    check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.multimodal_metrics.text_to_image.schema import ReasonScore
@@ -28,7 +29,7 @@ required_params: List[LLMTestCaseParams] = [
 ]


-class TextToImageMetric(BaseMultimodalMetric):
+class TextToImageMetric(BaseMetric):
     def __init__(
         self,
         model: Optional[Union[str, DeepEvalBaseLLM]] = None,
@@ -50,8 +51,14 @@ class TextToImageMetric(BaseMultimodalMetric):
         _show_indicator: bool = True,
         _in_component: bool = False,
     ) -> float:
-        check_mllm_test_case_params(
-            test_case, required_params, 0, 1, self, self.model
+        check_llm_test_case_params(
+            test_case,
+            required_params,
+            0,
+            1,
+            self,
+            self.model,
+            test_case.multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -97,7 +104,7 @@ class TextToImageMetric(BaseMultimodalMetric):
                 steps=[
                     f"Semantic Consistency Scores:\n{self.SC_scores}",
                     f"Semantic Consistency Reasoning:\n{self.SC_reasoning}",
-                    f"Perceptual Quality Scores:\n{self.SC_scores}",
+                    f"Perceptual Quality Scores:\n{self.PQ_scores}",
                     f"Perceptual Quality Reasoning:\n{self.PQ_reasoning}",
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
@@ -110,8 +117,14 @@ class TextToImageMetric(BaseMultimodalMetric):
         _show_indicator: bool = True,
         _in_component: bool = False,
     ) -> float:
-        check_mllm_test_case_params(
-            test_case, required_params, 0, 1, self, self.model
+        check_llm_test_case_params(
+            test_case,
+            required_params,
+            0,
+            1,
+            self,
+            self.model,
+            test_case.multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -150,7 +163,7 @@ class TextToImageMetric(BaseMultimodalMetric):
                 steps=[
                     f"Semantic Consistency Scores:\n{self.SC_scores}",
                     f"Semantic Consistency Reasoning:\n{self.SC_reasoning}",
-                    f"Perceptual Quality Scores:\n{self.SC_scores}",
+                    f"Perceptual Quality Scores:\n{self.PQ_scores}",
                     f"Perceptual Quality Reasoning:\n{self.PQ_reasoning}",
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
@@ -174,8 +187,7 @@ class TextToImageMetric(BaseMultimodalMetric):
         text_prompt: str,
         actual_image_output: MLLMImage,
     ) -> Tuple[List[int], str]:
-        images: List[MLLMImage] = []
-        images.append(actual_image_output)
+        images: List[MLLMImage] = [actual_image_output]
         prompt = f"""
            {
            TextToImageTemplate.generate_semantic_consistency_evaluation_results(
@@ -185,28 +197,20 @@ class TextToImageMetric(BaseMultimodalMetric):
            Images:
            {images}
        """
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, ReasonScore)
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = await self.model.a_generate(
-                    prompt, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = await self.model.a_generate(prompt, input_text=prompt)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )

     def _evaluate_semantic_consistency(
         self,
         text_prompt: str,
         actual_image_output: MLLMImage,
     ) -> Tuple[List[int], str]:
-        images: List[MLLMImage] = []
-        images.append(actual_image_output)
+        images: List[MLLMImage] = [actual_image_output]
         prompt = f"""
            {
            TextToImageTemplate.generate_semantic_consistency_evaluation_results(
@@ -216,20 +220,13 @@ class TextToImageMetric(BaseMultimodalMetric):
            Images:
            {images}
        """
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, ReasonScore)
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = self.model.generate(
-                    prompt, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )

     async def _a_evaluate_perceptual_quality(
         self, actual_image_output: MLLMImage
@@ -242,20 +239,13 @@ class TextToImageMetric(BaseMultimodalMetric):
            Images:
            {images}
        """
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, ReasonScore)
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = await self.model.a_generate(
-                    prompt, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )

     def _evaluate_perceptual_quality(
         self, actual_image_output: MLLMImage
@@ -268,22 +258,15 @@ class TextToImageMetric(BaseMultimodalMetric):
            Images:
            {images}
        """
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt + images, ReasonScore)
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = self.model.generate(
-                    prompt, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )

-    def _calculate_score(self) -> List[str]:
+    def _calculate_score(self) -> float:
         min_SC_score = min(self.SC_scores)
         min_PQ_score = min(self.PQ_scores)
         return math.sqrt(min_SC_score * min_PQ_score) / 10
@@ -293,14 +276,12 @@ class TextToImageMetric(BaseMultimodalMetric):
             self.success = False
         else:
             try:
-                self.score >= self.threshold
-            except:
+                self.success = self.score >= self.threshold
+            except TypeError:
                 self.success = False
         return self.success

-    def _generate_reason(
-        self,
-    ) -> Tuple[List[float], str]:
+    def _generate_reason(self) -> str:
         return textwrap.dedent(
             f"""
             The overall score is {self.score:.2f} because the lowest score from semantic consistency was {min(self.SC_scores)}
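The _calculate_score change above only fixes the return annotation; the formula itself is unchanged: the geometric mean of the lowest semantic-consistency score and the lowest perceptual-quality score, divided by 10 to land in the 0-1 range. A quick worked example with invented scores:

    import math

    SC_scores = [8, 9, 7]  # hypothetical semantic-consistency scores (0-10)
    PQ_scores = [9, 6]     # hypothetical perceptual-quality scores (0-10)

    score = math.sqrt(min(SC_scores) * min(PQ_scores)) / 10
    print(round(score, 3))  # sqrt(7 * 6) / 10 ≈ 0.648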