deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/main.py +2022 -759
  3. deepeval/cli/utils.py +208 -36
  4. deepeval/config/dotenv_handler.py +19 -0
  5. deepeval/config/settings.py +675 -245
  6. deepeval/config/utils.py +9 -1
  7. deepeval/dataset/api.py +23 -1
  8. deepeval/dataset/golden.py +106 -21
  9. deepeval/evaluate/evaluate.py +0 -3
  10. deepeval/evaluate/execute.py +162 -315
  11. deepeval/evaluate/utils.py +6 -30
  12. deepeval/key_handler.py +124 -51
  13. deepeval/metrics/__init__.py +0 -4
  14. deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
  15. deepeval/metrics/answer_relevancy/template.py +102 -179
  16. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  17. deepeval/metrics/arena_g_eval/template.py +17 -1
  18. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  19. deepeval/metrics/argument_correctness/template.py +19 -2
  20. deepeval/metrics/base_metric.py +19 -41
  21. deepeval/metrics/bias/bias.py +102 -108
  22. deepeval/metrics/bias/template.py +14 -2
  23. deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
  24. deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
  26. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  27. deepeval/metrics/conversation_completeness/template.py +23 -3
  28. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  29. deepeval/metrics/conversational_dag/nodes.py +66 -123
  30. deepeval/metrics/conversational_dag/templates.py +16 -0
  31. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  32. deepeval/metrics/dag/dag.py +10 -0
  33. deepeval/metrics/dag/nodes.py +63 -126
  34. deepeval/metrics/dag/templates.py +14 -0
  35. deepeval/metrics/exact_match/exact_match.py +9 -1
  36. deepeval/metrics/faithfulness/faithfulness.py +82 -136
  37. deepeval/metrics/g_eval/g_eval.py +93 -79
  38. deepeval/metrics/g_eval/template.py +18 -1
  39. deepeval/metrics/g_eval/utils.py +7 -6
  40. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  41. deepeval/metrics/goal_accuracy/template.py +21 -3
  42. deepeval/metrics/hallucination/hallucination.py +60 -75
  43. deepeval/metrics/hallucination/template.py +13 -0
  44. deepeval/metrics/indicator.py +11 -10
  45. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  46. deepeval/metrics/json_correctness/template.py +10 -0
  47. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  48. deepeval/metrics/knowledge_retention/schema.py +9 -3
  49. deepeval/metrics/knowledge_retention/template.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +72 -43
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
  52. deepeval/metrics/mcp/schema.py +4 -0
  53. deepeval/metrics/mcp/template.py +59 -0
  54. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  55. deepeval/metrics/mcp_use_metric/template.py +12 -0
  56. deepeval/metrics/misuse/misuse.py +77 -97
  57. deepeval/metrics/misuse/template.py +15 -0
  58. deepeval/metrics/multimodal_metrics/__init__.py +0 -1
  59. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
  60. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
  61. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
  62. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
  63. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
  64. deepeval/metrics/non_advice/non_advice.py +79 -105
  65. deepeval/metrics/non_advice/template.py +12 -0
  66. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  67. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  68. deepeval/metrics/pii_leakage/template.py +14 -0
  69. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  70. deepeval/metrics/plan_adherence/template.py +11 -0
  71. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  72. deepeval/metrics/plan_quality/template.py +9 -0
  73. deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
  74. deepeval/metrics/prompt_alignment/template.py +12 -0
  75. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  76. deepeval/metrics/role_adherence/template.py +14 -0
  77. deepeval/metrics/role_violation/role_violation.py +75 -108
  78. deepeval/metrics/role_violation/template.py +12 -0
  79. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  80. deepeval/metrics/step_efficiency/template.py +11 -0
  81. deepeval/metrics/summarization/summarization.py +115 -183
  82. deepeval/metrics/summarization/template.py +19 -0
  83. deepeval/metrics/task_completion/task_completion.py +67 -73
  84. deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
  85. deepeval/metrics/tool_use/schema.py +4 -0
  86. deepeval/metrics/tool_use/template.py +16 -2
  87. deepeval/metrics/tool_use/tool_use.py +72 -94
  88. deepeval/metrics/topic_adherence/schema.py +4 -0
  89. deepeval/metrics/topic_adherence/template.py +21 -1
  90. deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +3 -3
  94. deepeval/metrics/turn_contextual_precision/template.py +9 -2
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
  96. deepeval/metrics/turn_contextual_recall/schema.py +3 -3
  97. deepeval/metrics/turn_contextual_recall/template.py +8 -1
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
  99. deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
  100. deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
  102. deepeval/metrics/turn_faithfulness/schema.py +1 -1
  103. deepeval/metrics/turn_faithfulness/template.py +8 -1
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +161 -91
  108. deepeval/models/__init__.py +2 -0
  109. deepeval/models/base_model.py +44 -6
  110. deepeval/models/embedding_models/azure_embedding_model.py +34 -12
  111. deepeval/models/embedding_models/local_embedding_model.py +22 -7
  112. deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
  113. deepeval/models/embedding_models/openai_embedding_model.py +3 -2
  114. deepeval/models/llms/__init__.py +2 -0
  115. deepeval/models/llms/amazon_bedrock_model.py +229 -73
  116. deepeval/models/llms/anthropic_model.py +143 -48
  117. deepeval/models/llms/azure_model.py +169 -95
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +82 -35
  120. deepeval/models/llms/gemini_model.py +126 -67
  121. deepeval/models/llms/grok_model.py +128 -65
  122. deepeval/models/llms/kimi_model.py +129 -87
  123. deepeval/models/llms/litellm_model.py +94 -18
  124. deepeval/models/llms/local_model.py +115 -16
  125. deepeval/models/llms/ollama_model.py +97 -76
  126. deepeval/models/llms/openai_model.py +169 -311
  127. deepeval/models/llms/portkey_model.py +58 -16
  128. deepeval/models/llms/utils.py +5 -2
  129. deepeval/models/retry_policy.py +10 -5
  130. deepeval/models/utils.py +56 -4
  131. deepeval/simulator/conversation_simulator.py +49 -2
  132. deepeval/simulator/template.py +16 -1
  133. deepeval/synthesizer/synthesizer.py +19 -17
  134. deepeval/test_case/api.py +24 -45
  135. deepeval/test_case/arena_test_case.py +7 -2
  136. deepeval/test_case/conversational_test_case.py +55 -6
  137. deepeval/test_case/llm_test_case.py +60 -6
  138. deepeval/test_run/api.py +3 -0
  139. deepeval/test_run/test_run.py +6 -1
  140. deepeval/utils.py +26 -0
  141. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
  142. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
  143. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  144. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  145. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  146. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
  147. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  148. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
  149. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
  150. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386

@@ -1,386 +0,0 @@
- """LLM evaluated metric based on the GEval framework: https://arxiv.org/pdf/2303.16634.pdf"""
-
- from typing import Optional, List, Tuple, Type, Union
- from deepeval.models import DeepEvalBaseLLM
- from deepeval.metrics import BaseMultimodalMetric
- from deepeval.test_case import (
-     LLMTestCaseParams,
-     LLMTestCase,
- )
- from deepeval.metrics.multimodal_metrics.multimodal_g_eval.template import (
-     MultimodalGEvalTemplate,
- )
- from deepeval.metrics.multimodal_metrics.multimodal_g_eval.schema import (
-     Steps,
-     ReasonScore,
- )
- from deepeval.utils import get_or_create_event_loop, prettify_list
- from deepeval.metrics.indicator import metric_progress_indicator
- from deepeval.metrics.utils import (
-     initialize_model,
-     check_mllm_test_case_params,
-     construct_verbose_logs,
-     trimAndLoadJson,
- )
- from deepeval.metrics.multimodal_metrics.multimodal_g_eval.utils import (
-     construct_test_case_list,
-     no_multimodal_log_prob_support,
-     construct_g_eval_params_string,
- )
- from deepeval.metrics.g_eval.utils import (
-     Rubric,
-     format_rubrics,
-     calculate_weighted_summed_score,
-     validate_and_sort_rubrics,
-     validate_criteria_and_evaluation_steps,
-     number_evaluation_steps,
-     get_score_range,
- )
-
-
- class MultimodalGEval(BaseMultimodalMetric):
-     def __init__(
-         self,
-         name: str,
-         evaluation_params: List[LLMTestCaseParams],
-         criteria: Optional[str] = None,
-         evaluation_steps: Optional[List[str]] = None,
-         rubric: Optional[List[Rubric]] = None,
-         model: Optional[Union[str, DeepEvalBaseLLM]] = None,
-         threshold: float = 0.5,
-         top_logprobs: int = 20,
-         async_mode: bool = True,
-         strict_mode: bool = False,
-         verbose_mode: bool = False,
-         evaluation_template: Type[
-             MultimodalGEvalTemplate
-         ] = MultimodalGEvalTemplate,
-         _include_g_eval_suffix: bool = True,
-     ):
-         validate_criteria_and_evaluation_steps(criteria, evaluation_steps)
-         self.name = name
-         self.evaluation_params = evaluation_params
-         self.criteria = criteria
-         self.rubric = validate_and_sort_rubrics(rubric)
-         self.model, self.using_native_model = initialize_model(model)
-         self.evaluation_model = self.model.get_model_name()
-         self.evaluation_steps = (
-             evaluation_steps
-             if evaluation_steps and len(evaluation_steps) > 0
-             else None
-         )
-         self.threshold = 1 if strict_mode else threshold
-         self.top_logprobs = top_logprobs
-         self.strict_mode = strict_mode
-         self.async_mode = async_mode
-         self.verbose_mode = verbose_mode
-         self._include_g_eval_suffix = _include_g_eval_suffix
-         self.evaluation_template = evaluation_template
-
-     def measure(
-         self,
-         test_case: LLMTestCase,
-         _show_indicator: bool = True,
-         _in_component: bool = False,
-         _log_metric_to_confident: bool = True,
-         _additional_context: Optional[str] = None,
-     ) -> float:
-
-         check_mllm_test_case_params(
-             test_case, self.evaluation_params, None, None, self, self.model
-         )
-
-         self.evaluation_cost = 0 if self.using_native_model else None
-         with metric_progress_indicator(
-             self, _show_indicator=_show_indicator, _in_component=_in_component
-         ):
-             if self.async_mode:
-                 loop = get_or_create_event_loop()
-                 loop.run_until_complete(
-                     self.a_measure(
-                         test_case,
-                         _show_indicator=False,
-                         _in_component=_in_component,
-                         _log_metric_to_confident=_log_metric_to_confident,
-                         _additional_context=_additional_context,
-                     )
-                 )
-             else:
-                 self.evaluation_steps: List[str] = (
-                     self._generate_evaluation_steps()
-                 )
-                 g_score, reason = self._evaluate(
-                     test_case, _additional_context=_additional_context
-                 )
-                 self.reason = reason
-                 self.score = float(g_score) / 10
-                 self.score = (
-                     0
-                     if self.strict_mode and self.score < self.threshold
-                     else self.score
-                 )
-                 self.success = self.score >= self.threshold
-                 self.verbose_logs = construct_verbose_logs(
-                     self,
-                     steps=[
-                         f"Criteria:\n{self.criteria}",
-                         f"Evaluation Steps:\n{prettify_list(self.evaluation_steps)}",
-                         f"Rubric:\n{format_rubrics(self.rubric)}",
-                         f"Score: {self.score}\nReason: {self.reason}",
-                     ],
-                 )
-
-             return self.score
-
-     async def a_measure(
-         self,
-         test_case: LLMTestCase,
-         _show_indicator: bool = True,
-         _in_component: bool = False,
-         _additional_context: Optional[str] = None,
-         _log_metric_to_confident: bool = True,
-     ) -> float:
-
-         check_mllm_test_case_params(
-             test_case, self.evaluation_params, None, None, self, self.model
-         )
-
-         self.evaluation_cost = 0 if self.using_native_model else None
-         with metric_progress_indicator(
-             self,
-             async_mode=True,
-             _show_indicator=_show_indicator,
-             _in_component=_in_component,
-         ):
-             self.evaluation_steps: List[str] = (
-                 await self._a_generate_evaluation_steps()
-             )
-             g_score, reason = await self._a_evaluate(
-                 test_case, _additional_context=_additional_context
-             )
-             self.reason = reason
-             self.score = (
-                 float(g_score) / 10 if not self.strict_mode else int(g_score)
-             )
-             self.success = self.score >= self.threshold
-             self.verbose_logs = construct_verbose_logs(
-                 self,
-                 steps=[
-                     f"Criteria:\n{self.criteria}",
-                     f"Evaluation Steps:\n{prettify_list(self.evaluation_steps)}",
-                     f"Rubric:\n{format_rubrics(self.rubric)}",
-                     f"Score: {self.score}\nReason: {self.reason}",
-                 ],
-             )
-             return self.score
-
-     async def _a_generate_evaluation_steps(self) -> List[str]:
-         if self.evaluation_steps:
-             return self.evaluation_steps
-
-         g_eval_params_str = construct_g_eval_params_string(
-             self.evaluation_params
-         )
-         prompt = self.evaluation_template.generate_evaluation_steps(
-             criteria=self.criteria, parameters=g_eval_params_str
-         )
-         if self.using_native_model:
-             res, cost = await self.model.a_generate(prompt, schema=Steps)
-             self.evaluation_cost += cost
-             return res.steps
-         else:
-             try:
-                 res: Steps = await self.model.a_generate(prompt, schema=Steps)
-                 return res.steps
-             except TypeError:
-                 res = await self.model.a_generate(prompt)
-                 data = trimAndLoadJson(res, self)
-                 return data["steps"]
-
-     def _generate_evaluation_steps(self) -> List[str]:
-         if self.evaluation_steps:
-             return self.evaluation_steps
-
-         g_eval_params_str = construct_g_eval_params_string(
-             self.evaluation_params
-         )
-         prompt = self.evaluation_template.generate_evaluation_steps(
-             criteria=self.criteria, parameters=g_eval_params_str
-         )
-         if self.using_native_model:
-             res, cost = self.model.generate(prompt, schema=Steps)
-             self.evaluation_cost += cost
-             return res.steps
-         else:
-             try:
-                 res: Steps = self.model.generate(prompt, schema=Steps)
-                 return res.steps
-             except TypeError:
-                 res = self.model.generate(prompt)
-                 data = trimAndLoadJson(res, self)
-                 return data["steps"]
-
-     async def _a_evaluate(
-         self, test_case: LLMTestCase, _additional_context: Optional[str] = None
-     ) -> Tuple[Union[int, float], str]:
-         test_case_list = construct_test_case_list(
-             self.evaluation_params, test_case
-         )
-         g_eval_params_str = construct_g_eval_params_string(
-             self.evaluation_params
-         )
-
-         if not self.strict_mode:
-             rubric_str = format_rubrics(self.rubric) if self.rubric else None
-             prompt = self.evaluation_template.generate_evaluation_results(
-                 evaluation_steps=number_evaluation_steps(self.evaluation_steps),
-                 test_case_list=test_case_list,
-                 parameters=g_eval_params_str,
-                 rubric=rubric_str,
-                 score_range=get_score_range(self.rubric),
-                 _additional_context=_additional_context,
-             )
-         else:
-             prompt = (
-                 self.evaluation_template.generate_strict_evaluation_results(
-                     evaluation_steps=number_evaluation_steps(
-                         self.evaluation_steps
-                     ),
-                     test_case_list=test_case_list,
-                     parameters=g_eval_params_str,
-                     _additional_context=_additional_context,
-                 )
-             )
-         try:
-             # don't use log probabilities for unsupported gpt models
-             if no_multimodal_log_prob_support(self.model):
-                 raise AttributeError("log_probs unsupported.")
-
-             # Don't have to check for using native model
-             # since generate raw response only exist for deepeval's native model
-             res, cost = await self.model.a_generate_raw_response(
-                 prompt, top_logprobs=self.top_logprobs
-             )
-             self.evaluation_cost += cost
-             data = trimAndLoadJson(res.choices[0].message.content, self)
-
-             reason = data["reason"]
-             score = data["score"]
-             if self.strict_mode:
-                 return score, reason
-
-             try:
-                 weighted_summed_score = calculate_weighted_summed_score(
-                     score, res
-                 )
-                 return weighted_summed_score, reason
-             except Exception:
-                 return score, reason
-         except (
-             AttributeError
-         ):  # This catches the case where a_generate_raw_response doesn't exist.
-             if self.using_native_model:
-                 res, cost = await self.model.a_generate(prompt)
-                 self.evaluation_cost += cost
-                 data = trimAndLoadJson(res, self)
-                 return data["score"], data["reason"]
-             else:
-                 try:
-                     res: ReasonScore = await self.model.a_generate(
-                         prompt, schema=ReasonScore
-                     )
-                     return res.score, res.reason
-                 except TypeError:
-                     res = await self.model.a_generate(prompt)
-                     data = trimAndLoadJson(res, self)
-                     return data["score"], data["reason"]
-
-     def _evaluate(
-         self, test_case: LLMTestCase, _additional_context: Optional[str] = None
-     ) -> Tuple[Union[int, float], str]:
-         test_case_list = construct_test_case_list(
-             self.evaluation_params, test_case
-         )
-         g_eval_params_str = construct_g_eval_params_string(
-             self.evaluation_params
-         )
-
-         if not self.strict_mode:
-             rubric_str = format_rubrics(self.rubric) if self.rubric else None
-             prompt = self.evaluation_template.generate_evaluation_results(
-                 evaluation_steps=number_evaluation_steps(self.evaluation_steps),
-                 test_case_list=test_case_list,
-                 parameters=g_eval_params_str,
-                 rubric=rubric_str,
-                 score_range=get_score_range(self.rubric),
-                 _additional_context=_additional_context,
-             )
-         else:
-             prompt = (
-                 self.evaluation_template.generate_strict_evaluation_results(
-                     evaluation_steps=number_evaluation_steps(
-                         self.evaluation_steps
-                     ),
-                     test_case_list=test_case_list,
-                     parameters=g_eval_params_str,
-                     _additional_context=_additional_context,
-                 )
-             )
-
-         try:
-             # don't use log probabilities for unsupported gpt models
-             if no_multimodal_log_prob_support(self.model):
-                 raise AttributeError("log_probs unsupported.")
-
-             res, cost = self.model.generate_raw_response(
-                 prompt, top_logprobs=self.top_logprobs
-             )
-             self.evaluation_cost += cost
-             data = trimAndLoadJson(res.choices[0].message.content, self)
-
-             reason = data["reason"]
-             score = data["score"]
-             if self.strict_mode:
-                 return score, reason
-
-             try:
-                 weighted_summed_score = calculate_weighted_summed_score(
-                     score, res
-                 )
-                 return weighted_summed_score, reason
-             except Exception:
-                 return score, reason
-         except AttributeError:
-             # This catches the case where a_generate_raw_response doesn't exist.
-             if self.using_native_model:
-                 res, cost = self.model.generate(prompt)
-                 self.evaluation_cost += cost
-                 data = trimAndLoadJson(res, self)
-                 return data["score"], data["reason"]
-             else:
-                 try:
-                     res: ReasonScore = self.model.generate(
-                         prompt, schema=ReasonScore
-                     )
-                     return res.score, res.reason
-                 except TypeError:
-                     res = self.model.generate(prompt)
-                     data = trimAndLoadJson(res, self)
-                     return data["score"], data["reason"]
-
-     def is_successful(self) -> bool:
-         if self.error is not None:
-             self.success = False
-         else:
-             try:
-                 self.success = self.score >= self.threshold
-             except Exception:
-                 self.success = False
-         return self.success
-
-     @property
-     def __name__(self):
-         if self._include_g_eval_suffix:
-             return f"{self.name} [GEval]"
-         else:
-             return self.name
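For context, the removed MultimodalGEval was constructed with a metric name, the LLMTestCaseParams to judge, and either criteria or explicit evaluation_steps, then scored a test case via measure(). The sketch below is illustrative only: the metric name, criteria text, and test case values are hypothetical, and it assumes the 3.7.5 import path of the file deleted above.

from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.metrics.multimodal_metrics.multimodal_g_eval.multimodal_g_eval import (
    MultimodalGEval,
)

# Hypothetical criteria and values, shown only to illustrate the removed API.
metric = MultimodalGEval(
    name="Image Description Quality",
    evaluation_params=[
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
    ],
    criteria="Judge whether the actual output accurately describes the image(s) in the input.",
)

test_case = LLMTestCase(
    # In 3.7.5, multimodal content (e.g. MLLMImage) could also be passed in these fields.
    input="Describe the attached product photo.",
    actual_output="A red ceramic mug on a wooden table.",
)

score = metric.measure(test_case)  # typically a float in [0, 1]
print(score, metric.reason)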
deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11

@@ -1,11 +0,0 @@
- from typing import List
- from pydantic import BaseModel
-
-
- class ReasonScore(BaseModel):
-     reason: str
-     score: float
-
-
- class Steps(BaseModel):
-     steps: List[str]
deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133

@@ -1,133 +0,0 @@
- from typing import List, Optional, Tuple
- import textwrap
-
-
- class MultimodalGEvalTemplate:
-
-     @staticmethod
-     def generate_evaluation_steps(parameters: str, criteria: str):
-         return textwrap.dedent(
-             f"""Given an evaluation criteria which outlines how you should judge the {parameters}, generate 3-4 concise evaluation steps based on the criteria below. You MUST make it clear how to evaluate {parameters} in relation to one another.
-
-             Evaluation Criteria:
-             {criteria}
-
-             **
-             IMPORTANT: Please make sure to only return in JSON format, with the "steps" key as a list of strings. No words or explanation is needed.
-             Example JSON:
-             {{
-                 "steps": <list_of_strings>
-             }}
-             **
-
-             JSON:
-             """
-         )
-
-     @staticmethod
-     def generate_evaluation_results(
-         evaluation_steps: str,
-         test_case_list: List,
-         parameters: str,
-         rubric: Optional[str] = None,
-         score_range: Tuple[int, int] = (0, 10),
-         _additional_context: Optional[str] = None,
-     ):
-         rubric_text = f"Rubric:\n{rubric}\n" if rubric else ""
-         dependencies = (
-             "evaluation steps and rubric" if rubric else "evaluation steps"
-         )
-         score_explanation = (
-             "based on the rubric provided"
-             if rubric
-             else f"with {score_range[1]} indicating strong alignment with the evaluation steps and {score_range[0]} indicating no alignment"
-         )
-         reasoning_expectation = (
-             "Be specific and grounded in the evaluation steps and rubric."
-             if rubric
-             else "Be specific and grounded in the evaluation steps."
-         )
-         additional_context = (
-             f"\n\nAdditional Context:\n{_additional_context}\n"
-             if _additional_context
-             else ""
-         )
-
-         return textwrap.dedent(
-             f"""You are an evaluator. Given the following {dependencies}, assess the response below and return a JSON object with two fields:
-
-             - `"score"`: an integer between {score_range[0]} and {score_range[1]}, {score_explanation}.
-             - `"reason"`: a brief explanation for why the score was given. This must mention specific strengths or shortcomings, referencing relevant details from the input. Do **not** quote the score itself in the explanation.
-
-             Your explanation should:
-             - {reasoning_expectation}
-             - Mention key details from the test case parameters.
-             - Be concise, clear, and focused on the evaluation logic.
-
-             Only return valid JSON. Do **not** include any extra commentary or text.
-
-             ---
-
-             Evaluation Steps:
-             {evaluation_steps}
-
-             {rubric_text}
-             Test Case:
-             ************************
-
-             {test_case_list}
-
-             ************************
-             \n\n\n
-             Parameters:
-             {parameters}
-             {additional_context}
-
-             ---
-             **Example JSON:**
-             {{
-                 "reason": "your concise and informative reason here",
-                 "score": {score_range[0]}
-             }}
-
-             JSON:
-             """
-         )
-
-     @staticmethod
-     def generate_strict_evaluation_results(
-         evaluation_steps: str,
-         test_case_list: List,
-         parameters: str,
-         _additional_context: Optional[str] = None,
-     ):
-         additional_context = (
-             f"\n\nAdditional Context:\n{_additional_context}\n"
-             if _additional_context
-             else ""
-         )
-         return textwrap.dedent(
-             f"""Given the evaluation steps, return a JSON with two keys: 1) a `score` key that is STRICTLY EITHER 1 (follows the criteria 100% outlined in the evaluation steps), OR 0 (does not follow the criteria), and 2) a `reason` key, a reason for the given score, but DO NOT QUOTE THE SCORE in your reason. Please mention specific information from {parameters} in your reason, but be very concise with it!
-
-             Evaluation Steps:
-             {evaluation_steps}
-             ************************
-
-             {test_case_list}
-
-             ************************
-             {additional_context}
-
-             **
-             IMPORTANT: Please make sure to only return in JSON format, with the "score" and "reason" key. No words or explanation is needed.
-
-             Example JSON:
-             {{
-                 "reason": "The text does not follow the evaluation steps provided.",
-                 "score": 0
-             }}
-             **
-
-             JSON:
-             """
-         )
deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68

@@ -1,68 +0,0 @@
- from deepeval.test_case import LLMTestCaseParams, LLMTestCase, ToolCall
- from deepeval.test_case import MLLMImage
- from deepeval.models.llms.openai_model import (
-     unsupported_log_probs_multimodal_gpt_models,
- )
- from deepeval.models import DeepEvalBaseLLM, GPTModel
-
- from typing import List, Union
-
-
- G_EVAL_PARAMS = {
-     LLMTestCaseParams.INPUT: "Input",
-     LLMTestCaseParams.ACTUAL_OUTPUT: "Actual Output",
-     LLMTestCaseParams.EXPECTED_OUTPUT: "Expected Output",
-     LLMTestCaseParams.CONTEXT: "Context",
-     LLMTestCaseParams.RETRIEVAL_CONTEXT: "Retrieval Context",
-     LLMTestCaseParams.EXPECTED_TOOLS: "Expected Tools",
-     LLMTestCaseParams.TOOLS_CALLED: "Tools Called",
- }
-
-
- def construct_g_eval_params_string(
-     mllm_test_case_params: List[LLMTestCaseParams],
- ):
-     g_eval_params = [G_EVAL_PARAMS[param] for param in mllm_test_case_params]
-     if len(g_eval_params) == 1:
-         g_eval_params_str = g_eval_params[0]
-     elif len(g_eval_params) == 2:
-         g_eval_params_str = " and ".join(g_eval_params)
-     else:
-         g_eval_params_str = (
-             ", ".join(g_eval_params[:-1]) + ", and " + g_eval_params[-1]
-         )
-
-     return g_eval_params_str
-
-
- def construct_test_case_list(
-     evaluation_params: List[LLMTestCaseParams], test_case: LLMTestCase
- ) -> List[Union[str, MLLMImage]]:
-     from deepeval.utils import convert_to_multi_modal_array
-
-     test_case_list = []
-     for param in evaluation_params:
-         test_case_param_list = [f"\n\n\n{G_EVAL_PARAMS[param]}:\n"]
-         value = convert_to_multi_modal_array(getattr(test_case, param.value))
-         for v in value:
-             if isinstance(v, ToolCall):
-                 test_case_param_list.append(repr(v))
-             else:
-                 test_case_param_list.append(v)
-         test_case_list.extend(test_case_param_list)
-     return test_case_list
-
-
- def no_multimodal_log_prob_support(model: Union[str, DeepEvalBaseLLM]):
-     if (
-         isinstance(model, str)
-         and model in unsupported_log_probs_multimodal_gpt_models
-     ):
-         return True
-     elif (
-         isinstance(model, GPTModel)
-         and model.get_model_name()
-         in unsupported_log_probs_multimodal_gpt_models
-     ):
-         return True
-     return False
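As a small, self-contained illustration of the joining rule in the removed construct_g_eval_params_string helper: the behavior below is taken from the deleted code above, while the function and variable names are my own and not part of deepeval.

from typing import List

def join_param_names(names: List[str]) -> str:
    # Same joining rule as the removed construct_g_eval_params_string:
    # one name as-is, two names joined with "and", three or more as an
    # Oxford-comma list.
    if len(names) == 1:
        return names[0]
    if len(names) == 2:
        return " and ".join(names)
    return ", ".join(names[:-1]) + ", and " + names[-1]

print(join_param_names(["Input"]))                                        # Input
print(join_param_names(["Input", "Actual Output"]))                       # Input and Actual Output
print(join_param_names(["Input", "Actual Output", "Retrieval Context"]))  # Input, Actual Output, and Retrieval Context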