deepeval 3.6.6__py3-none-any.whl → 3.6.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. deepeval/_version.py +1 -1
  2. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  3. deepeval/cli/main.py +42 -0
  4. deepeval/confident/api.py +1 -0
  5. deepeval/config/settings.py +22 -4
  6. deepeval/constants.py +8 -1
  7. deepeval/dataset/dataset.py +2 -11
  8. deepeval/dataset/utils.py +1 -1
  9. deepeval/errors.py +20 -2
  10. deepeval/evaluate/evaluate.py +5 -1
  11. deepeval/evaluate/execute.py +811 -248
  12. deepeval/evaluate/types.py +1 -0
  13. deepeval/evaluate/utils.py +33 -119
  14. deepeval/integrations/crewai/__init__.py +7 -1
  15. deepeval/integrations/crewai/handler.py +1 -1
  16. deepeval/integrations/crewai/subs.py +51 -0
  17. deepeval/integrations/crewai/tool.py +71 -0
  18. deepeval/integrations/crewai/wrapper.py +45 -5
  19. deepeval/integrations/llama_index/__init__.py +0 -4
  20. deepeval/integrations/llama_index/handler.py +20 -21
  21. deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
  22. deepeval/metrics/__init__.py +13 -0
  23. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  24. deepeval/metrics/api.py +281 -0
  25. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  26. deepeval/metrics/base_metric.py +1 -0
  27. deepeval/metrics/bias/bias.py +12 -3
  28. deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
  29. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  30. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  31. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  32. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  33. deepeval/metrics/conversational_dag/nodes.py +12 -4
  34. deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  35. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
  36. deepeval/metrics/dag/dag.py +12 -0
  37. deepeval/metrics/dag/nodes.py +12 -4
  38. deepeval/metrics/dag/schema.py +1 -1
  39. deepeval/metrics/dag/templates.py +2 -2
  40. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  41. deepeval/metrics/g_eval/g_eval.py +11 -0
  42. deepeval/metrics/goal_accuracy/__init__.py +1 -0
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
  44. deepeval/metrics/goal_accuracy/schema.py +17 -0
  45. deepeval/metrics/goal_accuracy/template.py +235 -0
  46. deepeval/metrics/hallucination/hallucination.py +20 -9
  47. deepeval/metrics/indicator.py +8 -2
  48. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  49. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +20 -2
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
  52. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
  53. deepeval/metrics/misuse/misuse.py +12 -1
  54. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  55. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  56. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  57. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  58. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  59. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
  60. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  61. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  62. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  63. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  64. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  65. deepeval/metrics/non_advice/non_advice.py +12 -0
  66. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  67. deepeval/metrics/plan_adherence/__init__.py +1 -0
  68. deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
  69. deepeval/metrics/plan_adherence/schema.py +11 -0
  70. deepeval/metrics/plan_adherence/template.py +170 -0
  71. deepeval/metrics/plan_quality/__init__.py +1 -0
  72. deepeval/metrics/plan_quality/plan_quality.py +292 -0
  73. deepeval/metrics/plan_quality/schema.py +11 -0
  74. deepeval/metrics/plan_quality/template.py +101 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
  76. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  77. deepeval/metrics/role_violation/role_violation.py +12 -0
  78. deepeval/metrics/step_efficiency/__init__.py +1 -0
  79. deepeval/metrics/step_efficiency/schema.py +11 -0
  80. deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
  81. deepeval/metrics/step_efficiency/template.py +256 -0
  82. deepeval/metrics/summarization/summarization.py +12 -1
  83. deepeval/metrics/task_completion/task_completion.py +4 -0
  84. deepeval/metrics/tool_correctness/schema.py +6 -0
  85. deepeval/metrics/tool_correctness/template.py +88 -0
  86. deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
  87. deepeval/metrics/tool_use/__init__.py +1 -0
  88. deepeval/metrics/tool_use/schema.py +19 -0
  89. deepeval/metrics/tool_use/template.py +220 -0
  90. deepeval/metrics/tool_use/tool_use.py +458 -0
  91. deepeval/metrics/topic_adherence/__init__.py +1 -0
  92. deepeval/metrics/topic_adherence/schema.py +16 -0
  93. deepeval/metrics/topic_adherence/template.py +162 -0
  94. deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
  95. deepeval/metrics/toxicity/toxicity.py +12 -0
  96. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  97. deepeval/models/embedding_models/azure_embedding_model.py +37 -36
  98. deepeval/models/embedding_models/local_embedding_model.py +30 -32
  99. deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
  100. deepeval/models/embedding_models/openai_embedding_model.py +22 -31
  101. deepeval/models/llms/grok_model.py +1 -1
  102. deepeval/models/llms/openai_model.py +2 -0
  103. deepeval/openai/__init__.py +14 -32
  104. deepeval/openai/extractors.py +85 -50
  105. deepeval/openai/patch.py +258 -167
  106. deepeval/openai/types.py +20 -0
  107. deepeval/openai/utils.py +205 -56
  108. deepeval/prompt/__init__.py +19 -1
  109. deepeval/prompt/api.py +160 -0
  110. deepeval/prompt/prompt.py +245 -62
  111. deepeval/prompt/utils.py +186 -15
  112. deepeval/synthesizer/chunking/context_generator.py +209 -152
  113. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  114. deepeval/synthesizer/synthesizer.py +19 -15
  115. deepeval/test_case/api.py +131 -0
  116. deepeval/test_case/llm_test_case.py +6 -2
  117. deepeval/test_run/__init__.py +1 -0
  118. deepeval/test_run/hyperparameters.py +47 -8
  119. deepeval/test_run/test_run.py +292 -206
  120. deepeval/tracing/__init__.py +2 -1
  121. deepeval/tracing/api.py +3 -1
  122. deepeval/tracing/otel/exporter.py +3 -4
  123. deepeval/tracing/otel/utils.py +24 -5
  124. deepeval/tracing/trace_context.py +89 -5
  125. deepeval/tracing/tracing.py +74 -3
  126. deepeval/tracing/types.py +20 -2
  127. deepeval/tracing/utils.py +8 -0
  128. deepeval/utils.py +21 -0
  129. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
  130. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
  131. deepeval/integrations/llama_index/agent/patched.py +0 -68
  132. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
  133. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
  134. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
deepeval/metrics/topic_adherence/topic_adherence.py
@@ -0,0 +1,355 @@
+ from typing import Optional, List, Union
+
+ from deepeval.utils import get_or_create_event_loop, prettify_list
+ from deepeval.metrics.utils import (
+     construct_verbose_logs,
+     trimAndLoadJson,
+     get_unit_interactions,
+     check_conversational_test_case_params,
+     initialize_model,
+ )
+ from deepeval.test_case import ConversationalTestCase, TurnParams
+ from deepeval.metrics import BaseConversationalMetric
+ from deepeval.models import DeepEvalBaseLLM
+ from deepeval.metrics.indicator import metric_progress_indicator
+ from deepeval.metrics.topic_adherence.template import TopicAdherenceTemplate
+ from deepeval.metrics.topic_adherence.schema import (
+     RelevancyVerdict,
+     QAPairs,
+     QAPair,
+ )
+ from deepeval.metrics.api import metric_data_manager
+
+
+ class TopicAdherenceMetric(BaseConversationalMetric):
+
+     _required_test_case_params = [
+         TurnParams.ROLE,
+         TurnParams.CONTENT,
+     ]
+
+     def __init__(
+         self,
+         relevant_topics: List[str],
+         threshold: float = 0.5,
+         model: Optional[Union[str, DeepEvalBaseLLM]] = None,
+         include_reason: bool = True,
+         async_mode: bool = True,
+         strict_mode: bool = False,
+         verbose_mode: bool = False,
+     ):
+         self.relevant_topics = relevant_topics
+         self.threshold = 1 if strict_mode else threshold
+         self.model, self.using_native_model = initialize_model(model)
+         self.evaluation_model = self.model.get_model_name()
+         self.include_reason = include_reason
+         self.async_mode = async_mode
+         self.strict_mode = strict_mode
+         self.verbose_mode = verbose_mode
+
+     def measure(
+         self,
+         test_case: ConversationalTestCase,
+         _show_indicator: bool = True,
+         _in_component: bool = False,
+         _log_metric_to_confident: bool = True,
+     ):
+         check_conversational_test_case_params(
+             test_case, self._required_test_case_params, self
+         )
+
+         self.evaluation_cost = 0 if self.using_native_model else None
+         with metric_progress_indicator(
+             self, _show_indicator=_show_indicator, _in_component=_in_component
+         ):
+             if self.async_mode:
+                 loop = get_or_create_event_loop()
+                 loop.run_until_complete(
+                     self.a_measure(
+                         test_case,
+                         _show_indicator=False,
+                         _in_component=_in_component,
+                         _log_metric_to_confident=_log_metric_to_confident,
+                     )
+                 )
+             else:
+                 unit_interactions = get_unit_interactions(test_case.turns)
+                 interaction_pairs = self._get_qa_pairs(unit_interactions)
+                 True_Positives = [0, []]
+                 True_Negatives = [0, []]
+                 False_Positives = [0, []]
+                 False_Negatives = [0, []]
+                 for interaction_pair in interaction_pairs:
+                     for qa_pair in interaction_pair.qa_pairs:
+                         qa_verdict: RelevancyVerdict = self._get_qa_verdict(
+                             qa_pair
+                         )
+                         if qa_verdict.verdict == "TP":
+                             True_Positives[0] += 1
+                             True_Positives[1].append(qa_verdict.reason)
+                         elif qa_verdict.verdict == "TN":
+                             True_Negatives[0] += 1
+                             True_Negatives[1].append(qa_verdict.reason)
+                         elif qa_verdict.verdict == "FP":
+                             False_Positives[0] += 1
+                             False_Positives[1].append(qa_verdict.reason)
+                         elif qa_verdict.verdict == "FN":
+                             False_Negatives[0] += 1
+                             False_Negatives[1].append(qa_verdict.reason)
+
+                 self.score = self._get_score(
+                     True_Positives,
+                     True_Negatives,
+                     False_Positives,
+                     False_Negatives,
+                 )
+                 self.success = self.score >= self.threshold
+                 self.reason = self._generate_reason(
+                     True_Positives,
+                     True_Negatives,
+                     False_Positives,
+                     False_Negatives,
+                 )
+
+                 self.verbose_logs = construct_verbose_logs(
+                     self,
+                     steps=[
+                         f"Interaction Pairs: \n{prettify_list(interaction_pairs)} \n",
+                         f"Truth Table:",
+                         f"\nTrue Positives:",
+                         f"Count: {True_Positives[0]}, Reasons: {prettify_list(True_Positives[1])} \n",
+                         f"\nTrue Negatives: ",
+                         f"Count: {True_Negatives[0]}, Reasons: {prettify_list(True_Negatives[1])} \n",
+                         f"\nFalse Positives: ",
+                         f"Count: {False_Positives[0]}, Reasons: {prettify_list(False_Positives[1])} \n",
+                         f"\nFalse Negatives: ",
+                         f"Count: {False_Negatives[0]}, Reasons: {prettify_list(False_Negatives[1])} \n",
+                         f"Final Score: {self.score}",
+                         f"Final Reason: {self.reason}",
+                     ],
+                 )
+
+                 if _log_metric_to_confident:
+                     metric_data_manager.post_metric_if_enabled(
+                         self, test_case=test_case
+                     )
+
+             return self.score
+
+     async def a_measure(
+         self,
+         test_case: ConversationalTestCase,
+         _show_indicator: bool = True,
+         _in_component: bool = False,
+         _log_metric_to_confident: bool = True,
+     ):
+         check_conversational_test_case_params(
+             test_case, self._required_test_case_params, self
+         )
+
+         self.evaluation_cost = 0 if self.using_native_model else None
+
+         with metric_progress_indicator(
+             self,
+             async_mode=True,
+             _show_indicator=_show_indicator,
+             _in_component=_in_component,
+         ):
+             unit_interactions = get_unit_interactions(test_case.turns)
+             interaction_pairs = await self._a_get_qa_pairs(unit_interactions)
+             True_Positives = [0, []]
+             True_Negatives = [0, []]
+             False_Positives = [0, []]
+             False_Negatives = [0, []]
+             for interaction_pair in interaction_pairs:
+                 for qa_pair in interaction_pair.qa_pairs:
+                     qa_verdict: RelevancyVerdict = self._get_qa_verdict(qa_pair)
+                     if qa_verdict.verdict == "TP":
+                         True_Positives[0] += 1
+                         True_Positives[1].append(qa_verdict.reason)
+                     elif qa_verdict.verdict == "TN":
+                         True_Negatives[0] += 1
+                         True_Negatives[1].append(qa_verdict.reason)
+                     elif qa_verdict.verdict == "FP":
+                         False_Positives[0] += 1
+                         False_Positives[1].append(qa_verdict.reason)
+                     elif qa_verdict.verdict == "FN":
+                         False_Negatives[0] += 1
+                         False_Negatives[1].append(qa_verdict.reason)
+
+             self.score = self._get_score(
+                 True_Positives, True_Negatives, False_Positives, False_Negatives
+             )
+             self.success = self.score >= self.threshold
+             self.reason = await self._a_generate_reason(
+                 True_Positives, True_Negatives, False_Positives, False_Negatives
+             )
+
+             self.verbose_logs = construct_verbose_logs(
+                 self,
+                 steps=[
+                     f"Interaction Pairs: \n{prettify_list(interaction_pairs)} \n",
+                     f"Truth Table:",
+                     f"\nTrue Positives:",
+                     f"Count: {True_Positives[0]}, Reasons: {prettify_list(True_Positives[1])} \n",
+                     f"\nTrue Negatives: ",
+                     f"Count: {True_Negatives[0]}, Reasons: {prettify_list(True_Negatives[1])} \n",
+                     f"\nFalse Positives: ",
+                     f"Count: {False_Positives[0]}, Reasons: {prettify_list(False_Positives[1])} \n",
+                     f"\nFalse Negatives: ",
+                     f"Count: {False_Negatives[0]}, Reasons: {prettify_list(False_Negatives[1])} \n",
+                     f"Final Score: {self.score}",
+                     f"Final Reason: {self.reason}",
+                 ],
+             )
+
+             if _log_metric_to_confident:
+                 metric_data_manager.post_metric_if_enabled(
+                     self, test_case=test_case
+                 )
+
+             return self.score
+
+     def _generate_reason(self, TP, TN, FP, FN):
+         total = TP[0] + TN[0] + FP[0] + FN[0]
+         if total <= 0:
+             return "There were no question-answer pairs to evaluate. Please enable verbose logs to look at the evaluation steps taken"
+         prompt = TopicAdherenceTemplate.generate_reason(
+             self.success, self.score, self.threshold, TP, TN, FP, FN
+         )
+         if self.using_native_model:
+             res, cost = self.model.generate(prompt)
+             self.evaluation_cost += cost
+             return res
+         else:
+             res = self.model.generate(prompt)
+             return res
+
+     async def _a_generate_reason(self, TP, TN, FP, FN):
+         prompt = TopicAdherenceTemplate.generate_reason(
+             self.success, self.score, self.threshold, TP, TN, FP, FN
+         )
+         if self.using_native_model:
+             res, cost = await self.model.a_generate(prompt)
+             self.evaluation_cost += cost
+             return res
+         else:
+             res = await self.model.a_generate(prompt)
+             return res
+
+     def _get_score(self, TP, TN, FP, FN) -> float:
+         true_values = TP[0] + TN[0]
+         total = TP[0] + TN[0] + FP[0] + FN[0]
+         if total <= 0:
+             score = 0
+         else:
+             score = true_values / total
+         return 0 if self.strict_mode and score < self.threshold else score
+
+     def _get_qa_verdict(self, qa_pair: QAPair) -> RelevancyVerdict:
+         prompt = TopicAdherenceTemplate.get_qa_pair_verdict(
+             self.relevant_topics, qa_pair.question, qa_pair.response
+         )
+         if self.using_native_model:
+             res, cost = self.model.generate(prompt, schema=RelevancyVerdict)
+             self.evaluation_cost += cost
+             return res
+         else:
+             try:
+                 res = self.model.generate(prompt, schema=RelevancyVerdict)
+                 return res
+             except TypeError:
+                 res = self.model.generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 return RelevancyVerdict(**data)
+
+     async def _a_get_qa_verdict(self, qa_pair: QAPair) -> RelevancyVerdict:
+         prompt = TopicAdherenceTemplate.get_qa_pair_verdict(
+             self.relevant_topics, qa_pair.question, qa_pair.response
+         )
+         if self.using_native_model:
+             res, cost = await self.model.a_generate(
+                 prompt, schema=RelevancyVerdict
+             )
+             self.evaluation_cost += cost
+             return res
+         else:
+             try:
+                 res = await self.model.a_generate(
+                     prompt, schema=RelevancyVerdict
+                 )
+                 return res
+             except TypeError:
+                 res = await self.model.a_generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 return RelevancyVerdict(**data)
+
+     def _get_qa_pairs(self, unit_interactions: List) -> List[QAPairs]:
+         qa_pairs = []
+         for unit_interaction in unit_interactions:
+             conversation = "Conversation: \n"
+             for turn in unit_interaction:
+                 conversation += f"{turn.role} \n"
+                 conversation += f"{turn.content} \n\n"
+             prompt = TopicAdherenceTemplate.get_qa_pairs(conversation)
+             new_pair = None
+
+             if self.using_native_model:
+                 res, cost = self.model.generate(prompt, schema=QAPairs)
+                 self.evaluation_cost += cost
+                 new_pair = res
+             else:
+                 try:
+                     res = self.model.generate(prompt, schema=QAPairs)
+                     new_pair = res
+                 except TypeError:
+                     res = self.model.generate(prompt)
+                     data = trimAndLoadJson(res, self)
+                     new_pair = QAPairs(**data)
+
+             if new_pair is not None:
+                 qa_pairs.append(new_pair)
+
+         return qa_pairs
+
+     async def _a_get_qa_pairs(self, unit_interactions: List) -> List[QAPairs]:
+         qa_pairs = []
+         for unit_interaction in unit_interactions:
+             conversation = "Conversation: \n"
+             for turn in unit_interaction:
+                 conversation += f"{turn.role} \n"
+                 conversation += f"{turn.content} \n\n"
+             prompt = TopicAdherenceTemplate.get_qa_pairs(conversation)
+             new_pair = None
+
+             if self.using_native_model:
+                 res, cost = await self.model.a_generate(prompt, schema=QAPairs)
+                 self.evaluation_cost += cost
+                 new_pair = res
+             else:
+                 try:
+                     res = await self.model.a_generate(prompt, schema=QAPairs)
+                     new_pair = res
+                 except TypeError:
+                     res = await self.model.a_generate(prompt)
+                     data = trimAndLoadJson(res, self)
+                     new_pair = QAPairs(**data)
+
+             if new_pair is not None:
+                 qa_pairs.append(new_pair)
+
+         return qa_pairs
+
+     def is_successful(self) -> bool:
+         if self.error is not None:
+             self.success = False
+         else:
+             try:
+                 self.score >= self.threshold
+             except:
+                 self.success = False
+         return self.success
+
+     @property
+     def __name__(self):
+         return "Topic Adherence"
deepeval/metrics/toxicity/toxicity.py
@@ -17,6 +17,7 @@ from deepeval.metrics.utils import (
  )
  from deepeval.metrics.toxicity.template import ToxicityTemplate
  from deepeval.metrics.toxicity.schema import *
+ from deepeval.metrics.api import metric_data_manager


  class ToxicityMetric(BaseMetric):
@@ -50,6 +51,7 @@ class ToxicityMetric(BaseMetric):
          test_case: LLMTestCase,
          _show_indicator: bool = True,
          _in_component: bool = False,
+         _log_metric_to_confident: bool = True,
      ) -> float:

          check_llm_test_case_params(test_case, self._required_params, self)
@@ -65,6 +67,7 @@ class ToxicityMetric(BaseMetric):
                          test_case,
                          _show_indicator=False,
                          _in_component=_in_component,
+                         _log_metric_to_confident=_log_metric_to_confident,
                      )
                  )
              else:
@@ -84,6 +87,10 @@ class ToxicityMetric(BaseMetric):
                          f"Score: {self.score}\nReason: {self.reason}",
                      ],
                  )
+                 if _log_metric_to_confident:
+                     metric_data_manager.post_metric_if_enabled(
+                         self, test_case=test_case
+                     )

              return self.score

@@ -92,6 +99,7 @@ class ToxicityMetric(BaseMetric):
          test_case: LLMTestCase,
          _show_indicator: bool = True,
          _in_component: bool = False,
+         _log_metric_to_confident: bool = True,
      ) -> float:

          check_llm_test_case_params(test_case, self._required_params, self)
@@ -122,6 +130,10 @@ class ToxicityMetric(BaseMetric):
                      f"Score: {self.score}\nReason: {self.reason}",
                  ],
              )
+             if _log_metric_to_confident:
+                 metric_data_manager.post_metric_if_enabled(
+                     self, test_case=test_case
+                 )

              return self.score

deepeval/metrics/turn_relevancy/turn_relevancy.py
@@ -20,6 +20,7 @@ from deepeval.metrics.indicator import metric_progress_indicator
  from deepeval.test_case import ConversationalTestCase, Turn, TurnParams
  from deepeval.utils import get_or_create_event_loop, prettify_list
  from deepeval.metrics.turn_relevancy.schema import *
+ from deepeval.metrics.api import metric_data_manager


  class TurnRelevancyMetric(BaseConversationalMetric):
@@ -49,6 +50,7 @@ class TurnRelevancyMetric(BaseConversationalMetric):
          test_case: ConversationalTestCase,
          _show_indicator: bool = True,
          _in_component: bool = False,
+         _log_metric_to_confident: bool = True,
      ):
          check_conversational_test_case_params(
              test_case, self._required_test_case_params, self
@@ -65,6 +67,7 @@ class TurnRelevancyMetric(BaseConversationalMetric):
                          test_case,
                          _show_indicator=False,
                          _in_component=_in_component,
+                         _log_metric_to_confident=_log_metric_to_confident,
                      )
                  )
              else:
@@ -91,6 +94,10 @@ class TurnRelevancyMetric(BaseConversationalMetric):
                          f"Score: {self.score}\nReason: {self.reason}",
                      ],
                  )
+                 if _log_metric_to_confident:
+                     metric_data_manager.post_metric_if_enabled(
+                         self, test_case=test_case
+                     )
              return self.score

      async def a_measure(
@@ -98,6 +105,7 @@ class TurnRelevancyMetric(BaseConversationalMetric):
          test_case: ConversationalTestCase,
          _show_indicator: bool = True,
          _in_component: bool = False,
+         _log_metric_to_confident: bool = True,
      ) -> float:
          check_conversational_test_case_params(
              test_case, self._required_test_case_params, self
@@ -134,6 +142,10 @@ class TurnRelevancyMetric(BaseConversationalMetric):
                      f"Score: {self.score}\nReason: {self.reason}",
                  ],
              )
+             if _log_metric_to_confident:
+                 metric_data_manager.post_metric_if_enabled(
+                     self, test_case=test_case
+                 )
              return self.score

      async def _a_generate_reason(self) -> str:
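
The ToxicityMetric and TurnRelevancyMetric hunks above follow the pattern applied across most metrics in this release: measure() and a_measure() gain a _log_metric_to_confident flag, and on completion the result is posted through metric_data_manager.post_metric_if_enabled(). Below is a rough sketch of what the flag controls, using ToxicityMetric with placeholder inputs; the underscore prefix suggests it is primarily intended for deepeval's own evaluation pipeline rather than as a public option.

from deepeval.test_case import LLMTestCase
from deepeval.metrics import ToxicityMetric

metric = ToxicityMetric(threshold=0.5)
test_case = LLMTestCase(
    input="What do you think of my coworker?",  # placeholder input
    actual_output="They are reliable and easy to work with.",
)

# Default (True): post_metric_if_enabled() may send the metric result to Confident AI
# when that logging is enabled; passing False keeps this particular measurement local.
metric.measure(test_case, _log_metric_to_confident=False)
print(metric.score)
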
deepeval/models/embedding_models/azure_embedding_model.py
@@ -1,4 +1,4 @@
- from typing import Dict, List
+ from typing import Dict, List, Optional
  from openai import AzureOpenAI, AsyncAzureOpenAI
  from deepeval.key_handler import (
      EmbeddingKeyValues,
@@ -17,28 +17,39 @@ retry_azure = create_retry_decorator(PS.AZURE)


  class AzureOpenAIEmbeddingModel(DeepEvalBaseEmbeddingModel):
-     def __init__(self, **kwargs):
-         self.azure_openai_api_key = KEY_FILE_HANDLER.fetch_data(
+     def __init__(
+         self,
+         openai_api_key: Optional[str] = None,
+         openai_api_version: Optional[str] = None,
+         azure_endpoint: Optional[str] = None,
+         azure_deployment: Optional[str] = None,
+         model: Optional[str] = None,
+         generation_kwargs: Optional[Dict] = None,
+         **client_kwargs,
+     ):
+         self.openai_api_key = openai_api_key or KEY_FILE_HANDLER.fetch_data(
              ModelKeyValues.AZURE_OPENAI_API_KEY
          )
-         self.openai_api_version = KEY_FILE_HANDLER.fetch_data(
-             ModelKeyValues.OPENAI_API_VERSION
+         self.openai_api_version = (
+             openai_api_version
+             or KEY_FILE_HANDLER.fetch_data(ModelKeyValues.OPENAI_API_VERSION)
          )
-         self.azure_embedding_deployment = KEY_FILE_HANDLER.fetch_data(
-             EmbeddingKeyValues.AZURE_EMBEDDING_DEPLOYMENT_NAME
-         )
-         self.azure_endpoint = KEY_FILE_HANDLER.fetch_data(
+         self.azure_endpoint = azure_endpoint or KEY_FILE_HANDLER.fetch_data(
              ModelKeyValues.AZURE_OPENAI_ENDPOINT
          )
-         self.model_name = self.azure_embedding_deployment
-         self.kwargs = kwargs
+         self.azure_deployment = azure_deployment or KEY_FILE_HANDLER.fetch_data(
+             EmbeddingKeyValues.AZURE_EMBEDDING_DEPLOYMENT_NAME
+         )
+         self.client_kwargs = client_kwargs or {}
+         self.model_name = model or self.azure_deployment
+         self.generation_kwargs = generation_kwargs or {}
+         super().__init__(self.model_name)

      @retry_azure
      def embed_text(self, text: str) -> List[float]:
          client = self.load_model(async_mode=False)
          response = client.embeddings.create(
-             input=text,
-             model=self.azure_embedding_deployment,
+             input=text, model=self.model_name, **self.generation_kwargs
          )
          return response.data[0].embedding

@@ -46,8 +57,7 @@ class AzureOpenAIEmbeddingModel(DeepEvalBaseEmbeddingModel):
      def embed_texts(self, texts: List[str]) -> List[List[float]]:
          client = self.load_model(async_mode=False)
          response = client.embeddings.create(
-             input=texts,
-             model=self.azure_embedding_deployment,
+             input=texts, model=self.model_name, **self.generation_kwargs
          )
          return [item.embedding for item in response.data]

@@ -55,8 +65,7 @@ class AzureOpenAIEmbeddingModel(DeepEvalBaseEmbeddingModel):
      async def a_embed_text(self, text: str) -> List[float]:
          client = self.load_model(async_mode=True)
          response = await client.embeddings.create(
-             input=text,
-             model=self.azure_embedding_deployment,
+             input=text, model=self.model_name, **self.generation_kwargs
          )
          return response.data[0].embedding

@@ -64,8 +73,7 @@ class AzureOpenAIEmbeddingModel(DeepEvalBaseEmbeddingModel):
      async def a_embed_texts(self, texts: List[str]) -> List[List[float]]:
          client = self.load_model(async_mode=True)
          response = await client.embeddings.create(
-             input=texts,
-             model=self.azure_embedding_deployment,
+             input=texts, model=self.model_name, **self.generation_kwargs
          )
          return [item.embedding for item in response.data]

@@ -77,30 +85,23 @@ class AzureOpenAIEmbeddingModel(DeepEvalBaseEmbeddingModel):
              return self._build_client(AzureOpenAI)
          return self._build_client(AsyncAzureOpenAI)

-     def _client_kwargs(self) -> Dict:
-         """
-         If Tenacity is managing retries, force OpenAI SDK retries off to avoid double retries.
-         If the user opts into SDK retries for 'azure' via DEEPEVAL_SDK_RETRY_PROVIDERS,
-         leave their retry settings as is.
-         """
-         kwargs = dict(self.kwargs or {})
+     def _build_client(self, cls):
+         client_kwargs = self.client_kwargs.copy()
          if not sdk_retries_for(PS.AZURE):
-             kwargs["max_retries"] = 0
-         return kwargs
+             client_kwargs["max_retries"] = 0

-     def _build_client(self, cls):
-         kw = dict(
-             api_key=self.azure_openai_api_key,
+         client_init_kwargs = dict(
+             api_key=self.openai_api_key,
              api_version=self.openai_api_version,
              azure_endpoint=self.azure_endpoint,
-             azure_deployment=self.azure_embedding_deployment,
-             **self._client_kwargs(),
+             azure_deployment=self.azure_deployment,
+             **client_kwargs,
          )
          try:
-             return cls(**kw)
+             return cls(**client_init_kwargs)
          except TypeError as e:
              # older OpenAI SDKs may not accept max_retries, in that case remove and retry once
              if "max_retries" in str(e):
-                 kw.pop("max_retries", None)
-                 return cls(**kw)
+                 client_init_kwargs.pop("max_retries", None)
+                 return cls(**client_init_kwargs)
              raise
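
A sketch of constructing the reworked AzureOpenAIEmbeddingModel with the explicit keyword arguments introduced above. The values are placeholders; any argument left unset still falls back to the stored KEY_FILE_HANDLER settings as before, and generation_kwargs is forwarded to each embeddings.create() call (the dimensions kwarg shown is only valid for deployments that support it, e.g. text-embedding-3 models).

from deepeval.models.embedding_models.azure_embedding_model import (
    AzureOpenAIEmbeddingModel,
)

# Placeholder credentials and deployment names; omit any of these to fall back
# to the values stored by deepeval's key handler.
embedder = AzureOpenAIEmbeddingModel(
    openai_api_key="<azure-openai-api-key>",
    openai_api_version="2024-02-01",
    azure_endpoint="https://my-resource.openai.azure.com/",
    azure_deployment="my-embedding-deployment",
    generation_kwargs={"dimensions": 256},  # example pass-through kwarg
)

vector = embedder.embed_text("hello world")
print(len(vector))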