deepeval 3.6.6__py3-none-any.whl → 3.6.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. deepeval/_version.py +1 -1
  2. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  3. deepeval/cli/main.py +42 -0
  4. deepeval/confident/api.py +1 -0
  5. deepeval/config/settings.py +22 -4
  6. deepeval/constants.py +8 -1
  7. deepeval/dataset/dataset.py +2 -11
  8. deepeval/dataset/utils.py +1 -1
  9. deepeval/errors.py +20 -2
  10. deepeval/evaluate/evaluate.py +5 -1
  11. deepeval/evaluate/execute.py +811 -248
  12. deepeval/evaluate/types.py +1 -0
  13. deepeval/evaluate/utils.py +33 -119
  14. deepeval/integrations/crewai/__init__.py +7 -1
  15. deepeval/integrations/crewai/handler.py +1 -1
  16. deepeval/integrations/crewai/subs.py +51 -0
  17. deepeval/integrations/crewai/tool.py +71 -0
  18. deepeval/integrations/crewai/wrapper.py +45 -5
  19. deepeval/integrations/llama_index/__init__.py +0 -4
  20. deepeval/integrations/llama_index/handler.py +20 -21
  21. deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
  22. deepeval/metrics/__init__.py +13 -0
  23. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  24. deepeval/metrics/api.py +281 -0
  25. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  26. deepeval/metrics/base_metric.py +1 -0
  27. deepeval/metrics/bias/bias.py +12 -3
  28. deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
  29. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  30. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  31. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  32. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  33. deepeval/metrics/conversational_dag/nodes.py +12 -4
  34. deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  35. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
  36. deepeval/metrics/dag/dag.py +12 -0
  37. deepeval/metrics/dag/nodes.py +12 -4
  38. deepeval/metrics/dag/schema.py +1 -1
  39. deepeval/metrics/dag/templates.py +2 -2
  40. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  41. deepeval/metrics/g_eval/g_eval.py +11 -0
  42. deepeval/metrics/goal_accuracy/__init__.py +1 -0
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
  44. deepeval/metrics/goal_accuracy/schema.py +17 -0
  45. deepeval/metrics/goal_accuracy/template.py +235 -0
  46. deepeval/metrics/hallucination/hallucination.py +20 -9
  47. deepeval/metrics/indicator.py +8 -2
  48. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  49. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +20 -2
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
  52. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
  53. deepeval/metrics/misuse/misuse.py +12 -1
  54. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  55. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  56. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  57. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  58. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  59. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
  60. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  61. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  62. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  63. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  64. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  65. deepeval/metrics/non_advice/non_advice.py +12 -0
  66. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  67. deepeval/metrics/plan_adherence/__init__.py +1 -0
  68. deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
  69. deepeval/metrics/plan_adherence/schema.py +11 -0
  70. deepeval/metrics/plan_adherence/template.py +170 -0
  71. deepeval/metrics/plan_quality/__init__.py +1 -0
  72. deepeval/metrics/plan_quality/plan_quality.py +292 -0
  73. deepeval/metrics/plan_quality/schema.py +11 -0
  74. deepeval/metrics/plan_quality/template.py +101 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
  76. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  77. deepeval/metrics/role_violation/role_violation.py +12 -0
  78. deepeval/metrics/step_efficiency/__init__.py +1 -0
  79. deepeval/metrics/step_efficiency/schema.py +11 -0
  80. deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
  81. deepeval/metrics/step_efficiency/template.py +256 -0
  82. deepeval/metrics/summarization/summarization.py +12 -1
  83. deepeval/metrics/task_completion/task_completion.py +4 -0
  84. deepeval/metrics/tool_correctness/schema.py +6 -0
  85. deepeval/metrics/tool_correctness/template.py +88 -0
  86. deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
  87. deepeval/metrics/tool_use/__init__.py +1 -0
  88. deepeval/metrics/tool_use/schema.py +19 -0
  89. deepeval/metrics/tool_use/template.py +220 -0
  90. deepeval/metrics/tool_use/tool_use.py +458 -0
  91. deepeval/metrics/topic_adherence/__init__.py +1 -0
  92. deepeval/metrics/topic_adherence/schema.py +16 -0
  93. deepeval/metrics/topic_adherence/template.py +162 -0
  94. deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
  95. deepeval/metrics/toxicity/toxicity.py +12 -0
  96. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  97. deepeval/models/embedding_models/azure_embedding_model.py +37 -36
  98. deepeval/models/embedding_models/local_embedding_model.py +30 -32
  99. deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
  100. deepeval/models/embedding_models/openai_embedding_model.py +22 -31
  101. deepeval/models/llms/grok_model.py +1 -1
  102. deepeval/models/llms/openai_model.py +2 -0
  103. deepeval/openai/__init__.py +14 -32
  104. deepeval/openai/extractors.py +85 -50
  105. deepeval/openai/patch.py +258 -167
  106. deepeval/openai/types.py +20 -0
  107. deepeval/openai/utils.py +205 -56
  108. deepeval/prompt/__init__.py +19 -1
  109. deepeval/prompt/api.py +160 -0
  110. deepeval/prompt/prompt.py +245 -62
  111. deepeval/prompt/utils.py +186 -15
  112. deepeval/synthesizer/chunking/context_generator.py +209 -152
  113. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  114. deepeval/synthesizer/synthesizer.py +19 -15
  115. deepeval/test_case/api.py +131 -0
  116. deepeval/test_case/llm_test_case.py +6 -2
  117. deepeval/test_run/__init__.py +1 -0
  118. deepeval/test_run/hyperparameters.py +47 -8
  119. deepeval/test_run/test_run.py +292 -206
  120. deepeval/tracing/__init__.py +2 -1
  121. deepeval/tracing/api.py +3 -1
  122. deepeval/tracing/otel/exporter.py +3 -4
  123. deepeval/tracing/otel/utils.py +24 -5
  124. deepeval/tracing/trace_context.py +89 -5
  125. deepeval/tracing/tracing.py +74 -3
  126. deepeval/tracing/types.py +20 -2
  127. deepeval/tracing/utils.py +8 -0
  128. deepeval/utils.py +21 -0
  129. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
  130. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
  131. deepeval/integrations/llama_index/agent/patched.py +0 -68
  132. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
  133. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
  134. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
deepeval/metrics/tool_use/tool_use.py
@@ -0,0 +1,458 @@
+from typing import Optional, List, Union
+import asyncio
+from deepeval.utils import get_or_create_event_loop, prettify_list
+from deepeval.metrics.utils import (
+    construct_verbose_logs,
+    trimAndLoadJson,
+    get_unit_interactions,
+    print_tools_called,
+    check_conversational_test_case_params,
+    initialize_model,
+)
+from deepeval.test_case import (
+    ConversationalTestCase,
+    TurnParams,
+    ToolCall,
+    Turn,
+)
+from deepeval.metrics import BaseConversationalMetric
+from deepeval.models import DeepEvalBaseLLM
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.tool_use.template import ToolUseTemplate
+from deepeval.metrics.tool_use.schema import (
+    ToolSelectionScore,
+    UserInputAndTools,
+    ArgumentCorrectnessScore,
+)
+from deepeval.metrics.api import metric_data_manager
+
+
+class ToolUseMetric(BaseConversationalMetric):
+
+    _required_test_case_params = [
+        TurnParams.ROLE,
+        TurnParams.CONTENT,
+    ]
+
+    def __init__(
+        self,
+        available_tools: List[ToolCall],
+        threshold: float = 0.5,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
+        include_reason: bool = True,
+        async_mode: bool = True,
+        strict_mode: bool = False,
+        verbose_mode: bool = False,
+    ):
+        self.available_tools = available_tools
+        self.threshold = 1 if strict_mode else threshold
+        self.model, self.using_native_model = initialize_model(model)
+        self.evaluation_model = self.model.get_model_name()
+        self.include_reason = include_reason
+        self.async_mode = async_mode
+        self.strict_mode = strict_mode
+        self.verbose_mode = verbose_mode
+
+    def measure(
+        self,
+        test_case: ConversationalTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ):
+        check_conversational_test_case_params(
+            test_case, self._required_test_case_params, self
+        )
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self, _show_indicator=_show_indicator, _in_component=_in_component
+        ):
+            if self.async_mode:
+                loop = get_or_create_event_loop()
+                loop.run_until_complete(
+                    self.a_measure(
+                        test_case,
+                        _show_indicator=False,
+                        _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
+                    )
+                )
+            else:
+                unit_interactions = get_unit_interactions(test_case.turns)
+                user_input_and_tools = self._get_user_input_and_turns(
+                    unit_interactions
+                )
+                tool_selection_scores = [
+                    self._get_tool_selection_score(user_and_tools)
+                    for user_and_tools in user_input_and_tools
+                ]
+                argument_correctness_scores = [
+                    self._get_argument_correctness_score(user_and_tools)
+                    for user_and_tools in user_input_and_tools
+                    if user_and_tools.tools_used
+                ]
+                self.score = self._calculate_score(
+                    tool_selection_scores, argument_correctness_scores
+                )
+                tool_selection_reason = (
+                    self._generate_reason_for_tool_selection(
+                        tool_selection_scores
+                    )
+                )
+                argument_correctness_reason = (
+                    self._generate_reason_for_argument_correctness(
+                        argument_correctness_scores
+                    )
+                )
+                self.reason = str(
+                    "\n".join(
+                        [tool_selection_reason, argument_correctness_reason]
+                    )
+                )
+
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        f"Tool Selection Scores: {prettify_list(tool_selection_scores)} \n",
+                        f"Argument Correctness Scores: {prettify_list(argument_correctness_scores)} \n",
+                        f"Final Score: {self.score}",
+                        f"Final Reason: {self.reason}",
+                    ],
+                )
+
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )
+
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: ConversationalTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ):
+        check_conversational_test_case_params(
+            test_case, self._required_test_case_params, self
+        )
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self,
+            async_mode=True,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
+        ):
+            unit_interactions = get_unit_interactions(test_case.turns)
+            user_input_and_tools = self._get_user_input_and_turns(
+                unit_interactions
+            )
+            tool_selection_scores = await asyncio.gather(
+                *[
+                    self._a_get_tool_selection_score(user_and_tools)
+                    for user_and_tools in user_input_and_tools
+                ]
+            )
+            argument_correctness_scores = await asyncio.gather(
+                *[
+                    self._a_get_argument_correctness_score(user_and_tools)
+                    for user_and_tools in user_input_and_tools
+                    if user_and_tools.tools_used
+                ]
+            )
+            self.score = self._calculate_score(
+                tool_selection_scores, argument_correctness_scores
+            )
+            tool_selection_reason = (
+                await self._a_generate_reason_for_tool_selection(
+                    tool_selection_scores
+                )
+            )
+            argument_correctness_reason = (
+                await self._a_generate_reason_for_argument_correctness(
+                    argument_correctness_scores
+                )
+            )
+            self.reason = str(
+                "\n".join([tool_selection_reason, argument_correctness_reason])
+            )
+
+            self.verbose_logs = construct_verbose_logs(
+                self,
+                steps=[
+                    f"Tool Selection Scores: {prettify_list(tool_selection_scores)} \n",
+                    f"Argument Correctness Scores: {prettify_list(argument_correctness_scores)} \n",
+                    f"Final Score: {self.score}",
+                    f"Final Reason: {self.reason}",
+                ],
+            )
+
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
+            return self.score
+
+    def _get_argument_correctness_score(
+        self, user_and_tools: UserInputAndTools
+    ):
+        prompt = ToolUseTemplate.get_argument_correctness_score(
+            user_and_tools.user_messages,
+            user_and_tools.assistant_messages,
+            user_and_tools.tools_called,
+            user_and_tools.available_tools,
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(
+                prompt, schema=ArgumentCorrectnessScore
+            )
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res: ArgumentCorrectnessScore = self.model.generate(
+                    prompt, schema=ArgumentCorrectnessScore
+                )
+                return res
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return ArgumentCorrectnessScore(**data)
+
+    async def _a_get_argument_correctness_score(
+        self,
+        user_and_tools: UserInputAndTools,
+    ):
+        prompt = ToolUseTemplate.get_argument_correctness_score(
+            user_and_tools.user_messages,
+            user_and_tools.assistant_messages,
+            user_and_tools.tools_called,
+            user_and_tools.available_tools,
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(
+                prompt, schema=ArgumentCorrectnessScore
+            )
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res: ArgumentCorrectnessScore = await self.model.a_generate(
+                    prompt, schema=ArgumentCorrectnessScore
+                )
+                return res
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return ArgumentCorrectnessScore(**data)
+
+    def _get_tool_selection_score(
+        self,
+        user_and_tools: UserInputAndTools,
+    ):
+        prompt = ToolUseTemplate.get_tool_selection_score(
+            user_and_tools.user_messages,
+            user_and_tools.assistant_messages,
+            user_and_tools.tools_called,
+            user_and_tools.available_tools,
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt, schema=ToolSelectionScore)
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res: ToolSelectionScore = self.model.generate(
+                    prompt, schema=ToolSelectionScore
+                )
+                return res
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return ToolSelectionScore(**data)
+
+    async def _a_get_tool_selection_score(
+        self,
+        user_and_tools: UserInputAndTools,
+    ):
+        prompt = ToolUseTemplate.get_tool_selection_score(
+            user_and_tools.user_messages,
+            user_and_tools.assistant_messages,
+            user_and_tools.tools_called,
+            user_and_tools.available_tools,
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(
+                prompt, schema=ToolSelectionScore
+            )
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res: ToolSelectionScore = await self.model.a_generate(
+                    prompt, schema=ToolSelectionScore
+                )
+                return res
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return ToolSelectionScore(**data)
+
+    def _get_user_input_and_turns(
+        self,
+        unit_interactions: List[List[Turn]],
+    ) -> List[UserInputAndTools]:
+        user_inputs_and_tools = []
+        available_tools = ",".join(
+            [repr(tool) for tool in self.available_tools]
+        )
+        for unit_interaction in unit_interactions:
+            if len(unit_interaction) < 2:
+                continue
+            user_messages = ""
+            assistant_messages = ""
+            tools_called = []
+            tools_used = False
+            for turn in unit_interaction:
+                if turn.role == "user":
+                    user_messages += f"{turn.content} \n"
+                else:
+                    break
+            for turn in unit_interaction[1:]:
+                if turn.role == "assistant":
+                    assistant_messages += f"{turn.content} \n"
+                    if turn.tools_called:
+                        tools_called.extend(turn.tools_called)
+                        tools_used = True
+            tools_called = ",".join([repr(tool) for tool in tools_called])
+            new_user_input_tools = UserInputAndTools(
+                user_messages=user_messages,
+                assistant_messages=assistant_messages,
+                tools_called=tools_called,
+                available_tools=available_tools,
+                tools_used=tools_used,
+            )
+            user_inputs_and_tools.append(new_user_input_tools)
+        return user_inputs_and_tools
+
+    def _calculate_score(
+        self,
+        tool_use_scores: List[ToolSelectionScore],
+        argument_correctness_scores: List[ArgumentCorrectnessScore],
+    ):
+        tools_scores_sum = sum(
+            [tool_use_score.score for tool_use_score in tool_use_scores]
+        )
+        arguments_scores_sum = sum(
+            [
+                argument_correctness_score.score
+                for argument_correctness_score in argument_correctness_scores
+            ]
+        )
+        tool_selections_scores_divisor = (
+            len(tool_use_scores) if len(tool_use_scores) > 0 else 1
+        )
+        argument_correctness_score_divisor = (
+            len(argument_correctness_scores)
+            if len(argument_correctness_scores) > 0
+            else 1
+        )
+        tool_selection_score = tools_scores_sum / tool_selections_scores_divisor
+        argument_correctness_score = (
+            arguments_scores_sum / argument_correctness_score_divisor
+        )
+        score = min(tool_selection_score, argument_correctness_score)
+        return 0 if self.strict_mode and score < self.threshold else score
+
+    def _generate_reason_for_tool_selection(
+        self,
+        tool_use_scores: List[ToolSelectionScore],
+    ):
+        scores_and_reasons = ""
+        for tool_use in tool_use_scores:
+            scores_and_reasons += (
+                f"\nScore: {tool_use.score} \nReason: {tool_use.reason} \n"
+            )
+        prompt = ToolUseTemplate.get_tool_selection_final_reason(
+            scores_and_reasons, self.score, self.threshold
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt)
+            self.evaluation_cost += cost
+            return res
+        else:
+            res = self.model.generate(prompt)
+            return res
+
+    def _generate_reason_for_argument_correctness(
+        self,
+        argument_correctness_scores: List[ArgumentCorrectnessScore],
+    ):
+        scores_and_reasons = ""
+        for tool_use in argument_correctness_scores:
+            scores_and_reasons += (
+                f"\nScore: {tool_use.score} \nReason: {tool_use.reason} \n"
+            )
+        prompt = ToolUseTemplate.get_tool_selection_final_reason(
+            scores_and_reasons, self.score, self.threshold
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt)
+            self.evaluation_cost += cost
+            return res
+        else:
+            res = self.model.generate(prompt)
+            return res
+
+    async def _a_generate_reason_for_tool_selection(
+        self, tool_use_scores: List[ToolSelectionScore]
+    ):
+        scores_and_reasons = ""
+        for tool_use in tool_use_scores:
+            scores_and_reasons += (
+                f"\nScore: {tool_use.score} \nReason: {tool_use.reason} \n"
+            )
+        prompt = ToolUseTemplate.get_tool_selection_final_reason(
+            scores_and_reasons, self.score, self.threshold
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt)
+            self.evaluation_cost += cost
+            return res
+        else:
+            res = await self.model.a_generate(prompt)
+            return res
+
+    async def _a_generate_reason_for_argument_correctness(
+        self, argument_correctness_scores: List[ArgumentCorrectnessScore]
+    ):
+        scores_and_reasons = ""
+        for tool_use in argument_correctness_scores:
+            scores_and_reasons += (
+                f"\nScore: {tool_use.score} \nReason: {tool_use.reason} \n"
+            )
+        prompt = ToolUseTemplate.get_tool_selection_final_reason(
+            scores_and_reasons, self.score, self.threshold
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt)
+            self.evaluation_cost += cost
+            return res
+        else:
+            res = await self.model.a_generate(prompt)
+            return res
+
+    def is_successful(self) -> bool:
+        try:
+            self.success = self.score >= self.threshold
+        except (AttributeError, TypeError):
+            self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Tool Use"
deepeval/metrics/topic_adherence/__init__.py
@@ -0,0 +1 @@
+from .topic_adherence import TopicAdherenceMetric
deepeval/metrics/topic_adherence/schema.py
@@ -0,0 +1,16 @@
+from pydantic import BaseModel
+from typing import List, Dict, Literal
+
+
+class QAPair(BaseModel):
+    question: str
+    response: str
+
+
+class QAPairs(BaseModel):
+    qa_pairs: List[QAPair]
+
+
+class RelevancyVerdict(BaseModel):
+    verdict: Literal["TP", "TN", "FP", "FN"]
+    reason: str
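
These schema models are consumed by the new TopicAdherenceMetric (`topic_adherence.py`, +355, not shown in this section). A small illustrative check of how they would validate an LLM judge's JSON output, assuming pydantic v2:

```python
# Illustrative only: validating judge output against the schema above (pydantic v2 assumed).
from deepeval.metrics.topic_adherence.schema import QAPair, QAPairs, RelevancyVerdict

raw = '{"verdict": "TP", "reason": "Both question and response fall under health nutrition."}'
verdict = RelevancyVerdict.model_validate_json(raw)
print(verdict.verdict)  # "TP"

pairs = QAPairs(
    qa_pairs=[
        QAPair(
            question="Which food is best for diabetic patients?",
            response="Steel-cut oats are good for diabetic patients",
        )
    ]
)
```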
deepeval/metrics/topic_adherence/template.py
@@ -0,0 +1,162 @@
+from typing import List
+import textwrap
+
+
+class TopicAdherenceTemplate:
+
+    @staticmethod
+    def get_qa_pairs(
+        conversation: str,
+    ) -> str:
+        return textwrap.dedent(
+            f"""Your task is to extract question-answer (QA) pairs from a multi-turn conversation between a `user` and an `assistant`.
+
+            You must return only valid pairs where:
+            - The **question** comes from the `user`.
+            - The **response** comes from the `assistant`.
+            - Both question and response must appear **explicitly** in the conversation.
+
+            Do not infer information beyond what is stated. Ignore irrelevant or conversational turns (e.g. greetings, affirmations) that do not constitute clear QA pairs.
+            If there are multiple questions and multiple answers in a single sentence, break them into separate pairs. Each pair must be standalone, and should not contain more than one question or response.
+
+            OUTPUT Format:
+            Return a **JSON object** with two keys:
+            - `"question"`: the user's question
+            - `"response"`: the assistant's direct response
+
+            If no valid QA pairs are found, return:
+            ```json
+            {{
+                "question": "",
+                "response": ""
+            }}
+
+            CHAIN OF THOUGHT:
+            - Read the full conversation sequentially.
+            - Identify user turns that clearly ask a question (explicit or strongly implied).
+            - Match each question with the immediate assistant response.
+            - Only include pairs where the assistant's reply directly addresses the user's question.
+            - Do not include incomplete, ambiguous, or out-of-context entries.
+
+            EXAMPLE:
+
+            Conversation:
+
+            user: Which food is best for diabetic patients?
+            assistant: Steel-cut oats are good for diabetic patients
+            user: Is it better if I eat muesli instead of oats?
+            assistant: While muesli is good for diabetic people, steel-cut oats are preferred. Refer to your nutritionist for better guidance.
+
+            Example JSON:
+            {{
+                "question": "Which food is best for diabetic patients?",
+                "response": "Steel-cut oats are good for diabetic patients"
+            }}
+            ===== END OF EXAMPLE ======
+
+            **
+            IMPORTANT: Please make sure to only return in JSON format with one key: 'qa_pairs' and the value MUST be a list of dictionaries
+            **
+
+            Conversation:
+            {conversation}
+            JSON:
+            """
+        )
+
+    @staticmethod
+    def get_qa_pair_verdict(
+        relevant_topics: List[str],
+        question: str,
+        response: str,
+    ) -> str:
+        return textwrap.dedent(
+            f"""You are given:
+            - A list of **relevant topics**
+            - A **user question**
+            - An **assistant response**
+
+            Your task is to:
+            1. Determine if the question is relevant to the list of topics.
+            2. If it is relevant, evaluate whether the response properly answers the question.
+            3. Based on both relevance and correctness, assign one of four possible verdicts.
+            4. Give a simple, comprehensive reason explaining why this question-answer pair was assigned this verdict.
+
+            VERDICTS:
+            - `"TP"` (True Positive): Question is relevant and the response correctly answers it.
+            - `"FN"` (False Negative): Question is relevant, but the assistant refused to answer or gave an irrelevant response.
+            - `"FP"` (False Positive): Question is NOT relevant, but the assistant still gave an answer (based on general/training knowledge).
+            - `"TN"` (True Negative): Question is NOT relevant, and the assistant correctly refused to answer.
+
+            OUTPUT FORMAT:
+            Return only a **JSON object** with two keys:
+            ```json
+            {{
+                "verdict": "TP" // or TN, FP, FN
+                "reason": "Reason why the verdict is 'TP'"
+            }}
+
+            CHAIN OF THOUGHT:
+            - Check if the question aligns with any of the relevant topics.
+            - If yes:
+                - Assess if the response is correct, complete, and directly answers the question.
+            - If no:
+                - Check if the assistant refused appropriately or gave an unwarranted answer.
+            - Choose the correct verdict using the definitions above.
+
+            EXAMPLE:
+
+            Relevant topics: ["health nutrition", "food and their benefits"]
+            Question: "Which food is best for diabetic patients?"
+            Response: "Steel-cut oats are good for diabetic patients"
+
+            Example JSON:
+            {{
+                "verdict": "TP",
+                "reason": "The question asks about food for diabetic patients and the response clearly answers that oats are good for diabetic patients. Both align with the relevant topics of health nutrition and food and their benefits..."
+            }}
+
+            ===== END OF EXAMPLE ======
+
+            **
+            IMPORTANT: Please make sure to only return in JSON format with two keys: 'verdict' and 'reason'
+            **
+
+            Relevant topics: {relevant_topics}
+            Question: {question}
+            Response: {response}
+
+            JSON:
+            """
+        )
+
133
+ @staticmethod
134
+ def generate_reason(success, score, threshold, TP, TN, FP, FN) -> str:
135
+ return textwrap.dedent(
136
+ f"""You are given a score for a metric that calculates whether an agent has adhered to it's topics.
137
+ You are also given a list of reasons for the truth table values that were used to calculate final score.
138
+
139
+ Your task is to go through these reasons and give a single final explaination that clearly explains why this metric has failed or passed.
140
+
141
+ Pass: {success}
142
+ Score: {score}
143
+ Threshold: {threshold}
144
+
145
+ Here are the reasons for all truth table entries:
146
+
147
+ True positive reasons: {TP[1]}
148
+ True negative reasons: {TN[1]}
149
+ False positives reasons: {FP[1]}
150
+ False negatives reasons: {FN[1]}
151
+
152
+ Score calculation = Number of True Positives + Number of True Negatives / Total number of table entries
153
+
154
+ **
155
+ IMPORTANT: Now generate a comprehensive reason that explains why this metric failed. You MUST output only the reason as a string and nothing else.
156
+ **
157
+
158
+ Output ONLY the reason, DON"T output anything else.
159
+
160
+ Reason:
161
+ """
162
+ )
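
These templates only build prompts; the scoring logic lives in `topic_adherence.py` (+355), which this section does not show. Below is a hedged sketch of how the pieces presumably fit together, with the adherence score computed exactly as stated in `generate_reason`; the verdict tally itself is illustrative.

```python
# Sketch under assumptions: how TopicAdherenceTemplate prompts might be consumed.
# Only the (TP + TN) / total formula is taken from the template text above.
from deepeval.metrics.topic_adherence.template import TopicAdherenceTemplate

# Prompt asking the judge model to extract QA pairs from the conversation.
qa_prompt = TopicAdherenceTemplate.get_qa_pairs(
    "user: Which food is best for diabetic patients?\n"
    "assistant: Steel-cut oats are good for diabetic patients"
)

# Prompt asking the judge model to classify one QA pair as TP/TN/FP/FN.
verdict_prompt = TopicAdherenceTemplate.get_qa_pair_verdict(
    relevant_topics=["health nutrition", "food and their benefits"],
    question="Which food is best for diabetic patients?",
    response="Steel-cut oats are good for diabetic patients",
)

# Verdicts are tallied into a truth table; adherence = (TP + TN) / total verdicts.
tp, tn, fp, fn = 3, 1, 1, 0
score = (tp + tn) / (tp + tn + fp + fn)  # 0.8
```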