deepeval 3.6.7__py3-none-any.whl → 3.6.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +104 -36
  3. deepeval/config/utils.py +5 -0
  4. deepeval/dataset/dataset.py +162 -30
  5. deepeval/dataset/utils.py +41 -13
  6. deepeval/errors.py +20 -2
  7. deepeval/evaluate/execute.py +1662 -688
  8. deepeval/evaluate/types.py +1 -0
  9. deepeval/evaluate/utils.py +13 -3
  10. deepeval/integrations/crewai/__init__.py +2 -1
  11. deepeval/integrations/crewai/tool.py +71 -0
  12. deepeval/integrations/llama_index/__init__.py +0 -4
  13. deepeval/integrations/llama_index/handler.py +20 -21
  14. deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
  15. deepeval/metrics/__init__.py +13 -0
  16. deepeval/metrics/base_metric.py +1 -0
  17. deepeval/metrics/contextual_precision/contextual_precision.py +27 -21
  18. deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  19. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +11 -7
  20. deepeval/metrics/dag/schema.py +1 -1
  21. deepeval/metrics/dag/templates.py +2 -2
  22. deepeval/metrics/goal_accuracy/__init__.py +1 -0
  23. deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
  24. deepeval/metrics/goal_accuracy/schema.py +17 -0
  25. deepeval/metrics/goal_accuracy/template.py +235 -0
  26. deepeval/metrics/hallucination/hallucination.py +8 -8
  27. deepeval/metrics/indicator.py +21 -1
  28. deepeval/metrics/mcp/mcp_task_completion.py +7 -2
  29. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +16 -6
  30. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +2 -1
  31. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +32 -24
  32. deepeval/metrics/plan_adherence/__init__.py +1 -0
  33. deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
  34. deepeval/metrics/plan_adherence/schema.py +11 -0
  35. deepeval/metrics/plan_adherence/template.py +170 -0
  36. deepeval/metrics/plan_quality/__init__.py +1 -0
  37. deepeval/metrics/plan_quality/plan_quality.py +292 -0
  38. deepeval/metrics/plan_quality/schema.py +11 -0
  39. deepeval/metrics/plan_quality/template.py +101 -0
  40. deepeval/metrics/step_efficiency/__init__.py +1 -0
  41. deepeval/metrics/step_efficiency/schema.py +11 -0
  42. deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
  43. deepeval/metrics/step_efficiency/template.py +256 -0
  44. deepeval/metrics/task_completion/task_completion.py +1 -0
  45. deepeval/metrics/tool_correctness/schema.py +6 -0
  46. deepeval/metrics/tool_correctness/template.py +88 -0
  47. deepeval/metrics/tool_correctness/tool_correctness.py +226 -22
  48. deepeval/metrics/tool_use/__init__.py +1 -0
  49. deepeval/metrics/tool_use/schema.py +19 -0
  50. deepeval/metrics/tool_use/template.py +220 -0
  51. deepeval/metrics/tool_use/tool_use.py +458 -0
  52. deepeval/metrics/topic_adherence/__init__.py +1 -0
  53. deepeval/metrics/topic_adherence/schema.py +16 -0
  54. deepeval/metrics/topic_adherence/template.py +162 -0
  55. deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
  56. deepeval/models/embedding_models/azure_embedding_model.py +37 -36
  57. deepeval/models/embedding_models/local_embedding_model.py +30 -32
  58. deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
  59. deepeval/models/embedding_models/openai_embedding_model.py +22 -31
  60. deepeval/models/llms/amazon_bedrock_model.py +20 -17
  61. deepeval/models/llms/openai_model.py +10 -1
  62. deepeval/models/retry_policy.py +103 -20
  63. deepeval/openai/extractors.py +61 -16
  64. deepeval/openai/patch.py +8 -12
  65. deepeval/openai/types.py +1 -1
  66. deepeval/openai/utils.py +108 -1
  67. deepeval/prompt/prompt.py +1 -0
  68. deepeval/prompt/utils.py +43 -14
  69. deepeval/simulator/conversation_simulator.py +25 -18
  70. deepeval/synthesizer/chunking/context_generator.py +9 -1
  71. deepeval/synthesizer/synthesizer.py +11 -10
  72. deepeval/test_case/llm_test_case.py +6 -2
  73. deepeval/test_run/test_run.py +190 -207
  74. deepeval/tracing/__init__.py +2 -1
  75. deepeval/tracing/otel/exporter.py +3 -4
  76. deepeval/tracing/otel/utils.py +23 -4
  77. deepeval/tracing/trace_context.py +53 -38
  78. deepeval/tracing/tracing.py +23 -0
  79. deepeval/tracing/types.py +16 -14
  80. deepeval/utils.py +21 -0
  81. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/METADATA +1 -1
  82. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/RECORD +85 -63
  83. deepeval/integrations/llama_index/agent/patched.py +0 -68
  84. deepeval/tracing/message_types/__init__.py +0 -10
  85. deepeval/tracing/message_types/base.py +0 -6
  86. deepeval/tracing/message_types/messages.py +0 -14
  87. deepeval/tracing/message_types/tools.py +0 -18
  88. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/LICENSE.md +0 -0
  89. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/WHEEL +0 -0
  90. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/entry_points.txt +0 -0
deepeval/metrics/plan_quality/plan_quality.py (added)
@@ -0,0 +1,292 @@
+ from typing import Optional, List, Union, Dict
+
+ from deepeval.utils import get_or_create_event_loop, prettify_list
+ from deepeval.metrics.utils import (
+     construct_verbose_logs,
+     trimAndLoadJson,
+     check_llm_test_case_params,
+     initialize_model,
+ )
+ from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+ from deepeval.metrics import BaseMetric
+ from deepeval.models import DeepEvalBaseLLM
+ from deepeval.metrics.indicator import metric_progress_indicator
+ from deepeval.metrics.step_efficiency.template import (
+     StepEfficiencyTemplate,
+ )
+ from deepeval.metrics.step_efficiency.schema import Task
+ from deepeval.metrics.plan_quality.schema import (
+     AgentPlan,
+     PlanQualityScore,
+ )
+ from deepeval.metrics.plan_quality.template import (
+     PlanQualityTemplate,
+ )
+ from deepeval.metrics.plan_adherence.template import (
+     PlanAdherenceTemplate,
+ )
+ from deepeval.metrics.api import metric_data_manager
+
+
+ class PlanQualityMetric(BaseMetric):
+
+     _required_params: List[LLMTestCaseParams] = [
+         LLMTestCaseParams.INPUT,
+         LLMTestCaseParams.ACTUAL_OUTPUT,
+         LLMTestCaseParams.TOOLS_CALLED,
+     ]
+
+     def __init__(
+         self,
+         threshold: float = 0.5,
+         model: Optional[Union[str, DeepEvalBaseLLM]] = None,
+         include_reason: bool = True,
+         async_mode: bool = True,
+         strict_mode: bool = False,
+         verbose_mode: bool = False,
+     ):
+         self.threshold = 1 if strict_mode else threshold
+         self.model, self.using_native_model = initialize_model(model)
+         self.evaluation_model = self.model.get_model_name()
+         self.include_reason = include_reason
+         self.async_mode = async_mode
+         self.strict_mode = strict_mode
+         self.verbose_mode = verbose_mode
+         self.requires_trace = True
+
+     def measure(
+         self,
+         test_case: LLMTestCase,
+         _show_indicator: bool = True,
+         _in_component: bool = False,
+         _log_metric_to_confident: bool = True,
+     ):
+         has_trace: bool = isinstance(test_case._trace_dict, Dict)
+         if not has_trace:
+             check_llm_test_case_params(test_case, self._required_params, self)
+
+         self.evaluation_cost = 0 if self.using_native_model else None
+         with metric_progress_indicator(
+             self, _show_indicator=_show_indicator, _in_component=_in_component
+         ):
+             if self.async_mode:
+                 loop = get_or_create_event_loop()
+                 loop.run_until_complete(
+                     self.a_measure(
+                         test_case,
+                         _show_indicator=False,
+                         _in_component=_in_component,
+                         _log_metric_to_confident=_log_metric_to_confident,
+                     )
+                 )
+             else:
+                 task = self._extract_task_from_trace(test_case)
+                 agent_plan = self._extract_plan_from_trace(test_case)
+                 if len(agent_plan.plan) == 0:
+                     self.score = 1
+                     self.reason = "There were no plans to evaluate within the trace of your agent's execution. Please check if the agent's planning or reasoning or thinking is stored in any one of the trace attributes."
+                 else:
+                     plan_quality_score = self._get_plan_quality_score(
+                         task, agent_plan.plan
+                     )
+                     self.score = (
+                         0
+                         if self.strict_mode
+                         and plan_quality_score.score < self.threshold
+                         else plan_quality_score.score
+                     )
+                     self.reason = plan_quality_score.reason
+                 self.success = self.score >= self.threshold
+                 self.verbose_logs = construct_verbose_logs(
+                     self,
+                     steps=[
+                         f"Task: {task} \n",
+                         f"Agent Plan: \n{prettify_list(agent_plan.plan)} \n",
+                         f"Final Score Score: {self.score} \n",
+                         f"Final Score Reason: {self.reason} \n",
+                     ],
+                 )
+
+                 if _log_metric_to_confident:
+                     metric_data_manager.post_metric_if_enabled(
+                         self, test_case=test_case
+                     )
+
+             return self.score
+
+     async def a_measure(
+         self,
+         test_case: LLMTestCase,
+         _show_indicator: bool = True,
+         _in_component: bool = False,
+         _log_metric_to_confident: bool = True,
+     ):
+         has_trace: bool = isinstance(test_case._trace_dict, Dict)
+         if not has_trace:
+             check_llm_test_case_params(test_case, self._required_params, self)
+
+         self.evaluation_cost = 0 if self.using_native_model else None
+
+         with metric_progress_indicator(
+             self,
+             async_mode=True,
+             _show_indicator=_show_indicator,
+             _in_component=_in_component,
+         ):
+             task = await self._a_extract_task_from_trace(test_case)
+             agent_plan = await self._a_extract_plan_from_trace(test_case)
+             if len(agent_plan.plan) == 0:
+                 self.score = 1
+                 self.reason = "There are no plans to evaluate within the trace of your agent's execution. Please check if the agent's planning or reasoning or thinking is stored in the trace attributes."
+             else:
+                 plan_quality_score = await self._a_get_plan_quality_score(
+                     task, agent_plan.plan
+                 )
+                 self.score = (
+                     0
+                     if self.strict_mode
+                     and plan_quality_score.score < self.threshold
+                     else plan_quality_score.score
+                 )
+                 self.reason = plan_quality_score.reason
+             self.success = self.score >= self.threshold
+             self.verbose_logs = construct_verbose_logs(
+                 self,
+                 steps=[
+                     f"Task: {task} \n",
+                     f"Agent Plan: \n{prettify_list(agent_plan.plan)} \n",
+                     f"Final Score: {self.score} \n",
+                     f"Final Reason: {self.reason} \n",
+                 ],
+             )
+
+             if _log_metric_to_confident:
+                 metric_data_manager.post_metric_if_enabled(
+                     self, test_case=test_case
+                 )
+
+             return self.score
+
+     def _get_plan_quality_score(self, task, plan):
+         prompt = PlanQualityTemplate.evaluate_plan_quality(
+             task, "\n".join(plan)
+         )
+         if self.using_native_model:
+             res, cost = self.model.generate(prompt, schema=PlanQualityScore)
+             self.evaluation_cost += cost
+             return res
+         else:
+             try:
+                 res: Task = self.model.generate(prompt, schema=PlanQualityScore)
+                 return res
+             except TypeError:
+                 res = self.model.generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 return PlanQualityScore(**data)
+
+     async def _a_get_plan_quality_score(self, task, plan):
+         prompt = PlanQualityTemplate.evaluate_plan_quality(
+             task, "\n".join(plan)
+         )
+         if self.using_native_model:
+             res, cost = await self.model.a_generate(
+                 prompt, schema=PlanQualityScore
+             )
+             self.evaluation_cost += cost
+             return res
+         else:
+             try:
+                 res: Task = await self.model.a_generate(
+                     prompt, schema=PlanQualityScore
+                 )
+                 return res
+             except TypeError:
+                 res = await self.model.a_generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 return PlanQualityScore(**data)
+
+     def _extract_plan_from_trace(self, test_case: LLMTestCase) -> AgentPlan:
+         prompt = PlanAdherenceTemplate.extract_plan_from_trace(
+             test_case._trace_dict
+         )
+         if self.using_native_model:
+             res, cost = self.model.generate(prompt, schema=AgentPlan)
+             self.evaluation_cost += cost
+             return res
+         else:
+             try:
+                 res: Task = self.model.generate(prompt, schema=AgentPlan)
+                 return res
+             except TypeError:
+                 res = self.model.generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 return AgentPlan(**data)
+
+     async def _a_extract_plan_from_trace(
+         self, test_case: LLMTestCase
+     ) -> AgentPlan:
+         prompt = PlanAdherenceTemplate.extract_plan_from_trace(
+             test_case._trace_dict
+         )
+         if self.using_native_model:
+             res, cost = await self.model.a_generate(prompt, schema=AgentPlan)
+             self.evaluation_cost += cost
+             return res
+         else:
+             try:
+                 res: Task = await self.model.a_generate(
+                     prompt, schema=AgentPlan
+                 )
+                 return res
+             except TypeError:
+                 res = await self.model.a_generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 return AgentPlan(**data)
+
+     def _extract_task_from_trace(self, test_case: LLMTestCase) -> str:
+         prompt = StepEfficiencyTemplate.extract_task_from_trace(
+             test_case._trace_dict
+         )
+         if self.using_native_model:
+             res, cost = self.model.generate(prompt, schema=Task)
+             self.evaluation_cost += cost
+             return res.task
+         else:
+             try:
+                 res: Task = self.model.generate(prompt, schema=Task)
+                 return res.task
+             except TypeError:
+                 res = self.model.generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 return data["task"]
+
+     async def _a_extract_task_from_trace(self, test_case: LLMTestCase) -> str:
+         prompt = StepEfficiencyTemplate.extract_task_from_trace(
+             test_case._trace_dict
+         )
+         if self.using_native_model:
+             res, cost = await self.model.a_generate(prompt, schema=Task)
+             self.evaluation_cost += cost
+             return res.task
+         else:
+             try:
+                 res: Task = await self.model.a_generate(prompt, schema=Task)
+                 return res.task
+             except TypeError:
+                 res = await self.model.a_generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 return data["task"]
+
+     def is_successful(self) -> bool:
+         if self.error is not None:
+             self.success = False
+         else:
+             try:
+                 self.success = self.score >= self.threshold
+             except:
+                 self.success = False
+         return self.success
+
+     @property
+     def __name__(self):
+         return "Plan Quality"
deepeval/metrics/plan_quality/schema.py (added)
@@ -0,0 +1,11 @@
+ from pydantic import BaseModel
+ from typing import List, Dict, Literal
+
+
+ class AgentPlan(BaseModel):
+     plan: List[str]
+
+
+ class PlanQualityScore(BaseModel):
+     score: float
+     reason: str
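
These models serve both as the schema= argument for structured generation and as the validator in the raw-JSON fallback path (trimAndLoadJson followed by PlanQualityScore(**data)). A small standalone illustration of that fallback, with an invented model completion:

import json

from deepeval.metrics.plan_quality.schema import AgentPlan, PlanQualityScore

# Invented completion; in the metric this string comes from model.generate()
# and is trimmed down to JSON by trimAndLoadJson before validation.
raw = '{"score": 0.75, "reason": "Covers the task but step 3 duplicates step 1."}'
verdict = PlanQualityScore(**json.loads(raw))

plan = AgentPlan(plan=["Search flights", "Filter direct routes", "Book cheapest"])
print(verdict.score, len(plan.plan))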
deepeval/metrics/plan_quality/template.py (added)
@@ -0,0 +1,101 @@
+ import textwrap
+ import json
+ from deepeval.tracing.utils import make_json_serializable
+
+
+ class PlanQualityTemplate:
+
+     @staticmethod
+     def evaluate_plan_quality(user_task: str, agent_plan: list) -> str:
+         return textwrap.dedent(
+             f"""You are a **plan quality evaluator**. Your task is to critically assess the **quality, completeness, and optimality** of an AI agent's plan to accomplish the given user task.
+
+             INPUTS:
+
+             - **User Task:** The user's explicit goal or instruction.
+             - **Agent Plan:** The ordered list of steps the agent intends to follow to achieve that goal.
+
+             EVALUATION OBJECTIVE:
+
+             Judge the **intrinsic quality** of the plan — whether the plan itself is strong enough to fully and efficiently achieve the user's task.
+
+             The evaluation must be **strict**. If the plan is incomplete, inefficient, redundant, or missing critical details, assign a very low score.
+
+             STRICT EVALUATION CRITERIA:
+
+             1. Completeness (Most Important)
+             - The plan must fully address all major requirements of the user task.
+             - Missing even one critical subtask or dependency should reduce the score sharply.
+             - The plan must include all prerequisite actions necessary for the final outcome.
+
+             2. Logical Coherence
+             - Steps must follow a clear, rational sequence that leads directly to completing the task.
+             - Disordered, redundant, or circular reasoning should be penalized heavily.
+             - Every step must have a clear purpose; no filler or irrelevant actions.
+
+             3. Optimality and Efficiency
+             - The plan must be **minimal but sufficient** — no unnecessary or repetitive steps.
+             - If a more direct, simpler, or logically superior plan could achieve the same outcome, the current plan should receive a lower score.
+
+             4. Level of Detail
+             - Each step should be specific enough for an agent to execute it reliably without ambiguity.
+             - Vague steps (e.g., “Do research”, “Handle results”) that lack operational clarity
+             lower the score.
+
+             5. Alignment with Task
+             - The plan must explicitly and directly target the user's stated goal.
+             - If any step diverges from the main objective, the score should drop significantly.
+
+             ---
+
+             SCORING SCALE (STRICT)
+
+             - **1.0 — Excellent plan**
+               - Fully complete, logically ordered, and optimally efficient.
+               - No missing, redundant, or ambiguous steps.
+               - Directly fulfills every aspect of the user task.
+
+             - **0.75 — Good plan**
+               - Covers nearly all aspects of the task with clear logic.
+               - Minor gaps or small inefficiencies that do not block task completion.
+
+             - **0.5 — Adequate but flawed plan**
+               - Partially complete; key details missing or step order inefficient.
+               - Some ambiguity or redundancy that would likely affect execution success.
+
+             - **0.25 — Weak plan**
+               - Major missing steps or unclear logic.
+               - The plan would likely fail to complete the task as written.
+
+             - **0.0 — Inadequate plan**
+               - Irrelevant, incoherent, or severely incomplete plan.
+               - Does not align with the user’s task or cannot plausibly achieve it.
+
+             *When in doubt, assign the lower score.*
+
+             OUTPUT FORMAT:
+
+             Return a JSON object with this exact structure:
+
+             {{
+                 "score": 0.0,
+                 "reason": "1-3 short, precise sentences explaining what the plan lacks or how it could fail."
+             }}
+
+             The `"reason"` must:
+             - Reference specific missing, unclear, or inefficient steps.
+             - Avoid vague language (“seems fine”, “mostly works”).
+             - Use objective terms describing gaps or weaknesses.
+
+             PROVIDED DATA
+
+             User Task:
+             {user_task}
+
+             Agent Plan:
+             {agent_plan}
+
+
+             JSON:
+             """
+         )
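
To inspect the rendered rubric outside an evaluation run, the static method can be called directly. The task and plan below are made up; note that the metric passes the plan as a newline-joined string even though the signature annotates it as a list.

from deepeval.metrics.plan_quality.template import PlanQualityTemplate

prompt = PlanQualityTemplate.evaluate_plan_quality(
    user_task="Summarize the three most recently merged PRs in the repo",
    agent_plan="\n".join(
        [
            "1. List merged PRs via the repository API",
            "2. Sort by merge date and keep the latest three",
            "3. Summarize each PR's title and description",
        ]
    ),
)
print(prompt)  # rubric, scoring scale, and the task/plan, ending with "JSON:"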
deepeval/metrics/step_efficiency/__init__.py (added)
@@ -0,0 +1 @@
+ from .step_efficiency import StepEfficiencyMetric
deepeval/metrics/step_efficiency/schema.py (added)
@@ -0,0 +1,11 @@
+ from pydantic import BaseModel
+ from typing import List, Dict, Literal
+
+
+ class Task(BaseModel):
+     task: str
+
+
+ class EfficiencyVerdict(BaseModel):
+     score: float
+     reason: str
deepeval/metrics/step_efficiency/step_efficiency.py (added)
@@ -0,0 +1,234 @@
+ from typing import Optional, List, Union, Dict
+
+ from deepeval.utils import get_or_create_event_loop
+ from deepeval.metrics.utils import (
+     construct_verbose_logs,
+     trimAndLoadJson,
+     check_llm_test_case_params,
+     initialize_model,
+ )
+ from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+ from deepeval.metrics import BaseMetric
+ from deepeval.models import DeepEvalBaseLLM
+ from deepeval.metrics.indicator import metric_progress_indicator
+ from deepeval.metrics.step_efficiency.template import (
+     StepEfficiencyTemplate,
+ )
+ from deepeval.metrics.step_efficiency.schema import Task, EfficiencyVerdict
+ from deepeval.metrics.api import metric_data_manager
+
+
+ class StepEfficiencyMetric(BaseMetric):
+
+     _required_params: List[LLMTestCaseParams] = [
+         LLMTestCaseParams.INPUT,
+         LLMTestCaseParams.ACTUAL_OUTPUT,
+         LLMTestCaseParams.TOOLS_CALLED,
+     ]
+
+     def __init__(
+         self,
+         threshold: float = 0.5,
+         model: Optional[Union[str, DeepEvalBaseLLM]] = None,
+         include_reason: bool = True,
+         async_mode: bool = True,
+         strict_mode: bool = False,
+         verbose_mode: bool = False,
+     ):
+         self.threshold = 1 if strict_mode else threshold
+         self.model, self.using_native_model = initialize_model(model)
+         self.evaluation_model = self.model.get_model_name()
+         self.include_reason = include_reason
+         self.async_mode = async_mode
+         self.strict_mode = strict_mode
+         self.verbose_mode = verbose_mode
+         self.requires_trace = True
+
+     def measure(
+         self,
+         test_case: LLMTestCase,
+         _show_indicator: bool = True,
+         _in_component: bool = False,
+         _log_metric_to_confident: bool = True,
+     ):
+         has_trace: bool = isinstance(test_case._trace_dict, Dict)
+         if not has_trace:
+             check_llm_test_case_params(test_case, self._required_params, self)
+
+         self.evaluation_cost = 0 if self.using_native_model else None
+         with metric_progress_indicator(
+             self, _show_indicator=_show_indicator, _in_component=_in_component
+         ):
+             if self.async_mode:
+                 loop = get_or_create_event_loop()
+                 loop.run_until_complete(
+                     self.a_measure(
+                         test_case,
+                         _show_indicator=False,
+                         _in_component=_in_component,
+                         _log_metric_to_confident=_log_metric_to_confident,
+                     )
+                 )
+             else:
+                 task = self._extract_task_from_trace(test_case)
+                 efficiency_verdict = self._get_score(task, test_case)
+                 self.score = (
+                     0
+                     if self.strict_mode
+                     and efficiency_verdict.score < self.threshold
+                     else efficiency_verdict.score
+                 )
+                 self.reason = efficiency_verdict.reason
+                 self.success = self.score >= self.threshold
+                 self.verbose_logs = construct_verbose_logs(
+                     self,
+                     steps=[
+                         f"Task: {task} \n",
+                         f"Efficiency Score: {self.score}",
+                         f"Efficiency Reason: {self.reason}",
+                     ],
+                 )
+
+                 if _log_metric_to_confident:
+                     metric_data_manager.post_metric_if_enabled(
+                         self, test_case=test_case
+                     )
+
+             return self.score
+
+     async def a_measure(
+         self,
+         test_case: LLMTestCase,
+         _show_indicator: bool = True,
+         _in_component: bool = False,
+         _log_metric_to_confident: bool = True,
+     ):
+         has_trace: bool = isinstance(test_case._trace_dict, Dict)
+         if not has_trace:
+             check_llm_test_case_params(test_case, self._required_params, self)
+
+         self.evaluation_cost = 0 if self.using_native_model else None
+
+         with metric_progress_indicator(
+             self,
+             async_mode=True,
+             _show_indicator=_show_indicator,
+             _in_component=_in_component,
+         ):
+             task = await self._a_extract_task_from_trace(test_case)
+             efficiency_verdict = await self._a_get_score(task, test_case)
+             self.score = (
+                 0
+                 if self.strict_mode
+                 and efficiency_verdict.score < self.threshold
+                 else efficiency_verdict.score
+             )
+             self.reason = efficiency_verdict.reason
+             self.success = self.score >= self.threshold
+             self.verbose_logs = construct_verbose_logs(
+                 self,
+                 steps=[
+                     f"Task: {task} \n",
+                     f"Efficiency Score: {self.score}",
+                     f"Efficiency Reason: {self.reason}",
+                 ],
+             )
+
+             if _log_metric_to_confident:
+                 metric_data_manager.post_metric_if_enabled(
+                     self, test_case=test_case
+                 )
+
+             return self.score
+
+     def _get_score(self, task: str, test_case: LLMTestCase):
+         if test_case._trace_dict is not None:
+             prompt = StepEfficiencyTemplate.get_execution_efficiency(
+                 task, test_case._trace_dict
+             )
+
+         if self.using_native_model:
+             res, cost = self.model.generate(prompt, schema=EfficiencyVerdict)
+             self.evaluation_cost += cost
+             return res
+         else:
+             try:
+                 res: Task = self.model.generate(
+                     prompt, schema=EfficiencyVerdict
+                 )
+                 return res
+             except TypeError:
+                 res = self.model.generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 return EfficiencyVerdict(**data)
+
+     async def _a_get_score(self, task: str, test_case: LLMTestCase):
+         if test_case._trace_dict is not None:
+             prompt = StepEfficiencyTemplate.get_execution_efficiency(
+                 task, test_case._trace_dict
+             )
+
+         if self.using_native_model:
+             res, cost = await self.model.a_generate(
+                 prompt, schema=EfficiencyVerdict
+             )
+             self.evaluation_cost += cost
+             return res
+         else:
+             try:
+                 res: Task = await self.model.a_generate(
+                     prompt, schema=EfficiencyVerdict
+                 )
+                 return res
+             except TypeError:
+                 res = await self.model.a_generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 return EfficiencyVerdict(**data)
+
+     def _extract_task_from_trace(self, test_case: LLMTestCase) -> str:
+         prompt = StepEfficiencyTemplate.extract_task_from_trace(
+             test_case._trace_dict
+         )
+         if self.using_native_model:
+             res, cost = self.model.generate(prompt, schema=Task)
+             self.evaluation_cost += cost
+             return res.task
+         else:
+             try:
+                 res: Task = self.model.generate(prompt, schema=Task)
+                 return res.task
+             except TypeError:
+                 res = self.model.generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 return data["task"]
+
+     async def _a_extract_task_from_trace(self, test_case: LLMTestCase) -> str:
+         prompt = StepEfficiencyTemplate.extract_task_from_trace(
+             test_case._trace_dict
+         )
+         if self.using_native_model:
+             res, cost = await self.model.a_generate(prompt, schema=Task)
+             self.evaluation_cost += cost
+             return res.task
+         else:
+             try:
+                 res: Task = await self.model.a_generate(prompt, schema=Task)
+                 return res.task
+             except TypeError:
+                 res = await self.model.a_generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 return data["task"]
+
+     def is_successful(self) -> bool:
+         if self.error is not None:
+             self.success = False
+         else:
+             try:
+                 self.success = self.score >= self.threshold
+             except:
+                 self.success = False
+         return self.success
+
+     @property
+     def __name__(self):
+         return "Execution Efficiency"