deepeval-3.6.7-py3-none-any.whl → deepeval-3.6.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/errors.py +20 -2
- deepeval/evaluate/execute.py +725 -217
- deepeval/evaluate/types.py +1 -0
- deepeval/evaluate/utils.py +13 -3
- deepeval/integrations/crewai/__init__.py +2 -1
- deepeval/integrations/crewai/tool.py +71 -0
- deepeval/integrations/llama_index/__init__.py +0 -4
- deepeval/integrations/llama_index/handler.py +20 -21
- deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
- deepeval/metrics/__init__.py +13 -0
- deepeval/metrics/base_metric.py +1 -0
- deepeval/metrics/contextual_precision/contextual_precision.py +27 -21
- deepeval/metrics/conversational_g_eval/__init__.py +3 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +11 -7
- deepeval/metrics/dag/schema.py +1 -1
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/goal_accuracy/__init__.py +1 -0
- deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
- deepeval/metrics/goal_accuracy/schema.py +17 -0
- deepeval/metrics/goal_accuracy/template.py +235 -0
- deepeval/metrics/hallucination/hallucination.py +8 -8
- deepeval/metrics/mcp/mcp_task_completion.py +7 -2
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +16 -6
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +2 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +32 -24
- deepeval/metrics/plan_adherence/__init__.py +1 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
- deepeval/metrics/plan_adherence/schema.py +11 -0
- deepeval/metrics/plan_adherence/template.py +170 -0
- deepeval/metrics/plan_quality/__init__.py +1 -0
- deepeval/metrics/plan_quality/plan_quality.py +292 -0
- deepeval/metrics/plan_quality/schema.py +11 -0
- deepeval/metrics/plan_quality/template.py +101 -0
- deepeval/metrics/step_efficiency/__init__.py +1 -0
- deepeval/metrics/step_efficiency/schema.py +11 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
- deepeval/metrics/step_efficiency/template.py +256 -0
- deepeval/metrics/task_completion/task_completion.py +1 -0
- deepeval/metrics/tool_correctness/schema.py +6 -0
- deepeval/metrics/tool_correctness/template.py +88 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +226 -22
- deepeval/metrics/tool_use/__init__.py +1 -0
- deepeval/metrics/tool_use/schema.py +19 -0
- deepeval/metrics/tool_use/template.py +220 -0
- deepeval/metrics/tool_use/tool_use.py +458 -0
- deepeval/metrics/topic_adherence/__init__.py +1 -0
- deepeval/metrics/topic_adherence/schema.py +16 -0
- deepeval/metrics/topic_adherence/template.py +162 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
- deepeval/models/embedding_models/azure_embedding_model.py +37 -36
- deepeval/models/embedding_models/local_embedding_model.py +30 -32
- deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
- deepeval/models/embedding_models/openai_embedding_model.py +22 -31
- deepeval/openai/extractors.py +61 -16
- deepeval/openai/patch.py +8 -12
- deepeval/openai/types.py +1 -1
- deepeval/openai/utils.py +108 -1
- deepeval/prompt/prompt.py +1 -0
- deepeval/prompt/utils.py +43 -14
- deepeval/synthesizer/synthesizer.py +11 -10
- deepeval/test_case/llm_test_case.py +6 -2
- deepeval/test_run/test_run.py +190 -207
- deepeval/tracing/__init__.py +2 -1
- deepeval/tracing/otel/exporter.py +3 -4
- deepeval/tracing/otel/utils.py +23 -4
- deepeval/tracing/trace_context.py +53 -38
- deepeval/tracing/tracing.py +23 -0
- deepeval/tracing/types.py +16 -14
- deepeval/utils.py +21 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
- {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/RECORD +75 -53
- deepeval/integrations/llama_index/agent/patched.py +0 -68
- deepeval/tracing/message_types/__init__.py +0 -10
- deepeval/tracing/message_types/base.py +0 -6
- deepeval/tracing/message_types/messages.py +0 -14
- deepeval/tracing/message_types/tools.py +0 -18
- {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
deepeval/metrics/conversational_g_eval/conversational_g_eval.py CHANGED

@@ -1,7 +1,7 @@
 """A slightly modified tailored version of the LLM evaluated metric based on the GEval framework: https://arxiv.org/pdf/2303.16634.pdf"""

 from openai.types.chat.chat_completion import ChatCompletion
-from typing import Optional, List, Tuple, Union, Dict
+from typing import Optional, List, Tuple, Union, Dict, Type
 import math
 from deepeval.metrics import BaseConversationalMetric
 from deepeval.metrics.g_eval.utils import (
@@ -44,6 +44,9 @@ class ConversationalGEval(BaseConversationalMetric):
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
+        evaluation_template: Type[
+            ConversationalGEvalTemplate
+        ] = ConversationalGEvalTemplate,
         _include_g_eval_suffix: bool = True,
     ):
         if evaluation_params is not None and len(evaluation_params) == 0:
@@ -85,6 +88,7 @@ class ConversationalGEval(BaseConversationalMetric):
         self.strict_mode = strict_mode
         self.async_mode = async_mode
         self.verbose_mode = verbose_mode
+        self.evaluation_template = evaluation_template
         self._include_g_eval_suffix = _include_g_eval_suffix

     def measure(
@@ -194,7 +198,7 @@ class ConversationalGEval(BaseConversationalMetric):
         g_eval_params_str = construct_conversational_g_eval_turn_params_string(
             self.evaluation_params
         )
-        prompt =
+        prompt = self.evaluation_template.generate_evaluation_steps(
             criteria=self.criteria, parameters=g_eval_params_str
         )
         if self.using_native_model:
@@ -221,7 +225,7 @@ class ConversationalGEval(BaseConversationalMetric):
         g_eval_params_str = construct_conversational_g_eval_turn_params_string(
             self.evaluation_params
         )
-        prompt =
+        prompt = self.evaluation_template.generate_evaluation_steps(
             criteria=self.criteria, parameters=g_eval_params_str
         )
         if self.using_native_model:
@@ -250,7 +254,7 @@ class ConversationalGEval(BaseConversationalMetric):
         )
         if not self.strict_mode:
             rubric_str = format_rubrics(self.rubric) if self.rubric else None
-            prompt =
+            prompt = self.evaluation_template.generate_evaluation_results(
                 evaluation_steps=self.number_evaluation_steps(),
                 test_case_content=test_case_content,
                 turns=[
@@ -261,7 +265,7 @@ class ConversationalGEval(BaseConversationalMetric):
                 rubric=rubric_str,
             )
         else:
-            prompt =
+            prompt = self.evaluation_template.generate_evaluation_results(
                 evaluation_steps=self.number_evaluation_steps(),
                 test_case_content=test_case_content,
                 turns=[
@@ -320,7 +324,7 @@ class ConversationalGEval(BaseConversationalMetric):
         )
         if not self.strict_mode:
             rubric_str = format_rubrics(self.rubric) if self.rubric else None
-            prompt =
+            prompt = self.evaluation_template.generate_evaluation_results(
                 evaluation_steps=self.number_evaluation_steps(),
                 test_case_content=test_case_content,
                 turns=[
@@ -331,7 +335,7 @@ class ConversationalGEval(BaseConversationalMetric):
                 rubric=rubric_str,
             )
         else:
-            prompt =
+            prompt = self.evaluation_template.generate_evaluation_results(
                 evaluation_steps=self.number_evaluation_steps(),
                 test_case_content=test_case_content,
                 turns=[
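These hunks make the ConversationalGEval prompts pluggable: the metric now accepts an `evaluation_template` class and calls its `generate_evaluation_steps` / `generate_evaluation_results` methods instead of a hard-coded default. Below is a minimal sketch of passing a custom template, assuming `ConversationalGEvalTemplate` is importable from `deepeval.metrics.conversational_g_eval.template` and exposes these generators as static methods; both details are inferred from the calls above, not confirmed by this diff.

from deepeval.metrics import ConversationalGEval
# Import path assumed; adjust to wherever ConversationalGEvalTemplate is exported.
from deepeval.metrics.conversational_g_eval.template import ConversationalGEvalTemplate


class TersePromptTemplate(ConversationalGEvalTemplate):
    # Override only the step-generation prompt; generate_evaluation_results is inherited.
    @staticmethod
    def generate_evaluation_steps(criteria: str, parameters: str) -> str:
        return (
            f"Given the criteria '{criteria}' and the turn parameters {parameters}, "
            "return a JSON object with a 'steps' key listing three short evaluation steps."
        )


metric = ConversationalGEval(
    name="Professionalism",
    criteria="Judge whether the assistant stays professional across turns.",
    evaluation_template=TersePromptTemplate,  # new keyword in 3.6.8 per the hunks above
)

Because the parameter is typed as `Type[ConversationalGEvalTemplate]`, the class itself is passed rather than an instance, so overridden generators must be callable on the class.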
deepeval/metrics/dag/schema.py CHANGED

@@ -60,10 +60,10 @@ class BinaryJudgementTemplate:
 {text}

 **
-IMPORTANT: Please make sure to only return a json with two keys: `verdict` (
+IMPORTANT: Please make sure to only return a json with two keys: `verdict` (True or False), and the 'reason' key providing the reason. The verdict must be a boolean only, either True or False.
 Example JSON:
 {{
-"verdict":
+"verdict": True,
 "reason": "..."
 }}
 **
deepeval/metrics/goal_accuracy/__init__.py ADDED

@@ -0,0 +1 @@
+from .goal_accuracy import GoalAccuracyMetric

deepeval/metrics/goal_accuracy/goal_accuracy.py ADDED

@@ -0,0 +1,349 @@
+from typing import Optional, List, Union
+import asyncio
+from deepeval.utils import get_or_create_event_loop, prettify_list
+from deepeval.metrics.utils import (
+    construct_verbose_logs,
+    trimAndLoadJson,
+    get_unit_interactions,
+    print_tools_called,
+    check_conversational_test_case_params,
+    initialize_model,
+)
+from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
+from deepeval.metrics import BaseConversationalMetric
+from deepeval.models import DeepEvalBaseLLM
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.goal_accuracy.template import (
+    GoalAccuracyTemplate,
+)
+from deepeval.metrics.goal_accuracy.schema import (
+    GoalSteps,
+    GoalScore,
+    PlanScore,
+)
+from deepeval.metrics.api import metric_data_manager
+
+
+class GoalAccuracyMetric(BaseConversationalMetric):
+
+    _required_test_case_params = [
+        TurnParams.ROLE,
+        TurnParams.CONTENT,
+    ]
+
+    def __init__(
+        self,
+        threshold: float = 0.5,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
+        include_reason: bool = True,
+        async_mode: bool = True,
+        strict_mode: bool = False,
+        verbose_mode: bool = False,
+    ):
+        self.threshold = 1 if strict_mode else threshold
+        self.model, self.using_native_model = initialize_model(model)
+        self.evaluation_model = self.model.get_model_name()
+        self.include_reason = include_reason
+        self.async_mode = async_mode
+        self.strict_mode = strict_mode
+        self.verbose_mode = verbose_mode
+
+    def measure(
+        self,
+        test_case: ConversationalTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ):
+        check_conversational_test_case_params(
+            test_case, self._required_test_case_params, self
+        )
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self, _show_indicator=_show_indicator, _in_component=_in_component
+        ):
+            if self.async_mode:
+                loop = get_or_create_event_loop()
+                loop.run_until_complete(
+                    self.a_measure(
+                        test_case,
+                        _show_indicator=False,
+                        _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
+                    )
+                )
+            else:
+                unit_interactions = get_unit_interactions(test_case.turns)
+                goal_and_steps_taken = self._goal_and_steps_taken(
+                    unit_interactions
+                )
+                goal_scores = [
+                    self._get_goal_accuracy_score(
+                        task.user_goal, task.steps_taken
+                    )
+                    for task in goal_and_steps_taken
+                ]
+                plan_scores = [
+                    self._get_plan_scores(task.user_goal, task.steps_taken)
+                    for task in goal_and_steps_taken
+                ]
+                self.score = self._calculate_score(goal_scores, plan_scores)
+                self.success = self.score >= self.threshold
+                self.reason = self._generate_reason(goal_scores, plan_scores)
+
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        f"Goals and steps taken: \n{self.print_goals_and_steps_taken(goal_and_steps_taken)} \n",
+                        f"Goal evaluations: {prettify_list(goal_scores)} \n\n"
+                        f"Plan evaluations: {prettify_list(plan_scores)} \n\n"
+                        f"Final Score: {self.score}",
+                        f"Final Reason: {self.reason}",
+                    ],
+                )
+
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )
+
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: ConversationalTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ):
+        check_conversational_test_case_params(
+            test_case, self._required_test_case_params, self
+        )
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+
+        with metric_progress_indicator(
+            self,
+            async_mode=True,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
+        ):
+            unit_interactions = get_unit_interactions(test_case.turns)
+            goal_and_steps_taken = self._goal_and_steps_taken(unit_interactions)
+            goal_scores = await asyncio.gather(
+                *[
+                    self._a_get_goal_accuracy_score(
+                        task.user_goal, task.steps_taken
+                    )
+                    for task in goal_and_steps_taken
+                ]
+            )
+            plan_scores = await asyncio.gather(
+                *[
+                    self._a_get_plan_scores(task.user_goal, task.steps_taken)
+                    for task in goal_and_steps_taken
+                ]
+            )
+            self.score = self._calculate_score(goal_scores, plan_scores)
+            self.success = self.score >= self.threshold
+            self.reason = await self._a_generate_reason(
+                goal_scores, plan_scores
+            )
+
+            self.verbose_logs = construct_verbose_logs(
+                self,
+                steps=[
+                    f"Goals and steps taken: \n{self.print_goals_and_steps_taken(goal_and_steps_taken)} \n",
+                    f"Goal evaluations: {prettify_list(goal_scores)} \n\n"
+                    f"Plan evaluations: {prettify_list(plan_scores)} \n\n"
+                    f"Final Score: {self.score}",
+                    f"Final Reason: {self.reason}",
+                ],
+            )
+
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
+            return self.score
+
+    def _goal_and_steps_taken(
+        self, unit_interactions: List[List[Turn]]
+    ) -> List[GoalSteps]:
+        goal_and_steps_taken = []
+        for unit_interaction in unit_interactions:
+            user_messages = "User messages: \n"
+            for turn in unit_interaction:
+                if turn.role == "user":
+                    user_messages += turn.content + "\n"
+                else:
+                    break
+            new_goal_steps = GoalSteps(user_goal=user_messages, steps_taken=[])
+            assistant_messages = "Assistant messages: \n"
+            for turn in unit_interaction[1:]:
+                if turn.role == "assistant":
+                    assistant_messages += f"{turn.content} \n"
+                    if turn.tools_called:
+                        assistant_messages += f"Tools called: \n{print_tools_called(turn.tools_called)} \n"
+            new_goal_steps.steps_taken.append(assistant_messages)
+            goal_and_steps_taken.append(new_goal_steps)
+        return goal_and_steps_taken
+
+    def _get_plan_scores(self, user_goal, steps_taken):
+        prompt = GoalAccuracyTemplate.get_plan_evaluation_score(
+            user_goal, "\n".join(steps_taken)
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt, schema=PlanScore)
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res: PlanScore = self.model.generate(prompt, schema=PlanScore)
+                return res
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return PlanScore(**data)
+
+    async def _a_get_plan_scores(self, user_goal, steps_taken):
+        prompt = GoalAccuracyTemplate.get_plan_evaluation_score(
+            user_goal, "\n".join(steps_taken)
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt, schema=PlanScore)
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res: PlanScore = await self.model.a_generate(
+                    prompt, schema=PlanScore
+                )
+                return res
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return PlanScore(**data)
+
+    def _calculate_score(
+        self, goal_scores: List[GoalScore], plan_scores: List[PlanScore]
+    ):
+        goal_scores = [goal_score.score for goal_score in goal_scores]
+        plan_scores = [plan_score.score for plan_score in plan_scores]
+        goal_score_divisor = len(goal_scores) if len(goal_scores) > 0 else 1
+        plan_score_divisor = len(plan_scores) if len(plan_scores) > 0 else 1
+        goal_avg = sum(goal_scores) / goal_score_divisor
+        plan_avg = sum(plan_scores) / plan_score_divisor
+        score = (goal_avg + plan_avg) / 2
+        return 0 if self.strict_mode and score < self.threshold else score
+
+    def _generate_reason(
+        self, goal_scores: List[GoalScore], plan_scores: List[PlanScore]
+    ):
+        goal_evaluations = ""
+        for goal_score in goal_scores:
+            goal_evaluations += (
+                f"Score: {goal_score.score}, Reason: {goal_score.reason}"
+            )
+        plan_evalautions = ""
+        for plan_score in plan_scores:
+            plan_evalautions += (
+                f"Score: {plan_score.score}, Reason: {plan_score.reason} \n"
+            )
+
+        prompt = GoalAccuracyTemplate.get_final_reason(
+            self.score, self.threshold, goal_evaluations, plan_evalautions
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt)
+            self.evaluation_cost += cost
+            return res
+        else:
+            res = self.model.generate(prompt)
+            return res
+
+    async def _a_generate_reason(
+        self, goal_scores: List[GoalScore], plan_scores: List[PlanScore]
+    ):
+        goal_evaluations = ""
+        for goal_score in goal_scores:
+            goal_evaluations += (
+                f"Score: {goal_score.score}, Reason: {goal_score.reason}"
+            )
+        plan_evalautions = ""
+        for plan_score in plan_scores:
+            plan_evalautions += (
+                f"Score: {plan_score.score}, Reason: {plan_score.reason} \n"
+            )
+
+        prompt = GoalAccuracyTemplate.get_final_reason(
+            self.score, self.threshold, goal_evaluations, plan_evalautions
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt)
+            self.evaluation_cost += cost
+            return res
+        else:
+            res = await self.model.a_generate(prompt)
+            return res
+
+    def _get_goal_accuracy_score(self, user_goal, steps_taken):
+        prompt = GoalAccuracyTemplate.get_accuracy_score(
+            user_goal, "\n".join(steps_taken)
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt, schema=GoalScore)
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res: GoalScore = self.model.generate(prompt, schema=GoalScore)
+                return res
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return GoalScore(**data)
+
+    async def _a_get_goal_accuracy_score(self, user_goal, steps_taken):
+        prompt = GoalAccuracyTemplate.get_accuracy_score(
+            user_goal, "\n".join(steps_taken)
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt, schema=GoalScore)
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res: GoalScore = await self.model.a_generate(
+                    prompt, schema=GoalScore
+                )
+                return res
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return GoalScore(**data)
+
+    def print_goals_and_steps_taken(self, goals_and_steps):
+        final_goals_and_steps = ""
+        for goal_step in goals_and_steps:
+            final_goals_and_steps += f"{goal_step.user_goal} \n"
+            final_goals_and_steps += (
+                f"c{prettify_list(goal_step.steps_taken)} \n\n"
+            )
+        return final_goals_and_steps
+
+    def is_successful(self) -> bool:
+        if self.error is not None:
+            self.success = False
+        else:
+            try:
+                self.success = self.score >= self.threshold
+            except:
+                self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Goal Accuracy"

deepeval/metrics/goal_accuracy/schema.py ADDED

@@ -0,0 +1,17 @@
+from pydantic import BaseModel
+from typing import List
+
+
+class GoalSteps(BaseModel):
+    user_goal: str
+    steps_taken: List[str]
+
+
+class GoalScore(BaseModel):
+    score: float
+    reason: str
+
+
+class PlanScore(BaseModel):
+    score: float
+    reason: str
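Taken together, the new goal_accuracy files add a conversational metric that splits a dialogue into user-goal / assistant-steps units, has the evaluation model score goal accuracy and plan quality for each unit, and averages the two averages into the final score. A minimal usage sketch based on the constructor and `measure()` shown above; the conversation content is illustrative, and an evaluation model (for example an OpenAI key for the default judge) still has to be configured.

from deepeval.test_case import ConversationalTestCase, Turn
from deepeval.metrics.goal_accuracy import GoalAccuracyMetric

# A small two-turn conversation; the metric reads roles and content
# (plus tools_called, when present) from each Turn.
test_case = ConversationalTestCase(
    turns=[
        Turn(role="user", content="Book a table for two tomorrow at 7pm."),
        Turn(role="assistant", content="Done, I reserved a table for two at 7pm tomorrow."),
    ]
)

metric = GoalAccuracyMetric(threshold=0.5, verbose_mode=True)
metric.measure(test_case)

# Final score is the mean of the per-goal accuracy average and the plan-score average.
print(metric.score, metric.reason)

With `strict_mode=True`, the threshold is forced to 1 and any score below it collapses to 0, per `_calculate_score` above.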