deepeval-3.6.7-py3-none-any.whl → deepeval-3.6.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +104 -36
  3. deepeval/config/utils.py +5 -0
  4. deepeval/dataset/dataset.py +162 -30
  5. deepeval/dataset/utils.py +41 -13
  6. deepeval/errors.py +20 -2
  7. deepeval/evaluate/execute.py +1662 -688
  8. deepeval/evaluate/types.py +1 -0
  9. deepeval/evaluate/utils.py +13 -3
  10. deepeval/integrations/crewai/__init__.py +2 -1
  11. deepeval/integrations/crewai/tool.py +71 -0
  12. deepeval/integrations/llama_index/__init__.py +0 -4
  13. deepeval/integrations/llama_index/handler.py +20 -21
  14. deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
  15. deepeval/metrics/__init__.py +13 -0
  16. deepeval/metrics/base_metric.py +1 -0
  17. deepeval/metrics/contextual_precision/contextual_precision.py +27 -21
  18. deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  19. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +11 -7
  20. deepeval/metrics/dag/schema.py +1 -1
  21. deepeval/metrics/dag/templates.py +2 -2
  22. deepeval/metrics/goal_accuracy/__init__.py +1 -0
  23. deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
  24. deepeval/metrics/goal_accuracy/schema.py +17 -0
  25. deepeval/metrics/goal_accuracy/template.py +235 -0
  26. deepeval/metrics/hallucination/hallucination.py +8 -8
  27. deepeval/metrics/indicator.py +21 -1
  28. deepeval/metrics/mcp/mcp_task_completion.py +7 -2
  29. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +16 -6
  30. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +2 -1
  31. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +32 -24
  32. deepeval/metrics/plan_adherence/__init__.py +1 -0
  33. deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
  34. deepeval/metrics/plan_adherence/schema.py +11 -0
  35. deepeval/metrics/plan_adherence/template.py +170 -0
  36. deepeval/metrics/plan_quality/__init__.py +1 -0
  37. deepeval/metrics/plan_quality/plan_quality.py +292 -0
  38. deepeval/metrics/plan_quality/schema.py +11 -0
  39. deepeval/metrics/plan_quality/template.py +101 -0
  40. deepeval/metrics/step_efficiency/__init__.py +1 -0
  41. deepeval/metrics/step_efficiency/schema.py +11 -0
  42. deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
  43. deepeval/metrics/step_efficiency/template.py +256 -0
  44. deepeval/metrics/task_completion/task_completion.py +1 -0
  45. deepeval/metrics/tool_correctness/schema.py +6 -0
  46. deepeval/metrics/tool_correctness/template.py +88 -0
  47. deepeval/metrics/tool_correctness/tool_correctness.py +226 -22
  48. deepeval/metrics/tool_use/__init__.py +1 -0
  49. deepeval/metrics/tool_use/schema.py +19 -0
  50. deepeval/metrics/tool_use/template.py +220 -0
  51. deepeval/metrics/tool_use/tool_use.py +458 -0
  52. deepeval/metrics/topic_adherence/__init__.py +1 -0
  53. deepeval/metrics/topic_adherence/schema.py +16 -0
  54. deepeval/metrics/topic_adherence/template.py +162 -0
  55. deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
  56. deepeval/models/embedding_models/azure_embedding_model.py +37 -36
  57. deepeval/models/embedding_models/local_embedding_model.py +30 -32
  58. deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
  59. deepeval/models/embedding_models/openai_embedding_model.py +22 -31
  60. deepeval/models/llms/amazon_bedrock_model.py +20 -17
  61. deepeval/models/llms/openai_model.py +10 -1
  62. deepeval/models/retry_policy.py +103 -20
  63. deepeval/openai/extractors.py +61 -16
  64. deepeval/openai/patch.py +8 -12
  65. deepeval/openai/types.py +1 -1
  66. deepeval/openai/utils.py +108 -1
  67. deepeval/prompt/prompt.py +1 -0
  68. deepeval/prompt/utils.py +43 -14
  69. deepeval/simulator/conversation_simulator.py +25 -18
  70. deepeval/synthesizer/chunking/context_generator.py +9 -1
  71. deepeval/synthesizer/synthesizer.py +11 -10
  72. deepeval/test_case/llm_test_case.py +6 -2
  73. deepeval/test_run/test_run.py +190 -207
  74. deepeval/tracing/__init__.py +2 -1
  75. deepeval/tracing/otel/exporter.py +3 -4
  76. deepeval/tracing/otel/utils.py +23 -4
  77. deepeval/tracing/trace_context.py +53 -38
  78. deepeval/tracing/tracing.py +23 -0
  79. deepeval/tracing/types.py +16 -14
  80. deepeval/utils.py +21 -0
  81. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/METADATA +1 -1
  82. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/RECORD +85 -63
  83. deepeval/integrations/llama_index/agent/patched.py +0 -68
  84. deepeval/tracing/message_types/__init__.py +0 -10
  85. deepeval/tracing/message_types/base.py +0 -6
  86. deepeval/tracing/message_types/messages.py +0 -14
  87. deepeval/tracing/message_types/tools.py +0 -18
  88. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/LICENSE.md +0 -0
  89. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/WHEEL +0 -0
  90. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/entry_points.txt +0 -0
deepeval/metrics/conversational_g_eval/__init__.py
@@ -0,0 +1,3 @@
+ from .template import ConversationalGEvalTemplate
+
+ __all__ = ["ConversationalGEvalTemplate"]
deepeval/metrics/conversational_g_eval/conversational_g_eval.py
@@ -1,7 +1,7 @@
  """A slightly modified tailored version of the LLM evaluated metric based on the GEval framework: https://arxiv.org/pdf/2303.16634.pdf"""

  from openai.types.chat.chat_completion import ChatCompletion
- from typing import Optional, List, Tuple, Union, Dict
+ from typing import Optional, List, Tuple, Union, Dict, Type
  import math
  from deepeval.metrics import BaseConversationalMetric
  from deepeval.metrics.g_eval.utils import (
@@ -44,6 +44,9 @@ class ConversationalGEval(BaseConversationalMetric):
  async_mode: bool = True,
  strict_mode: bool = False,
  verbose_mode: bool = False,
+ evaluation_template: Type[
+ ConversationalGEvalTemplate
+ ] = ConversationalGEvalTemplate,
  _include_g_eval_suffix: bool = True,
  ):
  if evaluation_params is not None and len(evaluation_params) == 0:
@@ -85,6 +88,7 @@ class ConversationalGEval(BaseConversationalMetric):
  self.strict_mode = strict_mode
  self.async_mode = async_mode
  self.verbose_mode = verbose_mode
+ self.evaluation_template = evaluation_template
  self._include_g_eval_suffix = _include_g_eval_suffix

  def measure(
@@ -194,7 +198,7 @@ class ConversationalGEval(BaseConversationalMetric):
  g_eval_params_str = construct_conversational_g_eval_turn_params_string(
  self.evaluation_params
  )
- prompt = ConversationalGEvalTemplate.generate_evaluation_steps(
+ prompt = self.evaluation_template.generate_evaluation_steps(
  criteria=self.criteria, parameters=g_eval_params_str
  )
  if self.using_native_model:
@@ -221,7 +225,7 @@ class ConversationalGEval(BaseConversationalMetric):
  g_eval_params_str = construct_conversational_g_eval_turn_params_string(
  self.evaluation_params
  )
- prompt = ConversationalGEvalTemplate.generate_evaluation_steps(
+ prompt = self.evaluation_template.generate_evaluation_steps(
  criteria=self.criteria, parameters=g_eval_params_str
  )
  if self.using_native_model:
@@ -250,7 +254,7 @@ class ConversationalGEval(BaseConversationalMetric):
  )
  if not self.strict_mode:
  rubric_str = format_rubrics(self.rubric) if self.rubric else None
- prompt = ConversationalGEvalTemplate.generate_evaluation_results(
+ prompt = self.evaluation_template.generate_evaluation_results(
  evaluation_steps=self.number_evaluation_steps(),
  test_case_content=test_case_content,
  turns=[
@@ -261,7 +265,7 @@ class ConversationalGEval(BaseConversationalMetric):
  rubric=rubric_str,
  )
  else:
- prompt = ConversationalGEvalTemplate.generate_evaluation_results(
+ prompt = self.evaluation_template.generate_evaluation_results(
  evaluation_steps=self.number_evaluation_steps(),
  test_case_content=test_case_content,
  turns=[
@@ -320,7 +324,7 @@ class ConversationalGEval(BaseConversationalMetric):
  )
  if not self.strict_mode:
  rubric_str = format_rubrics(self.rubric) if self.rubric else None
- prompt = ConversationalGEvalTemplate.generate_evaluation_results(
+ prompt = self.evaluation_template.generate_evaluation_results(
  evaluation_steps=self.number_evaluation_steps(),
  test_case_content=test_case_content,
  turns=[
@@ -331,7 +335,7 @@ class ConversationalGEval(BaseConversationalMetric):
  rubric=rubric_str,
  )
  else:
- prompt = ConversationalGEvalTemplate.generate_evaluation_results(
+ prompt = self.evaluation_template.generate_evaluation_results(
  evaluation_steps=self.number_evaluation_steps(),
  test_case_content=test_case_content,
  turns=[
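
Since ConversationalGEval now accepts an `evaluation_template` class (exported by the new conversational_g_eval/__init__.py above), here is a minimal usage sketch; the subclass, metric name, and criteria below are illustrative and not taken from the package.

    # Sketch of the new `evaluation_template` hook; names here are illustrative only.
    from deepeval.metrics import ConversationalGEval
    from deepeval.metrics.conversational_g_eval import ConversationalGEvalTemplate
    from deepeval.test_case import TurnParams


    class TersePromptTemplate(ConversationalGEvalTemplate):
        # Overrides the prompt builder invoked via self.evaluation_template in the
        # diff above; keyword signature mirrors the call sites shown there.
        @staticmethod
        def generate_evaluation_steps(criteria: str, parameters: str) -> str:
            return (
                f"Parameters: {parameters}\n"
                f"List 3 short evaluation steps for this criteria: {criteria}"
            )


    metric = ConversationalGEval(
        name="Helpfulness",
        criteria="Assess whether the assistant turns actually help the user.",
        evaluation_params=[TurnParams.CONTENT],
        evaluation_template=TersePromptTemplate,  # new keyword in 3.6.9
    )
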
deepeval/metrics/dag/schema.py
@@ -11,7 +11,7 @@ class TaskNodeOutput(BaseModel):


  class BinaryJudgementVerdict(BaseModel):
- verdict: Literal[True, False]
+ verdict: bool
  reason: str


deepeval/metrics/dag/templates.py
@@ -60,10 +60,10 @@ class BinaryJudgementTemplate:
  {text}

  **
- IMPORTANT: Please make sure to only return a json with two keys: `verdict` (true or false), and the 'reason' key providing the reason.
+ IMPORTANT: Please make sure to only return a json with two keys: `verdict` (True or False), and the 'reason' key providing the reason. The verdict must be a boolean only, either True or False.
  Example JSON:
  {{
- "verdict": true,
+ "verdict": True,
  "reason": "..."
  }}
  **
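
A minimal sketch (assuming Pydantic v2, which deepeval depends on) of how the relaxed `verdict: bool` field parses the judge model's JSON output:

    # Sketch only: mirrors the updated BinaryJudgementVerdict from dag/schema.py.
    from pydantic import BaseModel

    class BinaryJudgementVerdict(BaseModel):
        verdict: bool   # was Literal[True, False] before 3.6.9
        reason: str

    # A standard JSON boolean now validates directly into the bool field.
    v = BinaryJudgementVerdict.model_validate_json(
        '{"verdict": true, "reason": "matches criteria"}'
    )
    print(v.verdict)  # True
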
deepeval/metrics/goal_accuracy/__init__.py
@@ -0,0 +1 @@
+ from .goal_accuracy import GoalAccuracyMetric
deepeval/metrics/goal_accuracy/goal_accuracy.py
@@ -0,0 +1,349 @@
+ from typing import Optional, List, Union
+ import asyncio
+ from deepeval.utils import get_or_create_event_loop, prettify_list
+ from deepeval.metrics.utils import (
+ construct_verbose_logs,
+ trimAndLoadJson,
+ get_unit_interactions,
+ print_tools_called,
+ check_conversational_test_case_params,
+ initialize_model,
+ )
+ from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
+ from deepeval.metrics import BaseConversationalMetric
+ from deepeval.models import DeepEvalBaseLLM
+ from deepeval.metrics.indicator import metric_progress_indicator
+ from deepeval.metrics.goal_accuracy.template import (
+ GoalAccuracyTemplate,
+ )
+ from deepeval.metrics.goal_accuracy.schema import (
+ GoalSteps,
+ GoalScore,
+ PlanScore,
+ )
+ from deepeval.metrics.api import metric_data_manager
+
+
+ class GoalAccuracyMetric(BaseConversationalMetric):
+
+ _required_test_case_params = [
+ TurnParams.ROLE,
+ TurnParams.CONTENT,
+ ]
+
+ def __init__(
+ self,
+ threshold: float = 0.5,
+ model: Optional[Union[str, DeepEvalBaseLLM]] = None,
+ include_reason: bool = True,
+ async_mode: bool = True,
+ strict_mode: bool = False,
+ verbose_mode: bool = False,
+ ):
+ self.threshold = 1 if strict_mode else threshold
+ self.model, self.using_native_model = initialize_model(model)
+ self.evaluation_model = self.model.get_model_name()
+ self.include_reason = include_reason
+ self.async_mode = async_mode
+ self.strict_mode = strict_mode
+ self.verbose_mode = verbose_mode
+
+ def measure(
+ self,
+ test_case: ConversationalTestCase,
+ _show_indicator: bool = True,
+ _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
+ ):
+ check_conversational_test_case_params(
+ test_case, self._required_test_case_params, self
+ )
+
+ self.evaluation_cost = 0 if self.using_native_model else None
+ with metric_progress_indicator(
+ self, _show_indicator=_show_indicator, _in_component=_in_component
+ ):
+ if self.async_mode:
+ loop = get_or_create_event_loop()
+ loop.run_until_complete(
+ self.a_measure(
+ test_case,
+ _show_indicator=False,
+ _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
+ )
+ )
+ else:
+ unit_interactions = get_unit_interactions(test_case.turns)
+ goal_and_steps_taken = self._goal_and_steps_taken(
+ unit_interactions
+ )
+ goal_scores = [
+ self._get_goal_accuracy_score(
+ task.user_goal, task.steps_taken
+ )
+ for task in goal_and_steps_taken
+ ]
+ plan_scores = [
+ self._get_plan_scores(task.user_goal, task.steps_taken)
+ for task in goal_and_steps_taken
+ ]
+ self.score = self._calculate_score(goal_scores, plan_scores)
+ self.success = self.score >= self.threshold
+ self.reason = self._generate_reason(goal_scores, plan_scores)
+
+ self.verbose_logs = construct_verbose_logs(
+ self,
+ steps=[
+ f"Goals and steps taken: \n{self.print_goals_and_steps_taken(goal_and_steps_taken)} \n",
+ f"Goal evaluations: {prettify_list(goal_scores)} \n\n"
+ f"Plan evaluations: {prettify_list(plan_scores)} \n\n"
+ f"Final Score: {self.score}",
+ f"Final Reason: {self.reason}",
+ ],
+ )
+
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )
+
+ return self.score
+
+ async def a_measure(
+ self,
+ test_case: ConversationalTestCase,
+ _show_indicator: bool = True,
+ _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
+ ):
+ check_conversational_test_case_params(
+ test_case, self._required_test_case_params, self
+ )
+
+ self.evaluation_cost = 0 if self.using_native_model else None
+
+ with metric_progress_indicator(
+ self,
+ async_mode=True,
+ _show_indicator=_show_indicator,
+ _in_component=_in_component,
+ ):
+ unit_interactions = get_unit_interactions(test_case.turns)
+ goal_and_steps_taken = self._goal_and_steps_taken(unit_interactions)
+ goal_scores = await asyncio.gather(
+ *[
+ self._a_get_goal_accuracy_score(
+ task.user_goal, task.steps_taken
+ )
+ for task in goal_and_steps_taken
+ ]
+ )
+ plan_scores = await asyncio.gather(
+ *[
+ self._a_get_plan_scores(task.user_goal, task.steps_taken)
+ for task in goal_and_steps_taken
+ ]
+ )
+ self.score = self._calculate_score(goal_scores, plan_scores)
+ self.success = self.score >= self.threshold
+ self.reason = await self._a_generate_reason(
+ goal_scores, plan_scores
+ )
+
+ self.verbose_logs = construct_verbose_logs(
+ self,
+ steps=[
+ f"Goals and steps taken: \n{self.print_goals_and_steps_taken(goal_and_steps_taken)} \n",
+ f"Goal evaluations: {prettify_list(goal_scores)} \n\n"
+ f"Plan evaluations: {prettify_list(plan_scores)} \n\n"
+ f"Final Score: {self.score}",
+ f"Final Reason: {self.reason}",
+ ],
+ )
+
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )
+
+ return self.score
+
+ def _goal_and_steps_taken(
+ self, unit_interactions: List[List[Turn]]
+ ) -> List[GoalSteps]:
+ goal_and_steps_taken = []
+ for unit_interaction in unit_interactions:
+ user_messages = "User messages: \n"
+ for turn in unit_interaction:
+ if turn.role == "user":
+ user_messages += turn.content + "\n"
+ else:
+ break
+ new_goal_steps = GoalSteps(user_goal=user_messages, steps_taken=[])
+ assistant_messages = "Assistant messages: \n"
+ for turn in unit_interaction[1:]:
+ if turn.role == "assistant":
+ assistant_messages += f"{turn.content} \n"
+ if turn.tools_called:
+ assistant_messages += f"Tools called: \n{print_tools_called(turn.tools_called)} \n"
+ new_goal_steps.steps_taken.append(assistant_messages)
+ goal_and_steps_taken.append(new_goal_steps)
+ return goal_and_steps_taken
+
+ def _get_plan_scores(self, user_goal, steps_taken):
+ prompt = GoalAccuracyTemplate.get_plan_evaluation_score(
+ user_goal, "\n".join(steps_taken)
+ )
+ if self.using_native_model:
+ res, cost = self.model.generate(prompt, schema=PlanScore)
+ self.evaluation_cost += cost
+ return res
+ else:
+ try:
+ res: PlanScore = self.model.generate(prompt, schema=PlanScore)
+ return res
+ except TypeError:
+ res = self.model.generate(prompt)
+ data = trimAndLoadJson(res, self)
+ return PlanScore(**data)
+
+ async def _a_get_plan_scores(self, user_goal, steps_taken):
+ prompt = GoalAccuracyTemplate.get_plan_evaluation_score(
+ user_goal, "\n".join(steps_taken)
+ )
+ if self.using_native_model:
+ res, cost = await self.model.a_generate(prompt, schema=PlanScore)
+ self.evaluation_cost += cost
+ return res
+ else:
+ try:
+ res: PlanScore = await self.model.a_generate(
+ prompt, schema=PlanScore
+ )
+ return res
+ except TypeError:
+ res = await self.model.a_generate(prompt)
+ data = trimAndLoadJson(res, self)
+ return PlanScore(**data)
+
+ def _calculate_score(
+ self, goal_scores: List[GoalScore], plan_scores: List[PlanScore]
+ ):
+ goal_scores = [goal_score.score for goal_score in goal_scores]
+ plan_scores = [plan_score.score for plan_score in plan_scores]
+ goal_score_divisor = len(goal_scores) if len(goal_scores) > 0 else 1
+ plan_score_divisor = len(plan_scores) if len(plan_scores) > 0 else 1
+ goal_avg = sum(goal_scores) / goal_score_divisor
+ plan_avg = sum(plan_scores) / plan_score_divisor
+ score = (goal_avg + plan_avg) / 2
+ return 0 if self.strict_mode and score < self.threshold else score
+
+ def _generate_reason(
+ self, goal_scores: List[GoalScore], plan_scores: List[PlanScore]
+ ):
+ goal_evaluations = ""
+ for goal_score in goal_scores:
+ goal_evaluations += (
+ f"Score: {goal_score.score}, Reason: {goal_score.reason}"
+ )
+ plan_evalautions = ""
+ for plan_score in plan_scores:
+ plan_evalautions += (
+ f"Score: {plan_score.score}, Reason: {plan_score.reason} \n"
+ )
+
+ prompt = GoalAccuracyTemplate.get_final_reason(
+ self.score, self.threshold, goal_evaluations, plan_evalautions
+ )
+ if self.using_native_model:
+ res, cost = self.model.generate(prompt)
+ self.evaluation_cost += cost
+ return res
+ else:
+ res = self.model.generate(prompt)
+ return res
+
+ async def _a_generate_reason(
+ self, goal_scores: List[GoalScore], plan_scores: List[PlanScore]
+ ):
+ goal_evaluations = ""
+ for goal_score in goal_scores:
+ goal_evaluations += (
+ f"Score: {goal_score.score}, Reason: {goal_score.reason}"
+ )
+ plan_evalautions = ""
+ for plan_score in plan_scores:
+ plan_evalautions += (
+ f"Score: {plan_score.score}, Reason: {plan_score.reason} \n"
+ )
+
+ prompt = GoalAccuracyTemplate.get_final_reason(
+ self.score, self.threshold, goal_evaluations, plan_evalautions
+ )
+ if self.using_native_model:
+ res, cost = await self.model.a_generate(prompt)
+ self.evaluation_cost += cost
+ return res
+ else:
+ res = await self.model.a_generate(prompt)
+ return res
+
+ def _get_goal_accuracy_score(self, user_goal, steps_taken):
+ prompt = GoalAccuracyTemplate.get_accuracy_score(
+ user_goal, "\n".join(steps_taken)
+ )
+ if self.using_native_model:
+ res, cost = self.model.generate(prompt, schema=GoalScore)
+ self.evaluation_cost += cost
+ return res
+ else:
+ try:
+ res: GoalScore = self.model.generate(prompt, schema=GoalScore)
+ return res
+ except TypeError:
+ res = self.model.generate(prompt)
+ data = trimAndLoadJson(res, self)
+ return GoalScore(**data)
+
+ async def _a_get_goal_accuracy_score(self, user_goal, steps_taken):
+ prompt = GoalAccuracyTemplate.get_accuracy_score(
+ user_goal, "\n".join(steps_taken)
+ )
+ if self.using_native_model:
+ res, cost = await self.model.a_generate(prompt, schema=GoalScore)
+ self.evaluation_cost += cost
+ return res
+ else:
+ try:
+ res: GoalScore = await self.model.a_generate(
+ prompt, schema=GoalScore
+ )
+ return res
+ except TypeError:
+ res = await self.model.a_generate(prompt)
+ data = trimAndLoadJson(res, self)
+ return GoalScore(**data)
+
+ def print_goals_and_steps_taken(self, goals_and_steps):
+ final_goals_and_steps = ""
+ for goal_step in goals_and_steps:
+ final_goals_and_steps += f"{goal_step.user_goal} \n"
+ final_goals_and_steps += (
+ f"c{prettify_list(goal_step.steps_taken)} \n\n"
+ )
+ return final_goals_and_steps
+
+ def is_successful(self) -> bool:
+ if self.error is not None:
+ self.success = False
+ else:
+ try:
+ self.success = self.score >= self.threshold
+ except:
+ self.success = False
+ return self.success
+
+ @property
+ def __name__(self):
+ return "Goal Accuracy"
deepeval/metrics/goal_accuracy/schema.py
@@ -0,0 +1,17 @@
+ from pydantic import BaseModel
+ from typing import List
+
+
+ class GoalSteps(BaseModel):
+ user_goal: str
+ steps_taken: List[str]
+
+
+ class GoalScore(BaseModel):
+ score: float
+ reason: str
+
+
+ class PlanScore(BaseModel):
+ score: float
+ reason: str
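
To close, a minimal usage sketch of the new GoalAccuracyMetric shown above; it assumes a default judge model is configured (for example via an OpenAI key), and the conversation turns are invented for illustration.

    # Illustrative only: exercises the GoalAccuracyMetric added in 3.6.9.
    from deepeval.metrics.goal_accuracy import GoalAccuracyMetric
    from deepeval.test_case import ConversationalTestCase, Turn

    test_case = ConversationalTestCase(
        turns=[
            Turn(role="user", content="Book a table for two at an Italian place tonight."),
            Turn(role="assistant", content="Booked: Trattoria Roma, 7:30pm, table for two."),
        ]
    )

    metric = GoalAccuracyMetric(threshold=0.5, verbose_mode=True)
    metric.measure(test_case)          # averages per-goal accuracy and plan scores
    print(metric.score, metric.reason)
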