deepeval 3.4.7__py3-none-any.whl → 3.4.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/__init__.py +8 -7
- deepeval/_version.py +1 -1
- deepeval/cli/dotenv_handler.py +71 -0
- deepeval/cli/main.py +1021 -280
- deepeval/cli/utils.py +116 -2
- deepeval/confident/api.py +29 -14
- deepeval/config/__init__.py +0 -0
- deepeval/config/settings.py +565 -0
- deepeval/config/settings_manager.py +133 -0
- deepeval/config/utils.py +86 -0
- deepeval/dataset/__init__.py +1 -0
- deepeval/dataset/dataset.py +70 -10
- deepeval/dataset/test_run_tracer.py +82 -0
- deepeval/dataset/utils.py +23 -0
- deepeval/key_handler.py +64 -2
- deepeval/metrics/__init__.py +4 -1
- deepeval/metrics/answer_relevancy/template.py +7 -2
- deepeval/metrics/conversational_dag/__init__.py +7 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +139 -0
- deepeval/metrics/conversational_dag/nodes.py +931 -0
- deepeval/metrics/conversational_dag/templates.py +117 -0
- deepeval/metrics/dag/dag.py +13 -4
- deepeval/metrics/dag/graph.py +47 -15
- deepeval/metrics/dag/utils.py +103 -38
- deepeval/metrics/faithfulness/template.py +11 -8
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +6 -4
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +6 -4
- deepeval/metrics/tool_correctness/tool_correctness.py +7 -3
- deepeval/models/llms/amazon_bedrock_model.py +24 -3
- deepeval/models/llms/openai_model.py +37 -41
- deepeval/models/retry_policy.py +280 -0
- deepeval/openai_agents/agent.py +4 -2
- deepeval/synthesizer/chunking/doc_chunker.py +87 -51
- deepeval/test_run/api.py +1 -0
- deepeval/tracing/otel/exporter.py +20 -8
- deepeval/tracing/otel/utils.py +57 -0
- deepeval/tracing/tracing.py +37 -16
- deepeval/tracing/utils.py +98 -1
- deepeval/utils.py +111 -70
- {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/METADATA +3 -1
- {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/RECORD +44 -34
- deepeval/env.py +0 -35
- {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/LICENSE.md +0 -0
- {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/WHEEL +0 -0
- {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,931 @@
|
|
|
1
|
+
from typing import Optional, List, Union, Literal, Tuple
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from pydantic import create_model
|
|
4
|
+
import asyncio
|
|
5
|
+
|
|
6
|
+
from deepeval.metrics.base_metric import BaseConversationalMetric
|
|
7
|
+
from deepeval.metrics.conversational_g_eval.conversational_g_eval import (
|
|
8
|
+
ConversationalGEval,
|
|
9
|
+
)
|
|
10
|
+
from deepeval.metrics.g_eval.utils import CONVERSATIONAL_G_EVAL_PARAMS
|
|
11
|
+
from deepeval.metrics.utils import copy_metrics, trimAndLoadJson
|
|
12
|
+
from deepeval.test_case import (
|
|
13
|
+
ConversationalTestCase,
|
|
14
|
+
TurnParams,
|
|
15
|
+
ToolCall,
|
|
16
|
+
Turn,
|
|
17
|
+
)
|
|
18
|
+
from deepeval.utils import prettify_list
|
|
19
|
+
|
|
20
|
+
from .templates import (
|
|
21
|
+
ConversationalBinaryJudgementTemplate,
|
|
22
|
+
ConversationalNonBinaryJudgementTemplate,
|
|
23
|
+
ConversationalTaskNodeTemplate,
|
|
24
|
+
ConversationalVerdictNodeTemplate,
|
|
25
|
+
)
|
|
26
|
+
from deepeval.metrics.dag.schema import (
|
|
27
|
+
BinaryJudgementVerdict,
|
|
28
|
+
MetricScoreReason,
|
|
29
|
+
NonBinaryJudgementVerdict,
|
|
30
|
+
TaskNodeOutput,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class ConversationalBaseNode:
    """Shared base for every node in a conversational evaluation DAG.

    Tracks two pieces of traversal bookkeeping: ``_indegree`` (how many
    parent nodes still have to finish before this node may run) and
    ``_depth`` (how deep the node sits in the graph, used for verbose
    logging).
    """

    _indegree: int = 0
    _depth: int = 0

    def set_parent(self, parent: "ConversationalBaseNode"):
        """Register *parent* on this node.

        Single-parent node types expose a ``_parent`` slot; multi-parent
        node types expose ``_parents`` (which may start out as ``None``).
        Nodes with neither slot are left untouched.
        """
        if hasattr(self, "_parent"):
            self._parent = parent
            return
        if not hasattr(self, "_parents"):
            return
        if self._parents is None:
            self._parents = []
        self._parents.append(parent)

    def _execute(
        self,
        metric: BaseConversationalMetric,
        test_case: ConversationalTestCase,
        depth: int,
    ):
        """Synchronous traversal hook; concrete node types must override."""
        raise NotImplementedError(
            "This node type must implement the _execute method."
        )

    async def _a_execute(
        self,
        metric: BaseConversationalMetric,
        test_case: ConversationalTestCase,
        depth: int,
    ):
        """Asynchronous traversal hook; concrete node types must override."""
        raise NotImplementedError(
            "This node type must implement the _a_execute method."
        )
+
def increment_indegree(node: ConversationalBaseNode):
    """Record one more unfinished parent on *node*."""
    node._indegree = node._indegree + 1
70
|
+
|
|
71
|
+
def decrement_indegree(node: ConversationalBaseNode):
    """Record that one of *node*'s parents has finished."""
    node._indegree = node._indegree - 1
|
74
|
+
|
|
75
|
+
@dataclass
class ConversationalVerdictNode(ConversationalBaseNode):
    """Leaf node of a conversational DAG.

    Fires when its parent judgement node produced a matching ``verdict``,
    and then either assigns a hard-coded ``score`` (0-10 scale, normalized
    to 0-1 on the metric) or delegates scoring to a nested ``child``
    evaluator. Exactly one of ``score`` / ``child`` must be provided.
    """

    # Verdict this leaf fires on: a bool under a binary judgement parent,
    # a str under a non-binary one.
    verdict: Union[str, bool]
    # Hard-coded score on a 0-10 scale; mutually exclusive with `child`.
    score: Optional[int] = None
    # Nested evaluator run when this verdict is selected; mutually
    # exclusive with `score`.
    child: Optional[
        Union[
            ConversationalBaseNode,
            ConversationalGEval,
            BaseConversationalMetric,
        ]
    ] = None
    # Populated via ConversationalBaseNode.set_parent() when the parent
    # judgement node wires up its children.
    _parent: Optional[ConversationalBaseNode] = None

    def __hash__(self):
        # Identity hash so mutable node instances can live in sets/dicts.
        return id(self)

    def __post_init__(self):
        # Ensure either `score` or `child` is set, but not both
        if self.score is not None and self.child is not None:
            raise ValueError(
                "A ConversationalVerdictNode can have either a 'score' or a 'child', but not both."
            )
        if self.score is None and self.child is None:
            raise ValueError(
                "A ConversationalVerdictNode must have either a 'score' or a 'child'."
            )

        # Hard-coded scores are validated to the documented 0-10 range.
        if self.score is not None:
            if not (0 <= self.score <= 10):
                raise ValueError(
                    "The score must be between 0 and 10, inclusive."
                )

    def _execute(
        self,
        metric: BaseConversationalMetric,
        test_case: ConversationalTestCase,
        depth: int,
    ):
        """Synchronously resolve this leaf and write `score`/`reason` onto *metric*."""
        # Only the last-arriving parent proceeds past this gate.
        decrement_indegree(self)
        if self._indegree > 0:
            return

        # Only fire if the parent judgement actually selected this
        # node's verdict.
        if isinstance(
            self._parent, ConversationalNonBinaryJudgementNode
        ) or isinstance(self._parent, ConversationalBinaryJudgementNode):
            if self._parent._verdict.verdict != self.verdict:
                return

        if self.child is not None:
            if isinstance(self.child, ConversationalGEval):
                # Rebuild a private copy of the child G-Eval so the run
                # uses this metric's model and stays quiet.
                convo_g_eval_args = {
                    "name": self.child.name,
                    "model": metric.model,
                    "verbose_mode": False,
                }
                if self.child.criteria:
                    convo_g_eval_args["criteria"] = self.child.criteria
                else:
                    convo_g_eval_args["evaluation_steps"] = (
                        self.child.evaluation_steps
                    )
                if self.child.evaluation_params:
                    convo_g_eval_args["evaluation_params"] = (
                        self.child.evaluation_params
                    )
                copied_convo_g_eval = ConversationalGEval(**convo_g_eval_args)

                copied_convo_g_eval.measure(
                    test_case=test_case, _show_indicator=False
                )
                # construct_node_verbose_log is defined elsewhere in this
                # module (not shown here).
                metric._verbose_steps.append(
                    construct_node_verbose_log(self, depth, copied_convo_g_eval)
                )
                metric.score = copied_convo_g_eval.score
                if metric.include_reason:
                    metric.reason = copied_convo_g_eval.reason

            elif isinstance(self.child, BaseConversationalMetric):
                # Copy the child metric so repeated DAG runs don't share
                # mutable metric state.
                copied_metric: BaseConversationalMetric = copy_metrics(
                    [self.child]
                )[0]
                copied_metric.verbose_mode = False

                copied_metric.measure(
                    test_case=test_case, _show_indicator=False
                )
                metric._verbose_steps.append(
                    construct_node_verbose_log(self, depth, copied_metric)
                )
                metric.score = copied_metric.score
                if metric.include_reason:
                    metric.reason = copied_metric.reason
            else:
                # Child is another DAG node: continue the traversal.
                self.child._execute(
                    metric=metric, test_case=test_case, depth=depth
                )
        else:
            metric._verbose_steps.append(
                construct_node_verbose_log(self, depth)
            )
            # Normalize the 0-10 leaf score to the metric's 0-1 scale.
            metric.score = self.score / 10
            if metric.include_reason:
                metric.reason = self._generate_reason(metric=metric)

    async def _a_execute(
        self,
        metric: BaseConversationalMetric,
        test_case: ConversationalTestCase,
        depth: int,
    ):
        """Async mirror of `_execute`; awaits child measures and reason generation."""
        # Only the last-arriving parent proceeds past this gate.
        decrement_indegree(self)
        if self._indegree > 0:
            return

        # Only fire if the parent judgement actually selected this
        # node's verdict.
        if isinstance(
            self._parent, ConversationalNonBinaryJudgementNode
        ) or isinstance(self._parent, ConversationalBinaryJudgementNode):
            if self._parent._verdict.verdict != self.verdict:
                return

        if self.child is not None:
            if isinstance(self.child, ConversationalGEval):
                # Rebuild a private copy of the child G-Eval so the run
                # uses this metric's model and stays quiet.
                convo_g_eval_args = {
                    "name": self.child.name,
                    "model": metric.model,
                    "verbose_mode": False,
                }
                if self.child.criteria:
                    convo_g_eval_args["criteria"] = self.child.criteria
                else:
                    convo_g_eval_args["evaluation_steps"] = (
                        self.child.evaluation_steps
                    )
                if self.child.evaluation_params:
                    convo_g_eval_args["evaluation_params"] = (
                        self.child.evaluation_params
                    )
                copied_convo_g_eval = ConversationalGEval(**convo_g_eval_args)

                await copied_convo_g_eval.a_measure(
                    test_case=test_case, _show_indicator=False
                )
                metric._verbose_steps.append(
                    construct_node_verbose_log(self, depth, copied_convo_g_eval)
                )
                metric.score = copied_convo_g_eval.score
                if metric.include_reason:
                    metric.reason = copied_convo_g_eval.reason

            elif isinstance(self.child, BaseConversationalMetric):
                # Copy the child metric so repeated DAG runs don't share
                # mutable metric state.
                copied_metric: BaseConversationalMetric = copy_metrics(
                    [self.child]
                )[0]
                copied_metric.verbose_mode = False

                await copied_metric.a_measure(
                    test_case=test_case, _show_indicator=False
                )
                metric._verbose_steps.append(
                    construct_node_verbose_log(self, depth, copied_metric)
                )
                metric.score = copied_metric.score
                if metric.include_reason:
                    metric.reason = copied_metric.reason
            else:
                # Child is another DAG node: continue the traversal.
                await self.child._a_execute(
                    metric=metric, test_case=test_case, depth=depth
                )
        else:
            metric._verbose_steps.append(
                construct_node_verbose_log(self, depth)
            )
            # Normalize the 0-10 leaf score to the metric's 0-1 scale.
            metric.score = self.score / 10
            if metric.include_reason:
                metric.reason = await self._a_generate_reason(metric=metric)

    def _generate_reason(self, metric: BaseConversationalMetric):
        """Ask the metric's model for a final reason summarizing the DAG run."""
        prompt = ConversationalVerdictNodeTemplate.generate_reason(
            verbose_steps=metric._verbose_steps,
            score=metric.score,
            name=metric.__name__,
        )
        if metric.using_native_model:
            # Native models return (result, cost).
            res, cost = metric.model.generate(prompt, schema=MetricScoreReason)
            metric.evaluation_cost += cost
        else:
            try:
                res: MetricScoreReason = metric.model.generate(
                    prompt, schema=MetricScoreReason
                )
            except TypeError:
                # Custom models may not accept a `schema` kwarg; fall back
                # to raw generation and parse the JSON ourselves.
                res = metric.model.generate(prompt)
                data = trimAndLoadJson(res, self)
                res = MetricScoreReason(**data)

        return res.reason

    async def _a_generate_reason(self, metric: BaseConversationalMetric):
        """Async mirror of `_generate_reason`."""
        prompt = ConversationalVerdictNodeTemplate.generate_reason(
            verbose_steps=metric._verbose_steps,
            score=metric.score,
            name=metric.__name__,
        )
        if metric.using_native_model:
            # Native models return (result, cost).
            res, cost = await metric.model.a_generate(
                prompt, schema=MetricScoreReason
            )
            metric.evaluation_cost += cost
        else:
            try:
                res: MetricScoreReason = await metric.model.a_generate(
                    prompt, schema=MetricScoreReason
                )
            except TypeError:
                # Custom models may not accept a `schema` kwarg; fall back
                # to raw generation and parse the JSON ourselves.
                res = await metric.model.a_generate(prompt)
                data = trimAndLoadJson(res, self)
                res = MetricScoreReason(**data)

        return res.reason
|
296
|
+
|
|
297
|
+
@dataclass
class ConversationalTaskNode(ConversationalBaseNode):
    """DAG node that runs an LLM task over (part of) a conversation.

    Builds a prompt from parent task outputs and/or the selected turn
    params of the test case's turns, stores the model's output on
    ``_output`` under ``output_label``, then executes its children.
    """

    # Instructions passed to the task prompt template.
    instructions: str
    # Label under which this node's output is fed to downstream nodes.
    output_label: str
    children: List[ConversationalBaseNode]
    # Which Turn attributes to include in the prompt; if None, the node
    # must have at least one parent to draw text from.
    evaluation_params: Optional[List[TurnParams]] = None
    # Inclusive (start, end) turn indices; defaults to the full conversation.
    turn_window: Optional[Tuple[int, int]] = None
    label: Optional[str] = None
    _verbose_logs: Optional[str] = None
    # Model output produced by _execute/_a_execute.
    _output: Optional[str] = None
    # Populated via ConversationalBaseNode.set_parent().
    _parents: Optional[List[ConversationalBaseNode]] = None

    def __hash__(self):
        # Identity hash so mutable node instances can live in sets/dicts.
        return id(self)

    def __post_init__(self):
        # Verdict leaves may only hang off judgement nodes, never tasks.
        for child in self.children:
            if isinstance(child, ConversationalVerdictNode):
                raise ValueError(
                    "A ConversationalTaskNode must not have a ConversationalVerdictNode as one of their 'children'."
                )

        # Wire up parent links and indegree counts for the traversal.
        for child in self.children:
            child.set_parent(self)
            increment_indegree(child)

    def _execute(
        self,
        metric: BaseConversationalMetric,
        test_case: ConversationalTestCase,
        depth: int,
    ):
        """Synchronously run the task and then this node's children."""
        # Track the deepest path to this node; only the last-arriving
        # parent proceeds past the indegree gate.
        self._depth = max(0, self._depth, depth)
        decrement_indegree(self)
        if self._indegree > 0:
            return

        if self.evaluation_params is None and self._parents is None:
            raise ValueError(
                "A ConversationalTaskNode must have either a 'evaluation_params' or parent node(s)."
            )

        # is_valid_turn_window is defined elsewhere in this module
        # (not shown here); presumably it raises on out-of-range windows.
        if self.turn_window is not None:
            is_valid_turn_window(self.turn_window, test_case.turns)

        # Default to the full conversation (inclusive indices).
        if not self.turn_window:
            self.turn_window = 0, len(test_case.turns) - 1

        # Empty accumulator (written as an empty triple-quoted literal).
        text = """"""
        start, end = self.turn_window
        # Prepend each parent task's labeled output.
        if self._parents is not None:
            for parent in self._parents:
                if isinstance(parent, ConversationalTaskNode):
                    text += f"{parent.output_label}:\n{parent._output}\n\n"

        # Append the selected turn params for every turn in the window.
        if self.evaluation_params is not None:
            text += "Full Conversation: \n"
            for index in range(start, end + 1):
                turn = test_case.turns[index]
                for param in self.evaluation_params:
                    value = getattr(turn, param.value)
                    if isinstance(value, ToolCall):
                        value = repr(value)
                    text += f"{CONVERSATIONAL_G_EVAL_PARAMS[param]}:\n{value}\n"
                text += "\n"

        prompt = ConversationalTaskNodeTemplate.generate_task_output(
            instructions=self.instructions,
            text=text,
        )
        if metric.using_native_model:
            # Native models return (result, cost).
            res, cost = metric.model.generate(prompt, schema=TaskNodeOutput)
            metric.evaluation_cost += cost
            self._output = res.output
        else:
            try:
                res: TaskNodeOutput = metric.model.generate(
                    prompt, schema=TaskNodeOutput
                )
                self._output = res.output
            except TypeError:
                # Custom models may not accept a `schema` kwarg; fall back
                # to raw generation and parse the JSON ourselves.
                res = metric.model.generate(prompt)
                data = trimAndLoadJson(res, self)
                self._output = TaskNodeOutput(**data).output

        # construct_node_verbose_log is defined elsewhere in this module.
        metric._verbose_steps.append(
            construct_node_verbose_log(self, self._depth)
        )
        # (Note: the loop variable is a single child despite its name.)
        for children in self.children:
            children._execute(
                metric=metric, test_case=test_case, depth=self._depth + 1
            )

    async def _a_execute(
        self,
        metric: BaseConversationalMetric,
        test_case: ConversationalTestCase,
        depth: int,
    ):
        """Async mirror of `_execute`; children run concurrently via gather."""
        # Track the deepest path to this node; only the last-arriving
        # parent proceeds past the indegree gate.
        self._depth = max(0, self._depth, depth)
        decrement_indegree(self)
        if self._indegree > 0:
            return

        if self.evaluation_params is None and self._parents is None:
            raise ValueError(
                "A ConversationalTaskNode must have either a 'evaluation_params' or parent node(s)."
            )

        if self.turn_window is not None:
            is_valid_turn_window(self.turn_window, test_case.turns)

        # Default to the full conversation (inclusive indices).
        if not self.turn_window:
            self.turn_window = 0, len(test_case.turns) - 1

        # Empty accumulator (written as an empty triple-quoted literal).
        text = """"""
        start, end = self.turn_window
        # Prepend each parent task's labeled output.
        if self._parents is not None:
            for parent in self._parents:
                if isinstance(parent, ConversationalTaskNode):
                    text += f"{parent.output_label}:\n{parent._output}\n\n"

        # Append the selected turn params for every turn in the window.
        if self.evaluation_params is not None:
            text += "Full Conversation: \n"
            for index in range(start, end + 1):
                turn = test_case.turns[index]
                for param in self.evaluation_params:
                    value = getattr(turn, param.value)
                    if isinstance(value, ToolCall):
                        value = repr(value)
                    text += f"{CONVERSATIONAL_G_EVAL_PARAMS[param]}:\n{value}\n"
                text += "\n"

        prompt = ConversationalTaskNodeTemplate.generate_task_output(
            instructions=self.instructions,
            text=text,
        )
        if metric.using_native_model:
            # Native models return (result, cost).
            res, cost = await metric.model.a_generate(
                prompt, schema=TaskNodeOutput
            )
            metric.evaluation_cost += cost
            self._output = res.output
        else:
            try:
                res: TaskNodeOutput = await metric.model.a_generate(
                    prompt, schema=TaskNodeOutput
                )
                self._output = res.output
            except TypeError:
                # Custom models may not accept a `schema` kwarg; fall back
                # to raw generation and parse the JSON ourselves.
                res = await metric.model.a_generate(prompt)
                data = trimAndLoadJson(res, self)
                self._output = TaskNodeOutput(**data).output

        metric._verbose_steps.append(
            construct_node_verbose_log(self, self._depth)
        )
        # Fan out to all children concurrently.
        await asyncio.gather(
            *(
                child._a_execute(
                    metric=metric, test_case=test_case, depth=self._depth + 1
                )
                for child in self.children
            )
        )
|
463
|
+
|
|
464
|
+
@dataclass
class ConversationalBinaryJudgementNode(ConversationalBaseNode):
    """DAG node that asks the model a yes/no question about the conversation.

    Must have exactly two ConversationalVerdictNode children — one with a
    True verdict and one with a False verdict; the model's boolean answer
    selects which child fires.
    """

    # Criteria the model judges as True/False.
    criteria: str
    children: List[ConversationalVerdictNode]
    # Which Turn attributes to include in the prompt (None = parents only).
    evaluation_params: Optional[List[TurnParams]] = None
    # Inclusive (start, end) turn indices; defaults to the full conversation.
    turn_window: Optional[Tuple[int, int]] = None
    label: Optional[str] = None
    _verbose_logs: Optional[str] = None
    # Model's boolean verdict, set by _execute/_a_execute.
    _verdict: Optional[BinaryJudgementVerdict] = None
    # Populated via ConversationalBaseNode.set_parent().
    _parents: Optional[List[ConversationalBaseNode]] = None

    def __hash__(self):
        # Identity hash so mutable node instances can live in sets/dicts.
        return id(self)

    def __post_init__(self):
        if len(self.children) != 2:
            raise ValueError(
                "ConversationalBinaryJudgementNode must have exactly 2 children."
            )

        # Check if all children are ClassificationResultNode and their classifications are boolean
        for child in self.children:
            if not isinstance(child, ConversationalVerdictNode):
                raise TypeError(
                    "All children of ConversationalBinaryJudgementNode must be of type ConversationalVerdictNode."
                )

            if not isinstance(child.verdict, bool):
                raise ValueError(
                    "All children of ConversationalBinaryJudgementNode must have a boolean verdict."
                )

        # Check if there is one True and one False classification
        verdicts = [child.verdict for child in self.children]
        if verdicts.count(True) != 1 or verdicts.count(False) != 1:
            raise ValueError(
                "ConversationalBinaryJudgementNode must have one True and one False ConversationalVerdictNode child."
            )

        # Wire up parent links and indegree counts. A verdict child's own
        # nested DAG node also gains an indegree so it waits for the leaf.
        for child in self.children:
            child.set_parent(self)
            increment_indegree(child)
            if child.child is not None and isinstance(
                child.child, ConversationalBaseNode
            ):
                increment_indegree(child.child)

    def _execute(
        self,
        metric: BaseConversationalMetric,
        test_case: ConversationalTestCase,
        depth: int,
    ):
        """Synchronously obtain the boolean verdict, then run both children."""
        # Track the deepest path to this node; only the last-arriving
        # parent proceeds past the indegree gate.
        self._depth = max(0, self._depth, depth)
        decrement_indegree(self)
        if self._indegree > 0:
            return

        # is_valid_turn_window is defined elsewhere in this module
        # (not shown here); presumably it raises on out-of-range windows.
        if self.turn_window is not None:
            is_valid_turn_window(self.turn_window, test_case.turns)

        # Default to the full conversation (inclusive indices).
        if not self.turn_window:
            self.turn_window = 0, len(test_case.turns) - 1

        # Empty accumulator (written as an empty triple-quoted literal).
        text = """"""
        start, end = self.turn_window
        # Prepend each parent task's labeled output.
        if self._parents is not None:
            for parent in self._parents:
                if isinstance(parent, ConversationalTaskNode):
                    text += f"{parent.output_label}:\n{parent._output}\n\n"

        # Append the selected turn params for every turn in the window.
        if self.evaluation_params is not None:
            text += "Full Conversation: \n"
            for index in range(start, end + 1):
                turn = test_case.turns[index]
                for param in self.evaluation_params:
                    value = getattr(turn, param.value)
                    if isinstance(value, ToolCall):
                        value = repr(value)
                    text += f"{CONVERSATIONAL_G_EVAL_PARAMS[param]}:\n{value}\n"
                text += "\n"

        prompt = ConversationalBinaryJudgementTemplate.generate_binary_verdict(
            criteria=self.criteria,
            text=text,
        )
        if metric.using_native_model:
            # Native models return (result, cost).
            res, cost = metric.model.generate(
                prompt, schema=BinaryJudgementVerdict
            )
            metric.evaluation_cost += cost
            self._verdict = res
        else:
            try:
                res: BinaryJudgementVerdict = metric.model.generate(
                    prompt, schema=BinaryJudgementVerdict
                )
                self._verdict = res
            except TypeError:
                # Custom models may not accept a `schema` kwarg; fall back
                # to raw generation and parse the JSON ourselves.
                res = metric.model.generate(prompt)
                data = trimAndLoadJson(res, self)
                self._verdict = BinaryJudgementVerdict(**data)

        # construct_node_verbose_log is defined elsewhere in this module.
        metric._verbose_steps.append(
            construct_node_verbose_log(self, self._depth)
        )
        # Both children execute; each leaf checks the verdict itself.
        # (Note: the loop variable is a single child despite its name.)
        for children in self.children:
            children._execute(
                metric=metric, test_case=test_case, depth=self._depth + 1
            )

    async def _a_execute(
        self,
        metric: BaseConversationalMetric,
        test_case: ConversationalTestCase,
        depth: int,
    ):
        """Async mirror of `_execute`; children run concurrently via gather."""
        # Track the deepest path to this node; only the last-arriving
        # parent proceeds past the indegree gate.
        self._depth = max(0, self._depth, depth)
        decrement_indegree(self)
        if self._indegree > 0:
            return

        if self.turn_window is not None:
            is_valid_turn_window(self.turn_window, test_case.turns)

        # Default to the full conversation (inclusive indices).
        if not self.turn_window:
            self.turn_window = 0, len(test_case.turns) - 1

        # Empty accumulator (written as an empty triple-quoted literal).
        text = """"""
        start, end = self.turn_window
        # Prepend each parent task's labeled output.
        if self._parents is not None:
            for parent in self._parents:
                if isinstance(parent, ConversationalTaskNode):
                    text += f"{parent.output_label}:\n{parent._output}\n\n"

        # Append the selected turn params for every turn in the window.
        if self.evaluation_params is not None:
            text += "Full Conversation: \n"
            for index in range(start, end + 1):
                turn = test_case.turns[index]
                for param in self.evaluation_params:
                    value = getattr(turn, param.value)
                    if isinstance(value, ToolCall):
                        value = repr(value)
                    text += f"{CONVERSATIONAL_G_EVAL_PARAMS[param]}:\n{value}\n"
                text += "\n"

        prompt = ConversationalBinaryJudgementTemplate.generate_binary_verdict(
            criteria=self.criteria,
            text=text,
        )
        if metric.using_native_model:
            # Native models return (result, cost).
            res, cost = await metric.model.a_generate(
                prompt, schema=BinaryJudgementVerdict
            )
            metric.evaluation_cost += cost
            self._verdict = res
        else:
            try:
                res: BinaryJudgementVerdict = await metric.model.a_generate(
                    prompt, schema=BinaryJudgementVerdict
                )
                self._verdict = res
            except TypeError:
                # Custom models may not accept a `schema` kwarg; fall back
                # to raw generation and parse the JSON ourselves.
                res = await metric.model.a_generate(prompt)
                data = trimAndLoadJson(res, self)
                self._verdict = BinaryJudgementVerdict(**data)

        metric._verbose_steps.append(
            construct_node_verbose_log(self, self._depth)
        )
        # Fan out to both children concurrently.
        await asyncio.gather(
            *(
                child._a_execute(
                    metric=metric, test_case=test_case, depth=self._depth + 1
                )
                for child in self.children
            )
        )
|
647
|
+
|
|
648
|
+
@dataclass
|
|
649
|
+
class ConversationalNonBinaryJudgementNode(ConversationalBaseNode):
|
|
650
|
+
criteria: str
|
|
651
|
+
children: List[ConversationalVerdictNode]
|
|
652
|
+
evaluation_params: Optional[List[TurnParams]] = None
|
|
653
|
+
turn_window: Tuple[int, int] = None
|
|
654
|
+
label: Optional[str] = None
|
|
655
|
+
_verbose_logs: Optional[str] = None
|
|
656
|
+
_verdict: Optional[NonBinaryJudgementVerdict] = None
|
|
657
|
+
_parents: Optional[List[ConversationalBaseNode]] = None
|
|
658
|
+
|
|
659
|
+
def __hash__(self):
|
|
660
|
+
return id(self)
|
|
661
|
+
|
|
662
|
+
def __post_init__(self):
|
|
663
|
+
# Check if children is not empty
|
|
664
|
+
if not self.children:
|
|
665
|
+
raise ValueError(
|
|
666
|
+
"ConversationalNonBinaryJudgementNode must have at least one child."
|
|
667
|
+
)
|
|
668
|
+
|
|
669
|
+
verdicts_set = set()
|
|
670
|
+
for child in self.children:
|
|
671
|
+
if not isinstance(child, ConversationalVerdictNode):
|
|
672
|
+
raise TypeError(
|
|
673
|
+
"All children must be of type ConversationalVerdictNode."
|
|
674
|
+
)
|
|
675
|
+
|
|
676
|
+
# Check if the verdict attribute of each child is a string
|
|
677
|
+
if not isinstance(child.verdict, str):
|
|
678
|
+
raise ValueError(
|
|
679
|
+
"The verdict attribute of all children must be a string."
|
|
680
|
+
)
|
|
681
|
+
|
|
682
|
+
# Check for duplicate verdicts
|
|
683
|
+
if child.verdict in verdicts_set:
|
|
684
|
+
raise ValueError(
|
|
685
|
+
f"Duplicate verdict found: {child.verdict} in children of ConversationalNonBinaryJudgementNode."
|
|
686
|
+
)
|
|
687
|
+
verdicts_set.add(child.verdict)
|
|
688
|
+
|
|
689
|
+
self._verdict_options = list(verdicts_set)
|
|
690
|
+
|
|
691
|
+
# Dynamically create ConversationalNonBinaryJudgementNode class
|
|
692
|
+
self._verdict_schema = create_model(
|
|
693
|
+
"ConversationalNonBinaryJudgementNode",
|
|
694
|
+
verdict=(Literal[tuple(self._verdict_options)], ...),
|
|
695
|
+
reason=(str, ...),
|
|
696
|
+
)
|
|
697
|
+
|
|
698
|
+
# print("-------")
|
|
699
|
+
for child in self.children:
|
|
700
|
+
child.set_parent(self)
|
|
701
|
+
increment_indegree(child)
|
|
702
|
+
if child.child is not None and isinstance(
|
|
703
|
+
child.child, ConversationalBaseNode
|
|
704
|
+
):
|
|
705
|
+
increment_indegree(child.child)
|
|
706
|
+
# print("non binary node nested", child.child.__class__.__name__, id(child.child), child.child._indegree)
|
|
707
|
+
# print("non binary node", child.__class__.__name__, id(child), child._indegree)
|
|
708
|
+
# print("-------")
|
|
709
|
+
|
|
710
|
+
def _execute(
|
|
711
|
+
self,
|
|
712
|
+
metric: BaseConversationalMetric,
|
|
713
|
+
test_case: ConversationalTestCase,
|
|
714
|
+
depth: int,
|
|
715
|
+
):
|
|
716
|
+
self._depth = max(0, self._depth, depth)
|
|
717
|
+
decrement_indegree(self)
|
|
718
|
+
if self._indegree > 0:
|
|
719
|
+
return
|
|
720
|
+
|
|
721
|
+
if self.turn_window is not None:
|
|
722
|
+
is_valid_turn_window(self.turn_window, test_case.turns)
|
|
723
|
+
|
|
724
|
+
if not self.turn_window:
|
|
725
|
+
self.turn_window = 0, len(test_case.turns) - 1
|
|
726
|
+
|
|
727
|
+
text = """"""
|
|
728
|
+
start, end = self.turn_window
|
|
729
|
+
if self._parents is not None:
|
|
730
|
+
for parent in self._parents:
|
|
731
|
+
if isinstance(parent, ConversationalTaskNode):
|
|
732
|
+
text += f"{parent.output_label}:\n{parent._output}\n\n"
|
|
733
|
+
|
|
734
|
+
if self.evaluation_params is not None:
|
|
735
|
+
text += "Full Conversation: \n"
|
|
736
|
+
for index in range(start, end + 1):
|
|
737
|
+
turn = test_case.turns[index]
|
|
738
|
+
for param in self.evaluation_params:
|
|
739
|
+
value = getattr(turn, param.value)
|
|
740
|
+
if isinstance(value, ToolCall):
|
|
741
|
+
value = repr(value)
|
|
742
|
+
text += f"{CONVERSATIONAL_G_EVAL_PARAMS[param]}:\n{value}\n"
|
|
743
|
+
text += "\n"
|
|
744
|
+
|
|
745
|
+
prompt = ConversationalNonBinaryJudgementTemplate.generate_non_binary_verdict(
|
|
746
|
+
criteria=self.criteria, text=text, options=self._verdict_options
|
|
747
|
+
)
|
|
748
|
+
if metric.using_native_model:
|
|
749
|
+
res, cost = metric.model.generate(
|
|
750
|
+
prompt, schema=self._verdict_schema
|
|
751
|
+
)
|
|
752
|
+
metric.evaluation_cost += cost
|
|
753
|
+
self._verdict = res
|
|
754
|
+
else:
|
|
755
|
+
try:
|
|
756
|
+
res: self._verdict_schema = metric.model.generate(
|
|
757
|
+
prompt, schema=self._verdict_schema
|
|
758
|
+
)
|
|
759
|
+
self._verdict = res
|
|
760
|
+
except TypeError:
|
|
761
|
+
res = metric.model.generate(prompt)
|
|
762
|
+
data = trimAndLoadJson(res, self)
|
|
763
|
+
self._verdict = self._verdict_schema(**data)
|
|
764
|
+
|
|
765
|
+
metric._verbose_steps.append(
|
|
766
|
+
construct_node_verbose_log(self, self._depth)
|
|
767
|
+
)
|
|
768
|
+
for children in self.children:
|
|
769
|
+
children._execute(
|
|
770
|
+
metric=metric, test_case=test_case, depth=self._depth + 1
|
|
771
|
+
)
|
|
772
|
+
|
|
773
|
+
async def _a_execute(
    self,
    metric: BaseConversationalMetric,
    test_case: ConversationalTestCase,
    depth: int,
):
    """Asynchronously evaluate this judgement node and fan out to children.

    Mirrors the synchronous `_execute`: waits for all parents (via the
    indegree counter), assembles the judgement text from parent task
    outputs plus the selected turn window of the conversation, asks the
    model for a non-binary verdict, records a verbose log, then runs all
    child nodes concurrently.

    Args:
        metric: The conversational metric driving this DAG evaluation.
        test_case: The conversational test case whose turns are judged.
        depth: Depth of this node as reached from the calling parent.
    """
    # A node reachable from several parents keeps its deepest level.
    self._depth = max(0, self._depth, depth)
    decrement_indegree(self)
    if self._indegree > 0:
        # Not every parent has finished yet; a later parent call runs us.
        return

    if self.turn_window is not None:
        # Raises ValueError on an out-of-range / malformed window.
        is_valid_turn_window(self.turn_window, test_case.turns)
    if not self.turn_window:
        # Default to the full conversation.
        self.turn_window = 0, len(test_case.turns) - 1
    start, end = self.turn_window

    pieces = []
    if self._parents is not None:
        # Prepend outputs produced by any upstream task nodes.
        pieces.extend(
            f"{parent.output_label}:\n{parent._output}\n\n"
            for parent in self._parents
            if isinstance(parent, ConversationalTaskNode)
        )

    if self.evaluation_params is not None:
        pieces.append("Full Conversation: \n")
        for index in range(start, end + 1):
            turn = test_case.turns[index]
            for param in self.evaluation_params:
                value = getattr(turn, param.value)
                if isinstance(value, ToolCall):
                    # ToolCall repr is the stable textual form for prompts.
                    value = repr(value)
                pieces.append(
                    f"{CONVERSATIONAL_G_EVAL_PARAMS[param]}:\n{value}\n"
                )
            pieces.append("\n")

    prompt = ConversationalNonBinaryJudgementTemplate.generate_non_binary_verdict(
        criteria=self.criteria,
        text="".join(pieces),
        options=self._verdict_options,
    )

    if metric.using_native_model:
        # Native models return (result, cost); track spend on the metric.
        res, cost = await metric.model.a_generate(
            prompt, schema=self._verdict_schema
        )
        metric.evaluation_cost += cost
        self._verdict = res
    else:
        try:
            self._verdict = await metric.model.a_generate(
                prompt, schema=self._verdict_schema
            )
        except TypeError:
            # Custom model does not accept a `schema` kwarg: fall back to
            # raw generation and parse the JSON ourselves.
            raw = await metric.model.a_generate(prompt)
            self._verdict = self._verdict_schema(**trimAndLoadJson(raw, self))

    metric._verbose_steps.append(construct_node_verbose_log(self, self._depth))

    # Children run concurrently, one level deeper than this node.
    await asyncio.gather(
        *(
            child._a_execute(
                metric=metric, test_case=test_case, depth=self._depth + 1
            )
            for child in self.children
        )
    )
|
|
839
|
+
|
|
840
|
+
|
|
841
|
+
def construct_node_verbose_log(
    node: ConversationalBaseNode,
    depth: int,
    node_metric: Optional[
        Union[ConversationalGEval, BaseConversationalMetric]
    ] = None,
) -> str:
    """Render a human-readable verbose-log entry for one DAG node.

    Args:
        node: The node to describe (judgement, task, or verdict node).
        depth: The node's level in the DAG, shown in the log header.
        node_metric: For verdict nodes only — the metric (if any) attached
            to the verdict, used to label and expand the entry.

    Returns:
        A formatted multi-line string, or implicitly ``None`` for node
        types not handled below.
    """
    if isinstance(
        node,
        (
            ConversationalBinaryJudgementNode,
            ConversationalNonBinaryJudgementNode,
            ConversationalTaskNode,
        ),
    ):
        label = node.label if node.label else "None"

        if isinstance(
            node,
            (
                ConversationalBinaryJudgementNode,
                ConversationalNonBinaryJudgementNode,
            ),
        ):
            is_binary_node = isinstance(node, ConversationalBinaryJudgementNode)
            node_type = (
                "ConversationalBinaryJudgementNode"
                if is_binary_node
                else "ConversationalNonBinaryJudgementNode"
            )
            # Box-drawing widths are tuned to the node-type string lengths.
            underscore_multiple = 34 if is_binary_node else 37
            star_multiple = 48 if is_binary_node else 53
            return (
                f"{'_' * underscore_multiple}\n"
                f"| {node_type} | Level == {depth} |\n"
                f"{'*' * star_multiple}\n"
                f"Label: {label}\n\n"
                "Criteria:\n"
                f"{node.criteria}\n\n"
                f"Verdict: {node._verdict.verdict}\n"
                f"Reason: {node._verdict.reason}\n"
            )
        elif isinstance(node, ConversationalTaskNode):
            return (
                "______________________________________________\n"
                f"| ConversationalTaskNode | Level == {depth} |\n"
                "**********************************************\n"
                f"Label: {label}\n\n"
                "Instructions:\n"
                f"{node.instructions}\n\n"
                f"{node.output_label}:\n{node._output}\n"
            )
    elif isinstance(node, ConversationalVerdictNode):
        # Renamed from `type` so the builtin is not shadowed.
        verdict_type = None
        if node_metric:
            if isinstance(
                node_metric, (ConversationalGEval, BaseConversationalMetric)
            ):
                # NOTE(review): assumes the metric exposes `__name__` on the
                # instance (deepeval metrics define it as a property) —
                # confirm against the metric base class.
                verdict_type = f"{node_metric.__name__} Metric"
            else:
                verdict_type = "Deterministic"

        verbose_log = (
            "_________________________________________________\n"
            f"| ConversationalVerdictNode | Level == {depth} |\n"
            "*************************************************\n"
            f"Verdict: {node.verdict}\n"
            f"Type: {verdict_type}"
        )
        # GEval must be checked before the base class: it is a subclass of
        # BaseConversationalMetric and gets the richer expansion.
        if isinstance(node_metric, ConversationalGEval):
            verbose_log += f"\n\nCriteria:\n{node_metric.criteria}\n"
            verbose_log += f"Evaluation Steps:\n{prettify_list(node_metric.evaluation_steps)}"
        elif isinstance(node_metric, BaseConversationalMetric):
            verbose_log += f"\n\n{node_metric.verbose_logs}"

        return verbose_log
|
|
910
|
+
|
|
911
|
+
|
|
912
|
+
def is_valid_turn_window(
    turn_window: Tuple[int, int], turns: List["Turn"]
) -> bool:
    """Validate a `(start, end)` turn window against a list of turns.

    A valid window has exactly two non-negative indices with
    ``start < end`` and ``end`` inside the bounds of ``turns``. Note that
    a single-turn window (``start == end``) is deliberately rejected.

    Args:
        turn_window: The ``(start, end)`` pair to validate (inclusive).
        turns: The conversation turns the window will index into.

    Returns:
        True when the window is valid.

    Raises:
        ValueError: If the window has the wrong arity or is out of range.
    """
    if len(turn_window) != 2:
        raise ValueError(
            "A 'turn_window' must have only 2 indices representing start and end"
        )
    start, end = turn_window
    # Fix: the original checked `end == len(turns)`, which let windows with
    # `end > len(turns)` (e.g. (6, 8) on 5 turns) pass validation and fail
    # later with IndexError. `end >= len(turns)` covers both cases, and —
    # with start >= 0 — also implies (end - start) < len(turns), so the old
    # redundant span check is dropped.
    if start < 0 or end < 0 or start >= end or end >= len(turns):
        raise ValueError(
            "The 'turn_window' passed is invalid. Please recheck your 'turn_window' values."
        )
    return True
|